fix(csv-parse): record_delimiter and non default encoding (fix #365)

adaltas · Oct 12, 2022 · 16fdb2d · 16fdb2d
1 parent 8ed0e18
commit 16fdb2d
Show file tree

Hide file tree

Showing 15 changed files with 392 additions and 191 deletions.
diff --git a/demo/issues-esm/lib/365-utf16le-bom-windows.csv b/demo/issues-esm/lib/365-utf16le-bom-windows.csv
diff --git a/demo/issues-esm/lib/365-utf8-bom-windows.csv b/demo/issues-esm/lib/365-utf8-bom-windows.csv
@@ -0,0 +1,2 @@
+a,b,c
+1,2,3
diff --git a/demo/issues-esm/lib/365.js b/demo/issues-esm/lib/365.js
@@ -0,0 +1,20 @@
+
+import fs from 'fs/promises'
+import { parse } from 'csv-parse/sync'
+import { dirname, join } from 'path'
+import { fileURLToPath } from 'url'
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+
+(async () => {
+  const data = await fs.readFile(`${__dirname}/365-utf16le-bom-windows.csv`)
+  const records = parse(data, {bom: true})
+  console.log('utf16le', records)
+})();
+
+(async () => {
+  const data = await fs.readFile(`${__dirname}/365-utf8-bom-windows.csv`)
+  const records = parse(data, {bom: true})
+  console.log('utf8', records)
+})();
diff --git a/packages/csv-parse/dist/cjs/index.cjs b/packages/csv-parse/dist/cjs/index.cjs
@@ -155,7 +155,7 @@ const init_state = function(options){
     record: [],
     recordHasError: false,
     record_length: 0,
-    recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
+    recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 0 : Math.max(...options.record_delimiter.map((v) => v.length)),
     trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
     wasQuoting: false,
     wasRowDelimiter: false,
@@ -620,16 +620,22 @@ const transform = function(original_options = {}) {
     state: init_state(options),
     __needMoreData: function(i, bufLen, end){
       if(end) return false;
-      const {quote} = this.options;
+      const {encoding, escape, quote} = this.options;
       const {quoting, needMoreDataSize, recordDelimiterMaxLength} = this.state;
       const numOfCharLeft = bufLen - i - 1;
       const requiredLength = Math.max(
         needMoreDataSize,
         // Skip if the remaining buffer smaller than record delimiter
-        recordDelimiterMaxLength,
-        // Skip if the remaining buffer can be record delimiter following the closing quote
-        // 1 is for quote.length
-        quoting ? (quote.length + recordDelimiterMaxLength) : 0,
+        // If "record_delimiter" is yet to be discovered:
+        // 1. It equals `[]` and "recordDelimiterMaxLength" equals `0`
+        // 2. We use the length of the Windows line ending in the current encoding
+        // Note that the encoding is known from the user or BOM discovery at that point
+        // recordDelimiterMaxLength,
+        recordDelimiterMaxLength === 0 ? Buffer.from('\r\n', encoding).length : recordDelimiterMaxLength,
+        // Skip if remaining buffer can be an escaped quote
+        quoting ? ((escape === null ? 0 : escape.length) + quote.length) : 0,
+        // Skip if remaining buffer can be record delimiter following the closing quote
+        quoting ? (quote.length) : 0,
       );
       return numOfCharLeft < requiredLength;
     },
@@ -1224,22 +1230,26 @@ const transform = function(original_options = {}) {
       return true;
     },
     __autoDiscoverRecordDelimiter: function(buf, pos){
-      const {encoding} = this.options;
-      const chr = buf[pos];
-      if(chr === cr){
-        if(buf[pos+1] === nl){
-          this.options.record_delimiter.push(Buffer.from('\r\n', encoding));
-          this.state.recordDelimiterMaxLength = 2;
-          return 2;
-        }else {
-          this.options.record_delimiter.push(Buffer.from('\r', encoding));
-          this.state.recordDelimiterMaxLength = 1;
-          return 1;
+      const { encoding } = this.options;
+      // Note: we don't need to cache this information in the state;
+      // this is only called on the first line, until a suitable
+      // record delimiter is found.
+      const rds = [
+        // Important: the Windows line ending (\r\n) must come before the Mac OS 9 one (\r)
+        Buffer.from('\r\n', encoding),
+        Buffer.from('\n', encoding),
+        Buffer.from('\r', encoding),
+      ];
+      loop: for(let i = 0; i < rds.length; i++){
+        const l = rds[i].length;
+        for(let j = 0; j < l; j++){
+          if(rds[i][j] !== buf[pos + j]){
+            continue loop;
+          }
         }
-      }else if(chr === nl){
-        this.options.record_delimiter.push(Buffer.from('\n', encoding));
-        this.state.recordDelimiterMaxLength = 1;
-        return 1;
+        this.options.record_delimiter.push(rds[i]);
+        this.state.recordDelimiterMaxLength = rds[i].length;
+        return rds[i].length;
       }
       return 0;
     },

diff --git a/packages/csv-parse/dist/cjs/sync.cjs b/packages/csv-parse/dist/cjs/sync.cjs
@@ -153,7 +153,7 @@ const init_state = function(options){
     record: [],
     recordHasError: false,
     record_length: 0,
-    recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
+    recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 0 : Math.max(...options.record_delimiter.map((v) => v.length)),
     trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
     wasQuoting: false,
     wasRowDelimiter: false,
@@ -618,16 +618,22 @@ const transform = function(original_options = {}) {
     state: init_state(options),
     __needMoreData: function(i, bufLen, end){
       if(end) return false;
-      const {quote} = this.options;
+      const {encoding, escape, quote} = this.options;
       const {quoting, needMoreDataSize, recordDelimiterMaxLength} = this.state;
       const numOfCharLeft = bufLen - i - 1;
       const requiredLength = Math.max(
         needMoreDataSize,
         // Skip if the remaining buffer smaller than record delimiter
-        recordDelimiterMaxLength,
-        // Skip if the remaining buffer can be record delimiter following the closing quote
-        // 1 is for quote.length
-        quoting ? (quote.length + recordDelimiterMaxLength) : 0,
+        // If "record_delimiter" is yet to be discovered:
+        // 1. It equals `[]` and "recordDelimiterMaxLength" equals `0`
+        // 2. We use the length of the Windows line ending in the current encoding
+        // Note that the encoding is known from the user or BOM discovery at that point
+        // recordDelimiterMaxLength,
+        recordDelimiterMaxLength === 0 ? Buffer.from('\r\n', encoding).length : recordDelimiterMaxLength,
+        // Skip if remaining buffer can be an escaped quote
+        quoting ? ((escape === null ? 0 : escape.length) + quote.length) : 0,
+        // Skip if remaining buffer can be record delimiter following the closing quote
+        quoting ? (quote.length) : 0,
       );
       return numOfCharLeft < requiredLength;
     },
@@ -1222,22 +1228,26 @@ const transform = function(original_options = {}) {
       return true;
     },
     __autoDiscoverRecordDelimiter: function(buf, pos){
-      const {encoding} = this.options;
-      const chr = buf[pos];
-      if(chr === cr){
-        if(buf[pos+1] === nl){
-          this.options.record_delimiter.push(Buffer.from('\r\n', encoding));
-          this.state.recordDelimiterMaxLength = 2;
-          return 2;
-        }else {
-          this.options.record_delimiter.push(Buffer.from('\r', encoding));
-          this.state.recordDelimiterMaxLength = 1;
-          return 1;
+      const { encoding } = this.options;
+      // Note: we don't need to cache this information in the state;
+      // this is only called on the first line, until a suitable
+      // record delimiter is found.
+      const rds = [
+        // Important: the Windows line ending (\r\n) must come before the Mac OS 9 one (\r)
+        Buffer.from('\r\n', encoding),
+        Buffer.from('\n', encoding),
+        Buffer.from('\r', encoding),
+      ];
+      loop: for(let i = 0; i < rds.length; i++){
+        const l = rds[i].length;
+        for(let j = 0; j < l; j++){
+          if(rds[i][j] !== buf[pos + j]){
+            continue loop;
+          }
         }
-      }else if(chr === nl){
-        this.options.record_delimiter.push(Buffer.from('\n', encoding));
-        this.state.recordDelimiterMaxLength = 1;
-        return 1;
+        this.options.record_delimiter.push(rds[i]);
+        this.state.recordDelimiterMaxLength = rds[i].length;
+        return rds[i].length;
       }
       return 0;
     },

diff --git a/packages/csv-parse/dist/esm/index.js b/packages/csv-parse/dist/esm/index.js
@@ -5211,7 +5211,7 @@ const init_state = function(options){
     record: [],
     recordHasError: false,
     record_length: 0,
-    recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
+    recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 0 : Math.max(...options.record_delimiter.map((v) => v.length)),
     trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
     wasQuoting: false,
     wasRowDelimiter: false,
@@ -5676,16 +5676,22 @@ const transform = function(original_options = {}) {
     state: init_state(options),
     __needMoreData: function(i, bufLen, end){
       if(end) return false;
-      const {quote} = this.options;
+      const {encoding, escape, quote} = this.options;
       const {quoting, needMoreDataSize, recordDelimiterMaxLength} = this.state;
       const numOfCharLeft = bufLen - i - 1;
       const requiredLength = Math.max(
         needMoreDataSize,
         // Skip if the remaining buffer smaller than record delimiter
-        recordDelimiterMaxLength,
-        // Skip if the remaining buffer can be record delimiter following the closing quote
-        // 1 is for quote.length
-        quoting ? (quote.length + recordDelimiterMaxLength) : 0,
+        // If "record_delimiter" is yet to be discovered:
+        // 1. It equals `[]` and "recordDelimiterMaxLength" equals `0`
+        // 2. We use the length of the Windows line ending in the current encoding
+        // Note that the encoding is known from the user or BOM discovery at that point
+        // recordDelimiterMaxLength,
+        recordDelimiterMaxLength === 0 ? Buffer.from('\r\n', encoding).length : recordDelimiterMaxLength,
+        // Skip if remaining buffer can be an escaped quote
+        quoting ? ((escape === null ? 0 : escape.length) + quote.length) : 0,
+        // Skip if remaining buffer can be record delimiter following the closing quote
+        quoting ? (quote.length) : 0,
       );
       return numOfCharLeft < requiredLength;
     },
@@ -6280,22 +6286,26 @@ const transform = function(original_options = {}) {
       return true;
     },
     __autoDiscoverRecordDelimiter: function(buf, pos){
-      const {encoding} = this.options;
-      const chr = buf[pos];
-      if(chr === cr){
-        if(buf[pos+1] === nl){
-          this.options.record_delimiter.push(Buffer.from('\r\n', encoding));
-          this.state.recordDelimiterMaxLength = 2;
-          return 2;
-        }else {
-          this.options.record_delimiter.push(Buffer.from('\r', encoding));
-          this.state.recordDelimiterMaxLength = 1;
-          return 1;
+      const { encoding } = this.options;
+      // Note: we don't need to cache this information in the state;
+      // this is only called on the first line, until a suitable
+      // record delimiter is found.
+      const rds = [
+        // Important: the Windows line ending (\r\n) must come before the Mac OS 9 one (\r)
+        Buffer.from('\r\n', encoding),
+        Buffer.from('\n', encoding),
+        Buffer.from('\r', encoding),
+      ];
+      loop: for(let i = 0; i < rds.length; i++){
+        const l = rds[i].length;
+        for(let j = 0; j < l; j++){
+          if(rds[i][j] !== buf[pos + j]){
+            continue loop;
+          }
         }
-      }else if(chr === nl){
-        this.options.record_delimiter.push(Buffer.from('\n', encoding));
-        this.state.recordDelimiterMaxLength = 1;
-        return 1;
+        this.options.record_delimiter.push(rds[i]);
+        this.state.recordDelimiterMaxLength = rds[i].length;
+        return rds[i].length;
       }
       return 0;
     },

diff --git a/packages/csv-parse/dist/esm/sync.js b/packages/csv-parse/dist/esm/sync.js
@@ -2121,7 +2121,7 @@ const init_state = function(options){
     record: [],
     recordHasError: false,
     record_length: 0,
-    recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
+    recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 0 : Math.max(...options.record_delimiter.map((v) => v.length)),
     trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
     wasQuoting: false,
     wasRowDelimiter: false,
@@ -2586,16 +2586,22 @@ const transform = function(original_options = {}) {
     state: init_state(options),
     __needMoreData: function(i, bufLen, end){
       if(end) return false;
-      const {quote} = this.options;
+      const {encoding, escape, quote} = this.options;
       const {quoting, needMoreDataSize, recordDelimiterMaxLength} = this.state;
       const numOfCharLeft = bufLen - i - 1;
       const requiredLength = Math.max(
         needMoreDataSize,
         // Skip if the remaining buffer smaller than record delimiter
-        recordDelimiterMaxLength,
-        // Skip if the remaining buffer can be record delimiter following the closing quote
-        // 1 is for quote.length
-        quoting ? (quote.length + recordDelimiterMaxLength) : 0,
+        // If "record_delimiter" is yet to be discovered:
+        // 1. It equals `[]` and "recordDelimiterMaxLength" equals `0`
+        // 2. We use the length of the Windows line ending in the current encoding
+        // Note that the encoding is known from the user or BOM discovery at that point
+        // recordDelimiterMaxLength,
+        recordDelimiterMaxLength === 0 ? Buffer.from('\r\n', encoding).length : recordDelimiterMaxLength,
+        // Skip if remaining buffer can be an escaped quote
+        quoting ? ((escape === null ? 0 : escape.length) + quote.length) : 0,
+        // Skip if remaining buffer can be record delimiter following the closing quote
+        quoting ? (quote.length) : 0,
       );
       return numOfCharLeft < requiredLength;
     },
@@ -3190,22 +3196,26 @@ const transform = function(original_options = {}) {
       return true;
     },
     __autoDiscoverRecordDelimiter: function(buf, pos){
-      const {encoding} = this.options;
-      const chr = buf[pos];
-      if(chr === cr){
-        if(buf[pos+1] === nl){
-          this.options.record_delimiter.push(Buffer.from('\r\n', encoding));
-          this.state.recordDelimiterMaxLength = 2;
-          return 2;
-        }else {
-          this.options.record_delimiter.push(Buffer.from('\r', encoding));
-          this.state.recordDelimiterMaxLength = 1;
-          return 1;
+      const { encoding } = this.options;
+      // Note: we don't need to cache this information in the state;
+      // this is only called on the first line, until a suitable
+      // record delimiter is found.
+      const rds = [
+        // Important: the Windows line ending (\r\n) must come before the Mac OS 9 one (\r)
+        Buffer.from('\r\n', encoding),
+        Buffer.from('\n', encoding),
+        Buffer.from('\r', encoding),
+      ];
+      loop: for(let i = 0; i < rds.length; i++){
+        const l = rds[i].length;
+        for(let j = 0; j < l; j++){
+          if(rds[i][j] !== buf[pos + j]){
+            continue loop;
+          }
         }
-      }else if(chr === nl){
-        this.options.record_delimiter.push(Buffer.from('\n', encoding));
-        this.state.recordDelimiterMaxLength = 1;
-        return 1;
+        this.options.record_delimiter.push(rds[i]);
+        this.state.recordDelimiterMaxLength = rds[i].length;
+        return rds[i].length;
       }
       return 0;
     },