Skip to content

Commit

Permalink
fix(csv-parse): record_delimiter and non default encoding (fix #365)
Browse files Browse the repository at this point in the history
  • Loading branch information
wdavidw committed Oct 12, 2022
1 parent 8ed0e18 commit 16fdb2d
Show file tree
Hide file tree
Showing 15 changed files with 392 additions and 191 deletions.
Binary file added demo/issues-esm/lib/365-utf16le-bom-windows.csv
Binary file not shown.
2 changes: 2 additions & 0 deletions demo/issues-esm/lib/365-utf8-bom-windows.csv
@@ -0,0 +1,2 @@
a,b,c
1,2,3
20 changes: 20 additions & 0 deletions demo/issues-esm/lib/365.js
@@ -0,0 +1,20 @@

import fs from 'fs/promises'
import { parse } from 'csv-parse/sync'
import { dirname, join } from 'path'
import { fileURLToPath } from 'url'

// ES modules have no built-in __filename/__dirname; rebuild them from the module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

/**
 * Read a CSV fixture located next to this module, parse it with the `bom`
 * option enabled and print the resulting records.
 *
 * @param {string} label - Prefix printed before the parsed records.
 * @param {string} file - Fixture file name, relative to this module's directory.
 * @returns {Promise<void>} Settles once the records have been printed.
 */
const printRecords = async (label, file) => {
  // The file is read as a raw Buffer (no encoding passed to readFile) and
  // handed to the parser as is.
  const data = await fs.readFile(join(__dirname, file));
  const records = parse(data, { bom: true });
  console.log(label, records);
};

// Both fixtures are processed concurrently, as independent tasks.
const fixtures = [
  ['utf16le', '365-utf16le-bom-windows.csv'],
  ['utf8', '365-utf8-bom-windows.csv'],
];

for (const [label, file] of fixtures) {
  // Never leave the promise floating: report which fixture failed and
  // flag the process as failed instead of relying on an unhandled rejection.
  printRecords(label, file).catch((err) => {
    console.error(label, err);
    process.exitCode = 1;
  });
}
52 changes: 31 additions & 21 deletions packages/csv-parse/dist/cjs/index.cjs
Expand Up @@ -155,7 +155,7 @@ const init_state = function(options){
record: [],
recordHasError: false,
record_length: 0,
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 0 : Math.max(...options.record_delimiter.map((v) => v.length)),
trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
wasQuoting: false,
wasRowDelimiter: false,
Expand Down Expand Up @@ -620,16 +620,22 @@ const transform = function(original_options = {}) {
state: init_state(options),
__needMoreData: function(i, bufLen, end){
if(end) return false;
const {quote} = this.options;
const {encoding, escape, quote} = this.options;
const {quoting, needMoreDataSize, recordDelimiterMaxLength} = this.state;
const numOfCharLeft = bufLen - i - 1;
const requiredLength = Math.max(
needMoreDataSize,
// Skip if the remaining buffer smaller than record delimiter
recordDelimiterMaxLength,
// Skip if the remaining buffer can be record delimiter following the closing quote
// 1 is for quote.length
quoting ? (quote.length + recordDelimiterMaxLength) : 0,
// If "record_delimiter" is yet to be discovered:
// 1. It is equal to `[]` and "recordDelimiterMaxLength" equals `0`
// 2. We set the length to the Windows line ending in the current encoding
// Note that the encoding is known from the user or from BOM discovery at that point
// recordDelimiterMaxLength,
recordDelimiterMaxLength === 0 ? Buffer.from('\r\n', encoding).length : recordDelimiterMaxLength,
// Skip if remaining buffer can be an escaped quote
quoting ? ((escape === null ? 0 : escape.length) + quote.length) : 0,
// Skip if remaining buffer can be record delimiter following the closing quote
quoting ? (quote.length) : 0,
);
return numOfCharLeft < requiredLength;
},
Expand Down Expand Up @@ -1224,22 +1230,26 @@ const transform = function(original_options = {}) {
return true;
},
__autoDiscoverRecordDelimiter: function(buf, pos){
const {encoding} = this.options;
const chr = buf[pos];
if(chr === cr){
if(buf[pos+1] === nl){
this.options.record_delimiter.push(Buffer.from('\r\n', encoding));
this.state.recordDelimiterMaxLength = 2;
return 2;
}else {
this.options.record_delimiter.push(Buffer.from('\r', encoding));
this.state.recordDelimiterMaxLength = 1;
return 1;
const { encoding } = this.options;
// Note, we don't need to cache this information in state,
// It is only called on the first line until we find out a suitable
// record delimiter.
const rds = [
// Important, the windows line ending must be before mac os 9
Buffer.from('\r\n', encoding),
Buffer.from('\n', encoding),
Buffer.from('\r', encoding),
];
loop: for(let i = 0; i < rds.length; i++){
const l = rds[i].length;
for(let j = 0; j < l; j++){
if(rds[i][j] !== buf[pos + j]){
continue loop;
}
}
}else if(chr === nl){
this.options.record_delimiter.push(Buffer.from('\n', encoding));
this.state.recordDelimiterMaxLength = 1;
return 1;
this.options.record_delimiter.push(rds[i]);
this.state.recordDelimiterMaxLength = rds[i].length;
return rds[i].length;
}
return 0;
},
Expand Down
52 changes: 31 additions & 21 deletions packages/csv-parse/dist/cjs/sync.cjs
Expand Up @@ -153,7 +153,7 @@ const init_state = function(options){
record: [],
recordHasError: false,
record_length: 0,
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 0 : Math.max(...options.record_delimiter.map((v) => v.length)),
trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
wasQuoting: false,
wasRowDelimiter: false,
Expand Down Expand Up @@ -618,16 +618,22 @@ const transform = function(original_options = {}) {
state: init_state(options),
__needMoreData: function(i, bufLen, end){
if(end) return false;
const {quote} = this.options;
const {encoding, escape, quote} = this.options;
const {quoting, needMoreDataSize, recordDelimiterMaxLength} = this.state;
const numOfCharLeft = bufLen - i - 1;
const requiredLength = Math.max(
needMoreDataSize,
// Skip if the remaining buffer smaller than record delimiter
recordDelimiterMaxLength,
// Skip if the remaining buffer can be record delimiter following the closing quote
// 1 is for quote.length
quoting ? (quote.length + recordDelimiterMaxLength) : 0,
// If "record_delimiter" is yet to be discovered:
// 1. It is equal to `[]` and "recordDelimiterMaxLength" equals `0`
// 2. We set the length to the Windows line ending in the current encoding
// Note that the encoding is known from the user or from BOM discovery at that point
// recordDelimiterMaxLength,
recordDelimiterMaxLength === 0 ? Buffer.from('\r\n', encoding).length : recordDelimiterMaxLength,
// Skip if remaining buffer can be an escaped quote
quoting ? ((escape === null ? 0 : escape.length) + quote.length) : 0,
// Skip if remaining buffer can be record delimiter following the closing quote
quoting ? (quote.length) : 0,
);
return numOfCharLeft < requiredLength;
},
Expand Down Expand Up @@ -1222,22 +1228,26 @@ const transform = function(original_options = {}) {
return true;
},
__autoDiscoverRecordDelimiter: function(buf, pos){
const {encoding} = this.options;
const chr = buf[pos];
if(chr === cr){
if(buf[pos+1] === nl){
this.options.record_delimiter.push(Buffer.from('\r\n', encoding));
this.state.recordDelimiterMaxLength = 2;
return 2;
}else {
this.options.record_delimiter.push(Buffer.from('\r', encoding));
this.state.recordDelimiterMaxLength = 1;
return 1;
const { encoding } = this.options;
// Note, we don't need to cache this information in state,
// It is only called on the first line until we find out a suitable
// record delimiter.
const rds = [
// Important, the windows line ending must be before mac os 9
Buffer.from('\r\n', encoding),
Buffer.from('\n', encoding),
Buffer.from('\r', encoding),
];
loop: for(let i = 0; i < rds.length; i++){
const l = rds[i].length;
for(let j = 0; j < l; j++){
if(rds[i][j] !== buf[pos + j]){
continue loop;
}
}
}else if(chr === nl){
this.options.record_delimiter.push(Buffer.from('\n', encoding));
this.state.recordDelimiterMaxLength = 1;
return 1;
this.options.record_delimiter.push(rds[i]);
this.state.recordDelimiterMaxLength = rds[i].length;
return rds[i].length;
}
return 0;
},
Expand Down
52 changes: 31 additions & 21 deletions packages/csv-parse/dist/esm/index.js
Expand Up @@ -5211,7 +5211,7 @@ const init_state = function(options){
record: [],
recordHasError: false,
record_length: 0,
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 0 : Math.max(...options.record_delimiter.map((v) => v.length)),
trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
wasQuoting: false,
wasRowDelimiter: false,
Expand Down Expand Up @@ -5676,16 +5676,22 @@ const transform = function(original_options = {}) {
state: init_state(options),
__needMoreData: function(i, bufLen, end){
if(end) return false;
const {quote} = this.options;
const {encoding, escape, quote} = this.options;
const {quoting, needMoreDataSize, recordDelimiterMaxLength} = this.state;
const numOfCharLeft = bufLen - i - 1;
const requiredLength = Math.max(
needMoreDataSize,
// Skip if the remaining buffer smaller than record delimiter
recordDelimiterMaxLength,
// Skip if the remaining buffer can be record delimiter following the closing quote
// 1 is for quote.length
quoting ? (quote.length + recordDelimiterMaxLength) : 0,
// If "record_delimiter" is yet to be discovered:
// 1. It is equal to `[]` and "recordDelimiterMaxLength" equals `0`
// 2. We set the length to the Windows line ending in the current encoding
// Note that the encoding is known from the user or from BOM discovery at that point
// recordDelimiterMaxLength,
recordDelimiterMaxLength === 0 ? Buffer.from('\r\n', encoding).length : recordDelimiterMaxLength,
// Skip if remaining buffer can be an escaped quote
quoting ? ((escape === null ? 0 : escape.length) + quote.length) : 0,
// Skip if remaining buffer can be record delimiter following the closing quote
quoting ? (quote.length) : 0,
);
return numOfCharLeft < requiredLength;
},
Expand Down Expand Up @@ -6280,22 +6286,26 @@ const transform = function(original_options = {}) {
return true;
},
__autoDiscoverRecordDelimiter: function(buf, pos){
const {encoding} = this.options;
const chr = buf[pos];
if(chr === cr){
if(buf[pos+1] === nl){
this.options.record_delimiter.push(Buffer.from('\r\n', encoding));
this.state.recordDelimiterMaxLength = 2;
return 2;
}else {
this.options.record_delimiter.push(Buffer.from('\r', encoding));
this.state.recordDelimiterMaxLength = 1;
return 1;
const { encoding } = this.options;
// Note, we don't need to cache this information in state,
// It is only called on the first line until we find out a suitable
// record delimiter.
const rds = [
// Important, the windows line ending must be before mac os 9
Buffer.from('\r\n', encoding),
Buffer.from('\n', encoding),
Buffer.from('\r', encoding),
];
loop: for(let i = 0; i < rds.length; i++){
const l = rds[i].length;
for(let j = 0; j < l; j++){
if(rds[i][j] !== buf[pos + j]){
continue loop;
}
}
}else if(chr === nl){
this.options.record_delimiter.push(Buffer.from('\n', encoding));
this.state.recordDelimiterMaxLength = 1;
return 1;
this.options.record_delimiter.push(rds[i]);
this.state.recordDelimiterMaxLength = rds[i].length;
return rds[i].length;
}
return 0;
},
Expand Down
52 changes: 31 additions & 21 deletions packages/csv-parse/dist/esm/sync.js
Expand Up @@ -2121,7 +2121,7 @@ const init_state = function(options){
record: [],
recordHasError: false,
record_length: 0,
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)),
recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 0 : Math.max(...options.record_delimiter.map((v) => v.length)),
trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]],
wasQuoting: false,
wasRowDelimiter: false,
Expand Down Expand Up @@ -2586,16 +2586,22 @@ const transform = function(original_options = {}) {
state: init_state(options),
__needMoreData: function(i, bufLen, end){
if(end) return false;
const {quote} = this.options;
const {encoding, escape, quote} = this.options;
const {quoting, needMoreDataSize, recordDelimiterMaxLength} = this.state;
const numOfCharLeft = bufLen - i - 1;
const requiredLength = Math.max(
needMoreDataSize,
// Skip if the remaining buffer smaller than record delimiter
recordDelimiterMaxLength,
// Skip if the remaining buffer can be record delimiter following the closing quote
// 1 is for quote.length
quoting ? (quote.length + recordDelimiterMaxLength) : 0,
// If "record_delimiter" is yet to be discovered:
// 1. It is equal to `[]` and "recordDelimiterMaxLength" equals `0`
// 2. We set the length to the Windows line ending in the current encoding
// Note that the encoding is known from the user or from BOM discovery at that point
// recordDelimiterMaxLength,
recordDelimiterMaxLength === 0 ? Buffer.from('\r\n', encoding).length : recordDelimiterMaxLength,
// Skip if remaining buffer can be an escaped quote
quoting ? ((escape === null ? 0 : escape.length) + quote.length) : 0,
// Skip if remaining buffer can be record delimiter following the closing quote
quoting ? (quote.length) : 0,
);
return numOfCharLeft < requiredLength;
},
Expand Down Expand Up @@ -3190,22 +3196,26 @@ const transform = function(original_options = {}) {
return true;
},
__autoDiscoverRecordDelimiter: function(buf, pos){
const {encoding} = this.options;
const chr = buf[pos];
if(chr === cr){
if(buf[pos+1] === nl){
this.options.record_delimiter.push(Buffer.from('\r\n', encoding));
this.state.recordDelimiterMaxLength = 2;
return 2;
}else {
this.options.record_delimiter.push(Buffer.from('\r', encoding));
this.state.recordDelimiterMaxLength = 1;
return 1;
const { encoding } = this.options;
// Note, we don't need to cache this information in state,
// It is only called on the first line until we find out a suitable
// record delimiter.
const rds = [
// Important, the windows line ending must be before mac os 9
Buffer.from('\r\n', encoding),
Buffer.from('\n', encoding),
Buffer.from('\r', encoding),
];
loop: for(let i = 0; i < rds.length; i++){
const l = rds[i].length;
for(let j = 0; j < l; j++){
if(rds[i][j] !== buf[pos + j]){
continue loop;
}
}
}else if(chr === nl){
this.options.record_delimiter.push(Buffer.from('\n', encoding));
this.state.recordDelimiterMaxLength = 1;
return 1;
this.options.record_delimiter.push(rds[i]);
this.state.recordDelimiterMaxLength = rds[i].length;
return rds[i].length;
}
return 0;
},
Expand Down

0 comments on commit 16fdb2d

Please sign in to comment.