From 8bf52f0d5c25ee2423cb1629d3e9103534668c83 Mon Sep 17 00:00:00 2001 From: David Worms Date: Wed, 29 Jun 2022 12:29:50 +0200 Subject: [PATCH] fix(csv-parse): rtrim encoding support (fix #349) --- packages/csv-parse/dist/cjs/index.cjs | 60 ++++++++++++++++------ packages/csv-parse/dist/cjs/sync.cjs | 60 ++++++++++++++++------ packages/csv-parse/dist/esm/index.js | 60 ++++++++++++++++------ packages/csv-parse/dist/esm/sync.js | 60 ++++++++++++++++------ packages/csv-parse/dist/iife/index.js | 60 ++++++++++++++++------ packages/csv-parse/dist/iife/sync.js | 60 ++++++++++++++++------ packages/csv-parse/dist/umd/index.js | 60 ++++++++++++++++------ packages/csv-parse/dist/umd/sync.js | 60 ++++++++++++++++------ packages/csv-parse/lib/api/index.js | 41 +++++++++------ packages/csv-parse/lib/api/init_state.js | 19 ++++++- packages/csv-parse/test/option.trim.coffee | 27 ++++++++++ 11 files changed, 414 insertions(+), 153 deletions(-) diff --git a/packages/csv-parse/dist/cjs/index.cjs b/packages/csv-parse/dist/cjs/index.cjs index d960b5f09..052adac81 100644 --- a/packages/csv-parse/dist/cjs/index.cjs +++ b/packages/csv-parse/dist/cjs/index.cjs @@ -115,6 +115,16 @@ class ResizeableBuffer{ } } +// white space characters +// https://en.wikipedia.org/wiki/Whitespace_character +// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types +// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff +const np = 12; +const cr$1 = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal +const nl$1 = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal +const space = 32; +const tab = 9; + const init_state = function(options){ return { bomSkipped: false, @@ -148,7 +158,14 @@ const init_state = function(options){ recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)), trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]], wasQuoting: false, - wasRowDelimiter: false + wasRowDelimiter: false, + timchars: [ + Buffer.from(Buffer.from([cr$1], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([nl$1], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([np], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([space], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([tab], 'utf8').toString(), options.encoding), + ] }; }; @@ -571,15 +588,9 @@ const isRecordEmpty = function(record){ return record.every((field) => field == null || field.toString && field.toString().trim() === ''); }; -// white space characters -// https://en.wikipedia.org/wiki/Whitespace_character -// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types -// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff -const tab = 9; -const nl = 10; // \n, 0x0A in hexadecimal, 10 in decimal -const np = 12; -const cr = 13; // \r, 0x0D in hexadécimal, 13 in decimal -const space = 32; +const cr = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal +const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal + const boms = { // Note, the following are equals: // Buffer.from("\ufeff") @@ -724,7 +735,7 @@ const transform = function(original_options = {}) { if(this.state.commenting === false && this.__isQuote(buf, pos)){ if(this.state.quoting === true){ const nextChr = buf[pos+quote.length]; - const isNextChrTrimable = rtrim && this.__isCharTrimable(nextChr); + const isNextChrTrimable = rtrim && this.__isCharTrimable(buf, pos+quote.length); const isNextChrComment = comment !== null && this.__compareBytes(comment, buf, pos+quote.length, nextChr); const isNextChrDelimiter = this.__isDelimiter(buf, pos+quote.length, nextChr); const isNextChrRecordDelimiter = record_delimiter.length === 0 ? this.__autoDiscoverRecordDelimiter(buf, pos+quote.length) : this.__isRecordDelimiter(nextChr, buf, pos+quote.length); @@ -834,7 +845,7 @@ const transform = function(original_options = {}) { } if(this.state.commenting === false){ if(max_record_size !== 0 && this.state.record_length + this.state.field.length > max_record_size){ - const err = this.__error( + return this.__error( new CsvError('CSV_MAX_RECORD_SIZE', [ 'Max Record Size:', 'record exceed the maximum number of tolerated bytes', @@ -842,15 +853,14 @@ const transform = function(original_options = {}) { `at line ${this.info.lines}`, ], this.options, this.__infoField()) ); - if(err !== undefined) return err; } } - const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(chr); + const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(buf, pos); // rtrim in non quoting is handle in __onField const rappend = rtrim === false || this.state.wasQuoting === false; if(lappend === true && rappend === true){ this.state.field.append(chr); - }else if(rtrim === true && !this.__isCharTrimable(chr)){ + }else if(rtrim === true && !this.__isCharTrimable(buf, pos)){ return this.__error( new CsvError('CSV_NON_TRIMABLE_CHAR_AFTER_CLOSING_QUOTE', [ 'Invalid Closing Quote:', @@ -858,6 +868,11 @@ const transform = function(original_options = {}) { `at line ${this.info.lines}`, ], this.options, this.__infoField()) ); + }else { + if(lappend === false){ + pos += this.__isCharTrimable(buf, pos) - 1; + } + continue; } } if(end === true){ @@ -1114,8 +1129,19 @@ const transform = function(original_options = {}) { return [undefined, field]; }, // Helper to test if a character is a space or a line delimiter - __isCharTrimable: function(chr){ - return chr === space || chr === tab || chr === cr || chr === nl || chr === np; + __isCharTrimable: function(buf, pos){ + const isTrim = (buf, pos) => { + const {timchars} = this.state; + loop1: for(let i = 0; i < timchars.length; i++){ + const timchar = timchars[i]; + for(let j = 0; j < timchar.length; j++){ + if(timchar[j] !== buf[pos+j]) continue loop1; + } + return timchar.length; + } + return 0; + }; + return isTrim(buf, pos); }, // Keep it in case we implement the `cast_int` option // __isInt(value){ diff --git a/packages/csv-parse/dist/cjs/sync.cjs b/packages/csv-parse/dist/cjs/sync.cjs index 9759e2cd7..1c0fac104 100644 --- a/packages/csv-parse/dist/cjs/sync.cjs +++ b/packages/csv-parse/dist/cjs/sync.cjs @@ -113,6 +113,16 @@ class ResizeableBuffer{ } } +// white space characters +// https://en.wikipedia.org/wiki/Whitespace_character +// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types +// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff +const np = 12; +const cr$1 = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal +const nl$1 = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal +const space = 32; +const tab = 9; + const init_state = function(options){ return { bomSkipped: false, @@ -146,7 +156,14 @@ const init_state = function(options){ recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)), trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]], wasQuoting: false, - wasRowDelimiter: false + wasRowDelimiter: false, + timchars: [ + Buffer.from(Buffer.from([cr$1], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([nl$1], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([np], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([space], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([tab], 'utf8').toString(), options.encoding), + ] }; }; @@ -569,15 +586,9 @@ const isRecordEmpty = function(record){ return record.every((field) => field == null || field.toString && field.toString().trim() === ''); }; -// white space characters -// https://en.wikipedia.org/wiki/Whitespace_character -// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types -// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff -const tab = 9; -const nl = 10; // \n, 0x0A in hexadecimal, 10 in decimal -const np = 12; -const cr = 13; // \r, 0x0D in hexadécimal, 13 in decimal -const space = 32; +const cr = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal +const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal + const boms = { // Note, the following are equals: // Buffer.from("\ufeff") @@ -722,7 +733,7 @@ const transform = function(original_options = {}) { if(this.state.commenting === false && this.__isQuote(buf, pos)){ if(this.state.quoting === true){ const nextChr = buf[pos+quote.length]; - const isNextChrTrimable = rtrim && this.__isCharTrimable(nextChr); + const isNextChrTrimable = rtrim && this.__isCharTrimable(buf, pos+quote.length); const isNextChrComment = comment !== null && this.__compareBytes(comment, buf, pos+quote.length, nextChr); const isNextChrDelimiter = this.__isDelimiter(buf, pos+quote.length, nextChr); const isNextChrRecordDelimiter = record_delimiter.length === 0 ? this.__autoDiscoverRecordDelimiter(buf, pos+quote.length) : this.__isRecordDelimiter(nextChr, buf, pos+quote.length); @@ -832,7 +843,7 @@ const transform = function(original_options = {}) { } if(this.state.commenting === false){ if(max_record_size !== 0 && this.state.record_length + this.state.field.length > max_record_size){ - const err = this.__error( + return this.__error( new CsvError('CSV_MAX_RECORD_SIZE', [ 'Max Record Size:', 'record exceed the maximum number of tolerated bytes', @@ -840,15 +851,14 @@ const transform = function(original_options = {}) { `at line ${this.info.lines}`, ], this.options, this.__infoField()) ); - if(err !== undefined) return err; } } - const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(chr); + const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(buf, pos); // rtrim in non quoting is handle in __onField const rappend = rtrim === false || this.state.wasQuoting === false; if(lappend === true && rappend === true){ this.state.field.append(chr); - }else if(rtrim === true && !this.__isCharTrimable(chr)){ + }else if(rtrim === true && !this.__isCharTrimable(buf, pos)){ return this.__error( new CsvError('CSV_NON_TRIMABLE_CHAR_AFTER_CLOSING_QUOTE', [ 'Invalid Closing Quote:', @@ -856,6 +866,11 @@ const transform = function(original_options = {}) { `at line ${this.info.lines}`, ], this.options, this.__infoField()) ); + }else { + if(lappend === false){ + pos += this.__isCharTrimable(buf, pos) - 1; + } + continue; } } if(end === true){ @@ -1112,8 +1127,19 @@ const transform = function(original_options = {}) { return [undefined, field]; }, // Helper to test if a character is a space or a line delimiter - __isCharTrimable: function(chr){ - return chr === space || chr === tab || chr === cr || chr === nl || chr === np; + __isCharTrimable: function(buf, pos){ + const isTrim = (buf, pos) => { + const {timchars} = this.state; + loop1: for(let i = 0; i < timchars.length; i++){ + const timchar = timchars[i]; + for(let j = 0; j < timchar.length; j++){ + if(timchar[j] !== buf[pos+j]) continue loop1; + } + return timchar.length; + } + return 0; + }; + return isTrim(buf, pos); }, // Keep it in case we implement the `cast_int` option // __isInt(value){ diff --git a/packages/csv-parse/dist/esm/index.js b/packages/csv-parse/dist/esm/index.js index ba7fb242d..6cf4910f7 100644 --- a/packages/csv-parse/dist/esm/index.js +++ b/packages/csv-parse/dist/esm/index.js @@ -5171,6 +5171,16 @@ class ResizeableBuffer{ } } +// white space characters +// https://en.wikipedia.org/wiki/Whitespace_character +// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types +// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff +const np = 12; +const cr$1 = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal +const nl$1 = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal +const space = 32; +const tab = 9; + const init_state = function(options){ return { bomSkipped: false, @@ -5204,7 +5214,14 @@ const init_state = function(options){ recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)), trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]], wasQuoting: false, - wasRowDelimiter: false + wasRowDelimiter: false, + timchars: [ + Buffer.from(Buffer.from([cr$1], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([nl$1], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([np], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([space], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([tab], 'utf8').toString(), options.encoding), + ] }; }; @@ -5627,15 +5644,9 @@ const isRecordEmpty = function(record){ return record.every((field) => field == null || field.toString && field.toString().trim() === ''); }; -// white space characters -// https://en.wikipedia.org/wiki/Whitespace_character -// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types -// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff -const tab = 9; -const nl = 10; // \n, 0x0A in hexadecimal, 10 in decimal -const np = 12; -const cr = 13; // \r, 0x0D in hexadécimal, 13 in decimal -const space = 32; +const cr = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal +const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal + const boms = { // Note, the following are equals: // Buffer.from("\ufeff") @@ -5780,7 +5791,7 @@ const transform = function(original_options = {}) { if(this.state.commenting === false && this.__isQuote(buf, pos)){ if(this.state.quoting === true){ const nextChr = buf[pos+quote.length]; - const isNextChrTrimable = rtrim && this.__isCharTrimable(nextChr); + const isNextChrTrimable = rtrim && this.__isCharTrimable(buf, pos+quote.length); const isNextChrComment = comment !== null && this.__compareBytes(comment, buf, pos+quote.length, nextChr); const isNextChrDelimiter = this.__isDelimiter(buf, pos+quote.length, nextChr); const isNextChrRecordDelimiter = record_delimiter.length === 0 ? this.__autoDiscoverRecordDelimiter(buf, pos+quote.length) : this.__isRecordDelimiter(nextChr, buf, pos+quote.length); @@ -5890,7 +5901,7 @@ const transform = function(original_options = {}) { } if(this.state.commenting === false){ if(max_record_size !== 0 && this.state.record_length + this.state.field.length > max_record_size){ - const err = this.__error( + return this.__error( new CsvError('CSV_MAX_RECORD_SIZE', [ 'Max Record Size:', 'record exceed the maximum number of tolerated bytes', @@ -5898,15 +5909,14 @@ const transform = function(original_options = {}) { `at line ${this.info.lines}`, ], this.options, this.__infoField()) ); - if(err !== undefined) return err; } } - const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(chr); + const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(buf, pos); // rtrim in non quoting is handle in __onField const rappend = rtrim === false || this.state.wasQuoting === false; if(lappend === true && rappend === true){ this.state.field.append(chr); - }else if(rtrim === true && !this.__isCharTrimable(chr)){ + }else if(rtrim === true && !this.__isCharTrimable(buf, pos)){ return this.__error( new CsvError('CSV_NON_TRIMABLE_CHAR_AFTER_CLOSING_QUOTE', [ 'Invalid Closing Quote:', @@ -5914,6 +5924,11 @@ const transform = function(original_options = {}) { `at line ${this.info.lines}`, ], this.options, this.__infoField()) ); + }else { + if(lappend === false){ + pos += this.__isCharTrimable(buf, pos) - 1; + } + continue; } } if(end === true){ @@ -6170,8 +6185,19 @@ const transform = function(original_options = {}) { return [undefined, field]; }, // Helper to test if a character is a space or a line delimiter - __isCharTrimable: function(chr){ - return chr === space || chr === tab || chr === cr || chr === nl || chr === np; + __isCharTrimable: function(buf, pos){ + const isTrim = (buf, pos) => { + const {timchars} = this.state; + loop1: for(let i = 0; i < timchars.length; i++){ + const timchar = timchars[i]; + for(let j = 0; j < timchar.length; j++){ + if(timchar[j] !== buf[pos+j]) continue loop1; + } + return timchar.length; + } + return 0; + }; + return isTrim(buf, pos); }, // Keep it in case we implement the `cast_int` option // __isInt(value){ diff --git a/packages/csv-parse/dist/esm/sync.js b/packages/csv-parse/dist/esm/sync.js index 0483eb852..adf3b0786 100644 --- a/packages/csv-parse/dist/esm/sync.js +++ b/packages/csv-parse/dist/esm/sync.js @@ -2081,6 +2081,16 @@ class ResizeableBuffer{ } } +// white space characters +// https://en.wikipedia.org/wiki/Whitespace_character +// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types +// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff +const np = 12; +const cr$1 = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal +const nl$1 = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal +const space = 32; +const tab = 9; + const init_state = function(options){ return { bomSkipped: false, @@ -2114,7 +2124,14 @@ const init_state = function(options){ recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)), trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]], wasQuoting: false, - wasRowDelimiter: false + wasRowDelimiter: false, + timchars: [ + Buffer.from(Buffer.from([cr$1], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([nl$1], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([np], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([space], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([tab], 'utf8').toString(), options.encoding), + ] }; }; @@ -2537,15 +2554,9 @@ const isRecordEmpty = function(record){ return record.every((field) => field == null || field.toString && field.toString().trim() === ''); }; -// white space characters -// https://en.wikipedia.org/wiki/Whitespace_character -// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types -// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff -const tab = 9; -const nl = 10; // \n, 0x0A in hexadecimal, 10 in decimal -const np = 12; -const cr = 13; // \r, 0x0D in hexadécimal, 13 in decimal -const space = 32; +const cr = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal +const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal + const boms = { // Note, the following are equals: // Buffer.from("\ufeff") @@ -2690,7 +2701,7 @@ const transform = function(original_options = {}) { if(this.state.commenting === false && this.__isQuote(buf, pos)){ if(this.state.quoting === true){ const nextChr = buf[pos+quote.length]; - const isNextChrTrimable = rtrim && this.__isCharTrimable(nextChr); + const isNextChrTrimable = rtrim && this.__isCharTrimable(buf, pos+quote.length); const isNextChrComment = comment !== null && this.__compareBytes(comment, buf, pos+quote.length, nextChr); const isNextChrDelimiter = this.__isDelimiter(buf, pos+quote.length, nextChr); const isNextChrRecordDelimiter = record_delimiter.length === 0 ? this.__autoDiscoverRecordDelimiter(buf, pos+quote.length) : this.__isRecordDelimiter(nextChr, buf, pos+quote.length); @@ -2800,7 +2811,7 @@ const transform = function(original_options = {}) { } if(this.state.commenting === false){ if(max_record_size !== 0 && this.state.record_length + this.state.field.length > max_record_size){ - const err = this.__error( + return this.__error( new CsvError('CSV_MAX_RECORD_SIZE', [ 'Max Record Size:', 'record exceed the maximum number of tolerated bytes', @@ -2808,15 +2819,14 @@ const transform = function(original_options = {}) { `at line ${this.info.lines}`, ], this.options, this.__infoField()) ); - if(err !== undefined) return err; } } - const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(chr); + const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(buf, pos); // rtrim in non quoting is handle in __onField const rappend = rtrim === false || this.state.wasQuoting === false; if(lappend === true && rappend === true){ this.state.field.append(chr); - }else if(rtrim === true && !this.__isCharTrimable(chr)){ + }else if(rtrim === true && !this.__isCharTrimable(buf, pos)){ return this.__error( new CsvError('CSV_NON_TRIMABLE_CHAR_AFTER_CLOSING_QUOTE', [ 'Invalid Closing Quote:', @@ -2824,6 +2834,11 @@ const transform = function(original_options = {}) { `at line ${this.info.lines}`, ], this.options, this.__infoField()) ); + }else { + if(lappend === false){ + pos += this.__isCharTrimable(buf, pos) - 1; + } + continue; } } if(end === true){ @@ -3080,8 +3095,19 @@ const transform = function(original_options = {}) { return [undefined, field]; }, // Helper to test if a character is a space or a line delimiter - __isCharTrimable: function(chr){ - return chr === space || chr === tab || chr === cr || chr === nl || chr === np; + __isCharTrimable: function(buf, pos){ + const isTrim = (buf, pos) => { + const {timchars} = this.state; + loop1: for(let i = 0; i < timchars.length; i++){ + const timchar = timchars[i]; + for(let j = 0; j < timchar.length; j++){ + if(timchar[j] !== buf[pos+j]) continue loop1; + } + return timchar.length; + } + return 0; + }; + return isTrim(buf, pos); }, // Keep it in case we implement the `cast_int` option // __isInt(value){ diff --git a/packages/csv-parse/dist/iife/index.js b/packages/csv-parse/dist/iife/index.js index 61832c474..ebb43b872 100644 --- a/packages/csv-parse/dist/iife/index.js +++ b/packages/csv-parse/dist/iife/index.js @@ -5174,6 +5174,16 @@ var csv_parse = (function (exports) { } } + // white space characters + // https://en.wikipedia.org/wiki/Whitespace_character + // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types + // \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff + const np = 12; + const cr$1 = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal + const nl$1 = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal + const space = 32; + const tab = 9; + const init_state = function(options){ return { bomSkipped: false, @@ -5207,7 +5217,14 @@ var csv_parse = (function (exports) { recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)), trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]], wasQuoting: false, - wasRowDelimiter: false + wasRowDelimiter: false, + timchars: [ + Buffer.from(Buffer.from([cr$1], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([nl$1], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([np], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([space], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([tab], 'utf8').toString(), options.encoding), + ] }; }; @@ -5630,15 +5647,9 @@ var csv_parse = (function (exports) { return record.every((field) => field == null || field.toString && field.toString().trim() === ''); }; - // white space characters - // https://en.wikipedia.org/wiki/Whitespace_character - // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types - // \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff - const tab = 9; - const nl = 10; // \n, 0x0A in hexadecimal, 10 in decimal - const np = 12; - const cr = 13; // \r, 0x0D in hexadécimal, 13 in decimal - const space = 32; + const cr = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal + const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal + const boms = { // Note, the following are equals: // Buffer.from("\ufeff") @@ -5783,7 +5794,7 @@ var csv_parse = (function (exports) { if(this.state.commenting === false && this.__isQuote(buf, pos)){ if(this.state.quoting === true){ const nextChr = buf[pos+quote.length]; - const isNextChrTrimable = rtrim && this.__isCharTrimable(nextChr); + const isNextChrTrimable = rtrim && this.__isCharTrimable(buf, pos+quote.length); const isNextChrComment = comment !== null && this.__compareBytes(comment, buf, pos+quote.length, nextChr); const isNextChrDelimiter = this.__isDelimiter(buf, pos+quote.length, nextChr); const isNextChrRecordDelimiter = record_delimiter.length === 0 ? this.__autoDiscoverRecordDelimiter(buf, pos+quote.length) : this.__isRecordDelimiter(nextChr, buf, pos+quote.length); @@ -5893,7 +5904,7 @@ var csv_parse = (function (exports) { } if(this.state.commenting === false){ if(max_record_size !== 0 && this.state.record_length + this.state.field.length > max_record_size){ - const err = this.__error( + return this.__error( new CsvError('CSV_MAX_RECORD_SIZE', [ 'Max Record Size:', 'record exceed the maximum number of tolerated bytes', @@ -5901,15 +5912,14 @@ var csv_parse = (function (exports) { `at line ${this.info.lines}`, ], this.options, this.__infoField()) ); - if(err !== undefined) return err; } } - const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(chr); + const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(buf, pos); // rtrim in non quoting is handle in __onField const rappend = rtrim === false || this.state.wasQuoting === false; if(lappend === true && rappend === true){ this.state.field.append(chr); - }else if(rtrim === true && !this.__isCharTrimable(chr)){ + }else if(rtrim === true && !this.__isCharTrimable(buf, pos)){ return this.__error( new CsvError('CSV_NON_TRIMABLE_CHAR_AFTER_CLOSING_QUOTE', [ 'Invalid Closing Quote:', @@ -5917,6 +5927,11 @@ var csv_parse = (function (exports) { `at line ${this.info.lines}`, ], this.options, this.__infoField()) ); + }else { + if(lappend === false){ + pos += this.__isCharTrimable(buf, pos) - 1; + } + continue; } } if(end === true){ @@ -6173,8 +6188,19 @@ var csv_parse = (function (exports) { return [undefined, field]; }, // Helper to test if a character is a space or a line delimiter - __isCharTrimable: function(chr){ - return chr === space || chr === tab || chr === cr || chr === nl || chr === np; + __isCharTrimable: function(buf, pos){ + const isTrim = (buf, pos) => { + const {timchars} = this.state; + loop1: for(let i = 0; i < timchars.length; i++){ + const timchar = timchars[i]; + for(let j = 0; j < timchar.length; j++){ + if(timchar[j] !== buf[pos+j]) continue loop1; + } + return timchar.length; + } + return 0; + }; + return isTrim(buf, pos); }, // Keep it in case we implement the `cast_int` option // __isInt(value){ diff --git a/packages/csv-parse/dist/iife/sync.js b/packages/csv-parse/dist/iife/sync.js index 4dc6c0aa4..4922013ab 100644 --- a/packages/csv-parse/dist/iife/sync.js +++ b/packages/csv-parse/dist/iife/sync.js @@ -2084,6 +2084,16 @@ var csv_parse_sync = (function (exports) { } } + // white space characters + // https://en.wikipedia.org/wiki/Whitespace_character + // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types + // \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff + const np = 12; + const cr$1 = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal + const nl$1 = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal + const space = 32; + const tab = 9; + const init_state = function(options){ return { bomSkipped: false, @@ -2117,7 +2127,14 @@ var csv_parse_sync = (function (exports) { recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)), trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]], wasQuoting: false, - wasRowDelimiter: false + wasRowDelimiter: false, + timchars: [ + Buffer.from(Buffer.from([cr$1], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([nl$1], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([np], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([space], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([tab], 'utf8').toString(), options.encoding), + ] }; }; @@ -2540,15 +2557,9 @@ var csv_parse_sync = (function (exports) { return record.every((field) => field == null || field.toString && field.toString().trim() === ''); }; - // white space characters - // https://en.wikipedia.org/wiki/Whitespace_character - // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types - // \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff - const tab = 9; - const nl = 10; // \n, 0x0A in hexadecimal, 10 in decimal - const np = 12; - const cr = 13; // \r, 0x0D in hexadécimal, 13 in decimal - const space = 32; + const cr = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal + const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal + const boms = { // Note, the following are equals: // Buffer.from("\ufeff") @@ -2693,7 +2704,7 @@ var csv_parse_sync = (function (exports) { if(this.state.commenting === false && this.__isQuote(buf, pos)){ if(this.state.quoting === true){ const nextChr = buf[pos+quote.length]; - const isNextChrTrimable = rtrim && this.__isCharTrimable(nextChr); + const isNextChrTrimable = rtrim && this.__isCharTrimable(buf, pos+quote.length); const isNextChrComment = comment !== null && this.__compareBytes(comment, buf, pos+quote.length, nextChr); const isNextChrDelimiter = this.__isDelimiter(buf, pos+quote.length, nextChr); const isNextChrRecordDelimiter = record_delimiter.length === 0 ? this.__autoDiscoverRecordDelimiter(buf, pos+quote.length) : this.__isRecordDelimiter(nextChr, buf, pos+quote.length); @@ -2803,7 +2814,7 @@ var csv_parse_sync = (function (exports) { } if(this.state.commenting === false){ if(max_record_size !== 0 && this.state.record_length + this.state.field.length > max_record_size){ - const err = this.__error( + return this.__error( new CsvError('CSV_MAX_RECORD_SIZE', [ 'Max Record Size:', 'record exceed the maximum number of tolerated bytes', @@ -2811,15 +2822,14 @@ var csv_parse_sync = (function (exports) { `at line ${this.info.lines}`, ], this.options, this.__infoField()) ); - if(err !== undefined) return err; } } - const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(chr); + const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(buf, pos); // rtrim in non quoting is handle in __onField const rappend = rtrim === false || this.state.wasQuoting === false; if(lappend === true && rappend === true){ this.state.field.append(chr); - }else if(rtrim === true && !this.__isCharTrimable(chr)){ + }else if(rtrim === true && !this.__isCharTrimable(buf, pos)){ return this.__error( new CsvError('CSV_NON_TRIMABLE_CHAR_AFTER_CLOSING_QUOTE', [ 'Invalid Closing Quote:', @@ -2827,6 +2837,11 @@ var csv_parse_sync = (function (exports) { `at line ${this.info.lines}`, ], this.options, this.__infoField()) ); + }else { + if(lappend === false){ + pos += this.__isCharTrimable(buf, pos) - 1; + } + continue; } } if(end === true){ @@ -3083,8 +3098,19 @@ var csv_parse_sync = (function (exports) { return [undefined, field]; }, // Helper to test if a character is a space or a line delimiter - __isCharTrimable: function(chr){ - return chr === space || chr === tab || chr === cr || chr === nl || chr === np; + __isCharTrimable: function(buf, pos){ + const isTrim = (buf, pos) => { + const {timchars} = this.state; + loop1: for(let i = 0; i < timchars.length; i++){ + const timchar = timchars[i]; + for(let j = 0; j < timchar.length; j++){ + if(timchar[j] !== buf[pos+j]) continue loop1; + } + return timchar.length; + } + return 0; + }; + return isTrim(buf, pos); }, // Keep it in case we implement the `cast_int` option // __isInt(value){ diff --git a/packages/csv-parse/dist/umd/index.js b/packages/csv-parse/dist/umd/index.js index b94cc7a31..b8f0020d1 100644 --- a/packages/csv-parse/dist/umd/index.js +++ b/packages/csv-parse/dist/umd/index.js @@ -5177,6 +5177,16 @@ } } + // white space characters + // https://en.wikipedia.org/wiki/Whitespace_character + // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types + // \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff + const np = 12; + const cr$1 = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal + const nl$1 = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal + const space = 32; + const tab = 9; + const init_state = function(options){ return { bomSkipped: false, @@ -5210,7 +5220,14 @@ recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)), trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]], wasQuoting: false, - wasRowDelimiter: false + wasRowDelimiter: false, + timchars: [ + Buffer.from(Buffer.from([cr$1], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([nl$1], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([np], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([space], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([tab], 'utf8').toString(), options.encoding), + ] }; }; @@ -5633,15 +5650,9 @@ return record.every((field) => field == null || field.toString && field.toString().trim() === ''); }; - // white space characters - // https://en.wikipedia.org/wiki/Whitespace_character - // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types - // \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff - const tab = 9; - const nl = 10; // \n, 0x0A in hexadecimal, 10 in decimal - const np = 12; - const cr = 13; // \r, 0x0D in hexadécimal, 13 in decimal - const space = 32; + const cr = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal + const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal + const boms = { // Note, the following are equals: // Buffer.from("\ufeff") @@ -5786,7 +5797,7 @@ if(this.state.commenting === false && this.__isQuote(buf, pos)){ if(this.state.quoting === true){ const nextChr = buf[pos+quote.length]; - const isNextChrTrimable = rtrim && this.__isCharTrimable(nextChr); + const isNextChrTrimable = rtrim && this.__isCharTrimable(buf, pos+quote.length); const isNextChrComment = comment !== null && this.__compareBytes(comment, buf, pos+quote.length, nextChr); const isNextChrDelimiter = this.__isDelimiter(buf, pos+quote.length, nextChr); const isNextChrRecordDelimiter = record_delimiter.length === 0 ? this.__autoDiscoverRecordDelimiter(buf, pos+quote.length) : this.__isRecordDelimiter(nextChr, buf, pos+quote.length); @@ -5896,7 +5907,7 @@ } if(this.state.commenting === false){ if(max_record_size !== 0 && this.state.record_length + this.state.field.length > max_record_size){ - const err = this.__error( + return this.__error( new CsvError('CSV_MAX_RECORD_SIZE', [ 'Max Record Size:', 'record exceed the maximum number of tolerated bytes', @@ -5904,15 +5915,14 @@ `at line ${this.info.lines}`, ], this.options, this.__infoField()) ); - if(err !== undefined) return err; } } - const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(chr); + const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(buf, pos); // rtrim in non quoting is handle in __onField const rappend = rtrim === false || this.state.wasQuoting === false; if(lappend === true && rappend === true){ this.state.field.append(chr); - }else if(rtrim === true && !this.__isCharTrimable(chr)){ + }else if(rtrim === true && !this.__isCharTrimable(buf, pos)){ return this.__error( new CsvError('CSV_NON_TRIMABLE_CHAR_AFTER_CLOSING_QUOTE', [ 'Invalid Closing Quote:', @@ -5920,6 +5930,11 @@ `at line ${this.info.lines}`, ], this.options, this.__infoField()) ); + }else { + if(lappend === false){ + pos += this.__isCharTrimable(buf, pos) - 1; + } + continue; } } if(end === true){ @@ -6176,8 +6191,19 @@ return [undefined, field]; }, // Helper to test if a character is a space or a line delimiter - __isCharTrimable: function(chr){ - return chr === space || chr === tab || chr === cr || chr === nl || chr === np; + __isCharTrimable: function(buf, pos){ + const isTrim = (buf, pos) => { + const {timchars} = this.state; + loop1: for(let i = 0; i < timchars.length; i++){ + const timchar = timchars[i]; + for(let j = 0; j < timchar.length; j++){ + if(timchar[j] !== buf[pos+j]) continue loop1; + } + return timchar.length; + } + return 0; + }; + return isTrim(buf, pos); }, // Keep it in case we implement the `cast_int` option // __isInt(value){ diff --git a/packages/csv-parse/dist/umd/sync.js b/packages/csv-parse/dist/umd/sync.js index a855938ff..a7a6eb5a5 100644 --- a/packages/csv-parse/dist/umd/sync.js +++ b/packages/csv-parse/dist/umd/sync.js @@ -2087,6 +2087,16 @@ } } + // white space characters + // https://en.wikipedia.org/wiki/Whitespace_character + // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types + // \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff + const np = 12; + const cr$1 = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal + const nl$1 = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal + const space = 32; + const tab = 9; + const init_state = function(options){ return { bomSkipped: false, @@ -2120,7 +2130,14 @@ recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)), trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]], wasQuoting: false, - wasRowDelimiter: false + wasRowDelimiter: false, + timchars: [ + Buffer.from(Buffer.from([cr$1], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([nl$1], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([np], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([space], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([tab], 'utf8').toString(), options.encoding), + ] }; }; @@ -2543,15 +2560,9 @@ return record.every((field) => field == null || field.toString && field.toString().trim() === ''); }; - // white space characters - // https://en.wikipedia.org/wiki/Whitespace_character - // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types - // \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff - const tab = 9; - const nl = 10; // \n, 0x0A in hexadecimal, 10 in decimal - const np = 12; - const cr = 13; // \r, 0x0D in hexadécimal, 13 in decimal - const space = 32; + const cr = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal + const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal + const boms = { // Note, the following are equals: // Buffer.from("\ufeff") @@ -2696,7 +2707,7 @@ if(this.state.commenting === false && this.__isQuote(buf, pos)){ if(this.state.quoting === true){ const nextChr = buf[pos+quote.length]; - const isNextChrTrimable = rtrim && this.__isCharTrimable(nextChr); + const isNextChrTrimable = rtrim && this.__isCharTrimable(buf, pos+quote.length); const isNextChrComment = comment !== null && this.__compareBytes(comment, buf, pos+quote.length, nextChr); const isNextChrDelimiter = this.__isDelimiter(buf, pos+quote.length, nextChr); const isNextChrRecordDelimiter = record_delimiter.length === 0 ? this.__autoDiscoverRecordDelimiter(buf, pos+quote.length) : this.__isRecordDelimiter(nextChr, buf, pos+quote.length); @@ -2806,7 +2817,7 @@ } if(this.state.commenting === false){ if(max_record_size !== 0 && this.state.record_length + this.state.field.length > max_record_size){ - const err = this.__error( + return this.__error( new CsvError('CSV_MAX_RECORD_SIZE', [ 'Max Record Size:', 'record exceed the maximum number of tolerated bytes', @@ -2814,15 +2825,14 @@ `at line ${this.info.lines}`, ], this.options, this.__infoField()) ); - if(err !== undefined) return err; } } - const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(chr); + const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(buf, pos); // rtrim in non quoting is handle in __onField const rappend = rtrim === false || this.state.wasQuoting === false; if(lappend === true && rappend === true){ this.state.field.append(chr); - }else if(rtrim === true && !this.__isCharTrimable(chr)){ + }else if(rtrim === true && !this.__isCharTrimable(buf, pos)){ return this.__error( new CsvError('CSV_NON_TRIMABLE_CHAR_AFTER_CLOSING_QUOTE', [ 'Invalid Closing Quote:', @@ -2830,6 +2840,11 @@ `at line ${this.info.lines}`, ], this.options, this.__infoField()) ); + }else { + if(lappend === false){ + pos += this.__isCharTrimable(buf, pos) - 1; + } + continue; } } if(end === true){ @@ -3086,8 +3101,19 @@ return [undefined, field]; }, // Helper to test if a character is a space or a line delimiter - __isCharTrimable: function(chr){ - return chr === space || chr === tab || chr === cr || chr === nl || chr === np; + __isCharTrimable: function(buf, pos){ + const isTrim = (buf, pos) => { + const {timchars} = this.state; + loop1: for(let i = 0; i < timchars.length; i++){ + const timchar = timchars[i]; + for(let j = 0; j < timchar.length; j++){ + if(timchar[j] !== buf[pos+j]) continue loop1; + } + return timchar.length; + } + return 0; + }; + return isTrim(buf, pos); }, // Keep it in case we implement the `cast_int` option // __isInt(value){ diff --git a/packages/csv-parse/lib/api/index.js b/packages/csv-parse/lib/api/index.js index 505cd0964..82d37350d 100644 --- a/packages/csv-parse/lib/api/index.js +++ b/packages/csv-parse/lib/api/index.js @@ -8,15 +8,9 @@ const isRecordEmpty = function(record){ return record.every((field) => field == null || field.toString && field.toString().trim() === ''); }; -// white space characters -// https://en.wikipedia.org/wiki/Whitespace_character -// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types -// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff -const tab = 9; -const nl = 10; // \n, 0x0A in hexadecimal, 10 in decimal -const np = 12; -const cr = 13; // \r, 0x0D in hexadécimal, 13 in decimal -const space = 32; +const cr = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal +const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal + const boms = { // Note, the following are equals: // Buffer.from("\ufeff") @@ -161,7 +155,7 @@ const transform = function(original_options = {}) { if(this.state.commenting === false && this.__isQuote(buf, pos)){ if(this.state.quoting === true){ const nextChr = buf[pos+quote.length]; - const isNextChrTrimable = rtrim && this.__isCharTrimable(nextChr); + const isNextChrTrimable = rtrim && this.__isCharTrimable(buf, pos+quote.length); const isNextChrComment = comment !== null && this.__compareBytes(comment, buf, pos+quote.length, nextChr); const isNextChrDelimiter = this.__isDelimiter(buf, pos+quote.length, nextChr); const isNextChrRecordDelimiter = record_delimiter.length === 0 ? this.__autoDiscoverRecordDelimiter(buf, pos+quote.length) : this.__isRecordDelimiter(nextChr, buf, pos+quote.length); @@ -271,7 +265,7 @@ const transform = function(original_options = {}) { } if(this.state.commenting === false){ if(max_record_size !== 0 && this.state.record_length + this.state.field.length > max_record_size){ - const err = this.__error( + return this.__error( new CsvError('CSV_MAX_RECORD_SIZE', [ 'Max Record Size:', 'record exceed the maximum number of tolerated bytes', @@ -279,15 +273,14 @@ const transform = function(original_options = {}) { `at line ${this.info.lines}`, ], this.options, this.__infoField()) ); - if(err !== undefined) return err; } } - const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(chr); + const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 || !this.__isCharTrimable(buf, pos); // rtrim in non quoting is handle in __onField const rappend = rtrim === false || this.state.wasQuoting === false; if(lappend === true && rappend === true){ this.state.field.append(chr); - }else if(rtrim === true && !this.__isCharTrimable(chr)){ + }else if(rtrim === true && !this.__isCharTrimable(buf, pos)){ return this.__error( new CsvError('CSV_NON_TRIMABLE_CHAR_AFTER_CLOSING_QUOTE', [ 'Invalid Closing Quote:', @@ -295,6 +288,11 @@ const transform = function(original_options = {}) { `at line ${this.info.lines}`, ], this.options, this.__infoField()) ); + }else{ + if(lappend === false){ + pos += this.__isCharTrimable(buf, pos) - 1; + } + continue; } } if(end === true){ @@ -551,8 +549,19 @@ const transform = function(original_options = {}) { return [undefined, field]; }, // Helper to test if a character is a space or a line delimiter - __isCharTrimable: function(chr){ - return chr === space || chr === tab || chr === cr || chr === nl || chr === np; + __isCharTrimable: function(buf, pos){ + const isTrim = (buf, pos) => { + const {timchars} = this.state; + loop1: for(let i = 0; i < timchars.length; i++){ + const timchar = timchars[i]; + for(let j = 0; j < timchar.length; j++){ + if(timchar[j] !== buf[pos+j]) continue loop1; + } + return timchar.length; + } + return 0; + }; + return isTrim(buf, pos); }, // Keep it in case we implement the `cast_int` option // __isInt(value){ diff --git a/packages/csv-parse/lib/api/init_state.js b/packages/csv-parse/lib/api/init_state.js index 28783e3a5..2f42ee666 100644 --- a/packages/csv-parse/lib/api/init_state.js +++ b/packages/csv-parse/lib/api/init_state.js @@ -1,6 +1,16 @@ import ResizeableBuffer from '../utils/ResizeableBuffer.js'; +// white space characters +// https://en.wikipedia.org/wiki/Whitespace_character +// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types +// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff +const np = 12; +const cr = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal +const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal +const space = 32; +const tab = 9; + const init_state = function(options){ return { bomSkipped: false, @@ -34,7 +44,14 @@ const init_state = function(options){ recordDelimiterMaxLength: options.record_delimiter.length === 0 ? 2 : Math.max(...options.record_delimiter.map((v) => v.length)), trimChars: [Buffer.from(' ', options.encoding)[0], Buffer.from('\t', options.encoding)[0]], wasQuoting: false, - wasRowDelimiter: false + wasRowDelimiter: false, + timchars: [ + Buffer.from(Buffer.from([cr], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([nl], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([np], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([space], 'utf8').toString(), options.encoding), + Buffer.from(Buffer.from([tab], 'utf8').toString(), options.encoding), + ] }; }; diff --git a/packages/csv-parse/test/option.trim.coffee b/packages/csv-parse/test/option.trim.coffee index fb6d8a9e2..f74c67048 100644 --- a/packages/csv-parse/test/option.trim.coffee +++ b/packages/csv-parse/test/option.trim.coffee @@ -198,3 +198,30 @@ describe 'no trim', -> ' 28392898392, 1974,8.8392926E7,D EF , 23 , 2050-11-27' ].join('\n') parser.end() + +describe 'options', -> + + it.skip 'with encoding', (next) -> + parse Buffer.from('a, a', 'utf8'), + encoding: 'utf8' + trim: true + , (err, records) -> + # records[0].map (record) -> console.log Buffer.from(record, 'utf16le') + records.should.eql [['a', 'a']] unless err + next err + + it 'ltrim with encoding', (next) -> + parse Buffer.from('ф, ф', 'utf16le'), + encoding: 'utf16le' + trim: true + , (err, records) -> + records.should.eql [['ф', 'ф']] unless err + next err + + it 'rtrim with encoding', (next) -> + parse Buffer.from('ф ,ф', 'utf16le'), + encoding: 'utf16le' + trim: true + , (err, records) -> + records.should.eql [['ф', 'ф']] unless err + next err