forked from FasterXML/jackson-dataformats-text
-
Notifications
You must be signed in to change notification settings - Fork 0
/
CsvParser.java
1238 lines (1098 loc) · 41.1 KB
/
CsvParser.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
package com.fasterxml.jackson.dataformat.csv;
import com.fasterxml.jackson.core.*;
import com.fasterxml.jackson.core.base.ParserMinimalBase;
import com.fasterxml.jackson.core.json.DupDetector;
import com.fasterxml.jackson.core.json.JsonReadContext;
import com.fasterxml.jackson.core.util.ByteArrayBuilder;
import com.fasterxml.jackson.dataformat.csv.impl.CsvDecoder;
import com.fasterxml.jackson.dataformat.csv.impl.CsvIOContext;
import com.fasterxml.jackson.dataformat.csv.impl.TextBuffer;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.math.BigDecimal;
import java.math.BigInteger;
/**
* {@link JsonParser} implementation used to expose CSV documents
* in form that allows other Jackson functionality to deal
* with it.
*<p>
* Implementation is based on a state-machine that pulls information
* using {@link CsvDecoder}.
*/
public class CsvParser
extends ParserMinimalBase
{
// @since 2.9.9: upper bound on the number of header columns that header-line
// processing will read or skip; protects against bugs and DoS via overly wide headers
private final static int MAX_COLUMNS = 99999;
/**
* Enumeration that defines all togglable features for CSV parsers.
* Each constant owns one bit (see {@link #getMask()}) derived from its
* declaration position, so the order of constants must not be changed.
*/
public enum Feature
implements FormatFeature
{
/**
* Feature determines whether spaces around separator characters
* (commas) are to be automatically trimmed before being reported
* or not.
* Note that this does NOT force trimming of possible white space from
* within double-quoted values, but only those surrounding unquoted
* values (white space outside of double-quotes is never included regardless
* of trimming).
*<p>
* Default value is false, as per <a href="http://tools.ietf.org/html/rfc4180">RFC-4180</a>.
*/
TRIM_SPACES(false),
/**
* Feature that determines how stream of records (usually CSV lines, but sometimes
* multiple lines when linefeeds are included in quoted values) is exposed:
* either as a sequence of Objects (false), or as an Array of Objects (true).
* Using stream of Objects is convenient when using
* <code>ObjectMapper.readValues(...)</code>
* and array of Objects convenient when binding to <code>List</code>s or
* arrays of values.
*<p>
* Default value is false, meaning that by default a CSV document is exposed as
* a sequence of root-level Object entries.
*/
WRAP_AS_ARRAY(false),
/**
* Feature that allows ignoring of unmappable "extra" columns; that is, values for
* columns that appear after columns for which types are defined. When disabled,
* an exception is thrown for such column values, but if enabled, they are
* silently ignored.
*<p>
* Feature is disabled by default.
*/
IGNORE_TRAILING_UNMAPPABLE(false),
/**
* Feature that allows skipping input lines that are completely empty, instead
* of being decoded as lines of just a single column with empty String value (or,
* depending on binding, `null`).
*<p>
* Feature is disabled by default.
*/
SKIP_EMPTY_LINES(false),
/**
* Feature that allows there to be a trailing single extraneous data
* column that is empty. When this feature is disabled, any extraneous
* column, regardless of content will cause an exception to be thrown.
* Disabling this feature is only useful when
* {@link #IGNORE_TRAILING_UNMAPPABLE} is also disabled, since that feature
* is checked first and skips all extra columns.
*<p>
* Feature is enabled by default.
*/
ALLOW_TRAILING_COMMA(true),
/**
* Feature that allows accepting "hash comments" by default, similar to
* {@link CsvSchema#withAllowComments(boolean)}. If enabled, such comments
* are by default allowed on all columns of all documents.
*<p>
* Feature is disabled by default.
*
* @since 2.10
*/
ALLOW_COMMENTS(false),
/**
* Feature that allows failing (with a {@link CsvMappingException}) in cases
* where number of column values encountered is less than number of columns
* declared in active schema ("missing columns").
*<p>
* Note that this feature has precedence over {@link #INSERT_NULLS_FOR_MISSING_COLUMNS}
*<p>
* Feature is disabled by default.
*/
FAIL_ON_MISSING_COLUMNS(false),
/**
* Feature that allows "inserting" virtual key / `null` value pairs in case
* a row contains fewer columns than declared by configured schema.
* This typically has the effect of forcing an explicit `null` assignment (or
* corresponding "null value", if so configured) at databinding level.
* If disabled, no extra work is done and values for "missing" columns are
* not exposed as part of the token stream.
*<p>
* Note that this feature is only considered if
* {@link #FAIL_ON_MISSING_COLUMNS}
* is disabled.
*<p>
* Feature is disabled by default.
*/
INSERT_NULLS_FOR_MISSING_COLUMNS(false),
;
// Whether the feature is on when no explicit configuration is given
final boolean _defaultState;
// Single-bit mask of this feature: 1 shifted left by the constant's ordinal
final int _mask;
/**
* Method that calculates bit set (flags) of all features that
* are enabled by default.
*
* @return Bitwise OR of the masks of all features whose default state is enabled
*/
public static int collectDefaults()
{
int flags = 0;
for (Feature f : values()) {
if (f.enabledByDefault()) {
flags |= f.getMask();
}
}
return flags;
}
// Records the default state and derives the bit mask from declaration order
private Feature(boolean defaultState) {
_defaultState = defaultState;
_mask = (1 << ordinal());
}
// Default on/off state given when the constant was declared
@Override
public boolean enabledByDefault() { return _defaultState; }
// True if this feature's bit is set in the given flag word
@Override
public boolean enabledIn(int flags) { return (flags & _mask) != 0; }
// Single-bit mask identifying this feature
@Override
public int getMask() { return _mask; }
}
private final static CsvSchema EMPTY_SCHEMA;
static {
EMPTY_SCHEMA = CsvSchema.emptySchema();
}
/*
/**********************************************************************
/* State constants
/**********************************************************************
*/
/**
* Initial state before anything is read from document.
*/
protected final static int STATE_DOC_START = 0;
/**
* State before logical start of a record, in which next
* token to return will be {@link JsonToken#START_OBJECT}
* (or if no Schema is provided, {@link JsonToken#START_ARRAY}).
*/
protected final static int STATE_RECORD_START = 1;
/**
* State in which next entry will be available, returning
* either {@link JsonToken#FIELD_NAME} or value
* (depending on whether entries are expressed as
* Objects or just Arrays); or
* matching close marker.
*/
protected final static int STATE_NEXT_ENTRY = 2;
/**
* State in which value matching field name will
* be returned.
*/
protected final static int STATE_NAMED_VALUE = 3;
/**
* State in which "unnamed" value (entry in an array)
* will be returned, if one available; otherwise
* end-array is returned.
*/
protected final static int STATE_UNNAMED_VALUE = 4;
/**
* State in which a column value has been determined to be of
* an array type, and will need to be split into multiple
* values. This can currently only occur for named values.
*/
protected final static int STATE_IN_ARRAY = 5;
/**
* State in which we have encountered more column values than there should be,
* and need to basically skip extra values if the caller tries to advance parser
* state.
*/
protected final static int STATE_SKIP_EXTRA_COLUMNS = 6;
/**
* State in which we should expose name token for a "missing column"
* (for which placeholder `null` value is to be added as well);
* see {@link Feature#INSERT_NULLS_FOR_MISSING_COLUMNS} for details.
*/
protected final static int STATE_MISSING_NAME = 7;
/**
* State in which we should expose `null` value token as a value for
* "missing" column;
* see {@link Feature#INSERT_NULLS_FOR_MISSING_COLUMNS} for details.
*/
protected final static int STATE_MISSING_VALUE = 8;
/**
* State in which end marker is returned; either
* null (if no array wrapping), or
* {@link JsonToken#END_ARRAY} for wrapping.
* This step will loop, returning series of nulls
* if {@link #nextToken} is called multiple times.
*/
protected final static int STATE_DOC_END = 9;
/*
/**********************************************************************
/* Configuration
/**********************************************************************
*/
/**
* Bit set of currently enabled CSV {@link Feature}s (combination of
* their masks).
*/
protected int _formatFeatures;
/**
* Definition of columns being read.
*/
protected CsvSchema _schema;
/**
* Number of columns defined by schema; refreshed whenever
* {@link #setSchema} assigns a schema.
*/
protected int _columnCount = 0;
/*
/**********************************************************************
/* State
/**********************************************************************
*/
/**
* Information about parser context, context in which
* the next token is to be parsed (root, array, object).
*/
protected JsonReadContext _parsingContext;
/**
* Name of column that we exposed most recently, accessible after
* {@link JsonToken#FIELD_NAME} as well as value tokens immediately
* following field name.
*/
protected String _currentName;
/**
* String value for the current column, if accessed.
*/
protected String _currentValue;
/**
* Index of the column we are exposing
*/
protected int _columnIndex;
/**
* Current logical state of the parser; one of <code>STATE_</code>
* constants.
*/
protected int _state = STATE_DOC_START;
/**
* We will hold on to decoded binary data, for duration of
* current event, so that multiple calls to
* {@link #getBinaryValue} will not need to decode data more
* than once.
*/
protected byte[] _binaryValue;
/**
* Pointer to the first character of the next array value to return;
* negative once the last value of the cell has been returned.
*/
protected int _arrayValueStart;
/**
* Contents of the cell, to be split into distinct array values.
*/
protected String _arrayValue;
/**
* Separator String used for splitting {@link #_arrayValue} into
* individual array values.
*/
protected String _arraySeparator;
/**
* String that denotes a `null` value, as defined by the active schema;
* column values equal to it are reported as {@link JsonToken#VALUE_NULL}.
* If null, no such check is made.
*/
protected String _nullValue;
/*
/**********************************************************************
/* Helper objects
/**********************************************************************
*/
/**
* Thing that actually reads the CSV content
*/
protected final CsvDecoder _reader;
/**
* Buffer that contains contents of all values after processing
* of doubled-quotes, escaped characters.
*/
protected final TextBuffer _textBuffer;
// NOTE(review): not referenced in this part of the file; presumably a lazily
// created builder used when decoding binary values -- confirm against its users
protected ByteArrayBuilder _byteArrayBuilder;
/*
/**********************************************************************
/* Life-cycle
/**********************************************************************
*/
/**
 * Creates a parser that reads CSV content from the given {@link Reader}.
 *
 * @param readCtxt Read context handed to the base class
 * @param ioCtxt I/O context that supplies the shared text buffer
 * @param stdFeatures Bit set of format-independent stream read features
 * @param csvFeatures Bit set of CSV-specific {@link Feature}s
 * @param schema Schema to use; passed on to {@link #setSchema}
 * @param reader Source of CSV content; must not be null
 *
 * @throws IllegalArgumentException if {@code reader} is null
 */
public CsvParser(ObjectReadContext readCtxt, CsvIOContext ioCtxt,
        int stdFeatures, int csvFeatures, CsvSchema schema,
        Reader reader)
{
    super(readCtxt, stdFeatures);
    if (reader == null) {
        throw new IllegalArgumentException("Can not pass `null` as `java.io.Reader` to read from");
    }
    _formatFeatures = csvFeatures;

    // Duplicate detection is only set up if strict detection was requested
    DupDetector dupDetector = null;
    if (StreamReadFeature.STRICT_DUPLICATE_DETECTION.enabledIn(stdFeatures)) {
        dupDetector = DupDetector.rootDetector(this);
    }
    _parsingContext = JsonReadContext.createRootContext(dupDetector);

    _textBuffer = ioCtxt.csvTextBuffer();
    _reader = new CsvDecoder(ioCtxt, this, reader, schema, _textBuffer,
            stdFeatures, csvFeatures);
    // decoder must exist before this call, since schema is forwarded to it
    setSchema(schema);
}
/*
/**********************************************************
/* Versioned
/**********************************************************
*/
/**
* Returns the version constant declared by {@code PackageVersion}.
*/
@Override
public Version version() {
return PackageVersion.VERSION;
}
/*
/**********************************************************
/* Overridden methods
/**********************************************************
*/
/**
* Only {@link CsvSchema} instances can be used with this parser;
* anything else (including null) yields false.
*/
@Override
public boolean canUseSchema(FormatSchema schema) {
return (schema instanceof CsvSchema);
}
/**
 * Assigns the schema to use for decoding and refreshes everything that is
 * derived from it: the "null value" marker, the column count and the
 * schema known to the underlying decoder.
 *
 * @param schema Schema to use: a {@link CsvSchema}, or null to fall back
 *    to the empty schema. Any other type is handed to the superclass
 *    implementation (presumably rejected there -- confirm).
 */
@Override
public void setSchema(FormatSchema schema)
{
    if (schema instanceof CsvSchema) {
        _schema = (CsvSchema) schema;
    } else if (schema == null) {
        // Must assign the field, not the parameter: otherwise `_schema` is left
        // as-is, which is null when called from the constructor (NPE below)
        _schema = EMPTY_SCHEMA;
    } else {
        super.setSchema(schema);
    }
    // All derived state is taken from whatever schema is now active
    _nullValue = _schema.getNullValueString();
    _columnCount = _schema.size();
    _reader.setSchema(_schema);
}
/**
* Delegates to the underlying decoder, passing the given {@code Writer}
* through.
*
* @return Whatever the decoder's {@code releaseBuffered} reports
*/
@Override
public int releaseBuffered(Writer out) throws IOException {
return _reader.releaseBuffered(out);
}
// Closed state is tracked by the underlying decoder
@Override
public boolean isClosed() { return _reader.isClosed(); }
// Closing the parser closes the underlying decoder
@Override
public void close() throws IOException { _reader.close(); }
/*
/**********************************************************
/* FormatFeature support
/**********************************************************
*/
/**
* Returns the bit set of currently enabled CSV {@link Feature}s.
*/
@Override
public int formatReadFeatures() {
return _formatFeatures;
}
/*
/***************************************************
/* Public API, configuration
/***************************************************
*/
/**
 * Turns on the given CSV feature
 * (see {@link Feature} for the list of features).
 *
 * @param f Feature to enable
 * @return This parser, to allow call chaining
 */
public JsonParser enable(Feature f)
{
    final int bit = f.getMask();
    _formatFeatures = _formatFeatures | bit;
    return this;
}
/**
 * Turns off the given CSV feature
 * (see {@link Feature} for the list of features).
 *
 * @param f Feature to disable
 * @return This parser, to allow call chaining
 */
public JsonParser disable(Feature f)
{
    final int bit = f.getMask();
    _formatFeatures = _formatFeatures & ~bit;
    return this;
}
/**
 * Turns the given CSV feature on or off, depending on {@code state}
 * (see {@link Feature} for the list of features).
 *
 * @param f Feature to change
 * @param state True to enable the feature, false to disable it
 * @return This parser, to allow call chaining
 */
public JsonParser configure(Feature f, boolean state)
{
    if (!state) {
        disable(f);
        return this;
    }
    enable(f);
    return this;
}
/**
 * Checks whether the given CSV {@link Feature} is currently enabled
 * for this parser.
 *
 * @param f Feature to check
 * @return True if the feature's bit is set in the parser's feature flags
 */
public boolean isEnabled(Feature f) {
    return f.enabledIn(_formatFeatures);
}
/**
* Accessor for getting active schema definition: it may be
* "empty" (no column definitions), and is intended to never be null
* by defaulting to an empty schema (and default configuration).
*
* @return Schema most recently assigned via {@link #setSchema}
*/
@Override
public CsvSchema getSchema() {
return _schema;
}
/*
/**********************************************************
/* Location info
/**********************************************************
*/
/**
* Returns the context (root, array or object) in which the next token
* is to be parsed.
*/
@Override
public TokenStreamContext getParsingContext() {
return _parsingContext;
}
// Location tracking is done by the underlying decoder; simply delegate
@Override
public JsonLocation getTokenLocation() {
return _reader.getTokenLocation();
}
// Location tracking is done by the underlying decoder; simply delegate
@Override
public JsonLocation getCurrentLocation() {
return _reader.getCurrentLocation();
}
// Input source is owned by the underlying decoder; simply delegate
@Override
public Object getInputSource() {
return _reader.getInputSource();
}
/*
/**********************************************************
/* Parsing, basic
/**********************************************************
*/
/**
 * Overridden to support coercion of a plain String value into an array,
 * for cases where the schema does not specify the actual type.
 *
 * @return True if the current token is {@link JsonToken#START_ARRAY}, or
 *    if the current value could be (and now has been) converted into the
 *    start of an array; false otherwise
 */
@Override
public boolean isExpectedStartArrayToken() {
    final JsonToken current = _currToken;
    if (current == null) {
        return false;
    }
    switch (current.id()) {
    case JsonTokenId.ID_START_ARRAY:
        return true;
    case JsonTokenId.ID_FIELD_NAME:
    case JsonTokenId.ID_START_OBJECT:
    case JsonTokenId.ID_END_OBJECT:
    case JsonTokenId.ID_END_ARRAY:
        return false;
    default:
        // scalar value: may still be coerced, see below
    }
    if (_columnIndex >= _columnCount) {
        // 30-Dec-2014, tatu: Seems like it should be possible to allow this
        //   in non-array-wrapped case too (for 2.5), so let's try that:
        if (current != JsonToken.VALUE_STRING) {
            return false;
        }
        _startArray(CsvSchema.Column.PLACEHOLDER);
        return true;
    }
    // Within declared columns: coerce only if column is essentially "untyped"
    final CsvSchema.Column column = _schema.column(_columnIndex);
    if (column.getType() != CsvSchema.ColumnType.STRING) {
        return false;
    }
    _startArray(column);
    return true;
}
/**
* Returns the name of the column that was exposed most recently.
*/
@Override
public String currentName() throws IOException {
return _currentName;
}
/**
* Replaces the name that {@link #currentName} reports.
*/
@Override
public void overrideCurrentName(String name) {
_currentName = name;
}
/**
 * Advances the parser by one token, dispatching on the current
 * <code>STATE_</code> value, and records the result as the current token.
 *
 * @return Next token; or null once the end of the document has been
 *    reached (and any wrapper array has been closed)
 *
 * @throws IOException for read problems or malformed content
 * @throws IllegalStateException if the parser is in an unknown state
 */
@Override
public JsonToken nextToken() throws IOException
{
    // Any decoded binary value only applies to the token being left behind
    _binaryValue = null;
    switch (_state) {
    case STATE_DOC_START:
        return (_currToken = _handleStartDoc());
    case STATE_RECORD_START:
        return (_currToken = _handleRecordStart());
    case STATE_NEXT_ENTRY:
        return (_currToken = _handleNextEntry());
    case STATE_NAMED_VALUE:
        return (_currToken = _handleNamedValue());
    case STATE_UNNAMED_VALUE:
        return (_currToken = _handleUnnamedValue());
    case STATE_IN_ARRAY:
        return (_currToken = _handleArrayValue());
    case STATE_SKIP_EXTRA_COLUMNS:
        // Need to just skip whatever remains
        return (_currToken = _skipUntilEndOfLine());
    case STATE_MISSING_NAME:
        return (_currToken = _handleMissingName());
    case STATE_MISSING_VALUE:
        return (_currToken = _handleMissingValue());
    case STATE_DOC_END: {
        _reader.close();
        // Current token must track what is returned here too, so that it
        // does not keep reporting the last token of the final record
        if (_parsingContext.inRoot()) {
            return (_currToken = null);
        }
        // should always be in array, actually... but:
        final boolean inArray = _parsingContext.inArray();
        _parsingContext = _parsingContext.getParent();
        return (_currToken = inArray ? JsonToken.END_ARRAY : JsonToken.END_OBJECT);
    }
    default:
        throw new IllegalStateException("Unrecognized parser state: " + _state);
    }
}
/*
/**********************************************************
/* Parsing, optimized methods
/**********************************************************
*/
/**
 * Advances to the next token and checks whether it is a
 * {@link JsonToken#FIELD_NAME} with the given name.
 *
 * @param str Name to compare against
 * @return True if the next token is a field name equal to {@code str}
 */
@Override
public boolean nextFieldName(SerializableString str) throws IOException {
    if (_state != STATE_NEXT_ENTRY) {
        // unlikely, but verify just in case
        if (nextToken() != JsonToken.FIELD_NAME) {
            return false;
        }
        return str.getValue().equals(currentName());
    }
    // Expected case, FIELD_NAME likely: skip the dispatch of nextToken()
    _binaryValue = null;
    final JsonToken token = _handleNextEntry();
    _currToken = token;
    return (token == JsonToken.FIELD_NAME) && str.getValue().equals(_currentName);
}
/**
 * Advances to the next token and returns its name if it is a
 * {@link JsonToken#FIELD_NAME}.
 *
 * @return Name of the field if the next token is a field name; null otherwise
 */
@Override
public String nextFieldName() throws IOException
{
    if (_state != STATE_NEXT_ENTRY) {
        // unlikely, but verify just in case
        if (nextToken() != JsonToken.FIELD_NAME) {
            return null;
        }
        return currentName();
    }
    // Expected case, FIELD_NAME likely: skip the dispatch of nextToken()
    _binaryValue = null;
    final JsonToken token = _handleNextEntry();
    _currToken = token;
    return (token == JsonToken.FIELD_NAME) ? _currentName : null;
}
/**
 * Advances to the next token and returns its text if it is a
 * {@link JsonToken#VALUE_STRING}.
 *
 * @return Text of the value if the next token is a String value; null otherwise
 */
@Override
public String nextTextValue() throws IOException
{
    _binaryValue = null;
    final JsonToken token;
    switch (_state) {
    case STATE_NAMED_VALUE:
        _currToken = token = _handleNamedValue();
        return (token == JsonToken.VALUE_STRING) ? _currentValue : null;
    case STATE_UNNAMED_VALUE:
        _currToken = token = _handleUnnamedValue();
        return (token == JsonToken.VALUE_STRING) ? _currentValue : null;
    default:
        // not positioned on a value: go through the general path
        token = nextToken();
        return (token == JsonToken.VALUE_STRING) ? getText() : null;
    }
}
/*
/**********************************************************
/* Parsing, helper methods, regular
/**********************************************************
*/
/**
 * Method called to process the expected header line.
 *<p>
 * If the schema already has columns and does not ask for reordering, the
 * header is either validated against the schema (strict headers) or simply
 * skipped. Otherwise a new schema is built from the header names, in the
 * order in which they appear, and installed via {@link #setSchema}.
 *
 * @throws IOException if reading fails, header validation fails, too many
 *    header columns are found, or the header line is empty
 */
protected void _readHeaderLine() throws IOException {
    /*
        When the header line is present and the settings ask for it
        to be processed, two different options are possible:

        a) The schema has been populated.  In this case, build a new
           schema where the order matches the *actual* order in which
           the given CSV file offers its columns, if and only if
           _schema.reordersColumns() is set to true; these are cases where
           the consumer of the CSV file knows about the columns but not
           necessarily the order in which they are defined.

        b) The schema has not been populated.  In this case, build a
           default schema based on the columns found in the header.
     */
    if (_schema.size() > 0 && !_schema.reordersColumns()) {
        if (_schema.strictHeaders()) {
            String name;
            for (CsvSchema.Column column : _schema._columns) {
                name = _reader.nextString();
                if (name == null) {
                    _reportError(String.format("Missing header %s", column.getName()));
                } else if (!column.getName().equals(name)) {
                    _reportError(String.format("Expected header %s, actual header %s", column.getName(), name));
                }
            }
            if ((name = _reader.nextString()) != null) {
                _reportError(String.format("Extra header %s", name));
            }
        } else {
            int allowed = MAX_COLUMNS;
            while (_reader.nextString() != null) {
                // If we don't care about validation, just skip. But protect against infinite loop
                if (--allowed < 0) {
                    _reportError("Internal error: skipped "+MAX_COLUMNS+" header columns");
                }
            }
        }
        return;
    }
    // either the schema is empty or reorder columns flag is set
    String name;
    CsvSchema.Builder builder = _schema.rebuild().clearColumns();
    int count = 0;
    while ((name = _reader.nextString()) != null) {
        // one more thing: always trim names, regardless of config settings
        name = name.trim();
        // See if "old" schema defined type; if so, use that type...
        CsvSchema.Column prev = _schema.column(name);
        if (prev != null) {
            builder.addColumn(name, prev.getType());
        } else {
            builder.addColumn(name);
        }
        if (++count > MAX_COLUMNS) {
            _reportError("Internal error: reached maximum of "+MAX_COLUMNS+" header columns");
        }
    }
    // Ok: did we get any columns?
    CsvSchema newSchema = builder.build();
    int size = newSchema.size();
    if (size < 2) { // 1 just because we may get 'empty' header name
        String first = (size == 0) ? "" : newSchema.columnName(0).trim();
        if (first.length() == 0) {
            _reportCsvMappingError("Empty header line: can not bind data");
        }
    }
    // otherwise we will use what we got (no need to build the schema a second time)
    setSchema(newSchema);
}
/**
 * Handles the details of getting things set up so that the very first
 * token can be returned: skips leading lines where needed, processes the
 * header line and/or skips the first data row if the schema says so, and
 * then opens either the wrapper array or the first record.
 *
 * @return First token of the document; null for an empty document that is
 *    not wrapped in an array
 */
protected JsonToken _handleStartDoc() throws IOException
{
    // if comments enabled, or skip empty lines, may need to skip leading ones
    _reader.skipLinesWhenNeeded();
    // Header line expected? If so, read and process it
    if (_schema.usesHeader()) {
        _readHeaderLine();
        _reader.skipLinesWhenNeeded();
    }
    // First data line to be skipped? If so, skip it
    if (_schema.skipsFirstDataRow()) {
        _reader.skipLine();
        _reader.skipLinesWhenNeeded();
    }

    final boolean wrapped = Feature.WRAP_AS_ARRAY.enabledIn(_formatFeatures);

    // The one real complication: empty documents (zero bytes) have no entries
    if (!_reader.hasMoreInput()) {
        _state = STATE_DOC_END;
        if (!wrapped) {
            return null;
        }
        // even an empty sequence must still be wrapped in a logical array
        _parsingContext = _reader.childArrayContext(_parsingContext);
        return JsonToken.START_ARRAY;
    }
    if (!wrapped) {
        // no wrapping: same as regular new entry
        return _handleRecordStart();
    }
    _parsingContext = _reader.childArrayContext(_parsingContext);
    _state = STATE_RECORD_START;
    return JsonToken.START_ARRAY;
}
/**
 * Opens a new record: as an Object when the schema defines columns,
 * or as an Array when it does not.
 *
 * @return {@link JsonToken#START_OBJECT} or {@link JsonToken#START_ARRAY}
 */
protected JsonToken _handleRecordStart() throws IOException
{
    _columnIndex = 0;
    if (_columnCount != 0) {
        // schema has columns: record is exposed as an Object
        _parsingContext = _reader.childObjectContext(_parsingContext);
        _state = STATE_NEXT_ENTRY;
        return JsonToken.START_OBJECT;
    }
    // no schema columns: record is exposed as an Array
    _state = STATE_UNNAMED_VALUE;
    _parsingContext = _reader.childArrayContext(_parsingContext);
    return JsonToken.START_ARRAY;
}
/**
 * Reads the next column value of an Object-style record and exposes the
 * name of its column; also detects end of record, missing columns and
 * extra columns. Only called when there is a real schema.
 *
 * @return {@link JsonToken#FIELD_NAME} for a regular column; otherwise
 *    whatever the end-of-row, missing-column or extra-column handling returns
 */
protected JsonToken _handleNextEntry() throws IOException
{
    final String value;
    try {
        value = _reader.nextString();
    } catch (IOException e) {
        // 12-Oct-2015, tatu: Need to resync here as well...
        _state = STATE_SKIP_EXTRA_COLUMNS;
        throw e;
    }
    if (value != null) {
        _currentValue = value;
        if (_columnIndex >= _columnCount) {
            return _handleExtraColumn(value);
        }
        _state = STATE_NAMED_VALUE;
        _currentName = _schema.columnName(_columnIndex);
        return JsonToken.FIELD_NAME;
    }
    // null means end of record or input.
    // 16-Mar-2017, tatu: [dataformat-csv#137] Missing column(s)?
    return (_columnIndex < _columnCount)
            ? _handleMissingColumns()
            : _handleObjectRowEnd();
}
/**
 * Exposes the value that belongs to the field name returned just before:
 * as the start of an array for array-typed columns, as `null` if it
 * matches the configured null value, and as a String otherwise.
 *
 * @return {@link JsonToken#START_ARRAY}, {@link JsonToken#VALUE_NULL}
 *    or {@link JsonToken#VALUE_STRING}
 */
protected JsonToken _handleNamedValue() throws IOException
{
    // 06-Oct-2015, tatu: During recovery, may get past all regular columns,
    //    but we also need to allow access past... sort of.
    final boolean withinSchema = (_columnIndex < _columnCount);
    if (withinSchema) {
        final CsvSchema.Column column = _schema.column(_columnIndex);
        _columnIndex += 1;
        if (column.isArray()) {
            _startArray(column);
            return JsonToken.START_ARRAY;
        }
    }
    _state = STATE_NEXT_ENTRY;
    final boolean isNull = (_nullValue != null) && _nullValue.equals(_currentValue);
    return isNull ? JsonToken.VALUE_NULL : JsonToken.VALUE_STRING;
}
/**
 * Reads and exposes the next value of an Array-style record, or closes
 * the record when there are no more values on the line.
 *
 * @return {@link JsonToken#VALUE_STRING} or {@link JsonToken#VALUE_NULL}
 *    for a value; {@link JsonToken#END_ARRAY} at the end of the record
 */
protected JsonToken _handleUnnamedValue() throws IOException
{
    final String value = _reader.nextString();
    if (value != null) {
        // state remains the same
        _currentValue = value;
        _columnIndex += 1;
        final boolean isNull = (_nullValue != null) && _nullValue.equals(value);
        return isNull ? JsonToken.VALUE_NULL : JsonToken.VALUE_STRING;
    }
    // end of record or input: leave the record's array context
    _parsingContext = _parsingContext.getParent();
    // either another record follows, or this was the end of the whole thing
    _state = _reader.startNewLine() ? STATE_RECORD_START : STATE_DOC_END;
    return JsonToken.END_ARRAY;
}
/**
 * Exposes the next element of a cell that is being split into array
 * values, or closes the array once all elements have been returned.
 *
 * @return {@link JsonToken#VALUE_STRING} or {@link JsonToken#VALUE_NULL}
 *    for an element; {@link JsonToken#END_ARRAY} when done
 */
protected JsonToken _handleArrayValue() throws IOException
{
    final int start = _arrayValueStart;
    if (start < 0) { // last value was already returned
        _parsingContext = _parsingContext.getParent();
        // no arrays in arrays (at least for now), so must be back to named value
        _state = STATE_NEXT_ENTRY;
        return JsonToken.END_ARRAY;
    }
    final int sepIndex = _arrayValue.indexOf(_arraySeparator, start);
    if (sepIndex >= 0) {
        // more values follow this one
        _currentValue = _arrayValue.substring(start, sepIndex);
        _arrayValueStart = sepIndex + _arraySeparator.length();
    } else {
        // last value; negative index doubles as the end marker
        _arrayValueStart = sepIndex;
        if (start != 0) {
            _currentValue = _arrayValue.substring(start);
        } else {
            // No separator at all.
            // 11-Feb-2015, tatu: Tricky, As per [dataformat-csv#66]; empty Strings really
            //   should not emit any values. For now, trimming is used for checking
            if (_arrayValue.isEmpty() || _arrayValue.trim().isEmpty()) {
                _parsingContext = _parsingContext.getParent();
                _state = STATE_NEXT_ENTRY;
                return JsonToken.END_ARRAY;
            }
            _currentValue = _arrayValue;
        }
    }
    if (isEnabled(Feature.TRIM_SPACES)) {
        _currentValue = _currentValue.trim();
    }
    final boolean isNull = (_nullValue != null) && _nullValue.equals(_currentValue);
    return isNull ? JsonToken.VALUE_NULL : JsonToken.VALUE_STRING;
}
/*
/**********************************************************
/* Parsing, helper methods, extra column(s)
/**********************************************************
*/
/**
 * Helper method called when a column value is found beyond the columns
 * the schema declares. Depending on configuration the value is exposed
 * under the "any property" name, skipped along with the rest of the line,
 * tolerated (single empty trailing value), or reported as an error.
 *
 * @param value Extra column value that was read
 * @return {@link JsonToken#FIELD_NAME} when exposed as "any property";
 *    otherwise the token that ends the row
 */
protected JsonToken _handleExtraColumn(String value) throws IOException
{
    // If "any properties" enabled, expose as such
    final String anyName = _schema.getAnyPropertyName();
    if (anyName != null) {
        _currentName = anyName;
        _state = STATE_NAMED_VALUE;
        return JsonToken.FIELD_NAME;
    }
    _currentName = null;
    // In every remaining case the rest of the line is to be skipped
    _state = STATE_SKIP_EXTRA_COLUMNS;

    // With [dataformat-csv#95] we'll simply ignore extra
    if (Feature.IGNORE_TRAILING_UNMAPPABLE.enabledIn(_formatFeatures)) {
        return _skipUntilEndOfLine();
    }
    // 14-Mar-2012, tatu: As per [dataformat-csv#1], let's allow one specific case
    //   of extra: if we get just one all-whitespace entry, that can be just skipped
    String reported = value;
    final boolean firstExtra = (_columnIndex == _columnCount);
    if (firstExtra && Feature.ALLOW_TRAILING_COMMA.enabledIn(_formatFeatures)) {
        reported = reported.trim();
        // an empty value is fine only if end of record (or input) follows it
        if (reported.isEmpty() && (_reader.nextString() == null)) {
            return _handleObjectRowEnd();
        }
    }
    // 21-May-2015, tatu: Need to enter recovery mode, to skip remainder of the line
    return _reportCsvMappingError("Too many entries: expected at most %d (value #%d (%d chars) \"%s\")",
            _columnCount, _columnIndex, reported.length(), reported);
}
/*
/**********************************************************
/* Parsing, helper methods, missing column(s)
/**********************************************************
*/
/**
 * Helper method called when a row ends before values were found for all
 * columns the schema declares. Depending on configuration this fails,
 * starts exposing `null` placeholders for the missing columns, or simply
 * ends the row.
 *
 * @return {@link JsonToken#FIELD_NAME} of the first missing column when
 *    placeholders are inserted; otherwise the token that ends the row
 */
protected JsonToken _handleMissingColumns() throws IOException
{
    final int features = _formatFeatures;
    if (Feature.FAIL_ON_MISSING_COLUMNS.enabledIn(features)) {
        // First: to allow recovery, set states to expose next line, if any
        _handleObjectRowEnd();
        // and then report actual problem
        return _reportCsvMappingError("Not enough column values: expected %d, found %d",
                _columnCount, _columnIndex);
    }
    if (!Feature.INSERT_NULLS_FOR_MISSING_COLUMNS.enabledIn(features)) {
        return _handleObjectRowEnd();
    }
    // expose name of the first missing column; its `null` value comes next
    _state = STATE_MISSING_VALUE;
    _currentName = _schema.columnName(_columnIndex);
    _currentValue = null;
    return JsonToken.FIELD_NAME;
}
/**
 * Moves on to the next "missing" column and exposes its name, or ends
 * the row if all declared columns have been covered.
 *
 * @return {@link JsonToken#FIELD_NAME} for the next missing column;
 *    otherwise the token that ends the row
 */
protected JsonToken _handleMissingName() throws IOException
{
    _columnIndex += 1;
    if (_columnIndex >= _columnCount) {
        return _handleObjectRowEnd();
    }
    _state = STATE_MISSING_VALUE;
    _currentName = _schema.columnName(_columnIndex);
    // _currentValue already set to null earlier
    return JsonToken.FIELD_NAME;
}