forked from numpy/numpy
/
tokenize.h
87 lines (69 loc) · 2.35 KB
/
tokenize.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_
#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_
#include <Python.h>
#include "numpy/ndarraytypes.h"
#include "textreading/stream.h"
#include "textreading/parser_config.h"
#ifdef __cplusplus
extern "C" {
#endif
/*
 * States of the tokenizer's character-by-character state machine.
 * The tokenizer walks the input and transitions between these states
 * as it recognizes field boundaries, quoting, and line endings.
 */
typedef enum {
    /* Initialization of fields */
    TOKENIZE_INIT,
    /* Decide whether the next field is quoted (presumably by peeking at
     * the first character -- confirm against the tokenizer implementation) */
    TOKENIZE_CHECK_QUOTED,
    /* Main field parsing states */
    TOKENIZE_UNQUOTED,
    TOKENIZE_UNQUOTED_WHITESPACE,
    TOKENIZE_QUOTED,
    /* Handling of two character control sequences (except "\r\n") */
    TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE,
    /* Line end handling */
    TOKENIZE_LINE_END,
    TOKENIZE_EAT_CRLF, /* "\r\n" support (carriage return, line feed) */
    TOKENIZE_GOTO_LINE_END,
} tokenizer_parsing_state;
/*
 * Location and quoting information for a single parsed field.
 * `offset` indexes into `tokenizer_state.field_buffer`; field length is
 * derived from consecutive offsets (see the `fields` comment in
 * `tokenizer_state` below).
 *
 * NOTE(review): uses `bool` but this header does not include <stdbool.h>;
 * presumably it is compiled as C++ or gets `bool` via an earlier include --
 * confirm before including this header from plain C.
 */
typedef struct {
    /* Start of the field within the shared field buffer */
    size_t offset;
    /* True if the field was quoted in the input */
    bool quoted;
} field_info;
/*
 * Complete state of one tokenizer run: the current state-machine position,
 * the chunk of raw input being scanned, a growable UCS4 buffer that field
 * contents are copied into, and the array of parsed field locations.
 * Initialized by `npy_tokenizer_init` and released by `npy_tokenizer_clear`.
 */
typedef struct {
    /* Current state of the parsing state machine */
    tokenizer_parsing_state state;
    /* Either TOKENIZE_UNQUOTED or TOKENIZE_UNQUOTED_WHITESPACE: */
    tokenizer_parsing_state unquoted_state;
    /* Unicode "kind" (character width) of the input buffer --
     * NOTE(review): inferred from the name; confirm against the
     * tokenizer implementation */
    int unicode_kind;
    /* Status flag for the input buffer; semantics are defined by the
     * implementation file, not visible in this header */
    int buf_state;
    /* the buffer we are currently working on */
    char *pos;  /* current read position within the buffer */
    char *end;  /* end of the valid buffer contents */
    /*
     * Space to copy words into. Due to `add_field` not growing the buffer
     * but writing a \0 termination, the buffer must always be two larger
     * (add_field can be called twice if a row ends in a delimiter: "123,").
     * The first byte beyond the current word is always NUL'ed on write, the
     * second byte is there to allow easy appending of an additional empty
     * word at the end (this word is also NUL terminated).
     */
    npy_intp field_buffer_length;  /* allocated length of `field_buffer` */
    npy_intp field_buffer_pos;     /* current write position in `field_buffer` */
    Py_UCS4 *field_buffer;         /* UCS4 storage holding all field contents */
    /*
     * Fields, including information about the field being quoted. This
     * always includes one "additional" empty field. The length of a field
     * is equal to `fields[i+1].offset - fields[i].offset - 1`.
     *
     * The tokenizer assumes at least one field is allocated.
     */
    npy_intp num_fields;   /* number of fields in the current row */
    npy_intp fields_size;  /* allocated capacity of `fields` */
    field_info *fields;    /* per-field offset/quoted records (see field_info) */
} tokenizer_state;
/*
 * Release resources held by `ts` (presumably frees `field_buffer` and
 * `fields` allocated by `npy_tokenizer_init` -- confirm against the
 * implementation). Safe to call only on an initialized state.
 */
NPY_NO_EXPORT void
npy_tokenizer_clear(tokenizer_state *ts);
/*
 * Initialize `ts` for tokenizing with the given parser configuration.
 * Returns an int status -- NOTE(review): the success/failure convention
 * (likely 0 on success, -1 on error per CPython style) is not visible in
 * this header; confirm against the implementation. Pair with
 * `npy_tokenizer_clear` to release the allocated buffers.
 */
NPY_NO_EXPORT int
npy_tokenizer_init(tokenizer_state *ts, parser_config *config);
/*
 * Tokenize input from stream `s` into `ts` according to `config`,
 * filling `ts->fields` / `ts->field_buffer` with the parsed fields.
 * Returns an int status -- NOTE(review): exact return semantics
 * (success/error/end-of-input distinction) are defined in the
 * implementation file; confirm before relying on them.
 */
NPY_NO_EXPORT int
npy_tokenize(stream *s, tokenizer_state *ts, parser_config *const config);
#ifdef __cplusplus
}
#endif
#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_ */