diff --git a/numpy/core/src/multiarray/textreading/tokenize.cpp b/numpy/core/src/multiarray/textreading/tokenize.cpp
index d020c2251ea4..9cb32634d825 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.cpp
+++ b/numpy/core/src/multiarray/textreading/tokenize.cpp
@@ -45,7 +45,8 @@ copy_to_field_buffer(tokenizer_state *ts,
         const UCS *chunk_start, const UCS *chunk_end)
 {
     npy_intp chunk_length = chunk_end - chunk_start;
-    npy_intp size = chunk_length + ts->field_buffer_pos + 2;
+    /* Space for length +1 termination, +2 additional padding for add_field */
+    npy_intp size = chunk_length + ts->field_buffer_pos + 3;
 
     if (NPY_UNLIKELY(ts->field_buffer_length < size)) {
         npy_intp alloc_size = grow_size_and_multiply(&size, 32, sizeof(Py_UCS4));
@@ -104,6 +105,7 @@ add_field(tokenizer_state *ts)
     ts->num_fields += 1;
     /* Ensure this (currently empty) word is NUL terminated. */
     ts->field_buffer[ts->field_buffer_pos] = '\0';
+    assert(ts->field_buffer_length > ts->field_buffer_pos);
     return 0;
 }
 
diff --git a/numpy/core/src/multiarray/textreading/tokenize.h b/numpy/core/src/multiarray/textreading/tokenize.h
index d0ea4638329c..53e97760ff9e 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.h
+++ b/numpy/core/src/multiarray/textreading/tokenize.h
@@ -46,8 +46,9 @@ typedef struct {
     char *pos;
     char *end;
     /*
-     * Space to copy words into. The buffer must always be at least two NUL
-     * entries longer (8 bytes) than the actual word (including initially).
+     * Space to copy words into. Due to `add_field` not growing the buffer
+     * but writing a \0 termination, the buffer must always be two larger
+     * (add_field can be called twice if a row ends in a delimiter: "123,").
      * The first byte beyond the current word is always NUL'ed on write, the
      * second byte is there to allow easy appending of an additional empty
      * word at the end (this word is also NUL terminated).
diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py
index 0b8fe3c479c6..819a8dda466d 100644
--- a/numpy/lib/tests/test_loadtxt.py
+++ b/numpy/lib/tests/test_loadtxt.py
@@ -1011,3 +1011,15 @@ def test_control_characters_as_bytes():
     """Byte control characters (comments, delimiter) are supported."""
     a = np.loadtxt(StringIO("#header\n1,2,3"), comments=b"#", delimiter=b",")
     assert_equal(a, [1, 2, 3])
+
+
+@pytest.mark.filterwarnings('ignore::UserWarning')
+def test_field_growing_cases():
+    # Test empty field appending/growing (each field still takes 1 character)
+    # to see if the final field appending does not create issues.
+    res = np.loadtxt([""], delimiter=",", dtype=bytes)
+    assert len(res) == 0
+
+    for i in range(1, 1024):
+        res = np.loadtxt(["," * i], delimiter=",", dtype=bytes)
+        assert len(res) == i+1
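
For illustration, below is a minimal standalone sketch of the sizing invariant the `+ 3` enforces. This is not NumPy's actual code: `toy_state`, `toy_copy`, and `toy_add_field` are hypothetical stand-ins for `tokenizer_state`, `copy_to_field_buffer`, and `add_field`, with plain `realloc` in place of `PyMem_Realloc` and no growth policy. The point it demonstrates: a word of N code points needs N slots plus one for its NUL terminator, plus two more padding slots, because a row ending in a delimiter (e.g. "123,") makes the field-closing step run twice, each time advancing the position and writing another NUL without growing the buffer.

#include <assert.h>
#include <stdlib.h>
#include <string.h>

typedef unsigned int ucs4;              /* stand-in for Py_UCS4 */

typedef struct {
    ucs4 *field_buffer;                 /* words, each NUL terminated */
    size_t field_buffer_length;         /* capacity in code points */
    size_t field_buffer_pos;            /* end of the current word */
} toy_state;

/* Mirrors copy_to_field_buffer's sizing: word + 1 for the terminator,
 * + 2 padding so two subsequent toy_add_field calls stay in bounds. */
static int
toy_copy(toy_state *ts, const ucs4 *start, const ucs4 *end)
{
    size_t chunk = (size_t)(end - start);
    size_t size = chunk + ts->field_buffer_pos + 3;

    if (ts->field_buffer_length < size) {
        ucs4 *grown = realloc(ts->field_buffer, size * sizeof(ucs4));
        if (grown == NULL) {
            return -1;
        }
        ts->field_buffer = grown;
        ts->field_buffer_length = size;
    }
    memcpy(ts->field_buffer + ts->field_buffer_pos, start,
           chunk * sizeof(ucs4));
    ts->field_buffer_pos += chunk;
    ts->field_buffer[ts->field_buffer_pos] = 0;   /* 1st write past word */
    return 0;
}

/* Mirrors add_field: terminates the next (still empty) word, but never
 * grows the buffer -- the write the old "+ 2" sizing could not cover
 * when called a second time. */
static void
toy_add_field(toy_state *ts)
{
    ts->field_buffer_pos += 1;                    /* step past the NUL */
    ts->field_buffer[ts->field_buffer_pos] = 0;   /* terminate empty word */
    assert(ts->field_buffer_length > ts->field_buffer_pos);
}

int main(void)
{
    toy_state ts = {NULL, 0, 0};
    const ucs4 word[] = {'1', '2', '3'};

    /* A row like "123,": copy the word, close it, then close the empty
     * trailing field. Sized "+ 3", the final NUL lands at index 5 of a
     * 6-element buffer; sized "+ 2", it would land one past the end. */
    toy_copy(&ts, word, word + 3);
    toy_add_field(&ts);   /* close "123"; NUL written at index 4 */
    toy_add_field(&ts);   /* close empty field; NUL written at index 5 */

    free(ts.field_buffer);
    return 0;
}

Under these assumptions, the old `+ 2` sizing covers only the first field-closing write; the second one, triggered by the trailing delimiter, writes one element past the allocation. That off-by-one is exactly what the new `test_field_growing_cases` regression test exercises across buffer-growth boundaries, and what the added `assert` in `add_field` makes visible in debug builds.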