
Merge pull request #22847 from charris/backport-22836
BUG: Ensure correct behavior for rows ending in delimiter in loadtxt
charris committed Dec 21, 2022
2 parents c690bcf + 9291c0c commit d6dbe36
Showing 3 changed files with 18 additions and 3 deletions.
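
As context for the diff below, a minimal sketch of the user-visible behavior the fix guards, using the "123," example from the header comment in tokenize.h (the exact output is assumed from the semantics of the new test at the bottom):

    import numpy as np

    # With the fix, a row ending in a delimiter tokenizes into a final
    # empty field rather than overrunning the tokenizer's field buffer.
    res = np.loadtxt(["123,"], delimiter=",", dtype=bytes)
    assert len(res) == 2   # fields: b"123" plus one trailing empty field
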
4 changes: 3 additions & 1 deletion numpy/core/src/multiarray/textreading/tokenize.cpp
@@ -45,7 +45,8 @@ copy_to_field_buffer(tokenizer_state *ts,
         const UCS *chunk_start, const UCS *chunk_end)
 {
     npy_intp chunk_length = chunk_end - chunk_start;
-    npy_intp size = chunk_length + ts->field_buffer_pos + 2;
+    /* Space for length +1 termination, +2 additional padding for add_field */
+    npy_intp size = chunk_length + ts->field_buffer_pos + 3;
 
     if (NPY_UNLIKELY(ts->field_buffer_length < size)) {
         npy_intp alloc_size = grow_size_and_multiply(&size, 32, sizeof(Py_UCS4));
@@ -104,6 +105,7 @@ add_field(tokenizer_state *ts)
     ts->num_fields += 1;
     /* Ensure this (currently empty) word is NUL terminated. */
     ts->field_buffer[ts->field_buffer_pos] = '\0';
+    assert(ts->field_buffer_length > ts->field_buffer_pos);
     return 0;
 }
 
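
A worked example (hypothetical values) of the size computation changed above: for the row "123," arriving with an empty field buffer, copy_to_field_buffer must reserve the chunk plus three extra slots, one NUL terminating the word and two padding slots so that up to two trailing add_field calls can each write a terminator without growing the buffer:

    chunk = "123"               # word being copied into the field buffer
    field_buffer_pos = 0        # the buffer starts out empty
    chunk_length = len(chunk)   # 3
    # +1 NUL for this word, +2 padding because add_field may run twice
    # when the row ends in a delimiter ("123,")
    size = chunk_length + field_buffer_pos + 3
    assert size == 6
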
5 changes: 3 additions & 2 deletions numpy/core/src/multiarray/textreading/tokenize.h
@@ -46,8 +46,9 @@ typedef struct {
     char *pos;
     char *end;
     /*
-     * Space to copy words into. The buffer must always be at least two NUL
-     * entries longer (8 bytes) than the actual word (including initially).
+     * Space to copy words into. Due to `add_field` not growing the buffer
+     * but writing a \0 termination, the buffer must always be two larger
+     * (add_field can be called twice if a row ends in a delimiter: "123,").
      * The first byte beyond the current word is always NUL'ed on write, the
      * second byte is there to allow easy appending of an additional empty
      * word at the end (this word is also NUL terminated).
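
To make the "called twice" case concrete, a small Python model of the tokenizer's field semantics (an illustration, not NumPy's implementation): every delimiter closes the current field and starts a new one, so a line ending in a delimiter still yields a final empty field, i.e. two back-to-back add_field-style events:

    def split_fields(line, delimiter=","):
        # N delimiters always produce N + 1 fields; a trailing delimiter
        # therefore leaves one extra empty field at the end of the row.
        fields = [""]
        for ch in line:
            if ch == delimiter:
                fields.append("")   # close the current field, open the next
            else:
                fields[-1] += ch
        return fields

    assert split_fields("123,") == ["123", ""]
    assert split_fields(",,") == ["", "", ""]
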
12 changes: 12 additions & 0 deletions numpy/lib/tests/test_loadtxt.py
@@ -1011,3 +1011,15 @@ def test_control_characters_as_bytes():
"""Byte control characters (comments, delimiter) are supported."""
a = np.loadtxt(StringIO("#header\n1,2,3"), comments=b"#", delimiter=b",")
assert_equal(a, [1, 2, 3])


@pytest.mark.filterwarnings('ignore::UserWarning')
def test_field_growing_cases():
# Test empty field appending/growing (each field still takes 1 character)
# to see if the final field appending does not create issues.
res = np.loadtxt([""], delimiter=",", dtype=bytes)
assert len(res) == 0

for i in range(1, 1024):
res = np.loadtxt(["," * i], delimiter=",", dtype=bytes)
assert len(res) == i+1
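
The loop above encodes the rule that N delimiters produce N + 1 fields across a wide range of row widths. A quicker spot check might probe only a few widths; note that reading 32 as the buffer's growth granularity from grow_size_and_multiply(&size, 32, ...) above is an inference, not documented behavior:

    import numpy as np

    # Widths chosen to straddle a presumed 32-slot allocation boundary.
    for n in (1, 31, 32, 33, 1023):
        assert len(np.loadtxt(["," * n], delimiter=",", dtype=bytes)) == n + 1
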
