BUG: Ensure correct behavior for rows ending in delimiter in loadtxt #22847

Merged · 1 commit · Dec 21, 2022
4 changes: 3 additions & 1 deletion numpy/core/src/multiarray/textreading/tokenize.cpp
@@ -45,7 +45,8 @@ copy_to_field_buffer(tokenizer_state *ts,
                      const UCS *chunk_start, const UCS *chunk_end)
 {
     npy_intp chunk_length = chunk_end - chunk_start;
-    npy_intp size = chunk_length + ts->field_buffer_pos + 2;
+    /* Space for length +1 termination, +2 additional padding for add_field */
+    npy_intp size = chunk_length + ts->field_buffer_pos + 3;
 
     if (NPY_UNLIKELY(ts->field_buffer_length < size)) {
         npy_intp alloc_size = grow_size_and_multiply(&size, 32, sizeof(Py_UCS4));
@@ -104,6 +105,7 @@ add_field(tokenizer_state *ts)
     ts->num_fields += 1;
     /* Ensure this (currently empty) word is NUL terminated. */
     ts->field_buffer[ts->field_buffer_pos] = '\0';
+    assert(ts->field_buffer_length > ts->field_buffer_pos);
     return 0;
 }
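
To make the new sizing concrete, here is a minimal worked sketch of the arithmetic in Python. It mirrors only the comments in the hunks above; the variable names loosely model the tokenizer state and are not NumPy's actual implementation.

    # Illustrative arithmetic for the row "123," (not NumPy's real code).
    chunk_length = len("123")      # code points copied for the word
    field_buffer_pos = 0           # write position before the copy
    size = chunk_length + field_buffer_pos + 3
    # +1: the NUL terminating "123"
    # +2: padding for add_field, which writes a NUL each time it runs but
    #     never grows the buffer, and can run twice when a row ends in a
    #     delimiter (once for "123", once for the trailing empty field)
    assert size == 6               # 3 code points + 3 reserved slots

The assert added to add_field then checks the same invariant at the write site: the NUL it stores must land inside the buffer that copy_to_field_buffer reserved.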

5 changes: 3 additions & 2 deletions numpy/core/src/multiarray/textreading/tokenize.h
@@ -46,8 +46,9 @@ typedef struct {
     char *pos;
     char *end;
     /*
-     * Space to copy words into. The buffer must always be at least two NUL
-     * entries longer (8 bytes) than the actual word (including initially).
+     * Space to copy words into. Due to `add_field` not growing the buffer
+     * but writing a \0 termination, the buffer must always be two larger
+     * (add_field can be called twice if a row ends in a delimiter: "123,").
      * The first byte beyond the current word is always NUL'ed on write, the
      * second byte is there to allow easy appending of an additional empty
      * word at the end (this word is also NUL terminated).
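
The "123," case called out in this comment is user-visible through loadtxt: a row ending in a delimiter carries a trailing empty field, so add_field runs once for the word and once more for the empty field. A quick demonstration, assuming behavior consistent with the new test added below:

    import numpy as np

    # Two delimiters imply three fields; the trailing empty field is kept.
    res = np.loadtxt(["1,2,"], delimiter=",", dtype=bytes)
    print(res)    # expected: [b'1' b'2' b'']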
12 changes: 12 additions & 0 deletions numpy/lib/tests/test_loadtxt.py
@@ -1011,3 +1011,15 @@ def test_control_characters_as_bytes():
     """Byte control characters (comments, delimiter) are supported."""
     a = np.loadtxt(StringIO("#header\n1,2,3"), comments=b"#", delimiter=b",")
     assert_equal(a, [1, 2, 3])
+
+
+@pytest.mark.filterwarnings('ignore::UserWarning')
+def test_field_growing_cases():
+    # Test empty field appending/growing (each field still takes 1 character)
+    # to see if the final field appending does not create issues.
+    res = np.loadtxt([""], delimiter=",", dtype=bytes)
+    assert len(res) == 0
+
+    for i in range(1, 1024):
+        res = np.loadtxt(["," * i], delimiter=",", dtype=bytes)
+        assert len(res) == i+1
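
Sweeping i up to 1024 presumably walks the field buffer across several growth boundaries of grow_size_and_multiply (which the tokenize.cpp hunk calls with a minimum of 32 elements), so the trailing add_field is exercised both just below and just above each reallocation point.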