New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: add support for nan-like null strings in string replace #26355
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -1300,7 +1300,9 @@ string_replace_strided_loop( | |||||
|
||||||
PyArray_StringDTypeObject *descr0 = | ||||||
(PyArray_StringDTypeObject *)context->descriptors[0]; | ||||||
int has_null = descr0->na_object != NULL; | ||||||
int has_string_na = descr0->has_string_na; | ||||||
int has_nan_na = descr0->has_nan_na; | ||||||
const npy_static_string *default_string = &descr0->default_string; | ||||||
|
||||||
|
||||||
|
@@ -1330,11 +1332,29 @@ string_replace_strided_loop( | |||||
goto fail; | ||||||
} | ||||||
else if (i1_isnull || i2_isnull || i3_isnull) { | ||||||
if (!has_string_na) { | ||||||
npy_gil_error(PyExc_ValueError, | ||||||
"Null values are not supported as replacement arguments " | ||||||
"for replace"); | ||||||
goto fail; | ||||||
if (has_null && !has_string_na) { | ||||||
if (i2_isnull || i3_isnull) { | ||||||
npy_gil_error(PyExc_ValueError, | ||||||
"Null values are not supported as search " | ||||||
"patterns or replacement strings for " | ||||||
"replace"); | ||||||
goto fail; | ||||||
} | ||||||
else if (i1_isnull) { | ||||||
if (has_nan_na) { | ||||||
if (NpyString_pack_null(oallocator, ops) < 0) { | ||||||
npy_gil_error(PyExc_MemoryError, | ||||||
"Failed to deallocate string in replace"); | ||||||
goto fail; | ||||||
} | ||||||
goto next_step; | ||||||
} | ||||||
else { | ||||||
npy_gil_error(PyExc_ValueError, | ||||||
"Only NaN-like null strings can be used " | ||||||
"as search strings for replace"); | ||||||
} | ||||||
} | ||||||
} | ||||||
else { | ||||||
if (i1_isnull) { | ||||||
|
@@ -1349,32 +1369,50 @@ string_replace_strided_loop( | |||||
} | ||||||
} | ||||||
|
||||||
// conservatively overallocate | ||||||
// TODO check overflow | ||||||
size_t max_size; | ||||||
if (i2s.size == 0) { | ||||||
// interleaving | ||||||
max_size = i1s.size + (i1s.size + 1)*(i3s.size); | ||||||
} | ||||||
else { | ||||||
// replace i2 with i3 | ||||||
max_size = i1s.size * (i3s.size/i2s.size + 1); | ||||||
} | ||||||
char *new_buf = (char *)PyMem_RawCalloc(max_size, 1); | ||||||
Buffer<ENCODING::UTF8> buf1((char *)i1s.buf, i1s.size); | ||||||
Buffer<ENCODING::UTF8> buf2((char *)i2s.buf, i2s.size); | ||||||
Buffer<ENCODING::UTF8> buf3((char *)i3s.buf, i3s.size); | ||||||
Buffer<ENCODING::UTF8> outbuf(new_buf, max_size); | ||||||
{ | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why the new indentation? It already is in the loop. (And it makes reviewing harder...) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's because of the new use of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd probably have gone for top of the for-loop myself, but no big deal... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. While I don't hate the |
||||||
Buffer<ENCODING::UTF8> buf1((char *)i1s.buf, i1s.size); | ||||||
Buffer<ENCODING::UTF8> buf2((char *)i2s.buf, i2s.size); | ||||||
|
||||||
size_t new_buf_size = string_replace( | ||||||
buf1, buf2, buf3, *(npy_int64 *)in4, outbuf); | ||||||
npy_int64 in_count = *(npy_int64*)in4; | ||||||
if (in_count == -1) { | ||||||
in_count = NPY_MAX_INT64; | ||||||
} | ||||||
|
||||||
if (NpyString_pack(oallocator, ops, new_buf, new_buf_size) < 0) { | ||||||
npy_gil_error(PyExc_MemoryError, "Failed to pack string in replace"); | ||||||
goto fail; | ||||||
} | ||||||
npy_int64 found_count = string_count<ENCODING::UTF8>( | ||||||
buf1, buf2, 0, NPY_MAX_INT64); | ||||||
if (found_count == -2) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Yes, it returns -2 due to |
||||||
goto fail; | ||||||
} | ||||||
|
||||||
PyMem_RawFree(new_buf); | ||||||
npy_intp count = Py_MIN(in_count, found_count); | ||||||
|
||||||
Buffer<ENCODING::UTF8> buf3((char *)i3s.buf, i3s.size); | ||||||
|
||||||
// conservatively overallocate | ||||||
// TODO check overflow | ||||||
size_t max_size; | ||||||
if (i2s.size == 0) { | ||||||
// interleaving | ||||||
max_size = i1s.size + (i1s.size + 1)*(i3s.size); | ||||||
} | ||||||
else { | ||||||
// replace i2 with i3 | ||||||
max_size = i1s.size * (i3s.size/i2s.size + 1); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That didn't change, but now that you have Also, am I confused by the division. It seems correct, but a bit overly complicated, since you can use
I.e. we replace at most count items (it might be less, if we can find overlaps with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks. I agree this logic here is poorly motivated and using the |
||||||
} | ||||||
char *new_buf = (char *)PyMem_RawCalloc(max_size, 1); | ||||||
Buffer<ENCODING::UTF8> outbuf(new_buf, max_size); | ||||||
|
||||||
size_t new_buf_size = string_replace( | ||||||
buf1, buf2, buf3, count, outbuf); | ||||||
|
||||||
if (NpyString_pack(oallocator, ops, new_buf, new_buf_size) < 0) { | ||||||
npy_gil_error(PyExc_MemoryError, "Failed to pack string in replace"); | ||||||
goto fail; | ||||||
} | ||||||
|
||||||
PyMem_RawFree(new_buf); | ||||||
} | ||||||
next_step: | ||||||
|
||||||
in1 += strides[0]; | ||||||
in2 += strides[1]; | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1218,6 +1218,7 @@ def test_unary(string_array, unicode_array, function_name): | |
"strip", | ||
"lstrip", | ||
"rstrip", | ||
"replace" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @seberg this change makes sure the error paths are tested. |
||
"zfill", | ||
] | ||
|
||
|
@@ -1230,7 +1231,6 @@ def test_unary(string_array, unicode_array, function_name): | |
"count", | ||
"find", | ||
"rfind", | ||
"replace", | ||
] | ||
|
||
SUPPORTS_NULLS = ( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(just a curious note for now)
I think default strings don't actually hit this, right? The only subtlety (which I don't care about), is that the we don't mutate the default string stored on the dtype probably, but rather insert the same string every time.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah good point; this error message isn't quite right, using a string as a missing string is also supported. Will update the error to match this.
Not sure what you're getting at about mutating strings, but that's why they're static strings that store the string data in a
const
buffer. Anyone mutating it is going out of their way to do so.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was thinking of:
doesn't give a
StringDtype(na_value="parrot")
, I think so "bloats" memory.I don't mind that enough to worry (at least fo rnow, I think this is a niche feature)
EDIT: Sorry, first edit didn't use the same replacemnt as was the na_value... Also, to be clear, I am not sure that should happen!