Skip to content

Commit

Permalink
Improve object handling & testing of ensure_unicode (#9059)
Browse files Browse the repository at this point in the history
* Use `elif` for `decode` in `ensure_unicode`

* Handle Python Buffer Protocol in `ensure_unicode`

Any other arbitrary object (like `bytearray`- or `memoryview`-based
objects) can be decoded to `unicode` via `codecs.decode`. This is
analogous to what is done in `ensure_bytes`, so handle this case here.
If this also fails, then raise as usual.

* Include `ensure_unicode` tests for various objects

* Clarify error messages

* Use `uint8` in `array` tests

This is more consistent with the other tests, which also use this type,
though `int8` also works.

* Pass `bytes` directly to `array`

It appears that `array` already interprets this correctly. This should also
make the code easier to read for other maintainers.

* Use `from array import array`

Avoids the `array.array` spelling, which is a tad verbose.
  • Loading branch information
jakirkham committed May 11, 2022
1 parent 5fbda77 commit 5f74a6c
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 5 deletions.
29 changes: 27 additions & 2 deletions dask/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import array
import datetime
import functools
import operator
import pickle
from array import array

import pytest
from tlz import curry
Expand All @@ -21,6 +21,7 @@
ensure_bytes,
ensure_dict,
ensure_set,
ensure_unicode,
extra_titles,
factors,
format_bytes,
Expand All @@ -47,7 +48,7 @@


def test_ensure_bytes():
data = [b"1", "1", memoryview(b"1"), bytearray(b"1"), array.array("b", [49])]
data = [b"1", "1", memoryview(b"1"), bytearray(b"1"), array("B", b"1")]
for d in data:
result = ensure_bytes(d)
assert isinstance(result, bytes)
Expand All @@ -67,6 +68,30 @@ def test_ensure_bytes_pyarrow_buffer():
assert isinstance(result, bytes)


def test_ensure_unicode():
    """Check that ``ensure_unicode`` returns the str ``"1"`` for each input kind.

    The inputs cover ``bytes``, ``str``, ``memoryview``, ``bytearray`` and an
    unsigned-byte ``array``, all of which hold the single character "1".
    """
    inputs = (
        b"1",
        "1",
        memoryview(b"1"),
        bytearray(b"1"),
        array("B", b"1"),
    )
    for value in inputs:
        text = ensure_unicode(value)
        assert isinstance(text, str)
        assert text == "1"


def test_ensure_unicode_ndarray():
    """Check that ``ensure_unicode`` decodes a NumPy array viewing ``b"123"``.

    The test is skipped when NumPy cannot be imported.
    """
    numpy = pytest.importorskip("numpy")
    # View the raw bytes as an array of dtype "u1" via frombuffer.
    raw = numpy.frombuffer(b"123", dtype="u1")
    decoded = ensure_unicode(raw)
    assert isinstance(decoded, str)
    assert decoded == "123"


def test_ensure_unicode_pyarrow_buffer():
    """Check that ``ensure_unicode`` decodes a pyarrow buffer wrapping ``b"123"``.

    The test is skipped when pyarrow cannot be imported.
    """
    pyarrow = pytest.importorskip("pyarrow")
    wrapped = pyarrow.py_buffer(b"123")
    decoded = ensure_unicode(wrapped)
    assert isinstance(decoded, str)
    assert decoded == "123"


def test_getargspec():
def func(x, y):
pass
Expand Down
13 changes: 10 additions & 3 deletions dask/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import codecs
import functools
import inspect
import os
Expand Down Expand Up @@ -921,7 +922,7 @@ def ensure_bytes(s) -> bytes:
return bytes(s)
except Exception as e:
raise TypeError(
f"Object {s} is neither a bytes object nor has an encode method"
f"Object {s} is neither a bytes object nor can be encoded to bytes"
) from e


Expand All @@ -935,9 +936,15 @@ def ensure_unicode(s) -> str:
"""
if isinstance(s, str):
return s
if hasattr(s, "decode"):
elif hasattr(s, "decode"):
return s.decode()
raise TypeError(f"Object {s} is neither a str object nor has an decode method")
else:
try:
return codecs.decode(s)
except Exception as e:
raise TypeError(
f"Object {s} is neither a str object nor can be decoded to str"
) from e


def digit(n, k, base):
Expand Down

0 comments on commit 5f74a6c

Please sign in to comment.