Skip to content

Commit

Permalink
add PyStringMethods::encode_utf8
Browse files Browse the repository at this point in the history
  • Loading branch information
davidhewitt committed Feb 5, 2024
1 parent ecb4ecb commit 88a148a
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 5 deletions.
1 change: 1 addition & 0 deletions newsfragments/3800.added.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add `PyStringMethods::encode_utf8`.
39 changes: 34 additions & 5 deletions src/types/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,9 @@ pub trait PyStringMethods<'py> {
/// replaced with `U+FFFD REPLACEMENT CHARACTER`.
fn to_string_lossy(&self) -> Cow<'_, str>;

/// Encodes this string as a Python `bytes` object, using UTF-8 encoding.
///
/// # Errors
///
/// Returns an error if the string cannot be encoded as UTF-8. For example,
/// a Python string holding a lone surrogate such as `'\ud800'` fails to encode.
fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>>;

/// Obtains the raw data backing the Python string.
///
/// If the Python string object was created through legacy APIs, its internal storage format
Expand Down Expand Up @@ -337,6 +340,14 @@ impl<'py> PyStringMethods<'py> for Bound<'py, PyString> {
self.as_borrowed().to_string_lossy()
}

fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>> {
    let py = self.py();
    // SAFETY: `self` is a bound `PyString`, so `as_ptr()` yields a valid string
    // object pointer. Per the CPython C API, `PyUnicode_AsUTF8String` returns
    // either a new reference to a `bytes` object or NULL with an exception set;
    // the NULL case becomes the `Err` variant, and the success case is a
    // `bytes` object, which is what the unchecked downcast relies on.
    unsafe {
        let encoded = ffi::PyUnicode_AsUTF8String(self.as_ptr());
        encoded
            .assume_owned_or_err(py)
            .downcast_into_unchecked::<PyBytes>()
    }
}

#[cfg(not(Py_LIMITED_API))]
unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
self.as_borrowed().data()
Expand Down Expand Up @@ -371,11 +382,7 @@ impl<'a> Borrowed<'a, '_, PyString> {

#[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))]
{
let bytes = unsafe {
ffi::PyUnicode_AsUTF8String(self.as_ptr())
.assume_owned_or_err(self.py())?
.downcast_into_unchecked::<PyBytes>()
};
let bytes = self.encode_utf8()?;
Ok(Cow::Owned(
unsafe { str::from_utf8_unchecked(bytes.as_bytes()) }.to_owned(),
))
Expand Down Expand Up @@ -535,6 +542,28 @@ mod tests {
})
}

#[test]
fn test_encode_utf8_unicode() {
    // Non-ASCII text (CJK plus an emoji outside the BMP) must encode to the
    // same bytes that Rust's own UTF-8 representation holds.
    Python::with_gil(|py| {
        let text = "哈哈🐈";
        let py_string = PyString::new_bound(py, text);
        let encoded = py_string.encode_utf8().unwrap();
        assert_eq!(text.as_bytes(), encoded.as_bytes());
    })
}

#[test]
fn test_encode_utf8_surrogate() {
    // A lone surrogate has no valid UTF-8 encoding, so encoding must fail
    // with an error rather than produce bytes.
    Python::with_gil(|py| {
        let surrogate: PyObject = py.eval(r"'\ud800'", None, None).unwrap().into();
        let bound = surrogate.bind(py);
        let py_string = bound.downcast::<PyString>().unwrap();
        assert!(py_string.encode_utf8().is_err());
    })
}

#[test]
fn test_to_string_lossy() {
Python::with_gil(|py| {
Expand Down

0 comments on commit 88a148a

Please sign in to comment.