Skip to content

Commit bab05e8

Browse files
authored
Merge pull request #639 from Alexander-N/bytes-result
Guard against PyUnicode_AsUTF8AndSize returning null in PyString::as_bytes
2 parents 45eb9f4 + 7a4909b commit bab05e8

File tree

2 files changed

+56
-17
lines changed

2 files changed

+56
-17
lines changed

src/types/string.rs

+27 -17
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ use crate::instance::PyNativeType;
77
use crate::object::PyObject;
88
use crate::types::PyAny;
99
use crate::AsPyPointer;
10+
use crate::IntoPy;
1011
use crate::Python;
11-
use crate::{exceptions, IntoPy};
1212
use crate::{ffi, FromPy};
1313
use std::borrow::Cow;
1414
use std::ops::Index;
@@ -59,37 +59,38 @@ impl PyString {
5959
}
6060

6161
/// Get the Python string as a byte slice.
62+
///
63+
/// Returns a `UnicodeEncodeError` if the input is not valid unicode
64+
/// (containing unpaired surrogates).
6265
#[inline]
63-
pub fn as_bytes(&self) -> &[u8] {
66+
pub fn as_bytes(&self) -> PyResult<&[u8]> {
6467
unsafe {
6568
let mut size: ffi::Py_ssize_t = 0;
6669
let data = ffi::PyUnicode_AsUTF8AndSize(self.0.as_ptr(), &mut size) as *const u8;
67-
// PyUnicode_AsUTF8AndSize would return null if the pointer did not reference a valid
68-
// unicode object, but because we have a valid PyString, assume success
69-
debug_assert!(!data.is_null());
70-
std::slice::from_raw_parts(data, size as usize)
70+
if data.is_null() {
71+
Err(PyErr::fetch(self.py()))
72+
} else {
73+
Ok(std::slice::from_raw_parts(data, size as usize))
74+
}
7175
}
7276
}
7377

7478
/// Convert the `PyString` into a Rust string.
75-
///
76-
/// Returns a `UnicodeDecodeError` if the input is not valid unicode
77-
/// (containing unpaired surrogates).
7879
pub fn to_string(&self) -> PyResult<Cow<str>> {
79-
match std::str::from_utf8(self.as_bytes()) {
80-
Ok(s) => Ok(Cow::Borrowed(s)),
81-
Err(e) => Err(PyErr::from_instance(
82-
exceptions::UnicodeDecodeError::new_utf8(self.py(), self.as_bytes(), e)?,
83-
)),
84-
}
80+
let bytes = self.as_bytes()?;
81+
let string = std::str::from_utf8(bytes)?;
82+
Ok(Cow::Borrowed(string))
8583
}
8684

8785
/// Convert the `PyString` into a Rust string.
8886
///
8987
/// Unpaired surrogates and invalid UTF-8 sequences are
9088
/// replaced with U+FFFD REPLACEMENT CHARACTER.
9189
pub fn to_string_lossy(&self) -> Cow<str> {
92-
String::from_utf8_lossy(self.as_bytes())
90+
// TODO: Handle error of `as_bytes`
91+
// see https://github.com/PyO3/pyo3/pull/634
92+
let bytes = self.as_bytes().unwrap();
93+
String::from_utf8_lossy(bytes)
9394
}
9495
}
9596

@@ -273,7 +274,16 @@ mod test {
273274
let s = "ascii 🐈";
274275
let obj: PyObject = PyString::new(py, s).into();
275276
let py_string = <PyString as PyTryFrom>::try_from(obj.as_ref(py)).unwrap();
276-
assert_eq!(s.as_bytes(), py_string.as_bytes());
277+
assert_eq!(s.as_bytes(), py_string.as_bytes().unwrap());
278+
}
279+
280+
#[test]
281+
fn test_as_bytes_surrogate() {
282+
let gil = Python::acquire_gil();
283+
let py = gil.python();
284+
let obj: PyObject = py.eval(r#"'\ud800'"#, None, None).unwrap().into();
285+
let py_string = <PyString as PyTryFrom>::try_from(obj.as_ref(py)).unwrap();
286+
assert!(py_string.as_bytes().is_err());
277287
}
278288

279289
#[test]

tests/test_string.rs

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
use pyo3::prelude::*;
2+
use pyo3::py_run;
3+
use pyo3::wrap_pyfunction;
4+
5+
mod common;
6+
7+
#[pyfunction]
8+
fn take_str(_s: &str) -> PyResult<()> {
9+
Ok(())
10+
}
11+
12+
#[test]
13+
fn test_unicode_encode_error() {
14+
let gil = Python::acquire_gil();
15+
let py = gil.python();
16+
17+
let take_str = wrap_pyfunction!(take_str)(py);
18+
py_run!(
19+
py,
20+
take_str,
21+
r#"
22+
try:
23+
take_str('\ud800')
24+
except UnicodeEncodeError as e:
25+
error_msg = "'utf-8' codec can't encode character '\\ud800' in position 0: surrogates not allowed"
26+
assert str(e) == error_msg
27+
"#
28+
);
29+
}

0 commit comments

Comments
 (0)