-
Notifications
You must be signed in to change notification settings - Fork 53
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Allow for non-ASCII decoding in legacy demangling #65
base: main
Are you sure you want to change the base?
Changes from all commits
c9faa0b
3cc40e7
8fdefc2
e9f84c1
3ed564f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -74,26 +74,12 @@ enum DemangleStyle<'a> { | |
V0(v0::Demangle<'a>), | ||
} | ||
|
||
/// De-mangles a Rust symbol into a more readable version | ||
/// | ||
/// This function will take a **mangled** symbol and return a value. When printed, | ||
/// the de-mangled version will be written. If the symbol does not look like | ||
/// a mangled symbol, the original value will be written instead. | ||
/// | ||
/// # Examples | ||
/// | ||
/// ``` | ||
/// use rustc_demangle::demangle; | ||
/// | ||
/// assert_eq!(demangle("_ZN4testE").to_string(), "test"); | ||
/// assert_eq!(demangle("_ZN3foo3barE").to_string(), "foo::bar"); | ||
/// assert_eq!(demangle("foo").to_string(), "foo"); | ||
/// ``` | ||
pub fn demangle(mut s: &str) -> Demangle { | ||
fn demangle_common(s: &str) -> Option<(DemangleStyle, &str)> { | ||
// During ThinLTO LLVM may import and rename internal symbols, so strip out | ||
// those endings first as they're one of the last manglings applied to symbol | ||
// names. | ||
let llvm = ".llvm."; | ||
let mut thinlto_stripped = s; | ||
if let Some(i) = s.find(llvm) { | ||
let candidate = &s[i + llvm.len()..]; | ||
let all_hex = candidate.chars().all(|c| match c { | ||
|
@@ -102,87 +88,120 @@ pub fn demangle(mut s: &str) -> Demangle { | |
}); | ||
|
||
if all_hex { | ||
s = &s[..i]; | ||
thinlto_stripped = &s[..i]; | ||
} | ||
} | ||
|
||
let mut suffix = ""; | ||
let mut style = match legacy::demangle(s) { | ||
Ok((d, s)) => { | ||
suffix = s; | ||
Some(DemangleStyle::Legacy(d)) | ||
} | ||
Err(()) => match v0::demangle(s) { | ||
Ok((d, s)) => { | ||
suffix = s; | ||
Some(DemangleStyle::V0(d)) | ||
} | ||
match legacy::demangle(thinlto_stripped) { | ||
Ok((d, suffix)) => Some((DemangleStyle::Legacy(d), suffix)), | ||
Err(()) => match v0::demangle(thinlto_stripped) { | ||
Ok((d, suffix)) => Some((DemangleStyle::V0(d), suffix)), | ||
// FIXME(eddyb) would it make sense to treat an unknown-validity | ||
// symbol (e.g. one that errored with `RecursedTooDeep`) as | ||
// v0-mangled, and have the error show up in the demangling? | ||
// (that error already gets past this initial check, and therefore | ||
// will show up in the demangling, if hidden behind a backref) | ||
Err(v0::ParseError::Invalid) | Err(v0::ParseError::RecursedTooDeep) => None, | ||
}, | ||
}; | ||
} | ||
} | ||
|
||
// Output like LLVM IR adds extra period-delimited words. See if | ||
// we are in that case and save the trailing words if so. | ||
if !suffix.is_empty() { | ||
if suffix.starts_with('.') && is_symbol_like(suffix) { | ||
// Keep the suffix. | ||
} else { | ||
// Reset the suffix and invalidate the demangling. | ||
suffix = ""; | ||
style = None; | ||
/// De-mangles a Rust symbol into a more readable version | ||
/// | ||
/// This function will take a **mangled** symbol and return a value. When printed, | ||
/// the de-mangled version will be written. If the symbol does not look like | ||
/// a mangled symbol, the original value will be written instead. | ||
/// | ||
/// # Examples | ||
/// | ||
/// ``` | ||
/// use rustc_demangle::demangle; | ||
/// | ||
/// assert_eq!(demangle("_ZN4testE").to_string(), "test"); | ||
/// assert_eq!(demangle("_ZN3foo3barE").to_string(), "foo::bar"); | ||
/// assert_eq!(demangle("foo").to_string(), "foo"); | ||
/// ``` | ||
pub fn demangle(s: &str) -> Demangle { | ||
if let Some((style, remainder)) = demangle_common(s) { | ||
// Output like LLVM IR adds extra period-delimited words. See if | ||
// we are in that case and save the trailing words if so. | ||
if remainder.is_empty() || (remainder.starts_with('.') && is_llvm_suffix_like(remainder)) { | ||
return Demangle { | ||
style: Some(style), | ||
original: s, | ||
suffix: remainder, | ||
}; | ||
} | ||
} | ||
|
||
Demangle { | ||
style, | ||
return Demangle { | ||
style: None, | ||
original: s, | ||
suffix, | ||
suffix: "", | ||
}; | ||
} | ||
|
||
#[cfg(feature = "std")] | ||
fn demangle_partial(s: &str) -> (Demangle, &str) { | ||
if let Some((style, remainder)) = demangle_common(s) { | ||
// Note: suffix is ALWAYS empty because we do not compute the | ||
// LLVM compatibility (nor do we care) | ||
return ( | ||
Demangle { | ||
style: Some(style), | ||
original: s, | ||
suffix: "", | ||
}, | ||
remainder, | ||
); | ||
} | ||
|
||
( | ||
Demangle { | ||
style: None, | ||
original: s, | ||
suffix: "", | ||
}, | ||
s, | ||
) | ||
} | ||
|
||
#[cfg(feature = "std")] | ||
fn demangle_line( | ||
line: &str, | ||
mut line: &str, | ||
output: &mut impl std::io::Write, | ||
include_hash: bool, | ||
) -> std::io::Result<()> { | ||
let mut head = 0; | ||
while head < line.len() { | ||
loop { | ||
// Move to the next potential match | ||
let next_head = match (line[head..].find("_ZN"), line[head..].find("_R")) { | ||
(Some(idx), None) | (None, Some(idx)) => head + idx, | ||
(Some(idx1), Some(idx2)) => head + idx1.min(idx2), | ||
let next_head = match (line.find("_ZN"), line.find("_R")) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this valid for OSX (macOS)? This conflicts with the tests. |
||
(Some(idx), None) | (None, Some(idx)) => idx, | ||
(Some(idx1), Some(idx2)) => idx1.min(idx2), | ||
(None, None) => { | ||
// No more matches... | ||
line.len() | ||
} | ||
}; | ||
output.write_all(line[head..next_head].as_bytes())?; | ||
head = next_head; | ||
// Find the non-matching character. | ||
// | ||
// If we do not find a character, then until the end of the line is the | ||
// thing to demangle. | ||
let match_end = line[head..] | ||
.find(|ch: char| !(ch == '$' || ch == '.' || ch == '_' || ch.is_ascii_alphanumeric())) | ||
.map(|idx| head + idx) | ||
.unwrap_or(line.len()); | ||
|
||
let mangled = &line[head..match_end]; | ||
head = head + mangled.len(); | ||
if let Ok(demangled) = try_demangle(mangled) { | ||
output.write_all(line[..next_head].as_bytes())?; | ||
line = &line[next_head..]; | ||
|
||
if line.is_empty() { | ||
break; | ||
} | ||
|
||
let (demangled, remainder) = demangle_partial(line); | ||
line = remainder; | ||
|
||
if demangled.style.is_some() { | ||
if include_hash { | ||
write!(output, "{}", demangled)?; | ||
} else { | ||
write!(output, "{:#}", demangled)?; | ||
} | ||
} else { | ||
output.write_all(mangled.as_bytes())?; | ||
// there are maybe valid symbols inside this fake one | ||
output.write_all(&line.as_bytes()[..1])?; | ||
line = &line[1..]; | ||
Comment on lines -185 to +204
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note again a subtle difference: this can now identify and write symbols inside what appear to be other symbols. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Wouldn't that accidentally demangle parts of C++ symbols as Rust symbols? If so, that would break C++ demangling for programs that try Rust demangling before C++ demangling to handle both. Trying C++ demangling first is guaranteed to work for legacy Rust symbols, but doesn't demangle the |
||
} | ||
} | ||
Ok(()) | ||
|
@@ -250,7 +269,7 @@ impl<'a> Demangle<'a> { | |
} | ||
} | ||
|
||
fn is_symbol_like(s: &str) -> bool { | ||
fn is_llvm_suffix_like(s: &str) -> bool { | ||
s.chars().all(|c| { | ||
// Once `char::is_ascii_punctuation` and `char::is_ascii_alphanumeric` | ||
// have been stable for long enough, use those instead for clarity | ||
|
@@ -408,6 +427,14 @@ mod tests { | |
t!("_ZN4test1a2bcE", "test::a::bc"); | ||
} | ||
|
||
#[test] | ||
fn demangle_emoji() { | ||
t_err!("🐇"); | ||
t!("_ZN4🐇E", "🐇"); | ||
t_err!("_ZN4🐇"); | ||
t!("_ZN4🐇1a2bcE", "🐇::a::bc"); | ||
} | ||
|
||
#[test] | ||
fn demangle_dollars() { | ||
t!("_ZN4$RP$E", ")"); | ||
|
@@ -564,7 +591,7 @@ mod tests { | |
#[cfg(feature = "std")] | ||
fn demangle_str(input: &str) -> String { | ||
let mut output = Vec::new(); | ||
super::demangle_line(input, &mut output, false); | ||
super::demangle_line(input, &mut output, false).unwrap(); | ||
String::from_utf8(output).unwrap() | ||
} | ||
|
||
|
@@ -577,6 +604,15 @@ mod tests { | |
); | ||
} | ||
|
||
#[test] | ||
#[cfg(feature = "std")] | ||
fn find_multiple_emoji() { | ||
assert_eq!( | ||
demangle_str("_ZN4🐇E.llvm moocow _ZN4🐇E.llvm"), | ||
"🐇.llvm moocow 🐇.llvm" | ||
); | ||
} | ||
|
||
#[test] | ||
#[cfg(feature = "std")] | ||
fn interleaved_new_legacy() { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note a subtle change here: previously, this would strip the ThinLTO data from `s` before assigning it to `original`, which could remove part of the symbol text before it was stored in `original`. This seemed to be erroneous (e.g. if `style` is `None`, I would expect `original` to be unchanged), and leaving it unchanged would truncate the remaining line (since we must now keep the remainder data). It is possible to return to the original behaviour if this change is unwanted.