diff --git a/Cargo.lock b/Cargo.lock index cc74e3b4cd..8e78865e4b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -981,6 +981,15 @@ version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +[[package]] +name = "email_address" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2153bd83ebc09db15bcbdc3e2194d901804952e3dc96967e1cd3b0c5c32d112" +dependencies = [ + "serde", +] + [[package]] name = "encode_unicode" version = "0.3.6" @@ -1800,7 +1809,7 @@ dependencies = [ "cached", "check-if-email-exists", "doc-comment", - "fast_chemail", + "email_address", "futures", "glob", "html5ever", diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md index 8ca8bb76a9..fec66319fa 100644 --- a/docs/TROUBLESHOOTING.md +++ b/docs/TROUBLESHOOTING.md @@ -55,21 +55,21 @@ Or, you can accept all content/MIME types: `--headers "accept=*/*"`. See more info about the Accept header [over at MDN](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept). - ## Unreachable Mail Address We use https://github.com/reacherhq/check-if-email-exists for email checking. You can test your mail address with curl: -``` + +```bash curl -X POST \ 'https://api.reacher.email/v0/check_email' \ -H 'content-type: application/json' \ -H 'authorization: test_api_token' \ -d '{"to_email": "box@domain.test"}' ``` -Some settings on your mail server (such as SPF Policy, DNSBL) may prevent your email from being verified. -If you have an error with checking a working email, you can disable this check using the -[commandline parameter](https://github.com/lycheeverse/lychee#commandline-parameters) `--exclude-mail`. - - +Some settings on your mail server (such as `SPF` Policy, `DNSBL`) may prevent +your email from being verified. 
If you have an error with checking a working +email, you can disable this check using the [commandline +parameter](https://github.com/lycheeverse/lychee#commandline-parameters) +`--exclude-mail`. diff --git a/examples/builder/builder.rs b/examples/builder/builder.rs index 62ec4b1d13..8ff9dd9075 100644 --- a/examples/builder/builder.rs +++ b/examples/builder/builder.rs @@ -9,9 +9,9 @@ use std::{collections::HashSet, time::Duration}; #[allow(clippy::trivial_regex)] async fn main() -> Result<()> { // Excludes - let excludes = Some(RegexSet::new(&[r"example"]).unwrap()); + let excludes = Some(RegexSet::new([r"example"]).unwrap()); // Includes take precedence over excludes - let includes = Some(RegexSet::new(&[r"example.com"]).unwrap()); + let includes = Some(RegexSet::new([r"example.com"]).unwrap()); // Set custom request headers let mut headers = HeaderMap::new(); diff --git a/fixtures/TEST_EMAIL_QUERY_PARAMS.html b/fixtures/TEST_EMAIL_QUERY_PARAMS.html new file mode 100644 index 0000000000..7b2c170c2f --- /dev/null +++ b/fixtures/TEST_EMAIL_QUERY_PARAMS.html @@ -0,0 +1,14 @@ + + + Lychee Test + + +

+ Please email + hello@example.org + for any questions. +

+ + diff --git a/fixtures/TEST_EMAIL_QUERY_PARAMS.md b/fixtures/TEST_EMAIL_QUERY_PARAMS.md new file mode 100644 index 0000000000..8bf285f464 --- /dev/null +++ b/fixtures/TEST_EMAIL_QUERY_PARAMS.md @@ -0,0 +1 @@ +Please email [hello@example.org](mailto:hello@example.org?subject=%5BHello%5D) for any questions. diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index f72c2effb8..6978234288 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -127,6 +127,34 @@ mod cli { ) } + #[test] + fn test_email_html_with_subject() -> Result<()> { + let mut cmd = main_command(); + let input = fixtures_path().join("TEST_EMAIL_QUERY_PARAMS.html"); + + cmd.arg("--dump") + .arg(input) + .assert() + .success() + .stdout(contains("hello@example.org?subject=%5BHello%5D")); + + Ok(()) + } + + #[test] + fn test_email_markdown_with_subject() -> Result<()> { + let mut cmd = main_command(); + let input = fixtures_path().join("TEST_EMAIL_QUERY_PARAMS.md"); + + cmd.arg("--dump") + .arg(input) + .assert() + .success() + .stdout(contains("hello@example.org?subject=%5BHello%5D")); + + Ok(()) + } + /// Test that a GitHub link can be checked without specifying the token. #[test] fn test_check_github_no_token() -> Result<()> { diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index 5e2232f74d..fe39eceffd 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -18,7 +18,7 @@ version = "0.10.1" [dependencies] check-if-email-exists = "0.9.0" -fast_chemail = "0.9.6" +email_address = "0.2.4" glob = "0.3.0" http = "0.2.8" linkify = "0.9.0" diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 00afb9dc85..31fdeeaedb 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -549,8 +549,13 @@ impl Client { } /// Check a mail address, or equivalently a `mailto` URI. + /// + /// URIs may contain query parameters (e.g. `contact@example.com?subject="Hello"`), + /// which are ignored by this check. 
They are not part of the mail address + /// and are instead passed to a mail client. pub async fn check_mail(&self, uri: &Uri) -> Status { - let input = CheckEmailInput::new(uri.as_str().to_owned()); + let address = uri.url.path().to_string(); + let input = CheckEmailInput::new(address); let result = &(check_email(&input).await); if let Reachable::Invalid = result.is_reachable { diff --git a/lychee-lib/src/filter/mod.rs b/lychee-lib/src/filter/mod.rs index 421d43feab..07f6a2b06d 100644 --- a/lychee-lib/src/filter/mod.rs +++ b/lychee-lib/src/filter/mod.rs @@ -304,7 +304,7 @@ mod tests { #[test] fn test_overwrite_false_positives() { let includes = Includes { - regex: RegexSet::new(&[r"http://www.w3.org/1999/xhtml"]).unwrap(), + regex: RegexSet::new([r"http://www.w3.org/1999/xhtml"]).unwrap(), }; let filter = Filter { includes: Some(includes), @@ -316,7 +316,7 @@ mod tests { #[test] fn test_include_regex() { let includes = Includes { - regex: RegexSet::new(&[r"foo.example.com"]).unwrap(), + regex: RegexSet::new([r"foo.example.com"]).unwrap(), }; let filter = Filter { includes: Some(includes), @@ -344,7 +344,7 @@ mod tests { #[test] fn test_exclude_regex() { let excludes = Excludes { - regex: RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.com"]).unwrap(), + regex: RegexSet::new([r"github.com", r"[a-z]+\.(org|net)", r"@example.com"]).unwrap(), }; let filter = Filter { excludes: Some(excludes), @@ -361,10 +361,10 @@ mod tests { #[test] fn test_exclude_include_regex() { let includes = Includes { - regex: RegexSet::new(&[r"foo.example.com"]).unwrap(), + regex: RegexSet::new([r"foo.example.com"]).unwrap(), }; let excludes = Excludes { - regex: RegexSet::new(&[r"example.com"]).unwrap(), + regex: RegexSet::new([r"example.com"]).unwrap(), }; let filter = Filter { includes: Some(includes), diff --git a/lychee-lib/src/helpers/path.rs b/lychee-lib/src/helpers/path.rs index 3fbd1df683..b2766162aa 100644 --- a/lychee-lib/src/helpers/path.rs +++ 
b/lychee-lib/src/helpers/path.rs @@ -97,8 +97,8 @@ fn join(base: PathBuf, dst: &Path) -> PathBuf { // // Unfortunately requires real files for `fs::canonicalize`. pub(crate) fn contains(parent: &PathBuf, child: &PathBuf) -> Result { - let parent = fs::canonicalize(&parent)?; - let child = fs::canonicalize(&child)?; + let parent = fs::canonicalize(parent)?; + let child = fs::canonicalize(child)?; Ok(child.starts_with(parent)) } diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index a7ffa89265..2cb5dbbf5a 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -43,6 +43,9 @@ pub enum ErrorKind { /// Invalid Github URL #[error("Github URL is invalid: {0}")] InvalidGithubUrl(String), + /// The input is empty and not accepted as a valid URL + #[error("URL cannot be empty")] + EmptyUrl, /// The given string can not be parsed into a valid URL, e-mail address, or file path #[error("Cannot parse string `{1}` as website url: {0}")] ParseUrl(#[source] url::ParseError, String), @@ -180,6 +183,7 @@ impl Hash for ErrorKind { Self::InvalidGithubUrl(s) => s.hash(state), Self::DirTraversal(e) => e.to_string().hash(state), Self::FileNotFound(e) => e.to_string_lossy().hash(state), + Self::EmptyUrl => "Empty URL".hash(state), Self::ParseUrl(e, s) => (e.to_string(), s).hash(state), Self::InvalidURI(u) => u.hash(state), Self::InvalidUrlFromPath(p) => p.hash(state), diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index ed91ba1fb4..c64ff463ff 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -49,7 +49,7 @@ impl TryFrom<&PathBuf> for InputContent { fn try_from(path: &PathBuf) -> std::result::Result { let input = - fs::read_to_string(&path).map_err(|e| ErrorKind::ReadFileInput(e, path.clone()))?; + fs::read_to_string(path).map_err(|e| ErrorKind::ReadFileInput(e, path.clone()))?; Ok(Self { source: InputSource::String(input.clone()), diff --git a/lychee-lib/src/types/mail.rs 
b/lychee-lib/src/types/mail.rs index 4fec0443d7..584c700260 100644 --- a/lychee-lib/src/types/mail.rs +++ b/lychee-lib/src/types/mail.rs @@ -1,7 +1,7 @@ use check_if_email_exists::{CheckEmailOutput, Reachable}; /// A crude way to extract error details from the mail output. -/// This was added because `CheckEmailOutput` doesn't impl `Display` +/// This was added because `CheckEmailOutput` doesn't impl `Display`. pub(crate) fn error_from_output(o: &CheckEmailOutput) -> String { if let Err(_e) = o.misc.as_ref() { return "Error occurred connecting to this email server via SMTP".to_string(); diff --git a/lychee-lib/src/types/uri/valid.rs b/lychee-lib/src/types/uri/valid.rs index 61e4112a34..318086d284 100644 --- a/lychee-lib/src/types/uri/valid.rs +++ b/lychee-lib/src/types/uri/valid.rs @@ -1,6 +1,6 @@ use std::{convert::TryFrom, fmt::Display, net::IpAddr}; -use fast_chemail::parse_email; +use email_address::EmailAddress; use ip_network::Ipv6Network; use serde::{Deserialize, Serialize}; use url::Url; @@ -191,16 +191,48 @@ impl TryFrom for Uri { impl TryFrom<&str> for Uri { type Error = ErrorKind; + /// Create a new URI from a string + /// + /// Note: + /// We do not handle relative URLs here, as we do not know the base URL. + /// Furthermore paths also cannot be resolved, as we do not know the file system. + /// + /// # Errors + /// + /// Returns an error if the string is not a valid URI + /// fn try_from(s: &str) -> Result { - let s = s.trim_start_matches("mailto:"); - // Silently ignore mail parse errors as they are very common and expected for most URIs - if parse_email(s).is_err() { - match Url::parse(s) { - Ok(uri) => Ok(uri.into()), - Err(url_err) => Err(ErrorKind::ParseUrl(url_err, s.to_owned())), + // Empty strings are accepted when being parsed with `Url::parse`, + // but we don't want to accept them because there is no clear definition + // of "validity" in this case. 
+ if s.is_empty() { + return Err(ErrorKind::EmptyUrl); + } + + match Url::parse(s) { + Ok(uri) => Ok(uri.into()), + Err(err) => { + // This could be a relative URL or a mail address or something + // else entirely. Try the mail address check first, as it's the + // most common case. Note that we use a relatively weak check + // here because + // - `fast_chemail::parse_email` does not accept parameters + // (`foo@example?subject=bar`), which are common for website + // contact forms + // - `check_if_email_exists` does additional spam detection, + // which we only want to execute when checking the email + // addresses, but not when printing all links with `--dump`. + if EmailAddress::is_valid(s) { + // Use the `mailto:` scheme for mail addresses, + // which will allow `Url::parse` to parse them. + if let Ok(uri) = Url::parse(&format!("mailto:{s}")) { + return Ok(uri.into()); + }; + }; + + // We do not handle relative URLs here, as we do not know the base URL. + Err(ErrorKind::ParseUrl(err, s.to_owned())) } - } else { - Ok(Url::parse(&format!("mailto:{s}")).unwrap().into()) } } } @@ -242,7 +274,7 @@ mod tests { } #[test] - fn test_uri_from_str() { + fn test_uri_from_url() { assert!(Uri::try_from("").is_err()); assert_eq!( Uri::try_from("https://example.com"), @@ -252,6 +284,10 @@ mod tests { Uri::try_from("https://example.com/@test/testing"), Ok(website("https://example.com/@test/testing")) ); + } + + #[test] + fn test_uri_from_email_str() { assert_eq!( Uri::try_from("mail@example.com"), Ok(mail("mail@example.com")) @@ -260,6 +296,10 @@ mod tests { Uri::try_from("mailto:mail@example.com"), Ok(mail("mail@example.com")) ); + assert_eq!( + Uri::try_from("mail@example.com?foo=bar"), + Ok(mail("mail@example.com?foo=bar")) + ); } #[test]