From 263732072a840bd017835bb5fef422393f5eb57d Mon Sep 17 00:00:00 2001 From: Alexandru Croitor Date: Fri, 3 Jan 2025 18:02:05 +0100 Subject: [PATCH 01/15] feat: Add support for cmake files --- Cargo.lock | 11 +++++++++++ harper-comments/Cargo.toml | 1 + harper-comments/src/comment_parser.rs | 2 ++ 3 files changed, 14 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 40f47aab..59c2f23d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -576,6 +576,7 @@ dependencies = [ "tree-sitter-bash", "tree-sitter-c", "tree-sitter-c-sharp", + "tree-sitter-cmake", "tree-sitter-cpp", "tree-sitter-go", "tree-sitter-haskell", @@ -1730,6 +1731,16 @@ dependencies = [ "tree-sitter", ] +[[package]] +name = "tree-sitter-cmake" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43e478c42991a8893a82295731fca038083894eb38b4eba1f05b43231beb7658" +dependencies = [ + "cc", + "tree-sitter", +] + [[package]] name = "tree-sitter-cpp" version = "0.20.5" diff --git a/harper-comments/Cargo.toml b/harper-comments/Cargo.toml index 15539bfa..1075a860 100644 --- a/harper-comments/Cargo.toml +++ b/harper-comments/Cargo.toml @@ -19,6 +19,7 @@ tree-sitter-javascript = "0.20.1" tree-sitter-go = "0.20.0" tree-sitter-c = "0.20.7" tree-sitter-cpp = "0.20.5" +tree-sitter-cmake = "=0.4.1" tree-sitter-ruby = "0.20.1" tree-sitter-swift = "=0.4.0" tree-sitter-c-sharp = "0.20.0" diff --git a/harper-comments/src/comment_parser.rs b/harper-comments/src/comment_parser.rs index aa1bf757..923b899b 100644 --- a/harper-comments/src/comment_parser.rs +++ b/harper-comments/src/comment_parser.rs @@ -29,6 +29,7 @@ impl CommentParser { "go" => tree_sitter_go::language(), "c" => tree_sitter_c::language(), "cpp" => tree_sitter_cpp::language(), + "cmake" => tree_sitter_cmake::language(), "ruby" => tree_sitter_ruby::language(), "swift" => tree_sitter_swift::language(), "csharp" => tree_sitter_c_sharp::language(), @@ -77,6 +78,7 @@ impl CommentParser { "go" => "go", "c" => "c", "cpp" => "cpp", + "cmake" => "cmake", "h" => "cpp", "rb" => "ruby", "swift" => "swift", From 38fa08aa36126b0759b11bbfc76cb6668bcfaa40 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 6 Jan 2025 01:39:50 +0000 Subject: [PATCH 02/15] build(deps): bump itertools from 0.13.0 to 0.14.0 Bumps [itertools](https://github.com/rust-itertools/itertools) from 0.13.0 to 0.14.0. - [Changelog](https://github.com/rust-itertools/itertools/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-itertools/itertools/compare/v0.13.0...v0.14.0) --- updated-dependencies: - dependency-name: itertools dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- Cargo.lock | 10 +++++----- harper-comments/Cargo.toml | 2 +- harper-core/Cargo.toml | 2 +- harper-ls/Cargo.toml | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 40f47aab..28fc7c36 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -570,7 +570,7 @@ dependencies = [ "harper-core", "harper-html", "harper-tree-sitter", - "itertools 0.13.0", + "itertools 0.14.0", "paste", "tree-sitter", "tree-sitter-bash", @@ -600,7 +600,7 @@ dependencies = [ "fst", "hashbrown 0.15.2", "is-macro", - "itertools 0.13.0", + "itertools 0.14.0", "lazy_static", "levenshtein_automata", "ordered-float", @@ -637,7 +637,7 @@ dependencies = [ "harper-comments", "harper-core", "harper-html", - "itertools 0.13.0", + "itertools 0.14.0", "once_cell", "open", "resolve-path", @@ -912,9 +912,9 @@ dependencies = [ [[package]] name = "itertools" -version = "0.13.0" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" dependencies = [ "either", ] diff --git a/harper-comments/Cargo.toml b/harper-comments/Cargo.toml index 15539bfa..a564d1ff 100644 --- a/harper-comments/Cargo.toml +++ b/harper-comments/Cargo.toml @@ -27,7 +27,7 @@ tree-sitter-lua = "0.0.19" tree-sitter-bash = "0.20.0" tree-sitter-java = "0.20.0" tree-sitter-nix = "0.0.1" -itertools = "0.13.0" +itertools = "0.14.0" tree-sitter-haskell = "0.15.0" [dev-dependencies] diff --git a/harper-core/Cargo.toml b/harper-core/Cargo.toml index 456b9a65..d8224560 100644 --- a/harper-core/Cargo.toml +++ b/harper-core/Cargo.toml @@ -12,7 +12,7 @@ blanket = "0.4.0" fst = "0.4.7" hashbrown = { version = "0.15.2", features = ["serde"] } is-macro = "0.3.6" -itertools = "0.13.0" +itertools = "0.14.0" lazy_static = "1.5.0" ordered-float = { version = "4.6.0", features = ["serde"] } paste = "1.0.14" diff --git a/harper-ls/Cargo.toml b/harper-ls/Cargo.toml index 1bf9f91f..1eed6857 100644 --- a/harper-ls/Cargo.toml +++ b/harper-ls/Cargo.toml @@ -18,7 +18,7 @@ once_cell = "1.20.2" dirs = "5.0.1" anyhow = "1.0.95" serde_json = "1.0.133" -itertools = "0.13.0" +itertools = "0.14.0" tracing = "0.1.41" tracing-subscriber = "0.3.19" resolve-path = "0.1.0" From 91adb6e68f222271a77ced61a85eb04f9ec0bbaf Mon Sep 17 00:00:00 2001 From: Elijah Potter Date: Mon, 6 Jan 2025 08:52:28 -0700 Subject: [PATCH 03/15] refactor(core): move em and en dash rules to their own linter --- harper-core/src/linting/dashes.rs | 89 ++++++++++++++++++++++++++++++ harper-core/src/linting/matcher.rs | 13 ----- harper-core/src/linting/mod.rs | 1 + 3 files changed, 90 insertions(+), 13 deletions(-) create mode 100644 harper-core/src/linting/dashes.rs diff --git a/harper-core/src/linting/dashes.rs b/harper-core/src/linting/dashes.rs new file mode 100644 index 00000000..0aef6e1d --- /dev/null +++ b/harper-core/src/linting/dashes.rs @@ -0,0 +1,89 @@ +use crate::{ + patterns::{EitherPattern, Pattern, SequencePattern}, + Token, TokenStringExt, +}; + +use super::{Lint, LintKind, PatternLinter, Suggestion}; + +pub struct Dashes { + pattern: Box, +} + +impl Default for Dashes { + fn default() -> Self { + let en_dash = SequencePattern::default().then_hyphen().then_hyphen(); + let em_dash = SequencePattern::default() + .then_hyphen() + .then_hyphen() + .then_hyphen(); + + let pattern = EitherPattern::new(vec![Box::new(em_dash), Box::new(en_dash)]); + + Self { + pattern: Box::new(pattern), + } + } +} + +impl PatternLinter for Dashes { + fn pattern(&self) -> &dyn Pattern { + self.pattern.as_ref() + } + + fn match_to_lint(&self, matched_tokens: &[Token], _source: &[char]) -> Lint { + let span = matched_tokens.span().unwrap(); + let lint_kind = LintKind::Formatting; + + match matched_tokens.len() { + 2 => Lint { + span, + lint_kind, + suggestions: vec![Suggestion::ReplaceWith(vec!['–'])], + message: "A sequence of hyphens is not an en dash.".to_owned(), + priority: 63, + }, + 3 => Lint { + span, + lint_kind, + suggestions: vec![Suggestion::ReplaceWith(vec!['—'])], + message: "A sequence of hyphens is not an em dash.".to_owned(), + priority: 63, + }, + _ => panic!("Received unexpected number of tokens."), + } + } + + fn description(&self) -> &'static str { + "Rather than outright using an em dash or en dash, authors often use a sequence of hyphens, expecting them to be condensed.\nThis rule does so." + } +} + +#[cfg(test)] +mod tests { + use crate::linting::tests::{assert_suggestion_count, assert_suggestion_result}; + + use super::Dashes; + + #[test] + fn catches_en_dash() { + assert_suggestion_result( + "pre--Industrial Revolution", + Dashes::default(), + "pre–Industrial Revolution", + ); + } + + #[test] + fn catches_em_dash() { + assert_suggestion_result( + "'There is no box' --- Scott", + Dashes::default(), + "'There is no box' — Scott", + ); + } + + #[test] + fn no_overlaps() { + assert_suggestion_count("'There is no box' --- Scott", Dashes::default(), 1); + } +} diff --git a/harper-core/src/linting/matcher.rs b/harper-core/src/linting/matcher.rs index 6e34785e..82582635 100644 --- a/harper-core/src/linting/matcher.rs +++ b/harper-core/src/linting/matcher.rs @@ -190,19 +190,6 @@ impl Matcher { "take", "a", "decision" => "make a decision" }; - // TODO: Improve the description for this lint specifically. - // We need to be more explicit that we are replacing with an Em dash - triggers.push(Rule { - pattern: vec![pt!(Hyphen), pt!(Hyphen), pt!(Hyphen)], - replace_with: vecword!("—"), - }); - - // Same goes for this En dash - triggers.push(Rule { - pattern: vec![pt!(Hyphen), pt!(Hyphen)], - replace_with: vecword!("–"), - }); - triggers.push(Rule { pattern: vec![pt!("L"), pt!(Period), pt!("L"), pt!(Period), pt!("M")], replace_with: vecword!("large language model"), diff --git a/harper-core/src/linting/mod.rs b/harper-core/src/linting/mod.rs index 87e26c91..03ce9bbe 100644 --- a/harper-core/src/linting/mod.rs +++ b/harper-core/src/linting/mod.rs @@ -4,6 +4,7 @@ mod boring_words; mod capitalize_personal_pronouns; mod compound_words; mod correct_number_suffix; +mod dashes; mod dot_initialisms; mod ellipsis_length; mod linking_verbs; From 6cc4509af8c88e90b25024423e91d46d2aaca89b Mon Sep 17 00:00:00 2001 From: Elijah Potter Date: Mon, 6 Jan 2025 09:39:38 -0700 Subject: [PATCH 04/15] refactor(core): `CompoundWords` is more accurately called `MergeWords` --- harper-core/src/linting/compound_words.rs | 162 ---------------------- harper-core/src/linting/lint_group.rs | 4 +- harper-core/src/linting/matcher.rs | 1 - harper-core/src/linting/merge_words.rs | 100 +++++++++++++ harper-core/src/linting/mod.rs | 3 +- 5 files changed, 104 insertions(+), 166 deletions(-) delete mode 100644 harper-core/src/linting/compound_words.rs create mode 100644 harper-core/src/linting/merge_words.rs diff --git a/harper-core/src/linting/compound_words.rs b/harper-core/src/linting/compound_words.rs deleted file mode 100644 index 1d50f60a..00000000 --- a/harper-core/src/linting/compound_words.rs +++ /dev/null @@ -1,162 +0,0 @@ -use std::sync::Arc; - -use itertools::Itertools; - -use crate::{CharString, Dictionary, Document, FstDictionary, Span}; - -use super::{Lint, LintKind, Linter, Suggestion}; - -pub struct CompoundWords { - dict: Arc, -} - -impl CompoundWords { - pub fn new() -> Self { - Self { - dict: FstDictionary::curated(), - } - } -} - -impl Default for CompoundWords { - fn default() -> Self { - Self::new() - } -} - -impl Linter for CompoundWords { - fn lint(&mut self, document: &Document) -> Vec { - let mut lints = Vec::new(); - - let mut merged_word = CharString::new(); - let mut potential_compounds = Vec::new(); - - for (a, w, b) in document.tokens().tuple_windows() { - if !a.kind.is_word() || !w.kind.is_whitespace() || !b.kind.is_word() { - continue; - } - - let a_chars = document.get_span_content(a.span); - let b_chars = document.get_span_content(b.span); - - // Not super helpful in this case, so we skip it - if matches!(a_chars, ['a']) { - continue; - } - - potential_compounds.clear(); - - merged_word.clear(); - merged_word.extend_from_slice(a_chars); - merged_word.extend_from_slice(b_chars); - - // Check for closed compound words - if self.dict.contains_word(&merged_word) - && !a.kind.is_common_word() - && !b.kind.is_common_word() - { - potential_compounds.push(merged_word.clone()); - } - - if !potential_compounds.is_empty() { - lints.push(Lint { - span: Span::new(a.span.start, b.span.end), - lint_kind: LintKind::Spelling, - suggestions: potential_compounds - .drain(..) - .map(|v| Suggestion::ReplaceWith(v.to_vec())) - .collect(), - message: - "These two words are often combined to form a hyphenated compound word." - .to_owned(), - priority: 63, - }); - } - } - - lints - } - - fn description(&self) -> &str { - "Accidentally inserting a space inside a word is common. This rule looks for valid words that are split by whitespace." - } -} - -#[cfg(test)] -mod tests { - use crate::linting::tests::{assert_lint_count, assert_suggestion_count}; - - use super::CompoundWords; - - #[test] - fn scarecrow() { - assert_lint_count( - "I saw a scare crow in the field today.", - CompoundWords::default(), - 1, - ); - } - - #[test] - fn clean() { - assert_lint_count( - "When referring to the political party, make sure to treat them as a proper noun.", - CompoundWords::default(), - 0, - ); - } - - #[test] - fn bookshelf() { - assert_lint_count( - "I have a big book shelf in my room.", - CompoundWords::default(), - 1, - ); - } - - #[test] - fn sunscreen() { - assert_lint_count( - "Don't forget to apply your sunscreen before going out.", - CompoundWords::default(), - 0, - ); - } - - #[test] - fn birthday() { - assert_lint_count( - "We're having a big party to celebrate the couple's birthday today.", - CompoundWords::default(), - 0, - ); - } - - #[test] - fn hometown() { - assert_lint_count( - "My home town is a beautiful place with many historical land marks.", - CompoundWords::default(), - 2, - ); - } - - #[test] - fn assertions() { - assert_lint_count( - "Make sure to compile with debug ass ertions disabled.", - CompoundWords::default(), - 1, - ); - } - - #[test] - fn break_up() { - assert_suggestion_count( - "Like if you break up words you shouldn't.", - CompoundWords::default(), - 0, - ); - } -} diff --git a/harper-core/src/linting/lint_group.rs b/harper-core/src/linting/lint_group.rs index 753df32d..b6b4aa93 100644 --- a/harper-core/src/linting/lint_group.rs +++ b/harper-core/src/linting/lint_group.rs @@ -5,13 +5,13 @@ use super::an_a::AnA; use super::avoid_curses::AvoidCurses; use super::boring_words::BoringWords; use super::capitalize_personal_pronouns::CapitalizePersonalPronouns; -use super::compound_words::CompoundWords; use super::correct_number_suffix::CorrectNumberSuffix; use super::dot_initialisms::DotInitialisms; use super::ellipsis_length::EllipsisLength; use super::linking_verbs::LinkingVerbs; use super::long_sentences::LongSentences; use super::matcher::Matcher; +use super::merge_words::MergeWords; use super::multiple_sequential_pronouns::MultipleSequentialPronouns; use super::number_suffix_capitalization::NumberSuffixCapitalization; use super::plural_conjugate::PluralConjugate; @@ -182,7 +182,7 @@ create_lint_group_config!( MicrosoftNames => true, AppleNames => true, AzureNames => true, - CompoundWords => true, + MergeWords => true, PluralConjugate => false ); diff --git a/harper-core/src/linting/matcher.rs b/harper-core/src/linting/matcher.rs index 82582635..369a05be 100644 --- a/harper-core/src/linting/matcher.rs +++ b/harper-core/src/linting/matcher.rs @@ -175,7 +175,6 @@ impl Matcher { "that","s" => "that is", "That","s" => "that is", "ms" => "milliseconds", - "t","he" => "the", "the","hing" => "the thing", "The","hing" => "The thing", "need","helps" => "need help", diff --git a/harper-core/src/linting/merge_words.rs b/harper-core/src/linting/merge_words.rs new file mode 100644 index 00000000..104a233f --- /dev/null +++ b/harper-core/src/linting/merge_words.rs @@ -0,0 +1,100 @@ +use std::sync::Arc; + +use itertools::Itertools; + +use crate::{CharString, CharStringExt, Dictionary, Document, FstDictionary, Span}; + +use super::{Lint, LintKind, Linter, Suggestion}; + +pub struct MergeWords { + dict: Arc, +} + +impl MergeWords { + pub fn new() -> Self { + Self { + dict: FstDictionary::curated(), + } + } +} + +impl Default for MergeWords { + fn default() -> Self { + Self::new() + } +} + +impl Linter for MergeWords { + fn lint(&mut self, document: &Document) -> Vec { + let mut lints = Vec::new(); + + let mut merged_word = CharString::new(); + + for (a, w, b) in document.tokens().tuple_windows() { + if !a.kind.is_word() || !w.kind.is_whitespace() || !b.kind.is_word() { + continue; + } + + let a_chars = document.get_span_content(a.span); + let b_chars = document.get_span_content(b.span); + + // Not super helpful in this case, so we skip it + if matches!(a_chars, ['a']) || matches!(b_chars, ['a']) { + continue; + } + + merged_word.clear(); + merged_word.extend_from_slice(&a_chars.to_lower()); + merged_word.extend_from_slice(&b_chars.to_lower()); + + if self.dict.contains_word(&merged_word) + && (!self.dict.contains_word(a_chars) || !self.dict.contains_word(b_chars)) + { + lints.push(Lint { + span: Span::new(a.span.start, b.span.end), + lint_kind: LintKind::Spelling, + suggestions: vec![Suggestion::ReplaceWith(merged_word.to_vec())], + message: "These two words are often combined to form a closed compound word." + .to_owned(), + priority: 63, + }); + } + } + + lints + } + + fn description(&self) -> &str { + "Accidentally inserting a space inside a word is common. This rule looks for valid words that are split by whitespace." + } +} + +#[cfg(test)] +mod tests { + use crate::linting::tests::assert_lint_count; + + use super::MergeWords; + + #[test] + fn clean() { + assert_lint_count( + "When referring to the political party, make sure to treat them as a proper noun.", + MergeWords::default(), + 0, + ); + } + + #[test] + fn heretofore() { + assert_lint_count( + "This is a her etofore unseen problem.", + MergeWords::default(), + 1, + ); + } + + #[test] + fn therefore() { + assert_lint_count("The refore", MergeWords::default(), 1); + } +} diff --git a/harper-core/src/linting/mod.rs b/harper-core/src/linting/mod.rs index 03ce9bbe..deec6fa1 100644 --- a/harper-core/src/linting/mod.rs +++ b/harper-core/src/linting/mod.rs @@ -2,7 +2,6 @@ mod an_a; mod avoid_curses; mod boring_words; mod capitalize_personal_pronouns; -mod compound_words; mod correct_number_suffix; mod dashes; mod dot_initialisms; @@ -12,6 +11,7 @@ mod lint; mod lint_group; mod long_sentences; mod matcher; +mod merge_words; mod multiple_sequential_pronouns; mod number_suffix_capitalization; mod pattern_linter; @@ -40,6 +40,7 @@ pub use lint::{Lint, LintKind, Suggestion}; pub use lint_group::{LintGroup, LintGroupConfig}; pub use long_sentences::LongSentences; pub use matcher::Matcher; +pub use merge_words::MergeWords; pub use multiple_sequential_pronouns::MultipleSequentialPronouns; pub use number_suffix_capitalization::NumberSuffixCapitalization; pub use pattern_linter::PatternLinter; From 2ff381a97190b945e9eaf5f5f94e64e7f98fe387 Mon Sep 17 00:00:00 2001 From: Elijah Potter Date: Mon, 6 Jan 2025 09:44:51 -0700 Subject: [PATCH 05/15] docs(core): fix link --- harper-core/src/fat_token.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/harper-core/src/fat_token.rs b/harper-core/src/fat_token.rs index 7db6c609..26db3b52 100644 --- a/harper-core/src/fat_token.rs +++ b/harper-core/src/fat_token.rs @@ -2,8 +2,8 @@ use serde::{Deserialize, Serialize}; use crate::TokenKind; -/// A [`Token`] that holds its content as a fat [`Vec`] rather than as a -/// [`Span`]. +/// A [`Token`](crate::Token) that holds its content as a fat [`Vec`] rather than as a +/// [`Span`](crate::Span). #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, PartialOrd)] pub struct FatToken { pub content: Vec, From 5e963042ed9096589b55316eab9a54a70325905c Mon Sep 17 00:00:00 2001 From: Clay Dugo Date: Mon, 6 Jan 2025 12:35:33 -0500 Subject: [PATCH 06/15] Address "one" for indefinite article rule --- harper-core/src/linting/an_a.rs | 1 + harper-core/tests/run_tests.rs | 1 + harper-core/tests/test_sources/article_usage.md | 1 + 3 files changed, 3 insertions(+) create mode 100644 harper-core/tests/test_sources/article_usage.md diff --git a/harper-core/src/linting/an_a.rs b/harper-core/src/linting/an_a.rs index fe1b0bb3..a0a11ea6 100644 --- a/harper-core/src/linting/an_a.rs +++ b/harper-core/src/linting/an_a.rs @@ -103,6 +103,7 @@ fn starts_with_vowel(word: &[char]) -> bool { [] | ['u', 'k', ..] | ['e', 'u', 'p', 'h', ..] | ['e', 'u', 'g' | 'l' | 'c', ..] + | ['o', 'n', 'e'] | ['o', 'n', 'c', 'e'] ) { return false; diff --git a/harper-core/tests/run_tests.rs b/harper-core/tests/run_tests.rs index eece70c2..cad2ea4d 100644 --- a/harper-core/tests/run_tests.rs +++ b/harper-core/tests/run_tests.rs @@ -44,3 +44,4 @@ create_test!(chinese_lorem_ipsum.md, 2); create_test!(obsidian_links.md, 2); create_test!(issue_267.md, 0); create_test!(proper_noun_capitalization.md, 2); +create_test!(article_usage.md, 0); diff --git a/harper-core/tests/test_sources/article_usage.md b/harper-core/tests/test_sources/article_usage.md new file mode 100644 index 00000000..297cb520 --- /dev/null +++ b/harper-core/tests/test_sources/article_usage.md @@ -0,0 +1 @@ +The total number of pixels in the image pertains to a one channel image. From b269d6cb1d50f0afd81a50a40e9c54641313196d Mon Sep 17 00:00:00 2001 From: Clay Dugo Date: Mon, 6 Jan 2025 12:38:29 -0500 Subject: [PATCH 07/15] Use issue naming style --- Cargo.lock | 159 +++++++++--------- harper-core/tests/run_tests.rs | 2 +- .../{article_usage.md => issue_358.md} | 0 3 files changed, 77 insertions(+), 84 deletions(-) rename harper-core/tests/test_sources/{article_usage.md => issue_358.md} (100%) diff --git a/Cargo.lock b/Cargo.lock index 40f47aab..549a65fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -28,9 +28,9 @@ dependencies = [ [[package]] name = "allocator-api2" -version = "0.2.20" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45862d1c77f2228b9e10bc609d5bc203d86ebc9b87ad8d5d5167a6c9abf739d9" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" [[package]] name = "anes" @@ -105,9 +105,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.83" +version = "0.1.84" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" +checksum = "1b1244b10dcd56c92219da4e14caa97e312079e185f04ba3eea25061561dc0a0" dependencies = [ "proc-macro2", "quote", @@ -183,9 +183,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" +checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" [[package]] name = "cast" @@ -407,9 +407,9 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "foldhash" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2" +checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" [[package]] name = "form_urlencoded" @@ -696,12 +696,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hermit-abi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" - [[package]] name = "hermit-abi" version = "0.4.0" @@ -880,7 +874,7 @@ version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b" dependencies = [ - "hermit-abi 0.4.0", + "hermit-abi", "libc", "windows-sys 0.52.0", ] @@ -921,16 +915,17 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.11" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" [[package]] name = "js-sys" -version = "0.3.72" +version = "0.3.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" +checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7" dependencies = [ + "once_cell", "wasm-bindgen", ] @@ -951,9 +946,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.164" +version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "433bfe06b8c75da9b2e3fbea6e5329ff87748f0b144ef75306e674c3f6f7c13f" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "libredox" @@ -967,9 +962,9 @@ dependencies = [ [[package]] name = "litemap" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" +checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" [[package]] name = "lock_api" @@ -1008,20 +1003,19 @@ checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "miniz_oxide" -version = "0.8.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +checksum = "4ffbe83022cedc1d264172192511ae958937694cd57ce297164951b8b3568394" dependencies = [ "adler2", ] [[package]] name = "mio" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ - "hermit-abi 0.3.9", "libc", "wasi", "windows-sys 0.52.0", @@ -1048,9 +1042,9 @@ dependencies = [ [[package]] name = "object" -version = "0.36.5" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" dependencies = [ "memchr", ] @@ -1069,9 +1063,9 @@ checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" [[package]] name = "open" -version = "5.3.1" +version = "5.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ecd52f0b8d15c40ce4820aa251ed5de032e5d91fab27f7db2f40d42a8bdf69c" +checksum = "e2483562e62ea94312f3576a7aca397306df7990b8d89033e18766744377ef95" dependencies = [ "is-wsl", "libc", @@ -1122,9 +1116,9 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pathdiff" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d61c5ce1153ab5b689d0c074c4e7fc613e942dfb7dd9eea5ab202d2ad91fe361" +checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3" [[package]] name = "percent-encoding" @@ -1134,18 +1128,18 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "pin-project" -version = "1.1.7" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be57f64e946e500c8ee36ef6331845d40a93055567ec57e8fae13efd33759b95" +checksum = "1e2ec53ad785f4d35dac0adea7f7dc6f1bb277ad84a680c7afefeae05d1f5916" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.7" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c0f5fad0874fc7abcd4d750e76917eaebbecaa2c20bde22e1dbeeba8beb758c" +checksum = "d56a66c0c55993aa927429d0f8a0abfd74f084e4d9c192cffed01e418d83eefb" dependencies = [ "proc-macro2", "quote", @@ -1154,9 +1148,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" [[package]] name = "pin-utils" @@ -1175,9 +1169,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.89" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] @@ -1203,9 +1197,9 @@ checksum = "007d8adb5ddab6f8e3f491ac63566a7d5002cc7ed73901f72057943fa71ae1ae" [[package]] name = "quote" -version = "1.0.37" +version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" dependencies = [ "proc-macro2", ] @@ -1244,9 +1238,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" +checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" dependencies = [ "bitflags 2.6.0", ] @@ -1329,9 +1323,9 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "serde" -version = "1.0.216" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e" +checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" dependencies = [ "serde_derive", ] @@ -1349,9 +1343,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.216" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", @@ -1360,9 +1354,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.133" +version = "1.0.134" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" +checksum = "d00f4175c42ee48b15416f6193a959ba3a0d67fc699a0db9ad12df9f83991c7d" dependencies = [ "itoa", "memchr", @@ -1410,9 +1404,9 @@ dependencies = [ [[package]] name = "socket2" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +checksum = "c970269d99b64e60ec3bd6ad27270092a5394c4e309314b18ae3fe575695fbe8" dependencies = [ "libc", "windows-sys 0.52.0", @@ -1432,9 +1426,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.87" +version = "2.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +checksum = "46f71c0377baf4ef1cc3e3402ded576dccc315800fbc62dfc7fe04b009773b4a" dependencies = [ "proc-macro2", "quote", @@ -1551,9 +1545,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.12" +version = "0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" +checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078" dependencies = [ "bytes", "futures-core", @@ -1872,9 +1866,9 @@ dependencies = [ [[package]] name = "unicase" -version = "2.8.0" +version = "2.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e51b68083f157f853b6379db119d1c1be0e6e4dec98101079dec41f6f5cf6df" +checksum = "75b844d17643ee918803943289730bec8aac480150456169e647ed0b576ba539" [[package]] name = "unicode-blocks" @@ -1884,9 +1878,9 @@ checksum = "6b12e05d9e06373163a9bb6bb8c263c261b396643a99445fe6b9811fd376581b" [[package]] name = "unicode-ident" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" +checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" [[package]] name = "unicode-width" @@ -1902,9 +1896,9 @@ checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" [[package]] name = "url" -version = "2.5.3" +version = "2.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d157f1b96d14500ffdc1f10ba712e780825526c03d9a49b4d0324b0d9113ada" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" dependencies = [ "form_urlencoded", "idna", @@ -1954,9 +1948,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d15e63b4482863c109d70a7b8706c1e364eb6ea449b201a76c5b89cedcec2d5c" +checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396" dependencies = [ "cfg-if", "once_cell", @@ -1965,13 +1959,12 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d36ef12e3aaca16ddd3f67922bc63e48e953f126de60bd33ccc0101ef9998cd" +checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79" dependencies = [ "bumpalo", "log", - "once_cell", "proc-macro2", "quote", "syn", @@ -1980,9 +1973,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "705440e08b42d3e4b36de7d66c944be628d579796b8090bfa3471478a2260051" +checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1990,9 +1983,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98c9ae5a76e46f4deecd0f0255cc223cfa18dc9b261213b8aa0c7b36f61b3f1d" +checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" dependencies = [ "proc-macro2", "quote", @@ -2003,9 +1996,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ee99da9c5ba11bd675621338ef6fa52296b76b83305e9b6e5c77d4c286d6d49" +checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6" [[package]] name = "winapi" @@ -2206,9 +2199,9 @@ checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" [[package]] name = "yoke" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" dependencies = [ "serde", "stable_deref_trait", @@ -2218,9 +2211,9 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", @@ -2251,18 +2244,18 @@ dependencies = [ [[package]] name = "zerofrom" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55" +checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" dependencies = [ "zerofrom-derive", ] [[package]] name = "zerofrom-derive" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5" +checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", diff --git a/harper-core/tests/run_tests.rs b/harper-core/tests/run_tests.rs index bae403fe..50f5f3d6 100644 --- a/harper-core/tests/run_tests.rs +++ b/harper-core/tests/run_tests.rs @@ -44,6 +44,6 @@ create_test!(chinese_lorem_ipsum.md, 2); create_test!(obsidian_links.md, 2); create_test!(issue_267.md, 0); create_test!(proper_noun_capitalization.md, 2); -create_test!(article_usage.md, 0); create_test!(amazon_hostname.md, 0); create_test!(issue_159.md, 1); +create_test!(issue_358.md, 0); diff --git a/harper-core/tests/test_sources/article_usage.md b/harper-core/tests/test_sources/issue_358.md similarity index 100% rename from harper-core/tests/test_sources/article_usage.md rename to harper-core/tests/test_sources/issue_358.md From c5b570e387cd9df7dec1713f49d7d29d69cba4fe Mon Sep 17 00:00:00 2001 From: Elijah Potter Date: Mon, 6 Jan 2025 11:22:42 -0700 Subject: [PATCH 08/15] feat(core): make proper nouns important to the `SpellCheck` rule --- harper-core/dictionary.dict | 3 ++- harper-core/src/linting/spell_check.rs | 28 +++++++++++++++++++++++++ harper-core/src/spell/fst_dictionary.rs | 2 +- harper-core/src/spell/mod.rs | 11 +++++++++- 4 files changed, 41 insertions(+), 3 deletions(-) diff --git a/harper-core/dictionary.dict b/harper-core/dictionary.dict index d3753e0b..02fc35f0 100644 --- a/harper-core/dictionary.dict +++ b/harper-core/dictionary.dict @@ -31305,7 +31305,7 @@ marital/5Y maritime/5 marjoram/1M mark/14AMDSG -markdown/12SM +Markdown/12SM marked/54U markedly/ marker/14MS @@ -49768,3 +49768,4 @@ uncheck/SM upsample/SMG organoid/SM centric/SM +Harper/SM diff --git a/harper-core/src/linting/spell_check.rs b/harper-core/src/linting/spell_check.rs index 3f5f7c6a..60c16d58 100644 --- a/harper-core/src/linting/spell_check.rs +++ b/harper-core/src/linting/spell_check.rs @@ -98,3 +98,31 @@ impl Linter for SpellCheck { "Looks and provides corrections for misspelled words." } } + +#[cfg(test)] +mod tests { + use crate::{ + linting::tests::{assert_lint_count, assert_suggestion_result}, + FstDictionary, + }; + + use super::SpellCheck; + + #[test] + fn markdown_capitalized() { + assert_suggestion_result( + "The word markdown should be capitalized.", + SpellCheck::new(FstDictionary::curated()), + "The word Markdown should be capitalized.", + ); + } + + #[test] + fn harper_automattic_capitalized() { + assert_lint_count( + "So should harper and automattic.", + SpellCheck::new(FstDictionary::curated()), + 2, + ); + } +} diff --git a/harper-core/src/spell/fst_dictionary.rs b/harper-core/src/spell/fst_dictionary.rs index 574b1115..d9d688bd 100644 --- a/harper-core/src/spell/fst_dictionary.rs +++ b/harper-core/src/spell/fst_dictionary.rs @@ -45,7 +45,7 @@ lazy_static! { thread_local! { // Builders are computationally expensive and do not depend on the word, so we store a // collection of builders and the associated edit distance here. - // Currently, the edit distance we use is 3, but a value that does not exist in this + // Currently, the edit distance we use is three, but a value that does not exist in this // collection will create a new builder of that distance and push it to the collection. static AUTOMATON_BUILDERS: RefCell> = RefCell::new(vec![( EXPECTED_DISTANCE, diff --git a/harper-core/src/spell/mod.rs b/harper-core/src/spell/mod.rs index 91124f2d..5d5c75c7 100644 --- a/harper-core/src/spell/mod.rs +++ b/harper-core/src/spell/mod.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; +use std::ops::Index; use itertools::{Itertools, MinMaxResult}; @@ -15,7 +16,7 @@ mod full_dictionary; mod hunspell; mod merged_dictionary; -#[derive(PartialEq)] +#[derive(PartialEq, Debug)] pub struct FuzzyMatchResult<'a> { word: &'a [char], edit_distance: u8, @@ -59,6 +60,14 @@ fn order_suggestions(matches: Vec) -> Vec<&[char]> { found.swap(0, 2); } + if let Some(noun_index) = found + .iter() + .skip(3) + .position(|i| i.metadata.is_proper_noun()) + { + found.swap(2, noun_index + 3); + } + // Make commonality relevant found.sort_by_key(|fmr| if fmr.metadata.common { 0 } else { 1 }); From 8bb00834487b8bf6efe8f5547df92dd37e1a5660 Mon Sep 17 00:00:00 2001 From: Elijah Potter Date: Mon, 6 Jan 2025 12:04:56 -0700 Subject: [PATCH 09/15] fix(core): appease `just precommit` --- harper-core/src/spell/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/harper-core/src/spell/mod.rs b/harper-core/src/spell/mod.rs index 5d5c75c7..647afd08 100644 --- a/harper-core/src/spell/mod.rs +++ b/harper-core/src/spell/mod.rs @@ -1,5 +1,4 @@ use std::borrow::Cow; -use std::ops::Index; use itertools::{Itertools, MinMaxResult}; From 9767243c1ebde356667ecf764e8422f6c8cf01e1 Mon Sep 17 00:00:00 2001 From: Elijah Potter Date: Mon, 6 Jan 2025 13:29:24 -0700 Subject: [PATCH 10/15] feat: wrote simple pattern for slightly more complex ideas --- harper-core/dictionary.dict | 2 +- harper-core/src/patterns/mod.rs | 12 +++++- harper-core/src/patterns/sequence_pattern.rs | 44 ++++++++++++++++++-- 3 files changed, 52 insertions(+), 6 deletions(-) diff --git a/harper-core/dictionary.dict b/harper-core/dictionary.dict index 02fc35f0..6c8d07a8 100644 --- a/harper-core/dictionary.dict +++ b/harper-core/dictionary.dict @@ -49003,7 +49003,7 @@ wit/14+SM witch/14MDSG witchcraft/1M witchery/1M -with/+1~ +with/+~ withal/+ withdraw/41SG withdrawal/1MS diff --git a/harper-core/src/patterns/mod.rs b/harper-core/src/patterns/mod.rs index e290a314..39fd588b 100644 --- a/harper-core/src/patterns/mod.rs +++ b/harper-core/src/patterns/mod.rs @@ -1,6 +1,6 @@ use std::collections::VecDeque; -use crate::{Span, Token, VecExt}; +use crate::{document, Document, Span, Token, VecExt}; mod any_pattern; mod consumes_remaining_pattern; @@ -122,3 +122,13 @@ where } } } + +trait DocPattern { + fn find_all_matches_in_doc(&self, document: &Document) -> Vec; +} + +impl DocPattern for P { + fn find_all_matches_in_doc(&self, document: &Document) -> Vec { + self.find_all_matches(document.get_tokens(), document.get_source()) + } +} diff --git a/harper-core/src/patterns/sequence_pattern.rs b/harper-core/src/patterns/sequence_pattern.rs index 39d538c3..8a75a186 100644 --- a/harper-core/src/patterns/sequence_pattern.rs +++ b/harper-core/src/patterns/sequence_pattern.rs @@ -2,10 +2,10 @@ use hashbrown::HashSet; use paste::paste; use super::whitespace_pattern::WhitespacePattern; -use super::{Pattern, RepeatingPattern}; +use super::{EitherPattern, Pattern, RepeatingPattern}; use crate::{CharStringExt, Lrc, Token, TokenKind}; -/// A pattern that checks that a sequence of others patterns match. +/// A pattern that checks that a sequence of other patterns match. #[derive(Default)] pub struct SequencePattern { token_patterns: Vec>, @@ -58,6 +58,22 @@ impl SequencePattern { gen_then_from_is!(adjective); gen_then_from_is!(hyphen); + /// Add a pattern that looks for more complex ideas, like nouns with adjectives attached. + pub fn then_idea(mut self) -> Self { + self.then(Box::new(EitherPattern::new(vec![ + Box::new( + SequencePattern::default() + .then_one_or_more(Box::new( + SequencePattern::default() + .then_adjective() + .then_whitespace(), + )) + .then_noun(), + ), + Box::new(SequencePattern::default().then_noun()), + ]))) + } + pub fn then_exact_word(mut self, word: &'static str) -> Self { self.token_patterns .push(Box::new(|tok: &Token, source: &[char]| { @@ -220,8 +236,8 @@ mod tests { use hashbrown::HashSet; use super::SequencePattern; - use crate::patterns::Pattern; - use crate::{Document, Lrc}; + use crate::patterns::{DocPattern, Pattern}; + use crate::{Document, Lrc, Span}; #[test] fn matches_n_whitespace_tokens() { @@ -271,4 +287,24 @@ mod tests { doc.get_tokens().len() ); } + + #[test] + fn simple_idea_apple() { + let doc = Document::new_markdown_curated("A red apple"); + let matches = SequencePattern::default() + .then_idea() + .find_all_matches_in_doc(&doc); + + assert_eq!(matches, vec![Span::new(0, 5)]) + } + + #[test] + fn complex_idea_apple() { + let doc = Document::new_markdown_curated("A red apple with a long stem"); + let matches = SequencePattern::default() + .then_idea() + .find_all_matches_in_doc(&doc); + + assert_eq!(matches, vec![Span::new(0, 5), Span::new(8, 13)]) + } } From 180e3aff1a9f40574e51f61734af291b9e43e8e9 Mon Sep 17 00:00:00 2001 From: Elijah Potter Date: Mon, 6 Jan 2025 15:48:19 -0700 Subject: [PATCH 11/15] feat: wrote linter for Oxford commas --- harper-core/dictionary.dict | 9 +- harper-core/src/document.rs | 1 + harper-core/src/linting/lint.rs | 29 +++++ harper-core/src/linting/lint_group.rs | 5 +- harper-core/src/linting/mod.rs | 5 +- harper-core/src/linting/oxford_comma.rs | 108 +++++++++++++++++++ harper-core/src/patterns/mod.rs | 4 +- harper-core/src/patterns/noun_phrase.rs | 78 ++++++++++++++ harper-core/src/patterns/sequence_pattern.rs | 41 +------ harper-core/src/token.rs | 2 + harper-core/src/token_kind.rs | 4 + harper-ls/src/diagnostics.rs | 6 ++ harper-wasm/src/lib.rs | 5 +- 13 files changed, 251 insertions(+), 46 deletions(-) create mode 100644 harper-core/src/linting/oxford_comma.rs create mode 100644 harper-core/src/patterns/noun_phrase.rs diff --git a/harper-core/dictionary.dict b/harper-core/dictionary.dict index 6c8d07a8..ead6ece2 100644 --- a/harper-core/dictionary.dict +++ b/harper-core/dictionary.dict @@ -1,5 +1,4 @@ 50000 -A/125SM AA/1254M AAA/12 AB/215M @@ -10758,7 +10757,7 @@ Zworykin/M Zyrtec/M Zyuganov/M Zzz -a/1-+4857S~ +a/-+457S~ aah/14 aardvark/1SM ab/14+SDY @@ -11894,7 +11893,7 @@ amusing/45Y amygdala/1 amylase/1M amyloid/15 -an/-71+CS~ +an/-7+CS~ anabolism/1M anachronism/1SM anachronistic/5 @@ -11955,7 +11954,7 @@ anchovy/1SM ancient/51SPMRYT ancientness/1M ancillary/51SM -and/714~ +and/74~ andante/15SM andiron/1SM androgen/1M @@ -13329,7 +13328,7 @@ bamboozle/41DSG ban/41SM banal/5Y banality/1SM -banana/15SM +banana/1SM band's band/14ESGD bandage/14DSMG diff --git a/harper-core/src/document.rs b/harper-core/src/document.rs index 61b75f4d..e1378485 100644 --- a/harper-core/src/document.rs +++ b/harper-core/src/document.rs @@ -490,6 +490,7 @@ macro_rules! create_fns_on_doc { impl TokenStringExt for Document { create_fns_on_doc!(word); + create_fns_on_doc!(conjunction); create_fns_on_doc!(space); create_fns_on_doc!(apostrophe); create_fns_on_doc!(pipe); diff --git a/harper-core/src/linting/lint.rs b/harper-core/src/linting/lint.rs index 6c92706f..532a3604 100644 --- a/harper-core/src/linting/lint.rs +++ b/harper-core/src/linting/lint.rs @@ -32,6 +32,7 @@ impl Default for Lint { pub enum LintKind { Spelling, Capitalization, + Style, Formatting, Repetition, Enhancement, @@ -52,6 +53,7 @@ impl Display for LintKind { LintKind::Miscellaneous => "Miscellaneous", LintKind::Enhancement => "Enhancement", LintKind::WordChoice => "Word Choice", + LintKind::Style => "Style", }; write!(f, "{}", s) @@ -61,6 +63,8 @@ impl Display for LintKind { #[derive(Debug, Clone, Serialize, Deserialize, Is)] pub enum Suggestion { ReplaceWith(Vec), + /// Insert the provided characters _after_ the offending text. + InsertAfter(Vec), Remove, } @@ -88,6 +92,11 @@ impl Suggestion { source.truncate(source.len() - span.len()); } + Self::InsertAfter(chars) => { + let popped = source.split_off(span.end); + source.extend(chars); + source.extend(popped); + } } } } @@ -98,7 +107,27 @@ impl Display for Suggestion { Suggestion::ReplaceWith(with) => { write!(f, "Replace with: “{}”", with.iter().collect::()) } + Suggestion::InsertAfter(with) => { + write!(f, "Insert “{}”", with.iter().collect::()) + } Suggestion::Remove => write!(f, "Remove error"), } } } + +#[cfg(test)] +mod tests { + use crate::Span; + + use super::Suggestion; + + #[test] + fn insert_comma_after() { + let source = "This is a test"; + let mut source_chars = source.chars().collect(); + let sug = Suggestion::InsertAfter(vec![',']); + sug.apply(Span::new(0, 4), &mut source_chars); + + assert_eq!(source_chars, "This, is a test".chars().collect::>()); + } +} diff --git a/harper-core/src/linting/lint_group.rs b/harper-core/src/linting/lint_group.rs index b6b4aa93..b39ff81b 100644 --- a/harper-core/src/linting/lint_group.rs +++ b/harper-core/src/linting/lint_group.rs @@ -29,7 +29,7 @@ use super::that_which::ThatWhich; use super::unclosed_quotes::UnclosedQuotes; use super::use_genitive::UseGenitive; use super::wrong_quotes::WrongQuotes; -use super::{Lint, Linter}; +use super::{Lint, Linter, OxfordComma}; use crate::{Dictionary, Document}; macro_rules! create_lint_group_config { @@ -183,7 +183,8 @@ create_lint_group_config!( AppleNames => true, AzureNames => true, MergeWords => true, - PluralConjugate => false + PluralConjugate => false, + OxfordComma => true ); impl Default for LintGroup { diff --git a/harper-core/src/linting/mod.rs b/harper-core/src/linting/mod.rs index deec6fa1..8f64ff57 100644 --- a/harper-core/src/linting/mod.rs +++ b/harper-core/src/linting/mod.rs @@ -14,6 +14,7 @@ mod matcher; mod merge_words; mod multiple_sequential_pronouns; mod number_suffix_capitalization; +mod oxford_comma; mod pattern_linter; mod plural_conjugate; mod proper_noun_capitalization_linters; @@ -43,6 +44,7 @@ pub use matcher::Matcher; pub use merge_words::MergeWords; pub use multiple_sequential_pronouns::MultipleSequentialPronouns; pub use number_suffix_capitalization::NumberSuffixCapitalization; +pub use oxford_comma::OxfordComma; pub use pattern_linter::PatternLinter; pub use proper_noun_capitalization_linters::{ AmazonNames, Americas, AppleNames, AzureNames, ChineseCommunistParty, GoogleNames, Holidays, @@ -97,7 +99,7 @@ mod tests { } /// Runs a provided linter on text, applies the first suggestion from each - /// lint and asserts that the result is equal to a given value. + /// lint and asserts whether the result is equal to a given value. pub fn assert_suggestion_result(text: &str, mut linter: impl Linter, expected_result: &str) { let test = Document::new_markdown_curated(text); let lints = linter.lint(&test); @@ -105,6 +107,7 @@ mod tests { let mut text: Vec = text.chars().collect(); for lint in lints { + dbg!(&lint); if let Some(sug) = lint.suggestions.first() { sug.apply(lint.span, &mut text); } diff --git a/harper-core/src/linting/oxford_comma.rs b/harper-core/src/linting/oxford_comma.rs new file mode 100644 index 00000000..01fcc2d8 --- /dev/null +++ b/harper-core/src/linting/oxford_comma.rs @@ -0,0 +1,108 @@ +use crate::{ + patterns::{Pattern, SequencePattern}, + Document, Token, TokenStringExt, +}; + +use super::{Lint, LintKind, Linter, Suggestion}; + +pub struct OxfordComma { + pattern: SequencePattern, +} + +impl OxfordComma { + pub fn new() -> Self { + Self { + pattern: SequencePattern::default() + .then_one_or_more(Box::new( + SequencePattern::default() + .then_noun_phrase() + .then_comma() + .then_whitespace(), + )) + .then_noun_phrase() + .then_whitespace() + .then_exact_word("and") + .then_whitespace() + .then_noun_phrase(), + } + } + + fn match_to_lint(&self, matched_toks: &[Token], _source: &[char]) -> Lint { + let conj_index = matched_toks.last_conjunction_index().unwrap(); + let offender = matched_toks[conj_index - 2]; + + Lint { + span: offender.span, + lint_kind: LintKind::Style, + suggestions: vec![Suggestion::InsertAfter(vec![','])], + message: "An Oxford comma is necessary here.".to_owned(), + priority: 31, + } + } +} + +impl Default for OxfordComma { + fn default() -> Self { + Self::new() + } +} + +impl Linter for OxfordComma { + fn lint(&mut self, document: &Document) -> Vec { + let mut lints = Vec::new(); + + for sentence in document.iter_sentences() { + let mut tok_cursor = 0; + + loop { + if tok_cursor >= sentence.len() { + break; + } + + let match_len = self + .pattern + .matches(&sentence[tok_cursor..], document.get_source()); + + if match_len != 0 { + let lint = self.match_to_lint( + &sentence[tok_cursor..tok_cursor + match_len], + document.get_source(), + ); + + lints.push(lint); + tok_cursor += match_len; + } else { + tok_cursor += 1; + } + } + } + + lints + } + + fn description(&self) -> &str { + "The Oxford comma is one of the more controversial rules in common use today. Here, we make sure that we put a comma before `and` when listing out more than two ideas." + } +} + +#[cfg(test)] +mod tests { + use crate::linting::tests::{assert_lint_count, assert_suggestion_result}; + + use super::OxfordComma; + + #[test] + fn fruits() { + assert_lint_count("An apple, a banana and a pear", OxfordComma::default(), 1); + } + + #[test] + fn people() { + // Nancy, Steve and Carl are going + assert_suggestion_result( + "Nancy, Steve and Carl are going to the coffee shop.", + OxfordComma::default(), + "Nancy, Steve, and Carl are going to the coffee shop.", + ); + } +} diff --git a/harper-core/src/patterns/mod.rs b/harper-core/src/patterns/mod.rs index 39fd588b..ae50e215 100644 --- a/harper-core/src/patterns/mod.rs +++ b/harper-core/src/patterns/mod.rs @@ -1,6 +1,6 @@ use std::collections::VecDeque; -use crate::{document, Document, Span, Token, VecExt}; +use crate::{Document, Span, Token, VecExt}; mod any_pattern; mod consumes_remaining_pattern; @@ -8,6 +8,7 @@ mod either_pattern; mod invert; mod is_not_title_case; mod naive_pattern_group; +mod noun_phrase; mod repeating_pattern; mod sequence_pattern; mod token_kind_pattern_group; @@ -21,6 +22,7 @@ pub use either_pattern::EitherPattern; pub use invert::Invert; pub use is_not_title_case::IsNotTitleCase; pub use naive_pattern_group::NaivePatternGroup; +pub use noun_phrase::NounPhrase; pub use repeating_pattern::RepeatingPattern; pub use sequence_pattern::SequencePattern; pub use token_kind_pattern_group::TokenKindPatternGroup; diff --git a/harper-core/src/patterns/noun_phrase.rs b/harper-core/src/patterns/noun_phrase.rs new file mode 100644 index 00000000..31d18bc0 --- /dev/null +++ b/harper-core/src/patterns/noun_phrase.rs @@ -0,0 +1,78 @@ +use crate::Token; + +use super::Pattern; + +/// A pattern that returns the value of the first non-zero match in a list. +#[derive(Default)] +pub struct NounPhrase; + +impl Pattern for NounPhrase { + fn matches(&self, tokens: &[Token], _source: &[char]) -> usize { + let mut cursor = 0; + + loop { + let Some(tok) = tokens.get(cursor) else { + return 0; + }; + + if tok.kind.is_adjective() || tok.kind.is_article() { + let Some(next) = tokens.get(cursor + 1) else { + return 0; + }; + + if !next.kind.is_whitespace() { + return 0; + } + + cursor += 2; + continue; + } + + if tok.kind.is_noun() { + return cursor + 1; + } + + return 0; + } + } +} + +#[cfg(test)] +mod tests { + use super::super::DocPattern; + use super::NounPhrase; + use crate::{patterns::Pattern, Document, Span}; + + #[test] + fn simple_apple() { + let doc = Document::new_markdown_curated("A red apple"); + let matches = NounPhrase.find_all_matches_in_doc(&doc); + + assert_eq!(matches, vec![Span::new(0, 5)]) + } + + #[test] + fn complex_apple() { + let doc = Document::new_markdown_curated("A red apple with a long stem"); + let matches = NounPhrase.find_all_matches_in_doc(&doc); + + assert_eq!(matches, vec![Span::new(0, 5), Span::new(8, 13)]) + } + + #[test] + fn list_fruit() { + let doc = Document::new_markdown_curated("An apple, a banana and a pear"); + let matches = NounPhrase.find_all_matches_in_doc(&doc); + + assert_eq!( + matches, + vec![Span::new(0, 3), Span::new(5, 8), Span::new(11, 14)] + ) + } + + #[test] + fn simplest_banana() { + let doc = Document::new_markdown_curated("a banana"); + assert!(NounPhrase.matches(doc.get_tokens(), doc.get_source()) != 0); + } +} diff --git a/harper-core/src/patterns/sequence_pattern.rs b/harper-core/src/patterns/sequence_pattern.rs index 8a75a186..1d4d6321 100644 --- a/harper-core/src/patterns/sequence_pattern.rs +++ b/harper-core/src/patterns/sequence_pattern.rs @@ -2,7 +2,7 @@ use hashbrown::HashSet; use paste::paste; use super::whitespace_pattern::WhitespacePattern; -use super::{EitherPattern, Pattern, RepeatingPattern}; +use super::{NounPhrase, Pattern, RepeatingPattern}; use crate::{CharStringExt, Lrc, Token, TokenKind}; /// A pattern that checks that a sequence of other patterns match. @@ -59,19 +59,8 @@ impl SequencePattern { gen_then_from_is!(hyphen); /// Add a pattern that looks for more complex ideas, like nouns with adjectives attached. - pub fn then_idea(mut self) -> Self { - self.then(Box::new(EitherPattern::new(vec![ - Box::new( - SequencePattern::default() - .then_one_or_more(Box::new( - SequencePattern::default() - .then_adjective() - .then_whitespace(), - )) - .then_noun(), - ), - Box::new(SequencePattern::default().then_noun()), - ]))) + pub fn then_noun_phrase(self) -> Self { + self.then(Box::new(NounPhrase)) } pub fn then_exact_word(mut self, word: &'static str) -> Self { @@ -236,8 +225,8 @@ mod tests { use hashbrown::HashSet; use super::SequencePattern; - use crate::patterns::{DocPattern, Pattern}; - use crate::{Document, Lrc, Span}; + use crate::patterns::Pattern; + use crate::{Document, Lrc}; #[test] fn matches_n_whitespace_tokens() { @@ -287,24 +276,4 @@ mod tests { doc.get_tokens().len() ); } - - #[test] - fn simple_idea_apple() { - let doc = Document::new_markdown_curated("A red apple"); - let matches = SequencePattern::default() - .then_idea() - .find_all_matches_in_doc(&doc); - - assert_eq!(matches, vec![Span::new(0, 5)]) - } - - #[test] - fn complex_idea_apple() { - let doc = Document::new_markdown_curated("A red apple with a long stem"); - let matches = SequencePattern::default() - .then_idea() - .find_all_matches_in_doc(&doc); - - assert_eq!(matches, vec![Span::new(0, 5), Span::new(8, 13)]) - } } diff --git a/harper-core/src/token.rs b/harper-core/src/token.rs index 5eaa36c7..4435173c 100644 --- a/harper-core/src/token.rs +++ b/harper-core/src/token.rs @@ -79,6 +79,7 @@ pub trait TokenStringExt { fn span(&self) -> Option; create_decl_for!(word); + create_decl_for!(conjunction); create_decl_for!(space); create_decl_for!(apostrophe); create_decl_for!(pipe); @@ -117,6 +118,7 @@ pub trait TokenStringExt { impl TokenStringExt for [Token] { create_fns_for!(word); + create_fns_for!(conjunction); create_fns_for!(space); create_fns_for!(apostrophe); create_fns_for!(pipe); diff --git a/harper-core/src/token_kind.rs b/harper-core/src/token_kind.rs index b669c040..015d82db 100644 --- a/harper-core/src/token_kind.rs +++ b/harper-core/src/token_kind.rs @@ -91,6 +91,10 @@ impl TokenKind { } } + pub fn is_article(&self) -> bool { + matches!(self, TokenKind::Word(WordMetadata { article: true, .. })) + } + pub fn is_ellipsis(&self) -> bool { matches!(self, TokenKind::Punctuation(Punctuation::Ellipsis)) } diff --git a/harper-ls/src/diagnostics.rs b/harper-ls/src/diagnostics.rs index 01348e4c..e30710d4 100644 --- a/harper-ls/src/diagnostics.rs +++ b/harper-ls/src/diagnostics.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use harper_core::linting::{Lint, Suggestion}; +use harper_core::CharStringExt; use tower_lsp::lsp_types::{ CodeAction, CodeActionKind, CodeActionOrCommand, Command, Diagnostic, TextEdit, Url, WorkspaceEdit, @@ -37,6 +38,11 @@ pub fn lint_to_code_actions<'a>( let replace_string = match suggestion { Suggestion::ReplaceWith(with) => with.iter().collect(), Suggestion::Remove => "".to_string(), + Suggestion::InsertAfter(with) => format!( + "{}{}", + lint.span.get_content_string(source), + with.to_string() + ), }; Some(CodeAction { diff --git a/harper-wasm/src/lib.rs b/harper-wasm/src/lib.rs index cb3c466f..7343462e 100644 --- a/harper-wasm/src/lib.rs +++ b/harper-wasm/src/lib.rs @@ -166,6 +166,7 @@ pub struct Suggestion { pub enum SuggestionKind { Replace = 0, Remove = 1, + InsertAfter = 2, } #[wasm_bindgen] @@ -174,13 +175,14 @@ impl Suggestion { Self { inner } } - /// Get the text that is going to replace error. + /// Get the text that is going to replace the problematic section. /// If [`Self::kind`] is `SuggestionKind::Remove`, this will return an empty /// string. pub fn get_replacement_text(&self) -> String { match &self.inner { harper_core::linting::Suggestion::Remove => "".to_string(), harper_core::linting::Suggestion::ReplaceWith(chars) => chars.iter().collect(), + harper_core::linting::Suggestion::InsertAfter(chars) => chars.iter().collect(), } } @@ -188,6 +190,7 @@ impl Suggestion { match &self.inner { harper_core::linting::Suggestion::Remove => SuggestionKind::Remove, harper_core::linting::Suggestion::ReplaceWith(_) => SuggestionKind::Replace, + harper_core::linting::Suggestion::InsertAfter(_) => SuggestionKind::InsertAfter, } } } From d516897711fcca254b9b6f0219605581241ced6a Mon Sep 17 00:00:00 2001 From: Elijah Potter Date: Tue, 7 Jan 2025 08:34:43 -0700 Subject: [PATCH 12/15] test(core): added more cases for Oxford commas and noun phrase detection --- harper-core/dictionary.dict | 2 +- harper-core/src/linting/oxford_comma.rs | 33 ++++++++++++++++++++++++- harper-core/src/patterns/noun_phrase.rs | 21 ++++++++++++++++ 3 files changed, 54 insertions(+), 2 deletions(-) diff --git a/harper-core/dictionary.dict b/harper-core/dictionary.dict index ead6ece2..89409a69 100644 --- a/harper-core/dictionary.dict +++ b/harper-core/dictionary.dict @@ -15268,7 +15268,7 @@ bureaucratization/1M bureaucratize/4GDS burg/1MRZS burgeon/14DSG -burger/1M +burger/1MS burgh/1MRZ burgher/1M burghs/1 diff --git a/harper-core/src/linting/oxford_comma.rs b/harper-core/src/linting/oxford_comma.rs index 01fcc2d8..51dcf683 100644 --- a/harper-core/src/linting/oxford_comma.rs +++ b/harper-core/src/linting/oxford_comma.rs @@ -98,11 +98,42 @@ mod tests { #[test] fn people() { - // Nancy, Steve and Carl are going assert_suggestion_result( "Nancy, Steve and Carl are going to the coffee shop.", OxfordComma::default(), "Nancy, Steve, and Carl are going to the coffee shop.", ); } + + #[test] + fn places() { + assert_suggestion_result( + "I've always wanted to visit Paris, Tokyo and Rome.", + OxfordComma::default(), + "I've always wanted to visit Paris, Tokyo, and Rome.", + ); + } + + #[test] + fn foods() { + assert_suggestion_result( + "My favorite foods are pizza, sushi, tacos and burgers.", + OxfordComma::default(), + "My favorite foods are pizza, sushi, tacos, and burgers.", + ); + } + + #[test] + fn allows_clean_music() { + assert_lint_count( + "I enjoy listening to pop music, rock, hip-hop, electronic dance, and classical music.", + OxfordComma::default(), + 0, + ); + } + + #[test] + fn allows_clean_nations() { + assert_lint_count("The team consists of players from different countries: France, Germany, Italy, and Spain.", OxfordComma::default(), 0); + } } diff --git a/harper-core/src/patterns/noun_phrase.rs b/harper-core/src/patterns/noun_phrase.rs index 31d18bc0..66d25e4f 100644 --- a/harper-core/src/patterns/noun_phrase.rs +++ b/harper-core/src/patterns/noun_phrase.rs @@ -75,4 +75,25 @@ mod tests { let doc = Document::new_markdown_curated("a banana"); assert!(NounPhrase.matches(doc.get_tokens(), doc.get_source()) != 0); } + + #[test] + fn food() { + let doc = Document::new_markdown_curated( + "My favorite foods are pizza, sushi, tacos and burgers.", + ); + let matches = NounPhrase.find_all_matches_in_doc(&doc); + + dbg!(&matches); + + assert_eq!( + matches, + vec![ + Span::new(2, 5), + Span::new(8, 9), + Span::new(11, 12), + Span::new(14, 15), + Span::new(18, 19) + ] + ) + } } From 29983db520f368a76b827f7e21f58bd10e3f2960 Mon Sep 17 00:00:00 2001 From: Elijah Potter Date: Tue, 7 Jan 2025 09:02:55 -0700 Subject: [PATCH 13/15] feat(core): extended `OxfordCommas` to cover `*or*` cases --- harper-core/src/linting/oxford_comma.rs | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/harper-core/src/linting/oxford_comma.rs b/harper-core/src/linting/oxford_comma.rs index 51dcf683..a52a228c 100644 --- a/harper-core/src/linting/oxford_comma.rs +++ b/harper-core/src/linting/oxford_comma.rs @@ -1,5 +1,5 @@ use crate::{ - patterns::{Pattern, SequencePattern}, + patterns::{EitherPattern, Pattern, SequencePattern}, Document, Token, TokenStringExt, }; @@ -21,7 +21,10 @@ impl OxfordComma { )) .then_noun_phrase() .then_whitespace() - .then_exact_word("and") + .then(Box::new(EitherPattern::new(vec![ + Box::new(SequencePattern::aco("and")), + Box::new(SequencePattern::aco("or")), + ]))) .then_whitespace() .then_noun_phrase(), } @@ -136,4 +139,18 @@ mod tests { fn allows_clean_nations() { assert_lint_count("The team consists of players from different countries: France, Germany, Italy, and Spain.", OxfordComma::default(), 0); } + + #[test] + fn or_writing() { + assert_suggestion_result("Harper can be a lifesaver when writing technical documents, emails or other formal forms of communication.", OxfordComma::default(), "Harper can be a lifesaver when writing technical documents, emails, or other formal forms of communication.",); + } + + #[test] + fn sports() { + assert_suggestion_result( + "They enjoy playing soccer, basketball or tennis.", + OxfordComma::default(), + "They enjoy playing soccer, basketball, or tennis.", + ); + } } From 4b6f7e5c6e8657d00c852b2435f7fdec65afb6ee Mon Sep 17 00:00:00 2001 From: Elijah Potter Date: Tue, 7 Jan 2025 09:05:33 -0700 Subject: [PATCH 14/15] fix(web): properly render `Suggestion::InsertAfter` in the editor --- demo.md | 2 ++ packages/web/src/lib/Editor.svelte | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/demo.md b/demo.md index 5f20a7af..084c7b0f 100644 --- a/demo.md +++ b/demo.md @@ -5,6 +5,8 @@ Harper is an language checker for developers. it can detect improper capitalization and misspellled words, as well as a number of other issues. Like if you break up words you shoul dn't. +Harper can be a lifesaver when writing technical documents, +emails or other formal forms of communication. Harper works everywhere, even offline. Since you r data never leaves your device, you don't ned to worry aout us diff --git a/packages/web/src/lib/Editor.svelte b/packages/web/src/lib/Editor.svelte index 41e9e3ff..63385271 100644 --- a/packages/web/src/lib/Editor.svelte +++ b/packages/web/src/lib/Editor.svelte @@ -104,8 +104,10 @@ > {#if suggestion.kind() == SuggestionKind.Remove} Remove "{lint.get_problem_text()}" - {:else} + {:else if suggestion.kind() == SuggestionKind.Replace} Replace "{lint.get_problem_text()}" with "{suggestion.get_replacement_text()}" + {:else} + Insert "{suggestion.get_replacement_text()}" after "{lint.get_problem_text()}" {/if} From 3b438613db7f751f709c6b0fc29211f048e01fb5 Mon Sep 17 00:00:00 2001 From: Elijah Potter Date: Tue, 7 Jan 2025 09:38:45 -0700 Subject: [PATCH 15/15] feat(core): improved performance by enhancing cache localization --- harper-core/src/linting/oxford_comma.rs | 7 +- .../proper_noun_capitalization_linters.rs | 154 ++++++++---------- harper-core/src/patterns/mod.rs | 2 + harper-core/src/patterns/word_set.rs | 93 +++++++++++ 4 files changed, 168 insertions(+), 88 deletions(-) create mode 100644 harper-core/src/patterns/word_set.rs diff --git a/harper-core/src/linting/oxford_comma.rs b/harper-core/src/linting/oxford_comma.rs index a52a228c..94e79b82 100644 --- a/harper-core/src/linting/oxford_comma.rs +++ b/harper-core/src/linting/oxford_comma.rs @@ -1,5 +1,5 @@ use crate::{ - patterns::{EitherPattern, Pattern, SequencePattern}, + patterns::{Pattern, SequencePattern, WordSet}, Document, Token, TokenStringExt, }; @@ -21,10 +21,7 @@ impl OxfordComma { )) .then_noun_phrase() .then_whitespace() - .then(Box::new(EitherPattern::new(vec![ - Box::new(SequencePattern::aco("and")), - Box::new(SequencePattern::aco("or")), - ]))) + .then(Box::new(WordSet::all(&["and", "or"]))) .then_whitespace() .then_noun_phrase(), } diff --git a/harper-core/src/linting/proper_noun_capitalization_linters.rs b/harper-core/src/linting/proper_noun_capitalization_linters.rs index 0fabf9c9..ca597717 100644 --- a/harper-core/src/linting/proper_noun_capitalization_linters.rs +++ b/harper-core/src/linting/proper_noun_capitalization_linters.rs @@ -1,7 +1,7 @@ use super::PatternLinter; use super::{Lint, LintKind, Suggestion}; use crate::make_title_case; -use crate::patterns::{EitherPattern, IsNotTitleCase, Pattern, SequencePattern}; +use crate::patterns::{EitherPattern, IsNotTitleCase, Pattern, SequencePattern, WordSet}; use crate::FstDictionary; use crate::{Token, TokenStringExt}; use std::sync::Arc; @@ -62,10 +62,7 @@ macro_rules! create_linter_for { create_linter_for!( Americas, SequencePattern::default() - .then(Box::new(EitherPattern::new(vec![ - Box::new(SequencePattern::aco("South")), - Box::new(SequencePattern::aco("North")) - ]))) + .then(Box::new(WordSet::all(&["South", "North",]))) .then_whitespace() .t_aco("America"), "When referring to the continents, make sure to treat them as a proper noun." @@ -74,10 +71,7 @@ create_linter_for!( create_linter_for!( Koreas, SequencePattern::default() - .then(Box::new(EitherPattern::new(vec![ - Box::new(SequencePattern::aco("South")), - Box::new(SequencePattern::aco("North")) - ]))) + .then(Box::new(WordSet::all(&["South", "North",]))) .then_whitespace() .t_aco("Korea"), "When referring to the nations, make sure to treat them as a proper noun." @@ -119,25 +113,27 @@ create_linter_for!( Box::new( SequencePattern::default() .then(Box::new(EitherPattern::new(vec![ - Box::new(SequencePattern::aco("Presidents'")), - Box::new(SequencePattern::aco("Valentines")), - Box::new(SequencePattern::aco("Christmas")), - Box::new(SequencePattern::aco("Easter")), - Box::new(SequencePattern::aco("Flag")), - Box::new(SequencePattern::aco("Independence")), - Box::new(SequencePattern::aco("Mothers'")), - Box::new(SequencePattern::aco("New").t_aco("Years")), - Box::new(SequencePattern::aco("Fathers'")), - Box::new(SequencePattern::aco("Columbus")), - Box::new(SequencePattern::aco("Thanksgiving")), - Box::new(SequencePattern::aco("Memorial")), - Box::new(SequencePattern::aco("May")), - Box::new(SequencePattern::aco("Halloween")), - Box::new(SequencePattern::aco("Tax")), - Box::new(SequencePattern::aco("Parents")), - Box::new(SequencePattern::aco("Veterans")), - Box::new(SequencePattern::aco("Armistice")), - Box::new(SequencePattern::aco("Groundhog")), + Box::new(WordSet::all(&[ + "Presidents'", + "Valentines", + "Christmas", + "Easter", + "Flag", + "Independence", + "Mothers'", + "Years", + "Fathers'", + "Columbus", + "Thanksgiving", + "Memorial", + "May", + "Halloween", + "Tax", + "Parents", + "Veterans", + "Armistice", + "Groundhog" + ])), Box::new( SequencePattern::default() .t_aco("National") @@ -249,30 +245,30 @@ create_linter_for!( SequencePattern::default() .t_aco("Google") .then_whitespace() - .then(Box::new(EitherPattern::new(vec![ - Box::new(SequencePattern::aco("Search")), - Box::new(SequencePattern::aco("Cloud")), - Box::new(SequencePattern::aco("Maps")), - Box::new(SequencePattern::aco("Docs")), - Box::new(SequencePattern::aco("Sheets")), - Box::new(SequencePattern::aco("Slides")), - Box::new(SequencePattern::aco("Drive")), - Box::new(SequencePattern::aco("Meet")), - Box::new(SequencePattern::aco("Gmail")), - Box::new(SequencePattern::aco("Calendar")), - Box::new(SequencePattern::aco("Chrome")), - Box::new(SequencePattern::aco("ChromeOS")), - Box::new(SequencePattern::aco("Android")), - Box::new(SequencePattern::aco("Play")), - Box::new(SequencePattern::aco("Bard")), - Box::new(SequencePattern::aco("Gemini")), - Box::new(SequencePattern::aco("YouTube")), - Box::new(SequencePattern::aco("Photos")), - Box::new(SequencePattern::aco("Analytics")), - Box::new(SequencePattern::aco("AdSense")), - Box::new(SequencePattern::aco("Pixel")), - Box::new(SequencePattern::aco("Nest")), - Box::new(SequencePattern::aco("Workspace")) + .then(Box::new(WordSet::all(&[ + "Search", + "Cloud", + "Maps", + "Docs", + "Sheets", + "Slides", + "Drive", + "Meet", + "Gmail", + "Calendar", + "Chrome", + "ChromeOS", + "Android", + "Play", + "Bard", + "Gemini", + "YouTube", + "Photos", + "Analytics", + "AdSense", + "Pixel", + "Nest", + "Workspace", ]))), "When referring to Google products and services, make sure to treat them as proper nouns." ); @@ -357,20 +353,22 @@ create_linter_for!( .t_aco("Microsoft") .then_whitespace() .then(Box::new(EitherPattern::new(vec![ - Box::new(SequencePattern::aco("Windows")), - Box::new(SequencePattern::aco("Office")), - Box::new(SequencePattern::aco("Teams")), - Box::new(SequencePattern::aco("Excel")), - Box::new(SequencePattern::aco("PowerPoint")), - Box::new(SequencePattern::aco("Word")), - Box::new(SequencePattern::aco("Outlook")), - Box::new(SequencePattern::aco("OneDrive")), - Box::new(SequencePattern::aco("SharePoint")), - Box::new(SequencePattern::aco("Xbox")), - Box::new(SequencePattern::aco("Surface")), - Box::new(SequencePattern::aco("Edge")), - Box::new(SequencePattern::aco("Bing")), - Box::new(SequencePattern::aco("Dynamics")), + Box::new(WordSet::all(&[ + "Windows", + "Office", + "Teams", + "Excel", + "PowerPoint", + "Word", + "Outlook", + "OneDrive", + "SharePoint", + "Xbox", + "Surface", + "Edge", + "Bing", + "Dynamics", + ])), Box::new( SequencePattern::default() .t_aco("Visual") @@ -387,10 +385,10 @@ create_linter_for!( .t_aco("Apple") .then_whitespace() .then(Box::new(EitherPattern::new(vec![ - Box::new(SequencePattern::aco("iPhone")), - Box::new(SequencePattern::aco("iPad")), - Box::new(SequencePattern::aco("iMac")), - Box::new(SequencePattern::aco("MacBook")), + Box::new(WordSet::all(&[ + "iPhone", "iPad", "iMac", "MacBook", "Watch", "TV", "Music", "Arcade", "iCloud", + "Safari", "HomeKit", "CarPlay", + ])), Box::new( SequencePattern::aco("MacBook") .then_whitespace() @@ -414,14 +412,6 @@ create_linter_for!( .then_whitespace() .t_aco("Max") ), - Box::new(SequencePattern::aco("Watch")), - Box::new(SequencePattern::aco("TV")), - Box::new(SequencePattern::aco("Music")), - Box::new(SequencePattern::aco("Arcade")), - Box::new(SequencePattern::aco("iCloud")), - Box::new(SequencePattern::aco("Safari")), - Box::new(SequencePattern::aco("HomeKit")), - Box::new(SequencePattern::aco("CarPlay")), Box::new( SequencePattern::default() .t_aco("Vision") @@ -437,11 +427,9 @@ create_linter_for!( SequencePattern::aco("Meta") .then_whitespace() .then(Box::new(EitherPattern::new(vec![ - Box::new(SequencePattern::aco("Oculus")), - Box::new(SequencePattern::aco("Portals")), - Box::new(SequencePattern::aco("Quest")), - Box::new(SequencePattern::aco("Gaming")), - Box::new(SequencePattern::aco("Horizon")), + Box::new(WordSet::all(&[ + "Oculus", "Portals", "Quest", "Gaming", "Horizon", + ])), Box::new( SequencePattern::default() .t_aco("Reality") diff --git a/harper-core/src/patterns/mod.rs b/harper-core/src/patterns/mod.rs index ae50e215..ce2d042d 100644 --- a/harper-core/src/patterns/mod.rs +++ b/harper-core/src/patterns/mod.rs @@ -14,6 +14,7 @@ mod sequence_pattern; mod token_kind_pattern_group; mod whitespace_pattern; mod word_pattern_group; +mod word_set; pub use any_pattern::AnyPattern; use blanket::blanket; @@ -28,6 +29,7 @@ pub use sequence_pattern::SequencePattern; pub use token_kind_pattern_group::TokenKindPatternGroup; pub use whitespace_pattern::WhitespacePattern; pub use word_pattern_group::WordPatternGroup; +pub use word_set::WordSet; #[cfg(not(feature = "concurrent"))] #[blanket(derive(Rc, Arc))] diff --git a/harper-core/src/patterns/word_set.rs b/harper-core/src/patterns/word_set.rs new file mode 100644 index 00000000..65be9be5 --- /dev/null +++ b/harper-core/src/patterns/word_set.rs @@ -0,0 +1,93 @@ +use super::Pattern; +use smallvec::SmallVec; + +use crate::{CharString, Token}; + +// A [`Pattern`] that matches against any of a set of provided words. +// For small sets of short words, it doesn't allocate. +// +// Note that any capitalization of the contained words will result in a match. +#[derive(Debug, Default, Clone)] +pub struct WordSet { + words: SmallVec<[CharString; 4]>, +} + +impl WordSet { + pub fn add(&mut self, word: &str) { + let chars = word.chars().collect(); + + if !self.words.contains(&chars) { + self.words.push(chars); + } + } + + pub fn all(words: &[&'static str]) -> Self { + let mut set = Self::default(); + + for str in words { + set.add(str); + } + + set + } +} + +impl Pattern for WordSet { + fn matches(&self, tokens: &[Token], source: &[char]) -> usize { + let Some(tok) = tokens.first() else { + return 0; + }; + + if !tok.kind.is_word() { + return 0; + } + + let tok_chars = tok.span.get_content(source); + + for word in &self.words { + if tok_chars.len() != word.len() { + continue; + } + + let partial_match = tok_chars + .iter() + .zip(word) + .all(|(a, b)| a.to_ascii_lowercase() == b.to_ascii_lowercase()); + + if partial_match { + return 1; + } + } + + 0 + } +} + +#[cfg(test)] +mod tests { + use crate::{patterns::DocPattern, Document, Span}; + + use super::WordSet; + + #[test] + fn fruit() { + let set = WordSet::all(&["banana", "apple", "orange"]); + + let doc = Document::new_markdown_curated("I ate a banana and an apple today."); + + let matches = set.find_all_matches_in_doc(&doc); + + assert_eq!(matches, vec![Span::new(6, 7), Span::new(12, 13)]); + } + + #[test] + fn fruit_whack_capitalization() { + let set = WordSet::all(&["banana", "apple", "orange"]); + + let doc = Document::new_markdown_curated("I Ate A bAnaNa And aN apPlE today."); + + let matches = set.find_all_matches_in_doc(&doc); + + assert_eq!(matches, vec![Span::new(6, 7), Span::new(12, 13)]); + } +}