From f4ba4d04c75fa078bd935ea200ee6baf0f675142 Mon Sep 17 00:00:00 2001 From: Elijah Potter Date: Thu, 2 Jan 2025 11:10:24 -0700 Subject: [PATCH] feat(core): wrote linter to look for separated closed compound words --- harper-core/src/linting/compound_words.rs | 147 ++++++++++++++++++ harper-core/src/linting/lint_group.rs | 7 +- harper-core/src/linting/matcher.rs | 81 ---------- harper-core/src/linting/mod.rs | 3 +- .../proper_noun_capitalization_linters.rs | 12 ++ 5 files changed, 166 insertions(+), 84 deletions(-) create mode 100644 harper-core/src/linting/compound_words.rs diff --git a/harper-core/src/linting/compound_words.rs b/harper-core/src/linting/compound_words.rs new file mode 100644 index 00000000..47d92533 --- /dev/null +++ b/harper-core/src/linting/compound_words.rs @@ -0,0 +1,147 @@ +use std::sync::Arc; + +use itertools::Itertools; + +use crate::{CharString, Dictionary, Document, FstDictionary, Span}; + +use super::{Lint, LintKind, Linter, Suggestion}; + +pub struct CompoundWords { + dict: Arc<FstDictionary>, +} + +impl CompoundWords { + pub fn new() -> Self { + Self { + dict: FstDictionary::curated(), + } + } +} + +impl Default for CompoundWords { + fn default() -> Self { + Self::new() + } +} + +impl Linter for CompoundWords { + fn lint(&mut self, document: &Document) -> Vec<Lint> { + let mut lints = Vec::new(); + + let mut merged_word = CharString::new(); + + for (a, w, b) in document.tokens().tuple_windows() { + if !a.kind.is_word() || !w.kind.is_whitespace() || !b.kind.is_word() { + continue; + } + + let a_chars = document.get_span_content(a.span); + let b_chars = document.get_span_content(b.span); + + // Not super helpful in this case, so we skip it + if matches!(a_chars, ['a']) { + continue; + } + + merged_word.clear(); + merged_word.extend_from_slice(a_chars); + merged_word.extend_from_slice(b_chars); + + if self.dict.contains_word(&merged_word) { + lints.push(Lint { + span: Span::new(a.span.start, b.span.end), + lint_kind: LintKind::Spelling, + suggestions: 
vec![Suggestion::ReplaceWith(merged_word.to_vec())], + message: "These two words are often combined to form a closed compound word." + .to_owned(), + priority: 63, + }); + } + } + + lints + } + + fn description(&self) -> &str { + "Accidentally inserting a space inside a word is common. This rule looks for valid words that are split by whitespace." + } +} + +#[cfg(test)] +mod tests { + use crate::linting::tests::assert_lint_count; + + use super::CompoundWords; + + #[test] + fn scarecrow() { + assert_lint_count( + "I saw a scare crow in the field today.", + CompoundWords::default(), + 1, + ); + } + + #[test] + fn clean() { + assert_lint_count( + "When referring to the political party, make sure to treat them as a proper noun.", + CompoundWords::default(), + 0, + ); + } + + #[test] + fn bookshelf() { + assert_lint_count( + "I have a big book shelf in my room.", + CompoundWords::default(), + 1, + ); + } + + #[test] + fn sunscreen() { + assert_lint_count( + "Don't forget to apply your sunscreen before going out.", + CompoundWords::default(), + 0, + ); + } + + #[test] + fn makeup() { + assert_lint_count( + "She spent a lot of time doing her make up this morning.", + CompoundWords::default(), + 1, + ); + } + + #[test] + fn birthday() { + assert_lint_count( + "We're having a big party to celebrate the couple's birthday today.", + CompoundWords::default(), + 0, + ); + } + + #[test] + fn hometown() { + assert_lint_count( + "My home town is a beautiful place with many historical land marks.", + CompoundWords::default(), + 2, + ); + } + + #[test] + fn assertions() { + assert_lint_count( + "Make sure to compile with debug ass ertions disabled.", + CompoundWords::default(), + 1, + ); + } +} diff --git a/harper-core/src/linting/lint_group.rs b/harper-core/src/linting/lint_group.rs index fd4489a6..b49dfec9 100644 --- a/harper-core/src/linting/lint_group.rs +++ b/harper-core/src/linting/lint_group.rs @@ -5,6 +5,7 @@ use super::an_a::AnA; use super::avoid_curses::AvoidCurses; use 
super::boring_words::BoringWords; use super::capitalize_personal_pronouns::CapitalizePersonalPronouns; +use super::compound_words::CompoundWords; use super::correct_number_suffix::CorrectNumberSuffix; use super::dot_initialisms::DotInitialisms; use super::ellipsis_length::EllipsisLength; @@ -15,7 +16,7 @@ use super::multiple_sequential_pronouns::MultipleSequentialPronouns; use super::number_suffix_capitalization::NumberSuffixCapitalization; use super::proper_noun_capitalization_linters::{ AmazonNames, Americas, AppleNames, AzureNames, ChineseCommunistParty, GoogleNames, Holidays, - MetaNames, MicrosoftNames, UnitedOrganizations, + Koreas, MetaNames, MicrosoftNames, UnitedOrganizations, }; use super::repeated_words::RepeatedWords; use super::sentence_capitalization::SentenceCapitalization; @@ -162,6 +163,7 @@ create_lint_group_config!( ThatWhich => true, CapitalizePersonalPronouns => true, Americas => true, + Koreas => true, ChineseCommunistParty => true, UnitedOrganizations => true, Holidays => true, @@ -170,7 +172,8 @@ create_lint_group_config!( MetaNames => true, MicrosoftNames => true, AppleNames => true, - AzureNames => true + AzureNames => true, + CompoundWords => true ); impl Default for LintGroup { diff --git a/harper-core/src/linting/matcher.rs b/harper-core/src/linting/matcher.rs index 3349bedb..cefd4c50 100644 --- a/harper-core/src/linting/matcher.rs +++ b/harper-core/src/linting/matcher.rs @@ -106,7 +106,6 @@ impl Matcher { "wellbeing" => "well-being", "hashtable" => "hash table", "hashmap" => "hash map", - "CCP" => "Chinese Communist Party", "dep" => "dependency", "deps" => "dependencies", "off","the","cuff" => "off-the-cuff", @@ -118,7 +117,6 @@ impl Matcher { "todo" => "to-do", "To-Do" => "To-do", "performing","this" => "perform this", - "united nations" => "United Nations", "mins" => "minutes", "min" => "minute", "min" => "minimum", @@ -139,10 +137,6 @@ impl Matcher { "There","fore" => "Therefore", "fatal","outcome" => "death", "geiger","counter" => 
"Geiger counter", - "veterans","day" => "Veterans Day", - "presidents","day" => "Presidents' Day", - "president's","day" => "Presidents' Day", - "valentines","day" => "Valentine's Day", "world","war","2" => "World War II", "World","war","ii" => "World War II", "world","War","ii" => "World War II", @@ -185,86 +179,11 @@ impl Matcher { "the","hing" => "the thing", "The","hing" => "The thing", "need","helps" => "need help", - "all","though" => "although", - "All","though" => "although", - "al","though" => "although", - "Al","though" => "although", "an","this" => "and this", "break","up" => "break-up", "case", "sensitive" => "case-sensitive", - "bare", "foot" => "barefoot", - "air", "port" => "airport", - "any", "body" => "anybody", - "every", "body" => "everybody", - "no", "body" => "nobody", - "some", "body" => "somebody", - "any", "one" => "anyone", - "every", "one" => "everyone", - "some", "one" => "someone", - "any", "thing" => "anything", - "every", "thing" => "everything", - "no", "thing" => "nothing", - "some", "thing" => "something", - "any", "where" => "anywhere", - "every", "where" => "everywhere", - "no", "where" => "nowhere", - "some", "where" => "somewhere", - "baby", "sit" => "babysit", - "back", "ground" => "background", - "bare", "foot" => "barefoot", - "base", "ball" => "baseball", - "basket", "ball" => "basketball", - "foot", "ball" => "football", - "bath", "room" => "bathroom", - "bed", "room" => "bedroom", - "black", "berry" => "blackberry", - "blue", "berry" => "blueberry", - "break", "fast" => "breakfast", - "can", "not" => "cannot", - "check", "out" => "checkout", - "cow", "boy" => "cowboy", - "day", "light" => "daylight", - "desk", "top" => "desktop", - "finger", "print" => "fingerprint", - "fire", "fly" => "firefly", - "fore", "ver" => "forever", - "gentle", "man" => "gentleman", - "grand", "mother" => "grandmother", - "grand", "father" => "grandfather", - "grand", "daughter" => "granddaughter", - "grape", "fruit" => "grapefruit", - "grass", 
"hopper" => "grasshopper", - "head", "quarters" => "headquarters", - "hand", "shake" => "handshake", - "in", "side" => "inside", - "key", "board" => "keyboard", - "lip", "stick" => "lipstick", - "mail", "box" => "mailbox", - "never", "theless" => "nevertheless", - "none", "theless" => "nonetheless", - "note", "book" => "notebook", - "ou", "tside" => "outside", - "pay", "day" => "payday", - "rail", "road" => "railroad", - "rain", "bow" => "rainbow", - "rain", "coat" => "raincoat", - "skate", "board" => "skateboard", - "smart", "phone" => "smartphone", - "snow", "ball" => "snowball", - "some", "times" => "sometimes", - "sun", "flower" => "sunflower", - "tooth", "brush" => "toothbrush", - "turn", "table" => "turntable", - "under", "cover" => "undercover", - "up", "stream" => "upstream", - "water", "fall" => "waterfall", - "water", "melon" => "watermelon", - "wee", "kend" => "weekend", - "with", "in" => "within", - "with", "out" => "without", "Tree", "sitter" => "Tree-sitter", "all", "of", "the" => "all the", - "an", "other" => "another", "not", "longer" => "no longer", "to", "towards" => "towards", "though", "process" => "thought process", diff --git a/harper-core/src/linting/mod.rs b/harper-core/src/linting/mod.rs index f06e548b..58217074 100644 --- a/harper-core/src/linting/mod.rs +++ b/harper-core/src/linting/mod.rs @@ -2,6 +2,7 @@ mod an_a; mod avoid_curses; mod boring_words; mod capitalize_personal_pronouns; +mod compound_words; mod correct_number_suffix; mod dot_initialisms; mod ellipsis_length; @@ -42,7 +43,7 @@ pub use number_suffix_capitalization::NumberSuffixCapitalization; pub use pattern_linter::PatternLinter; pub use proper_noun_capitalization_linters::{ AmazonNames, Americas, AppleNames, AzureNames, ChineseCommunistParty, GoogleNames, Holidays, - MetaNames, MicrosoftNames, UnitedOrganizations, + Koreas, MetaNames, MicrosoftNames, UnitedOrganizations, }; pub use repeated_words::RepeatedWords; pub use sentence_capitalization::SentenceCapitalization; diff 
--git a/harper-core/src/linting/proper_noun_capitalization_linters.rs b/harper-core/src/linting/proper_noun_capitalization_linters.rs index 4c10a59b..5b4c896c 100644 --- a/harper-core/src/linting/proper_noun_capitalization_linters.rs +++ b/harper-core/src/linting/proper_noun_capitalization_linters.rs @@ -71,6 +71,18 @@ create_linter_for!( "When referring to the continents, make sure to treat them as a proper noun." ); +create_linter_for!( + Koreas, + SequencePattern::default() + .then(Box::new(EitherPattern::new(vec![ + Box::new(SequencePattern::default().then_any_capitalization_of("South")), + Box::new(SequencePattern::default().then_any_capitalization_of("North")) + ]))) + .then_whitespace() + .then_any_capitalization_of("Korea"), + "When referring to the nations, make sure to treat them as a proper noun." +); + create_linter_for!( ChineseCommunistParty, SequencePattern::default()