Skip to content

Commit

Permalink
feat(core): wrote linter to look for separated closed compound words
Browse files Browse the repository at this point in the history
  • Loading branch information
elijah-potter committed Jan 2, 2025
1 parent 9f6737b commit f4ba4d0
Show file tree
Hide file tree
Showing 5 changed files with 166 additions and 84 deletions.
147 changes: 147 additions & 0 deletions harper-core/src/linting/compound_words.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
use std::sync::Arc;

use itertools::Itertools;

use crate::{CharString, Dictionary, Document, FstDictionary, Span};

use super::{Lint, LintKind, Linter, Suggestion};

/// A linter that looks for two words separated by whitespace whose
/// concatenation is itself a dictionary word (e.g. "scare crow" ->
/// "scarecrow").
pub struct CompoundWords {
/// Dictionary consulted to decide whether a merged candidate is a known word.
dict: Arc<FstDictionary>,
}

impl CompoundWords {
    /// Creates the linter, backed by the dictionary returned from
    /// `FstDictionary::curated()`.
    pub fn new() -> Self {
        let dict = FstDictionary::curated();
        Self { dict }
    }
}

impl Default for CompoundWords {
fn default() -> Self {
Self::new()
}
}

impl Linter for CompoundWords {
    /// Walks every `word, whitespace, word` token triple in `document` and
    /// emits a lint when the two words, joined without the whitespace, form a
    /// word the dictionary contains.
    ///
    /// Each lint spans from the start of the first word to the end of the
    /// second and suggests replacing that span with the merged word.
    fn lint(&mut self, document: &Document) -> Vec<Lint> {
        let mut lints = Vec::new();

        // Scratch buffer, cleared and refilled for each candidate pair.
        let mut merged_word = CharString::new();

        for (a, w, b) in document.tokens().tuple_windows() {
            // Only consider exactly: word, whitespace, word.
            if !a.kind.is_word() || !w.kind.is_whitespace() || !b.kind.is_word() {
                continue;
            }

            let a_chars = document.get_span_content(a.span);
            let b_chars = document.get_span_content(b.span);

            // Merging the article "a" with the following word is not a
            // helpful suggestion, so we skip it. The capitalized form is
            // skipped as well so that a sentence-initial "A" is treated the
            // same way as a mid-sentence "a".
            if matches!(a_chars, ['a'] | ['A']) {
                continue;
            }

            merged_word.clear();
            merged_word.extend_from_slice(a_chars);
            merged_word.extend_from_slice(b_chars);

            if self.dict.contains_word(&merged_word) {
                lints.push(Lint {
                    span: Span::new(a.span.start, b.span.end),
                    lint_kind: LintKind::Spelling,
                    suggestions: vec![Suggestion::ReplaceWith(merged_word.to_vec())],
                    message: "These two words are often combined to form a closed compound word."
                        .to_owned(),
                    priority: 63,
                });
            }
        }

        lints
    }

    /// Returns the human-readable description of this rule.
    fn description(&self) -> &str {
        "Accidentally inserting a space inside a word is common. This rule looks for valid words that are split by whitespace."
    }
}

#[cfg(test)]
mod tests {
    use super::CompoundWords;
    use crate::linting::tests::assert_lint_count;

    /// Forwards `$text` and the expected lint count `$count` to
    /// `assert_lint_count`, using a default-constructed `CompoundWords`.
    macro_rules! check {
        ($text:expr, $count:expr) => {
            assert_lint_count($text, CompoundWords::default(), $count)
        };
    }

    // "scare crow" should be reported once.
    #[test]
    fn scarecrow() {
        check!("I saw a scare crow in the field today.", 1);
    }

    // A sentence with no split compounds should produce nothing.
    #[test]
    fn clean() {
        check!(
            "When referring to the political party, make sure to treat them as a proper noun.",
            0
        );
    }

    // "book shelf" should be reported once.
    #[test]
    fn bookshelf() {
        check!("I have a big book shelf in my room.", 1);
    }

    // An already-closed compound should not be reported.
    #[test]
    fn sunscreen() {
        check!("Don't forget to apply your sunscreen before going out.", 0);
    }

    // "make up" should be reported once.
    #[test]
    fn makeup() {
        check!("She spent a lot of time doing her make up this morning.", 1);
    }

    // An already-closed compound should not be reported.
    #[test]
    fn birthday() {
        check!(
            "We're having a big party to celebrate the couple's birthday today.",
            0
        );
    }

    // Two separate split compounds ("home town", "land marks") in one sentence.
    #[test]
    fn hometown() {
        check!(
            "My home town is a beautiful place with many historical land marks.",
            2
        );
    }

    // A word split at a non-compound boundary ("ass ertions") is still reported.
    #[test]
    fn assertions() {
        check!("Make sure to compile with debug ass ertions disabled.", 1);
    }
}
7 changes: 5 additions & 2 deletions harper-core/src/linting/lint_group.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use super::an_a::AnA;
use super::avoid_curses::AvoidCurses;
use super::boring_words::BoringWords;
use super::capitalize_personal_pronouns::CapitalizePersonalPronouns;
use super::compound_words::CompoundWords;
use super::correct_number_suffix::CorrectNumberSuffix;
use super::dot_initialisms::DotInitialisms;
use super::ellipsis_length::EllipsisLength;
Expand All @@ -15,7 +16,7 @@ use super::multiple_sequential_pronouns::MultipleSequentialPronouns;
use super::number_suffix_capitalization::NumberSuffixCapitalization;
use super::proper_noun_capitalization_linters::{
AmazonNames, Americas, AppleNames, AzureNames, ChineseCommunistParty, GoogleNames, Holidays,
MetaNames, MicrosoftNames, UnitedOrganizations,
Koreas, MetaNames, MicrosoftNames, UnitedOrganizations,
};
use super::repeated_words::RepeatedWords;
use super::sentence_capitalization::SentenceCapitalization;
Expand Down Expand Up @@ -162,6 +163,7 @@ create_lint_group_config!(
ThatWhich => true,
CapitalizePersonalPronouns => true,
Americas => true,
Koreas => true,
ChineseCommunistParty => true,
UnitedOrganizations => true,
Holidays => true,
Expand All @@ -170,7 +172,8 @@ create_lint_group_config!(
MetaNames => true,
MicrosoftNames => true,
AppleNames => true,
AzureNames => true
AzureNames => true,
CompoundWords => true
);

impl<T: Dictionary + Default> Default for LintGroup<T> {
Expand Down
81 changes: 0 additions & 81 deletions harper-core/src/linting/matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,6 @@ impl Matcher {
"wellbeing" => "well-being",
"hashtable" => "hash table",
"hashmap" => "hash map",
"CCP" => "Chinese Communist Party",
"dep" => "dependency",
"deps" => "dependencies",
"off","the","cuff" => "off-the-cuff",
Expand All @@ -118,7 +117,6 @@ impl Matcher {
"todo" => "to-do",
"To-Do" => "To-do",
"performing","this" => "perform this",
"united nations" => "United Nations",
"mins" => "minutes",
"min" => "minute",
"min" => "minimum",
Expand All @@ -139,10 +137,6 @@ impl Matcher {
"There","fore" => "Therefore",
"fatal","outcome" => "death",
"geiger","counter" => "Geiger counter",
"veterans","day" => "Veterans Day",
"presidents","day" => "Presidents' Day",
"president's","day" => "Presidents' Day",
"valentines","day" => "Valentine's Day",
"world","war","2" => "World War II",
"World","war","ii" => "World War II",
"world","War","ii" => "World War II",
Expand Down Expand Up @@ -185,86 +179,11 @@ impl Matcher {
"the","hing" => "the thing",
"The","hing" => "The thing",
"need","helps" => "need help",
"all","though" => "although",
"All","though" => "although",
"al","though" => "although",
"Al","though" => "although",
"an","this" => "and this",
"break","up" => "break-up",
"case", "sensitive" => "case-sensitive",
"bare", "foot" => "barefoot",
"air", "port" => "airport",
"any", "body" => "anybody",
"every", "body" => "everybody",
"no", "body" => "nobody",
"some", "body" => "somebody",
"any", "one" => "anyone",
"every", "one" => "everyone",
"some", "one" => "someone",
"any", "thing" => "anything",
"every", "thing" => "everything",
"no", "thing" => "nothing",
"some", "thing" => "something",
"any", "where" => "anywhere",
"every", "where" => "everywhere",
"no", "where" => "nowhere",
"some", "where" => "somewhere",
"baby", "sit" => "babysit",
"back", "ground" => "background",
"bare", "foot" => "barefoot",
"base", "ball" => "baseball",
"basket", "ball" => "basketball",
"foot", "ball" => "football",
"bath", "room" => "bathroom",
"bed", "room" => "bedroom",
"black", "berry" => "blackberry",
"blue", "berry" => "blueberry",
"break", "fast" => "breakfast",
"can", "not" => "cannot",
"check", "out" => "checkout",
"cow", "boy" => "cowboy",
"day", "light" => "daylight",
"desk", "top" => "desktop",
"finger", "print" => "fingerprint",
"fire", "fly" => "firefly",
"fore", "ver" => "forever",
"gentle", "man" => "gentleman",
"grand", "mother" => "grandmother",
"grand", "father" => "grandfather",
"grand", "daughter" => "granddaughter",
"grape", "fruit" => "grapefruit",
"grass", "hopper" => "grasshopper",
"head", "quarters" => "headquarters",
"hand", "shake" => "handshake",
"in", "side" => "inside",
"key", "board" => "keyboard",
"lip", "stick" => "lipstick",
"mail", "box" => "mailbox",
"never", "theless" => "nevertheless",
"none", "theless" => "nonetheless",
"note", "book" => "notebook",
"ou", "tside" => "outside",
"pay", "day" => "payday",
"rail", "road" => "railroad",
"rain", "bow" => "rainbow",
"rain", "coat" => "raincoat",
"skate", "board" => "skateboard",
"smart", "phone" => "smartphone",
"snow", "ball" => "snowball",
"some", "times" => "sometimes",
"sun", "flower" => "sunflower",
"tooth", "brush" => "toothbrush",
"turn", "table" => "turntable",
"under", "cover" => "undercover",
"up", "stream" => "upstream",
"water", "fall" => "waterfall",
"water", "melon" => "watermelon",
"wee", "kend" => "weekend",
"with", "in" => "within",
"with", "out" => "without",
"Tree", "sitter" => "Tree-sitter",
"all", "of", "the" => "all the",
"an", "other" => "another",
"not", "longer" => "no longer",
"to", "towards" => "towards",
"though", "process" => "thought process",
Expand Down
3 changes: 2 additions & 1 deletion harper-core/src/linting/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ mod an_a;
mod avoid_curses;
mod boring_words;
mod capitalize_personal_pronouns;
mod compound_words;
mod correct_number_suffix;
mod dot_initialisms;
mod ellipsis_length;
Expand Down Expand Up @@ -42,7 +43,7 @@ pub use number_suffix_capitalization::NumberSuffixCapitalization;
pub use pattern_linter::PatternLinter;
pub use proper_noun_capitalization_linters::{
AmazonNames, Americas, AppleNames, AzureNames, ChineseCommunistParty, GoogleNames, Holidays,
MetaNames, MicrosoftNames, UnitedOrganizations,
Koreas, MetaNames, MicrosoftNames, UnitedOrganizations,
};
pub use repeated_words::RepeatedWords;
pub use sentence_capitalization::SentenceCapitalization;
Expand Down
12 changes: 12 additions & 0 deletions harper-core/src/linting/proper_noun_capitalization_linters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,18 @@ create_linter_for!(
"When referring to the continents, make sure to treat them as a proper noun."
);

// Declares the `Koreas` linter through `create_linter_for!` (the macro is
// defined outside this view). The pattern is: either "South" or "North",
// then whitespace, then "Korea". The `then_any_capitalization_of` steps
// presumably accept any casing of the given word - confirm against
// `SequencePattern`. The trailing string is presumably the message attached
// to the resulting lint - confirm against the macro definition.
create_linter_for!(
Koreas,
SequencePattern::default()
.then(Box::new(EitherPattern::new(vec![
Box::new(SequencePattern::default().then_any_capitalization_of("South")),
Box::new(SequencePattern::default().then_any_capitalization_of("North"))
])))
.then_whitespace()
.then_any_capitalization_of("Korea"),
"When referring to the nations, make sure to treat them as a proper noun."
);

create_linter_for!(
ChineseCommunistParty,
SequencePattern::default()
Expand Down

0 comments on commit f4ba4d0

Please sign in to comment.