Skip to content

Commit

Permalink
feat(Automattic#230): consolidate words separated by apostrophes into…
Browse files Browse the repository at this point in the history
… possessives or conjunctions
  • Loading branch information
grantlemons committed Dec 4, 2024
1 parent 2aad44f commit fb23a28
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 4 deletions.
51 changes: 47 additions & 4 deletions harper-core/src/parsers/typst.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
use itertools::Itertools;

use std::collections::VecDeque;
use typst_syntax::ast::{AstNode, Expr, Markup};

use super::{Parser, PlainEnglish};
use crate::{parsers::StrParser, Punctuation, Token, TokenKind, WordMetadata};
use crate::{
parsers::StrParser,
patterns::{PatternExt, SequencePattern},
ConjunctionData, Lrc, Punctuation, Span, Token, TokenKind, VecExt, WordMetadata,
};

/// A parser that wraps the [`PlainEnglish`] parser that allows one to parse
/// Typst files.
Expand Down Expand Up @@ -199,6 +203,13 @@ fn map_token(
}
}

thread_local! {
    /// Token pattern built from "any word", then "apostrophe", then "any word".
    ///
    /// Used by the Typst parser to locate word/apostrophe/word token runs so
    /// they can be merged into a single word token.
    static WORD_APOSTROPHE_WORD: Lrc<SequencePattern> = {
        let word_apostrophe_word = SequencePattern::default()
            .then_any_word()
            .then_apostrophe()
            .then_any_word();

        Lrc::new(word_apostrophe_word)
    };
}

impl Parser for Typst {
fn parse(&mut self, source: &[char]) -> Vec<Token> {
let mut english_parser = PlainEnglish;
Expand All @@ -210,11 +221,43 @@ impl Parser for Typst {

// NOTE: the range spits out __byte__ indices, not char indices.
// This is why we keep track above.
typst_tree
let mut tokens = typst_tree
.exprs()
.filter_map(|ex| map_token(ex, &typst_document, &mut english_parser))
.flatten()
.collect_vec()
.collect_vec();

// Consolidate `word ' word` token runs into a single word token.
//
// A run whose trailing word is exactly "s" is treated as a possessive;
// any other run is tagged with `ConjunctionData` (the tag this parser
// uses for apostrophe-joined words). The merged token keeps the first
// token's slot and spans from the start of the first word to the end of
// the last one; the remaining tokens of the run are queued for removal.
let mut to_remove = VecDeque::default();
for tok_span in WORD_APOSTROPHE_WORD
    .with(|v| v.clone())
    .find_all_matches(&tokens, source)
{
    let start_tok = &tokens[tok_span.start];
    let end_tok = &tokens[tok_span.end - 1];
    let char_span = Span::new(start_tok.span.start, end_tok.span.end);

    // `metadata` is a by-value copy of the token's metadata, so every
    // change made to it must be stored back into the token explicitly.
    if let TokenKind::Word(mut metadata) = start_tok.kind {
        if end_tok.span.get_content(source) == &['s'] {
            // Mutate the noun data in place. Binding it by value (as in
            // `Some(mut noun)`) would only update a temporary and the
            // possessive flag would be lost.
            if let Some(noun) = metadata.noun.as_mut() {
                noun.is_possessive = Some(true);
            }
        } else {
            metadata.conjunction = Some(ConjunctionData {});
        }

        tokens[tok_span.start].kind = TokenKind::Word(metadata);
        tokens[tok_span.start].span = char_span;
        to_remove.extend(tok_span.start + 1..tok_span.end);
    } else {
        // The pattern starts with `then_any_word`, so a non-word first
        // token means the matcher's invariant was broken.
        panic!("Apostrophe consolidation does not start with Word Token!")
    }
}
// Indices are sorted and de-duplicated before removal so that a token
// queued more than once is only removed once.
tokens.remove_indices(to_remove.into_iter().sorted().unique().collect());

tokens
}
}

Expand Down
1 change: 1 addition & 0 deletions harper-core/src/patterns/sequence_pattern.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ impl SequencePattern {
gen_then_from_is!(case_separator);
gen_then_from_is!(adverb);
gen_then_from_is!(adjective);
gen_then_from_is!(apostrophe);

pub fn then_exact_word(mut self, word: &'static str) -> Self {
self.token_patterns
Expand Down

0 comments on commit fb23a28

Please sign in to comment.