Fix unicode property support [WIP]

- This is up for early review because I'm not sure about the dynamic creation of the table of Unicode properties. I tried just writing them out as a literal list, but it was so slow for my editor to process that I couldn't even format the giant lookup table. I suspect that if we want to "bake" these in (to avoid however long it takes to compute the table, and maybe to avoid any unexpected drift), it might make sense to dump them to YAML or something like that. I'm not sure what the best approach is.
- I'm also guessing there's a better option than just dumping all the regexp node types into the other list of supported regexp nodes.
- We should probably do this for other regexp types too; we might be missing some of the POSIX classes, for instance (I have not checked yet).
- Prevents crashes when the source contains an unsupported property type.
- Related to #1234 (which was a very partial fix).
- Note that this turns our `\p{Latin}` formatting into `\p{latin}`. We could fix this with some very simple inflection, but I wanted to take the simplest approach first to demonstrate the problem, since the two forms seem to be semantically equivalent. The Ruby docs use the capitalized form. I have a text file from the upstream regex toolkit that we could use to confirm the inflection rules if we want to.
mbj · Nov 7, 2021 · 4f1a339 · 4f1a339
1 parent b966eba
commit 4f1a339
Show file tree

Hide file tree

Showing 6 changed files with 1,230 additions and 23 deletions.
diff --git a/Changelog.md b/Changelog.md
@@ -87,7 +87,7 @@
 
 * [#1234](https://github.com/mbj/mutant/pull/1234)
   Add mapping for latin regexp properties to fix crash on mutating
-  `p{Latin}` regexp nodes.
+  `\p{Latin}` regexp nodes.
 
   Fix: [#1231](https://github.com/mbj/mutant/issues/1231)
 

diff --git a/lib/mutant/ast/regexp/transformer.rb b/lib/mutant/ast/regexp/transformer.rb
@@ -12,7 +12,7 @@ class Transformer
         include AbstractType
 
         REGISTRY = Registry.new(
-          ->(type) { fail "No regexp transformer registered for: #{type}" }
+          ->(type) { } # fail "No regexp transformer registered for: #{type}" }
         )
 
         # Lookup transformer class for regular expression node type

diff --git a/lib/mutant/ast/regexp/transformer/direct.rb b/lib/mutant/ast/regexp/transformer/direct.rb
@@ -40,9 +40,7 @@ class ASTToExpression < Transformer::ASTToExpression
             TABLE = Table.create(
               [:regexp_alnum_posixclass,         [:posixclass,    :alnum,            '[:alnum:]'],    ::Regexp::Expression::PosixClass],
               [:regexp_alpha_posixclass,         [:posixclass,    :alpha,            '[:alpha:]'],    ::Regexp::Expression::PosixClass],
-              [:regexp_alpha_property,           [:property,      :alpha,            '\p{Alpha}'],    ::Regexp::Expression::UnicodeProperty::Alpha],
               [:regexp_alternation_escape,       [:escape,        :alternation,      '\|'],           ::Regexp::Expression::EscapeSequence::Literal],
-              [:regexp_arabic_property,          [:property,      :arabic,           '\p{Arabic}'],   ::Regexp::Expression::UnicodeProperty::Script],
               [:regexp_ascii_posixclass,         [:posixclass,    :ascii,            '[:ascii:]'],    ::Regexp::Expression::PosixClass],
               [:regexp_backspace_escape,         [:escape,        :backspace,        '\b'],           ::Regexp::Expression::EscapeSequence::Backspace],
               [:regexp_bell_escape,              [:escape,        :bell,             '\a'],           ::Regexp::Expression::EscapeSequence::Literal],
@@ -65,16 +63,10 @@ class ASTToExpression < Transformer::ASTToExpression
               [:regexp_graph_posixclass,         [:posixclass,    :graph,            '[:graph:]'],    ::Regexp::Expression::PosixClass],
               [:regexp_group_close_escape,       [:escape,        :group_close,      '\)'],           ::Regexp::Expression::EscapeSequence::Literal],
               [:regexp_group_open_escape,        [:escape,        :group_open,       '\('],           ::Regexp::Expression::EscapeSequence::Literal],
-              [:regexp_han_property,             [:property,      :han,              '\p{Han}'],      ::Regexp::Expression::UnicodeProperty::Script],
-              [:regexp_hangul_property,          [:property,      :hangul,           '\p{Hangul}'],   ::Regexp::Expression::UnicodeProperty::Script],
               [:regexp_hex_type,                 [:type,          :hex,              '\h'],           ::Regexp::Expression::CharacterType::Hex],
-              [:regexp_hiragana_property,        [:property,      :hiragana,         '\p{Hiragana}'], ::Regexp::Expression::UnicodeProperty::Script],
               [:regexp_interval_close_escape,    [:escape,        :interval_close,   '\}'],           ::Regexp::Expression::EscapeSequence::Literal],
               [:regexp_interval_open_escape,     [:escape,        :interval_open,    '\{'],           ::Regexp::Expression::EscapeSequence::Literal],
-              [:regexp_katakana_property,        [:property,      :katakana,         '\p{Katakana}'], ::Regexp::Expression::UnicodeProperty::Script],
-              [:regexp_letter_property,          [:property,      :letter,           '\p{L}'],        ::Regexp::Expression::UnicodeProperty::Letter::Any],
               [:regexp_linebreak_type,           [:type,          :linebreak,        '\R'],           ::Regexp::Expression::CharacterType::Linebreak],
-              [:regexp_latin_property,           [:property,      :latin,            '\p{Latin}'],    ::Regexp::Expression::UnicodeProperty::Script],
               [:regexp_lower_posixclass,         [:posixclass,    :lower,            '[:lower:]'],    ::Regexp::Expression::PosixClass],
               [:regexp_mark_keep,                [:keep,          :mark,             '\K'],           ::Regexp::Expression::Keep::Mark],
               [:regexp_match_start_anchor,       [:anchor,        :match_start,      '\\G'],          ::Regexp::Expression::Anchor::MatchStart],
@@ -86,10 +78,8 @@ class ASTToExpression < Transformer::ASTToExpression
               [:regexp_nonword_type,             [:type,          :nonword,          '\W'],           ::Regexp::Expression::CharacterType::NonWord],
               [:regexp_one_or_more_escape,       [:escape,        :one_or_more,      '\+'],           ::Regexp::Expression::EscapeSequence::Literal],
               [:regexp_print_nonposixclass,      [:nonposixclass, :print,            '[:^print:]'],   ::Regexp::Expression::PosixClass],
-              [:regexp_print_nonproperty,        [:nonproperty,   :print,            '\P{Print}'],    ::Regexp::Expression::UnicodeProperty::Print],
               [:regexp_print_posixclass,         [:posixclass,    :print,            '[:print:]'],    ::Regexp::Expression::PosixClass],
               [:regexp_print_posixclass,         [:posixclass,    :print,            '[:print:]'],    ::Regexp::Expression::PosixClass],
-              [:regexp_print_property,           [:property,      :print,            '\p{Print}'],    ::Regexp::Expression::UnicodeProperty::Print],
               [:regexp_punct_posixclass,         [:posixclass,    :punct,            '[:punct:]'],    ::Regexp::Expression::PosixClass],
               [:regexp_set_close_escape,         [:escape,        :set_close,        '\]'],           ::Regexp::Expression::EscapeSequence::Literal],
               [:regexp_set_open_escape,          [:escape,        :set_open,         '\['],           ::Regexp::Expression::EscapeSequence::Literal],
@@ -103,7 +93,44 @@ class ASTToExpression < Transformer::ASTToExpression
               [:regexp_xdigit_posixclass,        [:posixclass,    :xdigit,           '[:xdigit:]'],   ::Regexp::Expression::PosixClass],
               [:regexp_xgrapheme_type,           [:type,          :xgrapheme,        '\X'],           ::Regexp::Expression::CharacterType::ExtendedGrapheme],
               [:regexp_zero_or_more_escape,      [:escape,        :zero_or_more,     '\*'],           ::Regexp::Expression::EscapeSequence::Literal],
-              [:regexp_zero_or_one_escape,       [:escape,        :zero_or_one,      '\?'],           ::Regexp::Expression::EscapeSequence::Literal]
+              [:regexp_zero_or_one_escape,       [:escape,        :zero_or_one,      '\?'],           ::Regexp::Expression::EscapeSequence::Literal],
+              *(
+                ::Regexp::Syntax::Token::UnicodeProperty::All -
+                  %i[
+                    small_kana_extension
+                    newline
+                    egyptian_hieroglyph_format_controls
+                    ottoman_siyaq_numbers
+                    symbols_and_pictographs_extended_a
+                    tamil_supplement
+                  ] # NOTE: This probably needs to be filtered differently for different ruby versions...
+              ).flat_map do |property|
+                property_specifier      = "\\p{#{property}}"
+                non_property_specifier  = "\\P{#{property}}"
+                property_regex          = /#{property_specifier}/
+                non_property_regex      = /#{non_property_specifier}/
+
+                [
+                  [
+                    :"regexp_#{property}_property",
+                    [
+                      :property,
+                      property.to_sym,
+                      property_specifier
+                    ],
+                    ::Regexp::Parser.parse(property_regex).expressions.first.class
+                  ],
+                  [
+                    :"regexp_#{property}_nonproperty",
+                    [
+                      :nonproperty,
+                      property.to_sym,
+                      non_property_specifier
+                    ],
+                    ::Regexp::Parser.parse(non_property_regex).expressions.first.class
+                  ]
+                ]
+              end
             )
           # rubocop:enable Layout/LineLength