diff --git a/data/generate_constants.rb b/data/generate_constants.rb index b85f4b0..8c20f5d 100644 --- a/data/generate_constants.rb +++ b/data/generate_constants.rb @@ -4,125 +4,15 @@ include Unicode::Emoji -pack = ->(ord){ Regexp.escape(Array(ord).pack("U*")) } -join = -> (*strings){ "(?:" + strings.join("|") + ")" } -pack_and_join = ->(ords){ join[*ords.map{ |ord| pack[ord] }] } - -emoji_character = pack_and_join[EMOJI_CHAR] -emoji_modifier = pack_and_join[EMOJI_MODIFIERS] -emoji_modifier_base = pack_and_join[EMOJI_MODIFIER_BASES] -emoji_component = pack_and_join[EMOJI_COMPONENT] -emoji_presentation = pack_and_join[EMOJI_PRESENTATION] -picto = pack_and_join[EXTENDED_PICTOGRAPHIC] -picto_no_emoji = pack_and_join[EXTENDED_PICTOGRAPHIC_NO_EMOJI] - -emoji_presentation_sequence = \ - join[ - pack_and_join[TEXT_PRESENTATION] + pack[EMOJI_VARIATION_SELECTOR], - emoji_presentation + "(?!" + pack[TEXT_VARIATION_SELECTOR] + ")" + pack[EMOJI_VARIATION_SELECTOR] + "?", - ] - -non_component_emoji_presentation_sequence = \ - "(?!" + emoji_component + ")" + emoji_presentation_sequence - -text_presentation_sequence = \ - join[ - pack_and_join[TEXT_PRESENTATION]+ "(?!" 
+ join[emoji_modifier, pack[EMOJI_VARIATION_SELECTOR]] + ")" + pack[TEXT_VARIATION_SELECTOR] + "?", - emoji_presentation + pack[TEXT_VARIATION_SELECTOR] - ] - -emoji_modifier_sequence = \ - emoji_modifier_base + emoji_modifier - -emoji_keycap_sequence = \ - pack_and_join[EMOJI_KEYCAPS] + pack[[EMOJI_VARIATION_SELECTOR, EMOJI_KEYCAP_SUFFIX]] - -emoji_valid_flag_sequence = \ - pack_and_join[VALID_REGION_FLAGS] - -emoji_well_formed_flag_sequence = \ - "(?:" + - pack_and_join[REGIONAL_INDICATORS] + - pack_and_join[REGIONAL_INDICATORS] + - ")" - -emoji_valid_core_sequence = \ - join[ - # emoji_character, - emoji_keycap_sequence, - emoji_modifier_sequence, - non_component_emoji_presentation_sequence, - emoji_valid_flag_sequence, - ] - -emoji_well_formed_core_sequence = \ - join[ - # emoji_character, - emoji_keycap_sequence, - emoji_modifier_sequence, - non_component_emoji_presentation_sequence, - emoji_well_formed_flag_sequence, - ] - -emoji_rgi_tag_sequence = \ - pack_and_join[RECOMMENDED_SUBDIVISION_FLAGS] - -emoji_valid_tag_sequence = \ - "(?:" + - pack[EMOJI_TAG_BASE_FLAG] + - "(?:" + VALID_SUBDIVISIONS.map{ |sd| Regexp.escape(sd.tr("\u{20}-\u{7E}", "\u{E0020}-\u{E007E}"))}.join("|") + ")" + - pack[CANCEL_TAG] + - ")" - -emoji_well_formed_tag_sequence = \ - "(?:" + - join[ - non_component_emoji_presentation_sequence, - emoji_modifier_sequence, - ] + - pack_and_join[TAGS] + "+" + - pack[CANCEL_TAG] + - ")" - -emoji_rgi_zwj_sequence = \ - pack_and_join[RECOMMENDED_ZWJ_SEQUENCES] - -emoji_valid_zwj_element = \ - join[ - emoji_modifier_sequence, - emoji_presentation_sequence, - emoji_character, - ] - -emoji_valid_zwj_sequence = \ - "(?:" + - "(?:" + emoji_valid_zwj_element + pack[ZWJ] + ")+" + emoji_valid_zwj_element + - ")" - -emoji_rgi_sequence = \ - join[ - emoji_rgi_zwj_sequence, - emoji_rgi_tag_sequence, - emoji_valid_core_sequence, - ] - -emoji_valid_sequence = \ - join[ - emoji_valid_zwj_sequence, - emoji_valid_tag_sequence, - emoji_valid_core_sequence, - ] - 
-emoji_well_formed_sequence = \ - join[ - emoji_valid_zwj_sequence, - emoji_well_formed_tag_sequence, - emoji_well_formed_core_sequence, - ] - -def write_regex(const_name, regex) +def write_regexes(regexes, dirpath) + regexes.each do |const_name, regex| + write_regex(const_name, regex, dirpath) + end +end + +def write_regex(const_name, regex, dirpath) filename = const_name.to_s.downcase - filepath = File.expand_path("../lib/unicode/emoji/generated/#{filename}.rb", __dir__) + filepath = File.join(dirpath, "#{filename}.rb") File.write(filepath, <<~CONTENT) # This file was generated. Please, do not edit this file by hand. @@ -135,54 +25,181 @@ module Emoji puts "#{const_name} written to #{filepath}" end -# Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi) -REGEX = Regexp.compile(emoji_rgi_sequence) -write_regex(:REGEX, REGEX) +def pack(ord) + Regexp.escape(Array(ord).pack("U*")) +end -# Matches basic singleton emoji and all kind of valid sequences -REGEX_VALID = Regexp.compile(emoji_valid_sequence) -write_regex(:REGEX_VALID, REGEX_VALID) +def join(*strings) + "(?:" + strings.join("|") + ")" +end -# Matches basic singleton emoji and all kind of sequences -REGEX_WELL_FORMED = Regexp.compile(emoji_well_formed_sequence) -write_regex(:REGEX_WELL_FORMED, REGEX_WELL_FORMED) +def pack_and_join(ords) + join(*ords.map{ |ord| pack(ord) }) +end -# Matches only basic single, non-textual emoji -# Ignores "components" like modifiers or simple digits -REGEX_BASIC = Regexp.compile( - "(?!" + emoji_component + ")" + emoji_presentation_sequence -) -write_regex(:REGEX_BASIC, REGEX_BASIC) +def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, picto:, picto_no_emoji:) + emoji_presentation_sequence = \ + join( + pack_and_join(TEXT_PRESENTATION) + pack(EMOJI_VARIATION_SELECTOR), + emoji_presentation + "(?!" 
+ pack(TEXT_VARIATION_SELECTOR) + ")" + pack(EMOJI_VARIATION_SELECTOR) + "?", + ) + + non_component_emoji_presentation_sequence = \ + "(?!" + emoji_component + ")" + emoji_presentation_sequence + + text_presentation_sequence = \ + join( + pack_and_join(TEXT_PRESENTATION)+ "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?", + emoji_presentation + pack(TEXT_VARIATION_SELECTOR) + ) + + emoji_modifier_sequence = \ + emoji_modifier_base + emoji_modifier + + emoji_keycap_sequence = \ + pack_and_join(EMOJI_KEYCAPS) + pack([EMOJI_VARIATION_SELECTOR, EMOJI_KEYCAP_SUFFIX]) + + emoji_valid_flag_sequence = \ + pack_and_join(VALID_REGION_FLAGS) + + emoji_well_formed_flag_sequence = \ + "(?:" + + pack_and_join(REGIONAL_INDICATORS) + + pack_and_join(REGIONAL_INDICATORS) + + ")" + + emoji_valid_core_sequence = \ + join( + # emoji_character, + emoji_keycap_sequence, + emoji_modifier_sequence, + non_component_emoji_presentation_sequence, + emoji_valid_flag_sequence, + ) -# Matches only basic single, textual emoji -# Ignores "components" like modifiers or simple digits -REGEX_TEXT = Regexp.compile( - "(?!" 
+ emoji_component + ")" + text_presentation_sequence -) -write_regex(:REGEX_TEXT, REGEX_TEXT) + emoji_well_formed_core_sequence = \ + join( + # emoji_character, + emoji_keycap_sequence, + emoji_modifier_sequence, + non_component_emoji_presentation_sequence, + emoji_well_formed_flag_sequence, + ) + + emoji_rgi_tag_sequence = \ + pack_and_join(RECOMMENDED_SUBDIVISION_FLAGS) + + emoji_valid_tag_sequence = \ + "(?:" + + pack(EMOJI_TAG_BASE_FLAG) + + "(?:" + VALID_SUBDIVISIONS.map{ |sd| Regexp.escape(sd.tr("\u{20}-\u{7E}", "\u{E0020}-\u{E007E}"))}.join("|") + ")" + + pack(CANCEL_TAG) + + ")" + + emoji_well_formed_tag_sequence = \ + "(?:" + + join( + non_component_emoji_presentation_sequence, + emoji_modifier_sequence, + ) + + pack_and_join(TAGS) + "+" + + pack(CANCEL_TAG) + + ")" + + emoji_rgi_zwj_sequence = \ + pack_and_join(RECOMMENDED_ZWJ_SEQUENCES) + + emoji_valid_zwj_element = \ + join( + emoji_modifier_sequence, + emoji_presentation_sequence, + emoji_character, + ) -# Matches any emoji-related codepoint - Use with caution (returns partil matches) -REGEX_ANY = Regexp.compile( - emoji_character -) -write_regex(:REGEX_ANY, REGEX_ANY) + emoji_valid_zwj_sequence = \ + "(?:" + + "(?:" + emoji_valid_zwj_element + pack(ZWJ) + ")+" + emoji_valid_zwj_element + + ")" -# Combined REGEXes which also match for TEXTUAL emoji -REGEX_INCLUDE_TEXT = Regexp.union(REGEX, REGEX_TEXT) -write_regex(:REGEX_INCLUDE_TEXT, REGEX_INCLUDE_TEXT) + emoji_rgi_sequence = \ + join( + emoji_rgi_zwj_sequence, + emoji_rgi_tag_sequence, + emoji_valid_core_sequence, + ) -REGEX_VALID_INCLUDE_TEXT = Regexp.union(REGEX_VALID, REGEX_TEXT) -write_regex(:REGEX_VALID_INCLUDE_TEXT, REGEX_VALID_INCLUDE_TEXT) + emoji_valid_sequence = \ + join( + emoji_valid_zwj_sequence, + emoji_valid_tag_sequence, + emoji_valid_core_sequence, + ) -REGEX_WELL_FORMED_INCLUDE_TEXT = Regexp.union(REGEX_WELL_FORMED, REGEX_TEXT) -write_regex(:REGEX_WELL_FORMED_INCLUDE_TEXT, REGEX_WELL_FORMED_INCLUDE_TEXT) + emoji_well_formed_sequence 
= \ + join( + emoji_valid_zwj_sequence, + emoji_well_formed_tag_sequence, + emoji_well_formed_core_sequence, + ) -REGEX_PICTO = Regexp.compile( - picto -) -write_regex(:REGEX_PICTO, REGEX_PICTO) + regexes = {} + + # Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi) + regexes[:REGEX] = Regexp.compile(emoji_rgi_sequence) + + # Matches basic singleton emoji and all kind of valid sequences + regexes[:REGEX_VALID] = Regexp.compile(emoji_valid_sequence) + + # Matches basic singleton emoji and all kind of sequences + regexes[:REGEX_WELL_FORMED] = Regexp.compile(emoji_well_formed_sequence) + + # Matches only basic single, non-textual emoji + # Ignores "components" like modifiers or simple digits + regexes[:REGEX_BASIC] = Regexp.compile( + "(?!" + emoji_component + ")" + emoji_presentation_sequence + ) + + # Matches only basic single, textual emoji + # Ignores "components" like modifiers or simple digits + regexes[:REGEX_TEXT] = Regexp.compile( + "(?!" 
+ emoji_component + ")" + text_presentation_sequence + ) + + # Matches any emoji-related codepoint - Use with caution (returns partial matches) + regexes[:REGEX_ANY] = Regexp.compile(emoji_character) + + # Combined REGEXes which also match for TEXTUAL emoji + regexes[:REGEX_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX], regexes[:REGEX_TEXT]) -REGEX_PICTO_NO_EMOJI = Regexp.compile( - picto_no_emoji + regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_VALID], regexes[:REGEX_TEXT]) + + regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_WELL_FORMED], regexes[:REGEX_TEXT]) + + regexes[:REGEX_PICTO] = Regexp.compile(picto) + + regexes[:REGEX_PICTO_NO_EMOJI] = Regexp.compile(picto_no_emoji) + + regexes +end + +regexes = compile( + emoji_character: pack_and_join(EMOJI_CHAR), + emoji_modifier: pack_and_join(EMOJI_MODIFIERS), + emoji_modifier_base: pack_and_join(EMOJI_MODIFIER_BASES), + emoji_component: pack_and_join(EMOJI_COMPONENT), + emoji_presentation: pack_and_join(EMOJI_PRESENTATION), + picto: pack_and_join(EXTENDED_PICTOGRAPHIC), + picto_no_emoji: pack_and_join(EXTENDED_PICTOGRAPHIC_NO_EMOJI) +) +write_regexes(regexes, File.expand_path("../lib/unicode/emoji/generated", __dir__)) + +native_regexes = compile( + emoji_character: "\\p{Emoji}", + emoji_modifier: "\\p{Emoji Modifier}", + emoji_modifier_base: "\\p{Emoji Modifier Base}", + emoji_component: "\\p{Emoji Component}", + emoji_presentation: "\\p{Emoji Presentation}", + picto: "\\p{Extended Pictographic}", + picto_no_emoji: "\\p{Extended Pictographic}(?