fix: restore commas in entries at parsing time

openfoodfacts · Jul 17, 2024 · 61ca542 · 61ca542
1 parent 83bbbe5
commit 61ca542
Showing 1 changed file with 12 additions and 5 deletions.
diff --git a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -101,7 +101,7 @@ def _file_iter(self, filename: str, start: int = 0) -> Iterator[tuple[int, str]]
                 line = line.rstrip()
                 # replace ’ (typographique quote) to simple quote '
                 line = line.replace("’", "'")
-                # replace commas that have no space around by a lower comma character
+                # replace commas directly between two digits (no space around) by a lower comma character
                 # and do the same for escaped comma (preceded by a \)
                 # (to distinguish them from commas acting as tags separators)
                 line = re.sub(r"(\d),(\d)", r"\1‚\2", line)
@@ -122,6 +122,13 @@ def _normalize_entry_id(self, raw_id: str) -> str:
         normalized_id = f"{lc}:{normalized_main_tag}"
         return normalized_id
 
+    def undo_normalize_text(self, text: str) -> str:
+        """Restore the commas that `_file_iter` replaced by lower comma characters ("‚")."""
+        # lookarounds don't consume the digits, so chained values like 1‚2‚3 are fully restored
+        text = re.sub(r"(?<=\d)‚(?=\d)", ",", text)
+        text = text.replace("\\‚", "\\,")
+        return text
+
     def _get_lc_value(self, line: str) -> tuple[str, list[str]]:
         """Get the language code "lc" and a list of normalized values"""
         lc, line = line.split(":", 1)
@@ -306,7 +313,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
                     # remove "stopwords:" part
                     line = line[10:]
                     # compute raw values outside _get_lc_value as it normalizes them!
-                    tags = [words.strip() for words in line[3:].split(",")]
+                    tags = [self.undo_normalize_text(words.strip()) for words in line[3:].split(",")]
                     try:
                         lc, value = self._get_lc_value(line)
                     except ValueError:
@@ -326,7 +333,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
                     # remove "synonyms:" part
                     line = line[9:]
                     # compute raw values outside _get_lc_value as it normalizes them!
-                    tags = [words.strip() for words in line[3:].split(",")]
+                    tags = [self.undo_normalize_text(words.strip()) for words in line[3:].split(",")]
                     try:
                         lc, value = self._get_lc_value(line)
                     except ValueError:
@@ -352,7 +359,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
                     tags_list = []
                     tagsids_list = []
                     for word in line.split(","):
-                        tags_list.append(word.strip())
+                        tags_list.append(self.undo_normalize_text(word.strip()))
                         word_normalized = normalize_text(word, lang, stopwords=self.stopwords)
                         if word_normalized not in tagsids_list:
                             # in case 2 normalized synonyms are the same
@@ -383,7 +390,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
                             )
                         if property_name:
                             prop_key = "prop_" + property_name + "_" + lc
-                            data.properties[prop_key] = property_value
+                            data.properties[prop_key] = self.undo_normalize_text(property_value)
                             data = self._get_node_data_with_comments_above_key(
                                 data, line_number, prop_key
                             )