Skip to content

Commit

Permalink
fix: restore commas in entries at parsing time
Browse files Browse the repository at this point in the history
  • Loading branch information
alexgarel committed Jul 17, 2024
1 parent 83bbbe5 commit 61ca542
Showing 1 changed file with 12 additions and 5 deletions.
17 changes: 12 additions & 5 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def _file_iter(self, filename: str, start: int = 0) -> Iterator[tuple[int, str]]
line = line.rstrip()
# replace ’ (typographic quote) with a simple quote '
line = line.replace("’", "'")
# replace commas that have no space around by a lower comma character
# replace commas that sit directly between two digits (no space around) with a low comma character (‚)
# and do the same for escaped comma (preceded by a \)
# (to distinguish them from commas acting as tags separators)
line = re.sub(r"(\d),(\d)", r"\1‚\2", line)
Expand All @@ -122,6 +122,13 @@ def _normalize_entry_id(self, raw_id: str) -> str:
normalized_id = f"{lc}:{normalized_main_tag}"
return normalized_id

def undo_normalize_text(self, text: str) -> str:
    """Undo the comma normalizations made in `_file_iter`.

    `_file_iter` swaps commas that sit directly between two digits, as
    well as escaped commas (preceded by a backslash), for a low comma
    character so they are not mistaken for tag separators. This method
    turns those low commas back into regular commas.

    Args:
        text: a raw value taken from a normalized line.

    Returns:
        The text with digit-surrounded and backslash-escaped low commas
        restored to ``,``. Any other low comma is left untouched.
    """
    # Zero-width lookarounds keep the neighbouring digits out of the match,
    # so every low comma in a chain such as "1‚2‚3" is restored. With
    # capturing groups the shared middle digit is consumed by the first
    # match and the second low comma would be skipped.
    text = re.sub(r"(?<=\d)‚(?=\d)", ",", text)
    # Plain string replacement: no regex is needed for a fixed two-character
    # sequence, and it avoids backslash handling in a replacement template.
    text = text.replace("\\‚", "\\,")
    return text

def _get_lc_value(self, line: str) -> tuple[str, list[str]]:
"""Get the language code "lc" and a list of normalized values"""
lc, line = line.split(":", 1)
Expand Down Expand Up @@ -306,7 +313,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
# remove "stopwords:" part
line = line[10:]
# compute raw values outside _get_lc_value as it normalizes them!
tags = [words.strip() for words in line[3:].split(",")]
tags = [self.undo_normalize_text(words.strip()) for words in line[3:].split(",")]
try:
lc, value = self._get_lc_value(line)
except ValueError:
Expand All @@ -326,7 +333,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
# remove "synonyms:" part
line = line[9:]
# compute raw values outside _get_lc_value as it normalizes them!
tags = [words.strip() for words in line[3:].split(",")]
tags = [self.undo_normalize_text(words.strip()) for words in line[3:].split(",")]
try:
lc, value = self._get_lc_value(line)
except ValueError:
Expand All @@ -352,7 +359,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
tags_list = []
tagsids_list = []
for word in line.split(","):
tags_list.append(word.strip())
tags_list.append(self.undo_normalize_text(word.strip()))
word_normalized = normalize_text(word, lang, stopwords=self.stopwords)
if word_normalized not in tagsids_list:
# in case 2 normalized synonyms are the same
Expand Down Expand Up @@ -383,7 +390,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
)
if property_name:
prop_key = "prop_" + property_name + "_" + lc
data.properties[prop_key] = property_value
data.properties[prop_key] = self.undo_normalize_text(property_value)
data = self._get_node_data_with_comments_above_key(
data, line_number, prop_key
)
Expand Down

0 comments on commit 61ca542

Please sign in to comment.