From 9e4c39c30e6dd56ba47cff66686e3eaba814f7a4 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 7 May 2024 06:41:30 +0000 Subject: [PATCH 1/3] Reencode all XML files Files from SE dumps are now encoded in UTF-16-LE, and we need them in UTF-8. Let's simply reencode them for now. --- src/sotoki/archives.py | 7 ++++++- src/sotoki/constants.py | 1 + src/sotoki/utils/preparation.py | 23 ++++++++++++++++++++++- 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/sotoki/archives.py b/src/sotoki/archives.py index f2bde42..b5c02f9 100644 --- a/src/sotoki/archives.py +++ b/src/sotoki/archives.py @@ -15,6 +15,7 @@ from .utils.preparation import ( merge_users_with_badges, merge_posts_with_answers_comments, + reencode_file, ) @@ -89,9 +90,13 @@ def _run(url, fpath): # remove other files from ark that we won't need for fp in self.build_dir.iterdir(): - if fp.suffix == ".xml" and fp.stem not in self.dump_parts: + if fp.suffix != ".xml" or fp.stem not in self.dump_parts: fp.unlink() + # reencode xml files + for fp in self.build_dir.iterdir(): + reencode_file(fp) + futures = {} executor = cf.ThreadPoolExecutor(max_workers=len(self.archives)) diff --git a/src/sotoki/constants.py b/src/sotoki/constants.py index e363383..d462f0a 100644 --- a/src/sotoki/constants.py +++ b/src/sotoki/constants.py @@ -21,6 +21,7 @@ VERSION = fh.read().strip() UTF8 = "utf-8" +UTF16LE = "utf-16-le" SCRAPER = f"{NAME} {VERSION}" USER_AGENT = ( f"{NAME}/{VERSION} (https://github.com/openzim/sotoki; " diff --git a/src/sotoki/utils/preparation.py b/src/sotoki/utils/preparation.py index 953f88c..9062aa4 100644 --- a/src/sotoki/utils/preparation.py +++ b/src/sotoki/utils/preparation.py @@ -16,11 +16,32 @@ from .shared import logger from .misc import has_binary, get_available_memory -from ..constants import UTF8 +from ..constants import UTF8, UTF16LE has_gnusort = has_binary("sort") +def reencode_file(src: pathlib.Path): + """Reencode a file from dump format (UTF-16-LE as of March 2024) to 
expected format (UTF8)

    This is based on a streaming on-the-fly reencoding of file chunks to limit memory pressure.

    Content is read line-by-line to ensure it is not split in the middle of a grapheme cluster.

    During reencoding, there will be two versions of the same content on the filesystem, one in
    the previous encoding and one in the target encoding; the filesystem needs enough space for
    both.
    """
    tmp = src.with_suffix(src.suffix + ".tmp")
    with open(src, "r", encoding=UTF16LE) as sourceFile:
        with open(tmp, "w", encoding=UTF8) as targetFile:
            while True:
                contents = sourceFile.readline()
                if not contents:
                    break
                targetFile.write(contents)
    src.unlink()
    tmp.rename(src)
if self.conf.without_unanswered and not item["nb_answers"]: self.release()
write user line to dest; removing tag end and CRLF + # write user line to dest; removing tag open () and LF dsth.write(b"