diff --git a/src/sotoki/archives.py b/src/sotoki/archives.py index f2bde42..b5c02f9 100644 --- a/src/sotoki/archives.py +++ b/src/sotoki/archives.py @@ -15,6 +15,7 @@ from .utils.preparation import ( merge_users_with_badges, merge_posts_with_answers_comments, + reencode_file, ) @@ -89,9 +90,13 @@ def _run(url, fpath): # remove other files from ark that we won't need for fp in self.build_dir.iterdir(): - if fp.suffix == ".xml" and fp.stem not in self.dump_parts: + if fp.suffix != ".xml" or fp.stem not in self.dump_parts: fp.unlink() + # reencode xml files + for fp in self.build_dir.iterdir(): + reencode_file(fp) + futures = {} executor = cf.ThreadPoolExecutor(max_workers=len(self.archives)) diff --git a/src/sotoki/constants.py b/src/sotoki/constants.py index e363383..d462f0a 100644 --- a/src/sotoki/constants.py +++ b/src/sotoki/constants.py @@ -21,6 +21,7 @@ VERSION = fh.read().strip() UTF8 = "utf-8" +UTF16LE = "utf-16-le" SCRAPER = f"{NAME} {VERSION}" USER_AGENT = ( f"{NAME}/{VERSION} (https://github.com/openzim/sotoki; " diff --git a/src/sotoki/posts.py b/src/sotoki/posts.py index 6b5e69e..8d80210 100644 --- a/src/sotoki/posts.py +++ b/src/sotoki/posts.py @@ -55,6 +55,9 @@ def _user_to_set(aset, field): # opening answers of a question if name == "answer": # a answer + # ignore deleted answers + if "DeletionDate" in attrs: + return _user_to_set(self.post["users_ids"], "OwnerUserId") _user_to_set(self.post["users_ids"], "LastEditorUserId") self.post["nb_answers"] += 1 @@ -91,6 +94,10 @@ def run(self): ) def processor(self, item): + # ignore deleted posts + if "DeletionDate" in item: + self.release() + return # skip post without answers ; maybe? if self.conf.without_unanswered and not item["nb_answers"]: self.release() @@ -165,6 +172,8 @@ def startElement(self, name, attrs): # an answer if name == "answer": + if "DeletionDate" in attrs: + return self.answers.append(dict(attrs.items())) return @@ -241,6 +250,10 @@ def processor(self, item): if self.conf.without_unanswered and not post["answers"]: self.release() return + # ignore deleted posts + if "DeletionDate" in item: + self.release() + return harmonize_post(post) path = f'questions/{post["Id"]}/{get_slug_for(post["Title"])}' diff --git a/src/sotoki/utils/preparation.py b/src/sotoki/utils/preparation.py index 953f88c..3c46bc9 100644 --- a/src/sotoki/utils/preparation.py +++ b/src/sotoki/utils/preparation.py @@ -16,11 +16,32 @@ from .shared import logger from .misc import has_binary, get_available_memory -from ..constants import UTF8 +from ..constants import UTF8, UTF16LE has_gnusort = has_binary("sort") +def reencode_file(src: pathlib.Path): + """Reencode a file from dump format (UTF-16-LE as of March 2024) to expected format (UTF8) + + This is based on a streaming on-the-fly reencoding of file chunks to limit memory pressure. + + Content is read line-by-line to ensure it is not split in the middle of a grapheme cluster. + + During reencoding, there will be two versions of the same content on the filesystem, one in + previous encoding and one in target encoding, filesystem needs enough space for that. + """ + tmp = src.with_suffix(src.suffix + ".tmp") + with open(src, "r", encoding=UTF16LE) as sourceFile: + with open(tmp, "w", encoding=UTF8) as targetFile: + while True: + contents = sourceFile.readline() + if not contents: + break + targetFile.write(contents) + src.unlink() + tmp.rename(src) + def get_within_chars(nb_chars_glue: int, nb_ids: int) -> int: """nb of chars to combine `nb_ids`'s values with `nb_chars_glue` @@ -212,8 +233,8 @@ def read_sub(): for main_line in mainfh: main_id = get_id_in(main_line, field_index_in_main) - # write main line to dest; removing tag end (/> -> >) and CRLF - dsth.write(main_line[:-4]) + # write main line to dest; removing tag end (/>) and LF + dsth.write(main_line[:-3]) dsth.write(b">") # fetch subs matching this ID (IDs are sorted so it's continuous) @@ -226,9 +247,9 @@ def read_sub(): has_subs = True dsth.write(node_start) - # write the sub line removing the 2 heading spaces, node name (` + # rewrite with new name replacing `LF` fhs[found_id].write(starts[found_id]) - fhs[found_id].write(line[6:-5]) + fhs[found_id].write(line[4:-5]) fhs[found_id].write(ends[found_id]) except KeyError: continue @@ -382,9 +403,9 @@ def read_csv(): break if current_csv[0] == post_id: - # write user line to dest; removing tag end and CRLF + # write user line to dest; removing tag open () and LF dsth.write(b"