Skip to content

Commit

Permalink
Merge pull request #307 from openzim/xml_changes
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed May 7, 2024
2 parents 45c4fed + baef340 commit 2b804e8
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 11 deletions.
7 changes: 6 additions & 1 deletion src/sotoki/archives.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from .utils.preparation import (
merge_users_with_badges,
merge_posts_with_answers_comments,
reencode_file,
)


Expand Down Expand Up @@ -89,9 +90,13 @@ def _run(url, fpath):

# remove other files from ark that we won't need
for fp in self.build_dir.iterdir():
if fp.suffix == ".xml" and fp.stem not in self.dump_parts:
if fp.suffix != ".xml" or fp.stem not in self.dump_parts:
fp.unlink()

# reencode xml files
for fp in self.build_dir.iterdir():
reencode_file(fp)

futures = {}
executor = cf.ThreadPoolExecutor(max_workers=len(self.archives))

Expand Down
1 change: 1 addition & 0 deletions src/sotoki/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
VERSION = fh.read().strip()

UTF8 = "utf-8"
UTF16LE = "utf-16-le"
SCRAPER = f"{NAME} {VERSION}"
USER_AGENT = (
f"{NAME}/{VERSION} (https://github.com/openzim/sotoki; "
Expand Down
13 changes: 13 additions & 0 deletions src/sotoki/posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ def _user_to_set(aset, field):

# opening answers of a question
if name == "answer": # a answer
# ignore deleted answers
if "DeletionDate" in attrs:
return
_user_to_set(self.post["users_ids"], "OwnerUserId")
_user_to_set(self.post["users_ids"], "LastEditorUserId")
self.post["nb_answers"] += 1
Expand Down Expand Up @@ -91,6 +94,10 @@ def run(self):
)

def processor(self, item):
# ignore deleted posts
if "DeletionDate" in item:
self.release()
return
# skip post without answers ; maybe?
if self.conf.without_unanswered and not item["nb_answers"]:
self.release()
Expand Down Expand Up @@ -165,6 +172,8 @@ def startElement(self, name, attrs):

# an answer
if name == "answer":
if "DeletionDate" in attrs:
return
self.answers.append(dict(attrs.items()))
return

Expand Down Expand Up @@ -241,6 +250,10 @@ def processor(self, item):
if self.conf.without_unanswered and not post["answers"]:
self.release()
return
# ignore deleted posts
if "DeletionDate" in item:
self.release()
return
harmonize_post(post)

path = f'questions/{post["Id"]}/{get_slug_for(post["Title"])}'
Expand Down
41 changes: 31 additions & 10 deletions src/sotoki/utils/preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,32 @@

from .shared import logger
from .misc import has_binary, get_available_memory
from ..constants import UTF8
from ..constants import UTF8, UTF16LE

has_gnusort = has_binary("sort")


def reencode_file(
    src: pathlib.Path,
    src_encoding: str = "utf-16-le",
    dst_encoding: str = "utf-8",
):
    """Reencode a file in-place from dump encoding to the expected one.

    Dumps ship as UTF-16-LE (as of March 2024) while the pipeline expects UTF-8;
    both defaults can be overridden via `src_encoding` / `dst_encoding`.

    This streams the file line-by-line so memory pressure stays low and no
    multi-byte sequence is ever split across a chunk boundary. During
    reencoding, two copies of the content exist on the filesystem (source
    encoding + target encoding); the filesystem needs enough space for that.
    """
    tmp = src.with_suffix(src.suffix + ".tmp")
    with open(src, "r", encoding=src_encoding) as src_fh:
        with open(tmp, "w", encoding=dst_encoding) as dst_fh:
            # text-mode iteration decodes/encodes on the fly, one line at a time
            for line in src_fh:
                dst_fh.write(line)
    # atomic swap: unlike unlink()+rename(), replace() leaves no window in
    # which `src` is missing if the process dies mid-operation
    tmp.replace(src)

def get_within_chars(nb_chars_glue: int, nb_ids: int) -> int:
"""nb of chars to combine `nb_ids`'s values with `nb_chars_glue`
Expand Down Expand Up @@ -212,8 +233,8 @@ def read_sub():
for main_line in mainfh:
main_id = get_id_in(main_line, field_index_in_main)

# write main line to dest; removing tag end (/> -> >) and CRLF
dsth.write(main_line[:-4])
# write main line to dest; removing tag end (/>) and LF
dsth.write(main_line[:-3])
dsth.write(b">")

# fetch subs matching this ID (IDs are sorted so it's continuous)
Expand All @@ -226,9 +247,9 @@ def read_sub():
has_subs = True

dsth.write(node_start)
# write the sub line removing the 2 heading spaces, node name (<row)
# removing trailing CRLF as well. node already self closed in source
dsth.write(current_sub[1][6:-2])
# write the sub line removing node name (<row) and trailing LF as well. node already
# self closed in source
dsth.write(current_sub[1][4:-1])
current_sub = read_sub()

if has_subs:
Expand Down Expand Up @@ -313,9 +334,9 @@ def split_posts_by_posttypeid(
except IndexError:
break
try:
# rewrite with new name replacing ` <row` and `row>`
# rewrite with new name replacing `<row` and `row>LF`
fhs[found_id].write(starts[found_id])
fhs[found_id].write(line[6:-5])
fhs[found_id].write(line[4:-5])
fhs[found_id].write(ends[found_id])
except KeyError:
continue
Expand Down Expand Up @@ -382,9 +403,9 @@ def read_csv():
break

if current_csv[0] == post_id:
# write user line to dest; removing tag end and CRLF
# write user line to dest; removing tag open (<row), tag end (/>) and LF
dsth.write(b"<link")
dsth.write(line[6:-4])
dsth.write(line[4:-3])
# CSV title already includes appropriate quoting
dsth.write(b" PostName=")
dsth.write(current_csv[1])
Expand Down

0 comments on commit 2b804e8

Please sign in to comment.