From 9e4c39c30e6dd56ba47cff66686e3eaba814f7a4 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 7 May 2024 06:41:30 +0000 Subject: [PATCH 1/3] Reencode all XML files Files from SE dumps are now encoded in UTF-16-LE, and we need them in UTF-8. Let's simply reencode them for now. --- src/sotoki/archives.py | 7 ++++++- src/sotoki/constants.py | 1 + src/sotoki/utils/preparation.py | 23 ++++++++++++++++++++++- 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/sotoki/archives.py b/src/sotoki/archives.py index f2bde42..b5c02f9 100644 --- a/src/sotoki/archives.py +++ b/src/sotoki/archives.py @@ -15,6 +15,7 @@ from .utils.preparation import ( merge_users_with_badges, merge_posts_with_answers_comments, + reencode_file, ) @@ -89,9 +90,13 @@ def _run(url, fpath): # remove other files from ark that we won't need for fp in self.build_dir.iterdir(): - if fp.suffix == ".xml" and fp.stem not in self.dump_parts: + if fp.suffix != ".xml" or fp.stem not in self.dump_parts: fp.unlink() + # reencode xml files + for fp in self.build_dir.iterdir(): + reencode_file(fp) + futures = {} executor = cf.ThreadPoolExecutor(max_workers=len(self.archives)) diff --git a/src/sotoki/constants.py b/src/sotoki/constants.py index e363383..d462f0a 100644 --- a/src/sotoki/constants.py +++ b/src/sotoki/constants.py @@ -21,6 +21,7 @@ VERSION = fh.read().strip() UTF8 = "utf-8" +UTF16LE = "utf-16-le" SCRAPER = f"{NAME} {VERSION}" USER_AGENT = ( f"{NAME}/{VERSION} (https://github.com/openzim/sotoki; " diff --git a/src/sotoki/utils/preparation.py b/src/sotoki/utils/preparation.py index 953f88c..9062aa4 100644 --- a/src/sotoki/utils/preparation.py +++ b/src/sotoki/utils/preparation.py @@ -16,11 +16,32 @@ from .shared import logger from .misc import has_binary, get_available_memory -from ..constants import UTF8 +from ..constants import UTF8, UTF16LE has_gnusort = has_binary("sort") +def reencode_file(src: pathlib.Path): + """Reencode a file from dump format (UTF-16-LE as of March 2024) to 
expected format (UTF8)

    This is based on a streaming on-the-fly reencoding of file chunks to limit memory pressure.

    Content is read line-by-line to ensure it is not split in the middle of a grapheme cluster.

    During reencoding, there will be two versions of the same content on the filesystem, one in
    the previous encoding and one in the target encoding; the filesystem needs enough space for
    both.
    """
    tmp = src.with_suffix(src.suffix + ".tmp")
    with open(src, "r", encoding=UTF16LE) as sourceFile:
        with open(tmp, "w", encoding=UTF8) as targetFile:
            while True:
                contents = sourceFile.readline()
                if not contents:
                    break
                targetFile.write(contents)
    src.unlink()
    tmp.rename(src)
if self.conf.without_unanswered and not item["nb_answers"]: self.release()
write user line to dest; removing tag end and CRLF + # write user line to dest; removing tag open () and LF dsth.write(b"