Skip to content

Commit

Permalink
Merge pull request #307 from openzim/xml_changes
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed May 7, 2024
2 parents 45c4fed + baef340 commit 2b804e8
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 11 deletions.
7 changes: 6 additions & 1 deletion src/sotoki/archives.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from .utils.preparation import (
merge_users_with_badges,
merge_posts_with_answers_comments,
reencode_file,
)


Expand Down Expand Up @@ -89,9 +90,13 @@ def _run(url, fpath):

# remove other files from ark that we won't need
for fp in self.build_dir.iterdir():
if fp.suffix == ".xml" and fp.stem not in self.dump_parts:
if fp.suffix != ".xml" or fp.stem not in self.dump_parts:
fp.unlink()

# reencode xml files
for fp in self.build_dir.iterdir():
reencode_file(fp)

futures = {}
executor = cf.ThreadPoolExecutor(max_workers=len(self.archives))

Expand Down
1 change: 1 addition & 0 deletions src/sotoki/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
VERSION = fh.read().strip()

UTF8 = "utf-8"
UTF16LE = "utf-16-le"
SCRAPER = f"{NAME} {VERSION}"
USER_AGENT = (
f"{NAME}/{VERSION} (https://github.com/openzim/sotoki; "
Expand Down
13 changes: 13 additions & 0 deletions src/sotoki/posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ def _user_to_set(aset, field):

# opening answers of a question
if name == "answer": # a answer
# ignore deleted answers
if "DeletionDate" in attrs:
return
_user_to_set(self.post["users_ids"], "OwnerUserId")
_user_to_set(self.post["users_ids"], "LastEditorUserId")
self.post["nb_answers"] += 1
Expand Down Expand Up @@ -91,6 +94,10 @@ def run(self):
)

def processor(self, item):
# ignore deleted posts
if "DeletionDate" in item:
self.release()
return
# skip post without answers ; maybe?
if self.conf.without_unanswered and not item["nb_answers"]:
self.release()
Expand Down Expand Up @@ -165,6 +172,8 @@ def startElement(self, name, attrs):

# an answer
if name == "answer":
if "DeletionDate" in attrs:
return
self.answers.append(dict(attrs.items()))
return

Expand Down Expand Up @@ -241,6 +250,10 @@ def processor(self, item):
if self.conf.without_unanswered and not post["answers"]:
self.release()
return
# ignore deleted posts
if "DeletionDate" in item:
self.release()
return
harmonize_post(post)

path = f'questions/{post["Id"]}/{get_slug_for(post["Title"])}'
Expand Down
41 changes: 31 additions & 10 deletions src/sotoki/utils/preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,32 @@

from .shared import logger
from .misc import has_binary, get_available_memory
from ..constants import UTF8
from ..constants import UTF8, UTF16LE

has_gnusort = has_binary("sort")


def reencode_file(
    src: pathlib.Path,
    src_encoding: str = "utf-16-le",
    dst_encoding: str = "utf-8",
):
    """Reencode a file in-place from dump encoding to the expected one.

    Dumps ship as UTF-16-LE (as of March 2024) while the pipeline expects UTF-8;
    both defaults can be overridden via `src_encoding` / `dst_encoding`.

    This streams the file line-by-line so memory pressure stays low and no
    multi-byte sequence is ever split across a chunk boundary. During
    reencoding, two copies of the content exist on the filesystem (source
    encoding + target encoding); the filesystem needs enough space for that.
    """
    tmp = src.with_suffix(src.suffix + ".tmp")
    with open(src, "r", encoding=src_encoding) as src_fh:
        with open(tmp, "w", encoding=dst_encoding) as dst_fh:
            # text-mode iteration decodes/encodes on the fly, one line at a time
            for line in src_fh:
                dst_fh.write(line)
    # atomic swap: unlike unlink()+rename(), replace() leaves no window in
    # which `src` is missing if the process dies mid-operation
    tmp.replace(src)

def get_within_chars(nb_chars_glue: int, nb_ids: int) -> int:
"""nb of chars to combine `nb_ids`'s values with `nb_chars_glue`
Expand Down Expand Up @@ -212,8 +233,8 @@ def read_sub():
for main_line in mainfh:
main_id = get_id_in(main_line, field_index_in_main)

# write main line to dest; removing tag end (/> -> >) and CRLF
dsth.write(main_line[:-4])
# write main line to dest; removing tag end (/>) and LF
dsth.write(main_line[:-3])
dsth.write(b">")

# fetch subs matching this ID (IDs are sorted so it's continuous)
Expand All @@ -226,9 +247,9 @@ def read_sub():
has_subs = True

dsth.write(node_start)
# write the sub line removing the 2 heading spaces, node name (<row)
# removing trailing CRLF as well. node already self closed in source
dsth.write(current_sub[1][6:-2])
# write the sub line removing node name (<row) and trailing LF as well. node already
# self closed in source
dsth.write(current_sub[1][4:-1])
current_sub = read_sub()

if has_subs:
Expand Down Expand Up @@ -313,9 +334,9 @@ def split_posts_by_posttypeid(
except IndexError:
break
try:
# rewrite with new name replacing ` <row` and `row>`
# rewrite with new name replacing `<row` and `row>LF`
fhs[found_id].write(starts[found_id])
fhs[found_id].write(line[6:-5])
fhs[found_id].write(line[4:-5])
fhs[found_id].write(ends[found_id])
except KeyError:
continue
Expand Down Expand Up @@ -382,9 +403,9 @@ def read_csv():
break

if current_csv[0] == post_id:
# write user line to dest; removing tag end and CRLF
# write user line to dest; removing tag open (<row), tag end (/>) and LF
dsth.write(b"<link")
dsth.write(line[6:-4])
dsth.write(line[4:-3])
# CSV title already includes appropriate quoting
dsth.write(b" PostName=")
dsth.write(current_csv[1])
Expand Down

0 comments on commit 2b804e8

Please sign in to comment.