From c182ad7dad1e286d076e48c6477915abe1ce70c6 Mon Sep 17 00:00:00 2001
From: AG
Date: Mon, 4 Mar 2024 15:37:37 -0800
Subject: [PATCH] Removing characters

---
Review notes (text between the "---" line above and the diffstat is ignored by `git am`):
- Fixed the new ending_punctuation expression. Python sets do not implement "+",
  so "set - set + set" raises TypeError at runtime. The curly quotes are now merged
  in with the set-union operator "|" (parenthesised to make the precedence explicit).
- NOTE(review): this copy of the patch arrived collapsed onto one line, so leading
  whitespace was reconstructed. Confirm the indentation of the main() hunk and any
  whitespace-only changes (e.g. the 8.5MB recipe line) against train.py before applying.

 train.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/train.py b/train.py
index 4d01b30..acba935 100644
--- a/train.py
+++ b/train.py
@@ -13,13 +13,13 @@
 # RECIPES #
 ###########
 # All with chunk size of 1024
-# ?.?MB: Target dictionary count 100,000, Prune 10,000,000
-# 8.5MB: Target dictionary count 25,000, Prune 10,000,000
-# 3.6MB: Target dictionary count 10,000, Prune 10,000,000
+# ?.?MB: Target dictionary count 100,000, Prune 1,000,000
+# 8.5MB: Target dictionary count 25,000, Prune 10,000,000
+# 5.4MB: Target dictionary count 10,000, Prune 10,000,000
 
-PRUNE_FREQUENCY = 10 * 1000 * 1000 # Every this many document positions
+PRUNE_FREQUENCY = 1000 * 1000 # Every this many document positions
 CHUNK_SIZE = 1024 # 1KB per chunk
-TARGET_DICTIONARY_COUNT = 10 * 1000 * 1000
+TARGET_DICTIONARY_COUNT = 100 * 1000
 
 # Define a flag to indicate when an interrupt has been caught
 interrupted = False
@@ -191,8 +191,9 @@ def main():
             word = words[j]
             # Define a set of punctuation that is allowed within a word
             internal_punctuation = {"'", "-"}
+            additional_punctuation = {"“", "”"}
             # Create a set of punctuation that signals the end of a word, excluding the internal punctuation
-            ending_punctuation = set(string.punctuation) - internal_punctuation
+            ending_punctuation = (set(string.punctuation) - internal_punctuation) | additional_punctuation
             # Check for and remove ending punctuation from the word
             cleaned_word = ''.join(char for char in word if char not in ending_punctuation)
 