From 393e79f1779049461deca31fa193b87eb4809e9b Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Mon, 31 Jul 2023 11:50:21 +0200 Subject: [PATCH] Enhancing Morphological Analysis with spaCy Pretraining (#188) * init * add commands to project yml * add language variable * Add more configs * add german language * add nl lang * start evaluation script * Finish evaluation script * code adjustments * edit eval script * Adjust description and requirements * Add install requirements command * add working_env ignore * Adjustments * Fix description * Update readme * Adjust benchmark readme * Add static vector training workflow * set gpu to -1 * Update with model-last.bin for spacy v3.5.2+ * Add pretraining workflow to tests * Update README --------- Co-authored-by: Adriane Boyd --- benchmarks/README.md | 1 + .../.gitignore | 7 + .../pretraining_morphologizer_oscar/README.md | 69 +++++ .../configs/config.cfg | 127 +++++++++ .../configs/config_pretrain_char.cfg | 165 +++++++++++ .../configs/config_pretrain_vector.cfg | 164 +++++++++++ .../configs/config_static.cfg | 126 +++++++++ .../configs/config_trf.cfg | 164 +++++++++++ .../project.yml | 264 ++++++++++++++++++ .../requirements.txt | 4 + .../scripts/copy_files.py | 14 + .../scripts/evaluate_metrics.py | 188 +++++++++++++ .../scripts/get_latest_model_weight.py | 30 ++ .../scripts/get_oscar_dataset.py | 35 +++ .../scripts/reset.py | 22 ++ .../test_project_pretraining.py | 10 + 16 files changed, 1390 insertions(+) create mode 100644 benchmarks/pretraining_morphologizer_oscar/.gitignore create mode 100644 benchmarks/pretraining_morphologizer_oscar/README.md create mode 100644 benchmarks/pretraining_morphologizer_oscar/configs/config.cfg create mode 100644 benchmarks/pretraining_morphologizer_oscar/configs/config_pretrain_char.cfg create mode 100644 benchmarks/pretraining_morphologizer_oscar/configs/config_pretrain_vector.cfg create mode 100644 benchmarks/pretraining_morphologizer_oscar/configs/config_static.cfg create mode 100644 benchmarks/pretraining_morphologizer_oscar/configs/config_trf.cfg create mode 100644 benchmarks/pretraining_morphologizer_oscar/project.yml create mode 100644 benchmarks/pretraining_morphologizer_oscar/requirements.txt create mode 100644 benchmarks/pretraining_morphologizer_oscar/scripts/copy_files.py create mode 100644 benchmarks/pretraining_morphologizer_oscar/scripts/evaluate_metrics.py create mode 100644 benchmarks/pretraining_morphologizer_oscar/scripts/get_latest_model_weight.py create mode 100644 benchmarks/pretraining_morphologizer_oscar/scripts/get_oscar_dataset.py create mode 100644 benchmarks/pretraining_morphologizer_oscar/scripts/reset.py create mode 100644 benchmarks/pretraining_morphologizer_oscar/test_project_pretraining.py diff --git a/benchmarks/README.md b/benchmarks/README.md index 404ddac32..9dee4ebff 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -9,6 +9,7 @@ | [`ner_conll03`](ner_conll03) | Named Entity Recognition (CoNLL-2003) | | [`ner_embeddings`](ner_embeddings) | Comparing embedding layers in spaCy | | [`parsing_penn_treebank`](parsing_penn_treebank) | Dependency Parsing (Penn Treebank) | +| [`pretraining_morphologizer_oscar`](pretraining_morphologizer_oscar) | Pretraining Morphologizer | | [`span-labeling-datasets`](span-labeling-datasets) | Span labeling datasets | | [`speed`](speed) | Project for speed benchmarking of various pretrained models of different NLP libraries. 
| | [`textcat_architectures`](textcat_architectures) | Textcat performance benchmarks | diff --git a/benchmarks/pretraining_morphologizer_oscar/.gitignore b/benchmarks/pretraining_morphologizer_oscar/.gitignore new file mode 100644 index 000000000..bebbaccf5 --- /dev/null +++ b/benchmarks/pretraining_morphologizer_oscar/.gitignore @@ -0,0 +1,7 @@ +assets +corpus +data +training +pretraining +metrics +working_env \ No newline at end of file diff --git a/benchmarks/pretraining_morphologizer_oscar/README.md b/benchmarks/pretraining_morphologizer_oscar/README.md new file mode 100644 index 000000000..bd01348c3 --- /dev/null +++ b/benchmarks/pretraining_morphologizer_oscar/README.md @@ -0,0 +1,69 @@ + + +# 🪐 spaCy Project: Enhancing Morphological Analysis with spaCy Pretraining + +This project explores the effectiveness of pretraining techniques on morphological analysis (morphologizer) by conducting experiments on multiple languages. The objective of this project is to demonstrate the benefits of pretraining word vectors using domain-specific data on the performance of the morphological analysis. We leverage the OSCAR dataset to pretrain our vectors for tok2vec and utilize the UD_Treebanks dataset to train a morphologizer component. We evaluate and compare the performance of different pretraining techniques and the performance of models without any pretraining. + +## 📋 project.yml + +The [`project.yml`](project.yml) defines the data assets required by the +project, as well as the available commands and workflows. For details, see the +[spaCy projects documentation](https://spacy.io/usage/projects). + +### ⏯ Commands + +The following commands are defined by the project. They +can be executed using [`spacy project run [name]`](https://spacy.io/api/cli#project-run). +Commands are only re-run if their inputs have changed. 
+
+| Command | Description |
+| --- | --- |
+| `install_requirements` | Download and install all requirements |
+| `download_oscar` | Download a subset of the oscar dataset |
+| `download_model` | Download the specified spaCy model for vector-objective pretraining |
+| `extract_ud` | Extract the ud-treebanks data |
+| `convert_ud` | Convert the ud-treebanks data to spaCy's format |
+| `train` | Train a morphologizer component without pretrained weights and static vectors |
+| `evaluate` | Evaluate the trained morphologizer component without pretrained weights and static vectors |
+| `train_static` | Train a morphologizer component with static vectors from a pretrained model |
+| `evaluate_static` | Evaluate the trained morphologizer component with static weights |
+| `pretrain_char` | Pretrain a tok2vec component with the character objective |
+| `train_char` | Train a morphologizer component with pretrained weights (character_objective) |
+| `evaluate_char` | Evaluate the trained morphologizer component with pretrained weights (character-objective) |
+| `pretrain_vector` | Pretrain a tok2vec component with the vector objective |
+| `train_vector` | Train a morphologizer component with pretrained weights (vector_objective) |
+| `evaluate_vector` | Evaluate the trained morphologizer component with pretrained weights (vector-objective) |
+| `train_trf` | Train a morphologizer component with transformer embeddings |
+| `evaluate_trf` | Evaluate the trained morphologizer component with transformer embeddings |
+| `evaluate_metrics` | Evaluate all experiments and create a summary json file |
+| `reset_project` | Reset the project to its original state and delete all training progress |
+| `reset_training` | Reset the training progress |
+| `reset_metrics` | Delete the metrics folder |
+
+### ⏭ Workflows
+
+The following workflows are defined by the project. They
+can be executed using [`spacy project run [name]`](https://spacy.io/api/cli#project-run)
+and will run the specified commands in order. Commands are only re-run if their
+inputs have changed.
+
+| Workflow | Steps |
+| --- | --- |
+| `data` | `download_oscar` → `download_model` → `extract_ud` → `convert_ud` |
+| `training` | `train` → `evaluate` |
+| `training_static` | `train_static` → `evaluate_static` |
+| `training_char` | `pretrain_char` → `train_char` → `evaluate_char` |
+| `training_vector` | `pretrain_vector` → `train_vector` → `evaluate_vector` |
+| `training_trf` | `train_trf` → `evaluate_trf` |
+
+### 🗂 Assets
+
+The following assets are defined by the project. They can
+be fetched by running [`spacy project assets`](https://spacy.io/api/cli#project-assets)
+in the project directory.
+ +| File | Source | Description | +| --- | --- | --- | +| `assets/ud-treebanks-v2.5.tgz` | URL | | + + \ No newline at end of file diff --git a/benchmarks/pretraining_morphologizer_oscar/configs/config.cfg b/benchmarks/pretraining_morphologizer_oscar/configs/config.cfg new file mode 100644 index 000000000..066407e13 --- /dev/null +++ b/benchmarks/pretraining_morphologizer_oscar/configs/config.cfg @@ -0,0 +1,127 @@ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null +log_file = null +raw_text = null + +[system] +gpu_allocator = "pytorch" +seed = 0 + +[nlp] +lang = "en" +pipeline = ["morphologizer"] +batch_size = 64 +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.morphologizer] +factory = "morphologizer" +overwrite = false +scorer = {"@scorers":"spacy.morphologizer_scorer.v1"} + +[components.morphologizer.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.morphologizer.model.tok2vec] +@architectures = "spacy.Tok2Vec.v2" + +[components.morphologizer.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v2" +width = ${components.morphologizer.model.tok2vec.encode.width} +attrs = ["ORTH", "SHAPE"] +rows = [5000, 2500] +include_static_vectors = false + +[components.morphologizer.model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v2" +width = 256 +depth = 8 +window_size = 1 +maxout_pieces = 3 + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[training] +train_corpus = "corpora.train" +dev_corpus = "corpora.dev" +seed = ${system:seed} +gpu_allocator = ${system:gpu_allocator} +dropout = 0.1 +accumulate_gradient = 3 +patience = 2500 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 250 +frozen_components = [] +before_to_disk = null +annotating_components = [] + +[training.batcher] +@batchers = "spacy.batch_by_padded.v1" +discard_oversize = true +get_length = null +size = 2000 +buffer = 256 + +[training.logger] +@loggers = "spacy.ConsoleLogger.v2" +progress_bar = true +output_file = ${paths.log_file} + + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 0.00000001 + +[training.optimizer.learn_rate] +@schedules = "warmup_linear.v1" +warmup_steps = 250 +total_steps = 20000 +initial_rate = 0.00005 + +[training.score_weights] + +[pretraining] + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] diff --git a/benchmarks/pretraining_morphologizer_oscar/configs/config_pretrain_char.cfg b/benchmarks/pretraining_morphologizer_oscar/configs/config_pretrain_char.cfg new file mode 100644 index 000000000..e2bcbcb83 --- /dev/null +++ b/benchmarks/pretraining_morphologizer_oscar/configs/config_pretrain_char.cfg @@ -0,0 +1,165 @@ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null +log_file = null +raw_text = null + +[system] +gpu_allocator = "pytorch" +seed = 0 + +[nlp] +lang = "en" +pipeline = ["morphologizer"] +batch_size = 64 +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation 
= null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.morphologizer] +factory = "morphologizer" +overwrite = false +scorer = {"@scorers":"spacy.morphologizer_scorer.v1"} + +[components.morphologizer.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.morphologizer.model.tok2vec] +@architectures = "spacy.Tok2Vec.v2" + +[components.morphologizer.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v2" +width = ${components.morphologizer.model.tok2vec.encode.width} +attrs = ["ORTH", "SHAPE"] +rows = [5000, 2500] +include_static_vectors = true + +[components.morphologizer.model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v2" +width = 256 +depth = 8 +window_size = 1 +maxout_pieces = 3 + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.pretrain] +@readers = "spacy.JsonlCorpus.v1" +path = ${paths.raw_text} +min_length = 5 +max_length = 500 +limit = 0 + +[training] +train_corpus = "corpora.train" +dev_corpus = "corpora.dev" +seed = ${system:seed} +gpu_allocator = ${system:gpu_allocator} +dropout = 0.1 +accumulate_gradient = 3 +patience = 2500 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 500 +frozen_components = [] +before_to_disk = null +annotating_components = [] + +[training.batcher] +@batchers = "spacy.batch_by_padded.v1" +discard_oversize = true +get_length = null +size = 2000 +buffer = 256 + +[training.logger] +@loggers = "spacy.ConsoleLogger.v2" +progress_bar = true +output_file = ${paths.log_file} + + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 0.00000001 + +[training.optimizer.learn_rate] +@schedules = "warmup_linear.v1" +warmup_steps = 250 +total_steps = 20000 +initial_rate = 0.00005 + +[training.score_weights] + +[pretraining] +max_epochs = 1000 +dropout = 0.2 +n_save_every = 0 +n_save_epoch = 1 +component = "morphologizer" +layer = "tok2vec" +corpus = "corpora.pretrain" + +[pretraining.batcher] +@batchers = "spacy.batch_by_words.v1" +size = 3000 +discard_oversize = false +tolerance = 0.2 +get_length = null + +[pretraining.objective] +@architectures = "spacy.PretrainCharacters.v1" +maxout_pieces = 3 +hidden_size = 300 +n_characters = 4 + +[pretraining.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 1e-8 +learn_rate = 0.001 + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] diff --git a/benchmarks/pretraining_morphologizer_oscar/configs/config_pretrain_vector.cfg b/benchmarks/pretraining_morphologizer_oscar/configs/config_pretrain_vector.cfg new file mode 100644 index 000000000..0dfc03e01 --- /dev/null +++ b/benchmarks/pretraining_morphologizer_oscar/configs/config_pretrain_vector.cfg @@ -0,0 +1,164 @@ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null +log_file = null +raw_text = null + +[system] +gpu_allocator = "pytorch" +seed = 0 + +[nlp] +lang = "en" +pipeline = ["morphologizer"] +batch_size = 64 +disabled = [] +before_creation = null +after_creation = null 
+after_pipeline_creation = null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.morphologizer] +factory = "morphologizer" +overwrite = false +scorer = {"@scorers":"spacy.morphologizer_scorer.v1"} + +[components.morphologizer.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.morphologizer.model.tok2vec] +@architectures = "spacy.Tok2Vec.v2" + +[components.morphologizer.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v2" +width = ${components.morphologizer.model.tok2vec.encode.width} +attrs = ["ORTH", "SHAPE"] +rows = [5000, 2500] +include_static_vectors = true + +[components.morphologizer.model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v2" +width = 256 +depth = 8 +window_size = 1 +maxout_pieces = 3 + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.pretrain] +@readers = "spacy.JsonlCorpus.v1" +path = ${paths.raw_text} +min_length = 5 +max_length = 500 +limit = 0 + +[training] +train_corpus = "corpora.train" +dev_corpus = "corpora.dev" +seed = ${system:seed} +gpu_allocator = ${system:gpu_allocator} +dropout = 0.1 +accumulate_gradient = 3 +patience = 2500 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 500 +frozen_components = [] +before_to_disk = null +annotating_components = [] + +[training.batcher] +@batchers = "spacy.batch_by_padded.v1" +discard_oversize = true +get_length = null +size = 2000 +buffer = 256 + +[training.logger] +@loggers = "spacy.ConsoleLogger.v2" +progress_bar = true +output_file = ${paths.log_file} + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 0.00000001 + +[training.optimizer.learn_rate] +@schedules = "warmup_linear.v1" +warmup_steps = 250 +total_steps = 20000 +initial_rate = 0.00005 + +[training.score_weights] + +[pretraining] +max_epochs = 1000 +dropout = 0.2 +n_save_every = 0 +n_save_epoch = 1 +component = "morphologizer" +layer = "tok2vec" +corpus = "corpora.pretrain" + +[pretraining.batcher] +@batchers = "spacy.batch_by_words.v1" +size = 3000 +discard_oversize = false +tolerance = 0.2 +get_length = null + +[pretraining.objective] +@architectures = "spacy.PretrainVectors.v1" +maxout_pieces = 3 +hidden_size = 300 +loss = "cosine" + +[pretraining.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 1e-8 +learn_rate = 0.001 + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] diff --git a/benchmarks/pretraining_morphologizer_oscar/configs/config_static.cfg b/benchmarks/pretraining_morphologizer_oscar/configs/config_static.cfg new file mode 100644 index 000000000..267cbce61 --- /dev/null +++ b/benchmarks/pretraining_morphologizer_oscar/configs/config_static.cfg @@ -0,0 +1,126 @@ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null +log_file = null +raw_text = null + +[system] +gpu_allocator = "pytorch" +seed = 0 + +[nlp] +lang = "en" +pipeline = ["morphologizer"] +batch_size = 64 +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation 
= null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.morphologizer] +factory = "morphologizer" +overwrite = false +scorer = {"@scorers":"spacy.morphologizer_scorer.v1"} + +[components.morphologizer.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.morphologizer.model.tok2vec] +@architectures = "spacy.Tok2Vec.v2" + +[components.morphologizer.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v2" +width = ${components.morphologizer.model.tok2vec.encode.width} +attrs = ["ORTH", "SHAPE"] +rows = [5000, 2500] +include_static_vectors = true + +[components.morphologizer.model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v2" +width = 256 +depth = 8 +window_size = 1 +maxout_pieces = 3 + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[training] +train_corpus = "corpora.train" +dev_corpus = "corpora.dev" +seed = ${system:seed} +gpu_allocator = ${system:gpu_allocator} +dropout = 0.1 +accumulate_gradient = 3 +patience = 2500 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 500 +frozen_components = [] +before_to_disk = null +annotating_components = [] + +[training.batcher] +@batchers = "spacy.batch_by_padded.v1" +discard_oversize = true +get_length = null +size = 2000 +buffer = 256 + +[training.logger] +@loggers = "spacy.ConsoleLogger.v2" +progress_bar = true +output_file = ${paths.log_file} + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 0.00000001 + +[training.optimizer.learn_rate] +@schedules = "warmup_linear.v1" +warmup_steps = 250 +total_steps = 20000 +initial_rate = 0.00005 + +[training.score_weights] + +[pretraining] + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] diff --git a/benchmarks/pretraining_morphologizer_oscar/configs/config_trf.cfg b/benchmarks/pretraining_morphologizer_oscar/configs/config_trf.cfg new file mode 100644 index 000000000..38c8841fc --- /dev/null +++ b/benchmarks/pretraining_morphologizer_oscar/configs/config_trf.cfg @@ -0,0 +1,164 @@ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null +log_file = null +raw_text = null + +[system] +gpu_allocator = "pytorch" +seed = 0 + +[nlp] +lang = "en" +pipeline = ["morphologizer"] +batch_size = 64 +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.morphologizer] +factory = "morphologizer" +overwrite = false +scorer = {"@scorers":"spacy.morphologizer_scorer.v1"} + +[components.morphologizer.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.morphologizer.model.tok2vec] +@architectures = "spacy-transformers.Tok2VecTransformer.v3" +name = "roberta-base" +tokenizer_config = {"use_fast": false} +transformer_config = {} +grad_factor = 1.0 +mixed_precision = false +grad_scaler_config = {"init_scale": 32768} + +[components.morphologizer.model.tok2vec.pooling] +@layers = "reduce_mean.v1" + +[components.morphologizer.model.tok2vec.get_spans] +@span_getters = "spacy-transformers.strided_spans.v1" 
+window = 128 +stride = 96 + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.pretrain] +@readers = "spacy.JsonlCorpus.v1" +path = ${paths.raw_text} +min_length = 5 +max_length = 500 +limit = 0 + +[training] +train_corpus = "corpora.train" +dev_corpus = "corpora.dev" +seed = ${system:seed} +gpu_allocator = ${system:gpu_allocator} +dropout = 0.1 +accumulate_gradient = 3 +patience = 2500 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 500 +frozen_components = [] +before_to_disk = null +annotating_components = [] + +[training.batcher] +@batchers = "spacy.batch_by_padded.v1" +discard_oversize = true +get_length = null +size = 2000 +buffer = 256 + +[training.logger] +@loggers = "spacy.ConsoleLogger.v2" +progress_bar = true +output_file = ${paths.log_file} + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 0.00000001 + +[training.optimizer.learn_rate] +@schedules = "warmup_linear.v1" +warmup_steps = 250 +total_steps = 20000 +initial_rate = 0.00005 + +[training.score_weights] + +[pretraining] +max_epochs = 1000 +dropout = 0.2 +n_save_every = 0 +n_save_epoch = 1 +component = "morphologizer" +layer = "tok2vec" +corpus = "corpora.pretrain" + +[pretraining.batcher] +@batchers = "spacy.batch_by_words.v1" +size = 3000 +discard_oversize = false +tolerance = 0.2 +get_length = null + +[pretraining.objective] +@architectures = "spacy.PretrainCharacters.v1" +maxout_pieces = 3 +hidden_size = 300 +n_characters = 4 + +[pretraining.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 1e-8 +learn_rate = 0.001 + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] diff --git a/benchmarks/pretraining_morphologizer_oscar/project.yml b/benchmarks/pretraining_morphologizer_oscar/project.yml new file mode 100644 index 000000000..cc750f51a --- /dev/null +++ b/benchmarks/pretraining_morphologizer_oscar/project.yml @@ -0,0 +1,264 @@ +title: "Enhancing Morphological Analysis with spaCy Pretraining" +description: "This project explores the effectiveness of pretraining techniques on morphological analysis (morphologizer) by conducting experiments on multiple languages. The objective of this project is to demonstrate the benefits of pretraining word vectors using domain-specific data on the performance of the morphological analysis. We leverage the OSCAR dataset to pretrain our vectors for tok2vec and utilize the UD_Treebanks dataset to train a morphologizer component. We evaluate and compare the performance of different pretraining techniques and the performance of models without any pretraining." 
+# Variables can be referenced across the project.yml using ${vars.var_name}
+vars:
+  # Change all three variables to change the language
+  spacy_lang: "en" # en, de, nl
+  spacy_model: "en_core_web_lg" # "en_core_web_lg", "de_core_news_lg", "nl_core_news_lg"
+  ud_treebank: "UD_English-EWT" # UD_English-EWT, UD_German-HDT, UD_Dutch-Alpino
+
+  epochs: 20
+  eval_frequency: 200
+
+  oscar_path: "data"
+  max_pretraining_texts: 1000
+  # This variable depends on the output of the pretrain commands
+  pretraining_model: "model-last.bin"
+  # Choose -1 for CPU
+  gpu: -1
+
+spacy_version: ">=3.5.2,<4.0.0"
+
+# These are the directories that the project needs. The project CLI will make
+# sure that they always exist.
+directories: ["assets", "scripts", "data", "training", "pretraining", "metrics"]
+
+# Assets that should be downloaded or available in the directory. We're shipping
+# some of them with the project, so they won't have to be downloaded. But the
+# 'project assets' command still lets you verify that the checksums match.
+assets:
+  - dest: "assets/ud-treebanks-v2.5.tgz"
+    url: "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3105/ud-treebanks-v2.5.tgz?sequence=1&isAllowed=y"
+    checksum: "388456892760ada0db8e20ce58501025"
+
+# Workflows are sequences of commands (see below) executed in order. You can
+# run them via "spacy project run [workflow]". If a command's inputs/outputs
+# haven't changed, it won't be re-run.
+workflows:
+  # Run all data preparation scripts (do not forget to run "spacy project assets" first)
+  data:
+    - download_oscar
+    - download_model
+    - extract_ud
+    - convert_ud
+  # Train without pretraining and without static vectors
+  training:
+    - train
+    - evaluate
+  # Train with static vectors
+  training_static:
+    - train_static
+    - evaluate_static
+  # Train with character pretraining
+  training_char:
+    - pretrain_char
+    - train_char
+    - evaluate_char
+  # Train with vector pretraining
+  training_vector:
+    - pretrain_vector
+    - train_vector
+    - evaluate_vector
+  # Train with transformer
+  training_trf:
+    - train_trf
+    - evaluate_trf
+
+# Project commands, specified in a style similar to CI config files (e.g. Azure
+# pipelines). The name is the command name that lets you trigger the command
+# via "spacy project run [command] [path]". The help message is optional and
+# shown when executing "spacy project run [optional command] [path] --help".
+commands: + - name: install_requirements + help: "Download and install all requirements" + script: + - "pip install -r requirements.txt" + + - name: download_oscar + help: "Download a subset of the oscar dataset" + script: + - "python scripts/get_oscar_dataset.py ${vars.max_pretraining_texts} ${vars.spacy_lang} ${vars.oscar_path}/oscar_data_${vars.spacy_lang}.jsonl" + outputs: + - "${vars.oscar_path}/oscar_data_${vars.spacy_lang}.jsonl" + + - name: download_model + help: "Download the specified spaCy model for vector-objective pretraining" + script: + - "python -m spacy download ${vars.spacy_model}" + + - name: extract_ud + help: "Extract the ud-treebanks data" + script: + - "tar xf assets/ud-treebanks-v2.5.tgz -C assets/" + deps: + - "assets/ud-treebanks-v2.5.tgz" + outputs: + - "assets/ud-treebanks-v2.5/" + + - name: convert_ud + help: "Convert the ud-treebanks data to spaCy's format" + script: + - "python scripts/copy_files.py train conllu assets/ud-treebanks-v2.5/${vars.ud_treebank}/ data/${vars.ud_treebank}/train/" + - "python scripts/copy_files.py dev conllu assets/ud-treebanks-v2.5/${vars.ud_treebank}/ data/${vars.ud_treebank}/dev/" + - "python scripts/copy_files.py test conllu assets/ud-treebanks-v2.5/${vars.ud_treebank}/ data/${vars.ud_treebank}/test/" + - "python -m spacy convert data/${vars.ud_treebank}/train/ data/${vars.ud_treebank}/ --converter conllu -n 10 -T -C" + - "python -m spacy convert data/${vars.ud_treebank}/dev/ data/${vars.ud_treebank}/ --converter conllu -n 10 -T -C" + - "python -m spacy convert data/${vars.ud_treebank}/test/ data/${vars.ud_treebank}/ --converter conllu -n 10 -T -C" + deps: + - "assets/ud-treebanks-v2.5/" + outputs: + - "data/${vars.ud_treebank}/train.spacy" + - "data/${vars.ud_treebank}/dev.spacy" + - "data/${vars.ud_treebank}/test.spacy" + + - name: train + help: "Train a morphologizer component without pretrained weights and static vectors" + script: + - "python -m spacy train configs/config.cfg -o training/${vars.ud_treebank}/no_pretraining/ --gpu-id ${vars.gpu} --nlp.lang ${vars.spacy_lang} --paths.train data/${vars.ud_treebank}/train.spacy --paths.dev data/${vars.ud_treebank}/dev.spacy --paths.log_file metrics/${vars.ud_treebank}_no_pretraining.jsonl --training.max_epochs ${vars.epochs} --training.eval_frequency ${vars.eval_frequency}" + deps: + - "data/${vars.ud_treebank}/train.spacy" + - "data/${vars.ud_treebank}/dev.spacy" + - "configs/config.cfg" + outputs: + - "training/${vars.ud_treebank}/no_pretraining/model-best" + + - name: evaluate + help: "Evaluate the trained morphologizer component without pretrained weights and static vectors" + script: + - "python -m spacy evaluate training/${vars.ud_treebank}/no_pretraining/model-best data/${vars.ud_treebank}/test.spacy --output metrics/${vars.ud_treebank}_no_pretraining.json --gpu-id ${vars.gpu}" + deps: + - "training/${vars.ud_treebank}/no_pretraining/model-best" + - "data/${vars.ud_treebank}/test.spacy" + outputs: + - "metrics/${vars.ud_treebank}/no_pretraining/" + + - name: train_static + help: "Train a morphologizer component with static vectors from a pretrained model" + script: + - "python -m spacy train configs/config_static.cfg -o training/${vars.ud_treebank}/static/ --gpu-id ${vars.gpu} --nlp.lang ${vars.spacy_lang} --paths.train data/${vars.ud_treebank}/train.spacy --paths.dev data/${vars.ud_treebank}/dev.spacy --paths.vectors ${vars.spacy_model} --paths.log_file metrics/${vars.ud_treebank}_static.jsonl --training.max_epochs ${vars.epochs} --training.eval_frequency 
${vars.eval_frequency}" + deps: + - "data/${vars.ud_treebank}/train.spacy" + - "data/${vars.ud_treebank}/dev.spacy" + - "configs/config_static.cfg" + outputs: + - "training/${vars.ud_treebank}/static/model-best" + + - name: evaluate_static + help: "Evaluate the trained morphologizer component with static weights" + script: + - "python -m spacy evaluate training/${vars.ud_treebank}/static/model-best data/${vars.ud_treebank}/test.spacy --output metrics/${vars.ud_treebank}_static.json --gpu-id ${vars.gpu}" + deps: + - "training/${vars.ud_treebank}/static/model-best" + - "data/${vars.ud_treebank}/test.spacy" + outputs: + - "metrics/${vars.ud_treebank}/static/" + + - name: pretrain_char + help: "Pretrain a tok2vec component with the character objective" + script: + - "python -m spacy pretrain configs/config_pretrain_char.cfg pretraining/${vars.spacy_lang}/character_objective --gpu-id ${vars.gpu} --paths.raw_text ${vars.oscar_path}/oscar_data_${vars.spacy_lang}.jsonl --nlp.lang ${vars.spacy_lang} --pretraining.max_epochs ${vars.epochs} --training.eval_frequency ${vars.eval_frequency}" + deps: + - "${vars.oscar_path}" + - "configs/config_pretrain_char.cfg" + outputs: + - "pretraining/${vars.spacy_lang}/character_objective/${vars.pretraining_model}" + + - name: train_char + help: "Train a morphologizer component with pretrained weights (character_objective)" + script: + - "python -m spacy train configs/config_pretrain_char.cfg -o training/${vars.ud_treebank}/character_objective/ --gpu-id ${vars.gpu} --nlp.lang ${vars.spacy_lang} --paths.train data/${vars.ud_treebank}/train.spacy --paths.dev data/${vars.ud_treebank}/dev.spacy --paths.init_tok2vec pretraining/${vars.spacy_lang}/character_objective/${vars.pretraining_model} --nlp.lang ${vars.spacy_lang} --paths.log_file metrics/${vars.ud_treebank}_character_objective.jsonl --training.max_epochs ${vars.epochs} --training.eval_frequency ${vars.eval_frequency}" + deps: + - "data/${vars.ud_treebank}/train.spacy" + - "data/${vars.ud_treebank}/dev.spacy" + - "configs/config_pretrain_char.cfg" + - "pretraining/${vars.spacy_lang}/character_objective/${vars.pretraining_model}" + outputs: + - "training/${vars.ud_treebank}/character_objective/model-best" + + - name: evaluate_char + help: "Evaluate the trained morphologizer component with pretrained weights (character-objective)" + script: + - "python -m spacy evaluate training/${vars.ud_treebank}/character_objective/model-best data/${vars.ud_treebank}/test.spacy --output metrics/${vars.ud_treebank}_character_objective.json --gpu-id ${vars.gpu}" + deps: + - "training/${vars.ud_treebank}/character_objective/model-best" + - "data/${vars.ud_treebank}/test.spacy" + outputs: + - "metrics/${vars.ud_treebank}/character_objective/" + + - name: pretrain_vector + help: "Pretrain a tok2vec component with the vector objective" + script: + - "python -m spacy pretrain configs/config_pretrain_vector.cfg pretraining/${vars.spacy_lang}/vector_objective --gpu-id ${vars.gpu} --paths.raw_text ${vars.oscar_path}/oscar_data_${vars.spacy_lang}.jsonl --paths.vectors ${vars.spacy_model} --nlp.lang ${vars.spacy_lang} --pretraining.max_epochs ${vars.epochs} --training.eval_frequency ${vars.eval_frequency}" + deps: + - "${vars.oscar_path}" + - "configs/config_pretrain_vector.cfg" + outputs: + - "pretraining/${vars.spacy_lang}/vector_objective/${vars.pretraining_model}" + + - name: train_vector + help: "Train a morphologizer component with pretrained weights (vector_objective)" + script: + - "python -m spacy train 
configs/config_pretrain_vector.cfg -o training/${vars.ud_treebank}/vector_objective/ --gpu-id ${vars.gpu} --nlp.lang ${vars.spacy_lang} --paths.train data/${vars.ud_treebank}/train.spacy --paths.dev data/${vars.ud_treebank}/dev.spacy --paths.init_tok2vec pretraining/${vars.spacy_lang}/vector_objective/${vars.pretraining_model} --paths.vectors ${vars.spacy_model} --nlp.lang ${vars.spacy_lang} --paths.log_file metrics/${vars.ud_treebank}_vector_objective.jsonl --training.max_epochs ${vars.epochs} --training.eval_frequency ${vars.eval_frequency}"
+    deps:
+      - "data/${vars.ud_treebank}/train.spacy"
+      - "data/${vars.ud_treebank}/dev.spacy"
+      - "configs/config_pretrain_vector.cfg"
+      - "pretraining/${vars.spacy_lang}/vector_objective/${vars.pretraining_model}"
+    outputs:
+      - "training/${vars.ud_treebank}/vector_objective/model-best"
+
+  - name: evaluate_vector
+    help: "Evaluate the trained morphologizer component with pretrained weights (vector-objective)"
+    script:
+      - "python -m spacy evaluate training/${vars.ud_treebank}/vector_objective/model-best data/${vars.ud_treebank}/test.spacy --output metrics/${vars.ud_treebank}_vector_objective.json --gpu-id ${vars.gpu}"
+    deps:
+      - "training/${vars.ud_treebank}/vector_objective/model-best"
+      - "data/${vars.ud_treebank}/test.spacy"
+    outputs:
+      - "metrics/${vars.ud_treebank}/vector_objective/"
+
+  - name: train_trf
+    help: "Train a morphologizer component with transformer embeddings"
+    script:
+      - "python -m spacy train configs/config_trf.cfg -o training/${vars.ud_treebank}/transformer/ --gpu-id ${vars.gpu} --nlp.lang ${vars.spacy_lang} --paths.train data/${vars.ud_treebank}/train.spacy --paths.dev data/${vars.ud_treebank}/dev.spacy --paths.log_file metrics/${vars.ud_treebank}_transformer.jsonl --training.max_epochs ${vars.epochs} --training.eval_frequency ${vars.eval_frequency}"
+    deps:
+      - "data/${vars.ud_treebank}/train.spacy"
+      - "data/${vars.ud_treebank}/dev.spacy"
+      - "configs/config_trf.cfg"
+    outputs:
+      - "training/${vars.ud_treebank}/transformer/model-best"
+
+  - name: evaluate_trf
+    help: "Evaluate the trained morphologizer component with transformer embeddings"
+    script:
+      - "python -m spacy evaluate training/${vars.ud_treebank}/transformer/model-best data/${vars.ud_treebank}/test.spacy --output metrics/${vars.ud_treebank}_transformer.json --gpu-id ${vars.gpu}"
+    deps:
+      - "training/${vars.ud_treebank}/transformer/model-best"
+      - "data/${vars.ud_treebank}/test.spacy"
+    outputs:
+      - "metrics/${vars.ud_treebank}/transformer/"
+
+  - name: evaluate_metrics
+    help: "Evaluate all experiments and create a summary json file"
+    script:
+      - "python scripts/evaluate_metrics.py ./metrics/"
+
+  - name: "reset_project"
+    help: "Reset the project to its original state and delete all training progress"
+    script:
+      - "python scripts/reset.py training"
+      - "python scripts/reset.py metrics"
+      - "python scripts/reset.py assets"
+      - "python scripts/reset.py data"
+      - "python scripts/reset.py pretraining"
+
+  - name: "reset_training"
+    help: "Reset the training progress"
+    script:
+      - "python scripts/reset.py training"
+      - "python scripts/reset.py pretraining"
+
+  - name: "reset_metrics"
+    help: "Delete the metrics folder"
+    script:
+      - "python scripts/reset.py metrics"
diff --git a/benchmarks/pretraining_morphologizer_oscar/requirements.txt b/benchmarks/pretraining_morphologizer_oscar/requirements.txt
new file mode 100644
index 000000000..1f029ba77
--- /dev/null
+++ b/benchmarks/pretraining_morphologizer_oscar/requirements.txt
@@ -0,0 +1,4 @@
+spacy
+datasets
+spacy-transformers +matplotlib \ No newline at end of file diff --git a/benchmarks/pretraining_morphologizer_oscar/scripts/copy_files.py b/benchmarks/pretraining_morphologizer_oscar/scripts/copy_files.py new file mode 100644 index 000000000..66bd79579 --- /dev/null +++ b/benchmarks/pretraining_morphologizer_oscar/scripts/copy_files.py @@ -0,0 +1,14 @@ +import typer +from pathlib import Path +import glob +import shutil + + +def main(stem: str, ext: str, input_dir: Path, output_dir: Path): + output_dir.mkdir(parents=True, exist_ok=True) + for filename in glob.glob(str(input_dir.resolve()) + f"/*-{stem}*.{ext}"): + shutil.copy(filename, str(output_dir.resolve())) + + +if __name__ == "__main__": + typer.run(main) diff --git a/benchmarks/pretraining_morphologizer_oscar/scripts/evaluate_metrics.py b/benchmarks/pretraining_morphologizer_oscar/scripts/evaluate_metrics.py new file mode 100644 index 000000000..7c0d5c88a --- /dev/null +++ b/benchmarks/pretraining_morphologizer_oscar/scripts/evaluate_metrics.py @@ -0,0 +1,188 @@ +import typer +from pathlib import Path +import srsly +from wasabi import msg +from os import walk +import matplotlib.pyplot as plt +import numpy as np + + +def main(metric_folder: Path): + + # Hardcode all datasets + datasets = { + "UD_English-EWT": { + "no_pretraining": {}, + "static": {}, + "character_objective": {}, + "vector_objective": {}, + "transformer": {}, + }, + "UD_German-HDT": { + "no_pretraining": {}, + "static": {}, + "character_objective": {}, + "vector_objective": {}, + "transformer": {}, + }, + "UD_Dutch-Alpino": { + "no_pretraining": {}, + "static": {}, + "character_objective": {}, + "vector_objective": {}, + "transformer": {}, + }, + } + + datasets_exist = set() + + # Import all metrics and assign them to a dict + msg.info("Importing all metrics") + for (dirpath, dirnames, filenames) in walk(metric_folder): + for filename in filenames: + for dataset in datasets: + if str(dataset) in str(filename): + datasets_exist.add(dataset) + for key in datasets[dataset]: + if str(key) in str(filename): + if ".jsonl" in str(filename): + data = list(srsly.read_jsonl(metric_folder / filename)) + datasets[dataset][key]["training"] = data + else: + data = srsly.read_json(metric_folder / filename) + datasets[dataset][key]["evaluation"] = data + msg.good(f"Found metrics for {str(datasets_exist)}") + + del_list = [] + for dataset in datasets: + if dataset not in datasets_exist: + del_list.append(dataset) + for del_key in del_list: + del datasets[del_key] + + del_list_experiment = [] + for dataset in datasets: + for experiment in dataset: + if len(experiment) == 0: + del_list_experiment.append((dataset, experiment)) + for del_experiment in del_list_experiment: + del datasets[del_experiment[0]][del_experiment[1]] + + # Training eval + msg.info("Starting training evaluation") + for dataset in datasets: + if dataset not in datasets_exist: + continue + x_list = [] + y_list = [] + name_list = [] + for metric_type in datasets[dataset]: + if not datasets[dataset][metric_type]: + continue + epochs = [] + scores = [] + for line in datasets[dataset][metric_type]["training"]: + epochs.append(line["epoch"]) + scores.append(line["score"]) + x_list.append(epochs) + y_list.append(scores) + name_list.append(metric_type) + + for x, y, name in zip(x_list, y_list, name_list): + plt.plot(x, y, label=name) + + for _x, _y in zip(x, y): + label = "{:.2f}".format(_y) + plt.annotate(label, (_x, _y + 0.005), size=7, ha="center") + + # Plot settings + ax = plt.gca() + ax.set_ylim([0.8, 1.0]) + 
ax.set_ylabel("Score") + ax.set_xlabel("Epochs") + plt.legend() + plt.grid() + plt.title(f"Training {dataset}", size=15) + plt.savefig(metric_folder / f"{dataset}_training_graph.png", dpi=300) + msg.good(f"Saved training plot for {dataset}") + + # Evaluation comparison + msg.info("Starting evaluation comparison") + + # Set metrics which we want to compare + compare_metrics = [ + "pos_acc", + "morph_micro_p", + "morph_micro_f", + "morph_micro_r", + "morph_per_feat", + "speed", + ] + + metric_types = [ + "no_pretraining", + "static", + "character_objective", + "vector_objective", + "transformer", + ] + for dataset in datasets: + if dataset not in datasets_exist: + continue + metric_table = {} + for metric in compare_metrics: + metric_table[metric] = {} + for metric_type in datasets[dataset]: + if not datasets[dataset][metric_type]: + continue + eval_data = datasets[dataset][metric_type]["evaluation"] + + if type(eval_data[metric]) == dict: + for label in eval_data[metric]: + if label not in metric_table: + metric_table[label] = {} + metric_table[label][metric_type] = eval_data[metric][label]["f"] + else: + metric_table[metric][metric_type] = eval_data[metric] + + dataset_output = "" + header = "| Label |" + row_sep = "| :----: |" + for metric in metric_types: + header += f" {metric} |" + row_sep += " :----: |" + dataset_output += header + "\n" + dataset_output += row_sep + "\n" + + for metric in metric_table: + row = f"| {metric} |" + if len(metric_table[metric]) > 0: + no_pretraining_value = 0 + for metric_type in metric_types: + if metric_type not in metric_table[metric]: + continue + difference = 0 + if metric_type == "no_pretraining": + no_pretraining_value = metric_table[metric][metric_type] + else: + difference = ( + metric_table[metric][metric_type] - no_pretraining_value + ) + + value = "{:.2f}".format(metric_table[metric][metric_type]) + difference = "{:.2f}".format(difference) + row += f" {value} ({difference}) |" + + dataset_output += row + "\n" + + with open( + metric_folder / f"{dataset}_evaluation_comparison.md", "w", encoding="utf-8" + ) as f: + f.write(dataset_output) + + msg.good(f"Saved eval comparison for {dataset}") + msg.info(f"Done!") + + +if __name__ == "__main__": + typer.run(main) diff --git a/benchmarks/pretraining_morphologizer_oscar/scripts/get_latest_model_weight.py b/benchmarks/pretraining_morphologizer_oscar/scripts/get_latest_model_weight.py new file mode 100644 index 000000000..77d309dd6 --- /dev/null +++ b/benchmarks/pretraining_morphologizer_oscar/scripts/get_latest_model_weight.py @@ -0,0 +1,30 @@ +import os +import yaml +import typer +from pathlib import Path +from wasabi import msg + + +def main(weights_folder: Path, project_file: Path): + number_list = [] + for file in os.listdir(weights_folder): + if "model" in file: + number_list.append(int(file[5:-4])) + + try: + model_name = f"model{max(number_list)}.bin" + except: + msg.warn("No pretrained weights found. 
Make sure to run the pretrain command.") + return + + with open(project_file, "r") as file: + yaml_content = yaml.safe_load(file) + + yaml_content["vars"]["pretraining_model"] = model_name + + with open(project_file, "w") as outfile: + yaml.dump(yaml_content, outfile, default_flow_style=False) + + +if __name__ == "__main__": + typer.run(main) diff --git a/benchmarks/pretraining_morphologizer_oscar/scripts/get_oscar_dataset.py b/benchmarks/pretraining_morphologizer_oscar/scripts/get_oscar_dataset.py new file mode 100644 index 000000000..19be46518 --- /dev/null +++ b/benchmarks/pretraining_morphologizer_oscar/scripts/get_oscar_dataset.py @@ -0,0 +1,35 @@ +import typer +from pathlib import Path +import srsly +from datasets import load_dataset +from itertools import islice +from wasabi import msg +from tqdm import tqdm + + +def main(max_texts: int, lang: str, output_path: Path): + """Uses the datasets API from HuggingFace to retrieve a set amount of data entries from the OSCAR corpus and saves it as a jsonl file""" + + msg.info( + f"Start downloading {max_texts} data entries from the OSCAR corpus (lang: {lang})" + ) + language = f"unshuffled_deduplicated_{lang}" + dataset = load_dataset("oscar", language, split="train", streaming=True) + data = [] + text_length = 0 + for line in tqdm( + islice(iter(dataset), max_texts), + total=max_texts, + desc="Downloading OSCAR extract", + ): + data.append(line) + text_length += len(line["text"]) - line["text"].count(" ") + srsly.write_jsonl(output_path, data) + msg.info( + f"Downloaded extract contains about {int(round(text_length/1000000,0))} million characters" + ) + msg.good(f"Saved to {output_path}") + + +if __name__ == "__main__": + typer.run(main) diff --git a/benchmarks/pretraining_morphologizer_oscar/scripts/reset.py b/benchmarks/pretraining_morphologizer_oscar/scripts/reset.py new file mode 100644 index 000000000..def34298f --- /dev/null +++ b/benchmarks/pretraining_morphologizer_oscar/scripts/reset.py @@ -0,0 +1,22 @@ +import shutil +from pathlib import Path +import typer +from wasabi import Printer + +msg = Printer() + + +def main(path: Path): + """This script is used to delete directories and reset the project""" + if path.is_dir(): + answer = input(f"Are you sure you want to reset {path} (y)") + if answer.lower().strip() == "y": + try: + shutil.rmtree(path) + msg.good(f"Deleted directory {path}") + except Exception as e: + print(e) + + +if __name__ == "__main__": + typer.run(main) diff --git a/benchmarks/pretraining_morphologizer_oscar/test_project_pretraining.py b/benchmarks/pretraining_morphologizer_oscar/test_project_pretraining.py new file mode 100644 index 000000000..68a40cc5b --- /dev/null +++ b/benchmarks/pretraining_morphologizer_oscar/test_project_pretraining.py @@ -0,0 +1,10 @@ +from spacy.cli.project.run import project_run +from spacy.cli.project.assets import project_assets +from pathlib import Path + + +def test_project_pretraining(): + root = Path(__file__).parent + project_assets(root) + project_run(root, "data") + project_run(root, "training_char", overrides={"vars.epochs": 1})
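For reference, an end-to-end run of the benchmark as wired up in `project.yml` might look roughly like the following shell session. This is a sketch, not part of the patch itself; it assumes the project has been cloned locally and the default English variables are kept.

```bash
# install the Python dependencies listed in requirements.txt
python -m spacy project run install_requirements
# fetch the UD treebank archive defined under assets
python -m spacy project assets
# download the OSCAR subset and convert the treebank to spaCy's format
python -m spacy project run data
# baseline, static-vector, character-objective, vector-objective and transformer runs
python -m spacy project run training
python -m spacy project run training_static
python -m spacy project run training_char
python -m spacy project run training_vector
python -m spacy project run training_trf
# summarize all experiments into comparison tables and training plots
python -m spacy project run evaluate_metrics
```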