title: "Prodigy Address Extraction model bootstrapped with LLM's"
description: |
This project creates an address extraction model. To improve annotation efficiency,
we'll experiment with using LLM's to speed up the development process.
spacy_version: ">=3.5.0,<4.0.0"
check_requirements: false
# Variables can be referenced across the project.yml using ${vars.var_name}
vars:
  name: "address_extraction"
  config: "config.cfg"
  version: "0.2.0"
  lang: "en"
  dev: "corpus/dev.spacy"
  train: "corpus/train.spacy"
  labels: "assets/labels.txt"
  generate-prompt: "Generate news article text embedded with US addresses and locations."
  generate-file: "addresses.jsonl"
  generate-n: 65
  vectors: "en_core_web_lg"
  gpu_id: -1
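  # Note: a gpu_id of -1 tells spaCy to train on CPU; set it to a CUDA device
  # ID (e.g. 0) to train on GPU instead.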
# These are the directories that the project needs. The project CLI will make
# sure that they always exist.
directories: ["assets", "corpus", "training", "scripts", "packages", "metrics"]
# Assets that should be downloaded or available in the directory. We're shipping
# them with the project, so they won't have to be downloaded. But the
# 'project assets' command still lets you verify that the checksums match.
assets:
  - dest: "assets/addresses.jsonl"
    description: "LLM-generated (synthetic) data"
  - dest: "assets/address_train.jsonl"
    description: "Annotated training data from LLM-generated (synthetic) data"
  - dest: "assets/address_eval.jsonl"
    description: "Annotated evaluation data from LLM-generated (synthetic) data"
# Workflows are sequences of commands (see below) executed in order. You can
run them via "spacy project run [workflow]". If a command's inputs/outputs
# haven't changed, it won't be re-run.
workflows:
  all:
    - install
    - load-annotations
    - ner-data-to-spacy
    - train-vectors
    - evaluate
  retrain:
    - ner-data-to-spacy
    - ner-data-debug
    - train-vectors
    - evaluate
  visualize:
    - package
    - visualize-model
# Project commands, specified in a style similar to CI config files (e.g. Azure
# pipelines). The name is the command name that lets you trigger the command
# via "spacy project run [command] [path]". The help message is optional and
# shown when executing "spacy project run [optional command] [path] --help".
commands:
  - name: "install"
    help: "Install packages"
    script:
      - "python3 -m pip install --upgrade pip"
      - "python3 -m pip install -r requirements.txt"
      - "dotenv run -- python3 -m pip install prodigy --pre -f https://xxx@download.prodi.gy"
  - name: "clean"
    help: "Remove intermediate files"
    script:
      - "rm -rf training"
      - "rm -rf corpus"
      - "rm -rf packages"
      - "rm -rf metrics"
  - name: "clean-venv"
    script:
      - "rm -rf venv"
    help: "Remove the virtual environment"
- name: "generate-data"
script:
- "dotenv run -- python3 -m prodigy terms.openai.fetch \"${vars.generate-prompt}\" ./assets/${vars.generate-file} --n ${vars.generate-n}"
help: "Create synthetic data from LLM"
- name: "ner-manual-train"
script:
- "python3 -m prodigy ner.manual address_manual blank:en assets/${vars.generate-file} --label ${vars.labels}"
help: "NER manual annotate for training from generated (synthetic) data"
- name: "ner-manual-eval"
script:
- "python3 -m prodigy ner.manual address_eval blank:en assets/${vars.generate-file} --exclude address_train --label ${vars.labels}"
help: "NER manual annotate for evaluation from generated (synthetic) data"
- name: "ner-train-curve"
script:
- "python3 -m prodigy train-curve --ner address_train,eval:address_eval"
help: "NER correct annotate for training from generated (synthetic) data"
- name: "ner-correct"
script:
- "python3 -m prodigy ner.correct address_correct training/model-last assets/${vars.generate-file} --exclude address_train,address_eval --label ${vars.labels}"
help: "NER correct annotate for training from generated (synthetic) data"
- name: "data-merge"
script:
- "python3 -m prodigy drop address_train"
- "python3 -m prodigy db-merge address_manual,address_correct address_train"
help: "Merge manual and correct data for training data"
- name: "ner-data-to-spacy"
script:
- "python3 -m prodigy data-to-spacy ./corpus --ner address_train,eval:address_eval"
help: "Convert training and evaluations to spaCy binary data"
outputs:
- "corpus/train.spacy"
- "corpus/dev.spacy"
- "corpus/config.cfg"
- name: "ner-data-debug"
help: "Run data debug on training and evaluation data"
script:
- "python3 -m spacy debug data corpus/config.cfg --paths.train ${vars.train} --paths.dev ${vars.dev}"
- name: "train"
help: "Train pipeline models"
script:
- "python3 -m spacy train corpus/config.cfg --paths.train ${vars.train} --paths.dev ${vars.dev} --output training/ --gpu-id ${vars.gpu_id}"
deps:
- "corpus/config.cfg"
- "corpus/train.spacy"
- "corpus/dev.spacy"
outputs:
- "training/model-best"
- name: "load-annotations"
help: "Load training and evaluation data as Prodigy datasets"
script:
- "python3 -m prodigy drop address_train"
- "python3 -m prodigy db-in address_train assets/address_train.jsonl"
- "python3 -m prodigy drop address_eval"
- "python3 -m prodigy db-in address_eval assets/address_eval.jsonl"
- name: "export-annotations"
help: "Explort training and evaluation data as jsonl files"
script:
- "python3 -m prodigy db-out address_train > ./assets/address_train.jsonl"
- "python3 -m prodigy db-out address_eval > ./assets/address_eval.jsonl"
- name: "train-vectors"
help: "Train pipeline models with vectors"
script:
- >-
python3 -m spacy train corpus/config.cfg
--paths.train ${vars.train}
--paths.dev ${vars.dev}
--output training/
--gpu-id ${vars.gpu_id}
--paths.vectors ${vars.vectors}
--components.tok2vec.model.embed.include_static_vectors True
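    # --paths.vectors points the config at the en_core_web_lg word vectors, and
    # include_static_vectors True makes the tok2vec embedding layer combine them
    # with its hash embeddings (this assumes config.cfg uses an embed layer such
    # as MultiHashEmbed that exposes this switch).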
    deps:
      - "corpus/config.cfg"
      - "corpus/train.spacy"
      - "corpus/dev.spacy"
    outputs:
      - "training/model-best"
- name: "evaluate"
help: "Evaluate the model and export metrics"
script:
- >-
python3 -m spacy evaluate
training/model-last
${vars.dev}
--output training/model-last.json
- >-
python3 -m spacy evaluate
training/model-best
${vars.dev}
--output training/model-best.json
deps:
- "corpus/dev.spacy"
- "training/model-last"
- "training/model-best"
outputs:
- "training/model-last.json"
- "training/model-best.json"
  - name: "package"
    help: "Package the trained model as a pip package"
    script:
      - "python3 -m spacy package training/model-last packages --name ${vars.name} --version ${vars.version} --force"
    deps:
      - "training/model-last"
    outputs_no_cache:
      - "packages/${vars.lang}_${vars.name}-${vars.version}/dist/${vars.lang}_${vars.name}-${vars.version}.tar.gz"
  - name: "visualize-model"
    help: "Visualize the model's output interactively using Streamlit"
    script:
      - "streamlit run scripts/visualize_model.py"
    deps:
      - "scripts/visualize_model.py"
      - "training/model-best"
      - "training/model-last"
  - name: "document"
    help: "Auto-generate a README.md describing the project"
    script:
      - "spacy project document --output README.md"