Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add model-last saving mechanism to pretraining #12459

Merged
merged 9 commits into from
Apr 3, 2023
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions spacy/tests/training/test_pretraining.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ def test_pretraining_tok2vec_characters(objective):
pretrain(filled, tmp_dir)
assert Path(tmp_dir / "model0.bin").exists()
assert Path(tmp_dir / "model4.bin").exists()
assert Path(tmp_dir / "model-last.bin").exists()
assert not Path(tmp_dir / "model5.bin").exists()


Expand Down Expand Up @@ -237,6 +238,7 @@ def test_pretraining_tagger_tok2vec(config):
pretrain(filled, tmp_dir)
assert Path(tmp_dir / "model0.bin").exists()
assert Path(tmp_dir / "model4.bin").exists()
assert Path(tmp_dir / "model-last.bin").exists()
assert not Path(tmp_dir / "model5.bin").exists()


Expand Down
41 changes: 24 additions & 17 deletions spacy/training/pretrain.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,14 @@ def pretrain(
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

def _save_model(epoch, is_temp=False, is_last=False):
    """Serialize the tok2vec weights to disk using averaged parameters.

    epoch (int): Current epoch, used to name the numbered output file.
    is_temp (bool): If True, append ".temp" to mark a mid-epoch checkpoint.
    is_last (bool): If True, write to the fixed name "model-last.bin"
        instead of an epoch-numbered file, so downstream configs can
        always reference the final weights.
    """
    is_temp_str = ".temp" if is_temp else ""
    # Swap in the averaged parameters while serializing so the saved
    # weights match what the optimizer considers the smoothed model.
    with model.use_params(optimizer.averages):
        if is_last:
            save_path = output_dir / "model-last.bin"
        else:
            save_path = output_dir / f"model{epoch}{is_temp_str}.bin"
        with save_path.open("wb") as file_:
            file_.write(model.get_ref("tok2vec").to_bytes())
log = {
"nr_word": tracker.nr_word,
Expand All @@ -76,22 +80,25 @@ def _save_model(epoch, is_temp=False):

# TODO: I think we probably want this to look more like the
# 'create_train_batches' function?
try:
    for epoch in range(epoch_resume, P["max_epochs"]):
        for batch_id, batch in enumerate(batcher(corpus(nlp))):
            docs = ensure_docs(batch)
            loss = make_update(model, docs, optimizer, objective)
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
            # Mid-epoch checkpoint every n_save_every batches, if configured.
            if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
                _save_model(epoch, is_temp=True)

        if P["n_save_epoch"]:
            # Save on the configured epoch interval, and always on the
            # final epoch so the last numbered model file exists.
            if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1:
                _save_model(epoch)
        else:
            # No interval configured: save after every epoch.
            _save_model(epoch)
        tracker.epoch_loss = 0.0
finally:
    # Always write "model-last.bin" — even when training is interrupted
    # by an error or KeyboardInterrupt — so the most recent weights are
    # recoverable.
    _save_model(P["max_epochs"], is_last=True)


def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
Expand Down
9 changes: 6 additions & 3 deletions website/docs/usage/embeddings-transformers.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -746,13 +746,16 @@ this by setting `initialize.init_tok2vec` to the filename of the `.bin` file
that you want to use from pretraining.

A pretraining step that runs for 5 epochs with an output path of `pretrain/`, as
an example, produces `pretrain/model0.bin` through `pretrain/model4.bin` plus a
copy of the last iteration as `pretrain/model-last.bin`. Additionally, you can
configure `n_save_epoch` to tell pretraining in which epoch interval it should
save the current training progress. To use the final output to initialize your
`tok2vec` layer, you could fill in this value in your config file:

```ini {title="config.cfg"}

[paths]
init_tok2vec = "pretrain/model-last.bin"

[initialize]
init_tok2vec = ${paths.init_tok2vec}
Expand Down