Skip to content

Commit

Permalink
Better handling for non-monotonic training data when copying model ckpts
Browse files Browse the repository at this point in the history
  • Loading branch information
lightvector committed Jan 6, 2024
1 parent c6de1bb commit 4334a0f
Showing 1 changed file with 7 additions and 0 deletions.
7 changes: 7 additions & 0 deletions python/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -741,6 +741,13 @@ def maybe_reload_training_data():
if train_state["train_bucket_level"] > cap:
train_state["train_bucket_level"] = cap
logging.info("New rows in bucket: %.0f" % train_state["train_bucket_level"])
if train_state["total_num_data_rows"] < train_state["train_bucket_level_at_row"]:
# Row count went backward! Either data was deleted or this network was transplanted from a different run, so reset the row count at which the train bucket was last filled.
logging.warning("Train bucket last filled at %d rows but now there are only %d rows!" % (
train_state["train_bucket_level_at_row"], train_state["total_num_data_rows"]
))
logging.warning("Data was deleted or this network was transplanted into a new run, resetting the train bucket fill rows")
train_state["train_bucket_level_at_row"] = train_state["total_num_data_rows"]

logging.info("Train steps since last reload: %.0f -> 0" % train_state["train_steps_since_last_reload"])
train_state["train_steps_since_last_reload"] = 0
Expand Down

0 comments on commit 4334a0f

Please sign in to comment.