Skip to content

Commit

Permalink
Fix race condition with ready file. (tensorflow#5271)
Browse files: browse the repository at this point in the history
  • Loading branch information
reedwm authored and Taylor Robie committed Sep 11, 2018
1 parent e6ce8cd commit 34beb7a
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 1 deletion.
1 change: 1 addition & 0 deletions official/recommendation/constants.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -58,6 +58,7 @@ def __init__(self, data_dir, cache_id=None):
CYCLES_TO_BUFFER = 3 # The number of train cycles worth of data to "run ahead"
# of the main training loop.

READY_FILE_TEMP = "ready.json.temp"
READY_FILE = "ready.json"
TRAIN_RECORD_TEMPLATE = "train_{}.tfrecords"

Expand Down
8 changes: 7 additions & 1 deletion official/recommendation/data_async_generation.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -282,11 +282,17 @@ def _construct_training_records(
raise ValueError("Error detected: point counts do not match: {} vs. {}"
.format(num_pts, written_pts))

with tf.gfile.Open(os.path.join(record_dir, rconst.READY_FILE), "w") as f:
# We write to a temp file then atomically rename it to the final file, because
# writing directly to the final file can cause the main process to read a
# partially written JSON file.
ready_file_temp = os.path.join(record_dir, rconst.READY_FILE_TEMP)
with tf.gfile.Open(ready_file_temp, "w") as f:
json.dump({
"batch_size": train_batch_size,
"batch_count": batch_count,
}, f)
ready_file = os.path.join(record_dir, rconst.READY_FILE)
tf.gfile.Rename(ready_file_temp, ready_file)

log_msg("Cycle {} complete. Total time: {:.1f} seconds"
.format(train_cycle, timeit.default_timer() - st))
Expand Down

0 comments on commit 34beb7a

Please sign in to comment.