Skip to content

Commit

Permalink
Fall back if a valid directory is not provided
Browse files Browse the repository at this point in the history
  • Loading branch information
davidcpage committed Jan 20, 2021
1 parent 6b4ed95 commit b1713d4
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 10 deletions.
4 changes: 4 additions & 0 deletions bonito/cli/basecaller.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ def main(args):
args.reads_directory, n_proc=8, recursive=args.recursive,
read_ids=column_to_set(args.read_ids), skip=args.skip,
)
if args.n_reads:
from itertools import islice as take
reads = take(reads, args.n_reads)

basecall = load_symbol(args.model_directory, "basecall")

Expand Down Expand Up @@ -86,4 +89,5 @@ def argparser():
parser.add_argument("--ctc-min-coverage", default=0.9, type=float)
parser.add_argument("--ctc-min-accuracy", default=0.9, type=float)
parser.add_argument("--chunksize", default=4000, type=int)
parser.add_argument("--n_reads", default=0, type=int)
return parser
18 changes: 10 additions & 8 deletions bonito/cli/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,16 @@ def main(args):

print("[loading data]")
train_data = load_data(limit=args.chunks, directory=args.directory)
train_dataset = ChunkDataSet(*train_data)

test_data = load_data(limit=args.validation_chunks, directory=args.directory, validation=True)
test_dataset = ChunkDataSet(*test_data)
if os.path.exists(os.path.join(args.directory, 'validation')):
valid_data = load_data(directory=os.path.join(args.directory, 'validation'))
else:
print("[validation set not found: splitting training set]")
split = np.floor(len(train_data[0]) * 0.97).astype(np.int32)
valid_data = [x[split:] for x in train_data]
train_data = [x[:split] for x in train_data]

train_loader = DataLoader(train_dataset, batch_size=args.batch, shuffle=True, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=args.batch, num_workers=4, pin_memory=True)
train_loader = DataLoader(ChunkDataSet(*train_data), batch_size=args.batch, shuffle=True, num_workers=4, pin_memory=True)
valid_loader = DataLoader(ChunkDataSet(*valid_data), batch_size=args.batch, num_workers=4, pin_memory=True)

config = toml.load(args.config)
argsdict = dict(training=vars(args))
Expand Down Expand Up @@ -93,7 +96,7 @@ def main(args):
torch.save(model_state, os.path.join(workdir, "weights_%s.tar" % epoch))

val_loss, val_mean, val_median = test(
model, device, test_loader, criterion=criterion
model, device, valid_loader, criterion=criterion
)
except KeyboardInterrupt:
break
Expand Down Expand Up @@ -127,7 +130,6 @@ def argparser():
parser.add_argument("--epochs", default=5, type=int)
parser.add_argument("--batch", default=64, type=int)
parser.add_argument("--chunks", default=0, type=int)
parser.add_argument("--validation_chunks", default=0, type=int)
parser.add_argument("--amp", action="store_true", default=False)
parser.add_argument("--multi-gpu", action="store_true", default=False)
parser.add_argument("-f", "--force", action="store_true", default=False)
Expand Down
5 changes: 3 additions & 2 deletions bonito/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,13 +225,14 @@ def load_data(limit=None, directory=None, validation=False):
lengths = np.load(os.path.join(directory, "reference_lengths.npy"), mmap_mode='r')

indices = os.path.join(directory, "indices.npy")

if os.path.exists(indices):
idx = np.load(indices, mmap_mode='r')
if limit: idx = idx[:limit]
chunks = chunks[idx, :]
targets = targets[idx, :]
lengths = lengths[idx]

if limit:
elif limit:
chunks = chunks[:limit]
targets = targets[:limit]
lengths = lengths[:limit]
Expand Down

0 comments on commit b1713d4

Please sign in to comment.