Manifest refactor #284

Merged: 6 commits, Jan 23, 2020
226 changes: 119 additions & 107 deletions collections/nemo_asr/nemo_asr/data_layer.py
@@ -1,27 +1,30 @@
# Copyright (c) 2019 NVIDIA Corporation
"""
This package contains Neural Modules responsible for ASR-related
data layers.
"""
__all__ = ['AudioToTextDataLayer',
'KaldiFeatureDataLayer',
'TranscriptDataLayer']
"""This package contains Neural Modules responsible for ASR data layers."""

from functools import partial

import torch

import nemo
from nemo.backends.pytorch import DataLayerNM
from nemo.core import DeviceType
from nemo.core.neural_types import *
from nemo.utils.misc import pad_to

from .parts.dataset import (
AudioDataset,
seq_collate_fn,
KaldiFeatureDataset,
TranscriptDataset)
TranscriptDataset,
)
from .parts.features import WaveformFeaturizer

__all__ = [
'AudioToTextDataLayer',
'KaldiFeatureDataLayer',
'TranscriptDataLayer',
]
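
For context (not shown in this diff): the manifest_filepath consumed by AudioToTextDataLayer is assumed to follow the usual NeMo ASR convention of one JSON object per line with audio_filepath, duration, and text fields. A hedged sketch of writing such a manifest, with placeholder paths:

# Illustrative sketch only; field names assumed from NeMo ASR convention, paths are placeholders.
import json

entries = [
    {"audio_filepath": "/data/wavs/utt001.wav", "duration": 3.2, "text": "hello world"},
    {"audio_filepath": "/data/wavs/utt002.wav", "duration": 1.7, "text": "good morning"},
]
with open("train_manifest.json", "w") as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")  # one JSON object per line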


class AudioToTextDataLayer(DataLayerNM):
"""Data Layer for general ASR tasks.
@@ -103,62 +106,65 @@ def output_ports(self):

"""
return {
"audio_signal": NeuralType({0: AxisType(BatchTag),
1: AxisType(TimeTag)}),

"a_sig_length": NeuralType({0: AxisType(BatchTag)}),

"transcripts": NeuralType({0: AxisType(BatchTag),
1: AxisType(TimeTag)}),

"transcript_length": NeuralType({0: AxisType(BatchTag)})
'audio_signal': NeuralType(
{0: AxisType(BatchTag), 1: AxisType(TimeTag)}
),
'a_sig_length': NeuralType({0: AxisType(BatchTag)}),
'transcripts': NeuralType(
{0: AxisType(BatchTag), 1: AxisType(TimeTag)}
),
'transcript_length': NeuralType({0: AxisType(BatchTag)}),
}
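
Reading these port declarations loosely (an interpretation, not text from the PR): each batch pairs padded tensors with per-example lengths, roughly shaped as below.

# Assumed tensor shapes implied by the AxisType tags above; values are dummies.
import torch

batch_size, max_audio_len, max_text_len = 4, 16000, 32
audio_signal = torch.zeros(batch_size, max_audio_len)                   # {Batch, Time}
a_sig_length = torch.tensor([16000, 12000, 9000, 15000])                # {Batch}
transcripts = torch.zeros(batch_size, max_text_len, dtype=torch.long)   # {Batch, Time}
transcript_length = torch.tensor([32, 20, 15, 28])                      # {Batch}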

def __init__(
self, *,
manifest_filepath,
labels,
batch_size,
sample_rate=16000,
int_values=False,
bos_id=None,
eos_id=None,
pad_id=None,
min_duration=0.1,
max_duration=None,
normalize_transcripts=True,
trim_silence=False,
load_audio=True,
drop_last=False,
shuffle=True,
num_workers=0,
# perturb_config=None,
**kwargs
self,
*,
manifest_filepath,
labels,
batch_size,
sample_rate=16000,
int_values=False,
bos_id=None,
eos_id=None,
pad_id=None,
min_duration=0.1,
max_duration=None,
normalize_transcripts=True,
trim_silence=False,
load_audio=True,
drop_last=False,
shuffle=True,
num_workers=0,
# perturb_config=None,
**kwargs,
):
super().__init__(**kwargs)

self._featurizer = WaveformFeaturizer(
sample_rate=sample_rate, int_values=int_values, augmentor=None)
sample_rate=sample_rate, int_values=int_values, augmentor=None
)

# Set up dataset
dataset_params = {'manifest_filepath': manifest_filepath,
'labels': labels,
'featurizer': self._featurizer,
'max_duration': max_duration,
'min_duration': min_duration,
'normalize': normalize_transcripts,
'trim': trim_silence,
'bos_id': bos_id,
'eos_id': eos_id,
'load_audio': load_audio}

dataset_params = {
'manifest_filepath': manifest_filepath,
'labels': labels,
'featurizer': self._featurizer,
'max_duration': max_duration,
'min_duration': min_duration,
'normalize': normalize_transcripts,
'trim': trim_silence,
'bos_id': bos_id,
'eos_id': eos_id,
'load_audio': load_audio,
}
self._dataset = AudioDataset(**dataset_params)

# Set up data loader
if self._placement == DeviceType.AllGpu:
nemo.logging.info('Parallelizing DATALAYER')
nemo.logging.info("Parallelizing Datalayer.")
sampler = torch.utils.data.distributed.DistributedSampler(
self._dataset)
self._dataset
)
else:
sampler = None

@@ -170,7 +176,7 @@ def __init__(
drop_last=drop_last,
shuffle=shuffle if sampler is None else False,
sampler=sampler,
num_workers=num_workers
num_workers=num_workers,
)

def __len__(self):
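
A minimal instantiation sketch of the refactored layer, using only keyword arguments visible in this diff (module path, label set, and file names are illustrative assumptions, not taken from the PR):

# Hypothetical usage sketch; paths and vocabulary are placeholders.
from nemo_asr.data_layer import AudioToTextDataLayer

labels = [" ", "a", "b", "c"]  # placeholder character vocabulary
data_layer = AudioToTextDataLayer(
    manifest_filepath="train_manifest.json",
    labels=labels,
    batch_size=32,
    sample_rate=16000,
    shuffle=True,
    num_workers=4,
)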
@@ -243,46 +249,52 @@ def output_ports(self):

"""
return {
"processed_signal": NeuralType({0: AxisType(BatchTag),
1: AxisType(SpectrogramSignalTag),
2: AxisType(ProcessedTimeTag)}),

"processed_length": NeuralType({0: AxisType(BatchTag)}),

"transcripts": NeuralType({0: AxisType(BatchTag),
1: AxisType(TimeTag)}),

"transcript_length": NeuralType({0: AxisType(BatchTag)})
'processed_signal': NeuralType(
{
0: AxisType(BatchTag),
1: AxisType(SpectrogramSignalTag),
2: AxisType(ProcessedTimeTag),
}
),
'processed_length': NeuralType({0: AxisType(BatchTag)}),
'transcripts': NeuralType(
{0: AxisType(BatchTag), 1: AxisType(TimeTag)}
),
'transcript_length': NeuralType({0: AxisType(BatchTag)}),
}

def __init__(
self, *,
kaldi_dir,
labels,
batch_size,
min_duration=None,
max_duration=None,
normalize_transcripts=True,
drop_last=False,
shuffle=True,
num_workers=0,
**kwargs
self,
*,
kaldi_dir,
labels,
batch_size,
min_duration=None,
max_duration=None,
normalize_transcripts=True,
drop_last=False,
shuffle=True,
num_workers=0,
**kwargs,
):
super().__init__(**kwargs)

# Set up dataset
dataset_params = {'kaldi_dir': kaldi_dir,
'labels': labels,
'min_duration': min_duration,
'max_duration': max_duration,
'normalize': normalize_transcripts}
dataset_params = {
"kaldi_dir": kaldi_dir,
"labels": labels,
"min_duration": min_duration,
"max_duration": max_duration,
"normalize": normalize_transcripts,
}
self._dataset = KaldiFeatureDataset(**dataset_params)

# Set up data loader
if self._placement == DeviceType.AllGpu:
nemo.logging.info('Parallelizing DATALAYER')
nemo.logging.info("Parallelizing DATALAYER")
sampler = torch.utils.data.distributed.DistributedSampler(
self._dataset)
self._dataset
)
else:
sampler = None

@@ -293,7 +305,7 @@ def __init__(
drop_last=drop_last,
shuffle=shuffle if sampler is None else False,
sampler=sampler,
num_workers=num_workers
num_workers=num_workers,
)
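
The shuffle=shuffle if sampler is None else False expression above follows the standard PyTorch rule that a DataLoader cannot combine shuffle=True with an explicit sampler; under DistributedSampler, shuffling is delegated to the sampler. A generic sketch of that pattern, independent of NeMo:

# Generic PyTorch pattern for reference; not code from this PR.
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

dataset = TensorDataset(torch.arange(100))
use_distributed = False  # True only under an initialized torch.distributed process group
sampler = DistributedSampler(dataset) if use_distributed else None
loader = DataLoader(
    dataset,
    batch_size=8,
    sampler=sampler,
    shuffle=(sampler is None),  # shuffle and sampler are mutually exclusive
)
# In a distributed run, sampler.set_epoch(epoch) should be called each epoch to reshuffle.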

@staticmethod
@@ -320,13 +332,13 @@ def _collate_fn(batch):
for feat, feat_len, tkns, tkns_len in batch:
feat_len = feat_len.item()
if feat_len < max_feat_len:
pad = (0, max_feat_len - feat_len)
pad = [0, max_feat_len - feat_len]
feat = torch.nn.functional.pad(feat, pad)
features.append(feat)

tkns_len = tkns_len.item()
if tkns_len < max_tokens_len:
pad = (0, max_tokens_len - tkns_len)
pad = [0, max_tokens_len - tkns_len]
tkns = torch.nn.functional.pad(tkns, pad)
tokens.append(tkns)

@@ -377,39 +389,40 @@ def output_ports(self):

"""
return {
'texts': NeuralType({
0: AxisType(BatchTag),
1: AxisType(TimeTag)
}),

"texts_length": NeuralType({0: AxisType(BatchTag)})
'texts': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
'texts_length': NeuralType({0: AxisType(BatchTag)}),
}

def __init__(self,
path,
labels,
batch_size,
bos_id=None,
eos_id=None,
pad_id=None,
drop_last=False,
num_workers=0,
shuffle=True,
**kwargs):
def __init__(
self,
path,
labels,
batch_size,
bos_id=None,
eos_id=None,
pad_id=None,
drop_last=False,
num_workers=0,
shuffle=True,
**kwargs,
):
super().__init__(**kwargs)

# Set up dataset
dataset_params = {'path': path,
'labels': labels,
'bos_id': bos_id,
'eos_id': eos_id}
dataset_params = {
'path': path,
'labels': labels,
'bos_id': bos_id,
'eos_id': eos_id,
}

self._dataset = TranscriptDataset(**dataset_params)

# Set up data loader
if self._placement == DeviceType.AllGpu:
sampler = torch.utils.data.distributed.DistributedSampler(
self._dataset)
self._dataset
)
else:
sampler = None

@@ -423,7 +436,7 @@ def __init__(self,
drop_last=drop_last,
shuffle=shuffle if sampler is None else False,
sampler=sampler,
num_workers=num_workers
num_workers=num_workers,
)

@staticmethod
@@ -433,8 +446,7 @@ def _collate_fn(batch, pad_id, pad8=False):
if pad8:
max_len = pad_to(max_len, 8)

texts = torch.empty(len(texts_list), max_len,
dtype=torch.long)
texts = torch.empty(len(texts_list), max_len, dtype=torch.long)
texts.fill_(pad_id)

for i, s in enumerate(texts_list):
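
On the TranscriptDataLayer collate function above: pad_to(max_len, 8) from nemo.utils.misc presumably rounds the padded length up to a multiple of 8 (a common tensor-core-friendly trick), after which the batch tensor is pre-filled with pad_id and each sequence copied in. A standalone sketch of that fill-and-copy idea, with helper names that are illustrative rather than NeMo's:

# Illustrative re-implementation of the fill-and-copy padding idea; not NeMo code.
import torch

def round_up(x, multiple=8):
    # assumed behaviour of pad_to(x, 8): round x up to the next multiple of 8
    return ((x + multiple - 1) // multiple) * multiple

def collate_texts(texts_list, pad_id=0, pad8=True):
    max_len = max(len(t) for t in texts_list)
    if pad8:
        max_len = round_up(max_len, 8)
    texts = torch.full((len(texts_list), max_len), pad_id, dtype=torch.long)
    for i, t in enumerate(texts_list):
        texts[i, : len(t)] = torch.as_tensor(t, dtype=torch.long)
    return texts

print(collate_texts([[5, 3, 9], [7, 2, 4, 4, 1]]).shape)  # torch.Size([2, 8])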
3 changes: 1 addition & 2 deletions collections/nemo_asr/nemo_asr/parts/__init__.py
@@ -1,5 +1,4 @@
from .manifest import ManifestEN, ManifestBase
from .dataset import AudioDataset
from .features import WaveformFeaturizer

__all__ = ['ManifestEN', 'ManifestBase', 'AudioDataset', 'WaveformFeaturizer']
__all__ = ['AudioDataset', 'WaveformFeaturizer']
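
A side effect of this hunk worth noting (an observation, not stated in the PR): ManifestEN and ManifestBase are no longer imported into or re-exported from nemo_asr.parts, so downstream code would have to drop the old re-export:

# Before this PR (re-export removed above):
# from nemo_asr.parts import ManifestEN, ManifestBase

# After this PR, the package still re-exports only these names:
from nemo_asr.parts import AudioDataset, WaveformFeaturizer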