Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Complete audio data integration: #366

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion demo/vdl_create_scratch_log
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,9 @@ with logw.mode("train") as logger:
for sample in range(10):
idx = audio.is_sample_taken()
if idx >= 0:
audio.set_sample(idx, 8000, wavdata)
# 8k sample rate, 16bit frame, 1 channel
shape = [8000, 2, 1]
audio.set_sample(idx, shape, wavdata)

audio.finish_sampling()

Expand Down
21 changes: 13 additions & 8 deletions visualdl/logic/pybind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -287,29 +287,34 @@ PYBIND11_MODULE(core, m) {
End a sampling period, it will clear all states for reservoir sampling.
)pbdoc")
.def("set_sample", &cp::Audio::SetSample, R"pbdoc(
Store the flatten audio data with sample rate specified.
Store the flatten audio data as vector of uint8 types. Audio params need to
be specified as a tuple of 3 integers as following:
sample_rate: number of samples(frames) per second, e.g. 8000, 16000 or 44100
sample_width: size of each sample(frame) in bytes, 16bit frame will be 2
num_channels: number of channels associated with the audio data, normally 1 or 2

:param index:
:type index: integer
:param sample_rate: Sample rate of audio
:type sample_rate: integer
:param audio_params: [sample rate, sample width, number of channels]
:type audio_params: tuple
:param audio_data: Flatten audio data
:type audio_data: list
)pbdoc")
.def("add_sample", &cp::Audio::AddSample, R"pbdoc(
A combined interface for is_sample_taken and set_sample, simpler but is less efficient.
Audio params details see set_sample

:param sample_rate: Sample rate of audio
:type sample_rate: integer
:param audio_params: [sample rate, sample width, number of channels]
:type audio_params: tuple
:param audio_data: Flatten audio data
:type audio_data: list
:type audio_data: list of uint8
)pbdoc");

py::class_<cp::AudioReader::AudioRecord>(m, "AudioRecord")
// TODO(Nicky) make these copyless.
.def("data", [](cp::AudioReader::AudioRecord& self) { return self.data; })
.def("sample_rate",
[](cp::AudioReader::AudioRecord& self) { return self.sample_rate; })
.def("shape",
[](cp::AudioReader::AudioRecord& self) { return self.shape; })
.def("step_id",
[](cp::AudioReader::AudioRecord& self) { return self.step_id; });

Expand Down
35 changes: 27 additions & 8 deletions visualdl/logic/sdk.cc
Original file line number Diff line number Diff line change
Expand Up @@ -459,25 +459,40 @@ void Audio::FinishSampling() {
}
}

void Audio::AddSample(int sample_rate, const std::vector<value_t>& data) {
void Audio::AddSample(const std::vector<shape_t>& shape,
const std::vector<value_t>& data) {
auto idx = IndexOfSampleTaken();
if (idx >= 0) {
SetSample(idx, sample_rate, data);
SetSample(idx, shape, data);
}
}

void Audio::SetSample(int index,
int sample_rate,
const std::vector<shape_t>& shape,
const std::vector<value_t>& data) {
CHECK_GT(sample_rate, 0)
<< "sample rate should be something like 6000, 8000 or 44100";
CHECK_EQ(shape.size(), 3)
<< "shape need to be (sample rate, sample width, num channel)";

shape_t sample_rate = shape[0];
shape_t sample_width = shape[1];
shape_t num_channels = shape[2];

CHECK_GT(sample_rate, 0) << "sample rate is number of frames per second, "
"should be something like 8000, 16000 or 44100";
CHECK_GT(sample_width, 0)
<< "sample width is frame size in bytes, 16bits frame will be 2";
CHECK_GT(num_channels, 0) << "num channel will be something like 1 or 2";
CHECK_LT(index, num_samples_)
<< "index should be less than number of samples";
CHECK_LE(index, num_records_)
<< "index should be less than or equal to number of records";

BinaryRecord brcd(GenBinaryRecordDir(step_.parent()->dir()),
std::string(data.begin(), data.end()));
// due to prototype limit size, we create a directory to log binary data such
// as audio or image
BinaryRecord brcd(
GenBinaryRecordDir(step_.parent()->dir()),
std::string(data.begin(),
data.end())); // convert vector to binary string
brcd.tofile();

auto entry = step_.MutableData<std::vector<byte_t>>(index);
Expand All @@ -490,6 +505,7 @@ void Audio::SetSample(int index,
<< old_path << " failed";
}
entry.SetRaw(brcd.filename());
entry.SetMulti(shape);
}

std::string AudioReader::caption() {
Expand All @@ -511,10 +527,13 @@ AudioReader::AudioRecord AudioReader::record(int offset, int index) {
<< "g_log_dir should be set in LogReader construction";
BinaryRecordReader brcd(GenBinaryRecordDir(g_log_dir), filename);

// convert binary string back to vector of uint8_t, equivalent of python
// numpy.fromstring(data, dtype='uint8')
std::transform(brcd.data.begin(),
brcd.data.end(),
std::back_inserter(res.data),
[](byte_t i) { return (int8_t)(i); });
[](byte_t i) { return (uint8_t)(i); });
res.shape = entry.GetMulti<shape_t>();
res.step_id = record.id();
return res;
}
Expand Down
21 changes: 16 additions & 5 deletions visualdl/logic/sdk.h
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,8 @@ struct EmbeddingReader {
* Audio component writer.
*/
struct Audio {
using value_t = float;
using value_t = uint8_t;
using shape_t = int32_t;

/*
* step_cycle: store every `step_cycle` as a record.
Expand Down Expand Up @@ -413,7 +414,8 @@ struct Audio {
* might be
* low efficiency.
*/
void AddSample(int sample_rate, const std::vector<value_t>& data);
void AddSample(const std::vector<shape_t>& shape,
const std::vector<value_t>& data);

/*
* Will this sample be taken, this interface is introduced to reduce the cost
Expand All @@ -425,7 +427,9 @@ struct Audio {
/*
* Store audio data together with its shape: [sample rate, sample width, number of channels]
*/
void SetSample(int index, int sample_rate, const std::vector<value_t>& data);
void SetSample(int index,
const std::vector<shape_t>& shape,
const std::vector<value_t>& data);

protected:
bool ToSampleThisStep() { return step_id_ % step_cycle_ == 0; }
Expand All @@ -444,11 +448,12 @@ struct Audio {
*/
struct AudioReader {
using value_t = typename Audio::value_t;
using shape_t = typename Audio::shape_t;

struct AudioRecord {
int step_id;
int sample_rate;
std::vector<int8_t> data;
std::vector<uint8_t> data;
std::vector<shape_t> shape;
};

AudioReader(const std::string& mode, TabletReader tablet)
Expand All @@ -475,6 +480,12 @@ struct AudioReader {
*/
std::vector<value_t> data(int offset, int index);

/*
* offset: offset of a step.
* index: index of a sample.
*/
std::vector<shape_t> shape(int offset, int index);

int stepid(int offset, int index);

private:
Expand Down
17 changes: 14 additions & 3 deletions visualdl/server/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,15 +222,26 @@ def get_individual_audio(storage, mode, tag, step_index, max_size=80):
audio = reader.audio(tag)
record = audio.record(step_index, offset)

data = np.array(record.data(), dtype='uint8')
shape = record.shape()
sample_rate = shape[0]
sample_width = shape[1]
num_channels = shape[2]

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LOL

# sending a temp file to front end
tempfile = NamedTemporaryFile(mode='w+b', suffix='.wav')

# write audio file to that tempfile
wavfile = wave.open(tempfile, 'wb')
wavfile.setnchannels(2)
wavfile.setsampwidth(2)

wavfile.setframerate(sample_rate)
wavfile.setnchannels(num_channels)
wavfile.setsampwidth(sample_width)

# convert to binary string to write to wav file
data = np.array(record.data(), dtype='uint8')
wavfile.writeframes(data.tostring())

# make sure the marker is at the start of file
tempfile.seek(0, 0)

return tempfile
Expand Down
1 change: 0 additions & 1 deletion visualdl/server/visualDL
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,6 @@ def audio():
@app.route('/data/plugin/audio/individualAudio')
def individual_audio():
mode = request.args.get('run')
print mode
tag = request.args.get('tag') # include a index
step_index = int(request.args.get('index')) # index of step

Expand Down