Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[R1.0]asr streaming server add time stamp #1850

Merged
merged 2 commits into from
May 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 44 additions & 1 deletion paddlespeech/server/engine/asr/online/asr_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
# limitations under the License.
import copy
import os
import time
from typing import Optional

import numpy as np
Expand Down Expand Up @@ -297,7 +296,10 @@ def reset(self):
self.chunk_num = 0
self.global_frame_offset = 0
self.result_transcripts = ['']
self.word_time_stamp = []
self.time_stamp = []
self.first_char_occur_elapsed = None
self.word_time_stamp = None

def decode(self, is_finished=False):
if "deepspeech2online" in self.model_type:
Expand Down Expand Up @@ -513,6 +515,9 @@ def get_result(self):
else:
return ''

def get_word_time_stamp(self):
return self.word_time_stamp

@paddle.no_grad()
def rescoring(self):
if "deepspeech2online" in self.model_type or "deepspeech2offline" in self.model_type:
Expand Down Expand Up @@ -575,10 +580,48 @@ def rescoring(self):
best_index = i

# update the one best result
# hyps stored the beam results and each fields is:

logger.info(f"best index: {best_index}")
# logger.info(f'best result: {hyps[best_index]}')
# the field of the hyps is:
# hyps[0][0]: the sentence word-id in the vocab with a tuple
# hyps[0][1]: the sentence decoding probability with all paths
# hyps[0][2]: viterbi_blank ending probability
# hyps[0][3]: viterbi_non_blank probability
# hyps[0][4]: current_token_prob,
# hyps[0][5]: times_viterbi_blank,
# hyps[0][6]: times_titerbi_non_blank
self.hyps = [hyps[best_index][0]]

# update the hyps time stamp
self.time_stamp = hyps[best_index][5] if hyps[best_index][2] > hyps[
best_index][3] else hyps[best_index][6]
logger.info(f"time stamp: {self.time_stamp}")

self.update_result()

# update each word start and end time stamp
frame_shift_in_ms = self.model.encoder.embed.subsampling_rate * self.n_shift / self.sample_rate
logger.info(f"frame shift ms: {frame_shift_in_ms}")
word_time_stamp = []
for idx, _ in enumerate(self.time_stamp):
start = (self.time_stamp[idx - 1] + self.time_stamp[idx]
) / 2.0 if idx > 0 else 0
start = start * frame_shift_in_ms

end = (self.time_stamp[idx] + self.time_stamp[idx + 1]
) / 2.0 if idx < len(self.time_stamp) - 1 else self.offset
end = end * frame_shift_in_ms
word_time_stamp.append({
"w": self.result_transcripts[0][idx],
"bg": start,
"ed": end
})
# logger.info(f"{self.result_transcripts[0][idx]}, start: {start}, end: {end}")
self.word_time_stamp = word_time_stamp
logger.info(f"word time stamp: {self.word_time_stamp}")


class ASRServerExecutor(ASRExecutor):
def __init__(self):
Expand Down
90 changes: 74 additions & 16 deletions paddlespeech/server/engine/asr/online/ctc_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
from collections import defaultdict

import paddle
Expand All @@ -26,7 +27,7 @@ def __init__(self, config):
"""Implement the ctc prefix beam search

Args:
config (yacs.config.CfgNode): _description_
config (yacs.config.CfgNode): the ctc prefix beam search configuration
"""
self.config = config
self.reset()
Expand Down Expand Up @@ -54,51 +55,107 @@ def search(self, ctc_probs, device, blank_id=0):
assert len(ctc_probs.shape) == 2

# cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score))
# blank_ending_score and none_blank_ending_score in ln domain
# 0. blank_ending_score,
# 1. none_blank_ending_score,
# 2. viterbi_blank ending,
# 3. viterbi_non_blank,
# 4. current_token_prob,
# 5. times_viterbi_blank,
# 6. times_titerbi_non_blank
if self.cur_hyps is None:
self.cur_hyps = [(tuple(), (0.0, -float('inf')))]
self.cur_hyps = [(tuple(), (0.0, -float('inf'), 0.0, 0.0,
-float('inf'), [], []))]
# self.cur_hyps = [(tuple(), (0.0, -float('inf')))]
# 2. CTC beam search step by step
for t in range(0, maxlen):
logp = ctc_probs[t] # (vocab_size,)
# key: prefix, value (pb, pnb), default value(-inf, -inf)
next_hyps = defaultdict(lambda: (-float('inf'), -float('inf')))
# next_hyps = defaultdict(lambda: (-float('inf'), -float('inf')))
next_hyps = defaultdict(
lambda: (-float('inf'), -float('inf'), -float('inf'), -float('inf'), -float('inf'), [], []))

# 2.1 First beam prune: select topk best
# do token passing process
top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,)
for s in top_k_index:
s = s.item()
ps = logp[s].item()
for prefix, (pb, pnb) in self.cur_hyps:
for prefix, (pb, pnb, v_b_s, v_nb_s, cur_token_prob, times_s,
times_ns) in self.cur_hyps:
last = prefix[-1] if len(prefix) > 0 else None
if s == blank_id: # blank
n_pb, n_pnb = next_hyps[prefix]
n_pb, n_pnb, n_v_s, n_v_ns, n_cur_token_prob, n_times_s, n_times_ns = next_hyps[
prefix]
n_pb = log_add([n_pb, pb + ps, pnb + ps])
next_hyps[prefix] = (n_pb, n_pnb)

pre_times = times_s if v_b_s > v_nb_s else times_ns
n_times_s = copy.deepcopy(pre_times)
viterbi_score = v_b_s if v_b_s > v_nb_s else v_nb_s
n_v_s = viterbi_score + ps
next_hyps[prefix] = (n_pb, n_pnb, n_v_s, n_v_ns,
n_cur_token_prob, n_times_s,
n_times_ns)
elif s == last:
# Update *ss -> *s;
n_pb, n_pnb = next_hyps[prefix]
# case1: *a + a => *a
n_pb, n_pnb, n_v_s, n_v_ns, n_cur_token_prob, n_times_s, n_times_ns = next_hyps[
prefix]
n_pnb = log_add([n_pnb, pnb + ps])
next_hyps[prefix] = (n_pb, n_pnb)
if n_v_ns < v_nb_s + ps:
n_v_ns = v_nb_s + ps
if n_cur_token_prob < ps:
n_cur_token_prob = ps
n_times_ns = copy.deepcopy(times_ns)
n_times_ns[
-1] = self.abs_time_step # 注意,这里要重新使用绝对时间
next_hyps[prefix] = (n_pb, n_pnb, n_v_s, n_v_ns,
n_cur_token_prob, n_times_s,
n_times_ns)

# Update *s-s -> *ss, - is for blank
# Case 2: *aε + a => *aa
n_prefix = prefix + (s, )
n_pb, n_pnb = next_hyps[n_prefix]
n_pb, n_pnb, n_v_s, n_v_ns, n_cur_token_prob, n_times_s, n_times_ns = next_hyps[
n_prefix]
if n_v_ns < v_b_s + ps:
n_v_ns = v_b_s + ps
n_cur_token_prob = ps
n_times_ns = copy.deepcopy(times_s)
n_times_ns.append(self.abs_time_step)
n_pnb = log_add([n_pnb, pb + ps])
next_hyps[n_prefix] = (n_pb, n_pnb)
next_hyps[n_prefix] = (n_pb, n_pnb, n_v_s, n_v_ns,
n_cur_token_prob, n_times_s,
n_times_ns)
else:
# Case 3: *a + b => *ab, *aε + b => *ab
n_prefix = prefix + (s, )
n_pb, n_pnb = next_hyps[n_prefix]
n_pb, n_pnb, n_v_s, n_v_ns, n_cur_token_prob, n_times_s, n_times_ns = next_hyps[
n_prefix]
viterbi_score = v_b_s if v_b_s > v_nb_s else v_nb_s
pre_times = times_s if v_b_s > v_nb_s else times_ns
if n_v_ns < viterbi_score + ps:
n_v_ns = viterbi_score + ps
n_cur_token_prob = ps
n_times_ns = copy.deepcopy(pre_times)
n_times_ns.append(self.abs_time_step)

n_pnb = log_add([n_pnb, pb + ps, pnb + ps])
next_hyps[n_prefix] = (n_pb, n_pnb)
next_hyps[n_prefix] = (n_pb, n_pnb, n_v_s, n_v_ns,
n_cur_token_prob, n_times_s,
n_times_ns)

# 2.2 Second beam prune
next_hyps = sorted(
next_hyps.items(),
key=lambda x: log_add(list(x[1])),
key=lambda x: log_add([x[1][0], x[1][1]]),
reverse=True)
self.cur_hyps = next_hyps[:beam_size]

self.hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in self.cur_hyps]
# 2.3 update the absolute time step
self.abs_time_step += 1

self.hyps = [(y[0], log_add([y[1][0], y[1][1]]), y[1][2], y[1][3],
y[1][4], y[1][5], y[1][6]) for y in self.cur_hyps]

logger.info("ctc prefix search success")
return self.hyps

Expand All @@ -123,6 +180,7 @@ def reset(self):
"""
self.cur_hyps = None
self.hyps = None
self.abs_time_step = 0

def finalize_search(self):
"""do nothing in ctc_prefix_beam_search
Expand Down
4 changes: 3 additions & 1 deletion paddlespeech/server/ws/asr_socket.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,14 @@ async def websocket_endpoint(websocket: WebSocket):
connection_handler.decode(is_finished=True)
connection_handler.rescoring()
asr_results = connection_handler.get_result()
word_time_stamp = connection_handler.get_word_time_stamp()
connection_handler.reset()

resp = {
"status": "ok",
"signal": "finished",
'result': asr_results
'result': asr_results,
'times': word_time_stamp
}
await websocket.send_json(resp)
break
Expand Down