From 52f19df07db6d3d3ba2dec7de492b663f19bd4ce Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 5 May 2022 20:00:10 +0800 Subject: [PATCH 1/5] Begin to add web client for streaming recognition. --- .../ASR/transducer_emformer/client/index.html | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 egs/librispeech/ASR/transducer_emformer/client/index.html diff --git a/egs/librispeech/ASR/transducer_emformer/client/index.html b/egs/librispeech/ASR/transducer_emformer/client/index.html new file mode 100644 index 000000000..85a21df49 --- /dev/null +++ b/egs/librispeech/ASR/transducer_emformer/client/index.html @@ -0,0 +1,36 @@ + + + + + + + + + + + Hello next-gen Kaldi + + + + +

Hello next-gen Kaldi

+ + + + + + diff --git a/egs/librispeech/ASR/transducer_emformer/client/main.js b/egs/librispeech/ASR/transducer_emformer/client/main.js new file mode 100644 index 000000000..a25eb5330 --- /dev/null +++ b/egs/librispeech/ASR/transducer_emformer/client/main.js @@ -0,0 +1,60 @@ +/** +References +https://developer.mozilla.org/en-US/docs/Web/API/FileList +https://developer.mozilla.org/en-US/docs/Web/API/FileReader +https://javascript.info/arraybuffer-binary-arrays +https://developer.mozilla.org/zh-CN/docs/Web/API/WebSocket +https://developer.mozilla.org/en-US/docs/Web/API/WebSocket/send +*/ + +var socket; +function initWebSocket() { + socket = new WebSocket("ws://localhost:6008/"); + + // Connection opened + socket.addEventListener( + 'open', + function(event) { document.getElementById('file').disabled = false; }); + + // Connection closed + socket.addEventListener('close', function(event) { + document.getElementById('file').disabled = true; + initWebSocket(); + }); + + // Listen for messages + socket.addEventListener('message', function(event) { + document.getElementById('results').innerHTML = event.data; + console.log('Received message: ', event.data); + }); +} + +function onFileChange() { + var files = document.getElementById("file").files; + + if (files.length == 0) { + console.log('No file selected'); + return; + } + + console.log('files: ' + files); + + const file = files[0]; + console.log(file); + console.log('file.name ' + file.name); + console.log('file.type ' + file.type); + console.log('file.size ' + file.size); + + let reader = new FileReader(); + reader.onload = function() { + let view = new Int16Array(reader.result); + console.log('bytes: ' + view.byteLength); + // we assume the input file is a wav file. + // TODO: add some checks here. + let body = view.subarray(44); + socket.send(body); + socket.send(JSON.stringify({'eof' : 1})); + }; + + reader.readAsArrayBuffer(file); +} diff --git a/egs/librispeech/ASR/transducer_emformer/server.py b/egs/librispeech/ASR/transducer_emformer/server.py new file mode 100755 index 000000000..35f66f60f --- /dev/null +++ b/egs/librispeech/ASR/transducer_emformer/server.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 +import asyncio +import logging +from pathlib import Path + +import sentencepiece as spm +import torch +import websockets +from streaming_decode import StreamList, get_parser, process_features +from train import get_params, get_transducer_model + +from icefall.checkpoint import ( + average_checkpoints, + find_checkpoints, + load_checkpoint, +) +from icefall.utils import setup_logger + +g_params = None +g_model = None +g_sp = None + + +def build_stream_list(): + batch_size = 1 # will change it later + + stream_list = StreamList( + batch_size=batch_size, + context_size=g_params.context_size, + decoding_method=g_params.decoding_method, + ) + return stream_list + + +async def echo(websocket): + logging.info(f"connected: {websocket.remote_address}") + + stream_list = build_stream_list() + + # number of frames before subsampling + segment_length = g_model.encoder.segment_length + + right_context_length = g_model.encoder.right_context_length + + # We add 3 here since the subsampling method is using + # ((len - 1) // 2 - 1) // 2) + chunk_length = (segment_length + 3) + right_context_length + + async for message in websocket: + if isinstance(message, bytes): + samples = torch.frombuffer(message, dtype=torch.int16) + samples = samples.to(torch.float32) / 32768 + stream_list.accept_waveform( + audio_samples=[samples], + 
sampling_rate=g_params.sampling_rate, + ) + + while True: + features, active_streams = stream_list.build_batch( + chunk_length=chunk_length, + segment_length=segment_length, + ) + + if features is not None: + process_features( + model=g_model, + features=features, + streams=active_streams, + params=g_params, + sp=g_sp, + ) + results = [] + for stream in stream_list.streams: + text = g_sp.decode(stream.decoding_result()) + results.append(text) + await websocket.send(results[0]) + else: + break + elif isinstance(message, str): + stream_list[0].input_finished() + while True: + features, active_streams = stream_list.build_batch( + chunk_length=chunk_length, + segment_length=segment_length, + ) + + if features is not None: + process_features( + model=g_model, + features=features, + streams=active_streams, + params=g_params, + sp=g_sp, + ) + else: + break + + results = [] + for stream in stream_list.streams: + text = g_sp.decode(stream.decoding_result()) + results.append(text) + + await websocket.send(results[0]) + await websocket.close() + + logging.info(f"Closed: {websocket.remote_address}") + + +async def loop(): + logging.info("started") + async with websockets.serve(echo, "", 6008): + await asyncio.Future() # run forever + + +def main(): + parser = get_parser() + args = parser.parse_args() + args.exp_dir = Path(args.exp_dir) + + params = get_params() + params.update(vars(args)) + + # Note: params.decoding_method is currently not used. + params.res_dir = params.exp_dir / "streaming" / params.decoding_method + + setup_logger(f"{params.res_dir}/log-streaming-decode") + logging.info("Decoding started") + + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda", 0) + + sp = spm.SentencePieceProcessor() + sp.load(params.bpe_model) + + # and are defined in local/train_bpe_model.py + params.blank_id = sp.piece_to_id("") + params.unk_id = sp.piece_to_id("") + params.vocab_size = sp.get_piece_size() + + params.device = device + + logging.info(params) + + logging.info("About to create model") + model = get_transducer_model(params) + + if params.avg_last_n > 0: + filenames = find_checkpoints(params.exp_dir)[: params.avg_last_n] + logging.info(f"averaging {filenames}") + model.to(device) + model.load_state_dict(average_checkpoints(filenames, device=device)) + elif params.avg == 1: + load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model) + else: + start = params.epoch - params.avg + 1 + filenames = [] + for i in range(start, params.epoch + 1): + if start >= 0: + filenames.append(f"{params.exp_dir}/epoch-{i}.pt") + logging.info(f"averaging {filenames}") + model.to(device) + model.load_state_dict(average_checkpoints(filenames, device=device)) + + model.to(device) + model.eval() + model.device = device + + num_param = sum([p.numel() for p in model.parameters()]) + logging.info(f"Number of model parameters: {num_param}") + + global g_params, g_model, g_sp + g_params = params + g_model = model + g_sp = sp + + asyncio.run(loop()) + + +if __name__ == "__main__": + torch.manual_seed(20220506) + main() diff --git a/egs/librispeech/ASR/transducer_emformer/streaming_decode.py b/egs/librispeech/ASR/transducer_emformer/streaming_decode.py index 8ebfbb210..2064bd344 100755 --- a/egs/librispeech/ASR/transducer_emformer/streaming_decode.py +++ b/egs/librispeech/ASR/transducer_emformer/streaming_decode.py @@ -233,6 +233,9 @@ class StreamList(object): for _ in range(batch_size) ] + def __getitem__(self, i) -> FeatureExtractionStream: + return self.streams[i] + @property 
def done(self) -> bool: """Return True if all streams have reached end of utterance. @@ -667,8 +670,9 @@ def main(): sp = spm.SentencePieceProcessor() sp.load(params.bpe_model) - # is defined in local/train_bpe_model.py + # and are defined in local/train_bpe_model.py params.blank_id = sp.piece_to_id("") + params.unk_id = sp.piece_to_id("") params.vocab_size = sp.get_piece_size() params.device = device diff --git a/egs/librispeech/ASR/transducer_emformer/train.py b/egs/librispeech/ASR/transducer_emformer/train.py index 9798fe5e6..dae30f91b 100755 --- a/egs/librispeech/ASR/transducer_emformer/train.py +++ b/egs/librispeech/ASR/transducer_emformer/train.py @@ -378,6 +378,7 @@ def get_decoder_model(params: AttributeDict) -> nn.Module: vocab_size=params.vocab_size, embedding_dim=params.embedding_dim, blank_id=params.blank_id, + unk_id=params.unk_id, context_size=params.context_size, ) return decoder @@ -811,8 +812,9 @@ def run(rank, world_size, args): sp = spm.SentencePieceProcessor() sp.load(params.bpe_model) - # is defined in local/train_bpe_model.py + # and are defined in local/train_bpe_model.py params.blank_id = sp.piece_to_id("") + params.unk_id = sp.piece_to_id("") params.vocab_size = sp.get_piece_size() logging.info(params) From 9b5c18438d57b538f37fe639d2d2298950de644d Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 6 May 2022 21:43:36 +0800 Subject: [PATCH 3/5] Minor fixes. --- .../ASR/transducer_emformer/client/index.html | 41 ++++++++----- .../client/nav-partial.html | 20 +++++++ .../transducer_emformer/client/upload.html | 58 +++++++++++++++++++ .../client/{main.js => upload.js} | 2 +- 4 files changed, 105 insertions(+), 16 deletions(-) create mode 100644 egs/librispeech/ASR/transducer_emformer/client/nav-partial.html create mode 100644 egs/librispeech/ASR/transducer_emformer/client/upload.html rename egs/librispeech/ASR/transducer_emformer/client/{main.js => upload.js} (97%) diff --git a/egs/librispeech/ASR/transducer_emformer/client/index.html b/egs/librispeech/ASR/transducer_emformer/client/index.html index 5b6baa001..7e6ce8f45 100644 --- a/egs/librispeech/ASR/transducer_emformer/client/index.html +++ b/egs/librispeech/ASR/transducer_emformer/client/index.html @@ -12,29 +12,41 @@ crossorigin="anonymous"> + + Hello next-gen Kaldi + + + -

Hello next-gen Kaldi

-
- - -
+
    +
  • Upload: Recognition from a selected file
  • Record: Recognition from real-time recording
  • +
+ + Code is available at + https://github.com/k2-fsa/icefall/tree/streaming/egs/librispeech/ASR/transducer_emformer - - - diff --git a/egs/librispeech/ASR/transducer_emformer/client/nav-partial.html b/egs/librispeech/ASR/transducer_emformer/client/nav-partial.html new file mode 100644 index 000000000..c9e3aff96 --- /dev/null +++ b/egs/librispeech/ASR/transducer_emformer/client/nav-partial.html @@ -0,0 +1,20 @@ + diff --git a/egs/librispeech/ASR/transducer_emformer/client/upload.html b/egs/librispeech/ASR/transducer_emformer/client/upload.html new file mode 100644 index 000000000..b9d7e267b --- /dev/null +++ b/egs/librispeech/ASR/transducer_emformer/client/upload.html @@ -0,0 +1,58 @@ + + + + + + + + + + + + + + Hello next-gen Kaldi (Upload file for recognition) + + + + + + + +

Recognition from a selected file

+
+
+ + +
+ +
+ + +
+
+ + + + + + + + + + + + diff --git a/egs/librispeech/ASR/transducer_emformer/client/main.js b/egs/librispeech/ASR/transducer_emformer/client/upload.js similarity index 97% rename from egs/librispeech/ASR/transducer_emformer/client/main.js rename to egs/librispeech/ASR/transducer_emformer/client/upload.js index a25eb5330..a2b0f8644 100644 --- a/egs/librispeech/ASR/transducer_emformer/client/main.js +++ b/egs/librispeech/ASR/transducer_emformer/client/upload.js @@ -47,7 +47,7 @@ function onFileChange() { let reader = new FileReader(); reader.onload = function() { - let view = new Int16Array(reader.result); + let view = new Uint8Array(reader.result); console.log('bytes: ' + view.byteLength); // we assume the input file is a wav file. // TODO: add some checks here. From 30b262617233936e92a1b4240a96e9491154a9e7 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 6 May 2022 23:34:40 +0800 Subject: [PATCH 4/5] Begin to add recorder. --- .../ASR/transducer_emformer/client/index.html | 4 +- .../client/nav-partial.html | 2 + .../transducer_emformer/client/record.html | 64 ++++++ .../ASR/transducer_emformer/client/record.js | 189 ++++++++++++++++++ .../transducer_emformer/client/upload.html | 2 +- 5 files changed, 258 insertions(+), 3 deletions(-) create mode 100644 egs/librispeech/ASR/transducer_emformer/client/record.html create mode 100644 egs/librispeech/ASR/transducer_emformer/client/record.js diff --git a/egs/librispeech/ASR/transducer_emformer/client/index.html b/egs/librispeech/ASR/transducer_emformer/client/index.html index 7e6ce8f45..d0fec4fc1 100644 --- a/egs/librispeech/ASR/transducer_emformer/client/index.html +++ b/egs/librispeech/ASR/transducer_emformer/client/index.html @@ -14,7 +14,7 @@ - Hello next-gen Kaldi + Next-gen Kaldi demo @@ -37,7 +37,7 @@
  • Record
-   Recognition from real-time recording
+   Recognition from real-time recordings

  • diff --git a/egs/librispeech/ASR/transducer_emformer/client/nav-partial.html b/egs/librispeech/ASR/transducer_emformer/client/nav-partial.html index c9e3aff96..513c1511f 100644 --- a/egs/librispeech/ASR/transducer_emformer/client/nav-partial.html +++ b/egs/librispeech/ASR/transducer_emformer/client/nav-partial.html @@ -14,7 +14,9 @@ + diff --git a/egs/librispeech/ASR/transducer_emformer/client/record.html b/egs/librispeech/ASR/transducer_emformer/client/record.html new file mode 100644 index 000000000..ae4d82036 --- /dev/null +++ b/egs/librispeech/ASR/transducer_emformer/client/record.html @@ -0,0 +1,64 @@ + + + + + + + + + + + + + + Next-gen Kaldi demo (Upload file for recognition) + + + + + + + +

    Recognition from real-time recordings

    +
    +
    +
    + +
    +
    +
    +
    + +
    +
    + +
    +
    +
    + +
    +
    + + + + + + + + + + + diff --git a/egs/librispeech/ASR/transducer_emformer/client/record.js b/egs/librispeech/ASR/transducer_emformer/client/record.js new file mode 100644 index 000000000..20de00be5 --- /dev/null +++ b/egs/librispeech/ASR/transducer_emformer/client/record.js @@ -0,0 +1,189 @@ +// see https://mdn.github.io/web-dictaphone/scripts/app.js +// and https://gist.github.com/meziantou/edb7217fddfbb70e899e + +const record = document.getElementById('record'); +const stop = document.getElementById('stop'); +const soundClips = document.getElementById('sound-clips'); +const canvas = document.getElementById('canvas'); + +soundClips.innerHTML = "hello"; + +stop.disabled = true; + +let audioCtx; +const canvasCtx = canvas.getContext("2d"); + +let sampleRate; + +if (navigator.mediaDevices.getUserMedia) { + console.log('getUserMedia supported.'); + + // see https://w3c.github.io/mediacapture-main/#dom-mediadevices-getusermedia + const constraints = { + // does not work + // audio : {sampleRate : 16000, sampleSize : 16, channelCount : 1} + audio : true, + }; + let chunks = []; + + let onSuccess = function(stream) { + var settings = stream.getAudioTracks()[0].getSettings(); + sampleRate = settings.sampleRate; + console.log(settings); + console.log('sample rate ' + settings.sampleRate); + console.log('channel count ' + settings.channelCount); + console.log('sample size ' + settings.sampleSize); + const mediaRecorder = new MediaRecorder(stream); + console.log('mime type ' + mediaRecorder.mimeType); + console.log('audio bits per second ' + mediaRecorder.audioBitsPerSecond); + console.log(mediaRecorder) + + visualize(stream); + + record.onclick = function() { + mediaRecorder.start(10); // 10ms period to send data + console.log(mediaRecorder.state); + console.log("recorder started"); + record.style.background = "red"; + + stop.disabled = false; + record.disabled = true; + }; + + stop.onclick = function() { + mediaRecorder.stop(); + console.log(mediaRecorder.state); + console.log("recorder stopped"); + record.style.background = ""; + record.style.color = ""; + // mediaRecorder.requestData(); + + stop.disabled = true; + record.disabled = false; + }; + + mediaRecorder.onstop = function(e) { + console.log("data available after MediaRecorder.stop() called."); + + const clipName = + prompt('Enter a name for your sound clip?', 'My unnamed clip'); + + const clipContainer = document.createElement('article'); + const clipLabel = document.createElement('p'); + const audio = document.createElement('audio'); + const deleteButton = document.createElement('button'); + + clipContainer.classList.add('clip'); + audio.setAttribute('controls', ''); + deleteButton.textContent = 'Delete'; + deleteButton.className = 'delete'; + + if (clipName === null) { + clipLabel.textContent = 'My unnamed clip'; + } else { + clipLabel.textContent = clipName; + } + + clipContainer.appendChild(audio); + clipContainer.appendChild(clipLabel); + clipContainer.appendChild(deleteButton); + soundClips.appendChild(clipContainer); + + audio.controls = true; + const blob = new Blob(chunks, {'type' : 'audio/ogg; codecs=opus'}); + chunks = []; + const audioURL = window.URL.createObjectURL(blob); + audio.src = audioURL; + console.log("recorder stopped"); + + deleteButton.onclick = + function(e) { + let evtTgt = e.target; + evtTgt.parentNode.parentNode.removeChild(evtTgt.parentNode); + } + + clipLabel.onclick = function() { + const existingName = clipLabel.textContent; + const newClipName = prompt('Enter a new name for your sound clip?'); + if 
(newClipName === null) { + clipLabel.textContent = existingName; + } else { + clipLabel.textContent = newClipName; + } + } + }; + + mediaRecorder.ondataavailable = function(e) { + console.log('size ' + e.data.size); + console.log(e.data); + chunks.push(e.data); + } + }; + + let onError = function( + err) { console.log('The following error occured: ' + err); }; + + navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError); + +} else { + console.log('getUserMedia not supported on your browser!'); +} + +function visualize(stream) { + if (!audioCtx) { + audioCtx = new AudioContext(); + } + + const source = audioCtx.createMediaStreamSource(stream); + + const analyser = audioCtx.createAnalyser(); + analyser.fftSize = 2048; + const bufferLength = analyser.frequencyBinCount; + const dataArray = new Uint8Array(bufferLength); + + source.connect(analyser); + // analyser.connect(audioCtx.destination); + + draw() + + function draw() { + const WIDTH = canvas.width + const HEIGHT = canvas.height; + + requestAnimationFrame(draw); + + analyser.getByteTimeDomainData(dataArray); + + canvasCtx.fillStyle = 'rgb(200, 200, 200)'; + canvasCtx.fillRect(0, 0, WIDTH, HEIGHT); + + canvasCtx.lineWidth = 2; + canvasCtx.strokeStyle = 'rgb(0, 0, 0)'; + + canvasCtx.beginPath(); + + let sliceWidth = WIDTH * 1.0 / bufferLength; + let x = 0; + + for (let i = 0; i < bufferLength; i++) { + + let v = dataArray[i] / 128.0; + let y = v * HEIGHT / 2; + + if (i === 0) { + canvasCtx.moveTo(x, y); + } else { + canvasCtx.lineTo(x, y); + } + + x += sliceWidth; + } + + canvasCtx.lineTo(canvas.width, canvas.height / 2); + canvasCtx.stroke(); + } +} + +window.onresize = function() { canvas.width = mainSection.offsetWidth; }; + +window.onresize(); diff --git a/egs/librispeech/ASR/transducer_emformer/client/upload.html b/egs/librispeech/ASR/transducer_emformer/client/upload.html index b9d7e267b..a2ca1ce18 100644 --- a/egs/librispeech/ASR/transducer_emformer/client/upload.html +++ b/egs/librispeech/ASR/transducer_emformer/client/upload.html @@ -14,7 +14,7 @@ - Hello next-gen Kaldi (Upload file for recognition) + Next-gen Kaldi demo (Upload file for recognition) From 298b52ffa3180ef653be48acec73b5c0e007e661 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 7 May 2022 19:53:42 +0800 Subject: [PATCH 5/5] Support recognition from real-time recordings. --- .../transducer_emformer/client/record.html | 9 +- .../ASR/transducer_emformer/client/record.js | 258 ++++++++++++++---- .../transducer_emformer/client/upload.html | 2 +- 3 files changed, 210 insertions(+), 59 deletions(-) diff --git a/egs/librispeech/ASR/transducer_emformer/client/record.html b/egs/librispeech/ASR/transducer_emformer/client/record.html index ae4d82036..4a06e0ec9 100644 --- a/egs/librispeech/ASR/transducer_emformer/client/record.html +++ b/egs/librispeech/ASR/transducer_emformer/client/record.html @@ -18,7 +18,7 @@ - +
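
A quick way to exercise server.py without a browser is a small command-line client that speaks the same protocol as upload.js: it strips the 44-byte WAV header, sends the raw 16-bit samples to ws://localhost:6008 as a binary message, sends {"eof": 1} as a text message, and prints the partial and final results the server pushes back. The sketch below is an illustration only, not part of the patches above: it assumes a 16-bit mono WAV file whose sample rate matches the sampling rate the server is configured with, it reuses the websockets package that server.py already imports, and the script name (test_client.py) and input file name are placeholders.

#!/usr/bin/env python3
# test_client.py -- a minimal sketch of a command-line client for server.py.
# Assumption: the input is a 16-bit mono WAV file; the 44-byte RIFF header is
# skipped without any checks, mirroring what upload.js does.
import asyncio
import json
import sys

import websockets


async def run(wav_path: str):
    with open(wav_path, "rb") as f:
        wav_bytes = f.read()

    async with websockets.connect("ws://localhost:6008/") as ws:
        # Binary frame: raw int16 samples with the WAV header stripped.
        await ws.send(wav_bytes[44:])

        # Text frame: tell the server that the input is finished.
        await ws.send(json.dumps({"eof": 1}))

        # The server sends partial results while decoding and a final result
        # just before it closes the connection; iteration stops on close.
        async for message in ws:
            print(message)


if __name__ == "__main__":
    asyncio.run(run(sys.argv[1]))

Usage would look like "python3 test_client.py foo.wav", where foo.wav stands in for any local recording that meets the assumptions above.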