-
Notifications
You must be signed in to change notification settings - Fork 370
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Python ASR examples with alsa (#646)
- Loading branch information
1 parent
e9e8d75
commit d3287f9
Showing
12 changed files
with
326 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
207 changes: 207 additions & 0 deletions
207
python-api-examples/speech-recognition-from-microphone-with-endpoint-detection-alsa.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,207 @@ | ||
#!/usr/bin/env python3 | ||
|
||
# Real-time speech recognition from a microphone with sherpa-onnx Python API | ||
# with endpoint detection. | ||
# | ||
# Note: This script uses ALSA and works only on Linux systems, especially | ||
# for embedded Linux systems and for running Linux on Windows using WSL. | ||
# | ||
# Please refer to | ||
# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html | ||
# to download pre-trained models | ||
|
||
import argparse | ||
import sys | ||
from pathlib import Path | ||
import sherpa_onnx | ||
|
||
|
||
def assert_file_exists(filename: str):
    """Verify that ``filename`` refers to an existing regular file.

    Args:
      filename: Path to a file that must exist (e.g. a model or tokens file).

    Raises:
      FileNotFoundError: If ``filename`` is not an existing regular file. The
        message points to the pre-trained model download page.
    """
    if Path(filename).is_file():
        return

    # Raise explicitly instead of using ``assert``: assertions are stripped
    # when Python runs with -O, which would silently disable this check.
    raise FileNotFoundError(
        f"{filename} does not exist!\n"
        "Please refer to "
        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
    )
|
||
|
||
def get_args():
    """Parse the command-line arguments of this script.

    The options ``--tokens``, ``--encoder``, ``--decoder``, ``--joiner`` and
    ``--device-name`` are required; the remaining options have defaults.

    Returns:
      The ``argparse.Namespace`` produced by parsing ``sys.argv``.

    Raises:
      SystemExit: Raised by argparse if a required option is missing or an
        option value is rejected (e.g. an unknown ``--decoding-method``).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--tokens",
        type=str,
        required=True,
        help="Path to tokens.txt",
    )

    parser.add_argument(
        "--encoder",
        type=str,
        required=True,
        help="Path to the encoder model",
    )

    parser.add_argument(
        "--decoder",
        type=str,
        required=True,
        help="Path to the decoder model",
    )

    parser.add_argument(
        "--joiner",
        type=str,
        required=True,
        help="Path to the joiner model",
    )

    # Restrict to the two documented values so that a typo is reported at
    # parse time instead of after the models have been located and loaded.
    parser.add_argument(
        "--decoding-method",
        type=str,
        default="greedy_search",
        choices=["greedy_search", "modified_beam_search"],
        help="Valid values are greedy_search and modified_beam_search",
    )

    parser.add_argument(
        "--provider",
        type=str,
        default="cpu",
        help="Valid values: cpu, cuda, coreml",
    )

    parser.add_argument(
        "--hotwords-file",
        type=str,
        default="",
        help="""
        The file containing hotwords, one words/phrases per line, and for each
        phrase the bpe/cjkchar are separated by a space. For example:
        ▁HE LL O ▁WORLD
        你 好 世 界
        """,
    )

    parser.add_argument(
        "--hotwords-score",
        type=float,
        default=1.5,
        help="""
        The hotword score of each token for biasing word/phrase. Used only if
        --hotwords-file is given.
        """,
    )

    parser.add_argument(
        "--blank-penalty",
        type=float,
        default=0.0,
        help="""
        The penalty applied on blank symbol during decoding.
        Note: It is a positive value that would be applied to logits like
        this `logits[:, 0] -= blank_penalty` (suppose logits.shape is
        [batch_size, vocab] and blank id is 0).
        """,
    )

    parser.add_argument(
        "--device-name",
        type=str,
        required=True,
        help="""
        The device name specifies which microphone to use in case there are several
        on your system. You can use
        arecord -l
        to find all available microphones on your computer. For instance, if it outputs
        **** List of CAPTURE Hardware Devices ****
        card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
        Subdevices: 1/1
        Subdevice #0: subdevice #0
        and if you want to select card 3 and the device 0 on that card, please use:
        plughw:3,0
        as the device_name.
        """,
    )

    return parser.parse_args()
|
||
|
||
def create_recognizer(args):
    """Create a streaming transducer recognizer from the parsed arguments.

    Args:
      args: The namespace returned by ``get_args()``. The model paths, the
        decoding method, the provider, the hotwords settings and the blank
        penalty are read from it.

    Returns:
      The object returned by ``sherpa_onnx.OnlineRecognizer.from_transducer``,
      configured with endpoint detection enabled.
    """
    # Check every input file up front, in the order encoder, decoder,
    # joiner, tokens, so a missing file is reported before any model loads.
    for required_file in (args.encoder, args.decoder, args.joiner, args.tokens):
        assert_file_exists(required_file)

    # Please replace the model files if needed.
    # See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
    # for download links.
    options = dict(
        tokens=args.tokens,
        encoder=args.encoder,
        decoder=args.decoder,
        joiner=args.joiner,
        num_threads=1,
        sample_rate=16000,
        feature_dim=80,
        enable_endpoint_detection=True,
        rule1_min_trailing_silence=2.4,
        rule2_min_trailing_silence=1.2,
        rule3_min_utterance_length=300,  # it essentially disables this rule
        decoding_method=args.decoding_method,
        provider=args.provider,
        hotwords_file=args.hotwords_file,
        hotwords_score=args.hotwords_score,
        blank_penalty=args.blank_penalty,
    )
    return sherpa_onnx.OnlineRecognizer.from_transducer(**options)
|
||
|
||
def main():
    """Recognize speech from an ALSA microphone and print it segment by segment.

    Opens the capture device named by ``--device-name``, builds the
    recognizer, and then loops forever: each iteration reads 100 ms of audio,
    feeds it to the stream, decodes what is ready, and prints the current
    text. Partial text is rewritten in place on one console line (using
    ``\\r``); when an endpoint is detected the line is finalized, the segment
    counter advances, and the stream is reset.

    The loop has no exit condition of its own; it ends when an exception
    (for example ``KeyboardInterrupt``) propagates out of it.
    """
    args = get_args()
    device_name = args.device_name
    print(f"device_name: {device_name}")
    alsa = sherpa_onnx.Alsa(device_name)

    print("Creating recognizer")
    recognizer = create_recognizer(args)
    print("Started! Please speak")

    sample_rate = 16000
    samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms

    stream = recognizer.create_stream()

    last_result = ""
    segment_id = 0
    while True:
        samples = alsa.read(samples_per_read)  # a blocking read
        stream.accept_waveform(sample_rate, samples)
        while recognizer.is_ready(stream):
            recognizer.decode_stream(stream)

        is_endpoint = recognizer.is_endpoint(stream)
        result = recognizer.get_result(stream)

        # Redraw the line only when the text actually changed.
        if result and (last_result != result):
            last_result = result
            print(f"\r{segment_id}:{result}", end="", flush=True)

        if is_endpoint:
            if result:
                # Finalize the segment: terminate the line with a newline.
                print(f"\r{segment_id}:{result}", flush=True)
                segment_id += 1
            recognizer.reset(stream)
            # Forget the previous text so that a new segment which starts
            # with the same text as the last one is still displayed live.
            last_result = ""
|
||
|
||
# Script entry point: run the recognition loop until it is interrupted. | ||
if __name__ == "__main__": | ||
try: | ||
main() | ||
except KeyboardInterrupt: | ||
# Ctrl + C is the expected way to stop; print a notice instead of a traceback. | ||
print("\nCaught Ctrl + C. Exiting") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
// sherpa-onnx/python/csrc/alsa.cc | ||
// | ||
// Copyright (c) 2024 Xiaomi Corporation | ||
|
||
#include "sherpa-onnx/python/csrc/alsa.h" | ||
|
||
#include <vector> | ||
|
||
#include "sherpa-onnx/csrc/alsa.h" | ||
|
||
namespace sherpa_onnx { | ||
|
||
// Registers the Python class `Alsa` on module `m`.
//
// Exposed members:
//  - __init__(device_name): constructs an Alsa object from a device name.
//  - read(num_samples): returns the samples from Alsa::Read() as a list of
//    floats.
//  - expected_sample_rate / actual_sample_rate: read-only properties.
//
// The constructor and read() run with the GIL released.
void PybindAlsa(py::module *m) {
  using AlsaType = Alsa;
  using ReleaseGil = py::call_guard<py::gil_scoped_release>;

  py::class_<AlsaType> binding(*m, "Alsa");

  binding.def(py::init<const char *>(), py::arg("device_name"), ReleaseGil());

  // The lambda fixes the return type to a std::vector<float> value.
  binding.def(
      "read",
      [](AlsaType &self, int32_t num_samples) -> std::vector<float> {
        return self.Read(num_samples);
      },
      py::arg("num_samples"), ReleaseGil());

  binding.def_property_readonly("expected_sample_rate",
                                &AlsaType::GetExpectedSampleRate);
  binding.def_property_readonly("actual_sample_rate",
                                &AlsaType::GetActualSampleRate);
}
|
||
} // namespace sherpa_onnx |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
// sherpa-onnx/python/csrc/alsa.h | ||
// | ||
// Copyright (c) 2024 Xiaomi Corporation | ||
|
||
#ifndef SHERPA_ONNX_PYTHON_CSRC_ALSA_H_ | ||
#define SHERPA_ONNX_PYTHON_CSRC_ALSA_H_ | ||
|
||
#include "sherpa-onnx/python/csrc/sherpa-onnx.h" | ||
|
||
namespace sherpa_onnx { | ||
|
||
// Registers a Python class named `Alsa` on the module pointed to by `m`. | ||
void PybindAlsa(py::module *m); | ||
|
||
} // namespace sherpa_onnx | ||
|
||
#endif // SHERPA_ONNX_PYTHON_CSRC_ALSA_H_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
// sherpa-onnx/python/csrc/faked-alsa.cc | ||
// | ||
// Copyright (c) 2024 Xiaomi Corporation | ||
|
||
#include "sherpa-onnx/csrc/macros.h" | ||
#include "sherpa-onnx/python/csrc/alsa.h" | ||
|
||
namespace sherpa_onnx { | ||
|
||
class FakedAlsa { | ||
public: | ||
explicit FakedAlsa(const char *) { | ||
SHERPA_ONNX_LOGE("This function is for Linux only."); | ||
#if (SHERPA_ONNX_ENABLE_ALSA == 0) && (defined(__unix__) || defined(__unix)) | ||
SHERPA_ONNX_LOGE(R"doc( | ||
sherpa-onnx is compiled without alsa support. To enable that, please run | ||
(1) sudo apt-get install alsa-utils libasound2-dev | ||
(2) rebuild sherpa-onnx | ||
)doc"); | ||
#endif | ||
exit(-1); | ||
} | ||
|
||
std::vector<float> Read(int32_t) const { return {}; } | ||
int32_t GetExpectedSampleRate() const { return -1; } | ||
int32_t GetActualSampleRate() const { return -1; } | ||
}; | ||
|
||
void PybindAlsa(py::module *m) { | ||
using PyClass = FakedAlsa; | ||
py::class_<PyClass>(*m, "Alsa") | ||
.def(py::init<const char *>(), py::arg("device_name")) | ||
.def( | ||
"read", | ||
[](PyClass &self, int32_t num_samples) -> std::vector<float> { | ||
return self.Read(num_samples); | ||
}, | ||
py::arg("num_samples"), py::call_guard<py::gil_scoped_release>()) | ||
.def_property_readonly("expected_sample_rate", | ||
&PyClass::GetExpectedSampleRate) | ||
.def_property_readonly("actual_sample_rate", | ||
&PyClass::GetActualSampleRate); | ||
} | ||
|
||
} // namespace sherpa_onnx | ||
|
||
#endif // SHERPA_ONNX_PYTHON_CSRC_FAKED_ALSA_H_ |
Oops, something went wrong.