From 64e07118cecce057405ea8b8bb1bd97bcc9caa0c Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 30 Jun 2023 17:48:31 +0800 Subject: [PATCH 1/3] Real-time speech recognition from microphone for .Net --- dotnet-examples/sherpa-onnx.sln | 6 + .../Program.cs | 237 ++++++++++++++++++ .../speech-recognition-from-microphone/run.sh | 22 ++ .../speech-recognition-from-microphone.csproj | 19 ++ 4 files changed, 284 insertions(+) create mode 100644 dotnet-examples/speech-recognition-from-microphone/Program.cs create mode 100755 dotnet-examples/speech-recognition-from-microphone/run.sh create mode 100644 dotnet-examples/speech-recognition-from-microphone/speech-recognition-from-microphone.csproj diff --git a/dotnet-examples/sherpa-onnx.sln b/dotnet-examples/sherpa-onnx.sln index 66dac3c10..1f1615bb1 100644 --- a/dotnet-examples/sherpa-onnx.sln +++ b/dotnet-examples/sherpa-onnx.sln @@ -7,6 +7,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "online-decode-files", "onli EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-decode-files", "offline-decode-files\offline-decode-files.csproj", "{2DAB152C-9E24-47A0-9DB0-781297ECE458}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "speech-recognition-from-microphone", "speech-recognition-from-microphone\speech-recognition-from-microphone.csproj", "{FE4EA1FF-062A-46B3-B78D-C828FED7B82E}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -24,5 +26,9 @@ Global {2DAB152C-9E24-47A0-9DB0-781297ECE458}.Debug|Any CPU.Build.0 = Debug|Any CPU {2DAB152C-9E24-47A0-9DB0-781297ECE458}.Release|Any CPU.ActiveCfg = Release|Any CPU {2DAB152C-9E24-47A0-9DB0-781297ECE458}.Release|Any CPU.Build.0 = Release|Any CPU + {FE4EA1FF-062A-46B3-B78D-C828FED7B82E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {FE4EA1FF-062A-46B3-B78D-C828FED7B82E}.Debug|Any CPU.Build.0 = Debug|Any CPU + {FE4EA1FF-062A-46B3-B78D-C828FED7B82E}.Release|Any CPU.ActiveCfg = Release|Any CPU + {FE4EA1FF-062A-46B3-B78D-C828FED7B82E}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection EndGlobal diff --git a/dotnet-examples/speech-recognition-from-microphone/Program.cs b/dotnet-examples/speech-recognition-from-microphone/Program.cs new file mode 100644 index 000000000..c223d9a3b --- /dev/null +++ b/dotnet-examples/speech-recognition-from-microphone/Program.cs @@ -0,0 +1,237 @@ +// Copyright (c) 2023 Xiaomi Corporation +// Copyright (c) 2023 by manyeyes +// +// This file shows how to use a streaming model to decode files +// Please refer to +// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html +// to download streaming models + +using CommandLine.Text; +using CommandLine; +using PortAudioSharp; +using System.Threading; +using SherpaOnnx; +using System.Collections.Generic; +using System.Runtime.InteropServices; +using System; + + +class OnlineDecodeFiles +{ + class Options + { + [Option(Required = true, HelpText = "Path to tokens.txt")] + public string Tokens { get; set; } + + [Option(Required = false, Default = "cpu", HelpText = "Provider, e.g., cpu, coreml")] + public string Provider { get; set; } + + [Option(Required = true, HelpText = "Path to encoder.onnx")] + public string Encoder { get; set; } + + [Option(Required = true, HelpText = "Path to decoder.onnx")] + public string Decoder { get; set; } + + [Option(Required = true, HelpText = "Path to joiner.onnx")] + public string Joiner { get; set; } + + [Option("num-threads", Required = false, 
+            Default = 1, HelpText = "Number of threads for computation")]
+        public int NumThreads { get; set; }
+
+        [Option("decoding-method", Required = false, Default = "greedy_search",
+            HelpText = "Valid decoding methods are: greedy_search, modified_beam_search")]
+        public string DecodingMethod { get; set; }
+
+        [Option(Required = false, Default = false, HelpText = "True to show model info during loading")]
+        public bool Debug { get; set; }
+
+        [Option("sample-rate", Required = false, Default = 16000, HelpText = "Sample rate of the data used to train the model")]
+        public int SampleRate { get; set; }
+
+        [Option("max-active-paths", Required = false, Default = 4,
+            HelpText = @"Used only when --decoding-method is modified_beam_search.
+It specifies the number of active paths to keep during the search")]
+        public int MaxActivePaths { get; set; }
+
+        [Option("enable-endpoint", Required = false, Default = true,
+            HelpText = "True to enable endpoint detection.")]
+        public bool EnableEndpoint { get; set; }
+
+        [Option("rule1-min-trailing-silence", Required = false, Default = 2.4F,
+            HelpText = @"An endpoint is detected if trailing silence in seconds is
+larger than this value even if nothing has been decoded. Used only when --enable-endpoint is true.")]
+        public float Rule1MinTrailingSilence { get; set; }
+
+        [Option("rule2-min-trailing-silence", Required = false, Default = 0.8F,
+            HelpText = @"An endpoint is detected if trailing silence in seconds is
+larger than this value after something that is not blank has been decoded. Used
+only when --enable-endpoint is true.")]
+        public float Rule2MinTrailingSilence { get; set; }
+
+        [Option("rule3-min-utterance-length", Required = false, Default = 20.0F,
+            HelpText = @"An endpoint is detected if the utterance in seconds is
+larger than this value. Used only when --enable-endpoint is true.")]
+        public float Rule3MinUtteranceLength { get; set; }
+    }
+
+    static void Main(string[] args)
+    {
+        var parser = new CommandLine.Parser(with => with.HelpWriter = null);
+        var parserResult = parser.ParseArguments<Options>(args);
+
+        parserResult
+            .WithParsed<Options>(options => Run(options))
+            .WithNotParsed(errs => DisplayHelp(parserResult, errs));
+    }
+
+    private static void DisplayHelp(ParserResult<Options> result, IEnumerable<Error> errs)
+    {
+        string usage = @"
+dotnet run \
+  --tokens=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \
+  --encoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx \
+  --decoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \
+  --joiner=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx \
+  --num-threads=2 \
+  --decoding-method=modified_beam_search \
+  --debug=false
+
+Please refer to
+https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html
+to download pre-trained streaming models.
+";
+
+        var helpText = HelpText.AutoBuild(result, h =>
+        {
+            h.AdditionalNewLineAfterOption = false;
+            h.Heading = usage;
+            h.Copyright = "Copyright (c) 2023 Xiaomi Corporation";
+            return HelpText.DefaultParsingErrorsHandler(result, h);
+        }, e => e);
+        Console.WriteLine(helpText);
+    }
+
+    private static void Run(Options options)
+    {
+        OnlineRecognizerConfig config = new OnlineRecognizerConfig();
+        config.FeatConfig.SampleRate = options.SampleRate;
+
+        // All models from icefall use feature dim 80.
+        // You can change it if your model has a different feature dim.
+ config.FeatConfig.FeatureDim = 80; + + config.TransducerModelConfig.Encoder = options.Encoder; + config.TransducerModelConfig.Decoder = options.Decoder; + config.TransducerModelConfig.Joiner = options.Joiner; + config.TransducerModelConfig.Tokens = options.Tokens; + config.TransducerModelConfig.Provider = options.Provider; + config.TransducerModelConfig.NumThreads = options.NumThreads; + config.TransducerModelConfig.Debug = options.Debug ? 1 : 0; + + config.DecodingMethod = options.DecodingMethod; + config.MaxActivePaths = options.MaxActivePaths; + config.EnableEndpoint = options.EnableEndpoint ? 1 : 0; + + config.Rule1MinTrailingSilence = options.Rule1MinTrailingSilence; + config.Rule2MinTrailingSilence = options.Rule2MinTrailingSilence; + config.Rule3MinUtteranceLength = options.Rule3MinUtteranceLength; + + OnlineRecognizer recognizer = new OnlineRecognizer(config); + + + OnlineStream s = recognizer.CreateStream(); + + Console.WriteLine(PortAudio.VersionInfo.versionText); + PortAudio.Initialize(); + + Console.WriteLine($"Number of devices: {PortAudio.DeviceCount}"); + for (int i = 0; i != PortAudio.DeviceCount; ++i) + { + Console.WriteLine($" Device {i}"); + DeviceInfo deviceInfo = PortAudio.GetDeviceInfo(i); + Console.WriteLine($" Name: {deviceInfo.name}"); + Console.WriteLine($" Max input channels: {deviceInfo.maxInputChannels}"); + Console.WriteLine($" Default sample rate: {deviceInfo.defaultSampleRate}"); + } + int deviceIndex = PortAudio.DefaultInputDevice; + if (deviceIndex == PortAudio.NoDevice) + { + Console.WriteLine("No default input device found"); + Environment.Exit(1); + } + + DeviceInfo info = PortAudio.GetDeviceInfo(deviceIndex); + + Console.WriteLine(); + Console.WriteLine($"Use default device {deviceIndex} ({info.name})"); + + StreamParameters param = new StreamParameters(); + param.device = deviceIndex; + param.channelCount = 1; + param.sampleFormat = SampleFormat.Float32; + param.suggestedLatency = info.defaultLowInputLatency; + param.hostApiSpecificStreamInfo = IntPtr.Zero; + + PortAudioSharp.Stream.Callback callback = (IntPtr input, IntPtr output, + UInt32 frameCount, + ref StreamCallbackTimeInfo timeInfo, + StreamCallbackFlags statusFlags, + IntPtr userData + ) => + { + float[] samples = new float[frameCount]; + Marshal.Copy(input, samples, 0, (Int32)frameCount); + + s.AcceptWaveform(options.SampleRate, samples); + + return StreamCallbackResult.Continue; + }; + + PortAudioSharp.Stream stream = new PortAudioSharp.Stream(inParams: param, outParams: null, sampleRate: options.SampleRate, + framesPerBuffer: 0, + streamFlags: StreamFlags.ClipOff, + callback: callback, + userData: IntPtr.Zero + ); + + Console.WriteLine(param); + + stream.Start(); + + int segment_index = 0; + String lastText = ""; + int segmentIndex = 0; + + while (true) + { + while (recognizer.IsReady(s)) + { + recognizer.Decode(s); + } + + var text = recognizer.GetResult(s).Text; + bool isEndpoint = recognizer.IsEndpoint(s); + if (!string.IsNullOrWhiteSpace(text) && lastText != text) + { + lastText = text; + Console.Write($"\r{segmentIndex}: {lastText}"); + } + + if (isEndpoint) + { + if (!string.IsNullOrWhiteSpace(text)) + { + ++segmentIndex; + Console.WriteLine(); + } + recognizer.Reset(s); + } + + Thread.Sleep(200); // ms + } + + PortAudio.Terminate(); + + + } +} diff --git a/dotnet-examples/speech-recognition-from-microphone/run.sh b/dotnet-examples/speech-recognition-from-microphone/run.sh new file mode 100755 index 000000000..2e92c2ebf --- /dev/null +++ 
b/dotnet-examples/speech-recognition-from-microphone/run.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +# Please refer to +# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english +# to download the model files +# +export LD_LIBRARY_PATH=$PWD:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$PWD:$DYLD_LIBRARY_PATH + +if [ ! -d ./icefall-asr-zipformer-streaming-wenetspeech-20230615 ]; then + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615 + cd icefall-asr-zipformer-streaming-wenetspeech-20230615 + git lfs pull --include "*.onnx" + cd .. +fi + +dotnet run -c Release \ + --tokens ./icefall-asr-zipformer-streaming-wenetspeech-20230615/data/lang_char/tokens.txt \ + --encoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx \ + --decoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx \ + --joiner ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx \ + --decoding-method greedy_search diff --git a/dotnet-examples/speech-recognition-from-microphone/speech-recognition-from-microphone.csproj b/dotnet-examples/speech-recognition-from-microphone/speech-recognition-from-microphone.csproj new file mode 100644 index 000000000..be0f45b2b --- /dev/null +++ b/dotnet-examples/speech-recognition-from-microphone/speech-recognition-from-microphone.csproj @@ -0,0 +1,19 @@ + + + + Exe + net6.0 + speech_recognition_from_microphone + enable + enable + + + + + + + + + + + From b285a2f855b69c8dc295f8a27a8dec19865d2695 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 1 Jul 2023 18:36:33 +0800 Subject: [PATCH 2/3] Use PortAudioSharp2. We will maintain it by ourselves. The project is at https://github.com/csukuangfj/PortAudioSharp2 --- CMakeLists.txt | 2 +- dotnet-examples/speech-recognition-from-microphone/Program.cs | 4 ++-- .../speech-recognition-from-microphone.csproj | 4 +--- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 294dc9daf..bc533dc74 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) project(sherpa-onnx) -set(SHERPA_ONNX_VERSION "1.4.6") +set(SHERPA_ONNX_VERSION "1.4.7") # Disable warning about # diff --git a/dotnet-examples/speech-recognition-from-microphone/Program.cs b/dotnet-examples/speech-recognition-from-microphone/Program.cs index c223d9a3b..b3d4aeaa3 100644 --- a/dotnet-examples/speech-recognition-from-microphone/Program.cs +++ b/dotnet-examples/speech-recognition-from-microphone/Program.cs @@ -1,7 +1,7 @@ // Copyright (c) 2023 Xiaomi Corporation -// Copyright (c) 2023 by manyeyes // -// This file shows how to use a streaming model to decode files +// This file shows how to use a streaming model for real-time speech +// recognition from a microphone. 
 // Please refer to
 // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html
 // to download streaming models
diff --git a/dotnet-examples/speech-recognition-from-microphone/speech-recognition-from-microphone.csproj b/dotnet-examples/speech-recognition-from-microphone/speech-recognition-from-microphone.csproj
index be0f45b2b..4d7d90560 100644
--- a/dotnet-examples/speech-recognition-from-microphone/speech-recognition-from-microphone.csproj
+++ b/dotnet-examples/speech-recognition-from-microphone/speech-recognition-from-microphone.csproj
@@ -11,9 +11,7 @@
-
-
-
+

From 92d2727f8124869b290343edbb0771a83551e4b6 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Sat, 1 Jul 2023 18:43:25 +0800
Subject: [PATCH 3/3] minor fixes

---
 .../speech-recognition-from-microphone/Program.cs | 13 +++++--------
 .../speech-recognition-from-microphone/run.sh     |  3 +--
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/dotnet-examples/speech-recognition-from-microphone/Program.cs b/dotnet-examples/speech-recognition-from-microphone/Program.cs
index b3d4aeaa3..e8be99779 100644
--- a/dotnet-examples/speech-recognition-from-microphone/Program.cs
+++ b/dotnet-examples/speech-recognition-from-microphone/Program.cs
@@ -87,14 +87,11 @@ static void Main(string[] args)
     private static void DisplayHelp(ParserResult<Options> result, IEnumerable<Error> errs)
     {
         string usage = @"
-dotnet run \
-  --tokens=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \
-  --encoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx \
-  --decoder=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \
-  --joiner=./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx \
-  --num-threads=2 \
-  --decoding-method=modified_beam_search \
-  --debug=false
+dotnet run -c Release \
+  --tokens ./icefall-asr-zipformer-streaming-wenetspeech-20230615/data/lang_char/tokens.txt \
+  --encoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx \
+  --decoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx \
+  --joiner ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx

 Please refer to
 https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html
diff --git a/dotnet-examples/speech-recognition-from-microphone/run.sh b/dotnet-examples/speech-recognition-from-microphone/run.sh
index 2e92c2ebf..7d5fe0b5e 100755
--- a/dotnet-examples/speech-recognition-from-microphone/run.sh
+++ b/dotnet-examples/speech-recognition-from-microphone/run.sh
@@ -18,5 +18,4 @@ dotnet run -c Release \
   --tokens ./icefall-asr-zipformer-streaming-wenetspeech-20230615/data/lang_char/tokens.txt \
   --encoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx \
   --decoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx \
-  --joiner ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx \
-  --decoding-method greedy_search
+  --joiner ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx
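
Note for readers skimming the diffs: the interaction between PortAudio and the sherpa-onnx streaming API in the Program.cs added above reduces to the condensed sketch below. It assumes recognizer (an OnlineRecognizer), s (an OnlineStream), the parsed options, and the StreamParameters param have been created exactly as in the patch; it is a simplified illustration of the code in this series, not a drop-in replacement for the full program.

    // The PortAudio callback copies each buffer of 32-bit float samples out of
    // unmanaged memory and queues it on the online stream for decoding.
    PortAudioSharp.Stream.Callback callback = (IntPtr input, IntPtr output,
        UInt32 frameCount, ref StreamCallbackTimeInfo timeInfo,
        StreamCallbackFlags statusFlags, IntPtr userData) =>
    {
        float[] samples = new float[frameCount];
        Marshal.Copy(input, samples, 0, (Int32)frameCount);
        s.AcceptWaveform(options.SampleRate, samples);
        return StreamCallbackResult.Continue; // keep capturing audio
    };

    // The main thread polls the recognizer: decode whatever is ready, print the
    // partial result, and start a new segment whenever an endpoint is detected.
    string lastText = "";
    int segmentIndex = 0;
    while (true)
    {
        while (recognizer.IsReady(s))
        {
            recognizer.Decode(s);
        }

        var text = recognizer.GetResult(s).Text;
        if (!string.IsNullOrWhiteSpace(text) && text != lastText)
        {
            lastText = text;
            Console.Write($"\r{segmentIndex}: {lastText}"); // overwrite the current line
        }

        if (recognizer.IsEndpoint(s))
        {
            if (!string.IsNullOrWhiteSpace(text))
            {
                ++segmentIndex;
                Console.WriteLine();
            }
            recognizer.Reset(s); // clear decoded state before the next segment
        }

        Thread.Sleep(200); // ms; avoid busy-waiting between polls
    }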