diff --git a/.github/scripts/test-dot-net.sh b/.github/scripts/test-dot-net.sh index 28964b9fc..a6cddd9d3 100755 --- a/.github/scripts/test-dot-net.sh +++ b/.github/scripts/test-dot-net.sh @@ -2,7 +2,10 @@ cd dotnet-examples/ -cd streaming-hlg-decoding/ +cd speaker-identification +./run.sh + +cd ../streaming-hlg-decoding/ ./run.sh cd ../spoken-language-identification diff --git a/.github/workflows/test-dot-net.yaml b/.github/workflows/test-dot-net.yaml index 243b4f1a5..a62e00b5b 100644 --- a/.github/workflows/test-dot-net.yaml +++ b/.github/workflows/test-dot-net.yaml @@ -179,6 +179,7 @@ jobs: cp -v scripts/dotnet/examples/speech-recognition-from-microphone.csproj dotnet-examples/speech-recognition-from-microphone/ cp -v scripts/dotnet/examples/spoken-language-identification.csproj dotnet-examples/spoken-language-identification/ cp -v scripts/dotnet/examples/streaming-hlg-decoding.csproj dotnet-examples/streaming-hlg-decoding + cp -v scripts/dotnet/examples/speaker-identification.csproj dotnet-examples/speaker-identification ls -lh /tmp diff --git a/CMakeLists.txt b/CMakeLists.txt index f0fdb858b..baaef5afa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) project(sherpa-onnx) -set(SHERPA_ONNX_VERSION "1.9.23") +set(SHERPA_ONNX_VERSION "1.9.24") # Disable warning about # diff --git a/dotnet-examples/sherpa-onnx.sln b/dotnet-examples/sherpa-onnx.sln index ff514df37..7d295e15f 100644 --- a/dotnet-examples/sherpa-onnx.sln +++ b/dotnet-examples/sherpa-onnx.sln @@ -17,6 +17,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "spoken-language-identificat EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "streaming-hlg-decoding", "streaming-hlg-decoding\streaming-hlg-decoding.csproj", "{C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "speaker-identification", "speaker-identification\speaker-identification.csproj", 
"{2B1B140E-A92F-426B-B0DF-5D916B67304F}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -54,5 +56,9 @@ Global {C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Debug|Any CPU.Build.0 = Debug|Any CPU {C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Release|Any CPU.ActiveCfg = Release|Any CPU {C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Release|Any CPU.Build.0 = Release|Any CPU + {2B1B140E-A92F-426B-B0DF-5D916B67304F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {2B1B140E-A92F-426B-B0DF-5D916B67304F}.Debug|Any CPU.Build.0 = Debug|Any CPU + {2B1B140E-A92F-426B-B0DF-5D916B67304F}.Release|Any CPU.ActiveCfg = Release|Any CPU + {2B1B140E-A92F-426B-B0DF-5D916B67304F}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection EndGlobal diff --git a/dotnet-examples/speaker-identification/Program.cs b/dotnet-examples/speaker-identification/Program.cs new file mode 100644 index 000000000..aef53e851 --- /dev/null +++ b/dotnet-examples/speaker-identification/Program.cs @@ -0,0 +1,155 @@ +// Copyright (c) 2024 Xiaomi Corporation +// +// This file shows how to do speaker identification with sherpa-onnx. +// +// 1. Download a model from +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx +// +// 2. Download test data from +// +// git clone https://github.com/csukuangfj/sr-data +// +// 3. 
Now run it
+//
+// dotnet run
+
+using SherpaOnnx;
+using System.Collections.Generic;
+using System;
+
+class SpeakerIdentificationDemo
+{
+    /// <summary>Reads a wave file and returns the embedding the extractor computes for it.</summary>
+    /// <param name="extractor">Extractor that creates the stream and computes the embedding.</param>
+    /// <param name="filename">Path of the wave file to read.</param>
+    /// <returns>The array returned by <c>extractor.Compute</c>.</returns>
+    public static float[] ComputeEmbedding(SpeakerEmbeddingExtractor extractor, String filename)
+    {
+        WaveReader reader = new WaveReader(filename);
+
+        // Feed the whole file and mark the input as finished before computing.
+        OnlineStream stream = extractor.CreateStream();
+        stream.AcceptWaveform(reader.SampleRate, reader.Samples);
+        stream.InputFinished();
+
+        return extractor.Compute(stream);
+    }
+
+    /// <summary>Runs the demo; the process exit code is 0 only when every check passes.</summary>
+    static void Main(string[] args)
+    {
+        // Stay on a failing exit code until the last check has passed, so every
+        // early return below makes "dotnet run" (and run.sh with "set -e") fail.
+        Environment.ExitCode = 1;
+
+        var config = new SpeakerEmbeddingExtractorConfig();
+        config.Model = "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx";
+        config.Debug = 1;
+
+        // Both objects own native handles; release them on every exit path.
+        using var extractor = new SpeakerEmbeddingExtractor(config);
+        using var manager = new SpeakerEmbeddingManager(extractor.Dim);
+
+        string[] spk1Files =
+        {
+            "./sr-data/enroll/fangjun-sr-1.wav",
+            "./sr-data/enroll/fangjun-sr-2.wav",
+            "./sr-data/enroll/fangjun-sr-3.wav",
+        };
+        float[][] spk1Vec = Array.ConvertAll(spk1Files, f => ComputeEmbedding(extractor, f));
+
+        string[] spk2Files =
+        {
+            "./sr-data/enroll/leijun-sr-1.wav", "./sr-data/enroll/leijun-sr-2.wav",
+        };
+        float[][] spk2Vec = Array.ConvertAll(spk2Files, f => ComputeEmbedding(extractor, f));
+
+        if (!manager.Add("fangjun", spk1Vec))
+        {
+            Console.WriteLine("Failed to register fangjun");
+            return;
+        }
+
+        if (!manager.Add("leijun", spk2Vec))
+        {
+            Console.WriteLine("Failed to register leijun");
+            return;
+        }
+
+        if (manager.NumSpeakers != 2)
+        {
+            Console.WriteLine("There should be two speakers");
+            return;
+        }
+
+        if (!manager.Contains("fangjun"))
+        {
+            Console.WriteLine("It should contain the speaker fangjun");
+            return;
+        }
+
+        if (!manager.Contains("leijun"))
+        {
+            Console.WriteLine("It should contain the speaker leijun");
+            return;
+        }
+
+        Console.WriteLine("---All speakers---");
+
+        foreach (var s in manager.GetAllSpeakers())
+        {
+            Console.WriteLine(s);
+        }
+        Console.WriteLine("------------");
+
+        string[] testFiles =
+        {
+            "./sr-data/test/fangjun-test-sr-1.wav",
+            "./sr-data/test/leijun-test-sr-1.wav",
+            "./sr-data/test/liudehua-test-sr-1.wav"
+        };
+
+        float threshold = 0.6f;
+        foreach (var file in testFiles)
+        {
+            float[] embedding = ComputeEmbedding(extractor, file);
+
+            // Search returns "" when the native lookup yields no name; print a placeholder instead.
+            String name = manager.Search(embedding, threshold);
+            if (name == "")
+            {
+                name = "<Unknown>";
+            }
+            Console.WriteLine("{0}: {1}", file, name);
+        }
+
+        // test verify
+        if (!manager.Verify("fangjun", ComputeEmbedding(extractor, testFiles[0]), threshold))
+        {
+            Console.WriteLine("testFiles[0] should match fangjun!");
+            return;
+        }
+
+        if (!manager.Remove("fangjun"))
+        {
+            Console.WriteLine("Failed to remove fangjun");
+            return;
+        }
+
+        if (manager.Verify("fangjun", ComputeEmbedding(extractor, testFiles[0]), threshold))
+        {
+            Console.WriteLine("{0} should match no one!", testFiles[0]);
+            return;
+        }
+
+        if (manager.NumSpeakers != 1)
+        {
+            Console.WriteLine("There should only 1 speaker left.");
+            return;
+        }
+
+        Environment.ExitCode = 0;
+    }
+}
diff --git a/dotnet-examples/speaker-identification/WaveReader.cs b/dotnet-examples/speaker-identification/WaveReader.cs
new file mode 120000
index 000000000..2c5d16793
--- /dev/null
+++ b/dotnet-examples/speaker-identification/WaveReader.cs
@@ -0,0 +1 @@
+../offline-decode-files/WaveReader.cs
\ No newline at end of file
diff --git a/dotnet-examples/speaker-identification/run.sh b/dotnet-examples/speaker-identification/run.sh
new file mode 100755
index 000000000..66f7b3f00
--- /dev/null
+++ b/dotnet-examples/speaker-identification/run.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+set -ex
+
+if [ ! 
-e ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+fi
+
+if [ ! -d ./sr-data ]; then
+  git clone https://github.com/csukuangfj/sr-data
+fi
+
+dotnet run
diff --git a/dotnet-examples/speaker-identification/speaker-identification.csproj b/dotnet-examples/speaker-identification/speaker-identification.csproj
new file mode 100644
index 000000000..3aa481d4e
--- /dev/null
+++ b/dotnet-examples/speaker-identification/speaker-identification.csproj
@@ -0,0 +1,15 @@
+
+
+
+    Exe
+    net6.0
+    speaker_identification
+    enable
+    enable
+
+
+
+
+
+
+
diff --git a/scripts/dotnet/examples/speaker-identification.csproj b/scripts/dotnet/examples/speaker-identification.csproj
new file mode 100644
index 000000000..4d43ce0d8
--- /dev/null
+++ b/scripts/dotnet/examples/speaker-identification.csproj
@@ -0,0 +1,19 @@
+
+
+
+    Exe
+    net6.0
+    speaker_identification
+    enable
+    enable
+
+
+
+    /tmp/packages;$(RestoreSources);https://api.nuget.org/v3/index.json
+
+
+
+
+
+
+
diff --git a/scripts/dotnet/offline.cs b/scripts/dotnet/offline.cs
index c0b7e1d43..39de7526f 100644
--- a/scripts/dotnet/offline.cs
+++ b/scripts/dotnet/offline.cs
@@ -222,6 +222,14 @@ public int SampleRate
             }
         }
 
+        /// <summary>
+        /// Value returned by the native SherpaOnnxOfflineTtsNumSpeakers call for this object's handle.
+        /// </summary>
+        public int NumSpeakers
+        {
+            get { return SherpaOnnxOfflineTtsNumSpeakers(_handle.Handle); }
+        }
+
         [DllImport(Dll.Filename)]
         private static extern IntPtr SherpaOnnxCreateOfflineTts(ref OfflineTtsConfig config);
 
@@ -231,6 +239,9 @@ public int SampleRate
         [DllImport(Dll.Filename)]
         private static extern int SherpaOnnxOfflineTtsSampleRate(IntPtr handle);
 
+        [DllImport(Dll.Filename)]
+        private static extern int SherpaOnnxOfflineTtsNumSpeakers(IntPtr handle);
+
         [DllImport(Dll.Filename)]
         private static extern IntPtr SherpaOnnxOfflineTtsGenerate(IntPtr handle, [MarshalAs(UnmanagedType.LPStr)] string text, int sid, float speed);
 
@@ -556,6 +567,112 @@ 
private void Cleanup()
         private static extern void Decode(IntPtr handle, IntPtr[] streams, int n);
     }
 
+    /// <summary>Options handed by reference to SherpaOnnxCreateSpeakerEmbeddingExtractor.</summary>
+    [StructLayout(LayoutKind.Sequential)]
+    public struct SpeakerEmbeddingExtractorConfig
+    {
+        public SpeakerEmbeddingExtractorConfig()
+        {
+            Model = "";
+            NumThreads = 1;
+            Debug = 0;
+            Provider = "cpu";
+        }
+
+        [MarshalAs(UnmanagedType.LPStr)]
+        public string Model;
+
+        public int NumThreads;
+        public int Debug;
+
+        [MarshalAs(UnmanagedType.LPStr)]
+        public string Provider;
+    }
+
+    /// <summary>Owns a native speaker embedding extractor handle and exposes its operations.</summary>
+    public class SpeakerEmbeddingExtractor : IDisposable
+    {
+        public SpeakerEmbeddingExtractor(SpeakerEmbeddingExtractorConfig config)
+        {
+            _handle = new HandleRef(this, SherpaOnnxCreateSpeakerEmbeddingExtractor(ref config));
+        }
+
+        public OnlineStream CreateStream()
+        {
+            return new OnlineStream(SherpaOnnxSpeakerEmbeddingExtractorCreateStream(_handle.Handle));
+        }
+
+        public bool IsReady(OnlineStream stream)
+        {
+            return SherpaOnnxSpeakerEmbeddingExtractorIsReady(_handle.Handle, stream.Handle) != 0;
+        }
+
+        /// <summary>Copies Dim floats out of the natively computed embedding, then frees it.</summary>
+        public float[] Compute(OnlineStream stream)
+        {
+            IntPtr p = SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding(_handle.Handle, stream.Handle);
+
+            float[] ans = new float[Dim];
+            Marshal.Copy(p, ans, 0, ans.Length);
+            SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(p);
+
+            return ans;
+        }
+
+        /// <summary>Value returned by SherpaOnnxSpeakerEmbeddingExtractorDim for this handle.</summary>
+        public int Dim
+        {
+            get { return SherpaOnnxSpeakerEmbeddingExtractorDim(_handle.Handle); }
+        }
+
+        public void Dispose()
+        {
+            Cleanup();
+            // Prevent the object from being placed on the finalization queue
+            System.GC.SuppressFinalize(this);
+        }
+
+        ~SpeakerEmbeddingExtractor()
+        {
+            Cleanup();
+        }
+
+        private void Cleanup()
+        {
+            // Destroy the native object at most once (Dispose may be called repeatedly).
+            if (_handle.Handle != IntPtr.Zero)
+            {
+                SherpaOnnxDestroySpeakerEmbeddingExtractor(_handle.Handle);
+            }
+
+            // Don't permit the handle to be used again.
+            _handle = new HandleRef(this, IntPtr.Zero);
+        }
+
+        private HandleRef _handle;
+
+        [DllImport(Dll.Filename)]
+        private static extern IntPtr SherpaOnnxCreateSpeakerEmbeddingExtractor(ref SpeakerEmbeddingExtractorConfig config);
+
+        [DllImport(Dll.Filename)]
+        private static extern void SherpaOnnxDestroySpeakerEmbeddingExtractor(IntPtr handle);
+
+        [DllImport(Dll.Filename)]
+        private static extern int SherpaOnnxSpeakerEmbeddingExtractorDim(IntPtr handle);
+
+        [DllImport(Dll.Filename)]
+        private static extern IntPtr SherpaOnnxSpeakerEmbeddingExtractorCreateStream(IntPtr handle);
+
+        [DllImport(Dll.Filename)]
+        private static extern int SherpaOnnxSpeakerEmbeddingExtractorIsReady(IntPtr handle, IntPtr stream);
+
+        [DllImport(Dll.Filename)]
+        private static extern IntPtr SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding(IntPtr handle, IntPtr stream);
+
+        [DllImport(Dll.Filename)]
+        private static extern void SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(IntPtr p);
+    }
+
     [StructLayout(LayoutKind.Sequential)]
     public struct SpokenLanguageIdentificationWhisperConfig
     {
@@ -593,6 +710,236 @@ public SpokenLanguageIdentificationConfig()
         public string Provider;
     }
 
+    /// <summary>
+    /// Owns a native speaker embedding manager handle. Every embedding passed in must
+    /// have the dimension given to the constructor.
+    /// </summary>
+    public class SpeakerEmbeddingManager : IDisposable
+    {
+        /// <param name="dim">Number of floats in every embedding handled by this manager.</param>
+        public SpeakerEmbeddingManager(int dim)
+        {
+            IntPtr h = SherpaOnnxCreateSpeakerEmbeddingManager(dim);
+            _handle = new HandleRef(this, h);
+            this._dim = dim;
+        }
+
+        /// <summary>Registers a single embedding under <paramref name="name"/>.</summary>
+        /// <returns>true when the native call returns 1.</returns>
+        /// <exception cref="ArgumentException"><paramref name="v"/> does not hold exactly dim floats.</exception>
+        public bool Add(string name, float[] v)
+        {
+            CheckEmbedding(v, nameof(v));
+            return SherpaOnnxSpeakerEmbeddingManagerAdd(_handle.Handle, name, v) == 1;
+        }
+
+        /// <summary>
+        /// Registers several embeddings under <paramref name="name"/> by flattening them
+        /// into one array of Count * dim floats for the native call.
+        /// </summary>
+        /// <returns>true when the native call returns 1.</returns>
+        /// <exception cref="ArgumentException">An item does not hold exactly dim floats.</exception>
+        public bool Add(string name, System.Collections.Generic.ICollection<float[]> v_list)
+        {
+            if (v_list == null)
+            {
+                throw new ArgumentNullException(nameof(v_list));
+            }
+
+            int n = v_list.Count;
+            float[] flattened = new float[n * _dim];
+            int offset = 0;
+            foreach (float[] item in v_list)
+            {
+                // Each item must fill exactly one _dim-sized slot of the flattened array.
+                CheckEmbedding(item, nameof(v_list));
+                item.CopyTo(flattened, offset);
+                offset += _dim;
+            }
+
+            return SherpaOnnxSpeakerEmbeddingManagerAddListFlattened(_handle.Handle, name, flattened, n) == 1;
+        }
+
+        /// <returns>true when the native call returns 1.</returns>
+        public bool Remove(string name)
+        {
+            return SherpaOnnxSpeakerEmbeddingManagerRemove(_handle.Handle, name) == 1;
+        }
+
+        /// <summary>Returns the name produced by the native search for <paramref name="v"/>.</summary>
+        /// <returns>The UTF-8 decoded name, or "" when the native call returns a null or empty string.</returns>
+        /// <exception cref="ArgumentException"><paramref name="v"/> does not hold exactly dim floats.</exception>
+        public string Search(float[] v, float threshold)
+        {
+            CheckEmbedding(v, nameof(v));
+
+            IntPtr p = SherpaOnnxSpeakerEmbeddingManagerSearch(_handle.Handle, v, threshold);
+            try
+            {
+                return Utf8ToString(p);
+            }
+            finally
+            {
+                SherpaOnnxSpeakerEmbeddingManagerFreeSearch(p);
+            }
+        }
+
+        /// <returns>true when the native call returns 1.</returns>
+        /// <exception cref="ArgumentException"><paramref name="v"/> does not hold exactly dim floats.</exception>
+        public bool Verify(string name, float[] v, float threshold)
+        {
+            CheckEmbedding(v, nameof(v));
+            return SherpaOnnxSpeakerEmbeddingManagerVerify(_handle.Handle, name, v, threshold) == 1;
+        }
+
+        /// <returns>true when the native call returns 1.</returns>
+        public bool Contains(string name)
+        {
+            return SherpaOnnxSpeakerEmbeddingManagerContains(_handle.Handle, name) == 1;
+        }
+
+        /// <summary>Returns the names reported by the native manager (empty when NumSpeakers is 0).</summary>
+        public string[] GetAllSpeakers()
+        {
+            // Read the count once: it sizes the result and bounds the walk over the native array.
+            int n = NumSpeakers;
+            if (n == 0)
+            {
+                return new string[] { };
+            }
+
+            IntPtr names = SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(_handle.Handle);
+            if (names == IntPtr.Zero)
+            {
+                return new string[] { };
+            }
+
+            try
+            {
+                string[] ans = new string[n];
+                for (int i = 0; i != n; i++)
+                {
+                    // names is read as an array of n pointers to NUL-terminated strings.
+                    ans[i] = Utf8ToString(Marshal.ReadIntPtr(names, i * IntPtr.Size));
+                }
+
+                return ans;
+            }
+            finally
+            {
+                SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(names);
+            }
+        }
+
+        public void Dispose()
+        {
+            Cleanup();
+            // Prevent the object from being placed on the
+            // finalization queue
+            System.GC.SuppressFinalize(this);
+        }
+
+        ~SpeakerEmbeddingManager()
+        {
+            Cleanup();
+        }
+
+        private void Cleanup()
+        {
+            // Destroy the native object at most once (Dispose may be called repeatedly).
+            if (_handle.Handle != IntPtr.Zero)
+            {
+                SherpaOnnxDestroySpeakerEmbeddingManager(_handle.Handle);
+            }
+
+            // Don't permit the handle to be used again.
+            _handle = new HandleRef(this, IntPtr.Zero);
+        }
+
+        /// <summary>Value returned by SherpaOnnxSpeakerEmbeddingManagerNumSpeakers for this handle.</summary>
+        public int NumSpeakers
+        {
+            get
+            {
+                return SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(_handle.Handle);
+            }
+        }
+
+        // The native calls receive only the array (no length), so reject any vector whose length is not _dim.
+        private void CheckEmbedding(float[] v, string paramName)
+        {
+            if (v == null)
+            {
+                throw new ArgumentNullException(paramName);
+            }
+
+            if (v.Length != _dim)
+            {
+                throw new ArgumentException(
+                    "Expected an embedding of length " + _dim + ", got " + v.Length, paramName);
+            }
+        }
+
+        // Decodes a NUL-terminated UTF-8 string; a null pointer decodes to "".
+        private static string Utf8ToString(IntPtr p)
+        {
+            if (p == IntPtr.Zero)
+            {
+                return "";
+            }
+
+            int length = 0;
+            while (Marshal.ReadByte(p, length) != 0)
+            {
+                ++length;
+            }
+
+            byte[] buffer = new byte[length];
+            Marshal.Copy(p, buffer, 0, length);
+            return Encoding.UTF8.GetString(buffer);
+        }
+
+        private HandleRef _handle;
+        private int _dim;
+
+
+        [DllImport(Dll.Filename)]
+        private static extern IntPtr SherpaOnnxCreateSpeakerEmbeddingManager(int dim);
+
+        [DllImport(Dll.Filename)]
+        private static extern void SherpaOnnxDestroySpeakerEmbeddingManager(IntPtr handle);
+
+        [DllImport(Dll.Filename)]
+        private static extern int SherpaOnnxSpeakerEmbeddingManagerAdd(IntPtr handle, [MarshalAs(UnmanagedType.LPStr)] string name, float[] v);
+
+        [DllImport(Dll.Filename)]
+        private static extern int SherpaOnnxSpeakerEmbeddingManagerAddListFlattened(IntPtr handle, [MarshalAs(UnmanagedType.LPStr)] string name, float[] v, int n);
+
+        [DllImport(Dll.Filename)]
+        private static extern int SherpaOnnxSpeakerEmbeddingManagerRemove(IntPtr handle, [MarshalAs(UnmanagedType.LPStr)] string name);
+
+        [DllImport(Dll.Filename)]
+        private static extern IntPtr SherpaOnnxSpeakerEmbeddingManagerSearch(IntPtr handle, float[] v, float threshold);
+
+        [DllImport(Dll.Filename)]
+        private static extern void SherpaOnnxSpeakerEmbeddingManagerFreeSearch(IntPtr p);
+
+        [DllImport(Dll.Filename)]
+        private static extern int SherpaOnnxSpeakerEmbeddingManagerVerify(IntPtr handle, [MarshalAs(UnmanagedType.LPStr)] string name, float[] v, float threshold);
+
+        [DllImport(Dll.Filename)]
+        private static extern int SherpaOnnxSpeakerEmbeddingManagerContains(IntPtr handle, [MarshalAs(UnmanagedType.LPStr)] string name);
+
+        [DllImport(Dll.Filename)]
+        private static extern int SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(IntPtr handle);
+
+        [DllImport(Dll.Filename)]
+        private static extern IntPtr SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(IntPtr handle);
+
+        [DllImport(Dll.Filename)]
+        private static extern void SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(IntPtr names);
+    }
+
     public class SpokenLanguageIdentificationResult
     {
public SpokenLanguageIdentificationResult(IntPtr handle)