Add C API for spoken language identification. (#695)

k2-fsa · Mar 25, 2024 · ab7cff2 · ab7cff2
1 parent 0d258dd
commit ab7cff2
Show file tree

Hide file tree

Showing 18 changed files with 363 additions and 67 deletions.
diff --git a/.github/scripts/test-c-api.sh b/.github/scripts/test-c-api.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+  # Timestamped logger (from espnet): prefixes the message with the caller's file, line and function.
+  local caller_file="${BASH_SOURCE[1]##*/}" stamp="$(date '+%Y-%m-%d %H:%M:%S')"
+  echo -e "${stamp} (${caller_file}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+echo "SLID_EXE is ${SLID_EXE:?SLID_EXE must name the C API example executable}"
+echo "PATH: $PATH"
+
+# Fetch the whisper tiny model; the example loads it via paths relative to the current directory.
+log "------------------------------------------------------------"
+log "Download whisper tiny for spoken language identification    "
+log "------------------------------------------------------------"
+rm -rf sherpa-onnx-whisper-tiny*
+# -f makes curl exit non-zero on an HTTP error instead of saving the error page as the archive.
+curl -fLS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
+tar xvf sherpa-onnx-whisper-tiny.tar.bz2
+rm sherpa-onnx-whisper-tiny.tar.bz2
+
+# Quoted so the value is run as a single command word; with set -e a failing run fails the script.
+"$SLID_EXE"
+rm -rf sherpa-onnx-whisper-tiny*
diff --git a/.github/scripts/test-spoken-language-identification.sh b/.github/scripts/test-spoken-language-identification.sh
@@ -28,32 +28,32 @@ ar-arabic.wav
 bg-bulgarian.wav
 cs-czech.wav
 da-danish.wav
-de-german.wav
-el-greek.wav
-en-english.wav
-es-spanish.wav
-fa-persian.wav
-fi-finnish.wav
-fr-french.wav
-hi-hindi.wav
-hr-croatian.wav
-id-indonesian.wav
-it-italian.wav
-ja-japanese.wav
-ko-korean.wav
-nl-dutch.wav
-no-norwegian.wav
-po-polish.wav
-pt-portuguese.wav
-ro-romanian.wav
-ru-russian.wav
-sk-slovak.wav
-sv-swedish.wav
-ta-tamil.wav
-tl-tagalog.wav
-tr-turkish.wav
-uk-ukrainian.wav
-zh-chinese.wav
+# de-german.wav
+# el-greek.wav
+# en-english.wav
+# es-spanish.wav
+# fa-persian.wav
+# fi-finnish.wav
+# fr-french.wav
+# hi-hindi.wav
+# hr-croatian.wav
+# id-indonesian.wav
+# it-italian.wav
+# ja-japanese.wav
+# ko-korean.wav
+# nl-dutch.wav
+# no-norwegian.wav
+# po-polish.wav
+# pt-portuguese.wav
+# ro-romanian.wav
+# ru-russian.wav
+# sk-slovak.wav
+# sv-swedish.wav
+# ta-tamil.wav
+# tl-tagalog.wav
+# tr-turkish.wav
+# uk-ukrainian.wav
+# zh-chinese.wav
 )
 
 for wav in ${waves[@]}; do

diff --git a/.github/workflows/android.yaml b/.github/workflows/android.yaml
@@ -113,6 +113,7 @@ jobs:
             git config --global user.email "[email protected]"
             git config --global user.name "Fangjun Kuang"
 
+            rm -rf huggingface
             GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface
 
             cd huggingface

diff --git a/.github/workflows/build-xcframework.yaml b/.github/workflows/build-xcframework.yaml
@@ -90,6 +90,7 @@ jobs:
             git config --global user.email "[email protected]"
             git config --global user.name "Fangjun Kuang"
 
+            rm -rf huggingface
             GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface
 
             cd huggingface

diff --git a/.github/workflows/linux.yaml b/.github/workflows/linux.yaml
@@ -123,8 +123,15 @@ jobs:
           name: release-${{ matrix.build_type }}-${{ matrix.shared_lib }}
           path: build/bin/*
 
-      - name: Test spoken language identification
-        if: matrix.build_type != 'Debug'
+      - name: Test spoken language identification (C API)
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export SLID_EXE=spoken-language-identification-c-api
+
+          .github/scripts/test-c-api.sh
+
+      - name: Test spoken language identification (C++ API)
         shell: bash
         run: |
           export PATH=$PWD/build/bin:$PATH
@@ -243,6 +250,7 @@ jobs:
             git config --global user.email "[email protected]"
             git config --global user.name "Fangjun Kuang"
 
+            rm -rf huggingface
             GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface
 
             cd huggingface

diff --git a/.github/workflows/macos.yaml b/.github/workflows/macos.yaml
@@ -102,8 +102,15 @@ jobs:
           otool -L build/bin/sherpa-onnx
           otool -l build/bin/sherpa-onnx
 
-      - name: Test spoken language identification
-        if: matrix.build_type != 'Debug'
+      - name: Test spoken language identification (C API)
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export SLID_EXE=spoken-language-identification-c-api
+
+          .github/scripts/test-c-api.sh
+
+      - name: Test spoken language identification (C++ API)
         shell: bash
         run: |
           export PATH=$PWD/build/bin:$PATH

diff --git a/.github/workflows/windows-x64.yaml b/.github/workflows/windows-x64.yaml
@@ -68,7 +68,15 @@ jobs:
 
           ls -lh ./bin/Release/sherpa-onnx.exe
 
-      - name: Test spoken language identification
+      - name: Test spoken language identification (C API)
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin/Release:$PATH
+          export SLID_EXE=spoken-language-identification-c-api.exe
+
+          .github/scripts/test-c-api.sh
+
+      - name: Test spoken language identification (C++ API)
         shell: bash
         run: |
           export PATH=$PWD/build/bin/Release:$PATH

diff --git a/.github/workflows/windows-x86.yaml b/.github/workflows/windows-x86.yaml
@@ -69,6 +69,14 @@ jobs:
 
           ls -lh ./bin/Release/sherpa-onnx.exe
 
+      - name: Test spoken language identification (C API)
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin/Release:$PATH
+          export SLID_EXE=spoken-language-identification-c-api.exe
+
+          .github/scripts/test-c-api.sh
+
       # - name: Test spoken language identification
       #   shell: bash
       #   run: |

diff --git a/.gitignore b/.gitignore
@@ -85,3 +85,4 @@ log
 vits-piper-*
 vits-coqui-*
 vits-mms-*
+*.tar.bz2
diff --git a/c-api-examples/CMakeLists.txt b/c-api-examples/CMakeLists.txt
@@ -7,8 +7,11 @@ target_link_libraries(decode-file-c-api sherpa-onnx-c-api cargs)
 add_executable(offline-tts-c-api offline-tts-c-api.c)
 target_link_libraries(offline-tts-c-api sherpa-onnx-c-api cargs)
 
+add_executable(spoken-language-identification-c-api spoken-language-identification-c-api.c)
+target_link_libraries(spoken-language-identification-c-api sherpa-onnx-c-api)
+
 if(SHERPA_ONNX_HAS_ALSA)
   add_subdirectory(./asr-microphone-example)
-else()
+elseif((UNIX AND NOT APPLE) OR LINUX)
   message(WARNING "Not include ./asr-microphone-example since alsa is not available")
 endif()
diff --git a/c-api-examples/Makefile b/c-api-examples/Makefile
@@ -4,7 +4,7 @@ CUR_DIR :=$(shell pwd)
 CFLAGS := -I ../ -I ../build/_deps/cargs-src/include/
 LDFLAGS := -L ../build/lib
 LDFLAGS += -L ../build/_deps/onnxruntime-src/lib
-LDFLAGS += -lsherpa-onnx-c-api -lsherpa-onnx-core -lonnxruntime -lkaldi-native-fbank-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lcargs
+LDFLAGS += -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lcargs -lonnxruntime
 LDFLAGS += -framework Foundation
 LDFLAGS += -lc++
 LDFLAGS += -Wl,-rpath,${CUR_DIR}/../build/lib

diff --git a/c-api-examples/decode-file-c-api.c b/c-api-examples/decode-file-c-api.c
@@ -169,55 +169,56 @@ int32_t main(int32_t argc, char *argv[]) {
   int32_t segment_id = 0;
 
   const char *wav_filename = argv[context.index];
-  FILE *fp = fopen(wav_filename, "rb");
-  if (!fp) {
-    fprintf(stderr, "Failed to open %s\n", wav_filename);
+  const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
+  if (wave == NULL) {
+    fprintf(stderr, "Failed to read %s\n", wav_filename);
     return -1;
   }
-
-  // Assume the wave header occupies 44 bytes.
-  fseek(fp, 44, SEEK_SET);
-
   // simulate streaming
 
 #define N 3200  // 0.2 s. Sample rate is fixed to 16 kHz
 
   int16_t buffer[N];
   float samples[N];
+  fprintf(stderr, "sample rate: %d, num samples: %d, duration: %.2f s\n",
+          wave->sample_rate, wave->num_samples,
+          (float)wave->num_samples / wave->sample_rate);
+
+  int32_t k = 0;
+  while (k < wave->num_samples) {
+    int32_t start = k;
+    int32_t end =
+        (start + N > wave->num_samples) ? wave->num_samples : (start + N);
+    k += N;
+
+    AcceptWaveform(stream, wave->sample_rate, wave->samples + start,
+                   end - start);
+    while (IsOnlineStreamReady(recognizer, stream)) {
+      DecodeOnlineStream(recognizer, stream);
+    }
 
-  while (!feof(fp)) {
-    size_t n = fread((void *)buffer, sizeof(int16_t), N, fp);
-    if (n > 0) {
-      for (size_t i = 0; i != n; ++i) {
-        samples[i] = buffer[i] / 32768.;
-      }
-      AcceptWaveform(stream, 16000, samples, n);
-      while (IsOnlineStreamReady(recognizer, stream)) {
-        DecodeOnlineStream(recognizer, stream);
-      }
+    const SherpaOnnxOnlineRecognizerResult *r =
+        GetOnlineStreamResult(recognizer, stream);
 
-      const SherpaOnnxOnlineRecognizerResult *r =
-          GetOnlineStreamResult(recognizer, stream);
+    if (strlen(r->text)) {
+      SherpaOnnxPrint(display, segment_id, r->text);
+    }
 
+    if (IsEndpoint(recognizer, stream)) {
       if (strlen(r->text)) {
-        SherpaOnnxPrint(display, segment_id, r->text);
+        ++segment_id;
       }
-
-      if (IsEndpoint(recognizer, stream)) {
-        if (strlen(r->text)) {
-          ++segment_id;
-        }
-        Reset(recognizer, stream);
-      }
-
-      DestroyOnlineRecognizerResult(r);
+      Reset(recognizer, stream);
     }
+
+    DestroyOnlineRecognizerResult(r);
   }
-  fclose(fp);
 
   // add some tail padding
   float tail_paddings[4800] = {0};  // 0.3 seconds at 16 kHz sample rate
-  AcceptWaveform(stream, 16000, tail_paddings, 4800);
+  AcceptWaveform(stream, wave->sample_rate, tail_paddings, 4800);
+
+  SherpaOnnxFreeWave(wave);
 
   InputFinished(stream);
   while (IsOnlineStreamReady(recognizer, stream)) {

diff --git a/c-api-examples/spoken-language-identification-c-api.c b/c-api-examples/spoken-language-identification-c-api.c
@@ -0,0 +1,65 @@
+
+// We assume you have pre-downloaded the whisper multi-lingual models
+// from https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+// An example command to download the "tiny" whisper model is given below:
+//
+// clang-format off
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
+// tar xvf sherpa-onnx-whisper-tiny.tar.bz2
+// rm sherpa-onnx-whisper-tiny.tar.bz2
+//
+// clang-format on
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "sherpa-onnx/c-api/c-api.h"
+
+// Runs spoken language identification on test_wavs/0.wav with the whisper tiny
+// model. Returns 0 on success, -1 if the identifier or the wave cannot be loaded.
+int32_t main() {
+  SherpaOnnxSpokenLanguageIdentificationConfig config;
+  memset(&config, 0, sizeof(config));  // zero every field not set below
+  config.whisper.encoder = "./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx";
+  config.whisper.decoder = "./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx";
+  config.num_threads = 1;
+  config.debug = 1;
+  config.provider = "cpu";
+
+  const SherpaOnnxSpokenLanguageIdentification *slid =
+      SherpaOnnxCreateSpokenLanguageIdentification(&config);
+  if (!slid) {
+    fprintf(stderr, "Failed to create spoken language identifier\n");
+    return -1;
+  }
+
+  // You can find more test waves from
+  // https://hf-mirror.com/spaces/k2-fsa/spoken-language-identification/tree/main/test_wavs
+  const char *wav_filename = "./sherpa-onnx-whisper-tiny/test_wavs/0.wav";
+  const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
+  if (wave == NULL) {
+    fprintf(stderr, "Failed to read %s\n", wav_filename);
+    SherpaOnnxDestroySpokenLanguageIdentification(slid);  // do not leak it
+    return -1;
+  }
+
+  SherpaOnnxOfflineStream *stream =
+      SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream(slid);
+
+  AcceptWaveformOffline(stream, wave->sample_rate, wave->samples,
+                        wave->num_samples);
+
+  const SherpaOnnxSpokenLanguageIdentificationResult *result =
+      SherpaOnnxSpokenLanguageIdentificationCompute(slid, stream);
+
+  fprintf(stderr, "wav_filename: %s\n", wav_filename);
+  fprintf(stderr, "Detected language: %s\n", result->lang);
+
+  SherpaOnnxDestroySpokenLanguageIdentificationResult(result);
+  DestroyOfflineStream(stream);
+  SherpaOnnxFreeWave(wave);
+  SherpaOnnxDestroySpokenLanguageIdentification(slid);
+  return 0;
+}
diff --git a/dotnet-examples/offline-decode-files/run-hotwords.sh b/dotnet-examples/offline-decode-files/run-hotwords.sh
@@ -3,7 +3,7 @@
 set -ex
 
 if [ ! -d ./sherpa-onnx-zipformer-en-2023-04-01 ]; then
-  wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-04-01.tar.bz2
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-04-01.tar.bz2
   tar xvf sherpa-onnx-zipformer-en-2023-04-01.tar.bz2
   rm sherpa-onnx-zipformer-en-2023-04-01.tar.bz2
 fi

diff --git a/dotnet-examples/offline-decode-files/run-zipformer.sh b/dotnet-examples/offline-decode-files/run-zipformer.sh
@@ -3,7 +3,7 @@
 set -ex
 
 if [ ! -d ./sherpa-onnx-zipformer-en-2023-04-01 ]; then
-  wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-04-01.tar.bz2
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-04-01.tar.bz2
   tar xvf sherpa-onnx-zipformer-en-2023-04-01.tar.bz2
   rm sherpa-onnx-zipformer-en-2023-04-01.tar.bz2
 fi

diff --git a/dotnet-examples/online-decode-files/run-transducer.sh b/dotnet-examples/online-decode-files/run-transducer.sh
@@ -6,7 +6,7 @@
 
 set -ex
 if [ ! -d ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 ]; then
-  wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
   tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
 fi