Add config for TensorRT and CUDA execution provider (#992)

Signed-off-by: [email protected] <[email protected]> Signed-off-by: [email protected] <[email protected]>
k2-fsa · Jul 5, 2024 · 55decb7 · 55decb7
1 parent f5e9a16
commit 55decb7
Show file tree

Hide file tree

Showing 21 changed files with 622 additions and 49 deletions.
diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc
@@ -73,7 +73,7 @@ SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer(
       SHERPA_ONNX_OR(config->model_config.tokens, "");
   recognizer_config.model_config.num_threads =
       SHERPA_ONNX_OR(config->model_config.num_threads, 1);
-  recognizer_config.model_config.provider =
+  recognizer_config.model_config.provider_config.provider =
       SHERPA_ONNX_OR(config->model_config.provider, "cpu");
   recognizer_config.model_config.model_type =
       SHERPA_ONNX_OR(config->model_config.model_type, "");
@@ -570,7 +570,7 @@ SherpaOnnxKeywordSpotter *CreateKeywordSpotter(
       SHERPA_ONNX_OR(config->model_config.tokens, "");
   spotter_config.model_config.num_threads =
       SHERPA_ONNX_OR(config->model_config.num_threads, 1);
-  spotter_config.model_config.provider =
+  spotter_config.model_config.provider_config.provider =
       SHERPA_ONNX_OR(config->model_config.provider, "cpu");
   spotter_config.model_config.model_type =
       SHERPA_ONNX_OR(config->model_config.model_type, "");

diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt
@@ -87,6 +87,7 @@ set(sources
   packed-sequence.cc
   pad-sequence.cc
   parse-options.cc
+  provider-config.cc
   provider.cc
   resample.cc
   session.cc

diff --git a/sherpa-onnx/csrc/online-model-config.cc b/sherpa-onnx/csrc/online-model-config.cc
@@ -16,6 +16,7 @@ void OnlineModelConfig::Register(ParseOptions *po) {
   wenet_ctc.Register(po);
   zipformer2_ctc.Register(po);
   nemo_ctc.Register(po);
+  provider_config.Register(po);
 
   po->Register("tokens", &tokens, "Path to tokens.txt");
 
@@ -29,9 +30,6 @@ void OnlineModelConfig::Register(ParseOptions *po) {
   po->Register("debug", &debug,
                "true to print model information while loading it.");
 
-  po->Register("provider", &provider,
-               "Specify a provider to use: cpu, cuda, coreml");
-
   po->Register("modeling-unit", &modeling_unit,
                "The modeling unit of the model, commonly used units are bpe, "
                "cjkchar, cjkchar+bpe, etc. Currently, it is needed only when "
@@ -87,6 +85,10 @@ bool OnlineModelConfig::Validate() const {
     return nemo_ctc.Validate();
   }
 
+  if (!provider_config.Validate()) {
+    return false;
+  }
+
   return transducer.Validate();
 }
 
@@ -99,11 +101,11 @@ std::string OnlineModelConfig::ToString() const {
   os << "wenet_ctc=" << wenet_ctc.ToString() << ", ";
   os << "zipformer2_ctc=" << zipformer2_ctc.ToString() << ", ";
   os << "nemo_ctc=" << nemo_ctc.ToString() << ", ";
+  os << "provider_config=" << provider_config.ToString() << ", ";
   os << "tokens=\"" << tokens << "\", ";
   os << "num_threads=" << num_threads << ", ";
   os << "warm_up=" << warm_up << ", ";
   os << "debug=" << (debug ? "True" : "False") << ", ";
-  os << "provider=\"" << provider << "\", ";
   os << "model_type=\"" << model_type << "\", ";
   os << "modeling_unit=\"" << modeling_unit << "\", ";
   os << "bpe_vocab=\"" << bpe_vocab << "\")";

diff --git a/sherpa-onnx/csrc/online-model-config.h b/sherpa-onnx/csrc/online-model-config.h
@@ -11,6 +11,7 @@
 #include "sherpa-onnx/csrc/online-transducer-model-config.h"
 #include "sherpa-onnx/csrc/online-wenet-ctc-model-config.h"
 #include "sherpa-onnx/csrc/online-zipformer2-ctc-model-config.h"
+#include "sherpa-onnx/csrc/provider-config.h"
 
 namespace sherpa_onnx {
 
@@ -20,11 +21,11 @@ struct OnlineModelConfig {
   OnlineWenetCtcModelConfig wenet_ctc;
   OnlineZipformer2CtcModelConfig zipformer2_ctc;
   OnlineNeMoCtcModelConfig nemo_ctc;
+  ProviderConfig provider_config;
   std::string tokens;
   int32_t num_threads = 1;
   int32_t warm_up = 0;
   bool debug = false;
-  std::string provider = "cpu";
 
   // Valid values:
   //  - conformer, conformer transducer from icefall
@@ -50,8 +51,9 @@ struct OnlineModelConfig {
                     const OnlineWenetCtcModelConfig &wenet_ctc,
                     const OnlineZipformer2CtcModelConfig &zipformer2_ctc,
                     const OnlineNeMoCtcModelConfig &nemo_ctc,
+                    const ProviderConfig &provider_config,
                     const std::string &tokens, int32_t num_threads,
-                    int32_t warm_up, bool debug, const std::string &provider,
+                    int32_t warm_up, bool debug,
                     const std::string &model_type,
                     const std::string &modeling_unit,
                     const std::string &bpe_vocab)
@@ -60,11 +62,11 @@ struct OnlineModelConfig {
         wenet_ctc(wenet_ctc),
         zipformer2_ctc(zipformer2_ctc),
         nemo_ctc(nemo_ctc),
+        provider_config(provider_config),
         tokens(tokens),
         num_threads(num_threads),
         warm_up(warm_up),
         debug(debug),
-        provider(provider),
         model_type(model_type),
         modeling_unit(modeling_unit),
         bpe_vocab(bpe_vocab) {}

diff --git a/sherpa-onnx/csrc/provider-config.cc b/sherpa-onnx/csrc/provider-config.cc
@@ -0,0 +1,143 @@
+// sherpa-onnx/csrc/provider-config.cc
+//
+// Copyright (c)  2024  Uniphore (Author: Manickavela)
+
+#include "sherpa-onnx/csrc/provider-config.h"
+
+#include <sstream>
+
+#include "sherpa-onnx/csrc/file-utils.h"
+#include "sherpa-onnx/csrc/macros.h"
+
+namespace sherpa_onnx {
+
+// Registers the CUDA execution-provider options with the command-line parser.
+//
+// po receives the option --cuda-cudnn-conv-algo-search, which is bound to the
+// member cudnn_conv_algo_search.
+void CudaConfig::Register(ParseOptions *po) {
+  po->Register("cuda-cudnn-conv-algo-search", &cudnn_conv_algo_search,
+               "CuDNN convolution algorithm search");
+}
+
+// Checks that cudnn_conv_algo_search lies in the accepted range [1, 3].
+//
+// Returns true when the value is accepted; otherwise logs an error and
+// returns false.
+//
+// NOTE(review): the member defaults to OrtCudnnConvAlgoSearchHeuristic. Please
+// confirm that [1, 3] really matches the numeric values of the ONNX Runtime
+// OrtCudnnConvAlgoSearch enum before relying on this range.
+bool CudaConfig::Validate() const {
+  if (cudnn_conv_algo_search < 1 || cudnn_conv_algo_search > 3) {
+    // The two literals are concatenated by the compiler, so the first one
+    // must end with a space to keep the sentences apart in the log output.
+    SHERPA_ONNX_LOGE(
+        "cudnn_conv_algo_search: '%d' is not a valid option. "
+        "Options : [1,3]. Check OnnxRT docs",
+        cudnn_conv_algo_search);
+    return false;
+  }
+  return true;
+}
+
+// Returns a one-line, human-readable dump of this config, formatted as
+// "CudaConfig(cudnn_conv_algo_search=<value>)".
+std::string CudaConfig::ToString() const {
+  std::ostringstream oss;
+  oss << "CudaConfig("
+      << "cudnn_conv_algo_search=" << cudnn_conv_algo_search << ")";
+  return oss.str();
+}
+
+// Registers every TensorRT execution-provider option (all prefixed with
+// "trt-") with the command-line parser, binding each option to the member of
+// the corresponding name.
+void TensorrtConfig::Register(ParseOptions *po) {
+  ParseOptions &opts = *po;
+
+  // Integer limits.
+  opts.Register("trt-max-workspace-size", &trt_max_workspace_size,
+                "Set TensorRT EP GPU memory usage limit.");
+  opts.Register("trt-max-partition-iterations", &trt_max_partition_iterations,
+                "Limit partitioning iterations for model conversion.");
+  opts.Register("trt-min-subgraph-size", &trt_min_subgraph_size,
+                "Set minimum size for subgraphs in partitioning.");
+
+  // Boolean switches.
+  opts.Register("trt-fp16-enable", &trt_fp16_enable,
+                "Enable FP16 precision for faster performance.");
+  opts.Register("trt-detailed-build-log", &trt_detailed_build_log,
+                "Enable detailed logging of build steps.");
+  opts.Register("trt-engine-cache-enable", &trt_engine_cache_enable,
+                "Enable caching of TensorRT engines.");
+  opts.Register("trt-timing-cache-enable", &trt_timing_cache_enable,
+                "Enable use of timing cache to speed up builds.");
+
+  // Cache locations.
+  opts.Register("trt-engine-cache-path", &trt_engine_cache_path,
+                "Set path to store cached TensorRT engines.");
+  opts.Register("trt-timing-cache-path", &trt_timing_cache_path,
+                "Set path for storing timing cache.");
+
+  // Debugging aid.
+  opts.Register("trt-dump-subgraphs", &trt_dump_subgraphs,
+                "Dump optimized subgraphs for debugging.");
+}
+
+// Rejects negative values for the three integer TensorRT options.
+//
+// The options are checked in declaration order; the first negative one is
+// logged and causes false to be returned. Returns true when all three are
+// non-negative.
+bool TensorrtConfig::Validate() const {
+  if (trt_max_workspace_size < 0) {
+    SHERPA_ONNX_LOGE("trt_max_workspace_size: %d is not valid.",
+                     trt_max_workspace_size);
+  } else if (trt_max_partition_iterations < 0) {
+    SHERPA_ONNX_LOGE("trt_max_partition_iterations: %d is not valid.",
+                     trt_max_partition_iterations);
+  } else if (trt_min_subgraph_size < 0) {
+    SHERPA_ONNX_LOGE("trt_min_subgraph_size: %d is not valid.",
+                     trt_min_subgraph_size);
+  } else {
+    return true;
+  }
+
+  // One of the branches above reported an invalid value.
+  return false;
+}
+
+// Returns a one-line, human-readable dump of every TensorRT option, formatted
+// as "TensorrtConfig(name=value, ...)". Booleans are rendered as the quoted
+// strings "True"/"False", and the cache paths are quoted as well. Fields are
+// uniformly separated by ", " and the closing parenthesis follows the last
+// value directly.
+std::string TensorrtConfig::ToString() const {
+  const auto as_text = [](bool flag) { return flag ? "True" : "False"; };
+
+  std::ostringstream os;
+
+  os << "TensorrtConfig(";
+  os << "trt_max_workspace_size=" << trt_max_workspace_size << ", ";
+  os << "trt_max_partition_iterations=" << trt_max_partition_iterations
+     << ", ";
+  os << "trt_min_subgraph_size=" << trt_min_subgraph_size << ", ";
+  os << "trt_fp16_enable=\"" << as_text(trt_fp16_enable) << "\", ";
+  os << "trt_detailed_build_log=\"" << as_text(trt_detailed_build_log)
+     << "\", ";
+  os << "trt_engine_cache_enable=\"" << as_text(trt_engine_cache_enable)
+     << "\", ";
+  os << "trt_engine_cache_path=\"" << trt_engine_cache_path << "\", ";
+  os << "trt_timing_cache_enable=\"" << as_text(trt_timing_cache_enable)
+     << "\", ";
+  os << "trt_timing_cache_path=\"" << trt_timing_cache_path << "\", ";
+  os << "trt_dump_subgraphs=\"" << as_text(trt_dump_subgraphs) << "\")";
+  return os.str();
+}
+
+// Registers the provider-level options (--device, --provider) together with
+// the options of the nested CUDA and TensorRT configs.
+void ProviderConfig::Register(ParseOptions *po) {
+  cuda_config.Register(po);
+  trt_config.Register(po);
+
+  po->Register("device", &device, "GPU device index for CUDA and Trt EP");
+  // "trt" is listed because Validate() treats it as a known provider name.
+  po->Register("provider", &provider,
+               "Specify a provider to use: cpu, cuda, coreml, trt");
+}
+
+// Validates the provider settings.
+//
+// A negative device index is always rejected (and logged). The nested CUDA
+// or TensorRT config is validated only when provider is "cuda" or "trt"
+// respectively; any other provider name is accepted as is.
+bool ProviderConfig::Validate() const {
+  if (device < 0) {
+    SHERPA_ONNX_LOGE("device: '%d' is invalid.", device);
+    return false;
+  }
+
+  // The two provider names are mutually exclusive, so at most one of the
+  // nested configs needs to be consulted.
+  if (provider == "cuda") {
+    return cuda_config.Validate();
+  }
+
+  if (provider == "trt") {
+    return trt_config.Validate();
+  }
+
+  return true;
+}
+
+// Returns a one-line, human-readable dump of the provider settings, including
+// the dumps of the nested CUDA and TensorRT configs.
+std::string ProviderConfig::ToString() const {
+  std::ostringstream oss;
+  oss << "ProviderConfig("
+      << "device=" << device << ", "
+      << "provider=\"" << provider << "\", "
+      << "cuda_config=" << cuda_config.ToString() << ", "
+      << "trt_config=" << trt_config.ToString() << ")";
+  return oss.str();
+}
+
+}  // namespace sherpa_onnx
diff --git a/sherpa-onnx/csrc/provider-config.h b/sherpa-onnx/csrc/provider-config.h
@@ -0,0 +1,95 @@
+// sherpa-onnx/csrc/provider-config.h
+//
+// Copyright (c)  2024  Uniphore (Author: Manickavela)
+
+#ifndef SHERPA_ONNX_CSRC_PROVIDER_CONFIG_H_
+#define SHERPA_ONNX_CSRC_PROVIDER_CONFIG_H_
+
+#include <string>
+
+#include "sherpa-onnx/csrc/parse-options.h"
+#include "sherpa-onnx/csrc/macros.h"
+#include "onnxruntime_cxx_api.h"  // NOLINT
+
+namespace sherpa_onnx {
+
+// Options that apply only to the CUDA execution provider.
+struct CudaConfig {
+  // cuDNN convolution algorithm search mode, stored as a plain integer.
+  // Settable via --cuda-cudnn-conv-algo-search.
+  int32_t cudnn_conv_algo_search = OrtCudnnConvAlgoSearchHeuristic;
+
+  CudaConfig() = default;
+
+  explicit CudaConfig(int32_t cudnn_conv_algo_search)
+      : cudnn_conv_algo_search{cudnn_conv_algo_search} {}
+
+  // Adds the command-line option for this config to po.
+  void Register(ParseOptions *po);
+
+  // Returns false (after logging) if a member holds an unsupported value.
+  bool Validate() const;
+
+  // Returns a human-readable dump of the members.
+  std::string ToString() const;
+};
+
+// Options that apply only to the TensorRT execution provider. Every member
+// can be set from the command line through an option with the same name,
+// using dashes instead of underscores (e.g. --trt-fp16-enable).
+struct TensorrtConfig {
+  // TensorRT EP GPU memory usage limit.
+  int32_t trt_max_workspace_size{2147483647};
+  // Limit on partitioning iterations for model conversion.
+  int32_t trt_max_partition_iterations{10};
+  // Minimum size for subgraphs in partitioning.
+  int32_t trt_min_subgraph_size{5};
+  // Enable FP16 precision.
+  bool trt_fp16_enable{true};
+  // Enable detailed logging of build steps.
+  bool trt_detailed_build_log{false};
+  // Enable caching of TensorRT engines.
+  bool trt_engine_cache_enable{true};
+  // Enable use of the timing cache.
+  bool trt_timing_cache_enable{true};
+  // Where cached TensorRT engines are stored.
+  std::string trt_engine_cache_path{"."};
+  // Where the timing cache is stored.
+  std::string trt_timing_cache_path{"."};
+  // Dump optimized subgraphs for debugging.
+  bool trt_dump_subgraphs{false};
+
+  TensorrtConfig() = default;
+
+  // Initializes every member from the argument of the same name.
+  TensorrtConfig(int32_t trt_max_workspace_size,
+                 int32_t trt_max_partition_iterations,
+                 int32_t trt_min_subgraph_size, bool trt_fp16_enable,
+                 bool trt_detailed_build_log, bool trt_engine_cache_enable,
+                 bool trt_timing_cache_enable,
+                 const std::string &trt_engine_cache_path,
+                 const std::string &trt_timing_cache_path,
+                 bool trt_dump_subgraphs)
+      : trt_max_workspace_size(trt_max_workspace_size),
+        trt_max_partition_iterations(trt_max_partition_iterations),
+        trt_min_subgraph_size(trt_min_subgraph_size),
+        trt_fp16_enable(trt_fp16_enable),
+        trt_detailed_build_log(trt_detailed_build_log),
+        trt_engine_cache_enable(trt_engine_cache_enable),
+        trt_timing_cache_enable(trt_timing_cache_enable),
+        trt_engine_cache_path(trt_engine_cache_path),
+        trt_timing_cache_path(trt_timing_cache_path),
+        trt_dump_subgraphs(trt_dump_subgraphs) {}
+
+  // Adds the "trt-*" command-line options to po.
+  void Register(ParseOptions *po);
+
+  // Returns false (after logging) if an integer option is negative.
+  bool Validate() const;
+
+  // Returns a human-readable dump of the members.
+  std::string ToString() const;
+};
+
+// Selects the execution provider and carries the provider-specific options.
+struct ProviderConfig {
+  // Options consulted when provider is "trt".
+  TensorrtConfig trt_config;
+  // Options consulted when provider is "cuda".
+  CudaConfig cuda_config;
+  // Name of the execution provider to use.
+  std::string provider{"cpu"};
+  // GPU device index; only used for cuda and trt.
+  int32_t device{0};
+
+  ProviderConfig() = default;
+
+  // Keeps the default CUDA/TensorRT options and sets only provider and device.
+  ProviderConfig(const std::string &provider, int32_t device)
+      : provider(provider), device(device) {}
+
+  // Initializes every member from the argument of the same name.
+  ProviderConfig(const TensorrtConfig &trt_config,
+                 const CudaConfig &cuda_config, const std::string &provider,
+                 int32_t device)
+      : trt_config(trt_config),
+        cuda_config(cuda_config),
+        provider(provider),
+        device(device) {}
+
+  // Adds --device, --provider and the nested configs' options to po.
+  void Register(ParseOptions *po);
+
+  // Returns false (after logging) if device is negative or the config of the
+  // selected provider is invalid.
+  bool Validate() const;
+
+  // Returns a human-readable dump of the members.
+  std::string ToString() const;
+};
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_CSRC_PROVIDER_CONFIG_H_
diff --git a/sherpa-onnx/csrc/provider.h b/sherpa-onnx/csrc/provider.h
@@ -7,6 +7,7 @@
 
 #include <string>
 
+#include "sherpa-onnx/csrc/provider-config.h"
 namespace sherpa_onnx {
 
 // Please refer to