diff --git a/.github/scripts/aishell3/TTS/run.sh b/.github/scripts/aishell3/TTS/run.sh index 81fba1de44..93ff695728 100755 --- a/.github/scripts/aishell3/TTS/run.sh +++ b/.github/scripts/aishell3/TTS/run.sh @@ -39,6 +39,13 @@ function prepare_data() { echo "------------------------------" wc -l data/tokens.txt echo "------------------------------" + + echo "----------lexicon.txt----------" + head data/lexicon.txt + echo "----" + tail data/lexicon.txt + echo "----" + wc -l data/lexicon.txt } function train() { @@ -47,7 +54,8 @@ function train() { git diff . popd - for t in low medium high; do + # for t in low medium high; do + for t in low; do ./vits/train.py \ --exp-dir vits/exp-$t \ --model-type $t \ @@ -62,12 +70,13 @@ function train() { } function export_onnx() { - for t in low medium high; do + # for t in low medium high; do + for t in low; do ./vits/export-onnx.py \ --model-type $t \ --epoch 1 \ --exp-dir ./vits/exp-$t \ - --tokens data/tokens.txt + --tokens data/tokens.txt \ --speakers ./data/speakers.txt ls -lh vits/exp-$t/ @@ -75,7 +84,30 @@ function export_onnx() { } function test_low() { - echo "TODO" + git clone https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06 + repo=icefall-tts-aishell3-vits-low-2024-04-06 + + ./vits/export-onnx.py \ + --model-type low \ + --epoch 1000 \ + --exp-dir $repo/exp \ + --tokens $repo/data/tokens.txt \ + --speakers $repo/data/speakers.txt + + ls -lh $repo/exp/vits-epoch-1000.onnx + + python3 -m pip install sherpa-onnx + + sherpa-onnx-offline-tts \ + --vits-model=$repo/exp/vits-epoch-1000.onnx \ + --vits-tokens=$repo/data/tokens.txt \ + --vits-lexicon=$repo/data/lexicon.txt \ + --num-threads=1 \ + --vits-length-scale=1.0 \ + --sid=33 \ + --output-filename=/icefall/low.wav \ + --debug=1 \ + "这是一个语音合成测试" } diff --git a/.github/workflows/aishell3.yml b/.github/workflows/aishell3.yml index e60c85f4d4..542c77663d 100644 --- a/.github/workflows/aishell3.yml +++ b/.github/workflows/aishell3.yml @@ -1,4 +1,4 @@ 
-name: aishell +name: aishell3 on: push: @@ -71,3 +71,14 @@ jobs: git config --global --add safe.directory /icefall .github/scripts/aishell3/TTS/run.sh + + - name: display files + shell: bash + run: | + ls -lh + + - uses: actions/upload-artifact@v4 + if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0' + with: + name: generated-test-files-${{ matrix.python-version }}-${{ matrix.torch-version }} + path: ./*.wav diff --git a/egs/aishell3/TTS/local/generate_lexicon.py b/egs/aishell3/TTS/local/generate_lexicon.py new file mode 100755 index 0000000000..77dd77d625 --- /dev/null +++ b/egs/aishell3/TTS/local/generate_lexicon.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +""" +This file generates the file lexicon.txt that contains pronunciations of all +words and phrases +""" + +import argparse + +from pypinyin import phrases_dict, pinyin_dict +from tokenizer import Tokenizer + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument( + "--tokens", + type=str, + default="data/tokens.txt", + help="""Path to vocabulary.""", + ) + + parser.add_argument( + "--lexicon", + type=str, + default="data/lexicon.txt", + help="""Path to save the generated lexicon.""", + ) + return parser + + +def main(): + args = get_parser().parse_args() + filename = args.lexicon + tokens = args.tokens + tokenizer = Tokenizer(tokens) + + word_dict = pinyin_dict.pinyin_dict + phrases = phrases_dict.phrases_dict + + with open(filename, "w", encoding="utf-8") as f: + for key in word_dict: + if not (0x4E00 <= key <= 0x9FFF): + continue + + w = chr(key) + + # 1 to remove the initial sil + # :-1 to remove the final eos + tokens = tokenizer.text_to_tokens(w)[1:-1] + + tokens = " ".join(tokens) + f.write(f"{w} {tokens}\n") + + # TODO(fangjun): Add phrases + # for key in phrases: + # # 1 to remove the initial sil + # # :-1 to remove the final eos + # tokens = tokenizer.text_to_tokens(key)[1:-1] + # tokens
= " ".join(tokens) + # f.write(f"{key} {tokens}\n") + + +if __name__ == "__main__": + main() diff --git a/egs/aishell3/TTS/local/prepare_token_file.py b/egs/aishell3/TTS/local/prepare_token_file.py index d90910ab02..57ef837b82 100755 --- a/egs/aishell3/TTS/local/prepare_token_file.py +++ b/egs/aishell3/TTS/local/prepare_token_file.py @@ -17,7 +17,7 @@ """ -This file generates the file that maps tokens to IDs. +This file generates the file tokens.txt that maps tokens to IDs. """ import argparse diff --git a/egs/aishell3/TTS/prepare.sh b/egs/aishell3/TTS/prepare.sh index fe3f762054..db721e67fa 100755 --- a/egs/aishell3/TTS/prepare.sh +++ b/egs/aishell3/TTS/prepare.sh @@ -121,10 +121,14 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then fi if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then - log "Stage 6: Generate token file" + log "Stage 6: Generate tokens.txt and lexicon.txt " if [ ! -e data/tokens.txt ]; then ./local/prepare_token_file.py --tokens data/tokens.txt fi + + if [ ! -e data/lexicon.txt ]; then + ./local/generate_lexicon.py --tokens data/tokens.txt --lexicon data/lexicon.txt + fi fi if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then diff --git a/egs/aishell3/TTS/vits/export-onnx.py b/egs/aishell3/TTS/vits/export-onnx.py index ed5a1c6a33..a2afcaeca6 100755 --- a/egs/aishell3/TTS/vits/export-onnx.py +++ b/egs/aishell3/TTS/vits/export-onnx.py @@ -84,7 +84,7 @@ def get_parser(): parser.add_argument( "--model-type", type=str, - default="medium", + default="low", choices=["low", "medium", "high"], help="""If not empty, valid values are: low, medium, high. It controls the model size. low -> runs faster. 
diff --git a/egs/aishell3/TTS/vits/train.py b/egs/aishell3/TTS/vits/train.py index b92386e37d..ad30384855 100755 --- a/egs/aishell3/TTS/vits/train.py +++ b/egs/aishell3/TTS/vits/train.py @@ -156,7 +156,7 @@ def get_parser(): parser.add_argument( "--model-type", type=str, - default="medium", + default="low", choices=["low", "medium", "high"], help="""If not empty, valid values are: low, medium, high. It controls the model size. low -> runs faster.