Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Text to speech API for Object Pascal. #1273

Merged
merged 2 commits into from
Aug 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 21 additions & 5 deletions .github/workflows/pascal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -119,13 +119,29 @@ jobs:
cp -v install/lib/*.dll ../pascal-api-examples/vad
cp -v install/lib/*.dll ../pascal-api-examples/vad-with-non-streaming-asr

cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/read-wav
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/streaming-asr
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/non-streaming-asr
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad-with-non-streaming-asr
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/read-wav
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/streaming-asr
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/non-streaming-asr
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad-with-non-streaming-asr
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/tts
fi

# CI step for the Pascal text-to-speech example.
# It prepends a Lazarus-bundled FPC bin directory to PATH, enters
# pascal-api-examples/tts, runs ./run-piper.sh, then removes every
# vits-piper-* entry and lists what is left in the directory.
# NOTE(review): the PATH entry is a Windows-style location
# (/c/lazarus/...); presumably it is simply unused on other runners - confirm.
- name: Run Pascal test (TTS)
shell: bash
run: |
export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH

cd ./pascal-api-examples
pushd tts

./run-piper.sh
rm -rf vits-piper-*
ls -lh
echo "---"

popd

- name: Run Pascal test (VAD + non-streaming ASR)
shell: bash
run: |
Expand Down
1 change: 1 addition & 0 deletions pascal-api-examples/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
link*.res
2 changes: 2 additions & 0 deletions pascal-api-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,5 @@ https://k2-fsa.github.io/sherpa/onnx/pascal-api/index.html
|[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
|[vad](./vad)| It shows how to use the voice activity detection API.|
|[vad-with-non-streaming-asr](./vad-with-non-streaming-asr)| It shows how to use the voice activity detection API with non-streaming models for speech recognition.|
|[portaudio-test](./portaudio-test)| It shows how to use PortAudio for recording and playing.|
|[tts](./tts)| It shows how to use the text-to-speech API.|
4 changes: 4 additions & 0 deletions pascal-api-examples/tts/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
!run-*.sh
piper
piper-playback
link*.res
9 changes: 9 additions & 0 deletions pascal-api-examples/tts/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Introduction

This directory contains examples of how to use the TTS (text-to-speech) APIs.

|File| Description|
|---------|------------|
|[run-piper.sh](./run-piper.sh)|It shows how to use models from [piper](https://github.com/rhasspy/piper) for text to speech.|
|[run-piper-playback.sh](./run-piper-playback.sh)|It shows how to use models from [piper](https://github.com/rhasspy/piper) for text to speech. It plays the audio back while it is still being generated.|

238 changes: 238 additions & 0 deletions pascal-api-examples/tts/piper-playback.pas
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
{ Copyright (c) 2024 Xiaomi Corporation }
program piper;
{
This file shows how to use the text to speech API of sherpa-onnx
with Piper models.

It generates speech from text and saves it to a wave file.

Note that it also plays the audio back while it is still being generated.
}

{$mode objfpc}

uses
{$ifdef unix}
cthreads,
{$endif}
SysUtils,
dos,
ctypes,
portaudio,
sherpa_onnx;

var
CriticalSection: TRTLCriticalSection;

Tts: TSherpaOnnxOfflineTts;
Audio: TSherpaOnnxGeneratedAudio;
Resampler: TSherpaOnnxLinearResampler;

Text: AnsiString;
Speed: Single = 1.0; {Use a larger value to speak faster}
SpeakerId: Integer = 0;
Buffer: TSherpaOnnxCircularBuffer;
FinishedGeneration: Boolean = False;
FinishedPlaying: Boolean = False;

Version: String;
EnvStr: String;
Status: Integer;
NumDevices: Integer;
DeviceIndex: Integer;
DeviceInfo: PPaDeviceInfo;

{ If you get EDivByZero: Division by zero error, please change the sample rate
to one supported by your audio output device (speaker).
}
DeviceSampleRate: Integer = 48000;
I: Integer;
Param: TPaStreamParameters;
Stream: PPaStream;
Wave: TSherpaOnnxWave;

{ Callback passed to Tts.Generate; it receives N freshly generated samples.

  The samples are appended to the global Buffer while holding
  CriticalSection, because PlayCallback consumes the same buffer.
  When a Resampler exists, the samples are resampled before being pushed;
  otherwise they are pushed unchanged.

  Arg is not used.

  Returns 1, which means "continue generating" (0 would mean "stop"). }
function GenerateCallback(
  Samples: pcfloat; N: cint32;
  Arg: Pointer): cint; cdecl;
begin
  { Always ask the generator to keep going. }
  Result := 1;

  EnterCriticalSection(CriticalSection);
  try
    if not Assigned(Resampler) then
      Buffer.Push(Samples, N)
    else
      Buffer.Push(Resampler.Resample(Samples, N, False));
  finally
    LeaveCriticalSection(CriticalSection);
  end;
end;

{ PortAudio stream callback: fills `output` with frameCount float samples.

  Samples are taken from the global Buffer while holding CriticalSection
  (GenerateCallback pushes into the same buffer). If fewer than frameCount
  samples are available, the remainder of the output is padded with zeros.

  Returns paContinue while samples remain or generation is still running.
  Otherwise it sets FinishedPlaying and returns paComplete.

  input, timeInfo, statusFlags and userData are not used. }
function PlayCallback(
  input: Pointer; output: Pointer;
  frameCount: culong;
  timeInfo: PPaStreamCallbackTimeInfo;
  statusFlags: TPaStreamCallbackFlags;
  userData: Pointer ): cint; cdecl;
var
  Samples: TSherpaOnnxSamplesArray;
  NumFrames: Integer;
  NumAvailable: Integer;
  I: Integer;
begin
  { frameCount is unsigned. Convert it once to a signed count so that
    the loop bound NumFrames - 1 cannot wrap around when it is 0 and so that
    comparisons with the signed buffer size are not mixed-sign. }
  NumFrames := Integer(frameCount);

  EnterCriticalSection(CriticalSection);
  try
    { Take as many samples as are available, but at most NumFrames. }
    NumAvailable := Buffer.Size;
    if NumAvailable > NumFrames then
      NumAvailable := NumFrames;

    if NumAvailable > 0 then
    begin
      Samples := Buffer.Get(Buffer.Head, NumAvailable);
      Buffer.Pop(NumAvailable);
    end;

    { Growing a dynamic array zero-fills the new elements, so any part of the
      output that could not be served from the buffer becomes silence. }
    SetLength(Samples, NumFrames);

    for I := 0 to NumFrames - 1 do
      pcfloat(output)[I] := Samples[I];

    if (Buffer.Size > 0) or (not FinishedGeneration) then
      Result := paContinue
    else
    begin
      Result := paComplete;
      FinishedPlaying := True;
    end;
  finally
    LeaveCriticalSection(CriticalSection);
  end;
end;

{ Creates the offline TTS engine for the Piper libritts_r-medium VITS model.

  All model files are expected in ./vits-piper-en_US-libritts_r-medium,
  relative to the current working directory.
  NOTE(review): fields of Config that are not assigned here keep whatever
  initial value the record type gives them - confirm in sherpa_onnx.

  The caller owns the returned object and must free it. }
function GetOfflineTts: TSherpaOnnxOfflineTts;
const
  ModelDir = './vits-piper-en_US-libritts_r-medium';
var
  Config: TSherpaOnnxOfflineTtsConfig;
begin
  Config.MaxNumSentences := 1;

  Config.Model.Debug := False;
  Config.Model.NumThreads := 1;

  Config.Model.Vits.DataDir := ModelDir + '/espeak-ng-data';
  Config.Model.Vits.Tokens := ModelDir + '/tokens.txt';
  Config.Model.Vits.Model := ModelDir + '/en_US-libritts_r-medium.onnx';

  Result := TSherpaOnnxOfflineTts.Create(Config);
end;

{ Main program.

  1. Creates the TTS engine and, if its sample rate differs from
     DeviceSampleRate, a resampler.
  2. Initializes PortAudio and selects the output device. The default output
     device is used unless the environment variable SHERPA_ONNX_MIC_DEVICE
     holds a device index.
  3. Opens and starts an output stream driven by PlayCallback.
  4. Generates speech; GenerateCallback feeds samples to the player while
     generation is still running. The complete audio is also saved to
     ./libritts_r-generated.wav.
  5. Waits until playback has finished, then releases all resources. }
begin
  Tts := GetOfflineTts;
  if Tts.GetSampleRate <> DeviceSampleRate then
    Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);

  Version := String(Pa_GetVersionText);
  WriteLn('Version is ', Version);
  Status := Pa_Initialize;
  if Status <> paNoError then
  begin
    WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
    Exit;
  end;

  NumDevices := Pa_GetDeviceCount;
  WriteLn('Num devices: ', NumDevices);

  DeviceIndex := Pa_GetDefaultOutputDevice;

  if DeviceIndex = paNoDevice then
  begin
    WriteLn('No default output device found');
    Pa_Terminate;
    Exit;
  end;

  EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
  if EnvStr <> '' then
  begin
    DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);
    WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
  end;

  { The index may come from the environment, so validate it before it is
    used to look up device information. }
  if (DeviceIndex < 0) or (DeviceIndex >= NumDevices) then
  begin
    WriteLn('Invalid output device index: ', DeviceIndex);
    Pa_Terminate;
    Exit;
  end;

  { List all devices and mark the selected one with a star. }
  for I := 0 to (NumDevices - 1) do
  begin
    DeviceInfo := Pa_GetDeviceInfo(I);
    if I = DeviceIndex then
      WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
    else
      WriteLn(Format('   %d %s', [I, AnsiString(DeviceInfo^.Name)]));
  end;

  WriteLn('Use device ', DeviceIndex);
  WriteLn('  Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name);
  WriteLn('  Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels);

  Initialize(Param);
  Param.Device := DeviceIndex;
  Param.ChannelCount := 1;
  Param.SampleFormat := paFloat32;
  Param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency;
  Param.HostApiSpecificStreamInfo := nil;

  { Room for 30 seconds of audio at the device sample rate. }
  Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);

  { Everything the callbacks touch must exist before the stream is created. }
  InitCriticalSection(CriticalSection);

  { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
  Status := Pa_OpenStream(Stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,
    PPaStreamCallback(@PlayCallback), nil);

  if Status <> paNoError then
  begin
    WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
    DoneCriticalSection(CriticalSection);
    Pa_Terminate;
    Exit;
  end;

  Status := Pa_StartStream(Stream);
  if Status <> paNoError then
  begin
    WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
    Pa_CloseStream(Stream);
    DoneCriticalSection(CriticalSection);
    Pa_Terminate;
    Exit;
  end;

  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');

  Text := 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

  Audio := Tts.Generate(Text, SpeakerId, Speed,
    PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
  FinishedGeneration := True;
  SherpaOnnxWriteWave('./libritts_r-generated.wav', Audio.Samples, Audio.SampleRate);
  WriteLn('Saved to ./libritts_r-generated.wav');

  while not FinishedPlaying do
    Pa_Sleep(100);  {sleep for 0.1 second }
    {TODO(fangjun): Use an event to indicate the play is finished}

  { Close the stream first. PlayCallback sets FinishedPlaying while it still
    holds CriticalSection, so the lock and the buffer may only be destroyed
    once PortAudio can no longer be executing the callback. }
  Status := Pa_CloseStream(Stream);
  if Status <> paNoError then
  begin
    WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));
    Exit;
  end;

  DoneCriticalSection(CriticalSection);

  FreeAndNil(Buffer);
  FreeAndNil(Tts);
  FreeAndNil(Resampler);

  Status := Pa_Terminate;
  if Status <> paNoError then
  begin
    WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));
    Exit;
  end;
end.

54 changes: 54 additions & 0 deletions pascal-api-examples/tts/piper.pas
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{ Copyright (c) 2024 Xiaomi Corporation }
program piper;
{
This file shows how to use the text to speech API of sherpa-onnx
with Piper models.

It generates speech from text and saves it to a wave file.

If you want to play it while it is generating, please see
./piper-playback.pas
}

{$mode objfpc}

uses
SysUtils,
sherpa_onnx;

{ Creates the offline TTS engine for the Piper libritts_r-medium VITS model.

  All model files are expected in ./vits-piper-en_US-libritts_r-medium,
  relative to the current working directory.
  NOTE(review): fields of Config that are not assigned here keep whatever
  initial value the record type gives them - confirm in sherpa_onnx.

  The caller owns the returned object and must free it. }
function GetOfflineTts: TSherpaOnnxOfflineTts;
const
  ModelDir = './vits-piper-en_US-libritts_r-medium';
var
  Config: TSherpaOnnxOfflineTtsConfig;
begin
  Config.MaxNumSentences := 1;

  Config.Model.Debug := False;
  Config.Model.NumThreads := 1;

  Config.Model.Vits.DataDir := ModelDir + '/espeak-ng-data';
  Config.Model.Vits.Tokens := ModelDir + '/tokens.txt';
  Config.Model.Vits.Model := ModelDir + '/en_US-libritts_r-medium.onnx';

  Result := TSherpaOnnxOfflineTts.Create(Config);
end;

const
  { Where the generated speech is written. }
  OutputWave = './libritts_r-generated.wav';

var
  Tts: TSherpaOnnxOfflineTts;
  Audio: TSherpaOnnxGeneratedAudio;

  Text: AnsiString;
  Speed: Single = 1.0;  {Use a larger value to speak faster}
  SpeakerId: Integer = 0;

{ Main program: create the TTS engine, report how many speakers the model
  has, synthesize one fixed text and save the result as a wave file. }
begin
  Tts := GetOfflineTts;
  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');

  Text := 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';
  Audio := Tts.Generate(Text, SpeakerId, Speed);

  SherpaOnnxWriteWave(OutputWave, Audio.Samples, Audio.SampleRate);
  WriteLn('Saved to ', OutputWave);

  FreeAndNil(Tts);
end.

45 changes: 45 additions & 0 deletions pascal-api-examples/tts/run-piper-playback.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared C API library (only if it is not installed
# yet), downloads the Piper libritts_r-medium model (only if missing),
# compiles ./piper-playback.pas with fpc and runs the resulting program.
#
# The script changes into its own directory first, so it can be started from
# anywhere.
#
# Optional environment variable:
#   PORTAUDIO_LIB_DIR  directory that contains the PortAudio library
#                      (default: /usr/local/Cellar/portaudio/19.7.0/lib)

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd -- "$SCRIPT_DIR/../.." && pwd)
BUILD_DIR=$SHERPA_ONNX_DIR/build
LIB_DIR=$BUILD_DIR/install/lib
PORTAUDIO_LIB_DIR=${PORTAUDIO_LIB_DIR:-/usr/local/Cellar/portaudio/19.7.0/lib}

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# The model directory, the Pascal source and the generated binary are all
# addressed relative to this directory.
cd -- "$SCRIPT_DIR"

# Build the C API only when no shared library for any platform is installed.
if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

if [[ ! -f ./vits-piper-en_US-libritts_r-medium/tokens.txt ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-libritts_r-medium.tar.bz2
  tar xf vits-piper-en_US-libritts_r-medium.tar.bz2
  rm vits-piper-en_US-libritts_r-medium.tar.bz2
fi

# Please see ../portaudio-test/README.md
# for how to install portaudio on macOS
fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  "-Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  "-Fl$LIB_DIR" \
  "-Fl$PORTAUDIO_LIB_DIR" \
  ./piper-playback.pas

# Append the previous value only when it is non-empty, so that no empty
# path component ends up in the search path.
export LD_LIBRARY_PATH="$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DYLD_LIBRARY_PATH="$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./piper-playback
Loading
Loading