Merge pull request #857 from pipecat-ai/aleix/fix-riva-tts-audio-stuttering

riva: fix FastPitchTTSService audio stuttering
2024-12-12 22:20:00 -08:00
parent 2b8c35c681 420ce16807
commit 8f24ca4e58
2 changed files with 17 additions and 6 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Fixed

+- Fixed an audio stuttering issue in `FastPitchTTSService`.
+
 - Fixed a `BaseOutputTransport` issue that was causing non-audio frames being
  processed before the previous audio frames were played. This will allow, for
  example, sending a frame `A` after a `TTSSpeakFrame` and the frame `A` will
--- a/src/pipecat/services/riva.py
+++ b/src/pipecat/services/riva.py
@@ -76,7 +76,10 @@ class FastPitchTTSService(TTSService):
        )

    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
-        def read_audio_responses():
+        def read_audio_responses(queue: asyncio.Queue):
+            def add_response(r):
+                asyncio.run_coroutine_threadsafe(queue.put(r), self.get_event_loop())
+
            try:
                responses = self._service.synthesize_online(
                    text,
@@ -87,26 +90,32 @@ class FastPitchTTSService(TTSService):
                    quality=self._quality,
                    custom_dictionary={},
                )
-                return responses
+                for r in responses:
+                    add_response(r)
+                add_response(None)
            except Exception as e:
                logger.error(f"{self} exception: {e}")
-                return []
+                add_response(None)

        await self.start_ttfb_metrics()
        yield TTSStartedFrame()

        logger.debug(f"Generating TTS: [{text}]")
-        responses = await asyncio.to_thread(read_audio_responses)

-        for resp in responses:
+        queue = asyncio.Queue()
+        await asyncio.to_thread(read_audio_responses, queue)
+
+        # Wait for the thread to start.
+        resp = await queue.get()
+        while resp:
            await self.stop_ttfb_metrics()
-
            frame = TTSAudioRawFrame(
                audio=resp.audio,
                sample_rate=self._sample_rate,
                num_channels=1,
            )
            yield frame
+            resp = await queue.get()

        await self.start_tts_usage_metrics(text)
        yield TTSStoppedFrame()