From 0473556992f7142fc864b4c44a6455122b59d609 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= Date: Fri, 10 Oct 2025 12:09:20 -0700 Subject: [PATCH] tts: fix RimeHttpTTSService/PiperTTSService 16-bit audio frames alignment --- CHANGELOG.md | 8 ++++++ src/pipecat/services/piper/tts.py | 16 +++++------ src/pipecat/services/rime/tts.py | 14 +++++----- src/pipecat/services/tts_service.py | 42 ++++++++++++++++++++++++++++- 4 files changed, 62 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e92361569..797ca06ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,14 @@ All notable changes to **Pipecat** will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Fixed + +- Fixed an issue where `RimeHttpTTSService` and `PiperTTSService` could generate + audio frames that were not 16-bit aligned, potentially leading to internal + errors or static audio. 
+ ## [0.0.90] - 2025-10-10 ### Added diff --git a/src/pipecat/services/piper/tts.py b/src/pipecat/services/piper/tts.py index d5b663c77..fa43a720c 100644 --- a/src/pipecat/services/piper/tts.py +++ b/src/pipecat/services/piper/tts.py @@ -14,7 +14,6 @@ from loguru import logger from pipecat.frames.frames import ( ErrorFrame, Frame, - TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, ) @@ -99,16 +98,15 @@ class PiperTTSService(TTSService): await self.start_tts_usage_metrics(text) + yield TTSStartedFrame() + CHUNK_SIZE = self.chunk_size - yield TTSStartedFrame() - async for chunk in response.content.iter_chunked(CHUNK_SIZE): - # remove wav header if present - if chunk.startswith(b"RIFF"): - chunk = chunk[44:] - if len(chunk) > 0: - await self.stop_ttfb_metrics() - yield TTSAudioRawFrame(chunk, self.sample_rate, 1) + async for frame in self._stream_audio_frames_from_iterator( + response.content.iter_chunked(CHUNK_SIZE), strip_wav_header=True + ): + await self.stop_ttfb_metrics() + yield frame except Exception as e: logger.error(f"Error in run_tts: {e}") yield ErrorFrame(error=str(e)) diff --git a/src/pipecat/services/rime/tts.py b/src/pipecat/services/rime/tts.py index 917716545..1ac829ebd 100644 --- a/src/pipecat/services/rime/tts.py +++ b/src/pipecat/services/rime/tts.py @@ -553,15 +553,13 @@ class RimeHttpTTSService(TTSService): CHUNK_SIZE = self.chunk_size - async for chunk in response.content.iter_chunked(CHUNK_SIZE): - if need_to_strip_wav_header and chunk.startswith(b"RIFF"): - chunk = chunk[44:] - need_to_strip_wav_header = False + async for frame in self._stream_audio_frames_from_iterator( + response.content.iter_chunked(CHUNK_SIZE), + strip_wav_header=need_to_strip_wav_header, + ): + await self.stop_ttfb_metrics() + yield frame - if len(chunk) > 0: - await self.stop_ttfb_metrics() - frame = TTSAudioRawFrame(chunk, self.sample_rate, 1) - yield frame except Exception as e: logger.exception(f"Error generating TTS: {e}") yield ErrorFrame(error=f"Rime TTS error: 
{str(e)}") diff --git a/src/pipecat/services/tts_service.py b/src/pipecat/services/tts_service.py index 02b80b609..a60b50818 100644 --- a/src/pipecat/services/tts_service.py +++ b/src/pipecat/services/tts_service.py @@ -8,7 +8,17 @@ import asyncio from abc import abstractmethod -from typing import Any, AsyncGenerator, Dict, List, Mapping, Optional, Sequence, Tuple +from typing import ( + Any, + AsyncGenerator, + AsyncIterator, + Dict, + List, + Mapping, + Optional, + Sequence, + Tuple, +) from loguru import logger @@ -374,6 +384,36 @@ class TTSService(AIService): ): await self._stop_frame_queue.put(frame) + async def _stream_audio_frames_from_iterator( + self, iterator: AsyncIterator[bytes], *, strip_wav_header: bool + ) -> AsyncGenerator[Frame, None]: + buffer = bytearray() + need_to_strip_wav_header = strip_wav_header + async for chunk in iterator: + if need_to_strip_wav_header and chunk.startswith(b"RIFF"): + chunk = chunk[44:] + need_to_strip_wav_header = False + + # Append to buffer. NOTE(review): header strip assumes all 44 bytes are in one chunk. + buffer.extend(chunk) + + # Round down to an even byte count so frames hold whole 16-bit samples. + aligned_length = len(buffer) & ~1 # mask ...11110 clears the low bit + if aligned_length > 0: + aligned_chunk = buffer[:aligned_length] + buffer = buffer[aligned_length:] # keep any leftover byte + + if len(aligned_chunk) > 0: + frame = TTSAudioRawFrame(bytes(aligned_chunk), self.sample_rate, 1) + yield frame + + if len(buffer) > 0: + # At most one byte is left here; zero-pad it to a full 16-bit sample. + if len(buffer) % 2 == 1: + buffer.extend(b"\x00") + frame = TTSAudioRawFrame(bytes(buffer), self.sample_rate, 1) + yield frame + async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection): self._processing_text = False await self._text_aggregator.handle_interruption()