tts: fix RimeHttpTTSService/PiperTTSService 16-bit audio frames alignment

This commit is contained in:
Aleix Conchillo Flaqué
2025-10-10 12:09:20 -07:00
parent fdaa4e476e
commit 0473556992
4 changed files with 62 additions and 18 deletions

View File

@@ -5,6 +5,14 @@ All notable changes to **Pipecat** will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Fixed
- Fixed an issue where `RimeHttpTTSService` and `PiperTTSService` could generate
audio frames that were not aligned to 16-bit sample boundaries, potentially
leading to internal errors or static audio.
## [0.0.90] - 2025-10-10
### Added

View File

@@ -14,7 +14,6 @@ from loguru import logger
from pipecat.frames.frames import (
ErrorFrame,
Frame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
)
@@ -99,16 +98,15 @@ class PiperTTSService(TTSService):
await self.start_tts_usage_metrics(text)
yield TTSStartedFrame()
CHUNK_SIZE = self.chunk_size
yield TTSStartedFrame()
async for chunk in response.content.iter_chunked(CHUNK_SIZE):
# remove wav header if present
if chunk.startswith(b"RIFF"):
chunk = chunk[44:]
if len(chunk) > 0:
await self.stop_ttfb_metrics()
yield TTSAudioRawFrame(chunk, self.sample_rate, 1)
async for frame in self._stream_audio_frames_from_iterator(
response.content.iter_chunked(CHUNK_SIZE), strip_wav_header=True
):
await self.stop_ttfb_metrics()
yield frame
except Exception as e:
logger.error(f"Error in run_tts: {e}")
yield ErrorFrame(error=str(e))

View File

@@ -553,15 +553,13 @@ class RimeHttpTTSService(TTSService):
CHUNK_SIZE = self.chunk_size
async for chunk in response.content.iter_chunked(CHUNK_SIZE):
if need_to_strip_wav_header and chunk.startswith(b"RIFF"):
chunk = chunk[44:]
need_to_strip_wav_header = False
async for frame in self._stream_audio_frames_from_iterator(
response.content.iter_chunked(CHUNK_SIZE),
strip_wav_header=need_to_strip_wav_header,
):
await self.stop_ttfb_metrics()
yield frame
if len(chunk) > 0:
await self.stop_ttfb_metrics()
frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
yield frame
except Exception as e:
logger.exception(f"Error generating TTS: {e}")
yield ErrorFrame(error=f"Rime TTS error: {str(e)}")

View File

@@ -8,7 +8,17 @@
import asyncio
from abc import abstractmethod
from typing import Any, AsyncGenerator, Dict, List, Mapping, Optional, Sequence, Tuple
from typing import (
Any,
AsyncGenerator,
AsyncIterator,
Dict,
List,
Mapping,
Optional,
Sequence,
Tuple,
)
from loguru import logger
@@ -374,6 +384,36 @@ class TTSService(AIService):
):
await self._stop_frame_queue.put(frame)
async def _stream_audio_frames_from_iterator(
    self, iterator: AsyncIterator[bytes], *, strip_wav_header: bool
) -> AsyncGenerator[Frame, None]:
    """Re-chunk a stream of audio bytes into even-length audio frames.

    Incoming chunks may have any length, including odd lengths. Bytes are
    accumulated and only an even number of them is emitted at a time, so
    that no 2-byte (16-bit) sample is ever split across two frames. A single
    leftover byte is carried over to the next chunk.

    Args:
        iterator: Async iterator yielding chunks of audio bytes.
        strip_wav_header: When True, the first chunk that starts with
            ``b"RIFF"`` is treated as the beginning of a WAV header and
            the first 44 bytes from that point are dropped, even if they
            are spread over several chunks.

    Yields:
        ``TTSAudioRawFrame`` instances built with ``self.sample_rate`` and
        ``1`` as the third argument (presumably the channel count; confirm
        against ``TTSAudioRawFrame``). Every frame holds an even, non-zero
        number of bytes.
    """
    # Number of bytes removed when a RIFF header is detected.
    # NOTE(review): 44 is the size of the canonical PCM WAV header; headers
    # with extra sub-chunks are longer and are not handled here.
    wav_header_size = 44

    buffer = bytearray()
    need_to_strip_wav_header = strip_wav_header
    header_bytes_to_skip = 0

    async for chunk in iterator:
        if need_to_strip_wav_header and chunk.startswith(b"RIFF"):
            header_bytes_to_skip = wav_header_size
            need_to_strip_wav_header = False

        # The header may span several chunks when the transport delivers
        # fewer than 44 bytes at once, so keep skipping until it is gone.
        if header_bytes_to_skip > 0:
            skipped = min(header_bytes_to_skip, len(chunk))
            chunk = chunk[skipped:]
            header_bytes_to_skip -= skipped

        buffer.extend(chunk)

        # Round down to an even length (clear the lowest bit).
        aligned_length = len(buffer) & ~1
        if aligned_length > 0:
            aligned_chunk = bytes(buffer[:aligned_length])
            del buffer[:aligned_length]  # keep any leftover byte
            yield TTSAudioRawFrame(aligned_chunk, self.sample_rate, 1)

    # All even-length data was flushed inside the loop, so at most a single
    # byte can remain here. Pad it with a zero byte to complete the sample.
    if buffer:
        buffer.extend(b"\x00")
        yield TTSAudioRawFrame(bytes(buffer), self.sample_rate, 1)
async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
self._processing_text = False
await self._text_aggregator.handle_interruption()