tts: fix RimeHttpTTSService/PiperTTSService 16-bit audio frames alignment
This commit is contained in:
@@ -5,6 +5,14 @@ All notable changes to **Pipecat** will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue where `RimeHttpTTSService` and `PiperTTSService` could generate
|
||||
audio frames that were not 16-bit aligned, potentially leading to internal
|
||||
errors or static audio.
|
||||
|
||||
## [0.0.90] - 2025-10-10
|
||||
|
||||
### Added
|
||||
|
||||
@@ -14,7 +14,6 @@ from loguru import logger
|
||||
from pipecat.frames.frames import (
|
||||
ErrorFrame,
|
||||
Frame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
@@ -99,16 +98,15 @@ class PiperTTSService(TTSService):
|
||||
|
||||
await self.start_tts_usage_metrics(text)
|
||||
|
||||
yield TTSStartedFrame()
|
||||
|
||||
CHUNK_SIZE = self.chunk_size
|
||||
|
||||
yield TTSStartedFrame()
|
||||
async for chunk in response.content.iter_chunked(CHUNK_SIZE):
|
||||
# remove wav header if present
|
||||
if chunk.startswith(b"RIFF"):
|
||||
chunk = chunk[44:]
|
||||
if len(chunk) > 0:
|
||||
await self.stop_ttfb_metrics()
|
||||
yield TTSAudioRawFrame(chunk, self.sample_rate, 1)
|
||||
async for frame in self._stream_audio_frames_from_iterator(
|
||||
response.content.iter_chunked(CHUNK_SIZE), strip_wav_header=True
|
||||
):
|
||||
await self.stop_ttfb_metrics()
|
||||
yield frame
|
||||
except Exception as e:
|
||||
logger.error(f"Error in run_tts: {e}")
|
||||
yield ErrorFrame(error=str(e))
|
||||
|
||||
@@ -553,15 +553,13 @@ class RimeHttpTTSService(TTSService):
|
||||
|
||||
CHUNK_SIZE = self.chunk_size
|
||||
|
||||
async for chunk in response.content.iter_chunked(CHUNK_SIZE):
|
||||
if need_to_strip_wav_header and chunk.startswith(b"RIFF"):
|
||||
chunk = chunk[44:]
|
||||
need_to_strip_wav_header = False
|
||||
async for frame in self._stream_audio_frames_from_iterator(
|
||||
response.content.iter_chunked(CHUNK_SIZE),
|
||||
strip_wav_header=need_to_strip_wav_header,
|
||||
):
|
||||
await self.stop_ttfb_metrics()
|
||||
yield frame
|
||||
|
||||
if len(chunk) > 0:
|
||||
await self.stop_ttfb_metrics()
|
||||
frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
|
||||
yield frame
|
||||
except Exception as e:
|
||||
logger.exception(f"Error generating TTS: {e}")
|
||||
yield ErrorFrame(error=f"Rime TTS error: {str(e)}")
|
||||
|
||||
@@ -8,7 +8,17 @@
|
||||
|
||||
import asyncio
|
||||
from abc import abstractmethod
|
||||
from typing import Any, AsyncGenerator, Dict, List, Mapping, Optional, Sequence, Tuple
|
||||
from typing import (
|
||||
Any,
|
||||
AsyncGenerator,
|
||||
AsyncIterator,
|
||||
Dict,
|
||||
List,
|
||||
Mapping,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
)
|
||||
|
||||
from loguru import logger
|
||||
|
||||
@@ -374,6 +384,36 @@ class TTSService(AIService):
|
||||
):
|
||||
await self._stop_frame_queue.put(frame)
|
||||
|
||||
async def _stream_audio_frames_from_iterator(
    self, iterator: AsyncIterator[bytes], *, strip_wav_header: bool
) -> AsyncGenerator[Frame, None]:
    """Yield 16-bit aligned audio frames from an async iterator of bytes.

    Incoming chunks may have any length, including odd lengths. Bytes are
    accumulated in a buffer so that every yielded frame carries an even
    number of bytes. A dangling byte is kept for the next chunk and, if
    one is still left when the iterator is exhausted, it is padded with a
    single zero byte and flushed.

    Args:
        iterator: Async iterator producing chunks of raw audio bytes.
        strip_wav_header: If True, the first chunk that starts with
            ``b"RIFF"`` is treated as the beginning of a 44-byte WAV
            header, and those 44 bytes are dropped from the output even
            when they are spread over several chunks.

    Yields:
        ``TTSAudioRawFrame`` objects built from the aligned bytes,
        ``self.sample_rate`` and ``1`` (presumably the channel count).
    """
    # Size of the WAV header this method assumes when stripping.
    wav_header_size = 44

    buffer = bytearray()
    need_to_strip_wav_header = strip_wav_header
    # Header bytes that still have to be discarded. Tracking this across
    # chunks keeps header bytes out of the audio when the chunk that
    # starts with "RIFF" is shorter than the header itself.
    header_bytes_left = 0

    async for chunk in iterator:
        if need_to_strip_wav_header and chunk.startswith(b"RIFF"):
            header_bytes_left = wav_header_size
            need_to_strip_wav_header = False

        if header_bytes_left > 0:
            skipped = min(header_bytes_left, len(chunk))
            chunk = chunk[skipped:]
            header_bytes_left -= skipped

        # Append to current buffer.
        buffer.extend(chunk)

        # Round down to an even number of bytes by clearing the lowest bit.
        aligned_length = len(buffer) & ~1
        if aligned_length > 0:
            aligned_chunk = bytes(buffer[:aligned_length])
            del buffer[:aligned_length]  # keep any leftover byte
            yield TTSAudioRawFrame(aligned_chunk, self.sample_rate, 1)

    if buffer:
        # Pad a dangling byte so the final frame is 16-bit aligned too.
        if len(buffer) % 2 == 1:
            buffer.extend(b"\x00")
        yield TTSAudioRawFrame(bytes(buffer), self.sample_rate, 1)
|
||||
|
||||
async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
|
||||
self._processing_text = False
|
||||
await self._text_aggregator.handle_interruption()
|
||||
|
||||
Reference in New Issue
Block a user