tts: fix RimeHttpTTSService/PiperTTSService 16-bit audio frames alignment

2025-10-10 12:09:20 -07:00
parent fdaa4e476e
commit 0473556992
4 changed files with 62 additions and 18 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,14 @@ All notable changes to **Pipecat** will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [Unreleased]
+
+### Fixed
+
+- Fixed an issue where `RimeHttpTTSService` and `PiperTTSService` could generate
+  incorrectly 16-bit aligned audio frames, potentially leading to internal
+  errors or static audio.
+
 ## [0.0.90] - 2025-10-10

 ### Added
--- a/src/pipecat/services/piper/tts.py
+++ b/src/pipecat/services/piper/tts.py
@@ -14,7 +14,6 @@ from loguru import logger
 from pipecat.frames.frames import (
    ErrorFrame,
    Frame,
-    TTSAudioRawFrame,
    TTSStartedFrame,
    TTSStoppedFrame,
 )
@@ -99,16 +98,15 @@ class PiperTTSService(TTSService):

                await self.start_tts_usage_metrics(text)

+                yield TTSStartedFrame()
+
                CHUNK_SIZE = self.chunk_size

-                yield TTSStartedFrame()
-                async for chunk in response.content.iter_chunked(CHUNK_SIZE):
-                    # remove wav header if present
-                    if chunk.startswith(b"RIFF"):
-                        chunk = chunk[44:]
-                    if len(chunk) > 0:
-                        await self.stop_ttfb_metrics()
-                        yield TTSAudioRawFrame(chunk, self.sample_rate, 1)
+                async for frame in self._stream_audio_frames_from_iterator(
+                    response.content.iter_chunked(CHUNK_SIZE), strip_wav_header=True
+                ):
+                    await self.stop_ttfb_metrics()
+                    yield frame
        except Exception as e:
            logger.error(f"Error in run_tts: {e}")
            yield ErrorFrame(error=str(e))
--- a/src/pipecat/services/rime/tts.py
+++ b/src/pipecat/services/rime/tts.py
@@ -553,15 +553,13 @@ class RimeHttpTTSService(TTSService):

                CHUNK_SIZE = self.chunk_size

-                async for chunk in response.content.iter_chunked(CHUNK_SIZE):
-                    if need_to_strip_wav_header and chunk.startswith(b"RIFF"):
-                        chunk = chunk[44:]
-                        need_to_strip_wav_header = False
+                async for frame in self._stream_audio_frames_from_iterator(
+                    response.content.iter_chunked(CHUNK_SIZE),
+                    strip_wav_header=need_to_strip_wav_header,
+                ):
+                    await self.stop_ttfb_metrics()
+                    yield frame

-                    if len(chunk) > 0:
-                        await self.stop_ttfb_metrics()
-                        frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
-                        yield frame
        except Exception as e:
            logger.exception(f"Error generating TTS: {e}")
            yield ErrorFrame(error=f"Rime TTS error: {str(e)}")
--- a/src/pipecat/services/tts_service.py
+++ b/src/pipecat/services/tts_service.py
@@ -8,7 +8,17 @@

 import asyncio
 from abc import abstractmethod
-from typing import Any, AsyncGenerator, Dict, List, Mapping, Optional, Sequence, Tuple
+from typing import (
+    Any,
+    AsyncGenerator,
+    AsyncIterator,
+    Dict,
+    List,
+    Mapping,
+    Optional,
+    Sequence,
+    Tuple,
+)

 from loguru import logger

@@ -374,6 +384,36 @@ class TTSService(AIService):
        ):
            await self._stop_frame_queue.put(frame)

+    async def _stream_audio_frames_from_iterator(
+        self, iterator: AsyncIterator[bytes], *, strip_wav_header: bool
+    ) -> AsyncGenerator[Frame, None]:
+        buffer = bytearray()
+        need_to_strip_wav_header = strip_wav_header
+        async for chunk in iterator:
+            if need_to_strip_wav_header and chunk.startswith(b"RIFF"):
+                chunk = chunk[44:]
+                need_to_strip_wav_header = False
+
+            # Append to current buffer.
+            buffer.extend(chunk)
+
+            # Round to nearest even number.
+            aligned_length = len(buffer) & ~1  # 111111111...11110
+            if aligned_length > 0:
+                aligned_chunk = buffer[:aligned_length]
+                buffer = buffer[aligned_length:]  # keep any leftover byte
+
+                if len(aligned_chunk) > 0:
+                    frame = TTSAudioRawFrame(bytes(aligned_chunk), self.sample_rate, 1)
+                    yield frame
+
+        if len(buffer) > 0:
+            # Make sure we don't need an extra padding byte.
+            if len(buffer) % 2 == 1:
+                buffer.extend(b"\x00")
+            frame = TTSAudioRawFrame(bytes(buffer), self.sample_rate, 1)
+            yield frame
+
    async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
        self._processing_text = False
        await self._text_aggregator.handle_interruption()