From 0473556992f7142fc864b4c44a6455122b59d609 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleix=20Conchillo=20Flaqu=C3=A9?= <aleix@daily.co>
Date: Fri, 10 Oct 2025 12:09:20 -0700
Subject: [PATCH] tts: fix RimeHttpTTSService/PiperTTSService 16-bit audio
 frames alignment

---
 CHANGELOG.md                        |  8 ++++++
 src/pipecat/services/piper/tts.py   | 16 +++++------
 src/pipecat/services/rime/tts.py    | 14 +++++-----
 src/pipecat/services/tts_service.py | 42 ++++++++++++++++++++++++++++-
 4 files changed, 62 insertions(+), 18 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e92361569..797ca06ab 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,14 @@ All notable changes to **Pipecat** will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased]
+
+### Fixed
+
+- Fixed an issue where `RimeHttpTTSService` and `PiperTTSService` could generate
+  audio frames that were not 16-bit aligned, potentially leading to internal
+  errors or static audio.
+
 ## [0.0.90] - 2025-10-10
 
 ### Added
diff --git a/src/pipecat/services/piper/tts.py b/src/pipecat/services/piper/tts.py
index d5b663c77..fa43a720c 100644
--- a/src/pipecat/services/piper/tts.py
+++ b/src/pipecat/services/piper/tts.py
@@ -14,7 +14,6 @@ from loguru import logger
 from pipecat.frames.frames import (
     ErrorFrame,
     Frame,
-    TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
 )
@@ -99,16 +98,15 @@ class PiperTTSService(TTSService):
 
                 await self.start_tts_usage_metrics(text)
 
+                yield TTSStartedFrame()
+
                 CHUNK_SIZE = self.chunk_size
 
-                yield TTSStartedFrame()
-                async for chunk in response.content.iter_chunked(CHUNK_SIZE):
-                    # remove wav header if present
-                    if chunk.startswith(b"RIFF"):
-                        chunk = chunk[44:]
-                    if len(chunk) > 0:
-                        await self.stop_ttfb_metrics()
-                        yield TTSAudioRawFrame(chunk, self.sample_rate, 1)
+                async for frame in self._stream_audio_frames_from_iterator(
+                    response.content.iter_chunked(CHUNK_SIZE), strip_wav_header=True
+                ):
+                    await self.stop_ttfb_metrics()
+                    yield frame
         except Exception as e:
             logger.error(f"Error in run_tts: {e}")
             yield ErrorFrame(error=str(e))
diff --git a/src/pipecat/services/rime/tts.py b/src/pipecat/services/rime/tts.py
index 917716545..1ac829ebd 100644
--- a/src/pipecat/services/rime/tts.py
+++ b/src/pipecat/services/rime/tts.py
@@ -553,15 +553,13 @@ class RimeHttpTTSService(TTSService):
 
                 CHUNK_SIZE = self.chunk_size
 
-                async for chunk in response.content.iter_chunked(CHUNK_SIZE):
-                    if need_to_strip_wav_header and chunk.startswith(b"RIFF"):
-                        chunk = chunk[44:]
-                        need_to_strip_wav_header = False
+                async for frame in self._stream_audio_frames_from_iterator(
+                    response.content.iter_chunked(CHUNK_SIZE),
+                    strip_wav_header=need_to_strip_wav_header,
+                ):
+                    await self.stop_ttfb_metrics()
+                    yield frame
 
-                    if len(chunk) > 0:
-                        await self.stop_ttfb_metrics()
-                        frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
-                        yield frame
         except Exception as e:
             logger.exception(f"Error generating TTS: {e}")
             yield ErrorFrame(error=f"Rime TTS error: {str(e)}")
diff --git a/src/pipecat/services/tts_service.py b/src/pipecat/services/tts_service.py
index 02b80b609..a60b50818 100644
--- a/src/pipecat/services/tts_service.py
+++ b/src/pipecat/services/tts_service.py
@@ -8,7 +8,17 @@
 
 import asyncio
 from abc import abstractmethod
-from typing import Any, AsyncGenerator, Dict, List, Mapping, Optional, Sequence, Tuple
+from typing import (
+    Any,
+    AsyncGenerator,
+    AsyncIterator,
+    Dict,
+    List,
+    Mapping,
+    Optional,
+    Sequence,
+    Tuple,
+)
 
 from loguru import logger
 
@@ -374,6 +384,36 @@ class TTSService(AIService):
         ):
             await self._stop_frame_queue.put(frame)
 
+    async def _stream_audio_frames_from_iterator(
+        self, iterator: AsyncIterator[bytes], *, strip_wav_header: bool
+    ) -> AsyncGenerator[Frame, None]:
+        buffer = bytearray()
+        need_to_strip_wav_header = strip_wav_header
+        async for chunk in iterator:
+            if need_to_strip_wav_header and chunk.startswith(b"RIFF"):
+                chunk = chunk[44:]
+                need_to_strip_wav_header = False
+
+            # Append to current buffer.
+            buffer.extend(chunk)
+
+            # Round down to the nearest even number.
+            aligned_length = len(buffer) & ~1  # 111111111...11110
+            if aligned_length > 0:
+                aligned_chunk = buffer[:aligned_length]
+                buffer = buffer[aligned_length:]  # keep any leftover byte
+
+                if len(aligned_chunk) > 0:
+                    frame = TTSAudioRawFrame(bytes(aligned_chunk), self.sample_rate, 1)
+                    yield frame
+
+        if len(buffer) > 0:
+            # Pad a trailing odd byte so the final frame stays 16-bit aligned.
+            if len(buffer) % 2 == 1:
+                buffer.extend(b"\x00")
+            frame = TTSAudioRawFrame(bytes(buffer), self.sample_rate, 1)
+            yield frame
+
     async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
         self._processing_text = False
         await self._text_aggregator.handle_interruption()