Merge pull request #2008 from pipecat-ai/khk/groq-audio

Fix Groq WAV file header parsing
2025-06-16 14:09:09 +01:00
parent a4ea0d2b82 fe16ed3c73
commit d73f7908f2
2 changed files with 25 additions and 16 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Fixed

+- Fixed an issue with `GroqTTSService` where it was not properly parsing the
+  WAV file header (it assumed a fixed 44-byte header).
+
 - Fixed an issue with `GoogleSTTService` where it was constantly reconnecting 
  before starting to receive audio from the user.

--- a/src/pipecat/services/groq/tts.py
+++ b/src/pipecat/services/groq/tts.py
@@ -4,6 +4,8 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #

+import io
+import wave
 from typing import AsyncGenerator, Optional

 from loguru import logger
@@ -78,22 +80,26 @@ class GroqTTSService(TTSService):
        await self.start_ttfb_metrics()
        yield TTSStartedFrame()

-        response = await self._client.audio.speech.create(
-            model=self._model_name,
-            voice=self._voice_id,
-            response_format=self._output_format,
-            input=text,
-        )
+        try:
+            response = await self._client.audio.speech.create(
+                model=self._model_name,
+                voice=self._voice_id,
+                response_format=self._output_format,
+                input=text,
+            )

-        async for data in response.iter_bytes():
-            if measuring_ttfb:
-                await self.stop_ttfb_metrics()
-                measuring_ttfb = False
-            # remove wav header if present
-            if data.startswith(b"RIFF"):
-                data = data[44:]
-                if len(data) == 0:
-                    continue
-            yield TTSAudioRawFrame(data, self.sample_rate, 1)
+            async for data in response.iter_bytes():
+                if measuring_ttfb:
+                    await self.stop_ttfb_metrics()
+                    measuring_ttfb = False
+
+                with wave.open(io.BytesIO(data)) as w:
+                    channels = w.getnchannels()
+                    frame_rate = w.getframerate()
+                    num_frames = w.getnframes()
+                    bytes = w.readframes(num_frames)
+                    yield TTSAudioRawFrame(bytes, frame_rate, channels)
+        except Exception as e:
+            logger.error(f"{self} exception: {e}")

        yield TTSStoppedFrame()