Merge pull request #2184 from pipecat-ai/filipi/twilio_issue

Fixing an issue where Pipecat was not receiving the user's audio
2025-07-11 15:32:28 -03:00
parent 6cd6e7ceed 75f8baab33
commit b005bd7b98
10 changed files with 380 additions and 313 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Fixed

+- Fixed an issue where using audio input with a sample rate requiring resampling
+  could result in empty audio being passed to STT services, causing errors.
+
+- Fixed the VAD analyzer to process the full audio buffer as long as it contains
+  more than the minimum required bytes per iteration, instead of only analyzing
+  the first chunk.
+
 - Fixed an issue in ParallelPipeline that caused errors when attempting to drain
  the queues.

--- a/examples/twilio-chatbot/client/typescript/package-lock.json
+++ b/examples/twilio-chatbot/client/typescript/package-lock.json
--- a/examples/twilio-chatbot/client/typescript/package.json
+++ b/examples/twilio-chatbot/client/typescript/package.json
@@ -19,7 +19,7 @@
    "vite": "^6.0.2"
  },
  "dependencies": {
-    "@pipecat-ai/client-js": "^0.4.0",
-    "@pipecat-ai/websocket-transport": "^0.4.2"
+    "@pipecat-ai/client-js": "^1.0.0",
+    "@pipecat-ai/websocket-transport": "^1.0.0"
  }
 }
--- a/examples/twilio-chatbot/client/typescript/src/app.ts
+++ b/examples/twilio-chatbot/client/typescript/src/app.ts
@@ -5,9 +5,11 @@
 */

 import {
-  RTVIClient,
-  RTVIClientOptions,
-  RTVIEvent,
+  BotLLMTextData,
+  Participant,
+  PipecatClient,
+  PipecatClientOptions,
+  RTVIEvent, RTVIMessage, TranscriptData,
 } from '@pipecat-ai/client-js';
 import {
  WebSocketTransport,
@@ -18,7 +20,7 @@ class WebsocketClientApp {
  private static STREAM_SID = 'ws_mock_stream_sid';
  private static CALL_SID = 'ws_mock_call_sid';

-  private rtviClient: RTVIClient | null = null;
+  private rtviClient: PipecatClient | null = null;
  private connectBtn: HTMLButtonElement | null = null;
  private disconnectBtn: HTMLButtonElement | null = null;
  private statusSpan: HTMLElement | null = null;
@@ -122,7 +124,7 @@ class WebsocketClientApp {
    if (!this.rtviClient) return;

    // Listen for new tracks starting
-    this.rtviClient.on(RTVIEvent.TrackStarted, (track, participant) => {
+    this.rtviClient.on(RTVIEvent.TrackStarted, (track: MediaStreamTrack, participant?: Participant) => {
      // Only handle non-local (bot) tracks
      if (!participant?.local && track.kind === 'audio') {
        this.setupAudioTrack(track);
@@ -130,7 +132,7 @@ class WebsocketClientApp {
    });

    // Listen for tracks stopping
-    this.rtviClient.on(RTVIEvent.TrackStopped, (track, participant) => {
+    this.rtviClient.on(RTVIEvent.TrackStopped, (track: MediaStreamTrack, participant?: Participant) => {
      this.log(
        `Track stopped: ${track.kind} from ${participant?.name || 'unknown'}`
      );
@@ -167,7 +169,7 @@ class WebsocketClientApp {
        playerSampleRate: 8000,
        ws_url: 'http://localhost:8765/ws',
      };
-      const RTVIConfig: RTVIClientOptions = {
+      const RTVIConfig: PipecatClientOptions = {
        transport: new WebSocketTransport(ws_opts),
        enableMic: true,
        enableCam: false,
@@ -184,21 +186,21 @@ class WebsocketClientApp {
            if (this.disconnectBtn) this.disconnectBtn.disabled = true;
            this.log('Client disconnected');
          },
-          onBotReady: (data) => {
+          onBotReady: (data: any) => {
            this.log(`Bot ready: ${JSON.stringify(data)}`);
            this.setupMediaTracks();
          },
-          onUserTranscript: (data) => {
+          onUserTranscript: (data: TranscriptData) => {
            if (data.final) {
              this.log(`User: ${data.text}`);
            }
          },
-          onBotTranscript: (data) => this.log(`Bot: ${data.text}`),
-          onMessageError: (error) => console.error('Message error:', error),
-          onError: (error) => console.error('Error:', error),
+          onBotTranscript: (data: BotLLMTextData) => this.log(`Bot: ${data.text}`),
+          onMessageError: (error: RTVIMessage) => console.error('Message error:', error),
+          onError: (error: RTVIMessage) => console.error('Error:', error),
        },
      };
-      this.rtviClient = new RTVIClient(RTVIConfig);
+      this.rtviClient = new PipecatClient(RTVIConfig);
      this.setupTrackListeners();

      this.log('Initializing devices...');
--- a/src/pipecat/audio/vad/vad_analyzer.py
+++ b/src/pipecat/audio/vad/vad_analyzer.py
@@ -183,36 +183,37 @@ class VADAnalyzer(ABC):
        if len(self._vad_buffer) < num_required_bytes:
            return self._vad_state

-        audio_frames = self._vad_buffer[:num_required_bytes]
-        self._vad_buffer = self._vad_buffer[num_required_bytes:]
+        while len(self._vad_buffer) >= num_required_bytes:
+            audio_frames = self._vad_buffer[:num_required_bytes]
+            self._vad_buffer = self._vad_buffer[num_required_bytes:]

-        confidence = self.voice_confidence(audio_frames)
+            confidence = self.voice_confidence(audio_frames)

-        volume = self._get_smoothed_volume(audio_frames)
-        self._prev_volume = volume
+            volume = self._get_smoothed_volume(audio_frames)
+            self._prev_volume = volume

-        speaking = confidence >= self._params.confidence and volume >= self._params.min_volume
+            speaking = confidence >= self._params.confidence and volume >= self._params.min_volume

-        if speaking:
-            match self._vad_state:
-                case VADState.QUIET:
-                    self._vad_state = VADState.STARTING
-                    self._vad_starting_count = 1
-                case VADState.STARTING:
-                    self._vad_starting_count += 1
-                case VADState.STOPPING:
-                    self._vad_state = VADState.SPEAKING
-                    self._vad_stopping_count = 0
-        else:
-            match self._vad_state:
-                case VADState.STARTING:
-                    self._vad_state = VADState.QUIET
-                    self._vad_starting_count = 0
-                case VADState.SPEAKING:
-                    self._vad_state = VADState.STOPPING
-                    self._vad_stopping_count = 1
-                case VADState.STOPPING:
-                    self._vad_stopping_count += 1
+            if speaking:
+                match self._vad_state:
+                    case VADState.QUIET:
+                        self._vad_state = VADState.STARTING
+                        self._vad_starting_count = 1
+                    case VADState.STARTING:
+                        self._vad_starting_count += 1
+                    case VADState.STOPPING:
+                        self._vad_state = VADState.SPEAKING
+                        self._vad_stopping_count = 0
+            else:
+                match self._vad_state:
+                    case VADState.STARTING:
+                        self._vad_state = VADState.QUIET
+                        self._vad_starting_count = 0
+                    case VADState.SPEAKING:
+                        self._vad_state = VADState.STOPPING
+                        self._vad_stopping_count = 1
+                    case VADState.STOPPING:
+                        self._vad_stopping_count += 1

        if (
            self._vad_state == VADState.STARTING
--- a/src/pipecat/serializers/exotel.py
+++ b/src/pipecat/serializers/exotel.py
@@ -108,6 +108,10 @@ class ExotelFrameSerializer(FrameSerializer):
            serialized_data = await self._output_resampler.resample(
                data, frame.sample_rate, self._exotel_sample_rate
            )
+            if serialized_data is None or len(serialized_data) == 0:
+                # Ignoring in case we don't have audio
+                return None
+
            payload = base64.b64encode(serialized_data).decode("ascii")

            answer = {
@@ -144,6 +148,9 @@ class ExotelFrameSerializer(FrameSerializer):
                self._exotel_sample_rate,
                self._sample_rate,
            )
+            if deserialized_data is None or len(deserialized_data) == 0:
+                # Ignoring in case we don't have audio
+                return None

            # Input: Exotel takes PCM data, so just resample to match sample_rate
            audio_frame = InputAudioRawFrame(
--- a/src/pipecat/serializers/plivo.py
+++ b/src/pipecat/serializers/plivo.py
@@ -132,6 +132,10 @@ class PlivoFrameSerializer(FrameSerializer):
            serialized_data = await pcm_to_ulaw(
                data, frame.sample_rate, self._plivo_sample_rate, self._output_resampler
            )
+            if serialized_data is None or len(serialized_data) == 0:
+                # Ignoring in case we don't have audio
+                return None
+
            payload = base64.b64encode(serialized_data).decode("utf-8")
            answer = {
                "event": "playAudio",
@@ -227,6 +231,10 @@ class PlivoFrameSerializer(FrameSerializer):
            deserialized_data = await ulaw_to_pcm(
                payload, self._plivo_sample_rate, self._sample_rate, self._input_resampler
            )
+            if deserialized_data is None or len(deserialized_data) == 0:
+                # Ignoring in case we don't have audio
+                return None
+
            audio_frame = InputAudioRawFrame(
                audio=deserialized_data, num_channels=1, sample_rate=self._sample_rate
            )
--- a/src/pipecat/serializers/telnyx.py
+++ b/src/pipecat/serializers/telnyx.py
@@ -155,6 +155,10 @@ class TelnyxFrameSerializer(FrameSerializer):
            else:
                raise ValueError(f"Unsupported encoding: {self._params.inbound_encoding}")

+            if serialized_data is None or len(serialized_data) == 0:
+                # Ignoring in case we don't have audio
+                return None
+
            payload = base64.b64encode(serialized_data).decode("utf-8")
            answer = {
                "event": "media",
@@ -262,6 +266,10 @@ class TelnyxFrameSerializer(FrameSerializer):
            else:
                raise ValueError(f"Unsupported encoding: {self._params.outbound_encoding}")

+            if deserialized_data is None or len(deserialized_data) == 0:
+                # Ignoring in case we don't have audio
+                return None
+
            audio_frame = InputAudioRawFrame(
                audio=deserialized_data, num_channels=1, sample_rate=self._sample_rate
            )
--- a/src/pipecat/serializers/twilio.py
+++ b/src/pipecat/serializers/twilio.py
@@ -132,6 +132,10 @@ class TwilioFrameSerializer(FrameSerializer):
            serialized_data = await pcm_to_ulaw(
                data, frame.sample_rate, self._twilio_sample_rate, self._output_resampler
            )
+            if serialized_data is None or len(serialized_data) == 0:
+                # Ignoring in case we don't have audio
+                return None
+
            payload = base64.b64encode(serialized_data).decode("utf-8")
            answer = {
                "event": "media",
@@ -235,6 +239,10 @@ class TwilioFrameSerializer(FrameSerializer):
            deserialized_data = await ulaw_to_pcm(
                payload, self._twilio_sample_rate, self._sample_rate, self._input_resampler
            )
+            if deserialized_data is None or len(deserialized_data) == 0:
+                # Ignoring in case we don't have audio
+                return None
+
            audio_frame = InputAudioRawFrame(
                audio=deserialized_data, num_channels=1, sample_rate=self._sample_rate
            )
--- a/src/pipecat/services/stt_service.py
+++ b/src/pipecat/services/stt_service.py
@@ -152,6 +152,13 @@ class STTService(AIService):
        else:
            self._user_id = ""

+        if not frame.audio:
+            # Ignoring in case we don't have audio to transcribe.
+            logger.warning(
+                f"Empty audio frame received for STT service: {self.name} {frame.num_frames}"
+            )
+            return
+
        await self.process_generator(self.run_stt(frame.audio))

    async def process_frame(self, frame: Frame, direction: FrameDirection):