Merge pull request #2184 from pipecat-ai/filipi/twilio_issue

Fixing an issue where Pipecat was not receiving the user's audio
This commit is contained in:
Filipi da Silva Fuchter
2025-07-11 15:32:28 -03:00
committed by GitHub
10 changed files with 380 additions and 313 deletions

View File

@@ -9,6 +9,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- Fixed an issue where using audio input with a sample rate requiring resampling
could result in empty audio being passed to STT services, causing errors.
- Fixed the VAD analyzer to process the full audio buffer as long as it contains
more than the minimum required bytes per iteration, instead of only analyzing
the first chunk.
- Fixed an issue in ParallelPipeline that caused errors when attempting to drain
the queues.

File diff suppressed because it is too large Load Diff

View File

@@ -19,7 +19,7 @@
"vite": "^6.0.2"
},
"dependencies": {
"@pipecat-ai/client-js": "^0.4.0",
"@pipecat-ai/websocket-transport": "^0.4.2"
"@pipecat-ai/client-js": "^1.0.0",
"@pipecat-ai/websocket-transport": "^1.0.0"
}
}

View File

@@ -5,9 +5,11 @@
*/
import {
RTVIClient,
RTVIClientOptions,
RTVIEvent,
BotLLMTextData,
Participant,
PipecatClient,
PipecatClientOptions,
RTVIEvent, RTVIMessage, TranscriptData,
} from '@pipecat-ai/client-js';
import {
WebSocketTransport,
@@ -18,7 +20,7 @@ class WebsocketClientApp {
private static STREAM_SID = 'ws_mock_stream_sid';
private static CALL_SID = 'ws_mock_call_sid';
private rtviClient: RTVIClient | null = null;
private rtviClient: PipecatClient | null = null;
private connectBtn: HTMLButtonElement | null = null;
private disconnectBtn: HTMLButtonElement | null = null;
private statusSpan: HTMLElement | null = null;
@@ -122,7 +124,7 @@ class WebsocketClientApp {
if (!this.rtviClient) return;
// Listen for new tracks starting
this.rtviClient.on(RTVIEvent.TrackStarted, (track, participant) => {
this.rtviClient.on(RTVIEvent.TrackStarted, (track: MediaStreamTrack, participant?: Participant) => {
// Only handle non-local (bot) tracks
if (!participant?.local && track.kind === 'audio') {
this.setupAudioTrack(track);
@@ -130,7 +132,7 @@ class WebsocketClientApp {
});
// Listen for tracks stopping
this.rtviClient.on(RTVIEvent.TrackStopped, (track, participant) => {
this.rtviClient.on(RTVIEvent.TrackStopped, (track: MediaStreamTrack, participant?: Participant) => {
this.log(
`Track stopped: ${track.kind} from ${participant?.name || 'unknown'}`
);
@@ -167,7 +169,7 @@ class WebsocketClientApp {
playerSampleRate: 8000,
ws_url: 'http://localhost:8765/ws',
};
const RTVIConfig: RTVIClientOptions = {
const RTVIConfig: PipecatClientOptions = {
transport: new WebSocketTransport(ws_opts),
enableMic: true,
enableCam: false,
@@ -184,21 +186,21 @@ class WebsocketClientApp {
if (this.disconnectBtn) this.disconnectBtn.disabled = true;
this.log('Client disconnected');
},
onBotReady: (data) => {
onBotReady: (data: any) => {
this.log(`Bot ready: ${JSON.stringify(data)}`);
this.setupMediaTracks();
},
onUserTranscript: (data) => {
onUserTranscript: (data: TranscriptData) => {
if (data.final) {
this.log(`User: ${data.text}`);
}
},
onBotTranscript: (data) => this.log(`Bot: ${data.text}`),
onMessageError: (error) => console.error('Message error:', error),
onError: (error) => console.error('Error:', error),
onBotTranscript: (data: BotLLMTextData) => this.log(`Bot: ${data.text}`),
onMessageError: (error: RTVIMessage) => console.error('Message error:', error),
onError: (error: RTVIMessage) => console.error('Error:', error),
},
};
this.rtviClient = new RTVIClient(RTVIConfig);
this.rtviClient = new PipecatClient(RTVIConfig);
this.setupTrackListeners();
this.log('Initializing devices...');

View File

@@ -183,36 +183,37 @@ class VADAnalyzer(ABC):
if len(self._vad_buffer) < num_required_bytes:
return self._vad_state
audio_frames = self._vad_buffer[:num_required_bytes]
self._vad_buffer = self._vad_buffer[num_required_bytes:]
while len(self._vad_buffer) >= num_required_bytes:
audio_frames = self._vad_buffer[:num_required_bytes]
self._vad_buffer = self._vad_buffer[num_required_bytes:]
confidence = self.voice_confidence(audio_frames)
confidence = self.voice_confidence(audio_frames)
volume = self._get_smoothed_volume(audio_frames)
self._prev_volume = volume
volume = self._get_smoothed_volume(audio_frames)
self._prev_volume = volume
speaking = confidence >= self._params.confidence and volume >= self._params.min_volume
speaking = confidence >= self._params.confidence and volume >= self._params.min_volume
if speaking:
match self._vad_state:
case VADState.QUIET:
self._vad_state = VADState.STARTING
self._vad_starting_count = 1
case VADState.STARTING:
self._vad_starting_count += 1
case VADState.STOPPING:
self._vad_state = VADState.SPEAKING
self._vad_stopping_count = 0
else:
match self._vad_state:
case VADState.STARTING:
self._vad_state = VADState.QUIET
self._vad_starting_count = 0
case VADState.SPEAKING:
self._vad_state = VADState.STOPPING
self._vad_stopping_count = 1
case VADState.STOPPING:
self._vad_stopping_count += 1
if speaking:
match self._vad_state:
case VADState.QUIET:
self._vad_state = VADState.STARTING
self._vad_starting_count = 1
case VADState.STARTING:
self._vad_starting_count += 1
case VADState.STOPPING:
self._vad_state = VADState.SPEAKING
self._vad_stopping_count = 0
else:
match self._vad_state:
case VADState.STARTING:
self._vad_state = VADState.QUIET
self._vad_starting_count = 0
case VADState.SPEAKING:
self._vad_state = VADState.STOPPING
self._vad_stopping_count = 1
case VADState.STOPPING:
self._vad_stopping_count += 1
if (
self._vad_state == VADState.STARTING

View File

@@ -108,6 +108,10 @@ class ExotelFrameSerializer(FrameSerializer):
serialized_data = await self._output_resampler.resample(
data, frame.sample_rate, self._exotel_sample_rate
)
if serialized_data is None or len(serialized_data) == 0:
# Ignoring in case we don't have audio
return None
payload = base64.b64encode(serialized_data).decode("ascii")
answer = {
@@ -144,6 +148,9 @@ class ExotelFrameSerializer(FrameSerializer):
self._exotel_sample_rate,
self._sample_rate,
)
if deserialized_data is None or len(deserialized_data) == 0:
# Ignoring in case we don't have audio
return None
# Input: Exotel takes PCM data, so just resample to match sample_rate
audio_frame = InputAudioRawFrame(

View File

@@ -132,6 +132,10 @@ class PlivoFrameSerializer(FrameSerializer):
serialized_data = await pcm_to_ulaw(
data, frame.sample_rate, self._plivo_sample_rate, self._output_resampler
)
if serialized_data is None or len(serialized_data) == 0:
# Ignoring in case we don't have audio
return None
payload = base64.b64encode(serialized_data).decode("utf-8")
answer = {
"event": "playAudio",
@@ -227,6 +231,10 @@ class PlivoFrameSerializer(FrameSerializer):
deserialized_data = await ulaw_to_pcm(
payload, self._plivo_sample_rate, self._sample_rate, self._input_resampler
)
if deserialized_data is None or len(deserialized_data) == 0:
# Ignoring in case we don't have audio
return None
audio_frame = InputAudioRawFrame(
audio=deserialized_data, num_channels=1, sample_rate=self._sample_rate
)

View File

@@ -155,6 +155,10 @@ class TelnyxFrameSerializer(FrameSerializer):
else:
raise ValueError(f"Unsupported encoding: {self._params.inbound_encoding}")
if serialized_data is None or len(serialized_data) == 0:
# Ignoring in case we don't have audio
return None
payload = base64.b64encode(serialized_data).decode("utf-8")
answer = {
"event": "media",
@@ -262,6 +266,10 @@ class TelnyxFrameSerializer(FrameSerializer):
else:
raise ValueError(f"Unsupported encoding: {self._params.outbound_encoding}")
if deserialized_data is None or len(deserialized_data) == 0:
# Ignoring in case we don't have audio
return None
audio_frame = InputAudioRawFrame(
audio=deserialized_data, num_channels=1, sample_rate=self._sample_rate
)

View File

@@ -132,6 +132,10 @@ class TwilioFrameSerializer(FrameSerializer):
serialized_data = await pcm_to_ulaw(
data, frame.sample_rate, self._twilio_sample_rate, self._output_resampler
)
if serialized_data is None or len(serialized_data) == 0:
# Ignoring in case we don't have audio
return None
payload = base64.b64encode(serialized_data).decode("utf-8")
answer = {
"event": "media",
@@ -235,6 +239,10 @@ class TwilioFrameSerializer(FrameSerializer):
deserialized_data = await ulaw_to_pcm(
payload, self._twilio_sample_rate, self._sample_rate, self._input_resampler
)
if deserialized_data is None or len(deserialized_data) == 0:
# Ignoring in case we don't have audio
return None
audio_frame = InputAudioRawFrame(
audio=deserialized_data, num_channels=1, sample_rate=self._sample_rate
)

View File

@@ -152,6 +152,13 @@ class STTService(AIService):
else:
self._user_id = ""
if not frame.audio:
# Ignoring in case we don't have audio to transcribe.
logger.warning(
f"Empty audio frame received for STT service: {self.name} {frame.num_frames}"
)
return
await self.process_generator(self.run_stt(frame.audio))
async def process_frame(self, frame: Frame, direction: FrameDirection):