Merge pull request #2184 from pipecat-ai/filipi/twilio_issue
Fixing an issue where Pipecat was not receiving the user's audio
This commit is contained in:
@@ -9,6 +9,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed an issue where using audio input with a sample rate requiring resampling
|
||||
could result in empty audio being passed to STT services, causing errors.
|
||||
|
||||
- Fixed the VAD analyzer to process the full audio buffer as long as it contains
|
||||
more than the minimum required bytes per iteration, instead of only analyzing
|
||||
the first chunk.
|
||||
|
||||
- Fixed an issue in ParallelPipeline that caused errors when attempting to drain
|
||||
the queues.
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -19,7 +19,7 @@
|
||||
"vite": "^6.0.2"
|
||||
},
|
||||
"dependencies": {
|
||||
"@pipecat-ai/client-js": "^0.4.0",
|
||||
"@pipecat-ai/websocket-transport": "^0.4.2"
|
||||
"@pipecat-ai/client-js": "^1.0.0",
|
||||
"@pipecat-ai/websocket-transport": "^1.0.0"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,9 +5,11 @@
|
||||
*/
|
||||
|
||||
import {
|
||||
RTVIClient,
|
||||
RTVIClientOptions,
|
||||
RTVIEvent,
|
||||
BotLLMTextData,
|
||||
Participant,
|
||||
PipecatClient,
|
||||
PipecatClientOptions,
|
||||
RTVIEvent, RTVIMessage, TranscriptData,
|
||||
} from '@pipecat-ai/client-js';
|
||||
import {
|
||||
WebSocketTransport,
|
||||
@@ -18,7 +20,7 @@ class WebsocketClientApp {
|
||||
private static STREAM_SID = 'ws_mock_stream_sid';
|
||||
private static CALL_SID = 'ws_mock_call_sid';
|
||||
|
||||
private rtviClient: RTVIClient | null = null;
|
||||
private rtviClient: PipecatClient | null = null;
|
||||
private connectBtn: HTMLButtonElement | null = null;
|
||||
private disconnectBtn: HTMLButtonElement | null = null;
|
||||
private statusSpan: HTMLElement | null = null;
|
||||
@@ -122,7 +124,7 @@ class WebsocketClientApp {
|
||||
if (!this.rtviClient) return;
|
||||
|
||||
// Listen for new tracks starting
|
||||
this.rtviClient.on(RTVIEvent.TrackStarted, (track, participant) => {
|
||||
this.rtviClient.on(RTVIEvent.TrackStarted, (track: MediaStreamTrack, participant?: Participant) => {
|
||||
// Only handle non-local (bot) tracks
|
||||
if (!participant?.local && track.kind === 'audio') {
|
||||
this.setupAudioTrack(track);
|
||||
@@ -130,7 +132,7 @@ class WebsocketClientApp {
|
||||
});
|
||||
|
||||
// Listen for tracks stopping
|
||||
this.rtviClient.on(RTVIEvent.TrackStopped, (track, participant) => {
|
||||
this.rtviClient.on(RTVIEvent.TrackStopped, (track: MediaStreamTrack, participant?: Participant) => {
|
||||
this.log(
|
||||
`Track stopped: ${track.kind} from ${participant?.name || 'unknown'}`
|
||||
);
|
||||
@@ -167,7 +169,7 @@ class WebsocketClientApp {
|
||||
playerSampleRate: 8000,
|
||||
ws_url: 'http://localhost:8765/ws',
|
||||
};
|
||||
const RTVIConfig: RTVIClientOptions = {
|
||||
const RTVIConfig: PipecatClientOptions = {
|
||||
transport: new WebSocketTransport(ws_opts),
|
||||
enableMic: true,
|
||||
enableCam: false,
|
||||
@@ -184,21 +186,21 @@ class WebsocketClientApp {
|
||||
if (this.disconnectBtn) this.disconnectBtn.disabled = true;
|
||||
this.log('Client disconnected');
|
||||
},
|
||||
onBotReady: (data) => {
|
||||
onBotReady: (data: any) => {
|
||||
this.log(`Bot ready: ${JSON.stringify(data)}`);
|
||||
this.setupMediaTracks();
|
||||
},
|
||||
onUserTranscript: (data) => {
|
||||
onUserTranscript: (data: TranscriptData) => {
|
||||
if (data.final) {
|
||||
this.log(`User: ${data.text}`);
|
||||
}
|
||||
},
|
||||
onBotTranscript: (data) => this.log(`Bot: ${data.text}`),
|
||||
onMessageError: (error) => console.error('Message error:', error),
|
||||
onError: (error) => console.error('Error:', error),
|
||||
onBotTranscript: (data: BotLLMTextData) => this.log(`Bot: ${data.text}`),
|
||||
onMessageError: (error: RTVIMessage) => console.error('Message error:', error),
|
||||
onError: (error: RTVIMessage) => console.error('Error:', error),
|
||||
},
|
||||
};
|
||||
this.rtviClient = new RTVIClient(RTVIConfig);
|
||||
this.rtviClient = new PipecatClient(RTVIConfig);
|
||||
this.setupTrackListeners();
|
||||
|
||||
this.log('Initializing devices...');
|
||||
|
||||
@@ -183,36 +183,37 @@ class VADAnalyzer(ABC):
|
||||
if len(self._vad_buffer) < num_required_bytes:
|
||||
return self._vad_state
|
||||
|
||||
audio_frames = self._vad_buffer[:num_required_bytes]
|
||||
self._vad_buffer = self._vad_buffer[num_required_bytes:]
|
||||
while len(self._vad_buffer) >= num_required_bytes:
|
||||
audio_frames = self._vad_buffer[:num_required_bytes]
|
||||
self._vad_buffer = self._vad_buffer[num_required_bytes:]
|
||||
|
||||
confidence = self.voice_confidence(audio_frames)
|
||||
confidence = self.voice_confidence(audio_frames)
|
||||
|
||||
volume = self._get_smoothed_volume(audio_frames)
|
||||
self._prev_volume = volume
|
||||
volume = self._get_smoothed_volume(audio_frames)
|
||||
self._prev_volume = volume
|
||||
|
||||
speaking = confidence >= self._params.confidence and volume >= self._params.min_volume
|
||||
speaking = confidence >= self._params.confidence and volume >= self._params.min_volume
|
||||
|
||||
if speaking:
|
||||
match self._vad_state:
|
||||
case VADState.QUIET:
|
||||
self._vad_state = VADState.STARTING
|
||||
self._vad_starting_count = 1
|
||||
case VADState.STARTING:
|
||||
self._vad_starting_count += 1
|
||||
case VADState.STOPPING:
|
||||
self._vad_state = VADState.SPEAKING
|
||||
self._vad_stopping_count = 0
|
||||
else:
|
||||
match self._vad_state:
|
||||
case VADState.STARTING:
|
||||
self._vad_state = VADState.QUIET
|
||||
self._vad_starting_count = 0
|
||||
case VADState.SPEAKING:
|
||||
self._vad_state = VADState.STOPPING
|
||||
self._vad_stopping_count = 1
|
||||
case VADState.STOPPING:
|
||||
self._vad_stopping_count += 1
|
||||
if speaking:
|
||||
match self._vad_state:
|
||||
case VADState.QUIET:
|
||||
self._vad_state = VADState.STARTING
|
||||
self._vad_starting_count = 1
|
||||
case VADState.STARTING:
|
||||
self._vad_starting_count += 1
|
||||
case VADState.STOPPING:
|
||||
self._vad_state = VADState.SPEAKING
|
||||
self._vad_stopping_count = 0
|
||||
else:
|
||||
match self._vad_state:
|
||||
case VADState.STARTING:
|
||||
self._vad_state = VADState.QUIET
|
||||
self._vad_starting_count = 0
|
||||
case VADState.SPEAKING:
|
||||
self._vad_state = VADState.STOPPING
|
||||
self._vad_stopping_count = 1
|
||||
case VADState.STOPPING:
|
||||
self._vad_stopping_count += 1
|
||||
|
||||
if (
|
||||
self._vad_state == VADState.STARTING
|
||||
|
||||
@@ -108,6 +108,10 @@ class ExotelFrameSerializer(FrameSerializer):
|
||||
serialized_data = await self._output_resampler.resample(
|
||||
data, frame.sample_rate, self._exotel_sample_rate
|
||||
)
|
||||
if serialized_data is None or len(serialized_data) == 0:
|
||||
# Ignoring in case we don't have audio
|
||||
return None
|
||||
|
||||
payload = base64.b64encode(serialized_data).decode("ascii")
|
||||
|
||||
answer = {
|
||||
@@ -144,6 +148,9 @@ class ExotelFrameSerializer(FrameSerializer):
|
||||
self._exotel_sample_rate,
|
||||
self._sample_rate,
|
||||
)
|
||||
if deserialized_data is None or len(deserialized_data) == 0:
|
||||
# Ignoring in case we don't have audio
|
||||
return None
|
||||
|
||||
# Input: Exotel takes PCM data, so just resample to match sample_rate
|
||||
audio_frame = InputAudioRawFrame(
|
||||
|
||||
@@ -132,6 +132,10 @@ class PlivoFrameSerializer(FrameSerializer):
|
||||
serialized_data = await pcm_to_ulaw(
|
||||
data, frame.sample_rate, self._plivo_sample_rate, self._output_resampler
|
||||
)
|
||||
if serialized_data is None or len(serialized_data) == 0:
|
||||
# Ignoring in case we don't have audio
|
||||
return None
|
||||
|
||||
payload = base64.b64encode(serialized_data).decode("utf-8")
|
||||
answer = {
|
||||
"event": "playAudio",
|
||||
@@ -227,6 +231,10 @@ class PlivoFrameSerializer(FrameSerializer):
|
||||
deserialized_data = await ulaw_to_pcm(
|
||||
payload, self._plivo_sample_rate, self._sample_rate, self._input_resampler
|
||||
)
|
||||
if deserialized_data is None or len(deserialized_data) == 0:
|
||||
# Ignoring in case we don't have audio
|
||||
return None
|
||||
|
||||
audio_frame = InputAudioRawFrame(
|
||||
audio=deserialized_data, num_channels=1, sample_rate=self._sample_rate
|
||||
)
|
||||
|
||||
@@ -155,6 +155,10 @@ class TelnyxFrameSerializer(FrameSerializer):
|
||||
else:
|
||||
raise ValueError(f"Unsupported encoding: {self._params.inbound_encoding}")
|
||||
|
||||
if serialized_data is None or len(serialized_data) == 0:
|
||||
# Ignoring in case we don't have audio
|
||||
return None
|
||||
|
||||
payload = base64.b64encode(serialized_data).decode("utf-8")
|
||||
answer = {
|
||||
"event": "media",
|
||||
@@ -262,6 +266,10 @@ class TelnyxFrameSerializer(FrameSerializer):
|
||||
else:
|
||||
raise ValueError(f"Unsupported encoding: {self._params.outbound_encoding}")
|
||||
|
||||
if deserialized_data is None or len(deserialized_data) == 0:
|
||||
# Ignoring in case we don't have audio
|
||||
return None
|
||||
|
||||
audio_frame = InputAudioRawFrame(
|
||||
audio=deserialized_data, num_channels=1, sample_rate=self._sample_rate
|
||||
)
|
||||
|
||||
@@ -132,6 +132,10 @@ class TwilioFrameSerializer(FrameSerializer):
|
||||
serialized_data = await pcm_to_ulaw(
|
||||
data, frame.sample_rate, self._twilio_sample_rate, self._output_resampler
|
||||
)
|
||||
if serialized_data is None or len(serialized_data) == 0:
|
||||
# Ignoring in case we don't have audio
|
||||
return None
|
||||
|
||||
payload = base64.b64encode(serialized_data).decode("utf-8")
|
||||
answer = {
|
||||
"event": "media",
|
||||
@@ -235,6 +239,10 @@ class TwilioFrameSerializer(FrameSerializer):
|
||||
deserialized_data = await ulaw_to_pcm(
|
||||
payload, self._twilio_sample_rate, self._sample_rate, self._input_resampler
|
||||
)
|
||||
if deserialized_data is None or len(deserialized_data) == 0:
|
||||
# Ignoring in case we don't have audio
|
||||
return None
|
||||
|
||||
audio_frame = InputAudioRawFrame(
|
||||
audio=deserialized_data, num_channels=1, sample_rate=self._sample_rate
|
||||
)
|
||||
|
||||
@@ -152,6 +152,13 @@ class STTService(AIService):
|
||||
else:
|
||||
self._user_id = ""
|
||||
|
||||
if not frame.audio:
|
||||
# Ignoring in case we don't have audio to transcribe.
|
||||
logger.warning(
|
||||
f"Empty audio frame received for STT service: {self.name} {frame.num_frames}"
|
||||
)
|
||||
return
|
||||
|
||||
await self.process_generator(self.run_stt(frame.audio))
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
|
||||
Reference in New Issue
Block a user