Merge pull request #539 from pipecat-ai/aleix/pipecat-0.0.42-fixes

pipecat 0.0.42 fixes
This commit is contained in:
Aleix Conchillo Flaqué
2024-10-02 13:34:28 -07:00
committed by GitHub
17 changed files with 1211 additions and 4721 deletions

View File

@@ -53,7 +53,6 @@ async def main():
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
tts = GoogleTTSService(
credentials=os.getenv("GOOGLE_CREDENTIALS"),
voice_id="en-US-Neural2-J",
params=GoogleTTSService.InputParams(language="en-US", rate="1.05"),
)

File diff suppressed because it is too large Load Diff

View File

@@ -11,28 +11,28 @@
"dependencies": {
"@daily-co/daily-js": "^0.62.0",
"@daily-co/daily-react": "^0.18.0",
"@radix-ui/react-select": "^2.0.0",
"@radix-ui/react-select": "^2.1.2",
"@radix-ui/react-slot": "^1.0.2",
"@tabler/icons-react": "^3.1.0",
"@tabler/icons-react": "^3.19.0",
"class-variance-authority": "^0.7.0",
"clsx": "^2.1.0",
"framer-motion": "^11.0.27",
"next": "14.1.4",
"react": "^18",
"react-dom": "^18",
"clsx": "^2.1.1",
"framer-motion": "^11.9.0",
"next": "^14.2.14",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"recoil": "^0.7.7",
"tailwind-merge": "^2.2.2",
"tailwind-merge": "^2.5.2",
"tailwindcss-animate": "^1.0.7"
},
"devDependencies": {
"@types/node": "^20",
"@types/react": "^18",
"@types/react-dom": "^18",
"autoprefixer": "^10.0.1",
"eslint": "^8",
"@types/node": "^20.16.10",
"@types/react": "^18.3.11",
"@types/react-dom": "^18.3.0",
"autoprefixer": "^10.4.20",
"eslint": "^8.57.1",
"eslint-config-next": "14.1.4",
"postcss": "^8",
"tailwindcss": "^3.4.3",
"typescript": "^5"
"postcss": "^8.4.47",
"tailwindcss": "^3.4.13",
"typescript": "^5.6.2"
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -143,7 +143,7 @@ async def main(room_url, token=None):
@transport.event_handler("on_participant_left")
async def on_participant_left(transport, participant, reason):
intro_task.queue_frame(EndFrame())
await intro_task.queue_frame(EndFrame())
await main_task.queue_frame(EndFrame())
@transport.event_handler("on_call_state_updated")

View File

@@ -127,6 +127,8 @@ class OpenAILLMContext:
if item["type"] == "image_url":
if item["image_url"]["url"].startswith("data:image/"):
item["image_url"]["url"] = "data:image/..."
if "mime_type" in msg and msg["mime_type"].startswith("image/"):
msg["data"] = "..."
msgs.append(msg)
return json.dumps(msgs)

View File

@@ -121,7 +121,7 @@ class AWSTTSService(TTSService):
self._settings = {
"sample_rate": sample_rate,
"engine": params.engine,
"language": language_to_aws_language(params.language) if params.language else "en-US",
"language": params.language if params.language else Language.EN,
"pitch": params.pitch,
"rate": params.rate,
"volume": params.volume,
@@ -135,8 +135,8 @@ class AWSTTSService(TTSService):
def _construct_ssml(self, text: str) -> str:
ssml = "<speak>"
if self._settings["language"]:
ssml += f"<lang xml:lang='{self._settings['language']}'>"
language = language_to_aws_language(self._settings["language"])
ssml += f"<lang xml:lang='{language}'>"
prosody_attrs = []
# Prosody tags are only supported for standard and neural engines
@@ -158,8 +158,7 @@ class AWSTTSService(TTSService):
if prosody_attrs:
ssml += "</prosody>"
if self._settings["language"]:
ssml += "</lang>"
ssml += "</lang>"
ssml += "</speak>"
@@ -190,7 +189,7 @@ class AWSTTSService(TTSService):
await self.start_tts_usage_metrics(text)
await self.push_frame(TTSStartedFrame())
yield TTSStartedFrame()
if "AudioStream" in response:
with response["AudioStream"] as stream:
@@ -203,7 +202,7 @@ class AWSTTSService(TTSService):
frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
yield frame
await self.push_frame(TTSStoppedFrame())
yield TTSStoppedFrame()
except (BotoCoreError, ClientError) as error:
logger.exception(f"{self} error generating TTS: {error}")
@@ -211,4 +210,4 @@ class AWSTTSService(TTSService):
yield ErrorFrame(error=error_message)
finally:
await self.push_frame(TTSStoppedFrame())
yield TTSStoppedFrame()

View File

@@ -27,6 +27,7 @@ from pipecat.frames.frames import (
)
from pipecat.services.ai_services import ImageGenService, STTService, TTSService
from pipecat.services.openai import BaseOpenAILLMService
from pipecat.transcriptions import language
from pipecat.transcriptions.language import Language
from pipecat.utils.time import time_now_iso8601
@@ -191,7 +192,7 @@ class AzureTTSService(TTSService):
self._settings = {
"sample_rate": sample_rate,
"emphasis": params.emphasis,
"language": language_to_azure_language(params.language) if params.language else "en-US",
"language": params.language if params.language else Language.EN,
"pitch": params.pitch,
"rate": params.rate,
"role": params.role,
@@ -206,8 +207,9 @@ class AzureTTSService(TTSService):
return True
def _construct_ssml(self, text: str) -> str:
language = language_to_azure_language(self._settings["language"])
ssml = (
f"<speak version='1.0' xml:lang='{self._settings['language']}' "
f"<speak version='1.0' xml:lang='{language}' "
"xmlns='http://www.w3.org/2001/10/synthesis' "
"xmlns:mstts='http://www.w3.org/2001/mstts'>"
f"<voice name='{self._voice_id}'>"
@@ -261,14 +263,14 @@ class AzureTTSService(TTSService):
if result.reason == ResultReason.SynthesizingAudioCompleted:
await self.start_tts_usage_metrics(text)
await self.stop_ttfb_metrics()
await self.push_frame(TTSStartedFrame())
yield TTSStartedFrame()
# Azure always sends a 44-byte header. Strip it off.
yield TTSAudioRawFrame(
audio=result.audio_data[44:],
sample_rate=self._settings["sample_rate"],
num_channels=1,
)
await self.push_frame(TTSStoppedFrame())
yield TTSStoppedFrame()
elif result.reason == ResultReason.Canceled:
cancellation_details = result.cancellation_details
logger.warning(f"Speech synthesis canceled: {cancellation_details.reason}")

View File

@@ -106,7 +106,7 @@ class CartesiaTTSService(WordTTSService):
"encoding": params.encoding,
"sample_rate": params.sample_rate,
},
"language": language_to_cartesia_language(params.language) if params.language else "en",
"language": params.language if params.language else Language.EN,
"speed": params.speed,
"emotion": params.emotion,
}
@@ -146,7 +146,7 @@ class CartesiaTTSService(WordTTSService):
"model_id": self.model_name,
"voice": voice_config,
"output_format": self._settings["output_format"],
"language": self._settings["language"],
"language": language_to_cartesia_language(self._settings["language"]),
"add_timestamps": add_timestamps,
}
return json.dumps(msg)
@@ -255,8 +255,8 @@ class CartesiaTTSService(WordTTSService):
await self._connect()
if not self._context_id:
await self.push_frame(TTSStartedFrame())
await self.start_ttfb_metrics()
yield TTSStartedFrame()
self._context_id = str(uuid.uuid4())
msg = self._build_msg(text=text)
@@ -266,7 +266,7 @@ class CartesiaTTSService(WordTTSService):
await self.start_tts_usage_metrics(text)
except Exception as e:
logger.error(f"{self} error sending message: {e}")
await self.push_frame(TTSStoppedFrame())
yield TTSStoppedFrame()
await self._disconnect()
await self._connect()
return
@@ -303,7 +303,7 @@ class CartesiaHttpTTSService(TTSService):
"encoding": params.encoding,
"sample_rate": params.sample_rate,
},
"language": language_to_cartesia_language(params.language) if params.language else None,
"language": params.language if params.language else Language.EN,
"speed": params.speed,
"emotion": params.emotion,
}
@@ -315,11 +315,6 @@ class CartesiaHttpTTSService(TTSService):
def can_generate_metrics(self) -> bool:
return True
async def set_model(self, model: str):
logger.debug(f"Switching TTS model to: [{model}]")
self._model_id = model
await super().set_model(model)
async def stop(self, frame: EndFrame):
await super().stop(frame)
await self._client.close()
@@ -331,8 +326,8 @@ class CartesiaHttpTTSService(TTSService):
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
logger.debug(f"Generating TTS: [{text}]")
await self.push_frame(TTSStartedFrame())
await self.start_ttfb_metrics()
yield TTSStartedFrame()
try:
voice_controls = None
@@ -344,11 +339,11 @@ class CartesiaHttpTTSService(TTSService):
voice_controls["emotion"] = self._settings["emotion"]
output = await self._client.tts.sse(
model_id=self._model_id,
model_id=self._model_name,
transcript=text,
voice_id=self._voice_id,
output_format=self._settings["output_format"],
language=self._settings["language"],
language=language_to_cartesia_language(self._settings["language"]),
stream=False,
_experimental_voice_controls=voice_controls,
)
@@ -365,4 +360,4 @@ class CartesiaHttpTTSService(TTSService):
logger.error(f"{self} exception: {e}")
await self.start_tts_usage_metrics(text)
await self.push_frame(TTSStoppedFrame())
yield TTSStoppedFrame()

View File

@@ -84,7 +84,7 @@ class DeepgramTTSService(TTSService):
)
await self.start_tts_usage_metrics(text)
await self.push_frame(TTSStartedFrame())
yield TTSStartedFrame()
# The response.stream_memory is already a BytesIO object
audio_buffer = response.stream_memory
@@ -105,7 +105,7 @@ class DeepgramTTSService(TTSService):
)
yield frame
await self.push_frame(TTSStoppedFrame())
yield TTSStoppedFrame()
except Exception as e:
logger.exception(f"{self} exception: {e}")

View File

@@ -198,9 +198,7 @@ class ElevenLabsTTSService(WordTTSService):
self._url = url
self._settings = {
"sample_rate": sample_rate_from_output_format(params.output_format),
"language": language_to_elevenlabs_language(params.language)
if params.language
else "en",
"language": params.language if params.language else Language.EN,
"output_format": params.output_format,
"optimize_streaming_latency": params.optimize_streaming_latency,
"stability": params.stability,
@@ -294,14 +292,14 @@ class ElevenLabsTTSService(WordTTSService):
if self._settings["optimize_streaming_latency"]:
url += f"&optimize_streaming_latency={self._settings['optimize_streaming_latency']}"
# language can only be used with the 'eleven_turbo_v2_5' model
if self._settings["language"]:
if model == "eleven_turbo_v2_5":
url += f"&language_code={self._settings['language']}"
else:
logger.debug(
f"Language code [{self._settings['language']}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model."
)
# Language can only be used with the 'eleven_turbo_v2_5' model
language = language_to_elevenlabs_language(self._settings["language"])
if model == "eleven_turbo_v2_5":
url += f"&language_code={language}"
else:
logger.debug(
f"Language code [{language}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model."
)
self._websocket = await websockets.connect(url)
self._receive_task = self.get_event_loop().create_task(self._receive_task_handler())
@@ -387,8 +385,8 @@ class ElevenLabsTTSService(WordTTSService):
try:
if not self._started:
await self.push_frame(TTSStartedFrame())
await self.start_ttfb_metrics()
yield TTSStartedFrame()
self._started = True
self._cumulative_time = 0
@@ -396,7 +394,7 @@ class ElevenLabsTTSService(WordTTSService):
await self.start_tts_usage_metrics(text)
except Exception as e:
logger.error(f"{self} error sending message: {e}")
await self.push_frame(TTSStoppedFrame())
yield TTSStoppedFrame()
await self._disconnect()
await self._connect()
return

View File

@@ -37,17 +37,17 @@ except ModuleNotFoundError as e:
def language_to_gladia_language(language: Language) -> str | None:
match language:
case Language.BG:
return "bg"
return "bulgarian"
case Language.CA:
return "ca"
return "catalan"
case Language.ZH:
return "zh"
return "chinese"
case Language.CS:
return "cs"
return "czech"
case Language.DA:
return "da"
return "danish"
case Language.NL:
return "nl"
return "dutch"
case (
Language.EN
| Language.EN_US
@@ -56,59 +56,59 @@ def language_to_gladia_language(language: Language) -> str | None:
| Language.EN_NZ
| Language.EN_IN
):
return "en"
return "english"
case Language.ET:
return "et"
return "estonian"
case Language.FI:
return "fi"
return "finnish"
case Language.FR | Language.FR_CA:
return "fr"
return "french"
case Language.DE | Language.DE_CH:
return "de"
return "german"
case Language.EL:
return "el"
return "greek"
case Language.HI:
return "hi"
return "hindi"
case Language.HU:
return "hu"
return "hungarian"
case Language.ID:
return "id"
return "indonesian"
case Language.IT:
return "it"
return "italian"
case Language.JA:
return "ja"
return "japanese"
case Language.KO:
return "ko"
return "korean"
case Language.LV:
return "lv"
return "latvian"
case Language.LT:
return "lt"
return "lithuanian"
case Language.MS:
return "ms"
return "malay"
case Language.NO:
return "no"
return "norwegian"
case Language.PL:
return "pl"
return "polish"
case Language.PT | Language.PT_BR:
return "pt"
return "portuguese"
case Language.RO:
return "ro"
return "romanian"
case Language.RU:
return "ru"
return "russian"
case Language.SK:
return "sk"
return "slovak"
case Language.ES:
return "es"
return "spanish"
case Language.SV:
return "sv"
return "swedish"
case Language.TH:
return "th"
return "thai"
case Language.TR:
return "tr"
return "turkish"
case Language.UK:
return "uk"
return "ukrainian"
case Language.VI:
return "vi"
return "vietnamese"
return None
@@ -135,7 +135,7 @@ class GladiaSTTService(STTService):
self._url = url
self._settings = {
"sample_rate": params.sample_rate,
"language": language_to_gladia_language(params.language) if params.language else "en",
"language": params.language if params.language else Language.EN,
"transcription_hint": params.transcription_hint,
"endpointing": params.endpointing,
"prosody": params.prosody,
@@ -169,7 +169,7 @@ class GladiaSTTService(STTService):
"model_type": "fast",
"language_behaviour": "manual",
"sample_rate": self._settings["sample_rate"],
"language": self._settings["language"],
"language": language_to_gladia_language(self._settings["language"]),
"transcription_hint": self._settings["transcription_hint"],
"endpointing": self._settings["endpointing"],
"prosody": self._settings["prosody"],

View File

@@ -40,7 +40,7 @@ try:
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error(
"In order to use Google AI, you need to `pip install pipecat-ai[google]`. Also, set `GOOGLE_API_KEY` environment variable."
"In order to use Google AI, you need to `pip install pipecat-ai[google]`. Also, set the environment variable GOOGLE_API_KEY for the GoogleLLMService and GOOGLE_APPLICATION_CREDENTIALS for the GoogleTTSService."
)
raise Exception(f"Missing module: {e}")
@@ -261,9 +261,7 @@ class GoogleTTSService(TTSService):
"rate": params.rate,
"volume": params.volume,
"emphasis": params.emphasis,
"language": language_to_google_language(params.language)
if params.language
else "en-US",
"language": params.language if params.language else Language.EN,
"gender": params.gender,
"google_style": params.google_style,
}
@@ -287,8 +285,6 @@ class GoogleTTSService(TTSService):
elif credentials_path:
# Use service account JSON file if provided
creds = service_account.Credentials.from_service_account_file(credentials_path)
else:
raise ValueError("Either 'credentials' or 'credentials_path' must be provided.")
return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds)
@@ -300,8 +296,10 @@ class GoogleTTSService(TTSService):
# Voice tag
voice_attrs = [f"name='{self._voice_id}'"]
if self._settings["language"]:
voice_attrs.append(f"language='{self._settings['language']}'")
language = language_to_google_language(self._settings["language"])
voice_attrs.append(f"language='{language}'")
if self._settings["gender"]:
voice_attrs.append(f"gender='{self._settings['gender']}'")
ssml += f"<voice {' '.join(voice_attrs)}>"
@@ -363,7 +361,7 @@ class GoogleTTSService(TTSService):
await self.start_tts_usage_metrics(text)
await self.push_frame(TTSStartedFrame())
yield TTSStartedFrame()
# Skip the first 44 bytes to remove the WAV header
audio_content = response.audio_content[44:]
@@ -379,11 +377,11 @@ class GoogleTTSService(TTSService):
yield frame
await asyncio.sleep(0) # Allow other tasks to run
await self.push_frame(TTSStoppedFrame())
yield TTSStoppedFrame()
except Exception as e:
logger.exception(f"{self} error generating TTS: {e}")
error_message = f"TTS generation error: {str(e)}"
yield ErrorFrame(error=error_message)
finally:
await self.push_frame(TTSStoppedFrame())
yield TTSStoppedFrame()

View File

@@ -82,7 +82,7 @@ class LmntTTSService(TTSService):
"encoding": "pcm_s16le",
"sample_rate": sample_rate,
},
"language": language_to_lmnt_language(language) if language else "en",
"language": language,
}
self.set_voice(voice_id)
@@ -176,8 +176,8 @@ class LmntTTSService(TTSService):
await self._connect()
if not self._started:
await self.push_frame(TTSStartedFrame())
await self.start_ttfb_metrics()
yield TTSStartedFrame()
self._started = True
try:
@@ -186,7 +186,7 @@ class LmntTTSService(TTSService):
await self.start_tts_usage_metrics(text)
except Exception as e:
logger.error(f"{self} error sending message: {e}")
await self.push_frame(TTSStoppedFrame())
yield TTSStoppedFrame()
await self._disconnect()
await self._connect()
return

View File

@@ -426,13 +426,13 @@ class OpenAITTSService(TTSService):
await self.start_tts_usage_metrics(text)
await self.push_frame(TTSStartedFrame())
yield TTSStartedFrame()
async for chunk in r.iter_bytes(8192):
if len(chunk) > 0:
await self.stop_ttfb_metrics()
frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
yield frame
await self.push_frame(TTSStoppedFrame())
yield TTSStoppedFrame()
except BadRequestError as e:
logger.exception(f"{self} error generating TTS: {e}")

View File

@@ -75,7 +75,7 @@ class PlayHTTTSService(TTSService):
await self.start_tts_usage_metrics(text)
await self.push_frame(TTSStartedFrame())
yield TTSStartedFrame()
async for chunk in playht_gen:
# skip the RIFF header.
if in_header:
@@ -95,6 +95,6 @@ class PlayHTTTSService(TTSService):
await self.stop_ttfb_metrics()
frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
yield frame
await self.push_frame(TTSStoppedFrame())
yield TTSStoppedFrame()
except Exception as e:
logger.exception(f"{self} error generating TTS: {e}")

View File

@@ -94,7 +94,7 @@ class XTTSService(TTSService):
super().__init__(**kwargs)
self._settings = {
"language": language_to_xtts_language(language) if language else "en",
"language": language,
"base_url": base_url,
}
self.set_voice(voice_id)
@@ -131,9 +131,11 @@ class XTTSService(TTSService):
url = self._settings["base_url"] + "/tts_stream"
language = language_to_xtts_language(self._settings["language"])
payload = {
"text": text.replace(".", "").replace("*", ""),
"language": self._settings["language"],
"language": language,
"speaker_embedding": embeddings["speaker_embedding"],
"gpt_cond_latent": embeddings["gpt_cond_latent"],
"add_wav_header": False,
@@ -151,7 +153,7 @@ class XTTSService(TTSService):
await self.start_tts_usage_metrics(text)
await self.push_frame(TTSStartedFrame())
yield TTSStartedFrame()
buffer = bytearray()
async for chunk in r.content.iter_chunked(1024):
@@ -187,4 +189,4 @@ class XTTSService(TTSService):
frame = TTSAudioRawFrame(resampled_audio_bytes, 16000, 1)
yield frame
await self.push_frame(TTSStoppedFrame())
yield TTSStoppedFrame()