Merge pull request #539 from pipecat-ai/aleix/pipecat-0.0.42-fixes
pipecat 0.0.42 fixes
This commit is contained in:
@@ -53,7 +53,6 @@ async def main():
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
|
||||
tts = GoogleTTSService(
|
||||
credentials=os.getenv("GOOGLE_CREDENTIALS"),
|
||||
voice_id="en-US-Neural2-J",
|
||||
params=GoogleTTSService.InputParams(language="en-US", rate="1.05"),
|
||||
)
|
||||
|
||||
2497
examples/storytelling-chatbot/frontend/package-lock.json
generated
2497
examples/storytelling-chatbot/frontend/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -11,28 +11,28 @@
|
||||
"dependencies": {
|
||||
"@daily-co/daily-js": "^0.62.0",
|
||||
"@daily-co/daily-react": "^0.18.0",
|
||||
"@radix-ui/react-select": "^2.0.0",
|
||||
"@radix-ui/react-select": "^2.1.2",
|
||||
"@radix-ui/react-slot": "^1.0.2",
|
||||
"@tabler/icons-react": "^3.1.0",
|
||||
"@tabler/icons-react": "^3.19.0",
|
||||
"class-variance-authority": "^0.7.0",
|
||||
"clsx": "^2.1.0",
|
||||
"framer-motion": "^11.0.27",
|
||||
"next": "14.1.4",
|
||||
"react": "^18",
|
||||
"react-dom": "^18",
|
||||
"clsx": "^2.1.1",
|
||||
"framer-motion": "^11.9.0",
|
||||
"next": "^14.2.14",
|
||||
"react": "^18.3.1",
|
||||
"react-dom": "^18.3.1",
|
||||
"recoil": "^0.7.7",
|
||||
"tailwind-merge": "^2.2.2",
|
||||
"tailwind-merge": "^2.5.2",
|
||||
"tailwindcss-animate": "^1.0.7"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^20",
|
||||
"@types/react": "^18",
|
||||
"@types/react-dom": "^18",
|
||||
"autoprefixer": "^10.0.1",
|
||||
"eslint": "^8",
|
||||
"@types/node": "^20.16.10",
|
||||
"@types/react": "^18.3.11",
|
||||
"@types/react-dom": "^18.3.0",
|
||||
"autoprefixer": "^10.4.20",
|
||||
"eslint": "^8.57.1",
|
||||
"eslint-config-next": "14.1.4",
|
||||
"postcss": "^8",
|
||||
"tailwindcss": "^3.4.3",
|
||||
"typescript": "^5"
|
||||
"postcss": "^8.4.47",
|
||||
"tailwindcss": "^3.4.13",
|
||||
"typescript": "^5.6.2"
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -143,7 +143,7 @@ async def main(room_url, token=None):
|
||||
|
||||
@transport.event_handler("on_participant_left")
|
||||
async def on_participant_left(transport, participant, reason):
|
||||
intro_task.queue_frame(EndFrame())
|
||||
await intro_task.queue_frame(EndFrame())
|
||||
await main_task.queue_frame(EndFrame())
|
||||
|
||||
@transport.event_handler("on_call_state_updated")
|
||||
|
||||
@@ -127,6 +127,8 @@ class OpenAILLMContext:
|
||||
if item["type"] == "image_url":
|
||||
if item["image_url"]["url"].startswith("data:image/"):
|
||||
item["image_url"]["url"] = "data:image/..."
|
||||
if "mime_type" in msg and msg["mime_type"].startswith("image/"):
|
||||
msg["data"] = "..."
|
||||
msgs.append(msg)
|
||||
return json.dumps(msgs)
|
||||
|
||||
|
||||
@@ -121,7 +121,7 @@ class AWSTTSService(TTSService):
|
||||
self._settings = {
|
||||
"sample_rate": sample_rate,
|
||||
"engine": params.engine,
|
||||
"language": language_to_aws_language(params.language) if params.language else "en-US",
|
||||
"language": params.language if params.language else Language.EN,
|
||||
"pitch": params.pitch,
|
||||
"rate": params.rate,
|
||||
"volume": params.volume,
|
||||
@@ -135,8 +135,8 @@ class AWSTTSService(TTSService):
|
||||
def _construct_ssml(self, text: str) -> str:
|
||||
ssml = "<speak>"
|
||||
|
||||
if self._settings["language"]:
|
||||
ssml += f"<lang xml:lang='{self._settings['language']}'>"
|
||||
language = language_to_aws_language(self._settings["language"])
|
||||
ssml += f"<lang xml:lang='{language}'>"
|
||||
|
||||
prosody_attrs = []
|
||||
# Prosody tags are only supported for standard and neural engines
|
||||
@@ -158,8 +158,7 @@ class AWSTTSService(TTSService):
|
||||
if prosody_attrs:
|
||||
ssml += "</prosody>"
|
||||
|
||||
if self._settings["language"]:
|
||||
ssml += "</lang>"
|
||||
ssml += "</lang>"
|
||||
|
||||
ssml += "</speak>"
|
||||
|
||||
@@ -190,7 +189,7 @@ class AWSTTSService(TTSService):
|
||||
|
||||
await self.start_tts_usage_metrics(text)
|
||||
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
yield TTSStartedFrame()
|
||||
|
||||
if "AudioStream" in response:
|
||||
with response["AudioStream"] as stream:
|
||||
@@ -203,7 +202,7 @@ class AWSTTSService(TTSService):
|
||||
frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
|
||||
yield frame
|
||||
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
yield TTSStoppedFrame()
|
||||
|
||||
except (BotoCoreError, ClientError) as error:
|
||||
logger.exception(f"{self} error generating TTS: {error}")
|
||||
@@ -211,4 +210,4 @@ class AWSTTSService(TTSService):
|
||||
yield ErrorFrame(error=error_message)
|
||||
|
||||
finally:
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
yield TTSStoppedFrame()
|
||||
|
||||
@@ -27,6 +27,7 @@ from pipecat.frames.frames import (
|
||||
)
|
||||
from pipecat.services.ai_services import ImageGenService, STTService, TTSService
|
||||
from pipecat.services.openai import BaseOpenAILLMService
|
||||
from pipecat.transcriptions import language
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
@@ -191,7 +192,7 @@ class AzureTTSService(TTSService):
|
||||
self._settings = {
|
||||
"sample_rate": sample_rate,
|
||||
"emphasis": params.emphasis,
|
||||
"language": language_to_azure_language(params.language) if params.language else "en-US",
|
||||
"language": params.language if params.language else Language.EN,
|
||||
"pitch": params.pitch,
|
||||
"rate": params.rate,
|
||||
"role": params.role,
|
||||
@@ -206,8 +207,9 @@ class AzureTTSService(TTSService):
|
||||
return True
|
||||
|
||||
def _construct_ssml(self, text: str) -> str:
|
||||
language = language_to_azure_language(self._settings["language"])
|
||||
ssml = (
|
||||
f"<speak version='1.0' xml:lang='{self._settings['language']}' "
|
||||
f"<speak version='1.0' xml:lang='{language}' "
|
||||
"xmlns='http://www.w3.org/2001/10/synthesis' "
|
||||
"xmlns:mstts='http://www.w3.org/2001/mstts'>"
|
||||
f"<voice name='{self._voice_id}'>"
|
||||
@@ -261,14 +263,14 @@ class AzureTTSService(TTSService):
|
||||
if result.reason == ResultReason.SynthesizingAudioCompleted:
|
||||
await self.start_tts_usage_metrics(text)
|
||||
await self.stop_ttfb_metrics()
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
yield TTSStartedFrame()
|
||||
# Azure always sends a 44-byte header. Strip it off.
|
||||
yield TTSAudioRawFrame(
|
||||
audio=result.audio_data[44:],
|
||||
sample_rate=self._settings["sample_rate"],
|
||||
num_channels=1,
|
||||
)
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
yield TTSStoppedFrame()
|
||||
elif result.reason == ResultReason.Canceled:
|
||||
cancellation_details = result.cancellation_details
|
||||
logger.warning(f"Speech synthesis canceled: {cancellation_details.reason}")
|
||||
|
||||
@@ -106,7 +106,7 @@ class CartesiaTTSService(WordTTSService):
|
||||
"encoding": params.encoding,
|
||||
"sample_rate": params.sample_rate,
|
||||
},
|
||||
"language": language_to_cartesia_language(params.language) if params.language else "en",
|
||||
"language": params.language if params.language else Language.EN,
|
||||
"speed": params.speed,
|
||||
"emotion": params.emotion,
|
||||
}
|
||||
@@ -146,7 +146,7 @@ class CartesiaTTSService(WordTTSService):
|
||||
"model_id": self.model_name,
|
||||
"voice": voice_config,
|
||||
"output_format": self._settings["output_format"],
|
||||
"language": self._settings["language"],
|
||||
"language": language_to_cartesia_language(self._settings["language"]),
|
||||
"add_timestamps": add_timestamps,
|
||||
}
|
||||
return json.dumps(msg)
|
||||
@@ -255,8 +255,8 @@ class CartesiaTTSService(WordTTSService):
|
||||
await self._connect()
|
||||
|
||||
if not self._context_id:
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
await self.start_ttfb_metrics()
|
||||
yield TTSStartedFrame()
|
||||
self._context_id = str(uuid.uuid4())
|
||||
|
||||
msg = self._build_msg(text=text)
|
||||
@@ -266,7 +266,7 @@ class CartesiaTTSService(WordTTSService):
|
||||
await self.start_tts_usage_metrics(text)
|
||||
except Exception as e:
|
||||
logger.error(f"{self} error sending message: {e}")
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
yield TTSStoppedFrame()
|
||||
await self._disconnect()
|
||||
await self._connect()
|
||||
return
|
||||
@@ -303,7 +303,7 @@ class CartesiaHttpTTSService(TTSService):
|
||||
"encoding": params.encoding,
|
||||
"sample_rate": params.sample_rate,
|
||||
},
|
||||
"language": language_to_cartesia_language(params.language) if params.language else None,
|
||||
"language": params.language if params.language else Language.EN,
|
||||
"speed": params.speed,
|
||||
"emotion": params.emotion,
|
||||
}
|
||||
@@ -315,11 +315,6 @@ class CartesiaHttpTTSService(TTSService):
|
||||
def can_generate_metrics(self) -> bool:
|
||||
return True
|
||||
|
||||
async def set_model(self, model: str):
|
||||
logger.debug(f"Switching TTS model to: [{model}]")
|
||||
self._model_id = model
|
||||
await super().set_model(model)
|
||||
|
||||
async def stop(self, frame: EndFrame):
|
||||
await super().stop(frame)
|
||||
await self._client.close()
|
||||
@@ -331,8 +326,8 @@ class CartesiaHttpTTSService(TTSService):
|
||||
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
|
||||
logger.debug(f"Generating TTS: [{text}]")
|
||||
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
await self.start_ttfb_metrics()
|
||||
yield TTSStartedFrame()
|
||||
|
||||
try:
|
||||
voice_controls = None
|
||||
@@ -344,11 +339,11 @@ class CartesiaHttpTTSService(TTSService):
|
||||
voice_controls["emotion"] = self._settings["emotion"]
|
||||
|
||||
output = await self._client.tts.sse(
|
||||
model_id=self._model_id,
|
||||
model_id=self._model_name,
|
||||
transcript=text,
|
||||
voice_id=self._voice_id,
|
||||
output_format=self._settings["output_format"],
|
||||
language=self._settings["language"],
|
||||
language=language_to_cartesia_language(self._settings["language"]),
|
||||
stream=False,
|
||||
_experimental_voice_controls=voice_controls,
|
||||
)
|
||||
@@ -365,4 +360,4 @@ class CartesiaHttpTTSService(TTSService):
|
||||
logger.error(f"{self} exception: {e}")
|
||||
|
||||
await self.start_tts_usage_metrics(text)
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
yield TTSStoppedFrame()
|
||||
|
||||
@@ -84,7 +84,7 @@ class DeepgramTTSService(TTSService):
|
||||
)
|
||||
|
||||
await self.start_tts_usage_metrics(text)
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
yield TTSStartedFrame()
|
||||
|
||||
# The response.stream_memory is already a BytesIO object
|
||||
audio_buffer = response.stream_memory
|
||||
@@ -105,7 +105,7 @@ class DeepgramTTSService(TTSService):
|
||||
)
|
||||
yield frame
|
||||
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
yield TTSStoppedFrame()
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"{self} exception: {e}")
|
||||
|
||||
@@ -198,9 +198,7 @@ class ElevenLabsTTSService(WordTTSService):
|
||||
self._url = url
|
||||
self._settings = {
|
||||
"sample_rate": sample_rate_from_output_format(params.output_format),
|
||||
"language": language_to_elevenlabs_language(params.language)
|
||||
if params.language
|
||||
else "en",
|
||||
"language": params.language if params.language else Language.EN,
|
||||
"output_format": params.output_format,
|
||||
"optimize_streaming_latency": params.optimize_streaming_latency,
|
||||
"stability": params.stability,
|
||||
@@ -294,14 +292,14 @@ class ElevenLabsTTSService(WordTTSService):
|
||||
if self._settings["optimize_streaming_latency"]:
|
||||
url += f"&optimize_streaming_latency={self._settings['optimize_streaming_latency']}"
|
||||
|
||||
# language can only be used with the 'eleven_turbo_v2_5' model
|
||||
if self._settings["language"]:
|
||||
if model == "eleven_turbo_v2_5":
|
||||
url += f"&language_code={self._settings['language']}"
|
||||
else:
|
||||
logger.debug(
|
||||
f"Language code [{self._settings['language']}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model."
|
||||
)
|
||||
# Language can only be used with the 'eleven_turbo_v2_5' model
|
||||
language = language_to_elevenlabs_language(self._settings["language"])
|
||||
if model == "eleven_turbo_v2_5":
|
||||
url += f"&language_code={language}"
|
||||
else:
|
||||
logger.debug(
|
||||
f"Language code [{language}] not applied. Language codes can only be used with the 'eleven_turbo_v2_5' model."
|
||||
)
|
||||
|
||||
self._websocket = await websockets.connect(url)
|
||||
self._receive_task = self.get_event_loop().create_task(self._receive_task_handler())
|
||||
@@ -387,8 +385,8 @@ class ElevenLabsTTSService(WordTTSService):
|
||||
|
||||
try:
|
||||
if not self._started:
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
await self.start_ttfb_metrics()
|
||||
yield TTSStartedFrame()
|
||||
self._started = True
|
||||
self._cumulative_time = 0
|
||||
|
||||
@@ -396,7 +394,7 @@ class ElevenLabsTTSService(WordTTSService):
|
||||
await self.start_tts_usage_metrics(text)
|
||||
except Exception as e:
|
||||
logger.error(f"{self} error sending message: {e}")
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
yield TTSStoppedFrame()
|
||||
await self._disconnect()
|
||||
await self._connect()
|
||||
return
|
||||
|
||||
@@ -37,17 +37,17 @@ except ModuleNotFoundError as e:
|
||||
def language_to_gladia_language(language: Language) -> str | None:
|
||||
match language:
|
||||
case Language.BG:
|
||||
return "bg"
|
||||
return "bulgarian"
|
||||
case Language.CA:
|
||||
return "ca"
|
||||
return "catalan"
|
||||
case Language.ZH:
|
||||
return "zh"
|
||||
return "chinese"
|
||||
case Language.CS:
|
||||
return "cs"
|
||||
return "czech"
|
||||
case Language.DA:
|
||||
return "da"
|
||||
return "danish"
|
||||
case Language.NL:
|
||||
return "nl"
|
||||
return "dutch"
|
||||
case (
|
||||
Language.EN
|
||||
| Language.EN_US
|
||||
@@ -56,59 +56,59 @@ def language_to_gladia_language(language: Language) -> str | None:
|
||||
| Language.EN_NZ
|
||||
| Language.EN_IN
|
||||
):
|
||||
return "en"
|
||||
return "english"
|
||||
case Language.ET:
|
||||
return "et"
|
||||
return "estonian"
|
||||
case Language.FI:
|
||||
return "fi"
|
||||
return "finnish"
|
||||
case Language.FR | Language.FR_CA:
|
||||
return "fr"
|
||||
return "french"
|
||||
case Language.DE | Language.DE_CH:
|
||||
return "de"
|
||||
return "german"
|
||||
case Language.EL:
|
||||
return "el"
|
||||
return "greek"
|
||||
case Language.HI:
|
||||
return "hi"
|
||||
return "hindi"
|
||||
case Language.HU:
|
||||
return "hu"
|
||||
return "hungarian"
|
||||
case Language.ID:
|
||||
return "id"
|
||||
return "indonesian"
|
||||
case Language.IT:
|
||||
return "it"
|
||||
return "italian"
|
||||
case Language.JA:
|
||||
return "ja"
|
||||
return "japanese"
|
||||
case Language.KO:
|
||||
return "ko"
|
||||
return "korean"
|
||||
case Language.LV:
|
||||
return "lv"
|
||||
return "latvian"
|
||||
case Language.LT:
|
||||
return "lt"
|
||||
return "lithuanian"
|
||||
case Language.MS:
|
||||
return "ms"
|
||||
return "malay"
|
||||
case Language.NO:
|
||||
return "no"
|
||||
return "norwegian"
|
||||
case Language.PL:
|
||||
return "pl"
|
||||
return "polish"
|
||||
case Language.PT | Language.PT_BR:
|
||||
return "pt"
|
||||
return "portuguese"
|
||||
case Language.RO:
|
||||
return "ro"
|
||||
return "romanian"
|
||||
case Language.RU:
|
||||
return "ru"
|
||||
return "russian"
|
||||
case Language.SK:
|
||||
return "sk"
|
||||
return "slovak"
|
||||
case Language.ES:
|
||||
return "es"
|
||||
return "spanish"
|
||||
case Language.SV:
|
||||
return "sv"
|
||||
return "swedish"
|
||||
case Language.TH:
|
||||
return "th"
|
||||
return "thai"
|
||||
case Language.TR:
|
||||
return "tr"
|
||||
return "turkish"
|
||||
case Language.UK:
|
||||
return "uk"
|
||||
return "ukrainian"
|
||||
case Language.VI:
|
||||
return "vi"
|
||||
return "vietnamese"
|
||||
return None
|
||||
|
||||
|
||||
@@ -135,7 +135,7 @@ class GladiaSTTService(STTService):
|
||||
self._url = url
|
||||
self._settings = {
|
||||
"sample_rate": params.sample_rate,
|
||||
"language": language_to_gladia_language(params.language) if params.language else "en",
|
||||
"language": params.language if params.language else Language.EN,
|
||||
"transcription_hint": params.transcription_hint,
|
||||
"endpointing": params.endpointing,
|
||||
"prosody": params.prosody,
|
||||
@@ -169,7 +169,7 @@ class GladiaSTTService(STTService):
|
||||
"model_type": "fast",
|
||||
"language_behaviour": "manual",
|
||||
"sample_rate": self._settings["sample_rate"],
|
||||
"language": self._settings["language"],
|
||||
"language": language_to_gladia_language(self._settings["language"]),
|
||||
"transcription_hint": self._settings["transcription_hint"],
|
||||
"endpointing": self._settings["endpointing"],
|
||||
"prosody": self._settings["prosody"],
|
||||
|
||||
@@ -40,7 +40,7 @@ try:
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error(
|
||||
"In order to use Google AI, you need to `pip install pipecat-ai[google]`. Also, set `GOOGLE_API_KEY` environment variable."
|
||||
"In order to use Google AI, you need to `pip install pipecat-ai[google]`. Also, set the environment variable GOOGLE_API_KEY for the GoogleLLMService and GOOGLE_APPLICATION_CREDENTIALS for the GoogleTTSService."
|
||||
)
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
@@ -261,9 +261,7 @@ class GoogleTTSService(TTSService):
|
||||
"rate": params.rate,
|
||||
"volume": params.volume,
|
||||
"emphasis": params.emphasis,
|
||||
"language": language_to_google_language(params.language)
|
||||
if params.language
|
||||
else "en-US",
|
||||
"language": params.language if params.language else Language.EN,
|
||||
"gender": params.gender,
|
||||
"google_style": params.google_style,
|
||||
}
|
||||
@@ -287,8 +285,6 @@ class GoogleTTSService(TTSService):
|
||||
elif credentials_path:
|
||||
# Use service account JSON file if provided
|
||||
creds = service_account.Credentials.from_service_account_file(credentials_path)
|
||||
else:
|
||||
raise ValueError("Either 'credentials' or 'credentials_path' must be provided.")
|
||||
|
||||
return texttospeech_v1.TextToSpeechAsyncClient(credentials=creds)
|
||||
|
||||
@@ -300,8 +296,10 @@ class GoogleTTSService(TTSService):
|
||||
|
||||
# Voice tag
|
||||
voice_attrs = [f"name='{self._voice_id}'"]
|
||||
if self._settings["language"]:
|
||||
voice_attrs.append(f"language='{self._settings['language']}'")
|
||||
|
||||
language = language_to_google_language(self._settings["language"])
|
||||
voice_attrs.append(f"language='{language}'")
|
||||
|
||||
if self._settings["gender"]:
|
||||
voice_attrs.append(f"gender='{self._settings['gender']}'")
|
||||
ssml += f"<voice {' '.join(voice_attrs)}>"
|
||||
@@ -363,7 +361,7 @@ class GoogleTTSService(TTSService):
|
||||
|
||||
await self.start_tts_usage_metrics(text)
|
||||
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
yield TTSStartedFrame()
|
||||
|
||||
# Skip the first 44 bytes to remove the WAV header
|
||||
audio_content = response.audio_content[44:]
|
||||
@@ -379,11 +377,11 @@ class GoogleTTSService(TTSService):
|
||||
yield frame
|
||||
await asyncio.sleep(0) # Allow other tasks to run
|
||||
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
yield TTSStoppedFrame()
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"{self} error generating TTS: {e}")
|
||||
error_message = f"TTS generation error: {str(e)}"
|
||||
yield ErrorFrame(error=error_message)
|
||||
finally:
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
yield TTSStoppedFrame()
|
||||
|
||||
@@ -82,7 +82,7 @@ class LmntTTSService(TTSService):
|
||||
"encoding": "pcm_s16le",
|
||||
"sample_rate": sample_rate,
|
||||
},
|
||||
"language": language_to_lmnt_language(language) if language else "en",
|
||||
"language": language,
|
||||
}
|
||||
|
||||
self.set_voice(voice_id)
|
||||
@@ -176,8 +176,8 @@ class LmntTTSService(TTSService):
|
||||
await self._connect()
|
||||
|
||||
if not self._started:
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
await self.start_ttfb_metrics()
|
||||
yield TTSStartedFrame()
|
||||
self._started = True
|
||||
|
||||
try:
|
||||
@@ -186,7 +186,7 @@ class LmntTTSService(TTSService):
|
||||
await self.start_tts_usage_metrics(text)
|
||||
except Exception as e:
|
||||
logger.error(f"{self} error sending message: {e}")
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
yield TTSStoppedFrame()
|
||||
await self._disconnect()
|
||||
await self._connect()
|
||||
return
|
||||
|
||||
@@ -426,13 +426,13 @@ class OpenAITTSService(TTSService):
|
||||
|
||||
await self.start_tts_usage_metrics(text)
|
||||
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
yield TTSStartedFrame()
|
||||
async for chunk in r.iter_bytes(8192):
|
||||
if len(chunk) > 0:
|
||||
await self.stop_ttfb_metrics()
|
||||
frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
|
||||
yield frame
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
yield TTSStoppedFrame()
|
||||
except BadRequestError as e:
|
||||
logger.exception(f"{self} error generating TTS: {e}")
|
||||
|
||||
|
||||
@@ -75,7 +75,7 @@ class PlayHTTTSService(TTSService):
|
||||
|
||||
await self.start_tts_usage_metrics(text)
|
||||
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
yield TTSStartedFrame()
|
||||
async for chunk in playht_gen:
|
||||
# skip the RIFF header.
|
||||
if in_header:
|
||||
@@ -95,6 +95,6 @@ class PlayHTTTSService(TTSService):
|
||||
await self.stop_ttfb_metrics()
|
||||
frame = TTSAudioRawFrame(chunk, self._settings["sample_rate"], 1)
|
||||
yield frame
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
yield TTSStoppedFrame()
|
||||
except Exception as e:
|
||||
logger.exception(f"{self} error generating TTS: {e}")
|
||||
|
||||
@@ -94,7 +94,7 @@ class XTTSService(TTSService):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self._settings = {
|
||||
"language": language_to_xtts_language(language) if language else "en",
|
||||
"language": language,
|
||||
"base_url": base_url,
|
||||
}
|
||||
self.set_voice(voice_id)
|
||||
@@ -131,9 +131,11 @@ class XTTSService(TTSService):
|
||||
|
||||
url = self._settings["base_url"] + "/tts_stream"
|
||||
|
||||
language = language_to_xtts_language(self._settings["language"])
|
||||
|
||||
payload = {
|
||||
"text": text.replace(".", "").replace("*", ""),
|
||||
"language": self._settings["language"],
|
||||
"language": language,
|
||||
"speaker_embedding": embeddings["speaker_embedding"],
|
||||
"gpt_cond_latent": embeddings["gpt_cond_latent"],
|
||||
"add_wav_header": False,
|
||||
@@ -151,7 +153,7 @@ class XTTSService(TTSService):
|
||||
|
||||
await self.start_tts_usage_metrics(text)
|
||||
|
||||
await self.push_frame(TTSStartedFrame())
|
||||
yield TTSStartedFrame()
|
||||
|
||||
buffer = bytearray()
|
||||
async for chunk in r.content.iter_chunked(1024):
|
||||
@@ -187,4 +189,4 @@ class XTTSService(TTSService):
|
||||
frame = TTSAudioRawFrame(resampled_audio_bytes, 16000, 1)
|
||||
yield frame
|
||||
|
||||
await self.push_frame(TTSStoppedFrame())
|
||||
yield TTSStoppedFrame()
|
||||
|
||||
Reference in New Issue
Block a user