From 23a4933af9d58828efd62b867f63b6045b9f632c Mon Sep 17 00:00:00 2001 From: Adnan Siddiquei Date: Wed, 12 Mar 2025 17:15:31 +0000 Subject: [PATCH 1/7] Initial implementation of Neuphonic service. A TTS provider. --- dot-env.template | 3 + pyproject.toml | 1 + src/pipecat/services/neuphonic.py | 364 ++++++++++++++++++++++++++++++ 3 files changed, 368 insertions(+) create mode 100644 src/pipecat/services/neuphonic.py diff --git a/dot-env.template b/dot-env.template index e31681235..331eba79c 100644 --- a/dot-env.template +++ b/dot-env.template @@ -29,6 +29,9 @@ DAILY_SAMPLE_ROOM_URL=https://... ELEVENLABS_API_KEY=... ELEVENLABS_VOICE_ID=... +# Neuphonic +NEUPHONIC_API_TOKEN=... + # Fal FAL_KEY=... diff --git a/pyproject.toml b/pyproject.toml index dd58b1bfe..b5a870941 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ aws = [ "boto3~=1.35.99" ] azure = [ "azure-cognitiveservices-speech~=1.42.0"] canonical = [ "aiofiles~=24.1.0" ] cartesia = [ "cartesia~=1.3.1", "websockets~=13.1" ] +neuphonic = [ "pyneuphonic~=1.5.10", "websockets>=12.0,<14.0" ] cerebras = [] deepseek = [] daily = [ "daily-python~=0.15.0" ] diff --git a/src/pipecat/services/neuphonic.py b/src/pipecat/services/neuphonic.py new file mode 100644 index 000000000..b14c325f8 --- /dev/null +++ b/src/pipecat/services/neuphonic.py @@ -0,0 +1,364 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import base64 +import json +from typing import Any, AsyncGenerator, Mapping, Optional + +from loguru import logger +from pydantic import BaseModel + +from pipecat.frames.frames import ( + BotStoppedSpeakingFrame, + CancelFrame, + EndFrame, + ErrorFrame, + Frame, + LLMFullResponseEndFrame, + StartFrame, + StartInterruptionFrame, + TTSAudioRawFrame, + TTSSpeakFrame, + TTSStartedFrame, + TTSStoppedFrame, +) +from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.ai_services import TTSService, WordTTSService +from pipecat.services.websocket_service import WebsocketService +from pipecat.transcriptions.language import Language + +# See .env.example for Neuphonic configuration needed +try: + import websockets + from pyneuphonic import Neuphonic, TTSConfig +except ModuleNotFoundError as e: + logger.error(f"Exception: {e}") + logger.error( + "In order to use Neuphonic, you need to `pip install pipecat-ai[neuphonic]`. Also, set `NEUPHONIC_API_TOKEN` environment variable." + ) + raise Exception(f"Missing module: {e}") + +# Models that support language codes +NEUPHONIC_MULTILINGUAL_MODELS = { + "neu_fast", + "neu_hq", +} + + +def language_to_neuphonic_lang_code(language: Language) -> Optional[str]: + BASE_LANGUAGES = { + Language.DE: "de", + Language.EN: "en", + Language.ES: "es", + Language.NL: "nl", + Language.AR: "ar", + } + + result = BASE_LANGUAGES.get(language) + + # If not found in base languages, try to find the base language from a variant + if not result: + # Convert enum value to string and get the base language part (e.g. es-ES -> es) + lang_str = str(language.value) + base_code = lang_str.split("-")[0].lower() + # Look up the base code in our supported languages + result = base_code if base_code in BASE_LANGUAGES.values() else None + + return result + + +class NeuphonicTTSService(WordTTSService, WebsocketService): + class InputParams(BaseModel): + language: Optional[Language] = Language.EN + speed: Optional[float] = 1.0 + + def __init__( + self, + *, + api_key: str, + voice_id: Optional[str] = None, + model: str = "neu_hq", + url: str = "wss://api.neuphonic.com", + sample_rate: Optional[int] = 22050, + encoding: str = "pcm_linear", + params: InputParams = InputParams(), + **kwargs, + ): + WordTTSService.__init__( + self, + aggregate_sentences=True, + push_text_frames=False, + push_stop_frames=True, + stop_frame_timeout_s=2.0, + sample_rate=sample_rate, + **kwargs, + ) + WebsocketService.__init__(self) + + self._api_key = api_key + self._url = url + self._settings = { + "lang_code": self.language_to_service_language(params.language), + "speed": params.speed, + "encoding": encoding, + "model": model, + "sampling_rate": sample_rate, + } + self.set_model_name(model) + self.set_voice(voice_id) + + # Indicates if we have sent TTSStartedFrame. It will reset to False when + # there's an interruption or TTSStoppedFrame. + self._started = False + self._cumulative_time = 0 + + def can_generate_metrics(self) -> bool: + return True + + def language_to_service_language(self, language: Language) -> Optional[str]: + return language_to_neuphonic_lang_code(language) + + async def set_model(self, model: str): + await super().set_model(model) + logger.info(f"Switching TTS model to: [{model}]") + await self._disconnect() + await self._connect() + + async def _update_settings(self, settings: Mapping[str, Any]): + if "voice_id" in settings: + self.set_voice(settings["voice_id"]) + + await super()._update_settings(settings) + await self._disconnect() + await self._connect() + logger.info(f"Switching TTS to settings: [{self._settings}]") + + async def start(self, frame: StartFrame): + await super().start(frame) + await self._connect() + + async def stop(self, frame: EndFrame): + await super().stop(frame) + await self._disconnect() + + async def cancel(self, frame: CancelFrame): + await super().cancel(frame) + await self._disconnect() + + async def flush_audio(self): + if self._websocket: + msg = {"text": ""} + await self._websocket.send(json.dumps(msg)) + + async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM): + await super().push_frame(frame, direction) + if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)): + self._started = False + if isinstance(frame, TTSStoppedFrame): + await self.add_word_timestamps([("LLMFullResponseEndFrame", 0), ("Reset", 0)]) + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + # If we received a TTSSpeakFrame and the LLM response included text (it + # might be that it's only a function calling response) we pause + # processing more frames until we receive a BotStoppedSpeakingFrame. + if isinstance(frame, TTSSpeakFrame): + await self.pause_processing_frames() + elif isinstance(frame, LLMFullResponseEndFrame) and self._started: + await self.pause_processing_frames() + elif isinstance(frame, BotStoppedSpeakingFrame): + await self.resume_processing_frames() + + async def _connect(self): + await self._connect_websocket() + + self._receive_task = self.create_task(self._receive_task_handler(self.push_error)) + self._keepalive_task = self.create_task(self._keepalive_task_handler()) + + async def _disconnect(self): + if self._receive_task: + await self.cancel_task(self._receive_task) + self._receive_task = None + + if self._keepalive_task: + await self.cancel_task(self._keepalive_task) + self._keepalive_task = None + + await self._disconnect_websocket() + + async def _connect_websocket(self): + try: + logger.debug("Connecting to Neuphonic") + + tts_config = { + **self._settings, + "voice_id": self._voice_id, + "model": self.model_name, + } + + query_params = [f"api_key={self._api_key}"] + for key, value in tts_config.items(): + if value is not None: + query_params.append(f"{key}={value}") + + url = f"{self._url}/speak/{self._settings['lang_code']}?{'&'.join(query_params)}" + + self._websocket = await websockets.connect(url) + + except Exception as e: + logger.error(f"{self} initialization error: {e}") + self._websocket = None + + async def _disconnect_websocket(self): + try: + await self.stop_all_metrics() + + if self._websocket: + logger.debug("Disconnecting from Neuphonic") + await self._websocket.close() + self._websocket = None + + self._started = False + except Exception as e: + logger.error(f"{self} error closing websocket: {e}") + + async def _receive_messages(self): + async for message in self._websocket: + if isinstance(message, str): + msg = json.loads(message) + if msg.get("data", {}).get("audio") is not None: + await self.stop_ttfb_metrics() + self.start_word_timestamps() + + audio = base64.b64decode(msg["data"]["audio"]) + frame = TTSAudioRawFrame(audio, self.sample_rate, 1) + await self.push_frame(frame) + + async def _keepalive_task_handler(self): + while True: + await asyncio.sleep(10) + await self._send_text("") + + async def _send_text(self, text: str): + if self._websocket: + msg = {"text": text} + logger.debug(f"Sending text to websocket: {msg}") + await self._websocket.send(json.dumps(msg)) + + async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: + logger.debug(f"Generating TTS: [{text}]") + + try: + if not self._websocket: + await self._connect() + + try: + if not self._started: + await self.start_ttfb_metrics() + yield TTSStartedFrame() + self._started = True + self._cumulative_time = 0 + + await self._send_text(text) + await self.start_tts_usage_metrics(text) + except Exception as e: + logger.error(f"{self} error sending message: {e}") + yield TTSStoppedFrame() + await self._disconnect() + await self._connect() + return + yield None + except Exception as e: + logger.error(f"{self} exception: {e}") + + +class NeuphonicHttpTTSService(TTSService): + """Neuphonic Text-to-Speech service using HTTP streaming. + + Args: + api_key: Neuphonic API key + voice_id: ID of the voice to use + model: Neuphonic model to use (default: "neu_hq") + url: Base URL for the Neuphonic API (default: "https://api.neuphonic.com") + sample_rate: Sample rate for audio output (default: 22050Hz) + encoding: Audio encoding format (default: "pcm_linear") + params: Additional parameters for TTS generation including language and speed + **kwargs: Additional keyword arguments passed to the parent class + """ + + class InputParams(BaseModel): + language: Optional[Language] = Language.EN + speed: Optional[float] = 1.0 + + def __init__( + self, + *, + api_key: str, + voice_id: Optional[str] = None, + model: str = "neu_hq", + url: str = "https://api.neuphonic.com", + sample_rate: Optional[int] = 22050, + encoding: str = "pcm_linear", + params: InputParams = InputParams(), + **kwargs, + ): + super().__init__(sample_rate=sample_rate, **kwargs) + + self._api_key = api_key + self._url = url + self._settings = { + "lang_code": self.language_to_service_language(params.language), + "speed": params.speed, + "encoding": encoding, + "model": model, + "sampling_rate": sample_rate, + } + self.set_model_name(model) + self.set_voice(voice_id) + + def can_generate_metrics(self) -> bool: + return True + + async def start(self, frame: StartFrame): + await super().start(frame) + + async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: + """Generate speech from text using Neuphonic streaming API. + + Args: + text: The text to convert to speech + Yields: + Frames containing audio data and status information + """ + logger.debug(f"Generating TTS: [{text}]") + + client = Neuphonic(api_key=self._api_key, base_url=self._url.replace("https://", "")) + + sse = client.tts.AsyncSSEClient() + + try: + await self.start_ttfb_metrics() + response = sse.send( + text, TTSConfig(**self._settings, model=self.model_name, voice_id=self._voice_id) + ) + + await self.start_tts_usage_metrics(text) + yield TTSStartedFrame() + + async for message in response: + if response.status_code != 200: + logger.error(f"{self} error: {message.errors}") + yield ErrorFrame(error=f"Neuphonic API error: {message.errors}") + + await self.stop_ttfb_metrics() + yield TTSAudioRawFrame(message.data.audio, self.sample_rate, 1) + except Exception as e: + logger.error(f"Error in run_tts: {e}") + yield ErrorFrame(error=str(e)) + finally: + yield TTSStoppedFrame() From ead555eb4bca2c6db01beefc9ec2a097c9186109 Mon Sep 17 00:00:00 2001 From: Adnan Siddiquei Date: Wed, 12 Mar 2025 17:39:04 +0000 Subject: [PATCH 2/7] Corrected versions on pyproject.toml. --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b5a870941..e698ab134 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ aws = [ "boto3~=1.35.99" ] azure = [ "azure-cognitiveservices-speech~=1.42.0"] canonical = [ "aiofiles~=24.1.0" ] cartesia = [ "cartesia~=1.3.1", "websockets~=13.1" ] -neuphonic = [ "pyneuphonic~=1.5.10", "websockets>=12.0,<14.0" ] +neuphonic = [ "pyneuphonic~=1.5.11", "websockets~=13.1" ] cerebras = [] deepseek = [] daily = [ "daily-python~=0.15.0" ] From 0b9c4b2255e81753e4a37ef405950f6d00d0894e Mon Sep 17 00:00:00 2001 From: Adnan Siddiquei Date: Wed, 12 Mar 2025 18:04:48 +0000 Subject: [PATCH 3/7] Fixed a couple of small bugs. --- src/pipecat/services/neuphonic.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/pipecat/services/neuphonic.py b/src/pipecat/services/neuphonic.py index b14c325f8..f86280850 100644 --- a/src/pipecat/services/neuphonic.py +++ b/src/pipecat/services/neuphonic.py @@ -105,7 +105,6 @@ class NeuphonicTTSService(WordTTSService, WebsocketService): "lang_code": self.language_to_service_language(params.language), "speed": params.speed, "encoding": encoding, - "model": model, "sampling_rate": sample_rate, } self.set_model_name(model) @@ -315,7 +314,6 @@ class NeuphonicHttpTTSService(TTSService): "lang_code": self.language_to_service_language(params.language), "speed": params.speed, "encoding": encoding, - "model": model, "sampling_rate": sample_rate, } self.set_model_name(model) @@ -327,6 +325,9 @@ class NeuphonicHttpTTSService(TTSService): async def start(self, frame: StartFrame): await super().start(frame) + async def flush_audio(self): + pass + async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]: """Generate speech from text using Neuphonic streaming API. @@ -351,7 +352,7 @@ class NeuphonicHttpTTSService(TTSService): yield TTSStartedFrame() async for message in response: - if response.status_code != 200: + if message.status_code != 200: logger.error(f"{self} error: {message.errors}") yield ErrorFrame(error=f"Neuphonic API error: {message.errors}") From 08fb931ef6a295bd4b32a86b1e279f6f079374a8 Mon Sep 17 00:00:00 2001 From: Adnan Siddiquei Date: Thu, 13 Mar 2025 12:10:03 +0000 Subject: [PATCH 4/7] Swapped NEUPHONIC_API_TOKEN for NEUPHONIC_API_KEY. --- dot-env.template | 2 +- src/pipecat/services/neuphonic.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dot-env.template b/dot-env.template index 331eba79c..2da20fc0b 100644 --- a/dot-env.template +++ b/dot-env.template @@ -30,7 +30,7 @@ ELEVENLABS_API_KEY=... ELEVENLABS_VOICE_ID=... # Neuphonic -NEUPHONIC_API_TOKEN=... +NEUPHONIC_API_KEY=... # Fal FAL_KEY=... diff --git a/src/pipecat/services/neuphonic.py b/src/pipecat/services/neuphonic.py index f86280850..757d60c3e 100644 --- a/src/pipecat/services/neuphonic.py +++ b/src/pipecat/services/neuphonic.py @@ -38,7 +38,7 @@ try: except ModuleNotFoundError as e: logger.error(f"Exception: {e}") logger.error( - "In order to use Neuphonic, you need to `pip install pipecat-ai[neuphonic]`. Also, set `NEUPHONIC_API_TOKEN` environment variable." + "In order to use Neuphonic, you need to `pip install pipecat-ai[neuphonic]`. Also, set `NEUPHONIC_API_KEY` environment variable." ) raise Exception(f"Missing module: {e}") From 1bf964a667f2a756ef82a22088864adf205c00e8 Mon Sep 17 00:00:00 2001 From: Adnan Siddiquei Date: Thu, 13 Mar 2025 14:42:42 +0000 Subject: [PATCH 5/7] Added two examples on how to use Neuphonic as a TTS (07u). --- .../07u-interruptible-neuphonic-http.py | 102 ++++++++++++++++++ .../07u-interruptible-neuphonic.py | 102 ++++++++++++++++++ 2 files changed, 204 insertions(+) create mode 100644 examples/foundational/07u-interruptible-neuphonic-http.py create mode 100644 examples/foundational/07u-interruptible-neuphonic.py diff --git a/examples/foundational/07u-interruptible-neuphonic-http.py b/examples/foundational/07u-interruptible-neuphonic-http.py new file mode 100644 index 000000000..8377c3baf --- /dev/null +++ b/examples/foundational/07u-interruptible-neuphonic-http.py @@ -0,0 +1,102 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import os +import sys + +import aiohttp +from dotenv import load_dotenv +from loguru import logger +from runner import configure + +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext +from pipecat.services.neuphonic import NeuphonicHttpTTSService +from pipecat.services.openai import OpenAILLMService +from pipecat.transports.services.daily import DailyParams, DailyTransport + +load_dotenv(override=True) + +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + + +async def main(): + async with aiohttp.ClientSession() as session: + (room_url, token) = await configure(session) + + transport = DailyTransport( + room_url, + token, + "Respond bot", + DailyParams( + audio_out_enabled=True, + transcription_enabled=True, + vad_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), + ) + + tts = NeuphonicHttpTTSService( + api_key=os.getenv("NEUPHONIC_API_KEY"), + voice_id="fc854436-2dac-4d21-aa69-ae17b54e98eb", # Emily + ) + + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") + + messages = [ + { + "role": "system", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", + }, + ] + + context = OpenAILLMContext(messages) + context_aggregator = llm.create_context_aggregator(context) + + pipeline = Pipeline( + [ + transport.input(), # Transport user input + context_aggregator.user(), # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + context_aggregator.assistant(), # Assistant spoken responses + ] + ) + + task = PipelineTask( + pipeline, + params=PipelineParams( + allow_interruptions=True, + enable_metrics=True, + enable_usage_metrics=True, + report_only_initial_ttfb=True, + ), + ) + + @transport.event_handler("on_first_participant_joined") + async def on_first_participant_joined(transport, participant): + await transport.capture_participant_transcription(participant["id"]) + # Kick off the conversation. + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) + await task.queue_frames([context_aggregator.user().get_context_frame()]) + + @transport.event_handler("on_participant_left") + async def on_participant_left(transport, participant, reason): + await task.cancel() + + runner = PipelineRunner() + + await runner.run(task) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/foundational/07u-interruptible-neuphonic.py b/examples/foundational/07u-interruptible-neuphonic.py new file mode 100644 index 000000000..b5d666511 --- /dev/null +++ b/examples/foundational/07u-interruptible-neuphonic.py @@ -0,0 +1,102 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import os +import sys + +import aiohttp +from dotenv import load_dotenv +from loguru import logger +from runner import configure + +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext +from pipecat.services.neuphonic import NeuphonicTTSService +from pipecat.services.openai import OpenAILLMService +from pipecat.transports.services.daily import DailyParams, DailyTransport + +load_dotenv(override=True) + +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + + +async def main(): + async with aiohttp.ClientSession() as session: + (room_url, token) = await configure(session) + + transport = DailyTransport( + room_url, + token, + "Respond bot", + DailyParams( + audio_out_enabled=True, + transcription_enabled=True, + vad_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), + ) + + tts = NeuphonicTTSService( + api_key=os.getenv("NEUPHONIC_API_KEY"), + voice_id="fc854436-2dac-4d21-aa69-ae17b54e98eb", # Emily + ) + + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") + + messages = [ + { + "role": "system", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", + }, + ] + + context = OpenAILLMContext(messages) + context_aggregator = llm.create_context_aggregator(context) + + pipeline = Pipeline( + [ + transport.input(), # Transport user input + context_aggregator.user(), # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + context_aggregator.assistant(), # Assistant spoken responses + ] + ) + + task = PipelineTask( + pipeline, + params=PipelineParams( + allow_interruptions=True, + enable_metrics=True, + enable_usage_metrics=True, + report_only_initial_ttfb=True, + ), + ) + + @transport.event_handler("on_first_participant_joined") + async def on_first_participant_joined(transport, participant): + await transport.capture_participant_transcription(participant["id"]) + # Kick off the conversation. + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) + await task.queue_frames([context_aggregator.user().get_context_frame()]) + + @transport.event_handler("on_participant_left") + async def on_participant_left(transport, participant, reason): + await task.cancel() + + runner = PipelineRunner() + + await runner.run(task) + + +if __name__ == "__main__": + asyncio.run(main()) From 7dec8431e16a6675d869c057ef7e4181a23a2ea3 Mon Sep 17 00:00:00 2001 From: Adnan Siddiquei Date: Fri, 14 Mar 2025 10:52:13 +0000 Subject: [PATCH 6/7] Review comments by aconchillo. --- pyproject.toml | 2 +- src/pipecat/services/neuphonic.py | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e698ab134..1b324c3a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ aws = [ "boto3~=1.35.99" ] azure = [ "azure-cognitiveservices-speech~=1.42.0"] canonical = [ "aiofiles~=24.1.0" ] cartesia = [ "cartesia~=1.3.1", "websockets~=13.1" ] -neuphonic = [ "pyneuphonic~=1.5.11", "websockets~=13.1" ] +neuphonic = [ "pyneuphonic~=1.5.12", "websockets~=13.1" ] cerebras = [] deepseek = [] daily = [ "daily-python~=0.15.0" ] diff --git a/src/pipecat/services/neuphonic.py b/src/pipecat/services/neuphonic.py index 757d60c3e..c0d35fb40 100644 --- a/src/pipecat/services/neuphonic.py +++ b/src/pipecat/services/neuphonic.py @@ -27,8 +27,7 @@ from pipecat.frames.frames import ( TTSStoppedFrame, ) from pipecat.processors.frame_processor import FrameDirection -from pipecat.services.ai_services import TTSService, WordTTSService -from pipecat.services.websocket_service import WebsocketService +from pipecat.services.ai_services import InterruptibleTTSService, TTSService from pipecat.transcriptions.language import Language # See .env.example for Neuphonic configuration needed @@ -71,7 +70,7 @@ def language_to_neuphonic_lang_code(language: Language) -> Optional[str]: return result -class NeuphonicTTSService(WordTTSService, WebsocketService): +class NeuphonicTTSService(InterruptibleTTSService): class InputParams(BaseModel): language: Optional[Language] = Language.EN speed: Optional[float] = 1.0 @@ -88,7 +87,7 @@ class NeuphonicTTSService(WordTTSService, WebsocketService): params: InputParams = InputParams(), **kwargs, ): - WordTTSService.__init__( + super().__init__( self, aggregate_sentences=True, push_text_frames=False, @@ -97,7 +96,6 @@ class NeuphonicTTSService(WordTTSService, WebsocketService): sample_rate=sample_rate, **kwargs, ) - WebsocketService.__init__(self) self._api_key = api_key self._url = url @@ -232,7 +230,6 @@ class NeuphonicTTSService(WordTTSService, WebsocketService): msg = json.loads(message) if msg.get("data", {}).get("audio") is not None: await self.stop_ttfb_metrics() - self.start_word_timestamps() audio = base64.b64decode(msg["data"]["audio"]) frame = TTSAudioRawFrame(audio, self.sample_rate, 1) From 11b13d053be82eca507f79284692cb8fb1d4ecb2 Mon Sep 17 00:00:00 2001 From: Adnan Siddiquei Date: Fri, 14 Mar 2025 11:17:22 +0000 Subject: [PATCH 7/7] Fixed a bug from previous commit. Removed the concept of model from Neuphonic. --- pyproject.toml | 2 +- src/pipecat/services/neuphonic.py | 25 +------------------------ 2 files changed, 2 insertions(+), 25 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1b324c3a5..f87aa7572 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ aws = [ "boto3~=1.35.99" ] azure = [ "azure-cognitiveservices-speech~=1.42.0"] canonical = [ "aiofiles~=24.1.0" ] cartesia = [ "cartesia~=1.3.1", "websockets~=13.1" ] -neuphonic = [ "pyneuphonic~=1.5.12", "websockets~=13.1" ] +neuphonic = [ "pyneuphonic~=1.5.13", "websockets~=13.1" ] cerebras = [] deepseek = [] daily = [ "daily-python~=0.15.0" ] diff --git a/src/pipecat/services/neuphonic.py b/src/pipecat/services/neuphonic.py index c0d35fb40..a2ea98f42 100644 --- a/src/pipecat/services/neuphonic.py +++ b/src/pipecat/services/neuphonic.py @@ -41,12 +41,6 @@ except ModuleNotFoundError as e: ) raise Exception(f"Missing module: {e}") -# Models that support language codes -NEUPHONIC_MULTILINGUAL_MODELS = { - "neu_fast", - "neu_hq", -} - def language_to_neuphonic_lang_code(language: Language) -> Optional[str]: BASE_LANGUAGES = { @@ -80,7 +74,6 @@ class NeuphonicTTSService(InterruptibleTTSService): *, api_key: str, voice_id: Optional[str] = None, - model: str = "neu_hq", url: str = "wss://api.neuphonic.com", sample_rate: Optional[int] = 22050, encoding: str = "pcm_linear", @@ -88,7 +81,6 @@ class NeuphonicTTSService(InterruptibleTTSService): **kwargs, ): super().__init__( - self, aggregate_sentences=True, push_text_frames=False, push_stop_frames=True, @@ -105,7 +97,6 @@ class NeuphonicTTSService(InterruptibleTTSService): "encoding": encoding, "sampling_rate": sample_rate, } - self.set_model_name(model) self.set_voice(voice_id) # Indicates if we have sent TTSStartedFrame. It will reset to False when @@ -119,12 +110,6 @@ class NeuphonicTTSService(InterruptibleTTSService): def language_to_service_language(self, language: Language) -> Optional[str]: return language_to_neuphonic_lang_code(language) - async def set_model(self, model: str): - await super().set_model(model) - logger.info(f"Switching TTS model to: [{model}]") - await self._disconnect() - await self._connect() - async def _update_settings(self, settings: Mapping[str, Any]): if "voice_id" in settings: self.set_voice(settings["voice_id"]) @@ -155,8 +140,6 @@ class NeuphonicTTSService(InterruptibleTTSService): await super().push_frame(frame, direction) if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)): self._started = False - if isinstance(frame, TTSStoppedFrame): - await self.add_word_timestamps([("LLMFullResponseEndFrame", 0), ("Reset", 0)]) async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) @@ -195,7 +178,6 @@ class NeuphonicTTSService(InterruptibleTTSService): tts_config = { **self._settings, "voice_id": self._voice_id, - "model": self.model_name, } query_params = [f"api_key={self._api_key}"] @@ -279,7 +261,6 @@ class NeuphonicHttpTTSService(TTSService): Args: api_key: Neuphonic API key voice_id: ID of the voice to use - model: Neuphonic model to use (default: "neu_hq") url: Base URL for the Neuphonic API (default: "https://api.neuphonic.com") sample_rate: Sample rate for audio output (default: 22050Hz) encoding: Audio encoding format (default: "pcm_linear") @@ -296,7 +277,6 @@ class NeuphonicHttpTTSService(TTSService): *, api_key: str, voice_id: Optional[str] = None, - model: str = "neu_hq", url: str = "https://api.neuphonic.com", sample_rate: Optional[int] = 22050, encoding: str = "pcm_linear", @@ -313,7 +293,6 @@ class NeuphonicHttpTTSService(TTSService): "encoding": encoding, "sampling_rate": sample_rate, } - self.set_model_name(model) self.set_voice(voice_id) def can_generate_metrics(self) -> bool: @@ -341,9 +320,7 @@ class NeuphonicHttpTTSService(TTSService): try: await self.start_ttfb_metrics() - response = sse.send( - text, TTSConfig(**self._settings, model=self.model_name, voice_id=self._voice_id) - ) + response = sse.send(text, TTSConfig(**self._settings, voice_id=self._voice_id)) await self.start_tts_usage_metrics(text) yield TTSStartedFrame()