diff --git a/CHANGELOG.md b/CHANGELOG.md index 247c3280d..6d55b97c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 handler will be triggered if the idle timeout is reached (whether the pipeline task is cancelled or not). +- Added `FalSTTService`, which provides STT for Fal's Wizper API. + - Added a `reconnect_on_error` parameter to websocket-based TTS services as well as an `on_connection_error` event handler. The `reconnect_on_error` indicates whether the TTS service should reconnect on error. The `on_connection_error` @@ -216,6 +218,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Other +- Added a foundational example `07w-interruptible-fal.py`, showing `FalSTTService`. + - Added a new example `examples/foundational/36-user-email-gathering.py` to show how to gather user emails. The example uses Cartesia's `<spell></spell>` tags and Rime `spell()` function to spell out the emails for confirmation. 
diff --git a/README.md b/README.md index 9522ce175..f28005618 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ pip install "pipecat-ai[option,...]" | Category | Services | Install Command Example | | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------- | -| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) | `pip install "pipecat-ai[deepgram]"` | +| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), 
[Azure](https://docs.pipecat.ai/server/services/stt/azure), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) | `pip install "pipecat-ai[deepgram]"` | | LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Together AI](https://docs.pipecat.ai/server/services/llm/together) | `pip install "pipecat-ai[openai]"` | | Text-to-Speech | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [FastPitch 
(NVIDIA)](https://docs.pipecat.ai/server/services/tts/fastpitch), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) | `pip install "pipecat-ai[cartesia]"` | | Speech-to-Speech | [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) | `pip install "pipecat-ai[google]"` | diff --git a/examples/foundational/07u-interruptible-ultravox.py b/examples/foundational/07v-interruptible-ultravox.py similarity index 100% rename from examples/foundational/07u-interruptible-ultravox.py rename to examples/foundational/07v-interruptible-ultravox.py diff --git a/examples/foundational/07w-interruptible-fal.py b/examples/foundational/07w-interruptible-fal.py new file mode 100644 index 000000000..526602166 --- /dev/null +++ b/examples/foundational/07w-interruptible-fal.py @@ -0,0 +1,110 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import asyncio +import os +import sys + +import aiohttp +from dotenv import load_dotenv +from loguru import logger +from runner import configure + +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext +from pipecat.services.cartesia import CartesiaTTSService +from pipecat.services.fal import FalSTTService +from pipecat.services.gladia import 
GladiaSTTService +from pipecat.services.openai import OpenAILLMService +from pipecat.transports.services.daily import DailyParams, DailyTransport + +load_dotenv(override=True) + +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + + +async def main(): + async with aiohttp.ClientSession() as session: + (room_url, token) = await configure(session) + + transport = DailyTransport( + room_url, + token, + "Respond bot", + DailyParams( + audio_out_enabled=True, + vad_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + vad_audio_passthrough=True, + ), + ) + + stt = FalSTTService( + api_key=os.getenv("FAL_KEY"), + ) + + tts = CartesiaTTSService( + api_key=os.getenv("CARTESIA_API_KEY"), + voice_id="71a7ad14-091c-4e8e-a314-022ece01c121", # British Reading Lady + ) + + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o") + + messages = [ + { + "role": "system", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", + }, + ] + + context = OpenAILLMContext(messages) + context_aggregator = llm.create_context_aggregator(context) + + pipeline = Pipeline( + [ + transport.input(), # Transport user input + stt, # STT + context_aggregator.user(), # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + context_aggregator.assistant(), # Assistant spoken responses + ] + ) + + task = PipelineTask( + pipeline, + params=PipelineParams( + allow_interruptions=True, + enable_metrics=True, + enable_usage_metrics=True, + report_only_initial_ttfb=True, + ), + ) + + @transport.event_handler("on_first_participant_joined") + async def on_first_participant_joined(transport, participant): + await transport.capture_participant_transcription(participant["id"]) + # Kick off the conversation. 
def language_to_fal_language(language: Language) -> Optional[str]:
    """Map a pipecat ``Language`` to the language code used by Fal's Wizper API.

    Base languages are looked up directly. Regional variants (for example
    ``en-US``) fall back to their base code, i.e. the part before the first
    ``-``. Codes that Wizper spells differently from the ``Language`` enum are
    translated through ``CODE_ALIASES``.

    Args:
        language: The language to convert.

    Returns:
        The Wizper language code, or ``None`` when the language has no
        supported equivalent.
    """
    BASE_LANGUAGES = {
        Language.AF: "af",
        Language.AM: "am",
        Language.AR: "ar",
        Language.AS: "as",
        Language.AZ: "az",
        Language.BA: "ba",
        Language.BE: "be",
        Language.BG: "bg",
        Language.BN: "bn",
        Language.BO: "bo",
        Language.BR: "br",
        Language.BS: "bs",
        Language.CA: "ca",
        Language.CS: "cs",
        Language.CY: "cy",
        Language.DA: "da",
        Language.DE: "de",
        Language.EL: "el",
        Language.EN: "en",
        Language.ES: "es",
        Language.ET: "et",
        Language.EU: "eu",
        Language.FA: "fa",
        Language.FI: "fi",
        Language.FO: "fo",
        Language.FR: "fr",
        Language.GL: "gl",
        Language.GU: "gu",
        Language.HA: "ha",
        Language.HE: "he",
        Language.HI: "hi",
        Language.HR: "hr",
        Language.HT: "ht",
        Language.HU: "hu",
        Language.HY: "hy",
        Language.ID: "id",
        Language.IS: "is",
        Language.IT: "it",
        Language.JA: "ja",
        Language.JW: "jw",
        Language.KA: "ka",
        Language.KK: "kk",
        Language.KM: "km",
        Language.KN: "kn",
        Language.KO: "ko",
        Language.LA: "la",
        Language.LB: "lb",
        Language.LN: "ln",
        Language.LO: "lo",
        Language.LT: "lt",
        Language.LV: "lv",
        Language.MG: "mg",
        Language.MI: "mi",
        Language.MK: "mk",
        Language.ML: "ml",
        Language.MN: "mn",
        Language.MR: "mr",
        Language.MS: "ms",
        Language.MT: "mt",
        Language.MY: "my",
        Language.NE: "ne",
        Language.NL: "nl",
        Language.NN: "nn",
        Language.NO: "no",
        Language.OC: "oc",
        Language.PA: "pa",
        Language.PL: "pl",
        Language.PS: "ps",
        Language.PT: "pt",
        Language.RO: "ro",
        Language.RU: "ru",
        Language.SA: "sa",
        Language.SD: "sd",
        Language.SI: "si",
        Language.SK: "sk",
        Language.SL: "sl",
        Language.SN: "sn",
        Language.SO: "so",
        Language.SQ: "sq",
        Language.SR: "sr",
        Language.SU: "su",
        Language.SV: "sv",
        Language.SW: "sw",
        Language.TA: "ta",
        Language.TE: "te",
        Language.TG: "tg",
        Language.TH: "th",
        Language.TK: "tk",
        Language.TL: "tl",
        Language.TR: "tr",
        Language.TT: "tt",
        Language.UK: "uk",
        Language.UR: "ur",
        Language.UZ: "uz",
        Language.VI: "vi",
        Language.YI: "yi",
        Language.YO: "yo",
        Language.ZH: "zh",
    }

    # Base codes that Wizper spells differently from the Language enum.
    # Javanese is "jv" (and "jv-ID") in the enum, but Fal expects "jw";
    # without this alias those members would resolve to None.
    CODE_ALIASES = {"jv": "jw"}

    code = BASE_LANGUAGES.get(language)
    if code is not None:
        return code

    # Not a base language: derive the base code from a regional variant.
    base_code = str(language.value).split("-")[0].lower()
    base_code = CODE_ALIASES.get(base_code, base_code)

    supported_codes = set(BASE_LANGUAGES.values())
    return base_code if base_code in supported_codes else None
class FalSTTService(SegmentedSTTService):
    """Speech-to-text service using Fal's Wizper API.

    This service uses Fal's Wizper API to perform speech-to-text transcription on audio
    segments. It inherits from SegmentedSTTService to handle audio buffering and speech detection.

    Args:
        api_key: Fal API key. If not provided, will check FAL_KEY environment variable.
        sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
        params: Configuration parameters for the Wizper API. When omitted, a fresh
            ``InputParams()`` with default values is created for this instance.
        **kwargs: Additional arguments passed to SegmentedSTTService.

    Raises:
        ValueError: If no API key is given and FAL_KEY is not set in the environment.
    """

    class InputParams(BaseModel):
        """Configuration parameters for Fal's Wizper API.

        Attributes:
            language: Language of the audio input. Defaults to English.
            task: Task to perform ('transcribe' or 'translate'). Defaults to 'transcribe'.
            chunk_level: Level of chunking ('segment'). Defaults to 'segment'.
            version: Version of Wizper model to use. Defaults to '3'.
        """

        language: Optional[Language] = Language.EN
        task: str = "transcribe"
        chunk_level: str = "segment"
        version: str = "3"

    def __init__(
        self,
        *,
        api_key: Optional[str] = None,
        sample_rate: Optional[int] = None,
        params: Optional[InputParams] = None,
        **kwargs,
    ):
        super().__init__(
            sample_rate=sample_rate,
            **kwargs,
        )

        # Create the default per instance instead of sharing one object that was
        # built once when the function was defined.
        if params is None:
            params = FalSTTService.InputParams()

        if api_key:
            # Kept for backward compatibility: the key is also exported to the
            # process environment, in addition to being passed to the client.
            os.environ["FAL_KEY"] = api_key
        elif "FAL_KEY" not in os.environ:
            raise ValueError(
                "FAL_KEY must be provided either through api_key parameter or environment variable"
            )

        self._fal_client = fal_client.AsyncClient(key=api_key or os.getenv("FAL_KEY"))

        # Never store None as the language: it would be sent to the API and
        # would later make Language(...) raise in run_stt for every segment.
        language_code = "en"
        if params.language:
            mapped = self.language_to_service_language(params.language)
            if mapped:
                language_code = mapped
            else:
                logger.warning(
                    f"Language [{params.language}] is not supported by Fal Wizper, using [en]"
                )

        self._settings = {
            "task": params.task,
            "language": language_code,
            "chunk_level": params.chunk_level,
            "version": params.version,
        }

    def can_generate_metrics(self) -> bool:
        """Report that this service can generate metrics."""
        # NOTE(review): run_stt does not record any metrics itself; confirm
        # whether the base class does so on its behalf.
        return True

    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert a ``Language`` to a Wizper language code, or None if unsupported."""
        return language_to_fal_language(language)

    async def set_language(self, language: Language):
        """Switch the transcription language.

        Unsupported languages are ignored (with a warning) so the service keeps
        working with the previously configured language.
        """
        service_language = self.language_to_service_language(language)
        if not service_language:
            logger.warning(
                f"Language [{language}] is not supported by Fal Wizper, "
                f"keeping [{self._settings['language']}]"
            )
            return

        logger.info(f"Switching STT language to: [{language}]")
        self._settings["language"] = service_language

    async def set_model(self, model: str):
        """Forward the model change to the base class and log it."""
        await super().set_model(model)
        logger.info(f"Switching STT model to: [{model}]")

    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
        """Transcribes an audio segment using Fal's Wizper API.

        Args:
            audio: Raw audio bytes in WAV format (already converted by base class).

        Yields:
            Frame: TranscriptionFrame containing the transcribed text, or an
            ErrorFrame if the request to Fal fails.

        Note:
            The audio is already in WAV format from the SegmentedSTTService.
            Only non-empty transcriptions are yielded.
        """
        # Only the request itself is guarded, so exceptions raised by the
        # consumer at the yield below are not misreported as Fal errors.
        try:
            # Send to Fal directly (audio is already in WAV format from base class)
            data_uri = fal_client.encode(audio, "audio/x-wav")
            response = await self._fal_client.run(
                "fal-ai/wizper",
                arguments={"audio_url": data_uri, **self._settings},
            )
        except Exception as e:
            logger.error(f"Fal Wizper error: {e}")
            yield ErrorFrame(f"Fal Wizper error: {str(e)}")
            return

        if not response or "text" not in response:
            return

        # A null "text" is treated like an empty transcription.
        text = (response["text"] or "").strip()
        if not text:  # Only yield non-empty text
            return

        logger.debug(f"Transcription: [{text}]")
        yield TranscriptionFrame(
            text, "", time_now_iso8601(), Language(self._settings["language"])
        )
"mk-MK" @@ -289,9 +320,10 @@ class Language(StrEnum): MY_MM = "my-MM" # Norwegian - NB = "nb" + NB = "nb" # Norwegian Bokmål NB_NO = "nb-NO" NO = "no" + NN = "nn" # Norwegian Nynorsk # Nepali NE = "ne" @@ -302,6 +334,9 @@ class Language(StrEnum): NL_BE = "nl-BE" NL_NL = "nl-NL" + # Occitan + OC = "oc" + # Odia OR = "or" OR_IN = "or-IN" @@ -331,6 +366,12 @@ class Language(StrEnum): RU = "ru" RU_RU = "ru-RU" + # Sanskrit + SA = "sa" + + # Sindhi + SD = "sd" + # Sinhala SI = "si" SI_LK = "si-LK" @@ -343,6 +384,9 @@ class Language(StrEnum): SL = "sl" SL_SI = "sl-SI" + # Shona + SN = "sn" + # Somali SO = "so" SO_SO = "so-SO" @@ -384,14 +428,23 @@ class Language(StrEnum): TE = "te" TE_IN = "te-IN" + # Tajik + TG = "tg" + # Thai TH = "th" TH_TH = "th-TH" + # Turkmen + TK = "tk" + # Turkish TR = "tr" TR_TR = "tr-TR" + # Tatar + TT = "tt" + # Ukrainian UK = "uk" UK_UA = "uk-UA" @@ -413,6 +466,12 @@ class Language(StrEnum): WUU = "wuu" WUU_CN = "wuu-CN" + # Yiddish + YI = "yi" + + # Yoruba + YO = "yo" + # Yue Chinese YUE = "yue" YUE_CN = "yue-CN"