Merge pull request #2716 from pipecat-ai/mb/add-11labs-stt
Add ElevenLabsSTTService
This commit is contained in:
@@ -9,8 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Added
|
||||
|
||||
- Added a peer connection monitor to the `SmallWebRTCConnection` that
|
||||
automatically disconnects if the connection fails to establish within
|
||||
- Added `ElevenLabsSTTService` for speech-to-text transcription.
|
||||
|
||||
- Added a peer connection monitor to the `SmallWebRTCConnection` that
|
||||
automatically disconnects if the connection fails to establish within
|
||||
the timeout (1 minute by default).
|
||||
|
||||
- Added memory cleanup improvements to reduce memory peaks.
|
||||
|
||||
@@ -79,7 +79,7 @@ You can connect to Pipecat from any platform using our official SDKs:
|
||||
|
||||
| Category | Services |
|
||||
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) |
|
||||
| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) |
|
||||
| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
|
||||
| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) |
|
||||
|
||||
@@ -24,6 +24,7 @@ from pipecat.processors.aggregators.llm_response_universal import LLMContextAggr
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.elevenlabs.stt import ElevenLabsSTTService
|
||||
from pipecat.services.elevenlabs.tts import ElevenLabsHttpTTSService
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
@@ -63,7 +64,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
|
||||
# Create an HTTP session
|
||||
async with aiohttp.ClientSession() as session:
|
||||
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
|
||||
stt = ElevenLabsSTTService(
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
aiohttp_session=session,
|
||||
)
|
||||
|
||||
tts = ElevenLabsHttpTTSService(
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY", ""),
|
||||
|
||||
89
examples/foundational/13k-elevenlabs-transcription.py
Normal file
89
examples/foundational/13k-elevenlabs-transcription.py
Normal file
@@ -0,0 +1,89 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import os
|
||||
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import Frame, TranscriptionFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineTask
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.runner.types import RunnerArguments
|
||||
from pipecat.runner.utils import create_transport
|
||||
from pipecat.services.elevenlabs.stt import ElevenLabsSTTService
|
||||
from pipecat.transports.base_transport import BaseTransport, TransportParams
|
||||
from pipecat.transports.daily.transport import DailyParams
|
||||
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
|
||||
class TranscriptionLogger(FrameProcessor):
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
print(f"Transcription: {frame.text}")
|
||||
|
||||
# Push all frames through
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
|
||||
# instantiated. The function will be called when the desired transport gets
|
||||
# selected.
|
||||
transport_params = {
|
||||
"daily": lambda: DailyParams(audio_in_enabled=True, vad_analyzer=SileroVADAnalyzer()),
|
||||
"twilio": lambda: FastAPIWebsocketParams(
|
||||
audio_in_enabled=True, vad_analyzer=SileroVADAnalyzer()
|
||||
),
|
||||
"webrtc": lambda: TransportParams(audio_in_enabled=True, vad_analyzer=SileroVADAnalyzer()),
|
||||
}
|
||||
|
||||
|
||||
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
|
||||
logger.info(f"Starting bot")
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
stt = ElevenLabsSTTService(
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
aiohttp_session=session,
|
||||
)
|
||||
|
||||
tl = TranscriptionLogger()
|
||||
|
||||
pipeline = Pipeline([transport.input(), stt, tl])
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(transport, client):
|
||||
logger.info(f"Client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
async def bot(runner_args: RunnerArguments):
|
||||
"""Main bot entry point compatible with Pipecat Cloud."""
|
||||
transport = await create_transport(runner_args, transport_params)
|
||||
await run_bot(transport, runner_args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from pipecat.runner.run import main
|
||||
|
||||
main()
|
||||
339
src/pipecat/services/elevenlabs/stt.py
Normal file
339
src/pipecat/services/elevenlabs/stt.py
Normal file
@@ -0,0 +1,339 @@
|
||||
#
|
||||
# Copyright (c) 2024–2025, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
"""ElevenLabs speech-to-text service implementation.
|
||||
|
||||
This module provides integration with ElevenLabs' Speech-to-Text API for transcription
|
||||
using segmented audio processing. The service uploads audio files and receives
|
||||
transcription results directly.
|
||||
"""
|
||||
|
||||
import io
|
||||
from typing import AsyncGenerator, Optional
|
||||
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel
|
||||
|
||||
from pipecat.frames.frames import ErrorFrame, Frame, TranscriptionFrame
|
||||
from pipecat.services.stt_service import SegmentedSTTService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
from pipecat.utils.tracing.service_decorators import traced_stt
|
||||
|
||||
|
||||
def language_to_elevenlabs_language(language: Language) -> Optional[str]:
|
||||
"""Convert a Language enum to ElevenLabs language code.
|
||||
|
||||
Source:
|
||||
https://elevenlabs.io/docs/capabilities/speech-to-text
|
||||
|
||||
Args:
|
||||
language: The Language enum value to convert.
|
||||
|
||||
Returns:
|
||||
The corresponding ElevenLabs language code, or None if not supported.
|
||||
"""
|
||||
BASE_LANGUAGES = {
|
||||
Language.AF: "afr", # Afrikaans
|
||||
Language.AM: "amh", # Amharic
|
||||
Language.AR: "ara", # Arabic
|
||||
Language.HY: "hye", # Armenian
|
||||
Language.AS: "asm", # Assamese
|
||||
Language.AST: "ast", # Asturian
|
||||
Language.AZ: "aze", # Azerbaijani
|
||||
Language.BE: "bel", # Belarusian
|
||||
Language.BN: "ben", # Bengali
|
||||
Language.BS: "bos", # Bosnian
|
||||
Language.BG: "bul", # Bulgarian
|
||||
Language.MY: "mya", # Burmese
|
||||
Language.YUE: "yue", # Cantonese
|
||||
Language.CA: "cat", # Catalan
|
||||
Language.CEB: "ceb", # Cebuano
|
||||
Language.NY: "nya", # Chichewa
|
||||
Language.HR: "hrv", # Croatian
|
||||
Language.CS: "ces", # Czech
|
||||
Language.DA: "dan", # Danish
|
||||
Language.NL: "nld", # Dutch
|
||||
Language.EN: "eng", # English
|
||||
Language.ET: "est", # Estonian
|
||||
Language.FIL: "fil", # Filipino
|
||||
Language.FI: "fin", # Finnish
|
||||
Language.FR: "fra", # French
|
||||
Language.FF: "ful", # Fulah
|
||||
Language.GL: "glg", # Galician
|
||||
Language.LG: "lug", # Ganda
|
||||
Language.KA: "kat", # Georgian
|
||||
Language.DE: "deu", # German
|
||||
Language.EL: "ell", # Greek
|
||||
Language.GU: "guj", # Gujarati
|
||||
Language.HA: "hau", # Hausa
|
||||
Language.HE: "heb", # Hebrew
|
||||
Language.HI: "hin", # Hindi
|
||||
Language.HU: "hun", # Hungarian
|
||||
Language.IS: "isl", # Icelandic
|
||||
Language.IG: "ibo", # Igbo
|
||||
Language.ID: "ind", # Indonesian
|
||||
Language.GA: "gle", # Irish
|
||||
Language.IT: "ita", # Italian
|
||||
Language.JA: "jpn", # Japanese
|
||||
Language.JV: "jav", # Javanese
|
||||
Language.KEA: "kea", # Kabuverdianu
|
||||
Language.KN: "kan", # Kannada
|
||||
Language.KK: "kaz", # Kazakh
|
||||
Language.KM: "khm", # Khmer
|
||||
Language.KO: "kor", # Korean
|
||||
Language.KU: "kur", # Kurdish
|
||||
Language.KY: "kir", # Kyrgyz
|
||||
Language.LO: "lao", # Lao
|
||||
Language.LV: "lav", # Latvian
|
||||
Language.LN: "lin", # Lingala
|
||||
Language.LT: "lit", # Lithuanian
|
||||
Language.LUO: "luo", # Luo
|
||||
Language.LB: "ltz", # Luxembourgish
|
||||
Language.MK: "mkd", # Macedonian
|
||||
Language.MS: "msa", # Malay
|
||||
Language.ML: "mal", # Malayalam
|
||||
Language.MT: "mlt", # Maltese
|
||||
Language.ZH: "zho", # Mandarin Chinese
|
||||
Language.MI: "mri", # Māori
|
||||
Language.MR: "mar", # Marathi
|
||||
Language.MN: "mon", # Mongolian
|
||||
Language.NE: "nep", # Nepali
|
||||
Language.NSO: "nso", # Northern Sotho
|
||||
Language.NO: "nor", # Norwegian
|
||||
Language.OC: "oci", # Occitan
|
||||
Language.OR: "ori", # Odia
|
||||
Language.PS: "pus", # Pashto
|
||||
Language.FA: "fas", # Persian
|
||||
Language.PL: "pol", # Polish
|
||||
Language.PT: "por", # Portuguese
|
||||
Language.PA: "pan", # Punjabi
|
||||
Language.RO: "ron", # Romanian
|
||||
Language.RU: "rus", # Russian
|
||||
Language.SR: "srp", # Serbian
|
||||
Language.SN: "sna", # Shona
|
||||
Language.SD: "snd", # Sindhi
|
||||
Language.SK: "slk", # Slovak
|
||||
Language.SL: "slv", # Slovenian
|
||||
Language.SO: "som", # Somali
|
||||
Language.ES: "spa", # Spanish
|
||||
Language.SW: "swa", # Swahili
|
||||
Language.SV: "swe", # Swedish
|
||||
Language.TA: "tam", # Tamil
|
||||
Language.TG: "tgk", # Tajik
|
||||
Language.TE: "tel", # Telugu
|
||||
Language.TH: "tha", # Thai
|
||||
Language.TR: "tur", # Turkish
|
||||
Language.UK: "ukr", # Ukrainian
|
||||
Language.UMB: "umb", # Umbundu
|
||||
Language.UR: "urd", # Urdu
|
||||
Language.UZ: "uzb", # Uzbek
|
||||
Language.VI: "vie", # Vietnamese
|
||||
Language.CY: "cym", # Welsh
|
||||
Language.WO: "wol", # Wolof
|
||||
Language.XH: "xho", # Xhosa
|
||||
Language.ZU: "zul", # Zulu
|
||||
}
|
||||
|
||||
result = BASE_LANGUAGES.get(language)
|
||||
|
||||
# If not found in base languages, try to find the base language from a variant
|
||||
if not result:
|
||||
lang_str = str(language.value)
|
||||
base_code = lang_str.split("-")[0].lower()
|
||||
result = base_code if base_code in BASE_LANGUAGES.values() else None
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class ElevenLabsSTTService(SegmentedSTTService):
|
||||
"""Speech-to-text service using ElevenLabs' file-based API.
|
||||
|
||||
This service uses ElevenLabs' Speech-to-Text API to perform transcription on audio
|
||||
segments. It inherits from SegmentedSTTService to handle audio buffering and speech detection.
|
||||
The service uploads audio files to ElevenLabs and receives transcription results directly.
|
||||
"""
|
||||
|
||||
class InputParams(BaseModel):
|
||||
"""Configuration parameters for ElevenLabs STT API.
|
||||
|
||||
Parameters:
|
||||
language: Target language for transcription.
|
||||
tag_audio_events: Whether to include audio events like (laughter), (coughing), in the transcription.
|
||||
"""
|
||||
|
||||
language: Optional[Language] = None
|
||||
tag_audio_events: bool = True
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
base_url: str = "https://api.elevenlabs.io",
|
||||
model: str = "scribe_v1",
|
||||
sample_rate: Optional[int] = None,
|
||||
params: Optional[InputParams] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the ElevenLabs STT service.
|
||||
|
||||
Args:
|
||||
api_key: ElevenLabs API key for authentication.
|
||||
aiohttp_session: aiohttp ClientSession for HTTP requests.
|
||||
base_url: Base URL for ElevenLabs API.
|
||||
model: Model ID for transcription. Defaults to "scribe_v1".
|
||||
sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
|
||||
params: Configuration parameters for the STT service.
|
||||
**kwargs: Additional arguments passed to SegmentedSTTService.
|
||||
"""
|
||||
super().__init__(
|
||||
sample_rate=sample_rate,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
params = params or ElevenLabsSTTService.InputParams()
|
||||
|
||||
self._api_key = api_key
|
||||
self._base_url = base_url
|
||||
self._session = aiohttp_session
|
||||
self._model_id = model
|
||||
self._tag_audio_events = params.tag_audio_events
|
||||
|
||||
self._settings = {
|
||||
"language": self.language_to_service_language(params.language)
|
||||
if params.language
|
||||
else "eng",
|
||||
}
|
||||
|
||||
def can_generate_metrics(self) -> bool:
|
||||
"""Check if the service can generate processing metrics.
|
||||
|
||||
Returns:
|
||||
True, as ElevenLabs STT service supports metrics generation.
|
||||
"""
|
||||
return True
|
||||
|
||||
def language_to_service_language(self, language: Language) -> Optional[str]:
|
||||
"""Convert a Language enum to ElevenLabs service-specific language code.
|
||||
|
||||
Args:
|
||||
language: The language to convert.
|
||||
|
||||
Returns:
|
||||
The ElevenLabs-specific language code, or None if not supported.
|
||||
"""
|
||||
return language_to_elevenlabs_language(language)
|
||||
|
||||
async def set_language(self, language: Language):
|
||||
"""Set the transcription language.
|
||||
|
||||
Args:
|
||||
language: The language to use for speech-to-text transcription.
|
||||
"""
|
||||
logger.info(f"Switching STT language to: [{language}]")
|
||||
self._settings["language"] = self.language_to_service_language(language)
|
||||
|
||||
async def set_model(self, model: str):
|
||||
"""Set the STT model.
|
||||
|
||||
Args:
|
||||
model: The model name to use for transcription.
|
||||
|
||||
Note:
|
||||
ElevenLabs STT API does not currently support model selection.
|
||||
This method is provided for interface compatibility.
|
||||
"""
|
||||
await super().set_model(model)
|
||||
logger.info(f"Model setting [{model}] noted, but ElevenLabs STT uses default model")
|
||||
|
||||
async def _transcribe_audio(self, audio_data: bytes) -> dict:
|
||||
"""Upload audio data to ElevenLabs and get transcription result.
|
||||
|
||||
Args:
|
||||
audio_data: Raw audio bytes in WAV format.
|
||||
|
||||
Returns:
|
||||
The transcription result data.
|
||||
|
||||
Raises:
|
||||
Exception: If transcription fails or returns an error.
|
||||
"""
|
||||
url = f"{self._base_url}/v1/speech-to-text"
|
||||
headers = {"xi-api-key": self._api_key}
|
||||
|
||||
# Create form data with the audio file
|
||||
data = aiohttp.FormData()
|
||||
data.add_field(
|
||||
"file",
|
||||
io.BytesIO(audio_data),
|
||||
filename="audio.wav",
|
||||
content_type="audio/x-wav",
|
||||
)
|
||||
|
||||
# Add required model_id, language_code, and tag_audio_events
|
||||
data.add_field("model_id", self._model_id)
|
||||
data.add_field("language_code", self._settings["language"])
|
||||
data.add_field("tag_audio_events", str(self._tag_audio_events).lower())
|
||||
|
||||
async with self._session.post(url, data=data, headers=headers) as response:
|
||||
if response.status != 200:
|
||||
error_text = await response.text()
|
||||
logger.error(f"ElevenLabs transcription error: {error_text}")
|
||||
raise Exception(f"Transcription failed with status {response.status}: {error_text}")
|
||||
|
||||
result = await response.json()
|
||||
return result
|
||||
|
||||
@traced_stt
|
||||
async def _handle_transcription(
|
||||
self, transcript: str, is_final: bool, language: Optional[str] = None
|
||||
):
|
||||
"""Handle a transcription result with tracing."""
|
||||
await self.stop_ttfb_metrics()
|
||||
await self.stop_processing_metrics()
|
||||
|
||||
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
|
||||
"""Transcribe an audio segment using ElevenLabs' STT API.
|
||||
|
||||
Args:
|
||||
audio: Raw audio bytes in WAV format (already converted by base class).
|
||||
|
||||
Yields:
|
||||
Frame: TranscriptionFrame containing the transcribed text, or ErrorFrame on failure.
|
||||
|
||||
Note:
|
||||
The audio is already in WAV format from the SegmentedSTTService.
|
||||
Only non-empty transcriptions are yielded.
|
||||
"""
|
||||
try:
|
||||
await self.start_processing_metrics()
|
||||
await self.start_ttfb_metrics()
|
||||
|
||||
# Upload audio and get transcription result directly
|
||||
result = await self._transcribe_audio(audio)
|
||||
|
||||
# Extract transcription text
|
||||
text = result.get("text", "").strip()
|
||||
if text:
|
||||
# Use the language_code returned by the API
|
||||
detected_language = result.get("language_code", "eng")
|
||||
|
||||
await self._handle_transcription(text, True, detected_language)
|
||||
logger.debug(f"Transcription: [{text}]")
|
||||
|
||||
yield TranscriptionFrame(
|
||||
text,
|
||||
self._user_id,
|
||||
time_now_iso8601(),
|
||||
detected_language,
|
||||
result=result,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"ElevenLabs STT error: {e}")
|
||||
yield ErrorFrame(f"ElevenLabs STT error: {str(e)}")
|
||||
@@ -68,6 +68,9 @@ class Language(StrEnum):
|
||||
AS = "as"
|
||||
AS_IN = "as-IN"
|
||||
|
||||
# Asturian
|
||||
AST = "ast"
|
||||
|
||||
# Azerbaijani
|
||||
AZ = "az"
|
||||
AZ_AZ = "az-AZ"
|
||||
@@ -101,6 +104,9 @@ class Language(StrEnum):
|
||||
CA = "ca"
|
||||
CA_ES = "ca-ES"
|
||||
|
||||
# Cebuano
|
||||
CEB = "ceb"
|
||||
|
||||
# Mandarin Chinese
|
||||
CMN = "cmn"
|
||||
CMN_CN = "cmn-CN"
|
||||
@@ -185,6 +191,9 @@ class Language(StrEnum):
|
||||
FA = "fa"
|
||||
FA_IR = "fa-IR"
|
||||
|
||||
# Fulah
|
||||
FF = "ff"
|
||||
|
||||
# Finnish
|
||||
FI = "fi"
|
||||
FI_FI = "fi-FI"
|
||||
@@ -251,6 +260,9 @@ class Language(StrEnum):
|
||||
ID = "id"
|
||||
ID_ID = "id-ID"
|
||||
|
||||
# Igbo
|
||||
IG = "ig"
|
||||
|
||||
# Icelandic
|
||||
IS = "is"
|
||||
IS_IS = "is-IS"
|
||||
@@ -279,6 +291,9 @@ class Language(StrEnum):
|
||||
KA = "ka"
|
||||
KA_GE = "ka-GE"
|
||||
|
||||
# Kabuverdianu
|
||||
KEA = "kea"
|
||||
|
||||
# Kazakh
|
||||
KK = "kk"
|
||||
KK_KZ = "kk-KZ"
|
||||
@@ -295,6 +310,13 @@ class Language(StrEnum):
|
||||
KO = "ko"
|
||||
KO_KR = "ko-KR"
|
||||
|
||||
# Kurdish
|
||||
KU = "ku"
|
||||
|
||||
# Kyrgyz
|
||||
KY = "ky"
|
||||
KY_KG = "ky-KG"
|
||||
|
||||
# Latin
|
||||
LA = "la"
|
||||
|
||||
@@ -312,6 +334,12 @@ class Language(StrEnum):
|
||||
LT = "lt"
|
||||
LT_LT = "lt-LT"
|
||||
|
||||
# Ganda
|
||||
LG = "lg"
|
||||
|
||||
# Luo
|
||||
LUO = "luo"
|
||||
|
||||
# Latvian
|
||||
LV = "lv"
|
||||
LV_LV = "lv-LV"
|
||||
@@ -366,6 +394,12 @@ class Language(StrEnum):
|
||||
NL_BE = "nl-BE"
|
||||
NL_NL = "nl-NL"
|
||||
|
||||
# Northern Sotho
|
||||
NSO = "nso"
|
||||
|
||||
# Chichewa
|
||||
NY = "ny"
|
||||
|
||||
# Occitan
|
||||
OC = "oc"
|
||||
|
||||
@@ -484,6 +518,9 @@ class Language(StrEnum):
|
||||
UK = "uk"
|
||||
UK_UA = "uk-UA"
|
||||
|
||||
# Umbundu
|
||||
UMB = "umb"
|
||||
|
||||
# Urdu
|
||||
UR = "ur"
|
||||
UR_IN = "ur-IN"
|
||||
@@ -497,6 +534,9 @@ class Language(StrEnum):
|
||||
VI = "vi"
|
||||
VI_VN = "vi-VN"
|
||||
|
||||
# Wolof
|
||||
WO = "wo"
|
||||
|
||||
# Wu Chinese
|
||||
WUU = "wuu"
|
||||
WUU_CN = "wuu-CN"
|
||||
@@ -507,7 +547,7 @@ class Language(StrEnum):
|
||||
# Yoruba
|
||||
YO = "yo"
|
||||
|
||||
# Yue Chinese
|
||||
# Yue Chinese (Cantonese)
|
||||
YUE = "yue"
|
||||
YUE_CN = "yue-CN"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user