From 7596d71460d7c0a4f76159ceba9b7cfd8bccb65e Mon Sep 17 00:00:00 2001 From: Sam Sykes Date: Thu, 3 Jul 2025 21:25:13 +0100 Subject: [PATCH] Speechmatics STT + multi-speaker conversations (#2036) * initial config * skeleton * Added a README (to be added to). * Payloads coming from the ASR. * doc update * handle the partials and finals * enable diarization in the example * support sending messages to pipecat pipeline * requirements fix in README * updated example (with amusement) * updated example to match master * updated docs * support for diarization tags * logic fix for wrapper * Use an internal SpeechFrame for speaker_id (not user_id). * only include speaker tags on finalised transcript (as this may skew end of utterance detection) * updated docs * correction to docs and updated example * updated requirement * Fix for using default EU server. * Updates from PR comments. * Refactor based on comments in the original PR. Primary focus on documentation, naming conventions and how `user_id` is used. * Check for SMX installed when importing. * Variable name change * Comment correction. * Support for Esporanto and Uyghur * Impoved language support * function name change * Locale fix * intercept * interim changes * pass the pipeline task to the module for adding events to the top of the pipeline * logging for the pipeline * Reduce timeout for content aggregator. * staged update * testing with Azure * Updated context (Azure was dropping punctuation) and using better ElevenLabs model. * Updated to RT 0.3.0 and use OpenAI (not Azure). * Missing OpenAI import; parameter name change for output locale validation. * Revert to `0.2.0` of RT SDK. * fix for assignment of `output_locale_code`. * update Speechmatics library to 0.3.1 * new transcription example * updated asyncio task handling * Updated doc strings * enable OpenTelemetry logging * removed import from stt for __init__ * updated examples and default values * updated examples * prevent lock up when closing the STT connection --- README.md | 26 +- docs/api/requirements.txt | 1 + dot-env.template | 4 + .../07a-interruptible-speechmatics.py | 153 ++++ .../13h-speechmatics-transcription.py | 89 ++ pyproject.toml | 1 + src/pipecat/services/speechmatics/__init__.py | 5 + src/pipecat/services/speechmatics/stt.py | 813 ++++++++++++++++++ src/pipecat/transcriptions/language.py | 6 + 9 files changed, 1085 insertions(+), 13 deletions(-) create mode 100644 examples/foundational/07a-interruptible-speechmatics.py create mode 100644 examples/foundational/13h-speechmatics-transcription.py create mode 100644 src/pipecat/services/speechmatics/__init__.py create mode 100644 src/pipecat/services/speechmatics/stt.py diff --git a/README.md b/README.md index 2d14f37c9..45f6611a7 100644 --- a/README.md +++ b/README.md @@ -51,19 +51,19 @@ You can connect to Pipecat from any platform using our official SDKs: ## 🧩 Available services -| Category | Services | -| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova) [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) | -| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) | -| Text-to-Speech | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [FastPitch (NVIDIA)](https://docs.pipecat.ai/server/services/tts/fastpitch), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) | -| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) | -| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local | -| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) | -| Video | [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) | -| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) | -| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) | -| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [Noisereduce](https://docs.pipecat.ai/server/utilities/audio/noisereduce-filter) | -| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) | +| Category | Services | +| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) | +| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) | +| Text-to-Speech | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [FastPitch (NVIDIA)](https://docs.pipecat.ai/server/services/tts/fastpitch), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) | +| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) | +| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local | +| Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) | +| Video | [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) | +| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) | +| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/fal), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) | +| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp](https://docs.pipecat.ai/server/utilities/audio/krisp-filter), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [Noisereduce](https://docs.pipecat.ai/server/utilities/audio/noisereduce-filter) | +| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) | 📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services) diff --git a/docs/api/requirements.txt b/docs/api/requirements.txt index d783b33e8..c9e8e2ce9 100644 --- a/docs/api/requirements.txt +++ b/docs/api/requirements.txt @@ -46,6 +46,7 @@ pipecat-ai[sambanova] pipecat-ai[silero] pipecat-ai[simli] pipecat-ai[soundfile] +pipecat-ai[speechmatics] pipecat-ai[tavus] pipecat-ai[together] # pipecat-ai[ultravox] # Mocked diff --git a/dot-env.template b/dot-env.template index f4fd43eea..ab085757f 100644 --- a/dot-env.template +++ b/dot-env.template @@ -109,6 +109,10 @@ MINIMAX_GROUP_ID=... # Sarvam AI SARVAM_API_KEY=... +# Speechmatics +SPEECHMATICS_API_KEY=... + + # SambaNova SAMBANOVA_API_KEY=... diff --git a/examples/foundational/07a-interruptible-speechmatics.py b/examples/foundational/07a-interruptible-speechmatics.py new file mode 100644 index 000000000..1582e79ba --- /dev/null +++ b/examples/foundational/07a-interruptible-speechmatics.py @@ -0,0 +1,153 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import argparse +import os + +from dotenv import load_dotenv +from loguru import logger + +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.llm_response import ( + LLMUserAggregatorParams, +) +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext +from pipecat.services.elevenlabs.tts import ElevenLabsTTSService +from pipecat.services.openai.base_llm import BaseOpenAILLMService +from pipecat.services.openai.llm import OpenAILLMService +from pipecat.services.speechmatics.stt import SpeechmaticsSTTService +from pipecat.transcriptions.language import Language +from pipecat.transports.base_transport import BaseTransport, TransportParams +from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams +from pipecat.transports.services.daily import DailyParams + +load_dotenv(override=True) + +# We store functions so objects (e.g. SileroVADAnalyzer) don't get +# instantiated. The function will be called when the desired transport gets +# selected. +transport_params = { + "daily": lambda: DailyParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), + "twilio": lambda: FastAPIWebsocketParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), + "webrtc": lambda: TransportParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), +} + + +async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool): + """Run example using Speechmatics STT. + + This example will use diarization within our STT service and output the words spoken by + each individual speaker and wrap them with XML tags for the LLM to process. Note the + instructions in the system context for the LLM. This greatly improves the conversation + experience by allowing the LLM to understand who is speaking in a multi-party call. + + If you do not wish to use diarization, then set the `enable_speaker_diarization` parameter + to `False` or omit it altogether. The `text_format` will only be used if diarization is enabled. + + By default, this example will use our ENHANCED operating point, which is optimized for + high accuracy. You can change this by setting the `operating_point` parameter to a different + value. + + For more information on operating points, see the Speechmatics documentation: + https://docs.speechmatics.com/rt-api-ref + """ + logger.info(f"Starting bot") + + stt = SpeechmaticsSTTService( + api_key=os.getenv("SPEECHMATICS_API_KEY"), + language=Language.EN, + enable_speaker_diarization=True, + text_format="<{speaker_id}>{text}", + ) + + tts = ElevenLabsTTSService( + api_key=os.getenv("ELEVENLABS_API_KEY", ""), + voice_id=os.getenv("ELEVENLABS_VOICE_ID", ""), + model="eleven_turbo_v2_5", + ) + + llm = OpenAILLMService( + api_key=os.getenv("OPENAI_API_KEY"), + params=BaseOpenAILLMService.InputParams(temperature=0.75), + ) + + messages = [ + { + "role": "system", + "content": ( + "You are a helpful British assistant called Alfred. " + "Your goal is to demonstrate your capabilities in a succinct way. " + "Your output will be converted to audio so don't include special characters in your answers. " + "Always include punctuation in your responses. " + "Give very short replies - do not give longer replies unless strictly necessary. " + "Respond to what the user said in a concise, funny, creative and helpful way. " + "Use `` tags to identify different speakers - do not use tags in your replies." + ), + }, + ] + + context = OpenAILLMContext(messages) + context_aggregator = llm.create_context_aggregator( + context, + user_params=LLMUserAggregatorParams(aggregation_timeout=0.005), + ) + + pipeline = Pipeline( + [ + transport.input(), # Transport user input + stt, # STT + context_aggregator.user(), # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + context_aggregator.assistant(), # Assistant spoken responses + ] + ) + + task = PipelineTask( + pipeline, + params=PipelineParams( + enable_metrics=True, + enable_usage_metrics=True, + ), + ) + + @transport.event_handler("on_client_connected") + async def on_client_connected(transport, client): + logger.info(f"Client connected") + # Kick off the conversation. + messages.append({"role": "system", "content": "Say a short hello to the user."}) + await task.queue_frames([context_aggregator.user().get_context_frame()]) + + @transport.event_handler("on_client_disconnected") + async def on_client_disconnected(transport, client): + logger.info(f"Client disconnected") + await task.cancel() + + runner = PipelineRunner(handle_sigint=handle_sigint) + + await runner.run(task) + + +if __name__ == "__main__": + from pipecat.examples.run import main + + main(run_example, transport_params=transport_params) diff --git a/examples/foundational/13h-speechmatics-transcription.py b/examples/foundational/13h-speechmatics-transcription.py new file mode 100644 index 000000000..ec3197e19 --- /dev/null +++ b/examples/foundational/13h-speechmatics-transcription.py @@ -0,0 +1,89 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import argparse +import os + +from dotenv import load_dotenv +from loguru import logger + +from pipecat.frames.frames import Frame, TranscriptionFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineTask +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.services.speechmatics.stt import SpeechmaticsSTTService +from pipecat.transcriptions.language import Language +from pipecat.transports.base_transport import BaseTransport, TransportParams +from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams +from pipecat.transports.services.daily import DailyParams + +load_dotenv(override=True) + + +class TranscriptionLogger(FrameProcessor): + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + if isinstance(frame, TranscriptionFrame): + print(f"Transcription: {frame.text}") + + +# We store functions so objects (e.g. SileroVADAnalyzer) don't get +# instantiated. The function will be called when the desired transport gets +# selected. +transport_params = { + "daily": lambda: DailyParams(audio_in_enabled=True), + "twilio": lambda: FastAPIWebsocketParams(audio_in_enabled=True), + "webrtc": lambda: TransportParams(audio_in_enabled=True), +} + + +async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool): + """Run example using Speechmatics STT. + + This example will use diarization within our STT service and output the words spoken by + each individual speaker and wrap them with XML tags. + + If you do not wish to use diarization, then set the `enable_speaker_diarization` parameter + to `False` or omit it altogether. The `text_format` will only be used if diarization is enabled. + + By default, this example will use our ENHANCED operating point, which is optimized for + high accuracy. You can change this by setting the `operating_point` parameter to a different + value. + + For more information on operating points, see the Speechmatics documentation: + https://docs.speechmatics.com/rt-api-ref + """ + logger.info(f"Starting bot") + + stt = SpeechmaticsSTTService( + api_key=os.getenv("SPEECHMATICS_API_KEY"), + language=Language.EN, + enable_speaker_diarization=True, + text_format="<{speaker_id}>{text}", + ) + + tl = TranscriptionLogger() + + pipeline = Pipeline([transport.input(), stt, tl]) + + task = PipelineTask(pipeline) + + @transport.event_handler("on_client_disconnected") + async def on_client_disconnected(transport, client): + logger.info(f"Client disconnected") + await task.cancel() + + runner = PipelineRunner(handle_sigint=handle_sigint) + + await runner.run(task) + + +if __name__ == "__main__": + from pipecat.examples.run import main + + main(run_example, transport_params=transport_params) diff --git a/pyproject.toml b/pyproject.toml index 42b06060d..2a0fd2175 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,6 +87,7 @@ remote-smart-turn = [] silero = [ "onnxruntime~=1.20.1" ] simli = [ "simli-ai~=0.1.10"] soundfile = [ "soundfile~=0.13.0" ] +speechmatics = [ "speechmatics-rt>=0.3.1" ] tavus=[] together = [] tracing = [ "opentelemetry-sdk>=1.33.0", "opentelemetry-api>=1.33.0", "opentelemetry-instrumentation>=0.54b0" ] diff --git a/src/pipecat/services/speechmatics/__init__.py b/src/pipecat/services/speechmatics/__init__.py new file mode 100644 index 000000000..d23112945 --- /dev/null +++ b/src/pipecat/services/speechmatics/__init__.py @@ -0,0 +1,5 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# diff --git a/src/pipecat/services/speechmatics/stt.py b/src/pipecat/services/speechmatics/stt.py new file mode 100644 index 000000000..5cba8d931 --- /dev/null +++ b/src/pipecat/services/speechmatics/stt.py @@ -0,0 +1,813 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +"""Speechmatics STT service integration.""" + +import asyncio +import datetime +import re +from dataclasses import dataclass, field +from typing import Any, AsyncGenerator, Optional +from urllib.parse import urlencode + +from loguru import logger + +from pipecat.frames.frames import ( + CancelFrame, + EndFrame, + Frame, + InterimTranscriptionFrame, + StartFrame, + TranscriptionFrame, +) +from pipecat.services.stt_service import STTService +from pipecat.transcriptions.language import Language +from pipecat.utils.tracing.service_decorators import traced_stt + +try: + from speechmatics.rt import ( + AsyncClient, + AudioEncoding, + AudioFormat, + ConversationConfig, + OperatingPoint, + ServerMessageType, + SpeakerDiarizationConfig, + TranscriptionConfig, + __version__, + ) +except ModuleNotFoundError as e: + logger.error(f"Exception: {e}") + logger.error( + "In order to use Speechmatics, you need to `pip install pipecat-ai[speechmatics]`." + ) + raise Exception(f"Missing module: {e}") + + +class AudioBuffer: + """Audio buffer for STT clients. + + The Python SDK expects audio in a pre-defined number of frames. This + buffer will accumulate the data from the pipeline and provide it to the + STT client in the correct lengths, waiting for the number of frames to + be available. + """ + + def __init__(self, maxsize: int = 0): + """Initialize the audio buffer. + + Args: + maxsize: Maximum size of the buffer. + """ + self._queue = asyncio.Queue(maxsize=maxsize) + self._current_chunk = b"" + self._position = 0 + self._closed = False + + def write_audio(self, data: bytes) -> None: + """Write audio data to the buffer (thread-safe). + + Args: + data: Audio data to write. + """ + if data: + try: + self._queue.put_nowait(data) + except asyncio.QueueFull: + pass + + async def read(self, size: int) -> bytes: + """Read exactly `size` bytes from the buffer (thread-safe). + + This process will block until the required number of bytes are available + in the buffer. Audio is received from the pipeline in varying sizes, so + this buffer will accumulate the data and provide it to the STT client in + the correct lengths, waiting for the number of frames to be available. + + Calling stop() will close the buffer and release the blocking read + process. + + Args: + size: Number of bytes to read. + + Returns: + bytes: Audio data read from the buffer. + """ + result = b"" + bytes_needed = size + + while bytes_needed > 0 and not self._closed: + # Use data from current chunk if available + if self._position < len(self._current_chunk): + available = len(self._current_chunk) - self._position + take = min(bytes_needed, available) + result += self._current_chunk[self._position : self._position + take] + self._position += take + bytes_needed -= take + continue + + # Get next chunk + try: + chunk = await asyncio.wait_for(self._queue.get(), timeout=0.1) + if chunk is None: + continue + self._current_chunk = chunk + self._position = 0 + except asyncio.TimeoutError: + await asyncio.sleep(0) + continue + + return result + + def stop(self) -> None: + """Close the audio buffer.""" + self._closed = True + + +@dataclass +class SpeechFragment: + """Fragment of an utterance. + + Parameters: + start_time: Start time of the fragment in seconds (from session start). + end_time: End time of the fragment in seconds (from session start). + language: Language of the fragment. Defaults to `Language.EN`. + is_eos: Whether the fragment is the end of a sentence. Defaults to `False`. + is_final: Whether the fragment is the final fragment. Defaults to `False`. + attaches_to: Whether the fragment attaches to the previous or next fragment (punctuation). Defaults to empty string. + content: Content of the fragment. Defaults to empty string. + speaker: Speaker of the fragment (if diarization is enabled). Defaults to `None`. + confidence: Confidence of the fragment (0.0 to 1.0). Defaults to `1.0`. + result: Raw result of the fragment from the TTS. + """ + + start_time: float + end_time: float + language: Language = Language.EN + is_eos: bool = False + is_final: bool = False + attaches_to: str = "" + content: str = "" + speaker: Optional[str] = None + confidence: float = 1.0 + result: Optional[Any] = None + + +@dataclass +class SpeakerFragments: + """SpeechFragment items grouped by speaker_id. + + Parameters: + speaker_id: The ID of the speaker. + timestamp: The timestamp of the frame. + language: The language of the frame. + fragments: The list of SpeechFragment items. + """ + + speaker_id: Optional[str] = None + timestamp: Optional[str] = None + language: Optional[Language] = None + fragments: list[SpeechFragment] = field(default_factory=list) + + def __str__(self): + """Return a string representation of the object.""" + return f"SpeakerFragments(speaker_id: {self.speaker_id}, timestamp: {self.timestamp}, language: {self.language}, text: {self._format_text()})" + + def _format_text(self, format: Optional[str] = None) -> str: + """Wrap text with speaker ID in an optional f-string format. + + Args: + format: Format to wrap the text with. + + Returns: + str: The wrapped text. + """ + # Cumulative contents + content = "" + + # Assemble the text + for frag in self.fragments: + if content == "" or frag.attaches_to == "previous": + content += frag.content + else: + content += " " + frag.content + + # Format the text, if format is provided + if format is None or self.speaker_id is None: + return content + return format.format(**{"speaker_id": self.speaker_id, "text": content}) + + def _as_frame_attributes(self, format: Optional[str] = None) -> dict[str, Any]: + """Return a dictionary of attributes for a TranscriptionFrame. + + Args: + format: Format to wrap the text with. + + Returns: + dict[str, Any]: The dictionary of attributes. + """ + return { + "text": self._format_text(format), + "user_id": self.speaker_id, + "timestamp": self.timestamp, + "language": self.language, + "result": [frag.result for frag in self.fragments], + } + + +class SpeechmaticsSTTService(STTService): + """Speechmatics STT service implementation. + + This service provides real-time speech-to-text transcription using the Speechmatics API. + It supports partial and final transcriptions, multiple languages, various audio formats, + and speaker diarization. + """ + + def __init__( + self, + *, + api_key: str, + language: Optional[Language] = None, + language_code: Optional[str] = None, + base_url: str = "wss://eu2.rt.speechmatics.com/v2", + domain: Optional[str] = None, + output_locale: Optional[Language] = None, + output_locale_code: Optional[str] = None, + enable_partials: bool = True, + max_delay: float = 1.5, + sample_rate: Optional[int] = 16000, + chunk_size: int = 256, + audio_encoding: AudioEncoding = AudioEncoding.PCM_S16LE, + end_of_utterance_silence_trigger: float = 0.5, + operating_point: OperatingPoint = OperatingPoint.ENHANCED, + enable_speaker_diarization: bool = False, + text_format: str = "<{speaker_id}>{text}", + max_speakers: Optional[int] = None, + transcription_config: Optional[TranscriptionConfig] = None, + **kwargs, + ): + """Initialize the Speechmatics STT service. + + Args: + api_key: Speechmatics API key for authentication. + language: Language code for transcription. Defaults to `None`. + language_code: Language code string for transcription. Defaults to `None`. + base_url: Base URL for Speechmatics API. Defaults to `wss://eu2.rt.speechmatics.com/v2`. + domain: Domain for Speechmatics API. Defaults to `None`. + output_locale: Output locale for transcription, e.g. `Language.EN_GB`. Defaults to `None`. + output_locale_code: Output locale code for transcription. Defaults to `None`. + enable_partials: Enable partial transcription results. Defaults to `True`. + max_delay: Maximum delay for transcription in seconds. Defaults to `1.5`. + sample_rate: Audio sample rate in Hz. Defaults to `16000`. + chunk_size: Audio chunk size for streaming. Defaults to `256`. + audio_encoding: Audio encoding format. Defaults to `pcm_s16le`. + end_of_utterance_silence_trigger: Silence duration in seconds to trigger end of utterance detection. Defaults to `0.5`. + operating_point: Operating point for transcription accuracy vs. latency tradeoff. Defaults to `enhanced`. + enable_speaker_diarization: Enable speaker diarization to identify different speakers. Defaults to `False`. + text_format: Wrapper for speaker ID. Defaults to `<{speaker_id}>{text}`. + max_speakers: Maximum number of speakers to detect. Defaults to `None` (auto-detect). + transcription_config: Custom transcription configuration (other set parameters are merged). Defaults to `None`. + **kwargs: Additional arguments passed to STTService. + """ + super().__init__(sample_rate=sample_rate, **kwargs) + + # Client configuration + self._api_key: str = api_key + self._language: Optional[Language] = language + self._language_code: Optional[str] = language_code + self._base_url: str = base_url + self._domain: Optional[str] = domain + self._output_locale: Optional[Language] = output_locale + self._output_locale_code: Optional[str] = output_locale_code + self._enable_partials: bool = enable_partials + self._max_delay: float = max_delay + self._sample_rate: int = sample_rate + self._chunk_size: int = chunk_size + self._audio_encoding: AudioEncoding = audio_encoding + self._end_of_utterance_silence_trigger: Optional[float] = end_of_utterance_silence_trigger + self._operating_point: OperatingPoint = operating_point + self._enable_speaker_diarization: bool = enable_speaker_diarization + self._text_format: str = text_format + self._max_speakers: Optional[int] = max_speakers + + # Check we have required attributes + if not self._api_key: + raise ValueError("Missing Speechmatics API key") + if not self._base_url: + raise ValueError("Missing Speechmatics base URL") + + # Validate the language code + if self._language and self._language_code: + raise ValueError("Language and language code cannot both be specified") + elif self._language: + self._language_code = _language_to_speechmatics_language(self._language) + + # Validate the output locale code + if self._output_locale and self._output_locale_code: + raise ValueError("Output locale and output locale code cannot both be specified") + elif self._output_locale: + self._output_locale_code = _locale_to_speechmatics_locale( + self._language_code, self._output_locale + ) + + # Complete configuration objects + self._transcription_config: TranscriptionConfig = None + self._process_config(transcription_config) + + # STT client + self._client: Optional[AsyncClient] = None + self._client_task: Optional[asyncio.Task] = None + self._audio_buffer: AudioBuffer = AudioBuffer(maxsize=10) + self._start_time: Optional[datetime.datetime] = None + + # Current utterance speech data + self._speech_fragments: list[SpeechFragment] = [] + + async def start(self, frame: StartFrame): + """Called when the new session starts.""" + await super().start(frame) + await self._connect() + + async def stop(self, frame: EndFrame): + """Called when the session ends.""" + await super().stop(frame) + await self._disconnect() + + async def cancel(self, frame: CancelFrame): + """Called when the session is cancelled.""" + await super().cancel(frame) + await self._disconnect() + + async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]: + """Adds audio to the audio buffer and yields None.""" + self._audio_buffer.write_audio(audio) + yield None + + async def _run_client(self) -> None: + """Runs the Speechmatics client in a thread.""" + await self._client.transcribe( + self._audio_buffer, + transcription_config=self._transcription_config, + audio_format=AudioFormat( + encoding=self._audio_encoding, + sample_rate=self.sample_rate, + chunk_size=self._chunk_size, + ), + ) + + async def _connect(self) -> None: + """Connect to the STT service.""" + # Create new STT RT client + self._client = AsyncClient( + api_key=self._api_key, + url=_get_endpoint_url(self._base_url), + ) + + # Log the event + logger.debug("Connected to Speechmatics STT service") + + # Recognition started event + @self._client.on(ServerMessageType.RECOGNITION_STARTED) + def _evt_on_recognition_started(message: dict[str, Any]): + logger.debug(f"Recognition started (session: {message.get('id')})") + self._start_time = datetime.datetime.now(datetime.timezone.utc) + + # Partial transcript event + @self._client.on(ServerMessageType.ADD_PARTIAL_TRANSCRIPT) + def _evt_on_partial_transcript(message: dict[str, Any]): + self._handle_transcript(message, is_final=False) + + # Final transcript event + @self._client.on(ServerMessageType.ADD_TRANSCRIPT) + def _evt_on_final_transcript(message: dict[str, Any]): + self._handle_transcript(message, is_final=True) + + # End of Utterance + @self._client.on(ServerMessageType.END_OF_UTTERANCE) + def _evt_on_end_of_utterance(message: dict[str, Any]): + logger.debug("End of utterance received from STT") + asyncio.run_coroutine_threadsafe( + self._send_frames(finalized=True), self.get_event_loop() + ) + + # Start the client in a thread + self._client_task = self.create_task(self._run_client()) + + async def _disconnect(self) -> None: + """Disconnect from the STT service.""" + # Stop the audio buffer + self._audio_buffer.stop() + + # Disconnect the client + try: + if self._client: + await asyncio.wait_for(self._client.close(), timeout=1.0) + except asyncio.TimeoutError: + logger.warning("Timeout while closing Speechmatics client connection") + except Exception as e: + logger.error(f"Error closing Speechmatics client: {e}") + finally: + self._client = None + + # Cancel the client task + if self._client_task: + await self.cancel_task(self._client_task) + self._client_task = None + + # Log the event + logger.debug("Disconnected from Speechmatics STT service") + + def _process_config(self, transcription_config: Optional[TranscriptionConfig] = None) -> None: + """Create a formatted STT transcription config. + + This takes an optional TranscriptionConfig object and populates it with the + values from the STT service. Individual parameters take priority over those + within the config object. + + Args: + transcription_config: Optional transcription config to use. + """ + # Transcription config + if not transcription_config: + transcription_config = TranscriptionConfig( + language=self._language_code or "en", + domain=self._domain, + output_locale=self._output_locale_code, + operating_point=self._operating_point, + diarization="speaker" if self._enable_speaker_diarization else None, + enable_partials=self._enable_partials, + max_delay=self._max_delay or 2.0, + ) + else: + if self._language_code: + transcription_config.language = self._language_code + if self._domain: + transcription_config.domain = self._domain + if self._output_locale_code: + transcription_config.output_locale = self._output_locale_code + if self._operating_point: + transcription_config.operating_point = self._operating_point + if self._enable_speaker_diarization: + transcription_config.diarization = "speaker" + if self._enable_partials: + transcription_config.enable_partials = self._enable_partials + if self._max_delay: + transcription_config.max_delay = self._max_delay + + # Diarization + if self._enable_speaker_diarization and self._max_speakers: + transcription_config.speaker_diarization_config = SpeakerDiarizationConfig( + max_speakers=self._max_speakers, + ) + + # End of Utterance + if self._end_of_utterance_silence_trigger: + transcription_config.conversation_config = ConversationConfig( + end_of_utterance_silence_trigger=self._end_of_utterance_silence_trigger, + ) + + # Set config + self._transcription_config = transcription_config + + def _handle_transcript(self, message: dict[str, Any], is_final: bool) -> None: + """Handle the partial and final transcript events. + + Args: + message: The new Partial or Final from the STT engine. + is_final: Whether the data is final or partial. + """ + # Add the speech fragments + has_changed = self._add_speech_fragments( + message=message, + is_final=is_final, + ) + + # Skip if unchanged + if not has_changed: + return + + # Send frames + asyncio.run_coroutine_threadsafe(self._send_frames(), self.get_event_loop()) + + @traced_stt + async def _handle_transcription( + self, transcript: str, is_final: bool, language: Optional[Language] = None + ): + """Handle a transcription result with tracing.""" + pass + + async def _send_frames(self, finalized: bool = False) -> None: + """Send frames to the pipeline. + + Send speech frames to the pipeline. If VAD is enabled, then this will + also send an interruption and user started speaking frames. When the + final transcript is received, then this will send a user stopped speaking + and stop interruption frames. + + Args: + finalized: Whether the data is final or partial. + """ + # Get speech frames (InterimTranscriptionFrame) + speech_frames = self._get_frames_from_fragments() + + # Skip if no frames + if not speech_frames: + return + + # If final, then re=parse into TranscriptionFrame + if finalized: + # Reset the speech fragments + self._speech_fragments.clear() + + # Transform frames + frames = [ + TranscriptionFrame(**frame._as_frame_attributes(self._text_format)) + for frame in speech_frames + ] + + # Log transcript(s) + logger.debug(f"Finalized transcript: {[f.text for f in frames]}") + + # Return as interim results + else: + frames = [ + InterimTranscriptionFrame(**frame._as_frame_attributes()) for frame in speech_frames + ] + + # Send the frames back to pipecat + for frame in frames: + await self._handle_transcription( + transcript=frame.text, + is_final=finalized, + language=frame.language, + ) + await self.push_frame(frame) + + def _add_speech_fragments(self, message: dict[str, Any], is_final: bool = False) -> bool: + """Takes a new Partial or Final from the STT engine. + + Accumulates it into the _speech_data list. As new final data is added, all + partials are removed from the list. + + Note: If a known speaker is `__[A-Z0-9_]{2,}__`, then the words are skipped, + as this is used to protect against self-interruption by the assistant or to + block out specific known voices. + + Args: + message: The new Partial or Final from the STT engine. + is_final: Whether the data is final or partial. + + Returns: + bool: True if the speech data was updated, False otherwise. + """ + # Parsed new speech data from the STT engine + fragments: list[SpeechFragment] = [] + + # Current length of the speech data + current_length = len(self._speech_fragments) + + # Iterate over the results in the payload + for result in message.get("results", []): + alt = result.get("alternatives", [{}])[0] + if alt.get("content", None): + # Create the new fragment + fragment = SpeechFragment( + start_time=result.get("start_time", 0), + end_time=result.get("end_time", 0), + language=alt.get("language", Language.EN), + is_eos=alt.get("is_eos", False), + is_final=is_final, + attaches_to=result.get("attaches_to", ""), + content=alt.get("content", ""), + speaker=alt.get("speaker", None), + confidence=alt.get("confidence", 1.0), + result=result, + ) + + # Drop `__XX__` speakers + if fragment.speaker and re.match(r"^__[A-Z0-9_]{2,}__$", fragment.speaker): + continue + + # Add the fragment + fragments.append(fragment) + + # Remove existing partials, as new partials and finals are provided + self._speech_fragments = [frag for frag in self._speech_fragments if frag.is_final] + + # Return if no new fragments and length of the existing data is unchanged + if not fragments and len(self._speech_fragments) == current_length: + return False + + # Add the fragments to the speech data + self._speech_fragments.extend(fragments) + + # Data was updated + return True + + def _get_frames_from_fragments(self) -> list[SpeakerFragments]: + """Get speech data objects for the current fragment list. + + Each speech fragments is grouped by contiguous speaker and then + returned as internal SpeakerFragments objects with the `speaker_id` field + set to the current speaker (string). An utterance may contain speech from + more than one speaker (e.g. S1, S2, S1, S3, ...), so they are kept + in strict order for the context of the conversation. + + Returns: + list[SpeakerFragments]: The list of objects. + """ + # Speaker groups + current_speaker: str | None = None + speaker_groups: list[list[SpeechFragment]] = [[]] + + # Group by speakers + for frag in self._speech_fragments: + if frag.speaker != current_speaker: + current_speaker = frag.speaker + if speaker_groups[-1]: + speaker_groups.append([]) + speaker_groups[-1].append(frag) + + # Create SpeakerFragments objects + speaker_fragments: list[SpeakerFragments] = [] + for group in speaker_groups: + sd = self._get_speaker_fragments_from_fragment_group(group) + if sd: + speaker_fragments.append(sd) + + # Return the grouped SpeakerFragments objects + return speaker_fragments + + def _get_speaker_fragments_from_fragment_group( + self, + group: list[SpeechFragment], + ) -> SpeakerFragments | None: + """Take a group of fragments and piece together into SpeakerFragments. + + Each fragment for a given speaker is assembled into a string, + taking into consideration whether words are attached to the + previous or next word (notably punctuation). This ensures that + the text does not have extra spaces. This will also check for + any straggling punctuation from earlier utterances that should + be removed. + + Args: + group: List of SpeechFragment objects. + + Returns: + SpeakerFragments: The object for the group. + """ + # Check for starting fragments that are attached to previous + if group and group[0].attaches_to == "previous": + group = group[1:] + + # Check for trailing fragments that are attached to next + if group and group[-1].attaches_to == "next": + group = group[:-1] + + # Check there are results + if not group: + return None + + # Get the timing extremes + start_time = min(frag.start_time for frag in group) + + # Timestamp + ts = (self._start_time + datetime.timedelta(seconds=start_time)).isoformat( + timespec="milliseconds" + ) + + # Return the SpeakerFragments object + return SpeakerFragments( + speaker_id=group[0].speaker, + timestamp=ts, + language=group[0].language, + fragments=group, + ) + + +def _get_endpoint_url(url: str) -> str: + """Format the endpoint URL with the SDK and app versions. + + Args: + url: The base URL for the endpoint. + + Returns: + str: The formatted endpoint URL. + """ + query_params = dict() + query_params["sm-app"] = f"pipecat/{__version__}" + query = urlencode(query_params) + + return f"{url}?{query}" + + +def _language_to_speechmatics_language(language: Language) -> str: + """Convert a Language enum to a Speechmatics language code. + + Args: + language: The Language enum to convert. + + Returns: + str: The Speechmatics language code, if found. + """ + # List of supported input languages + BASE_LANGUAGES = { + Language.AR: "ar", + Language.BA: "ba", + Language.EU: "eu", + Language.BE: "be", + Language.BG: "bg", + Language.BN: "bn", + Language.YUE: "yue", + Language.CA: "ca", + Language.HR: "hr", + Language.CS: "cs", + Language.DA: "da", + Language.NL: "nl", + Language.EN: "en", + Language.EO: "eo", + Language.ET: "et", + Language.FA: "fa", + Language.FI: "fi", + Language.FR: "fr", + Language.GL: "gl", + Language.DE: "de", + Language.EL: "el", + Language.HE: "he", + Language.HI: "hi", + Language.HU: "hu", + Language.IT: "it", + Language.ID: "id", + Language.GA: "ga", + Language.JA: "ja", + Language.KO: "ko", + Language.LV: "lv", + Language.LT: "lt", + Language.MS: "ms", + Language.MT: "mt", + Language.CMN: "cmn", + Language.MR: "mr", + Language.MN: "mn", + Language.NO: "no", + Language.PL: "pl", + Language.PT: "pt", + Language.RO: "ro", + Language.RU: "ru", + Language.SK: "sk", + Language.SL: "sl", + Language.ES: "es", + Language.SV: "sv", + Language.SW: "sw", + Language.TA: "ta", + Language.TH: "th", + Language.TR: "tr", + Language.UG: "ug", + Language.UK: "uk", + Language.UR: "ur", + Language.VI: "vi", + Language.CY: "cy", + } + + # Get the language code + result = BASE_LANGUAGES.get(language) + + # Fail if language is not supported + if not result: + raise ValueError(f"Unsupported language: {language}") + + # Return the language code + return result + + +def _locale_to_speechmatics_locale(language_code: str, locale: Language) -> Optional[str]: + """Convert a Language enum to a Speechmatics language code. + + Args: + language_code: The language code. + locale: The Language enum to convert. + + Returns: + str: The Speechmatics language code, if found. + """ + # Languages and output locales + LOCALES = { + "en": { + Language.EN_GB: "en-GB", + Language.EN_US: "en-US", + Language.EN_AU: "en-AU", + }, + } + + # Get the locale code + result = LOCALES.get(language_code, {}).get(locale) + + # Fail if locale is not supported + if not result: + logger.warning(f"Unsupported output locale: {locale}, defaulting to {language_code}") + + # Return the locale code + return result diff --git a/src/pipecat/transcriptions/language.py b/src/pipecat/transcriptions/language.py index a2f269309..182a89321 100644 --- a/src/pipecat/transcriptions/language.py +++ b/src/pipecat/transcriptions/language.py @@ -145,6 +145,9 @@ class Language(StrEnum): EN_US = "en-US" EN_ZA = "en-ZA" + # Esperanto + EO = "eo" + # Spanish ES = "es" ES_AR = "es-AR" @@ -474,6 +477,9 @@ class Language(StrEnum): # Tatar TT = "tt" + # Uyghur + UG = "ug" + # Ukrainian UK = "uk" UK_UA = "uk-UA"