Live translation (#61)

* added translator

* fixup
This commit is contained in:
chadbailey59
2024-03-18 13:26:05 -05:00
committed by GitHub
parent 141a5bb548
commit 78638d2dba
3 changed files with 90 additions and 3 deletions

View File

@@ -25,20 +25,21 @@ from dailyai.services.openai_api_llm_service import BaseOpenAILLMService
class AzureTTSService(TTSService):
def __init__(self, *, api_key, region):
def __init__(self, *, api_key, region, voice="en-US-SaraNeural"):
super().__init__()
self.speech_config = SpeechConfig(subscription=api_key, region=region)
self.speech_synthesizer = SpeechSynthesizer(
speech_config=self.speech_config, audio_config=None
)
self._voice = voice
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
self.logger.info("Running azure tts")
ssml = (
"<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis' "
"xmlns:mstts='http://www.w3.org/2001/mstts'>"
"<voice name='en-US-SaraNeural'>"
f"<voice name='{self._voice}'>"
"<mstts:silence type='Sentenceboundary' value='20ms' />"
"<mstts:express-as style='lyrical' styledegree='2' role='SeniorFemale'>"
"<prosody rate='1.05'>"

View File

@@ -16,16 +16,18 @@ class ElevenLabsTTSService(TTSService):
aiohttp_session: aiohttp.ClientSession,
api_key,
voice_id,
model="eleven_turbo_v2",
):
super().__init__()
self._api_key = api_key
self._voice_id = voice_id
self._aiohttp_session = aiohttp_session
self._model = model
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream"
payload = {"text": sentence, "model_id": "eleven_turbo_v2"}
payload = {"text": sentence, "model_id": self._model}
querystring = {"output_format": "pcm_16000", "optimize_streaming_latency": 2}
headers = {
"xi-api-key": self._api_key,

View File

@@ -0,0 +1,84 @@
import asyncio
import aiohttp
import logging
import os
from PIL import Image
from typing import AsyncGenerator
from dailyai.pipeline.aggregators import (
LLMResponseAggregator,
UserResponseAggregator,
SentenceAggregator,
)
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, TextFrame
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.services.ai_services import AIService, FrameLogger
from dailyai.pipeline.pipeline import Pipeline
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.azure_ai_services import AzureTTSService
from dailyai.services.open_ai_services import OpenAILLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from examples.support.runner import configure
# Log records are rendered as "<numeric level> <timestamp> <message>".
# The %(...)s placeholders are substituted by the logging module itself, so the
# format is a plain string literal (an f-string with no fields is just noise).
logging.basicConfig(format="%(levelno)s %(asctime)s %(message)s")

# Logger named "dailyai", switched to DEBUG so debug-level records are not
# filtered out for this example.
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
"""
This example looks a bit different from the chatbot example, because it doesn't wait for the user to stop talking before it starts translating.
It also doesn't save what the user or bot says into the context object for use in subsequent interactions.
"""
# We need to use a custom service here to yield LLM frames without saving any context
class TranslationProcessor(FrameProcessor):
    """Turn each text frame into a stand-alone LLM translation request.

    Every ``TextFrame`` is replaced by an ``LLMMessagesQueueFrame`` carrying a
    freshly built two-message context (a system instruction plus the frame's
    text as the user message). Nothing is remembered between frames. Frames of
    any other type are yielded unchanged.
    """

    def __init__(self, language: str):
        """Store the target language.

        Args:
            language: Name of the language to translate into; it is
                interpolated into the system prompt (e.g. ``"Spanish"``).
        """
        # Give the FrameProcessor base class the chance to run whatever setup
        # it defines; skipping this would silently break if it ever adds state.
        super().__init__()
        self._language = language

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        """Yield a translation request for text frames, or the frame itself.

        Args:
            frame: The incoming pipeline frame.

        Yields:
            An ``LLMMessagesQueueFrame`` built from ``frame.text`` when
            ``frame`` is a ``TextFrame``; otherwise ``frame`` unchanged.
        """
        if not isinstance(frame, TextFrame):
            # Not text: nothing to translate, forward as-is.
            yield frame
            return

        # A new context per frame keeps each sentence an independent request.
        context = [
            {
                "role": "system",
                "content": f"You will be provided with a sentence in English, and your task is to translate it into {self._language}.",
            },
            {"role": "user", "content": frame.text},
        ]
        yield LLMMessagesQueueFrame(context)
async def main(room_url: str, token):
    """Join the given room as "Translator" and run the translation pipeline.

    The pipeline is: sentence aggregation -> translation prompt construction
    -> OpenAI LLM -> Azure text-to-speech using a Spanish voice.

    Args:
        room_url: URL of the room, passed to the transport.
        token: Token passed to the transport alongside the URL.
    """
    async with aiohttp.ClientSession() as session:
        # Transport: microphone on at 16 kHz, camera off, transcription and
        # VAD enabled, session duration of 5 minutes.
        transport = DailyTransportService(
            room_url,
            token,
            "Translator",
            duration_minutes=5,
            start_transcription=True,
            mic_enabled=True,
            mic_sample_rate=16000,
            camera_enabled=False,
            vad_enabled=True,
        )

        # Credentials for both services are read from the environment.
        azure_tts = AzureTTSService(
            api_key=os.getenv("AZURE_SPEECH_API_KEY"),
            region=os.getenv("AZURE_SPEECH_REGION"),
            voice="es-ES-AlvaroNeural",
        )
        openai_llm = OpenAILLMService(
            api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
            model="gpt-4-turbo-preview",
        )

        # Stage order matters: sentences -> prompt -> LLM -> speech.
        stages = [
            SentenceAggregator(),
            TranslationProcessor("Spanish"),
            openai_llm,
            azure_tts,
        ]
        pipeline = Pipeline(stages)

        # Turn on endpointing and punctuation in the transcription "extra"
        # settings before the transport starts running.
        transcription_extra = transport.transcription_settings["extra"]
        transcription_extra["endpointing"] = True
        transcription_extra["punctuate"] = True

        await transport.run(pipeline)
if __name__ == "__main__":
    # Script entry point: obtain the room URL and token from the example
    # runner's configure() helper, then run main() to completion.
    url, token = configure()
    asyncio.run(main(url, token))