Live translation (#61)

* added translator * fixup
2024-03-18 13:26:05 -05:00
parent 141a5bb548
commit 78638d2dba
3 changed files with 90 additions and 3 deletions
--- a/src/dailyai/services/azure_ai_services.py
+++ b/src/dailyai/services/azure_ai_services.py
@@ -25,20 +25,21 @@ from dailyai.services.openai_api_llm_service import BaseOpenAILLMService


 class AzureTTSService(TTSService):
-    def __init__(self, *, api_key, region):
+    def __init__(self, *, api_key, region, voice="en-US-SaraNeural"):
        super().__init__()

        self.speech_config = SpeechConfig(subscription=api_key, region=region)
        self.speech_synthesizer = SpeechSynthesizer(
            speech_config=self.speech_config, audio_config=None
        )
+        self._voice = voice

    async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
        self.logger.info("Running azure tts")
        ssml = (
            "<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis' "
            "xmlns:mstts='http://www.w3.org/2001/mstts'>"
-            "<voice name='en-US-SaraNeural'>"
+            f"<voice name='{self._voice}'>"
            "<mstts:silence type='Sentenceboundary' value='20ms' />"
            "<mstts:express-as style='lyrical' styledegree='2' role='SeniorFemale'>"
            "<prosody rate='1.05'>"
--- a/src/dailyai/services/elevenlabs_ai_service.py
+++ b/src/dailyai/services/elevenlabs_ai_service.py
@@ -16,16 +16,18 @@ class ElevenLabsTTSService(TTSService):
        aiohttp_session: aiohttp.ClientSession,
        api_key,
        voice_id,
+        model="eleven_turbo_v2",
    ):
        super().__init__()

        self._api_key = api_key
        self._voice_id = voice_id
        self._aiohttp_session = aiohttp_session
+        self._model = model

    async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
        url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream"
-        payload = {"text": sentence, "model_id": "eleven_turbo_v2"}
+        payload = {"text": sentence, "model_id": self._model}
        querystring = {"output_format": "pcm_16000", "optimize_streaming_latency": 2}
        headers = {
            "xi-api-key": self._api_key,
--- a/src/examples/starter-apps/translator.py
+++ b/src/examples/starter-apps/translator.py
@@ -0,0 +1,84 @@
+import asyncio
+import aiohttp
+import logging
+import os
+from PIL import Image
+from typing import AsyncGenerator
+
+from dailyai.pipeline.aggregators import (
+    LLMResponseAggregator,
+    UserResponseAggregator,
+    SentenceAggregator,
+)
+from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, TextFrame
+from dailyai.pipeline.frame_processor import FrameProcessor
+from dailyai.services.ai_services import AIService, FrameLogger
+from dailyai.pipeline.pipeline import Pipeline
+from dailyai.services.daily_transport_service import DailyTransportService
+from dailyai.services.azure_ai_services import AzureTTSService
+from dailyai.services.open_ai_services import OpenAILLMService
+from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
+from examples.support.runner import configure
+
+logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
+logger = logging.getLogger("dailyai")
+logger.setLevel(logging.DEBUG)
+
+"""
+This example differs from the chatbot example: it does not wait for the user to stop talking before it starts translating.
+It also does not save what the user or bot says into the context object for use in subsequent interactions.
+"""
+
+
+# Custom processor: wraps each TextFrame in a freshly built LLM message list, so no context is saved between frames
+class TranslationProcessor(FrameProcessor):
+    def __init__(self, language):  # language: target-language name interpolated into the system prompt
+        self._language = language  # NOTE(review): no super().__init__() call here — confirm FrameProcessor needs none
+
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, TextFrame):  # text frames become translation requests
+            context = [  # rebuilt on every call; nothing is stored on self
+                {
+                    "role": "system",
+                    "content": f"You will be provided with a sentence in English, and your task is to translate it into {self._language}.",
+                },
+                {"role": "user", "content": frame.text},  # the frame's text is sent as the user message
+            ]
+            yield LLMMessagesQueueFrame(context)
+        else:  # every other frame type is yielded unchanged
+            yield frame
+
+
+async def main(room_url: str, token):  # builds the transport and the translation pipeline, then runs them
+    async with aiohttp.ClientSession() as session:  # NOTE(review): session is not passed to anything in this function
+        transport = DailyTransportService(
+            room_url,
+            token,
+            "Translator",
+            duration_minutes=5,
+            start_transcription=True,
+            mic_enabled=True,
+            mic_sample_rate=16000,
+            camera_enabled=False,
+            vad_enabled=True,
+        )
+        tts = AzureTTSService(
+            api_key=os.getenv("AZURE_SPEECH_API_KEY"),  # os.getenv returns None when the variable is unset
+            region=os.getenv("AZURE_SPEECH_REGION"),
+            voice="es-ES-AlvaroNeural",  # overrides the service's default voice ("en-US-SaraNeural")
+        )
+        llm = OpenAILLMService(
+            api_key=os.getenv("OPENAI_CHATGPT_API_KEY"), model="gpt-4-turbo-preview"
+        )
+        sa = SentenceAggregator()
+        tp = TranslationProcessor("Spanish")  # target language used in the system prompt
+        pipeline = Pipeline([sa, tp, llm, tts])  # presumably run in list order (sentences -> prompt -> LLM -> speech); confirm against Pipeline
+
+        transport.transcription_settings["extra"]["endpointing"] = True  # both flags are set before run() is awaited
+        transport.transcription_settings["extra"]["punctuate"] = True
+        await transport.run(pipeline)  # hand the pipeline to the transport
+
+
+if __name__ == "__main__":  # only when executed as a script
+    (url, token) = configure()  # room URL and token come from examples.support.runner.configure
+    asyncio.run(main(url, token))  # run the async entry point to completion