Japanese example

2025-01-02 18:05:20 +08:00
1 changed files with 37 additions and 15 deletions
--- a/examples/translation-chatbot/bot.py
+++ b/examples/translation-chatbot/bot.py
@@ -16,15 +16,14 @@ from runner import configure
 from pipecat.frames.frames import Frame, LLMMessagesFrame, TextFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
-from pipecat.pipeline.task import PipelineTask
+from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.llm_response import LLMFullResponseAggregator
 from pipecat.processors.aggregators.sentence import SentenceAggregator
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
-from pipecat.services.azure import AzureTTSService
+from pipecat.services.azure import AzureSTTService, AzureTTSService
 from pipecat.services.openai import OpenAILLMService
 from pipecat.transports.services.daily import (
    DailyParams,
-    DailyTranscriptionSettings,
    DailyTransport,
    DailyTransportMessageFrame,
 )
@@ -44,18 +43,20 @@ It also isn't saving what the user or bot says into the context object for use i
 # We need to use a custom service here to yield LLM frames without saving
 # any context
 class TranslationProcessor(FrameProcessor):
-    def __init__(self, language):
+    def __init__(self, source_language, language):
        super().__init__()
        self._language = language
+        self._source_language = source_language

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        if isinstance(frame, TextFrame):
+            logger.debug(f"Translating {self._source_language}: {frame.text} to {self._language}")
            context = [
                {
                    "role": "system",
-                    "content": f"You will be provided with a sentence in English, and your task is to translate it into {self._language}.",
+                    "content": f"You will be provided with a sentence in {self._source_language}, and your task is to only translate it into {self._language}.",
                },
                {"role": "user", "content": frame.text},
            ]
@@ -79,7 +80,8 @@ class TranslationSubtitles(FrameProcessor):
        await super().process_frame(frame, direction)

        if isinstance(frame, TextFrame):
-            message = {"language": self._language, "text": frame.text}
+            print(f"TranslationSubtitles: {frame.text}")
+            message = {"event": "translation", "language": self._language, "text": frame.text}
            await self.push_frame(DailyTransportMessageFrame(message))

        await self.push_frame(frame)
@@ -92,34 +94,54 @@ async def main():
        transport = DailyTransport(
            room_url,
            token,
-            "Translator",
+            "Translator bot",
            DailyParams(
                audio_out_enabled=True,
-                transcription_enabled=True,
-                transcription_settings=DailyTranscriptionSettings(extra={"interim_results": False}),
+                vad_enabled=True,
+                vad_audio_passthrough=True,
            ),
        )

+        stt = AzureSTTService(
+            api_key=os.getenv("AZURE_SPEECH_API_KEY"),
+            region=os.getenv("AZURE_SPEECH_REGION"),
+            language="en-US",  # must match the translator's source language (English)
+        )
+
        tts = AzureTTSService(
            api_key=os.getenv("AZURE_SPEECH_API_KEY"),
            region=os.getenv("AZURE_SPEECH_REGION"),
-            voice="es-ES-AlvaroNeural",
+            # Use a Japanese neural voice from Azure; the supported voices are listed at
+            # https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#text-to-speech
+            voice="ja-JP-KeitaNeural",
        )

        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")

        sa = SentenceAggregator()
-        tp = TranslationProcessor("Spanish")
+        tp = TranslationProcessor(source_language="English", language="Japanese")
        lfra = LLMFullResponseAggregator()
-        ts = TranslationSubtitles("spanish")
+        ts = TranslationSubtitles("japanese")

-        pipeline = Pipeline([transport.input(), sa, tp, llm, lfra, ts, tts, transport.output()])
+        pipeline = Pipeline(
+            [
+                transport.input(),
+                stt,
+                sa,
+                tp,
+                llm,
+                lfra,
+                ts,
+                tts,
+                transport.output(),
+            ]
+        )

-        task = PipelineTask(pipeline)
+        task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True))

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
-            await transport.capture_participant_transcription(participant["id"])
+            logger.info("First participant joined")

        runner = PipelineRunner()