From 34b10cb4c7a07fdd57cfe7c362fe4d2e1479bbde Mon Sep 17 00:00:00 2001
From: Chad Bailey <chadbailey@gmail.com>
Date: Tue, 19 Mar 2024 22:04:47 +0000
Subject: [PATCH] wip: rotate telestrator narrators, add debug frame tracing

---
 src/dailyai/pipeline/aggregators.py           |  6 ++
 src/dailyai/services/ai_services.py           |  9 ++-
 src/dailyai/services/elevenlabs_ai_service.py |  9 ++-
 src/dailyai/services/open_ai_services.py      |  1 +
 .../starter-apps/telestrator/telestrator.py   | 77 +++++++++++++++++--
 5 files changed, 88 insertions(+), 14 deletions(-)

diff --git a/src/dailyai/pipeline/aggregators.py b/src/dailyai/pipeline/aggregators.py
index f9bcaca13..6dd5824cf 100644
--- a/src/dailyai/pipeline/aggregators.py
+++ b/src/dailyai/pipeline/aggregators.py
@@ -252,9 +252,15 @@ class LLMFullResponseAggregator(FrameProcessor):
         self.aggregation = ""
 
     async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if not isinstance(frame, AudioFrame):
+            print(f"^^^ LFRA got frame: {frame}")
         if isinstance(frame, TextFrame):
             self.aggregation += frame.text
+            print(
+                f"^^^ LFRA got textframe. aggregation is now {self.aggregation}")
         elif isinstance(frame, LLMResponseEndFrame):
+            print(
+                f"^^^ LFRA got an llmresponseendframe. About to yield aggregation: {self.aggregation}")
             yield TextFrame(self.aggregation)
             yield frame
             self.aggregation = ""
diff --git a/src/dailyai/services/ai_services.py b/src/dailyai/services/ai_services.py
index 0edbb53ae..206a0ff4e 100644
--- a/src/dailyai/services/ai_services.py
+++ b/src/dailyai/services/ai_services.py
@@ -62,6 +62,7 @@ class TTSService(AIService):
                 yield TextFrame(self.current_sentence)
 
         if not isinstance(frame, TextFrame):
+            print(f"*** tts yielding non-text: {frame}")
             yield frame
             return
 
@@ -80,6 +81,7 @@ class TTSService(AIService):
 
             # note we pass along the text frame *after* the audio, so the text
             # frame is completed after the audio is processed.
+            print(f"*** tts yielding text: {text}")
             yield TextFrame(text)
 
 
@@ -147,6 +149,8 @@ class VisionService(AIService):
     async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
         if isinstance(frame, VisionFrame):
             async for frame in self.run_vision(frame.prompt, frame.image):
+                print(
+                    f"&&& visionservce processframe got frame to yield: {frame}")
                 yield frame
             yield LLMResponseEndFrame()
         else:
@@ -159,8 +163,9 @@ class FrameLogger(AIService):
         self.prefix = prefix
 
     async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
-        if isinstance(frame, (AudioFrame, ImageFrame)):
-            self.logger.info(f"{self.prefix}: {type(frame)}")
+        if isinstance(frame, (AudioFrame)):
+            # self.logger.info(f"{self.prefix}: {type(frame)}")
+            pass
         else:
             print(f"{self.prefix}: {frame}")
 
diff --git a/src/dailyai/services/elevenlabs_ai_service.py b/src/dailyai/services/elevenlabs_ai_service.py
index e010c7934..b2675e007 100644
--- a/src/dailyai/services/elevenlabs_ai_service.py
+++ b/src/dailyai/services/elevenlabs_ai_service.py
@@ -15,18 +15,19 @@ class ElevenLabsTTSService(TTSService):
         *,
         aiohttp_session: aiohttp.ClientSession,
         api_key,
-        voice_id,
+        narrator,
         model="eleven_turbo_v2",
+        aggregate_sentences=True
     ):
-        super().__init__()
+        super().__init__(aggregate_sentences)
 
         self._api_key = api_key
-        self._voice_id = voice_id
+        self._narrator = narrator
         self._aiohttp_session = aiohttp_session
         self._model = model
 
     async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
-        url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream"
+        url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._narrator['narrator']['voice_id']}/stream"
         payload = {"text": sentence, "model_id": self._model}
         querystring = {
             "output_format": "pcm_16000",
diff --git a/src/dailyai/services/open_ai_services.py b/src/dailyai/services/open_ai_services.py
index afcb62979..d3f56122a 100644
--- a/src/dailyai/services/open_ai_services.py
+++ b/src/dailyai/services/open_ai_services.py
@@ -118,6 +118,7 @@ class OpenAIVisionService(VisionService):
             )
         )
         async for chunk in chunks:
+            print(f"%%% chunk: {chunk}")
             if len(chunk.choices) == 0:
                 continue
             if chunk.choices[0].delta.content:
diff --git a/src/examples/starter-apps/telestrator/telestrator.py b/src/examples/starter-apps/telestrator/telestrator.py
index 8c405b015..db2c8bf74 100644
--- a/src/examples/starter-apps/telestrator/telestrator.py
+++ b/src/examples/starter-apps/telestrator/telestrator.py
@@ -2,6 +2,7 @@ import asyncio
 import aiohttp
 import logging
 import os
+import random
 from typing import AsyncGenerator
 
 from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame
@@ -25,14 +26,69 @@ logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
 logger = logging.getLogger("dailyai")
 logger.setLevel(logging.DEBUG)
 
+narrators = [  # each entry: an ElevenLabs voice_id plus the vision prompt used with it
+    {
+        "voice_id": "wDRBdcyPzQOCeq51IxW5",
+        "prompt": "Describe the image in one sentence, in the style of David Attenborough."
+    },
+    {
+        "voice_id": "M3bAX0o3Ptb2l6XqwQJV",
+        "prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show."
+    },
+    {
+        "voice_id": "lJm5d2ZZ3UE4qYOxl2t7",
+        "prompt": "Describe the image in one sentence, in the style of Oprah Winfrey."
+    },
+    {
+        "voice_id": "7SNUlQ8GAbnZxRO9CKOt",
+        "prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England."
+    },
+    {
+        "voice_id": "PWEz02ggFiibL6P5PKRx",
+        "prompt": "Describe the image in one sentence, in the style of Kanye West."
+    },
+    {
+        "voice_id": "gvpBhHjzfd7M2WedYVUI",
+        "prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek."
+    },
+    {
+        "voice_id": "bnyr1EF3snReVXauGBNn",
+        "prompt": "Describe the image in one sentence, in the style of Maya Angelou."

-class VideoImageFrameProcessor(FrameProcessor):
-    def __init__(self):
-        pass
+    }
+]
+
+random.shuffle(narrators)  # randomize the starting order once, at import time
+print(f"$$$ narrators: {narrators}")
+narrator = {"narrator": narrators[0]}  # mutable holder shared by the processors and the TTS service
+
+
+class NarratorShuffle(FrameProcessor):  # Selects the next narrator for each image frame seen.
+    def __init__(self, narrator, narrators):
+        self._narrator = narrator  # shared holder dict; its "narrator" key is rebound below
+        self._narrators = narrators  # list of narrator dicts; reshuffled in place when exhausted
+        self._i = 0  # index into self._narrators of the current narrator
 
     async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
-        if isinstance(frame, VideoImageFrame) or isinstance(frame, TelestratorImageFrame):
-            yield VisionFrame("Describe the image in one sentence, in the style of David Attenborough.", frame.image)
+        if isinstance(frame, (ImageFrame, TelestratorImageFrame)):
+            self._i += 1  # advance first, so the initial narrator is only used until the first image
+            if self._i >= len(self._narrators):  # ran off the end: reshuffle and start over
+                print("### shuffling narrators")
+                random.shuffle(self._narrators)
+                self._i = 0
+
+            self._narrator["narrator"] = self._narrators[self._i]  # mutate the holder, do not rebind it
+            print(f"### new narrator is {self._narrator}")
+        yield frame  # every frame is passed through unchanged
+
+
+class VideoImageFrameProcessor(FrameProcessor):  # Wraps image frames in a VisionFrame with a prompt.
+    def __init__(self, narrator):
+        self._narrator = narrator  # holder dict; looked up per frame, so later changes to it are seen
+
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, (VideoImageFrame, TelestratorImageFrame)):
+            yield VisionFrame(self._narrator["narrator"]["prompt"], frame.image)
         else:
             yield frame
 
@@ -75,7 +131,8 @@ async def main(room_url: str, token):
         tts = ElevenLabsTTSService(
             aiohttp_session=session,
             api_key=os.getenv("ELEVENLABS_API_KEY"),
-            voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
+            narrator=narrator,
+            aggregate_sentences=False
         )
 
         llm = OpenAILLMService(
@@ -83,7 +140,7 @@ async def main(room_url: str, token):
             model="gpt-4-turbo-preview")
 
         vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
-        vifp = VideoImageFrameProcessor()
+        vifp = VideoImageFrameProcessor(narrator)
         ir = ImageRefresher()
         img = FalImageGenService(
             image_size="1024x1024",
@@ -93,13 +150,17 @@ async def main(room_url: str, token):
         )
         tiw = TelestratorImageWrapper()
         lfra = LLMFullResponseAggregator()
+        fl0 = FrameLogger("@@@ About to describe")
         fl1 = FrameLogger("!!! About to image gen")
+        ns = NarratorShuffle(narrator, narrators)
         pipeline = Pipeline(
             processors=[
+                ns,
+                fl0,
                 vifp,
                 vs,
-                tts,
                 lfra,
+                tts,
                 fl1,
                 img,
                 tiw,