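wip: rotate narrator voice/prompt per image in the telestrator example; add debug prints to pipeline services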

2024-03-19 22:04:47 +00:00
parent e726f15c4e
commit 34b10cb4c7
5 changed files with 88 additions and 14 deletions
--- a/src/dailyai/pipeline/aggregators.py
+++ b/src/dailyai/pipeline/aggregators.py
@@ -252,9 +252,15 @@ class LLMFullResponseAggregator(FrameProcessor):
        self.aggregation = ""

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if not isinstance(frame, AudioFrame):
+            print(f"^^^ LFRA got frame: {frame}")
        if isinstance(frame, TextFrame):
            self.aggregation += frame.text
+            print(
+                f"^^^ LFRA got textframe. aggregation is now {self.aggregation}")
        elif isinstance(frame, LLMResponseEndFrame):
+            print(
+                f"^^^ LFRA got an llmresponseendframe. About to yield aggregation: {self.aggregation}")
            yield TextFrame(self.aggregation)
            yield frame
            self.aggregation = ""
--- a/src/dailyai/services/ai_services.py
+++ b/src/dailyai/services/ai_services.py
@@ -62,6 +62,7 @@ class TTSService(AIService):
                yield TextFrame(self.current_sentence)

        if not isinstance(frame, TextFrame):
+            print(f"*** tts yielding non-text: {frame}")
            yield frame
            return

@@ -80,6 +81,7 @@ class TTSService(AIService):

            # note we pass along the text frame *after* the audio, so the text
            # frame is completed after the audio is processed.
+            print(f"*** tts yielding text: {text}")
            yield TextFrame(text)


@@ -147,6 +149,8 @@ class VisionService(AIService):
    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        if isinstance(frame, VisionFrame):
            async for frame in self.run_vision(frame.prompt, frame.image):
+                print(
+                    f"&&& visionservce processframe got frame to yield: {frame}")
                yield frame
            yield LLMResponseEndFrame()
        else:
@@ -159,8 +163,9 @@ class FrameLogger(AIService):
        self.prefix = prefix

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
-        if isinstance(frame, (AudioFrame, ImageFrame)):
-            self.logger.info(f"{self.prefix}: {type(frame)}")
+        if isinstance(frame, (AudioFrame)):
+            # self.logger.info(f"{self.prefix}: {type(frame)}")
+            pass
        else:
            print(f"{self.prefix}: {frame}")

--- a/src/dailyai/services/elevenlabs_ai_service.py
+++ b/src/dailyai/services/elevenlabs_ai_service.py
@@ -15,18 +15,19 @@ class ElevenLabsTTSService(TTSService):
        *,
        aiohttp_session: aiohttp.ClientSession,
        api_key,
-        voice_id,
+        narrator,
        model="eleven_turbo_v2",
+        aggregate_sentences=True
    ):
-        super().__init__()
+        super().__init__(aggregate_sentences)

        self._api_key = api_key
-        self._voice_id = voice_id
+        self._narrator = narrator
        self._aiohttp_session = aiohttp_session
        self._model = model

    async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
-        url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream"
+        url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._narrator['narrator']['voice_id']}/stream"
        payload = {"text": sentence, "model_id": self._model}
        querystring = {
            "output_format": "pcm_16000",
--- a/src/dailyai/services/open_ai_services.py
+++ b/src/dailyai/services/open_ai_services.py
@@ -118,6 +118,7 @@ class OpenAIVisionService(VisionService):
            )
        )
        async for chunk in chunks:
+            print(f"%%% chunk: {chunk}")
            if len(chunk.choices) == 0:
                continue
            if chunk.choices[0].delta.content:
--- a/src/examples/starter-apps/telestrator/telestrator.py
+++ b/src/examples/starter-apps/telestrator/telestrator.py
@@ -2,6 +2,7 @@ import asyncio
 import aiohttp
 import logging
 import os
+import random
 from typing import AsyncGenerator

 from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame
@@ -25,14 +26,69 @@ logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
 logger = logging.getLogger("dailyai")
 logger.setLevel(logging.DEBUG)

+narrators = [  # each entry pairs a TTS voice_id with the vision prompt used for that persona
+    {
+        "voice_id": "wDRBdcyPzQOCeq51IxW5",
+        "prompt": "Describe the image in one sentence, in the style of David Attenborough."
+    },
+    {
+        "voice_id": "M3bAX0o3Ptb2l6XqwQJV",
+        "prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show."
+    },
+    {
+        "voice_id": "lJm5d2ZZ3UE4qYOxl2t7",
+        "prompt": "Describe the image in one sentence, in the style of Oprah Winfrey."
+    },
+    {
+        "voice_id": "7SNUlQ8GAbnZxRO9CKOt",
+        "prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England."
+    },
+    {
+        "voice_id": "PWEz02ggFiibL6P5PKRx",
+        "prompt": "Describe the image in one sentence, in the style of Kanye West."
+    },
+    {
+        "voice_id": "gvpBhHjzfd7M2WedYVUI",
+        "prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek."
+    },
+    {
+        "voice_id": "bnyr1EF3snReVXauGBNn",
+        "prompt": "Describe the image in one sentence, in the style of Maya Angelou."

-class VideoImageFrameProcessor(FrameProcessor):
-    def __init__(self):
-        pass
+    }
+]
+
+random.shuffle(narrators)  # randomize the narrator order once, when the module is loaded
+print(f"$$$ narrators: {narrators}")
+narrator = {"narrator": narrators[0]}  # mutable holder passed to several processors so the active narrator can be swapped in place
+
+
+class NarratorShuffle(FrameProcessor):  # advances the shared narrator holder on each image frame, then forwards the frame
+    def __init__(self, narrator, narrators):  # narrator: dict holder with a "narrator" key; narrators: list of entries to cycle through
+        self._narrator = narrator
+        self._narrators = narrators
+        self._i = 0  # index into self._narrators of the entry currently in use

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
-        if isinstance(frame, VideoImageFrame) or isinstance(frame, TelestratorImageFrame):
-            yield VisionFrame("Describe the image in one sentence, in the style of David Attenborough.", frame.image)
+        if isinstance(frame, (ImageFrame, TelestratorImageFrame)):
+            self._i += 1
+            if self._i >= len(self._narrators):  # ran past the end: reshuffle the list in place and restart at index 0
+                print(f"### shuffling narrators")
+                random.shuffle(self._narrators)
+                self._i = 0
+
+            self._narrator["narrator"] = self._narrators[self._i]  # mutate the holder in place so objects sharing this dict see the new narrator
+            print(f"### new narrator is {self._narrator}")
+        yield frame  # every frame, image or not, is passed through unchanged
+
+
+class VideoImageFrameProcessor(FrameProcessor):  # converts video/telestrator image frames into VisionFrames using the current narrator's prompt
+    def __init__(self, narrator):  # narrator: dict holder whose "narrator" entry carries the active "prompt"
+        self._narrator = narrator
+
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, (VideoImageFrame, TelestratorImageFrame)):  # NOTE(review): NarratorShuffle tests ImageFrame instead — confirm both match the same frames
+            yield VisionFrame(self._narrator["narrator"]["prompt"], frame.image)  # prompt is looked up on every frame, so holder updates are picked up
        else:
            yield frame

@@ -75,7 +131,8 @@ async def main(room_url: str, token):
        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
-            voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
+            narrator=narrator,
+            aggregate_sentences=False
        )

        llm = OpenAILLMService(
@@ -83,7 +140,7 @@ async def main(room_url: str, token):
            model="gpt-4-turbo-preview")

        vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
-        vifp = VideoImageFrameProcessor()
+        vifp = VideoImageFrameProcessor(narrator)
        ir = ImageRefresher()
        img = FalImageGenService(
            image_size="1024x1024",
@@ -93,13 +150,17 @@ async def main(room_url: str, token):
        )
        tiw = TelestratorImageWrapper()
        lfra = LLMFullResponseAggregator()
+        fl0 = FrameLogger("@@@ About to describe")
        fl1 = FrameLogger("!!! About to image gen")
+        ns = NarratorShuffle(narrator, narrators)
        pipeline = Pipeline(
            processors=[
+                ns,
+                fl0,
                vifp,
                vs,
-                tts,
                lfra,
+                tts,
                fl1,
                img,
                tiw,