From 34b10cb4c7a07fdd57cfe7c362fe4d2e1479bbde Mon Sep 17 00:00:00 2001 From: Chad Bailey Date: Tue, 19 Mar 2024 22:04:47 +0000 Subject: [PATCH] wip --- src/dailyai/pipeline/aggregators.py | 6 ++ src/dailyai/services/ai_services.py | 9 ++- src/dailyai/services/elevenlabs_ai_service.py | 9 ++- src/dailyai/services/open_ai_services.py | 1 + .../starter-apps/telestrator/telestrator.py | 77 +++++++++++++++++-- 5 files changed, 88 insertions(+), 14 deletions(-) diff --git a/src/dailyai/pipeline/aggregators.py b/src/dailyai/pipeline/aggregators.py index f9bcaca13..6dd5824cf 100644 --- a/src/dailyai/pipeline/aggregators.py +++ b/src/dailyai/pipeline/aggregators.py @@ -252,9 +252,15 @@ class LLMFullResponseAggregator(FrameProcessor): self.aggregation = "" async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: + if not isinstance(frame, AudioFrame): + print(f"^^^ LFRA got frame: {frame}") if isinstance(frame, TextFrame): self.aggregation += frame.text + print( + f"^^^ LFRA got textframe. aggregation is now {self.aggregation}") elif isinstance(frame, LLMResponseEndFrame): + print( + f"^^^ LFRA got an llmresponseendframe. About to yield aggregation: {self.aggregation}") yield TextFrame(self.aggregation) yield frame self.aggregation = "" diff --git a/src/dailyai/services/ai_services.py b/src/dailyai/services/ai_services.py index 0edbb53ae..206a0ff4e 100644 --- a/src/dailyai/services/ai_services.py +++ b/src/dailyai/services/ai_services.py @@ -62,6 +62,7 @@ class TTSService(AIService): yield TextFrame(self.current_sentence) if not isinstance(frame, TextFrame): + print(f"*** tts yielding non-text: {frame}") yield frame return @@ -80,6 +81,7 @@ class TTSService(AIService): # note we pass along the text frame *after* the audio, so the text # frame is completed after the audio is processed. + print(f"*** tts yielding text: {text}") yield TextFrame(text) @@ -147,6 +149,8 @@ class VisionService(AIService): async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: if isinstance(frame, VisionFrame): async for frame in self.run_vision(frame.prompt, frame.image): + print( + f"&&& visionservce processframe got frame to yield: {frame}") yield frame yield LLMResponseEndFrame() else: @@ -159,8 +163,9 @@ class FrameLogger(AIService): self.prefix = prefix async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if isinstance(frame, (AudioFrame, ImageFrame)): - self.logger.info(f"{self.prefix}: {type(frame)}") + if isinstance(frame, (AudioFrame)): + # self.logger.info(f"{self.prefix}: {type(frame)}") + pass else: print(f"{self.prefix}: {frame}") diff --git a/src/dailyai/services/elevenlabs_ai_service.py b/src/dailyai/services/elevenlabs_ai_service.py index e010c7934..b2675e007 100644 --- a/src/dailyai/services/elevenlabs_ai_service.py +++ b/src/dailyai/services/elevenlabs_ai_service.py @@ -15,18 +15,19 @@ class ElevenLabsTTSService(TTSService): *, aiohttp_session: aiohttp.ClientSession, api_key, - voice_id, + narrator, model="eleven_turbo_v2", + aggregate_sentences=True ): - super().__init__() + super().__init__(aggregate_sentences) self._api_key = api_key - self._voice_id = voice_id + self._narrator = narrator self._aiohttp_session = aiohttp_session self._model = model async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]: - url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream" + url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._narrator['narrator']['voice_id']}/stream" payload = {"text": sentence, "model_id": self._model} querystring = { "output_format": "pcm_16000", diff --git a/src/dailyai/services/open_ai_services.py b/src/dailyai/services/open_ai_services.py index afcb62979..d3f56122a 100644 --- a/src/dailyai/services/open_ai_services.py +++ b/src/dailyai/services/open_ai_services.py @@ -118,6 +118,7 @@ class OpenAIVisionService(VisionService): ) ) async for chunk in chunks: + print(f"%%% chunk: {chunk}") if len(chunk.choices) == 0: continue if chunk.choices[0].delta.content: diff --git a/src/examples/starter-apps/telestrator/telestrator.py b/src/examples/starter-apps/telestrator/telestrator.py index 8c405b015..db2c8bf74 100644 --- a/src/examples/starter-apps/telestrator/telestrator.py +++ b/src/examples/starter-apps/telestrator/telestrator.py @@ -2,6 +2,7 @@ import asyncio import aiohttp import logging import os +import random from typing import AsyncGenerator from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame @@ -25,14 +26,69 @@ logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s") logger = logging.getLogger("dailyai") logger.setLevel(logging.DEBUG) +narrators = [ + { + "voice_id": "wDRBdcyPzQOCeq51IxW5", + "prompt": "Describe the image in one sentence, in the style of David Attenborough." + }, + { + "voice_id": "M3bAX0o3Ptb2l6XqwQJV", + "prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show." + }, + { + "voice_id": "lJm5d2ZZ3UE4qYOxl2t7", + "prompt": "Describe the image in one sentence, in the style of Oprah Winfrey." + }, + { + "voice_id": "7SNUlQ8GAbnZxRO9CKOt", + "prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England." + }, + { + "voice_id": "PWEz02ggFiibL6P5PKRx", + "prompt": "Describe the image in one sentence, in the style of Kanye West." + }, + { + "voice_id": "gvpBhHjzfd7M2WedYVUI", + "prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek." + }, + { + "voice_id": "bnyr1EF3snReVXauGBNn", + "prompt": "Describe the image in one sentence, in the style of Maya Angelou." -class VideoImageFrameProcessor(FrameProcessor): - def __init__(self): - pass + } +] + +random.shuffle(narrators) +print(f"$$$ narrators: {narrators}") +narrator = {"narrator": narrators[0]} + + +class NarratorShuffle(FrameProcessor): + def __init__(self, narrator, narrators): + self._narrator = narrator + self._narrators = narrators + self._i = 0 async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: - if isinstance(frame, VideoImageFrame) or isinstance(frame, TelestratorImageFrame): - yield VisionFrame("Describe the image in one sentence, in the style of David Attenborough.", frame.image) + if isinstance(frame, (ImageFrame, TelestratorImageFrame)): + self._i += 1 + if self._i >= len(self._narrators): + print(f"### shuffling narrators") + random.shuffle(self._narrators) + self._i = 0 + + self._narrator["narrator"] = self._narrators[self._i] + print(f"### new narrator is {self._narrator}") + yield frame + + +class VideoImageFrameProcessor(FrameProcessor): + def __init__(self, narrator): + self._narrator = narrator + + async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]: + if isinstance(frame, (VideoImageFrame, TelestratorImageFrame)): + yield VisionFrame(self._narrator["narrator"]["prompt"], frame.image) else: yield frame @@ -75,7 +131,8 @@ async def main(room_url: str, token): tts = ElevenLabsTTSService( aiohttp_session=session, api_key=os.getenv("ELEVENLABS_API_KEY"), - voice_id=os.getenv("ELEVENLABS_VOICE_ID"), + narrator=narrator, + aggregate_sentences=False ) llm = OpenAILLMService( @@ -83,7 +140,7 @@ async def main(room_url: str, token): model="gpt-4-turbo-preview") vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY")) - vifp = VideoImageFrameProcessor() + vifp = VideoImageFrameProcessor(narrator) ir = ImageRefresher() img = FalImageGenService( image_size="1024x1024", @@ -93,13 +150,17 @@ async def main(room_url: str, token): ) tiw = TelestratorImageWrapper() lfra = LLMFullResponseAggregator() + fl0 = FrameLogger("@@@ About to describe") fl1 = FrameLogger("!!! About to image gen") + ns = NarratorShuffle(narrator, narrators) pipeline = Pipeline( processors=[ + ns, + fl0, vifp, vs, - tts, lfra, + tts, fl1, img, tiw,