This commit is contained in:
Chad Bailey
2024-03-19 22:04:47 +00:00
parent e726f15c4e
commit 34b10cb4c7
5 changed files with 88 additions and 14 deletions

View File

@@ -252,9 +252,15 @@ class LLMFullResponseAggregator(FrameProcessor):
self.aggregation = ""
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if not isinstance(frame, AudioFrame):
print(f"^^^ LFRA got frame: {frame}")
if isinstance(frame, TextFrame):
self.aggregation += frame.text
print(
f"^^^ LFRA got textframe. aggregation is now {self.aggregation}")
elif isinstance(frame, LLMResponseEndFrame):
print(
f"^^^ LFRA got an llmresponseendframe. About to yield aggregation: {self.aggregation}")
yield TextFrame(self.aggregation)
yield frame
self.aggregation = ""

View File

@@ -62,6 +62,7 @@ class TTSService(AIService):
yield TextFrame(self.current_sentence)
if not isinstance(frame, TextFrame):
print(f"*** tts yielding non-text: {frame}")
yield frame
return
@@ -80,6 +81,7 @@ class TTSService(AIService):
# note we pass along the text frame *after* the audio, so the text
# frame is completed after the audio is processed.
print(f"*** tts yielding text: {text}")
yield TextFrame(text)
@@ -147,6 +149,8 @@ class VisionService(AIService):
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, VisionFrame):
async for frame in self.run_vision(frame.prompt, frame.image):
print(
f"&&& visionservce processframe got frame to yield: {frame}")
yield frame
yield LLMResponseEndFrame()
else:
@@ -159,8 +163,9 @@ class FrameLogger(AIService):
self.prefix = prefix
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, (AudioFrame, ImageFrame)):
self.logger.info(f"{self.prefix}: {type(frame)}")
if isinstance(frame, (AudioFrame)):
# self.logger.info(f"{self.prefix}: {type(frame)}")
pass
else:
print(f"{self.prefix}: {frame}")

View File

@@ -15,18 +15,19 @@ class ElevenLabsTTSService(TTSService):
*,
aiohttp_session: aiohttp.ClientSession,
api_key,
voice_id,
narrator,
model="eleven_turbo_v2",
aggregate_sentences=True
):
super().__init__()
super().__init__(aggregate_sentences)
self._api_key = api_key
self._voice_id = voice_id
self._narrator = narrator
self._aiohttp_session = aiohttp_session
self._model = model
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream"
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._narrator['narrator']['voice_id']}/stream"
payload = {"text": sentence, "model_id": self._model}
querystring = {
"output_format": "pcm_16000",

View File

@@ -118,6 +118,7 @@ class OpenAIVisionService(VisionService):
)
)
async for chunk in chunks:
print(f"%%% chunk: {chunk}")
if len(chunk.choices) == 0:
continue
if chunk.choices[0].delta.content:

View File

@@ -2,6 +2,7 @@ import asyncio
import aiohttp
import logging
import os
import random
from typing import AsyncGenerator
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame
@@ -25,14 +26,69 @@ logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
# Each narrator pairs an ElevenLabs voice id with the vision prompt that
# produces a description in the matching persona's style.
narrators = [
    {
        "voice_id": "wDRBdcyPzQOCeq51IxW5",
        "prompt": "Describe the image in one sentence, in the style of David Attenborough.",
    },
    {
        "voice_id": "M3bAX0o3Ptb2l6XqwQJV",
        "prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show.",
    },
    {
        "voice_id": "lJm5d2ZZ3UE4qYOxl2t7",
        "prompt": "Describe the image in one sentence, in the style of Oprah Winfrey.",
    },
    {
        "voice_id": "7SNUlQ8GAbnZxRO9CKOt",
        "prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England.",
    },
    {
        "voice_id": "PWEz02ggFiibL6P5PKRx",
        "prompt": "Describe the image in one sentence, in the style of Kanye West.",
    },
    {
        "voice_id": "gvpBhHjzfd7M2WedYVUI",
        "prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek.",
    },
    {
        "voice_id": "bnyr1EF3snReVXauGBNn",
        "prompt": "Describe the image in one sentence, in the style of Maya Angelou.",
    },
]

# Randomize the starting order once at import time.
random.shuffle(narrators)
print(f"$$$ narrators: {narrators}")

# Mutable holder for the current narrator: code that is handed this dict can
# read narrator["narrator"] later and see whatever entry was stored last.
narrator = {"narrator": narrators[0]}
class NarratorShuffle(FrameProcessor):
    """Pass frames through unchanged, rotating the current narrator on image frames.

    Args:
        narrator: Mutable dict whose ``"narrator"`` key is overwritten with the
            newly selected entry, so any other holder of the same dict sees
            the change.
        narrators: List of narrator entries to cycle through. It is shuffled
            in place each time the rotation wraps around.
    """

    def __init__(self, narrator, narrators):
        self._narrator = narrator
        self._narrators = narrators
        # Index into self._narrators of the entry currently selected.
        self._i = 0

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        """Advance the narrator when an image frame arrives, then yield the frame.

        Args:
            frame: The incoming pipeline frame.

        Yields:
            The same frame that was received; this processor never replaces it.
        """
        # Skip rotation when there is nothing to choose from; indexing an
        # empty list would raise IndexError.
        if isinstance(frame, (ImageFrame, TelestratorImageFrame)) and self._narrators:
            self._i += 1
            if self._i >= len(self._narrators):
                # Every narrator has had a turn: reshuffle and start over.
                print("### shuffling narrators")
                random.shuffle(self._narrators)
                self._i = 0
            self._narrator["narrator"] = self._narrators[self._i]
            print(f"### new narrator is {self._narrator}")
        yield frame
class VideoImageFrameProcessor(FrameProcessor):
    """Convert video/telestrator image frames into vision requests.

    Args:
        narrator: Dict read at frame time as ``narrator["narrator"]["prompt"]``
            to obtain the prompt for the vision request.
    """

    def __init__(self, narrator):
        self._narrator = narrator

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        """Yield a ``VisionFrame`` for image frames; forward anything else as-is.

        Args:
            frame: The incoming pipeline frame.

        Yields:
            A ``VisionFrame`` built from the current narrator prompt and the
            frame's image when ``frame`` is a ``VideoImageFrame`` or
            ``TelestratorImageFrame``; otherwise the original frame.
        """
        image_frame_types = (VideoImageFrame, TelestratorImageFrame)
        if not isinstance(frame, image_frame_types):
            yield frame
            return

        # Look the prompt up on every frame so a narrator change made
        # elsewhere through the same dict is picked up immediately.
        prompt = self._narrator["narrator"]["prompt"]
        yield VisionFrame(prompt, frame.image)
@@ -75,7 +131,8 @@ async def main(room_url: str, token):
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
narrator=narrator,
aggregate_sentences=False
)
llm = OpenAILLMService(
@@ -83,7 +140,7 @@ async def main(room_url: str, token):
model="gpt-4-turbo-preview")
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
vifp = VideoImageFrameProcessor()
vifp = VideoImageFrameProcessor(narrator)
ir = ImageRefresher()
img = FalImageGenService(
image_size="1024x1024",
@@ -93,13 +150,17 @@ async def main(room_url: str, token):
)
tiw = TelestratorImageWrapper()
lfra = LLMFullResponseAggregator()
fl0 = FrameLogger("@@@ About to describe")
fl1 = FrameLogger("!!! About to image gen")
ns = NarratorShuffle(narrator, narrators)
pipeline = Pipeline(
processors=[
ns,
fl0,
vifp,
vs,
tts,
lfra,
tts,
fl1,
img,
tiw,