wip
This commit is contained in:
@@ -252,9 +252,15 @@ class LLMFullResponseAggregator(FrameProcessor):
|
||||
self.aggregation = ""
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if not isinstance(frame, AudioFrame):
|
||||
print(f"^^^ LFRA got frame: {frame}")
|
||||
if isinstance(frame, TextFrame):
|
||||
self.aggregation += frame.text
|
||||
print(
|
||||
f"^^^ LFRA got textframe. aggregation is now {self.aggregation}")
|
||||
elif isinstance(frame, LLMResponseEndFrame):
|
||||
print(
|
||||
f"^^^ LFRA got an llmresponseendframe. About to yield aggregation: {self.aggregation}")
|
||||
yield TextFrame(self.aggregation)
|
||||
yield frame
|
||||
self.aggregation = ""
|
||||
|
||||
@@ -62,6 +62,7 @@ class TTSService(AIService):
|
||||
yield TextFrame(self.current_sentence)
|
||||
|
||||
if not isinstance(frame, TextFrame):
|
||||
print(f"*** tts yielding non-text: {frame}")
|
||||
yield frame
|
||||
return
|
||||
|
||||
@@ -80,6 +81,7 @@ class TTSService(AIService):
|
||||
|
||||
# note we pass along the text frame *after* the audio, so the text
|
||||
# frame is completed after the audio is processed.
|
||||
print(f"*** tts yielding text: {text}")
|
||||
yield TextFrame(text)
|
||||
|
||||
|
||||
@@ -147,6 +149,8 @@ class VisionService(AIService):
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, VisionFrame):
|
||||
async for frame in self.run_vision(frame.prompt, frame.image):
|
||||
print(
|
||||
f"&&& visionservce processframe got frame to yield: {frame}")
|
||||
yield frame
|
||||
yield LLMResponseEndFrame()
|
||||
else:
|
||||
@@ -159,8 +163,9 @@ class FrameLogger(AIService):
|
||||
self.prefix = prefix
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, (AudioFrame, ImageFrame)):
|
||||
self.logger.info(f"{self.prefix}: {type(frame)}")
|
||||
if isinstance(frame, (AudioFrame)):
|
||||
# self.logger.info(f"{self.prefix}: {type(frame)}")
|
||||
pass
|
||||
else:
|
||||
print(f"{self.prefix}: {frame}")
|
||||
|
||||
|
||||
@@ -15,18 +15,19 @@ class ElevenLabsTTSService(TTSService):
|
||||
*,
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
api_key,
|
||||
voice_id,
|
||||
narrator,
|
||||
model="eleven_turbo_v2",
|
||||
aggregate_sentences=True
|
||||
):
|
||||
super().__init__()
|
||||
super().__init__(aggregate_sentences)
|
||||
|
||||
self._api_key = api_key
|
||||
self._voice_id = voice_id
|
||||
self._narrator = narrator
|
||||
self._aiohttp_session = aiohttp_session
|
||||
self._model = model
|
||||
|
||||
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
|
||||
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream"
|
||||
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._narrator['narrator']['voice_id']}/stream"
|
||||
payload = {"text": sentence, "model_id": self._model}
|
||||
querystring = {
|
||||
"output_format": "pcm_16000",
|
||||
|
||||
@@ -118,6 +118,7 @@ class OpenAIVisionService(VisionService):
|
||||
)
|
||||
)
|
||||
async for chunk in chunks:
|
||||
print(f"%%% chunk: {chunk}")
|
||||
if len(chunk.choices) == 0:
|
||||
continue
|
||||
if chunk.choices[0].delta.content:
|
||||
|
||||
@@ -2,6 +2,7 @@ import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame
|
||||
@@ -25,14 +26,69 @@ logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
# Narrator table: each entry pairs the voice id used for text-to-speech with
# the persona the image-description prompt asks the model to imitate.
# Every prompt shares one sentence template, so only the persona is listed.
_NARRATOR_PROMPT = "Describe the image in one sentence, in the style of {style}."

_NARRATOR_VOICES = (
    ("wDRBdcyPzQOCeq51IxW5", "David Attenborough"),
    ("M3bAX0o3Ptb2l6XqwQJV", "John Oliver's Last Week Tonight show"),
    ("lJm5d2ZZ3UE4qYOxl2t7", "Oprah Winfrey"),
    ("7SNUlQ8GAbnZxRO9CKOt", "a royal pronouncement by the Queen of England"),
    ("PWEz02ggFiibL6P5PKRx", "Kanye West"),
    ("gvpBhHjzfd7M2WedYVUI", "Captain Picard from Star Trek"),
    ("bnyr1EF3snReVXauGBNn", "Maya Angelou"),
)

# List of {"voice_id": ..., "prompt": ...} dicts, same keys and key order as
# the hand-written literal this replaces.
narrators = [
    {"voice_id": voice_id, "prompt": _NARRATOR_PROMPT.format(style=style)}
    for voice_id, style in _NARRATOR_VOICES
]

# Randomize the starting order once at import time.
random.shuffle(narrators)
print(f"$$$ narrators: {narrators}")

# One-key mutable holder for the currently active narrator. It is a dict
# (rather than a bare entry) so that code sharing this object can swap the
# entry in place via narrator["narrator"] = ...
narrator = {"narrator": narrators[0]}
|
||||
|
||||
|
||||
class NarratorShuffle(FrameProcessor):
    """Advance the active narrator each time an image frame passes through.

    Args:
        narrator: One-key dict of the form ``{"narrator": <entry>}``. It is
            mutated in place, so every object holding the same dict sees the
            newly selected entry.
        narrators: List of candidate narrator entries. It is reshuffled in
            place whenever the rotation runs past the end of the list.

    Every frame, image or not, is forwarded downstream unchanged.
    """

    def __init__(self, narrator, narrators):
        self._narrator = narrator
        self._narrators = narrators
        # Index into self._narrators of the currently selected entry.
        self._i = 0

    def _advance(self) -> None:
        """Select the next narrator, reshuffling when the list is exhausted."""
        if not self._narrators:
            # Nothing to rotate through; keep whatever narrator is current
            # instead of raising IndexError on the lookup below.
            return

        self._i += 1
        if self._i >= len(self._narrators):
            print("### shuffling narrators")
            random.shuffle(self._narrators)
            self._i = 0

        self._narrator["narrator"] = self._narrators[self._i]
        print(f"### new narrator is {self._narrator}")

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        """Rotate the narrator on image frames, then pass the frame along."""
        if isinstance(frame, (ImageFrame, TelestratorImageFrame)):
            self._advance()

        yield frame
|
||||
|
||||
|
||||
class VideoImageFrameProcessor(FrameProcessor):
    """Turn incoming video/telestrator image frames into vision requests.

    ``narrator`` is a dict whose ``"narrator"`` entry carries a ``"prompt"``
    string. The prompt is looked up each time a frame is processed, so a
    change to the dict's ``"narrator"`` entry takes effect on the next frame.
    """

    def __init__(self, narrator):
        self._narrator = narrator

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
        """Yield a VisionFrame for image frames; forward anything else as-is."""
        image_frame_types = (VideoImageFrame, TelestratorImageFrame)
        if not isinstance(frame, image_frame_types):
            yield frame
            return

        # Read the prompt at call time so the current narrator is used.
        current_prompt = self._narrator["narrator"]["prompt"]
        yield VisionFrame(current_prompt, frame.image)
|
||||
|
||||
@@ -75,7 +131,8 @@ async def main(room_url: str, token):
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
narrator=narrator,
|
||||
aggregate_sentences=False
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
@@ -83,7 +140,7 @@ async def main(room_url: str, token):
|
||||
model="gpt-4-turbo-preview")
|
||||
|
||||
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
|
||||
vifp = VideoImageFrameProcessor()
|
||||
vifp = VideoImageFrameProcessor(narrator)
|
||||
ir = ImageRefresher()
|
||||
img = FalImageGenService(
|
||||
image_size="1024x1024",
|
||||
@@ -93,13 +150,17 @@ async def main(room_url: str, token):
|
||||
)
|
||||
tiw = TelestratorImageWrapper()
|
||||
lfra = LLMFullResponseAggregator()
|
||||
fl0 = FrameLogger("@@@ About to describe")
|
||||
fl1 = FrameLogger("!!! About to image gen")
|
||||
ns = NarratorShuffle(narrator, narrators)
|
||||
pipeline = Pipeline(
|
||||
processors=[
|
||||
ns,
|
||||
fl0,
|
||||
vifp,
|
||||
vs,
|
||||
tts,
|
||||
lfra,
|
||||
tts,
|
||||
fl1,
|
||||
img,
|
||||
tiw,
|
||||
|
||||
Reference in New Issue
Block a user