Compare commits
7 Commits
hush/realt
...
cb/telestr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c73fb4750f | ||
|
|
34b10cb4c7 | ||
|
|
e726f15c4e | ||
|
|
25ca8b751e | ||
|
|
0b4b63d2ee | ||
|
|
6c9425d66a | ||
|
|
6d3c52ae81 |
@@ -252,9 +252,15 @@ class LLMFullResponseAggregator(FrameProcessor):
|
|||||||
self.aggregation = ""
|
self.aggregation = ""
|
||||||
|
|
||||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if not isinstance(frame, AudioFrame):
|
||||||
|
print(f"^^^ LFRA got frame: {frame}")
|
||||||
if isinstance(frame, TextFrame):
|
if isinstance(frame, TextFrame):
|
||||||
self.aggregation += frame.text
|
self.aggregation += frame.text
|
||||||
|
print(
|
||||||
|
f"^^^ LFRA got textframe. aggregation is now {self.aggregation}")
|
||||||
elif isinstance(frame, LLMResponseEndFrame):
|
elif isinstance(frame, LLMResponseEndFrame):
|
||||||
|
print(
|
||||||
|
f"^^^ LFRA got an llmresponseendframe. About to yield aggregation: {self.aggregation}")
|
||||||
yield TextFrame(self.aggregation)
|
yield TextFrame(self.aggregation)
|
||||||
yield frame
|
yield frame
|
||||||
self.aggregation = ""
|
self.aggregation = ""
|
||||||
|
|||||||
@@ -179,3 +179,33 @@ class LLMFunctionCallFrame(Frame):
|
|||||||
"""Emitted when the LLM has received an entire function call completion."""
|
"""Emitted when the LLM has received an entire function call completion."""
|
||||||
function_name: str
|
function_name: str
|
||||||
arguments: str
|
arguments: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass()
|
||||||
|
class VideoImageFrame(Frame):
|
||||||
|
"""Contains a still image from a partcipant's video stream."""
|
||||||
|
participantId: str
|
||||||
|
image: bytes
|
||||||
|
|
||||||
|
# def __str__(self):
|
||||||
|
# return f"{self.__class__.__name__}, participantId: {self.participantId}, image size: {len(self.image)} B"
|
||||||
|
|
||||||
|
|
||||||
|
class TelestratorImageFrame(ImageFrame):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass()
|
||||||
|
class VisionFrame(Frame):
|
||||||
|
prompt: str
|
||||||
|
image: bytes
|
||||||
|
|
||||||
|
# def __str__(self):
|
||||||
|
# return f"{self.__class__.__name__}, prompt: {self.prompt}, image size: {len(self.image)} B"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass()
|
||||||
|
class RequestVideoImageFrame(Frame):
|
||||||
|
"""Send to the transport to request a new video image from a specific participant. Leave participantId
|
||||||
|
empty to request a frame from all participants."""
|
||||||
|
participantId: str | None
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ from dailyai.pipeline.frames import (
|
|||||||
Frame,
|
Frame,
|
||||||
TextFrame,
|
TextFrame,
|
||||||
TranscriptionQueueFrame,
|
TranscriptionQueueFrame,
|
||||||
|
VisionFrame
|
||||||
)
|
)
|
||||||
|
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
@@ -61,6 +62,7 @@ class TTSService(AIService):
|
|||||||
yield TextFrame(self.current_sentence)
|
yield TextFrame(self.current_sentence)
|
||||||
|
|
||||||
if not isinstance(frame, TextFrame):
|
if not isinstance(frame, TextFrame):
|
||||||
|
print(f"*** tts yielding non-text: {frame}")
|
||||||
yield frame
|
yield frame
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -79,6 +81,7 @@ class TTSService(AIService):
|
|||||||
|
|
||||||
# note we pass along the text frame *after* the audio, so the text
|
# note we pass along the text frame *after* the audio, so the text
|
||||||
# frame is completed after the audio is processed.
|
# frame is completed after the audio is processed.
|
||||||
|
print(f"*** tts yielding text: {text}")
|
||||||
yield TextFrame(text)
|
yield TextFrame(text)
|
||||||
|
|
||||||
|
|
||||||
@@ -133,14 +136,36 @@ class STTService(AIService):
|
|||||||
yield TranscriptionQueueFrame(text, "", str(time.time()))
|
yield TranscriptionQueueFrame(text, "", str(time.time()))
|
||||||
|
|
||||||
|
|
||||||
|
class VisionService(AIService):
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
# Renders the image. Returns an Image object.
|
||||||
|
# TODO-CB: return type
|
||||||
|
@abstractmethod
|
||||||
|
async def run_vision(self, prompt: str, image: bytes):
|
||||||
|
pass
|
||||||
|
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, VisionFrame):
|
||||||
|
async for frame in self.run_vision(frame.prompt, frame.image):
|
||||||
|
print(
|
||||||
|
f"&&& visionservce processframe got frame to yield: {frame}")
|
||||||
|
yield frame
|
||||||
|
yield LLMResponseEndFrame()
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
class FrameLogger(AIService):
|
class FrameLogger(AIService):
|
||||||
def __init__(self, prefix="Frame", **kwargs):
|
def __init__(self, prefix="Frame", **kwargs):
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.prefix = prefix
|
self.prefix = prefix
|
||||||
|
|
||||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
if isinstance(frame, (AudioFrame, ImageFrame)):
|
if isinstance(frame, (AudioFrame)):
|
||||||
self.logger.info(f"{self.prefix}: {type(frame)}")
|
# self.logger.info(f"{self.prefix}: {type(frame)}")
|
||||||
|
pass
|
||||||
else:
|
else:
|
||||||
print(f"{self.prefix}: {frame}")
|
print(f"{self.prefix}: {frame}")
|
||||||
|
|
||||||
|
|||||||
@@ -24,6 +24,8 @@ from dailyai.pipeline.frames import (
|
|||||||
TextFrame,
|
TextFrame,
|
||||||
UserStartedSpeakingFrame,
|
UserStartedSpeakingFrame,
|
||||||
UserStoppedSpeakingFrame,
|
UserStoppedSpeakingFrame,
|
||||||
|
RequestVideoImageFrame,
|
||||||
|
TelestratorImageFrame
|
||||||
)
|
)
|
||||||
from dailyai.pipeline.pipeline import Pipeline
|
from dailyai.pipeline.pipeline import Pipeline
|
||||||
from dailyai.services.ai_services import TTSService
|
from dailyai.services.ai_services import TTSService
|
||||||
@@ -90,7 +92,9 @@ class BaseTransportService:
|
|||||||
self._vad_stop_s = kwargs.get("vad_stop_s") or 0.8
|
self._vad_stop_s = kwargs.get("vad_stop_s") or 0.8
|
||||||
self._context = kwargs.get("context") or []
|
self._context = kwargs.get("context") or []
|
||||||
self._vad_enabled = kwargs.get("vad_enabled") or False
|
self._vad_enabled = kwargs.get("vad_enabled") or False
|
||||||
|
self._receive_video = kwargs.get("receive_video") or False
|
||||||
|
self._receive_video_fps = kwargs.get("receive_video_fps") or 0.0
|
||||||
|
self._participant_frame_times = {}
|
||||||
if self._vad_enabled and self._speaker_enabled:
|
if self._vad_enabled and self._speaker_enabled:
|
||||||
raise Exception(
|
raise Exception(
|
||||||
"Sorry, you can't use speaker_enabled and vad_enabled at the same time. Please set one to False."
|
"Sorry, you can't use speaker_enabled and vad_enabled at the same time. Please set one to False."
|
||||||
@@ -441,6 +445,7 @@ class BaseTransportService:
|
|||||||
# discard them
|
# discard them
|
||||||
if not self._is_interrupted.is_set():
|
if not self._is_interrupted.is_set():
|
||||||
if frame:
|
if frame:
|
||||||
|
|
||||||
if isinstance(frame, AudioFrame):
|
if isinstance(frame, AudioFrame):
|
||||||
chunk = frame.data
|
chunk = frame.data
|
||||||
|
|
||||||
@@ -452,6 +457,12 @@ class BaseTransportService:
|
|||||||
self.write_frame_to_mic(
|
self.write_frame_to_mic(
|
||||||
bytes(b[:truncated_length]))
|
bytes(b[:truncated_length]))
|
||||||
b = b[truncated_length:]
|
b = b[truncated_length:]
|
||||||
|
elif isinstance(frame, TelestratorImageFrame):
|
||||||
|
self._set_image(frame.image)
|
||||||
|
asyncio.run_coroutine_threadsafe(
|
||||||
|
self.receive_queue.put(frame),
|
||||||
|
self._loop,
|
||||||
|
)
|
||||||
elif isinstance(frame, ImageFrame):
|
elif isinstance(frame, ImageFrame):
|
||||||
self._set_image(frame.image)
|
self._set_image(frame.image)
|
||||||
elif isinstance(frame, SpriteFrame):
|
elif isinstance(frame, SpriteFrame):
|
||||||
@@ -459,6 +470,15 @@ class BaseTransportService:
|
|||||||
elif isinstance(frame, SendAppMessageFrame):
|
elif isinstance(frame, SendAppMessageFrame):
|
||||||
self.send_app_message(
|
self.send_app_message(
|
||||||
frame.message, frame.participantId)
|
frame.message, frame.participantId)
|
||||||
|
elif isinstance(frame, RequestVideoImageFrame):
|
||||||
|
# removing one or all participant IDs from _participant_frame_times
|
||||||
|
# will cause the transport to send the next available frame from
|
||||||
|
# that participant
|
||||||
|
if frame.participantId:
|
||||||
|
self._participant_frame_times.pop(
|
||||||
|
frame.participantId, None)
|
||||||
|
else:
|
||||||
|
self._participant_frame_times.clear()
|
||||||
elif len(b):
|
elif len(b):
|
||||||
self.write_frame_to_mic(bytes(b))
|
self.write_frame_to_mic(bytes(b))
|
||||||
b = bytearray()
|
b = bytearray()
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ import asyncio
|
|||||||
import inspect
|
import inspect
|
||||||
import logging
|
import logging
|
||||||
import signal
|
import signal
|
||||||
|
import time
|
||||||
import threading
|
import threading
|
||||||
import types
|
import types
|
||||||
|
|
||||||
@@ -11,6 +12,8 @@ from typing import Any
|
|||||||
from dailyai.pipeline.frames import (
|
from dailyai.pipeline.frames import (
|
||||||
ReceivedAppMessageFrame,
|
ReceivedAppMessageFrame,
|
||||||
TranscriptionQueueFrame,
|
TranscriptionQueueFrame,
|
||||||
|
VideoImageFrame,
|
||||||
|
TelestratorImageFrame
|
||||||
)
|
)
|
||||||
|
|
||||||
from threading import Event
|
from threading import Event
|
||||||
@@ -204,11 +207,12 @@ class DailyTransportService(BaseTransportService, EventHandler):
|
|||||||
)
|
)
|
||||||
self._my_participant_id = self.client.participants()["local"]["id"]
|
self._my_participant_id = self.client.participants()["local"]["id"]
|
||||||
|
|
||||||
self.client.update_subscription_profiles({
|
if not self._receive_video:
|
||||||
"base": {
|
self.client.update_subscription_profiles({
|
||||||
"camera": "unsubscribed",
|
"base": {
|
||||||
}
|
"camera": "unsubscribed",
|
||||||
})
|
}
|
||||||
|
})
|
||||||
|
|
||||||
if self._token and self._start_transcription:
|
if self._token and self._start_transcription:
|
||||||
self.client.start_transcription(self.transcription_settings)
|
self.client.start_transcription(self.transcription_settings)
|
||||||
@@ -225,6 +229,31 @@ class DailyTransportService(BaseTransportService, EventHandler):
|
|||||||
self.client.leave()
|
self.client.leave()
|
||||||
self.client.release()
|
self.client.release()
|
||||||
|
|
||||||
|
def _handle_video_frame(self, participant_id, video_frame):
|
||||||
|
"""If receive_video is true, this function is called once for each frame from each participant. We
|
||||||
|
don't need to send every frame to the pipeline, so there are two ways to decide how to send frames:
|
||||||
|
1. Set a greater-than-zero value for receive_video_fps. The transport will track the last send time
|
||||||
|
for each participant and send a new frame when the requested frame rate has elapsed. This
|
||||||
|
guarantees an image every second, for example.
|
||||||
|
2. Set receive_video_fps less than or equal to zero to disable timed frame sending. Then, put a
|
||||||
|
RequestVideoImageFrame in the pipeline to get a new frame for one or all participants. By
|
||||||
|
sending a RequestVideoImageFrame immediately after successfully processing an image, you can
|
||||||
|
ensure you don't end up queueing up frames faster than you can process them.
|
||||||
|
"""
|
||||||
|
send_frame = False
|
||||||
|
if not participant_id in self._participant_frame_times:
|
||||||
|
# then it's a new participant; send the first frame
|
||||||
|
send_frame = True
|
||||||
|
elif self._receive_video_fps > 0 and time.time() > self._participant_frame_times[participant_id] + 1.0/self._receive_video_fps:
|
||||||
|
# Then it's an existing participant who is due to send a new frame
|
||||||
|
send_frame = True
|
||||||
|
|
||||||
|
if send_frame:
|
||||||
|
self._participant_frame_times[participant_id] = time.time()
|
||||||
|
future = asyncio.run_coroutine_threadsafe(
|
||||||
|
self.receive_queue.put(
|
||||||
|
VideoImageFrame(participant_id, video_frame)), self._loop)
|
||||||
|
|
||||||
def on_first_other_participant_joined(self):
|
def on_first_other_participant_joined(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -248,6 +277,9 @@ class DailyTransportService(BaseTransportService, EventHandler):
|
|||||||
if not self._other_participant_has_joined and participant["id"] != self._my_participant_id:
|
if not self._other_participant_has_joined and participant["id"] != self._my_participant_id:
|
||||||
self._other_participant_has_joined = True
|
self._other_participant_has_joined = True
|
||||||
self.on_first_other_participant_joined()
|
self.on_first_other_participant_joined()
|
||||||
|
if self._receive_video:
|
||||||
|
self.client.set_video_renderer(
|
||||||
|
participant["id"], self._handle_video_frame)
|
||||||
|
|
||||||
def on_participant_left(self, participant, reason):
|
def on_participant_left(self, participant, reason):
|
||||||
if len(self.client.participants()) < self._min_others_count + 1:
|
if len(self.client.participants()) < self._min_others_count + 1:
|
||||||
|
|||||||
@@ -15,18 +15,19 @@ class ElevenLabsTTSService(TTSService):
|
|||||||
*,
|
*,
|
||||||
aiohttp_session: aiohttp.ClientSession,
|
aiohttp_session: aiohttp.ClientSession,
|
||||||
api_key,
|
api_key,
|
||||||
voice_id,
|
narrator,
|
||||||
model="eleven_turbo_v2",
|
model="eleven_turbo_v2",
|
||||||
|
aggregate_sentences=True
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__(aggregate_sentences)
|
||||||
|
|
||||||
self._api_key = api_key
|
self._api_key = api_key
|
||||||
self._voice_id = voice_id
|
self._narrator = narrator
|
||||||
self._aiohttp_session = aiohttp_session
|
self._aiohttp_session = aiohttp_session
|
||||||
self._model = model
|
self._model = model
|
||||||
|
|
||||||
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
|
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
|
||||||
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream"
|
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._narrator['narrator']['voice_id']}/stream"
|
||||||
payload = {"text": sentence, "model_id": self._model}
|
payload = {"text": sentence, "model_id": self._model}
|
||||||
querystring = {
|
querystring = {
|
||||||
"output_format": "pcm_16000",
|
"output_format": "pcm_16000",
|
||||||
@@ -35,6 +36,7 @@ class ElevenLabsTTSService(TTSService):
|
|||||||
"xi-api-key": self._api_key,
|
"xi-api-key": self._api_key,
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
}
|
}
|
||||||
|
|
||||||
async with self._aiohttp_session.post(
|
async with self._aiohttp_session.post(
|
||||||
url, json=payload, headers=headers, params=querystring
|
url, json=payload, headers=headers, params=querystring
|
||||||
) as r:
|
) as r:
|
||||||
|
|||||||
@@ -53,4 +53,7 @@ class FalImageGenService(ImageGenService):
|
|||||||
async with self._aiohttp_session.get(image_url) as response:
|
async with self._aiohttp_session.get(image_url) as response:
|
||||||
image_stream = io.BytesIO(await response.content.read())
|
image_stream = io.BytesIO(await response.content.read())
|
||||||
image = Image.open(image_stream)
|
image = Image.open(image_stream)
|
||||||
return (image_url, image.tobytes())
|
image_bytes = image.tobytes()
|
||||||
|
print(f"!!! fal image tobytes is:")
|
||||||
|
print(image)
|
||||||
|
return (image_url, image_bytes)
|
||||||
|
|||||||
@@ -2,13 +2,22 @@ import aiohttp
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
import io
|
import io
|
||||||
import time
|
import time
|
||||||
from openai import AsyncOpenAI
|
import base64
|
||||||
|
from openai import AsyncOpenAI, AsyncStream
|
||||||
|
|
||||||
import json
|
import json
|
||||||
from collections.abc import AsyncGenerator
|
from collections.abc import AsyncGenerator
|
||||||
|
|
||||||
from dailyai.services.ai_services import LLMService, ImageGenService
|
from openai.types.chat import (
|
||||||
|
ChatCompletion,
|
||||||
|
ChatCompletionChunk,
|
||||||
|
ChatCompletionMessageParam,
|
||||||
|
)
|
||||||
|
|
||||||
|
from daily import VideoFrame
|
||||||
|
from dailyai.services.ai_services import LLMService, ImageGenService, VisionService
|
||||||
from dailyai.services.openai_api_llm_service import BaseOpenAILLMService
|
from dailyai.services.openai_api_llm_service import BaseOpenAILLMService
|
||||||
|
from dailyai.pipeline.frames import TextFrame
|
||||||
|
|
||||||
|
|
||||||
class OpenAILLMService(BaseOpenAILLMService):
|
class OpenAILLMService(BaseOpenAILLMService):
|
||||||
@@ -50,3 +59,67 @@ class OpenAIImageGenService(ImageGenService):
|
|||||||
image_stream = io.BytesIO(await response.content.read())
|
image_stream = io.BytesIO(await response.content.read())
|
||||||
image = Image.open(image_stream)
|
image = Image.open(image_stream)
|
||||||
return (image_url, image.tobytes())
|
return (image_url, image.tobytes())
|
||||||
|
|
||||||
|
|
||||||
|
class OpenAIVisionService(VisionService):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
model="gpt-4-vision-preview",
|
||||||
|
api_key,
|
||||||
|
):
|
||||||
|
self._model = model
|
||||||
|
self._client = AsyncOpenAI(api_key=api_key)
|
||||||
|
|
||||||
|
async def run_vision(self, prompt: str, image: bytes):
|
||||||
|
if isinstance(image, VideoFrame):
|
||||||
|
# Then it's from a daily video frame
|
||||||
|
print("### processing daily video frame for recognition")
|
||||||
|
IMAGE_WIDTH = image.width
|
||||||
|
IMAGE_HEIGHT = image.height
|
||||||
|
COLOR_FORMAT = image.color_format
|
||||||
|
a_image = Image.frombytes(
|
||||||
|
'RGBA', (IMAGE_WIDTH, IMAGE_HEIGHT), image.buffer)
|
||||||
|
new_image = a_image.convert('RGB')
|
||||||
|
else:
|
||||||
|
# handle it as a byte stream from image gen
|
||||||
|
new_image = Image.frombytes('RGB', (1024, 1024), image)
|
||||||
|
# Uncomment these lines to write the frame to a jpg in the same directory.
|
||||||
|
# current_path = os.getcwd()
|
||||||
|
# image_path = os.path.join(current_path, "image.jpg")
|
||||||
|
# image.save(image_path, format="JPEG")
|
||||||
|
|
||||||
|
jpeg_buffer = io.BytesIO()
|
||||||
|
|
||||||
|
new_image.save(jpeg_buffer, format='JPEG')
|
||||||
|
|
||||||
|
jpeg_bytes = jpeg_buffer.getvalue()
|
||||||
|
base64_image = base64.b64encode(jpeg_bytes).decode('utf-8')
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": prompt},
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": f"data:image/jpeg;base64,{base64_image}"
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
chunks: AsyncStream[ChatCompletionChunk] = (
|
||||||
|
await self._client.chat.completions.create(
|
||||||
|
model=self._model,
|
||||||
|
stream=True,
|
||||||
|
messages=messages,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
async for chunk in chunks:
|
||||||
|
print(f"%%% chunk: {chunk}")
|
||||||
|
if len(chunk.choices) == 0:
|
||||||
|
continue
|
||||||
|
if chunk.choices[0].delta.content:
|
||||||
|
yield TextFrame(chunk.choices[0].delta.content)
|
||||||
|
|||||||
97
src/examples/foundational/12-describe-video.py
Normal file
97
src/examples/foundational/12-describe-video.py
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
import asyncio
|
||||||
|
import aiohttp
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import AsyncGenerator
|
||||||
|
|
||||||
|
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame
|
||||||
|
from dailyai.pipeline.pipeline import Pipeline
|
||||||
|
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||||
|
from dailyai.services.daily_transport_service import DailyTransportService
|
||||||
|
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||||
|
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
|
||||||
|
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||||
|
from dailyai.services.ai_services import FrameLogger
|
||||||
|
from dailyai.pipeline.aggregators import (
|
||||||
|
LLMAssistantContextAggregator,
|
||||||
|
LLMUserContextAggregator,
|
||||||
|
)
|
||||||
|
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
|
||||||
|
from examples.support.runner import configure
|
||||||
|
|
||||||
|
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||||
|
logger = logging.getLogger("dailyai")
|
||||||
|
logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
|
||||||
|
class VideoImageFrameProcessor(FrameProcessor):
|
||||||
|
def __init__(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, VideoImageFrame):
|
||||||
|
yield VisionFrame("Describe the image in one sentence.", frame.image)
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
class ImageRefresher(FrameProcessor):
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, LLMResponseEndFrame):
|
||||||
|
yield RequestVideoImageFrame(participantId=None)
|
||||||
|
yield frame
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
async def main(room_url: str, token):
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
transport = DailyTransportService(
|
||||||
|
room_url,
|
||||||
|
token,
|
||||||
|
"Respond bot",
|
||||||
|
duration_minutes=5,
|
||||||
|
start_transcription=True,
|
||||||
|
mic_enabled=True,
|
||||||
|
mic_sample_rate=16000,
|
||||||
|
camera_enabled=False,
|
||||||
|
vad_enabled=True,
|
||||||
|
receive_video=True,
|
||||||
|
receive_video_fps=0
|
||||||
|
)
|
||||||
|
|
||||||
|
tts = ElevenLabsTTSService(
|
||||||
|
aiohttp_session=session,
|
||||||
|
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||||
|
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||||
|
)
|
||||||
|
|
||||||
|
llm = OpenAILLMService(
|
||||||
|
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
|
||||||
|
model="gpt-4-turbo-preview")
|
||||||
|
|
||||||
|
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
|
||||||
|
vifp = VideoImageFrameProcessor()
|
||||||
|
ir = ImageRefresher()
|
||||||
|
pipeline = Pipeline(
|
||||||
|
processors=[
|
||||||
|
vifp,
|
||||||
|
vs,
|
||||||
|
llm,
|
||||||
|
tts,
|
||||||
|
ir,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
@transport.event_handler("on_first_other_participant_joined")
|
||||||
|
async def on_first_other_participant_joined(transport):
|
||||||
|
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
|
||||||
|
|
||||||
|
transport.transcription_settings["extra"]["endpointing"] = True
|
||||||
|
transport.transcription_settings["extra"]["punctuate"] = True
|
||||||
|
await transport.run(pipeline)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
(url, token) = configure()
|
||||||
|
asyncio.run(main(url, token))
|
||||||
@@ -124,7 +124,6 @@ async def main(room_url: str, token):
|
|||||||
|
|
||||||
@transport.event_handler("on_first_other_participant_joined")
|
@transport.event_handler("on_first_other_participant_joined")
|
||||||
async def on_first_other_participant_joined(transport):
|
async def on_first_other_participant_joined(transport):
|
||||||
print(f"!!! in here, pipeline.source is {pipeline.source}")
|
|
||||||
await pipeline.queue_frames([LLMMessagesQueueFrame(messages)])
|
await pipeline.queue_frames([LLMMessagesQueueFrame(messages)])
|
||||||
|
|
||||||
async def run_conversation():
|
async def run_conversation():
|
||||||
|
|||||||
100
src/examples/starter-apps/telestrator/describer.py
Normal file
100
src/examples/starter-apps/telestrator/describer.py
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
import asyncio
|
||||||
|
import aiohttp
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import AsyncGenerator
|
||||||
|
|
||||||
|
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame
|
||||||
|
from dailyai.pipeline.pipeline import Pipeline
|
||||||
|
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||||
|
from dailyai.services.daily_transport_service import DailyTransportService
|
||||||
|
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||||
|
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
|
||||||
|
from dailyai.services.fal_ai_services import FalImageGenService
|
||||||
|
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||||
|
from dailyai.services.ai_services import FrameLogger
|
||||||
|
from dailyai.pipeline.aggregators import (
|
||||||
|
LLMAssistantContextAggregator,
|
||||||
|
LLMUserContextAggregator,
|
||||||
|
)
|
||||||
|
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
|
||||||
|
from examples.support.runner import configure
|
||||||
|
|
||||||
|
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||||
|
logger = logging.getLogger("dailyai")
|
||||||
|
logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
|
||||||
|
class VideoImageFrameProcessor(FrameProcessor):
|
||||||
|
def __init__(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, VideoImageFrame):
|
||||||
|
yield VisionFrame("Describe the image in one sentence.", frame.image)
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
class ImageRefresher(FrameProcessor):
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, LLMResponseEndFrame):
|
||||||
|
yield RequestVideoImageFrame(participantId=None)
|
||||||
|
yield frame
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
async def main(room_url: str, token):
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
transport = DailyTransportService(
|
||||||
|
room_url,
|
||||||
|
token,
|
||||||
|
"Respond bot",
|
||||||
|
duration_minutes=5,
|
||||||
|
start_transcription=True,
|
||||||
|
mic_enabled=True,
|
||||||
|
mic_sample_rate=16000,
|
||||||
|
camera_enabled=True,
|
||||||
|
camera_width=1024,
|
||||||
|
camera_height=1024,
|
||||||
|
vad_enabled=False,
|
||||||
|
receive_video=True,
|
||||||
|
receive_video_fps=0
|
||||||
|
)
|
||||||
|
|
||||||
|
tts = ElevenLabsTTSService(
|
||||||
|
aiohttp_session=session,
|
||||||
|
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||||
|
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||||
|
)
|
||||||
|
|
||||||
|
llm = OpenAILLMService(
|
||||||
|
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
|
||||||
|
model="gpt-4-turbo-preview")
|
||||||
|
|
||||||
|
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
|
||||||
|
vifp = VideoImageFrameProcessor()
|
||||||
|
ir = ImageRefresher()
|
||||||
|
|
||||||
|
pipeline = Pipeline(
|
||||||
|
processors=[
|
||||||
|
vifp,
|
||||||
|
vs,
|
||||||
|
tts,
|
||||||
|
ir,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
@transport.event_handler("on_first_other_participant_joined")
|
||||||
|
async def on_first_other_participant_joined(transport):
|
||||||
|
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
|
||||||
|
|
||||||
|
transport.transcription_settings["extra"]["endpointing"] = True
|
||||||
|
transport.transcription_settings["extra"]["punctuate"] = True
|
||||||
|
await transport.run(pipeline)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
(url, token) = configure()
|
||||||
|
asyncio.run(main(url, token))
|
||||||
112
src/examples/starter-apps/telestrator/illustrator.py
Normal file
112
src/examples/starter-apps/telestrator/illustrator.py
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
import asyncio
|
||||||
|
import aiohttp
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import AsyncGenerator
|
||||||
|
|
||||||
|
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame, TranscriptionQueueFrame, TextFrame
|
||||||
|
from dailyai.pipeline.pipeline import Pipeline
|
||||||
|
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||||
|
from dailyai.services.daily_transport_service import DailyTransportService
|
||||||
|
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||||
|
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
|
||||||
|
from dailyai.services.fal_ai_services import FalImageGenService
|
||||||
|
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||||
|
from dailyai.services.ai_services import FrameLogger
|
||||||
|
from dailyai.pipeline.aggregators import (
|
||||||
|
LLMAssistantContextAggregator,
|
||||||
|
LLMUserContextAggregator,
|
||||||
|
)
|
||||||
|
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
|
||||||
|
from examples.support.runner import configure
|
||||||
|
|
||||||
|
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||||
|
logger = logging.getLogger("dailyai")
|
||||||
|
logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
|
||||||
|
class VADAggregator(FrameProcessor):
|
||||||
|
def __init__(self):
|
||||||
|
self.aggregating = False
|
||||||
|
self.aggregation = ""
|
||||||
|
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, UserStartedSpeakingFrame):
|
||||||
|
self.aggregating = True
|
||||||
|
elif isinstance(frame, UserStoppedSpeakingFrame):
|
||||||
|
self.aggregating = False
|
||||||
|
# Sometimes VAD triggers quickly on and off. If we don't get any transcription,
|
||||||
|
# it creates empty LLM message queue frames
|
||||||
|
if len(self.aggregation) > 0:
|
||||||
|
yield TextFrame(self.aggregation)
|
||||||
|
|
||||||
|
self.aggregation = ""
|
||||||
|
yield frame
|
||||||
|
elif isinstance(frame, TranscriptionQueueFrame) and self.aggregating:
|
||||||
|
self.aggregation += f" {frame.text}"
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
async def main(room_url: str, token):
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
transport = DailyTransportService(
|
||||||
|
room_url,
|
||||||
|
token,
|
||||||
|
"Respond bot",
|
||||||
|
duration_minutes=5,
|
||||||
|
start_transcription=True,
|
||||||
|
mic_enabled=True,
|
||||||
|
mic_sample_rate=16000,
|
||||||
|
camera_enabled=True,
|
||||||
|
camera_width=1024,
|
||||||
|
camera_height=1024,
|
||||||
|
vad_enabled=True,
|
||||||
|
receive_video=True,
|
||||||
|
receive_video_fps=0,
|
||||||
|
vad_timeout_s=1.0
|
||||||
|
)
|
||||||
|
|
||||||
|
tts = ElevenLabsTTSService(
|
||||||
|
aiohttp_session=session,
|
||||||
|
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||||
|
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||||
|
)
|
||||||
|
|
||||||
|
llm = OpenAILLMService(
|
||||||
|
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
|
||||||
|
model="gpt-4-turbo-preview")
|
||||||
|
|
||||||
|
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
|
||||||
|
vad = VADAggregator()
|
||||||
|
img = FalImageGenService(
|
||||||
|
image_size="1024x1024",
|
||||||
|
aiohttp_session=session,
|
||||||
|
key_id=os.getenv("FAL_KEY_ID"),
|
||||||
|
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||||
|
)
|
||||||
|
fl = FrameLogger("!!! Start")
|
||||||
|
fl2 = FrameLogger("!!! AFTER VAD")
|
||||||
|
fl3 = FrameLogger("!!! After img")
|
||||||
|
pipeline = Pipeline(
|
||||||
|
processors=[
|
||||||
|
fl,
|
||||||
|
vad,
|
||||||
|
fl2,
|
||||||
|
img,
|
||||||
|
fl3
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
@transport.event_handler("on_first_other_participant_joined")
|
||||||
|
async def on_first_other_participant_joined(transport):
|
||||||
|
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
|
||||||
|
|
||||||
|
transport.transcription_settings["extra"]["endpointing"] = True
|
||||||
|
transport.transcription_settings["extra"]["punctuate"] = True
|
||||||
|
await transport.run(pipeline)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
(url, token) = configure()
|
||||||
|
asyncio.run(main(url, token))
|
||||||
210
src/examples/starter-apps/telestrator/telestrator-fuzz.py
Normal file
210
src/examples/starter-apps/telestrator/telestrator-fuzz.py
Normal file
@@ -0,0 +1,210 @@
|
|||||||
|
import asyncio
|
||||||
|
import aiohttp
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
from typing import AsyncGenerator
|
||||||
|
|
||||||
|
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame, TextFrame
|
||||||
|
from dailyai.pipeline.pipeline import Pipeline
|
||||||
|
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||||
|
from dailyai.services.daily_transport_service import DailyTransportService
|
||||||
|
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||||
|
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
|
||||||
|
from dailyai.services.fal_ai_services import FalImageGenService
|
||||||
|
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||||
|
from dailyai.services.ai_services import FrameLogger
|
||||||
|
from dailyai.pipeline.aggregators import (
|
||||||
|
LLMAssistantContextAggregator,
|
||||||
|
LLMUserContextAggregator,
|
||||||
|
LLMFullResponseAggregator
|
||||||
|
)
|
||||||
|
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
|
||||||
|
from examples.support.runner import configure
|
||||||
|
|
||||||
|
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||||
|
logger = logging.getLogger("dailyai")
|
||||||
|
logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
narrators = [{"voice_id": "wDRBdcyPzQOCeq51IxW5",
|
||||||
|
"prompt": "Describe the image in one sentence."},
|
||||||
|
{"voice_id": "M3bAX0o3Ptb2l6XqwQJV",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show."},
|
||||||
|
{"voice_id": "lJm5d2ZZ3UE4qYOxl2t7",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of Oprah Winfrey."},
|
||||||
|
{"voice_id": "7SNUlQ8GAbnZxRO9CKOt",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England."},
|
||||||
|
{"voice_id": "gvpBhHjzfd7M2WedYVUI",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek."},
|
||||||
|
{"voice_id": "bnyr1EF3snReVXauGBNn",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of Maya Angelou."}]
|
||||||
|
|
||||||
|
# random.shuffle(narrators)
|
||||||
|
print(f"$$$ narrators: {narrators}")
|
||||||
|
narrator = {"narrator": narrators[0]}
|
||||||
|
|
||||||
|
|
||||||
|
class TranslationProcessor(FrameProcessor):
|
||||||
|
def __init__(self, in_language, out_language):
|
||||||
|
self._in_language = in_language
|
||||||
|
self._out_language = out_language
|
||||||
|
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, TextFrame):
|
||||||
|
context = [
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": f"You will be provided with a sentence in {self._in_language}, and your task is to translate it into {self._out_language}.",
|
||||||
|
},
|
||||||
|
{"role": "user", "content": frame.text},
|
||||||
|
]
|
||||||
|
|
||||||
|
yield LLMMessagesQueueFrame(context)
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
class NarratorShuffle(FrameProcessor):
|
||||||
|
def __init__(self, narrator, narrators):
|
||||||
|
self._narrator = narrator
|
||||||
|
self._narrators = narrators
|
||||||
|
self._i = 0
|
||||||
|
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, (ImageFrame, TelestratorImageFrame)):
|
||||||
|
self._i += 1
|
||||||
|
if self._i >= len(self._narrators):
|
||||||
|
print(f"### shuffling narrators")
|
||||||
|
random.shuffle(self._narrators)
|
||||||
|
self._i = 0
|
||||||
|
|
||||||
|
self._narrator["narrator"] = self._narrators[self._i]
|
||||||
|
print(f"### new narrator is {self._narrator}")
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
class VideoImageFrameProcessor(FrameProcessor):
|
||||||
|
def __init__(self, narrator):
|
||||||
|
self._narrator = narrator
|
||||||
|
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, (VideoImageFrame, TelestratorImageFrame)):
|
||||||
|
yield VisionFrame(self._narrator["narrator"]["prompt"], frame.image)
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
class ImageRefresher(FrameProcessor):
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, LLMResponseEndFrame):
|
||||||
|
yield RequestVideoImageFrame(participantId=None)
|
||||||
|
yield frame
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
class TelestratorImageWrapper(FrameProcessor):
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, ImageFrame):
|
||||||
|
yield TelestratorImageFrame(None, frame.image)
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
async def main(room_url: str, token):
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
transport = DailyTransportService(
|
||||||
|
room_url,
|
||||||
|
token,
|
||||||
|
"Respond bot",
|
||||||
|
duration_minutes=5,
|
||||||
|
start_transcription=True,
|
||||||
|
mic_enabled=True,
|
||||||
|
mic_sample_rate=16000,
|
||||||
|
camera_enabled=True,
|
||||||
|
camera_width=1024,
|
||||||
|
camera_height=576,
|
||||||
|
vad_enabled=False,
|
||||||
|
receive_video=True,
|
||||||
|
receive_video_fps=0
|
||||||
|
)
|
||||||
|
|
||||||
|
tts = ElevenLabsTTSService(
|
||||||
|
aiohttp_session=session,
|
||||||
|
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||||
|
narrator=narrator,
|
||||||
|
aggregate_sentences=False
|
||||||
|
)
|
||||||
|
|
||||||
|
llm = OpenAILLMService(
|
||||||
|
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
|
||||||
|
model="gpt-4-turbo-preview")
|
||||||
|
|
||||||
|
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
|
||||||
|
vifp = VideoImageFrameProcessor(narrator)
|
||||||
|
ir = ImageRefresher()
|
||||||
|
img = FalImageGenService(
|
||||||
|
image_size="1024x1024",
|
||||||
|
aiohttp_session=session,
|
||||||
|
key_id=os.getenv("FAL_KEY_ID"),
|
||||||
|
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||||
|
)
|
||||||
|
tiw = TelestratorImageWrapper()
|
||||||
|
lfra = LLMFullResponseAggregator()
|
||||||
|
lfra1 = LLMFullResponseAggregator()
|
||||||
|
lfra2 = LLMFullResponseAggregator()
|
||||||
|
lfra3 = LLMFullResponseAggregator()
|
||||||
|
lfra4 = LLMFullResponseAggregator()
|
||||||
|
fl0 = FrameLogger("@@@ About to describe")
|
||||||
|
fl1 = FrameLogger("!!! About to image gen")
|
||||||
|
f4 = FrameLogger("((( partway through )))")
|
||||||
|
f5 = FrameLogger("!!! f5")
|
||||||
|
ns = NarratorShuffle(narrator, narrators)
|
||||||
|
t1 = TranslationProcessor("English", "Spanish")
|
||||||
|
t2 = TranslationProcessor("Spanish", "German")
|
||||||
|
t3 = TranslationProcessor("German", "Japanese")
|
||||||
|
t4 = TranslationProcessor("Japanese", "English")
|
||||||
|
pipeline = Pipeline(
|
||||||
|
processors=[
|
||||||
|
fl0,
|
||||||
|
vifp,
|
||||||
|
vs,
|
||||||
|
lfra,
|
||||||
|
tts,
|
||||||
|
f4,
|
||||||
|
t1,
|
||||||
|
llm,
|
||||||
|
lfra1,
|
||||||
|
f5,
|
||||||
|
tts,
|
||||||
|
|
||||||
|
t2,
|
||||||
|
llm,
|
||||||
|
lfra2,
|
||||||
|
tts,
|
||||||
|
t3,
|
||||||
|
llm,
|
||||||
|
lfra3,
|
||||||
|
tts,
|
||||||
|
t4,
|
||||||
|
llm,
|
||||||
|
lfra4,
|
||||||
|
tts,
|
||||||
|
fl1,
|
||||||
|
img,
|
||||||
|
tiw,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
@transport.event_handler("on_first_other_participant_joined")
|
||||||
|
async def on_first_other_participant_joined(transport):
|
||||||
|
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
|
||||||
|
|
||||||
|
transport.transcription_settings["extra"]["endpointing"] = True
|
||||||
|
transport.transcription_settings["extra"]["punctuate"] = True
|
||||||
|
await transport.run(pipeline)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
(url, token) = configure()
|
||||||
|
asyncio.run(main(url, token))
|
||||||
191
src/examples/starter-apps/telestrator/telestrator-haiku.py
Normal file
191
src/examples/starter-apps/telestrator/telestrator-haiku.py
Normal file
@@ -0,0 +1,191 @@
|
|||||||
|
import asyncio
|
||||||
|
import aiohttp
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
from typing import AsyncGenerator
|
||||||
|
|
||||||
|
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame, TextFrame
|
||||||
|
from dailyai.pipeline.pipeline import Pipeline
|
||||||
|
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||||
|
from dailyai.services.daily_transport_service import DailyTransportService
|
||||||
|
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||||
|
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
|
||||||
|
from dailyai.services.fal_ai_services import FalImageGenService
|
||||||
|
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||||
|
from dailyai.services.ai_services import FrameLogger
|
||||||
|
from dailyai.pipeline.aggregators import (
|
||||||
|
LLMAssistantContextAggregator,
|
||||||
|
LLMUserContextAggregator,
|
||||||
|
LLMFullResponseAggregator
|
||||||
|
)
|
||||||
|
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
|
||||||
|
from examples.support.runner import configure
|
||||||
|
|
||||||
|
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||||
|
logger = logging.getLogger("dailyai")
|
||||||
|
logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
narrators = [{"voice_id": "wDRBdcyPzQOCeq51IxW5",
|
||||||
|
"prompt": "Describe the image in a haiku."},
|
||||||
|
{"voice_id": "M3bAX0o3Ptb2l6XqwQJV",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show."},
|
||||||
|
{"voice_id": "lJm5d2ZZ3UE4qYOxl2t7",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of Oprah Winfrey."},
|
||||||
|
{"voice_id": "7SNUlQ8GAbnZxRO9CKOt",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England."},
|
||||||
|
{"voice_id": "gvpBhHjzfd7M2WedYVUI",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek."},
|
||||||
|
{"voice_id": "bnyr1EF3snReVXauGBNn",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of Maya Angelou."}]
|
||||||
|
|
||||||
|
# random.shuffle(narrators)
|
||||||
|
print(f"$$$ narrators: {narrators}")
|
||||||
|
narrator = {"narrator": narrators[0]}
|
||||||
|
|
||||||
|
|
||||||
|
class TranslationProcessor(FrameProcessor):
|
||||||
|
def __init__(self, in_language, out_language):
|
||||||
|
self._in_language = in_language
|
||||||
|
self._out_language = out_language
|
||||||
|
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, TextFrame):
|
||||||
|
context = [
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": f"You will be provided with a sentence in {self._in_language}, and your task is to translate it into {self._out_language}.",
|
||||||
|
},
|
||||||
|
{"role": "user", "content": frame.text},
|
||||||
|
]
|
||||||
|
|
||||||
|
yield LLMMessagesQueueFrame(context)
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
class NarratorShuffle(FrameProcessor):
|
||||||
|
def __init__(self, narrator, narrators):
|
||||||
|
self._narrator = narrator
|
||||||
|
self._narrators = narrators
|
||||||
|
self._i = 0
|
||||||
|
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, (ImageFrame, TelestratorImageFrame)):
|
||||||
|
self._i += 1
|
||||||
|
if self._i >= len(self._narrators):
|
||||||
|
print(f"### shuffling narrators")
|
||||||
|
random.shuffle(self._narrators)
|
||||||
|
self._i = 0
|
||||||
|
|
||||||
|
self._narrator["narrator"] = self._narrators[self._i]
|
||||||
|
print(f"### new narrator is {self._narrator}")
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
class VideoImageFrameProcessor(FrameProcessor):
|
||||||
|
def __init__(self, narrator):
|
||||||
|
self._narrator = narrator
|
||||||
|
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, (VideoImageFrame, TelestratorImageFrame)):
|
||||||
|
yield VisionFrame(self._narrator["narrator"]["prompt"], frame.image)
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
class ImageRefresher(FrameProcessor):
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, LLMResponseEndFrame):
|
||||||
|
yield RequestVideoImageFrame(participantId=None)
|
||||||
|
yield frame
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
class TelestratorImageWrapper(FrameProcessor):
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, ImageFrame):
|
||||||
|
yield TelestratorImageFrame(None, frame.image)
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
async def main(room_url: str, token):
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
transport = DailyTransportService(
|
||||||
|
room_url,
|
||||||
|
token,
|
||||||
|
"Respond bot",
|
||||||
|
duration_minutes=5,
|
||||||
|
start_transcription=True,
|
||||||
|
mic_enabled=True,
|
||||||
|
mic_sample_rate=16000,
|
||||||
|
camera_enabled=True,
|
||||||
|
camera_width=1024,
|
||||||
|
camera_height=1024,
|
||||||
|
vad_enabled=False,
|
||||||
|
receive_video=True,
|
||||||
|
receive_video_fps=0
|
||||||
|
)
|
||||||
|
|
||||||
|
tts = ElevenLabsTTSService(
|
||||||
|
aiohttp_session=session,
|
||||||
|
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||||
|
narrator=narrator,
|
||||||
|
aggregate_sentences=False
|
||||||
|
)
|
||||||
|
|
||||||
|
llm = OpenAILLMService(
|
||||||
|
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
|
||||||
|
model="gpt-4-turbo-preview")
|
||||||
|
|
||||||
|
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
|
||||||
|
vifp = VideoImageFrameProcessor(narrator)
|
||||||
|
ir = ImageRefresher()
|
||||||
|
img = FalImageGenService(
|
||||||
|
image_size="1024x1024",
|
||||||
|
aiohttp_session=session,
|
||||||
|
key_id=os.getenv("FAL_KEY_ID"),
|
||||||
|
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||||
|
)
|
||||||
|
tiw = TelestratorImageWrapper()
|
||||||
|
lfra = LLMFullResponseAggregator()
|
||||||
|
lfra1 = LLMFullResponseAggregator()
|
||||||
|
lfra2 = LLMFullResponseAggregator()
|
||||||
|
lfra3 = LLMFullResponseAggregator()
|
||||||
|
lfra4 = LLMFullResponseAggregator()
|
||||||
|
fl0 = FrameLogger("@@@ About to describe")
|
||||||
|
fl1 = FrameLogger("!!! About to image gen")
|
||||||
|
f4 = FrameLogger("((( partway through )))")
|
||||||
|
f5 = FrameLogger("!!! f5")
|
||||||
|
ns = NarratorShuffle(narrator, narrators)
|
||||||
|
t1 = TranslationProcessor("English", "Spanish")
|
||||||
|
t2 = TranslationProcessor("Spanish", "German")
|
||||||
|
t3 = TranslationProcessor("German", "Japanese")
|
||||||
|
t4 = TranslationProcessor("Japanese", "English")
|
||||||
|
pipeline = Pipeline(
|
||||||
|
processors=[
|
||||||
|
fl0,
|
||||||
|
vifp,
|
||||||
|
vs,
|
||||||
|
lfra,
|
||||||
|
tts,
|
||||||
|
fl1,
|
||||||
|
img,
|
||||||
|
tiw,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
@transport.event_handler("on_first_other_participant_joined")
|
||||||
|
async def on_first_other_participant_joined(transport):
|
||||||
|
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
|
||||||
|
|
||||||
|
transport.transcription_settings["extra"]["endpointing"] = True
|
||||||
|
transport.transcription_settings["extra"]["punctuate"] = True
|
||||||
|
await transport.run(pipeline)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
(url, token) = configure()
|
||||||
|
asyncio.run(main(url, token))
|
||||||
191
src/examples/starter-apps/telestrator/telestrator-wordcount.py
Normal file
191
src/examples/starter-apps/telestrator/telestrator-wordcount.py
Normal file
@@ -0,0 +1,191 @@
|
|||||||
|
import asyncio
|
||||||
|
import aiohttp
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
from typing import AsyncGenerator
|
||||||
|
|
||||||
|
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame, TextFrame
|
||||||
|
from dailyai.pipeline.pipeline import Pipeline
|
||||||
|
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||||
|
from dailyai.services.daily_transport_service import DailyTransportService
|
||||||
|
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||||
|
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
|
||||||
|
from dailyai.services.fal_ai_services import FalImageGenService
|
||||||
|
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||||
|
from dailyai.services.ai_services import FrameLogger
|
||||||
|
from dailyai.pipeline.aggregators import (
|
||||||
|
LLMAssistantContextAggregator,
|
||||||
|
LLMUserContextAggregator,
|
||||||
|
LLMFullResponseAggregator
|
||||||
|
)
|
||||||
|
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
|
||||||
|
from examples.support.runner import configure
|
||||||
|
|
||||||
|
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||||
|
logger = logging.getLogger("dailyai")
|
||||||
|
logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
narrators = [{"voice_id": "wDRBdcyPzQOCeq51IxW5",
|
||||||
|
"prompt": "Describe the image in nine words."},
|
||||||
|
{"voice_id": "M3bAX0o3Ptb2l6XqwQJV",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show."},
|
||||||
|
{"voice_id": "lJm5d2ZZ3UE4qYOxl2t7",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of Oprah Winfrey."},
|
||||||
|
{"voice_id": "7SNUlQ8GAbnZxRO9CKOt",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England."},
|
||||||
|
{"voice_id": "gvpBhHjzfd7M2WedYVUI",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek."},
|
||||||
|
{"voice_id": "bnyr1EF3snReVXauGBNn",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of Maya Angelou."}]
|
||||||
|
|
||||||
|
# random.shuffle(narrators)
|
||||||
|
print(f"$$$ narrators: {narrators}")
|
||||||
|
narrator = {"narrator": narrators[0]}
|
||||||
|
|
||||||
|
|
||||||
|
class TranslationProcessor(FrameProcessor):
|
||||||
|
def __init__(self, in_language, out_language):
|
||||||
|
self._in_language = in_language
|
||||||
|
self._out_language = out_language
|
||||||
|
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, TextFrame):
|
||||||
|
context = [
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": f"You will be provided with a sentence in {self._in_language}, and your task is to translate it into {self._out_language}.",
|
||||||
|
},
|
||||||
|
{"role": "user", "content": frame.text},
|
||||||
|
]
|
||||||
|
|
||||||
|
yield LLMMessagesQueueFrame(context)
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
class NarratorShuffle(FrameProcessor):
|
||||||
|
def __init__(self, narrator, narrators):
|
||||||
|
self._narrator = narrator
|
||||||
|
self._narrators = narrators
|
||||||
|
self._i = 0
|
||||||
|
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, (ImageFrame, TelestratorImageFrame)):
|
||||||
|
self._i += 1
|
||||||
|
if self._i >= len(self._narrators):
|
||||||
|
print(f"### shuffling narrators")
|
||||||
|
random.shuffle(self._narrators)
|
||||||
|
self._i = 0
|
||||||
|
|
||||||
|
self._narrator["narrator"] = self._narrators[self._i]
|
||||||
|
print(f"### new narrator is {self._narrator}")
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
class VideoImageFrameProcessor(FrameProcessor):
|
||||||
|
def __init__(self, narrator):
|
||||||
|
self._narrator = narrator
|
||||||
|
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, (VideoImageFrame, TelestratorImageFrame)):
|
||||||
|
yield VisionFrame(self._narrator["narrator"]["prompt"], frame.image)
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
class ImageRefresher(FrameProcessor):
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, LLMResponseEndFrame):
|
||||||
|
yield RequestVideoImageFrame(participantId=None)
|
||||||
|
yield frame
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
class TelestratorImageWrapper(FrameProcessor):
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, ImageFrame):
|
||||||
|
yield TelestratorImageFrame(None, frame.image)
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
async def main(room_url: str, token):
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
transport = DailyTransportService(
|
||||||
|
room_url,
|
||||||
|
token,
|
||||||
|
"Respond bot",
|
||||||
|
duration_minutes=5,
|
||||||
|
start_transcription=True,
|
||||||
|
mic_enabled=True,
|
||||||
|
mic_sample_rate=16000,
|
||||||
|
camera_enabled=True,
|
||||||
|
camera_width=1024,
|
||||||
|
camera_height=1024,
|
||||||
|
vad_enabled=False,
|
||||||
|
receive_video=True,
|
||||||
|
receive_video_fps=0
|
||||||
|
)
|
||||||
|
|
||||||
|
tts = ElevenLabsTTSService(
|
||||||
|
aiohttp_session=session,
|
||||||
|
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||||
|
narrator=narrator,
|
||||||
|
aggregate_sentences=False
|
||||||
|
)
|
||||||
|
|
||||||
|
llm = OpenAILLMService(
|
||||||
|
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
|
||||||
|
model="gpt-4-turbo-preview")
|
||||||
|
|
||||||
|
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
|
||||||
|
vifp = VideoImageFrameProcessor(narrator)
|
||||||
|
ir = ImageRefresher()
|
||||||
|
img = FalImageGenService(
|
||||||
|
image_size="1024x1024",
|
||||||
|
aiohttp_session=session,
|
||||||
|
key_id=os.getenv("FAL_KEY_ID"),
|
||||||
|
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||||
|
)
|
||||||
|
tiw = TelestratorImageWrapper()
|
||||||
|
lfra = LLMFullResponseAggregator()
|
||||||
|
lfra1 = LLMFullResponseAggregator()
|
||||||
|
lfra2 = LLMFullResponseAggregator()
|
||||||
|
lfra3 = LLMFullResponseAggregator()
|
||||||
|
lfra4 = LLMFullResponseAggregator()
|
||||||
|
fl0 = FrameLogger("@@@ About to describe")
|
||||||
|
fl1 = FrameLogger("!!! About to image gen")
|
||||||
|
f4 = FrameLogger("((( partway through )))")
|
||||||
|
f5 = FrameLogger("!!! f5")
|
||||||
|
ns = NarratorShuffle(narrator, narrators)
|
||||||
|
t1 = TranslationProcessor("English", "Spanish")
|
||||||
|
t2 = TranslationProcessor("Spanish", "German")
|
||||||
|
t3 = TranslationProcessor("German", "Japanese")
|
||||||
|
t4 = TranslationProcessor("Japanese", "English")
|
||||||
|
pipeline = Pipeline(
|
||||||
|
processors=[
|
||||||
|
fl0,
|
||||||
|
vifp,
|
||||||
|
vs,
|
||||||
|
lfra,
|
||||||
|
tts,
|
||||||
|
fl1,
|
||||||
|
img,
|
||||||
|
tiw,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
@transport.event_handler("on_first_other_participant_joined")
|
||||||
|
async def on_first_other_participant_joined(transport):
|
||||||
|
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
|
||||||
|
|
||||||
|
transport.transcription_settings["extra"]["endpointing"] = True
|
||||||
|
transport.transcription_settings["extra"]["punctuate"] = True
|
||||||
|
await transport.run(pipeline)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
(url, token) = configure()
|
||||||
|
asyncio.run(main(url, token))
|
||||||
162
src/examples/starter-apps/telestrator/telestrator.py
Normal file
162
src/examples/starter-apps/telestrator/telestrator.py
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
import asyncio
|
||||||
|
import aiohttp
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
from typing import AsyncGenerator
|
||||||
|
|
||||||
|
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame
|
||||||
|
from dailyai.pipeline.pipeline import Pipeline
|
||||||
|
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||||
|
from dailyai.services.daily_transport_service import DailyTransportService
|
||||||
|
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||||
|
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
|
||||||
|
from dailyai.services.fal_ai_services import FalImageGenService
|
||||||
|
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||||
|
from dailyai.services.ai_services import FrameLogger
|
||||||
|
from dailyai.pipeline.aggregators import (
|
||||||
|
LLMAssistantContextAggregator,
|
||||||
|
LLMUserContextAggregator,
|
||||||
|
LLMFullResponseAggregator
|
||||||
|
)
|
||||||
|
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
|
||||||
|
from examples.support.runner import configure
|
||||||
|
|
||||||
|
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||||
|
logger = logging.getLogger("dailyai")
|
||||||
|
logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
narrators = [{"voice_id": "wDRBdcyPzQOCeq51IxW5",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of David Attenborough."},
|
||||||
|
{"voice_id": "M3bAX0o3Ptb2l6XqwQJV",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show."},
|
||||||
|
{"voice_id": "lJm5d2ZZ3UE4qYOxl2t7",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of Oprah Winfrey."},
|
||||||
|
{"voice_id": "7SNUlQ8GAbnZxRO9CKOt",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England."},
|
||||||
|
{"voice_id": "gvpBhHjzfd7M2WedYVUI",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek."},
|
||||||
|
{"voice_id": "bnyr1EF3snReVXauGBNn",
|
||||||
|
"prompt": "Describe the image in one sentence, in the style of Maya Angelou."}]
|
||||||
|
|
||||||
|
random.shuffle(narrators)
|
||||||
|
print(f"$$$ narrators: {narrators}")
|
||||||
|
narrator = {"narrator": narrators[0]}
|
||||||
|
|
||||||
|
|
||||||
|
class NarratorShuffle(FrameProcessor):
|
||||||
|
def __init__(self, narrator, narrators):
|
||||||
|
self._narrator = narrator
|
||||||
|
self._narrators = narrators
|
||||||
|
self._i = 0
|
||||||
|
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, (ImageFrame, TelestratorImageFrame)):
|
||||||
|
self._i += 1
|
||||||
|
if self._i >= len(self._narrators):
|
||||||
|
print(f"### shuffling narrators")
|
||||||
|
random.shuffle(self._narrators)
|
||||||
|
self._i = 0
|
||||||
|
|
||||||
|
self._narrator["narrator"] = self._narrators[self._i]
|
||||||
|
print(f"### new narrator is {self._narrator}")
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
class VideoImageFrameProcessor(FrameProcessor):
|
||||||
|
def __init__(self, narrator):
|
||||||
|
self._narrator = narrator
|
||||||
|
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, (VideoImageFrame, TelestratorImageFrame)):
|
||||||
|
yield VisionFrame(self._narrator["narrator"]["prompt"], frame.image)
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
class ImageRefresher(FrameProcessor):
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, LLMResponseEndFrame):
|
||||||
|
yield RequestVideoImageFrame(participantId=None)
|
||||||
|
yield frame
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
class TelestratorImageWrapper(FrameProcessor):
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, ImageFrame):
|
||||||
|
yield TelestratorImageFrame(None, frame.image)
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
async def main(room_url: str, token):
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
transport = DailyTransportService(
|
||||||
|
room_url,
|
||||||
|
token,
|
||||||
|
"Respond bot",
|
||||||
|
duration_minutes=5,
|
||||||
|
start_transcription=True,
|
||||||
|
mic_enabled=True,
|
||||||
|
mic_sample_rate=16000,
|
||||||
|
camera_enabled=True,
|
||||||
|
camera_width=1024,
|
||||||
|
camera_height=1024,
|
||||||
|
vad_enabled=False,
|
||||||
|
receive_video=True,
|
||||||
|
receive_video_fps=0
|
||||||
|
)
|
||||||
|
|
||||||
|
tts = ElevenLabsTTSService(
|
||||||
|
aiohttp_session=session,
|
||||||
|
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||||
|
narrator=narrator,
|
||||||
|
aggregate_sentences=False
|
||||||
|
)
|
||||||
|
|
||||||
|
llm = OpenAILLMService(
|
||||||
|
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
|
||||||
|
model="gpt-4-turbo-preview")
|
||||||
|
|
||||||
|
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
|
||||||
|
vifp = VideoImageFrameProcessor(narrator)
|
||||||
|
ir = ImageRefresher()
|
||||||
|
img = FalImageGenService(
|
||||||
|
image_size="1024x1024",
|
||||||
|
aiohttp_session=session,
|
||||||
|
key_id=os.getenv("FAL_KEY_ID"),
|
||||||
|
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||||
|
)
|
||||||
|
tiw = TelestratorImageWrapper()
|
||||||
|
lfra = LLMFullResponseAggregator()
|
||||||
|
fl0 = FrameLogger("@@@ About to describe")
|
||||||
|
fl1 = FrameLogger("!!! About to image gen")
|
||||||
|
ns = NarratorShuffle(narrator, narrators)
|
||||||
|
pipeline = Pipeline(
|
||||||
|
processors=[
|
||||||
|
ns,
|
||||||
|
fl0,
|
||||||
|
vifp,
|
||||||
|
vs,
|
||||||
|
lfra,
|
||||||
|
tts,
|
||||||
|
fl1,
|
||||||
|
img,
|
||||||
|
tiw,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
@transport.event_handler("on_first_other_participant_joined")
|
||||||
|
async def on_first_other_participant_joined(transport):
|
||||||
|
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
|
||||||
|
|
||||||
|
transport.transcription_settings["extra"]["endpointing"] = True
|
||||||
|
transport.transcription_settings["extra"]["punctuate"] = True
|
||||||
|
await transport.run(pipeline)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
(url, token) = configure()
|
||||||
|
asyncio.run(main(url, token))
|
||||||
@@ -26,7 +26,8 @@ logger.setLevel(logging.DEBUG)
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
This example looks a bit different than the chatbot example, because it isn't waiting on the user to stop talking to start translating.
|
This example looks a bit different than the chatbot example, because it isn't waiting on the user to stop talking to start translating.
|
||||||
It also isn't saving what the user or bot says into the context object for use in subsequent interactions.
|
It also isn't saving what the user or bot says into the context object for use in subsequent interactions. This example also sends
|
||||||
|
the translated text back to the transport as an App Message, so clients can show subtitles.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@@ -50,6 +51,20 @@ class TranslationProcessor(FrameProcessor):
|
|||||||
yield frame
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
|
class TranslationSubtitles(FrameProcessor):
|
||||||
|
def __init__(self, language):
|
||||||
|
self._language = language
|
||||||
|
|
||||||
|
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||||
|
if isinstance(frame, TextFrame):
|
||||||
|
app_message = {
|
||||||
|
"language": self._language,
|
||||||
|
"text": frame.text
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
yield frame
|
||||||
|
|
||||||
|
|
||||||
async def main(room_url: str, token):
|
async def main(room_url: str, token):
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession() as session:
|
||||||
transport = DailyTransportService(
|
transport = DailyTransportService(
|
||||||
@@ -73,7 +88,8 @@ async def main(room_url: str, token):
|
|||||||
model="gpt-4-turbo-preview")
|
model="gpt-4-turbo-preview")
|
||||||
sa = SentenceAggregator()
|
sa = SentenceAggregator()
|
||||||
tp = TranslationProcessor("Spanish")
|
tp = TranslationProcessor("Spanish")
|
||||||
pipeline = Pipeline([sa, tp, llm, tts])
|
ts = TranslationSubtitles("Spanish")
|
||||||
|
pipeline = Pipeline([sa, tp, llm, tts, ts])
|
||||||
|
|
||||||
transport.transcription_settings["extra"]["endpointing"] = True
|
transport.transcription_settings["extra"]["endpointing"] = True
|
||||||
transport.transcription_settings["extra"]["punctuate"] = True
|
transport.transcription_settings["extra"]["punctuate"] = True
|
||||||
|
|||||||
Reference in New Issue
Block a user