Compare commits
7 Commits
hush/rtviS
...
cb/telestr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c73fb4750f | ||
|
|
34b10cb4c7 | ||
|
|
e726f15c4e | ||
|
|
25ca8b751e | ||
|
|
0b4b63d2ee | ||
|
|
6c9425d66a | ||
|
|
6d3c52ae81 |
@@ -252,9 +252,15 @@ class LLMFullResponseAggregator(FrameProcessor):
|
||||
self.aggregation = ""
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if not isinstance(frame, AudioFrame):
|
||||
print(f"^^^ LFRA got frame: {frame}")
|
||||
if isinstance(frame, TextFrame):
|
||||
self.aggregation += frame.text
|
||||
print(
|
||||
f"^^^ LFRA got textframe. aggregation is now {self.aggregation}")
|
||||
elif isinstance(frame, LLMResponseEndFrame):
|
||||
print(
|
||||
f"^^^ LFRA got an llmresponseendframe. About to yield aggregation: {self.aggregation}")
|
||||
yield TextFrame(self.aggregation)
|
||||
yield frame
|
||||
self.aggregation = ""
|
||||
|
||||
@@ -179,3 +179,33 @@ class LLMFunctionCallFrame(Frame):
|
||||
"""Emitted when the LLM has received an entire function call completion."""
|
||||
function_name: str
|
||||
arguments: str
|
||||
|
||||
|
||||
@dataclass()
|
||||
class VideoImageFrame(Frame):
|
||||
"""Contains a still image from a partcipant's video stream."""
|
||||
participantId: str
|
||||
image: bytes
|
||||
|
||||
# def __str__(self):
|
||||
# return f"{self.__class__.__name__}, participantId: {self.participantId}, image size: {len(self.image)} B"
|
||||
|
||||
|
||||
class TelestratorImageFrame(ImageFrame):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass()
|
||||
class VisionFrame(Frame):
|
||||
prompt: str
|
||||
image: bytes
|
||||
|
||||
# def __str__(self):
|
||||
# return f"{self.__class__.__name__}, prompt: {self.prompt}, image size: {len(self.image)} B"
|
||||
|
||||
|
||||
@dataclass()
|
||||
class RequestVideoImageFrame(Frame):
|
||||
"""Send to the transport to request a new video image from a specific participant. Leave participantId
|
||||
empty to request a frame from all participants."""
|
||||
participantId: str | None
|
||||
|
||||
@@ -18,6 +18,7 @@ from dailyai.pipeline.frames import (
|
||||
Frame,
|
||||
TextFrame,
|
||||
TranscriptionQueueFrame,
|
||||
VisionFrame
|
||||
)
|
||||
|
||||
from abc import abstractmethod
|
||||
@@ -61,6 +62,7 @@ class TTSService(AIService):
|
||||
yield TextFrame(self.current_sentence)
|
||||
|
||||
if not isinstance(frame, TextFrame):
|
||||
print(f"*** tts yielding non-text: {frame}")
|
||||
yield frame
|
||||
return
|
||||
|
||||
@@ -79,6 +81,7 @@ class TTSService(AIService):
|
||||
|
||||
# note we pass along the text frame *after* the audio, so the text
|
||||
# frame is completed after the audio is processed.
|
||||
print(f"*** tts yielding text: {text}")
|
||||
yield TextFrame(text)
|
||||
|
||||
|
||||
@@ -133,14 +136,36 @@ class STTService(AIService):
|
||||
yield TranscriptionQueueFrame(text, "", str(time.time()))
|
||||
|
||||
|
||||
class VisionService(AIService):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
# Renders the image. Returns an Image object.
|
||||
# TODO-CB: return type
|
||||
@abstractmethod
|
||||
async def run_vision(self, prompt: str, image: bytes):
|
||||
pass
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, VisionFrame):
|
||||
async for frame in self.run_vision(frame.prompt, frame.image):
|
||||
print(
|
||||
f"&&& visionservce processframe got frame to yield: {frame}")
|
||||
yield frame
|
||||
yield LLMResponseEndFrame()
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class FrameLogger(AIService):
|
||||
def __init__(self, prefix="Frame", **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.prefix = prefix
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, (AudioFrame, ImageFrame)):
|
||||
self.logger.info(f"{self.prefix}: {type(frame)}")
|
||||
if isinstance(frame, (AudioFrame)):
|
||||
# self.logger.info(f"{self.prefix}: {type(frame)}")
|
||||
pass
|
||||
else:
|
||||
print(f"{self.prefix}: {frame}")
|
||||
|
||||
|
||||
@@ -24,6 +24,8 @@ from dailyai.pipeline.frames import (
|
||||
TextFrame,
|
||||
UserStartedSpeakingFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
RequestVideoImageFrame,
|
||||
TelestratorImageFrame
|
||||
)
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.services.ai_services import TTSService
|
||||
@@ -90,7 +92,9 @@ class BaseTransportService:
|
||||
self._vad_stop_s = kwargs.get("vad_stop_s") or 0.8
|
||||
self._context = kwargs.get("context") or []
|
||||
self._vad_enabled = kwargs.get("vad_enabled") or False
|
||||
|
||||
self._receive_video = kwargs.get("receive_video") or False
|
||||
self._receive_video_fps = kwargs.get("receive_video_fps") or 0.0
|
||||
self._participant_frame_times = {}
|
||||
if self._vad_enabled and self._speaker_enabled:
|
||||
raise Exception(
|
||||
"Sorry, you can't use speaker_enabled and vad_enabled at the same time. Please set one to False."
|
||||
@@ -441,6 +445,7 @@ class BaseTransportService:
|
||||
# discard them
|
||||
if not self._is_interrupted.is_set():
|
||||
if frame:
|
||||
|
||||
if isinstance(frame, AudioFrame):
|
||||
chunk = frame.data
|
||||
|
||||
@@ -452,6 +457,12 @@ class BaseTransportService:
|
||||
self.write_frame_to_mic(
|
||||
bytes(b[:truncated_length]))
|
||||
b = b[truncated_length:]
|
||||
elif isinstance(frame, TelestratorImageFrame):
|
||||
self._set_image(frame.image)
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
self.receive_queue.put(frame),
|
||||
self._loop,
|
||||
)
|
||||
elif isinstance(frame, ImageFrame):
|
||||
self._set_image(frame.image)
|
||||
elif isinstance(frame, SpriteFrame):
|
||||
@@ -459,6 +470,15 @@ class BaseTransportService:
|
||||
elif isinstance(frame, SendAppMessageFrame):
|
||||
self.send_app_message(
|
||||
frame.message, frame.participantId)
|
||||
elif isinstance(frame, RequestVideoImageFrame):
|
||||
# removing one or all participant IDs from _participant_frame_times
|
||||
# will cause the transport to send the next available frame from
|
||||
# that participant
|
||||
if frame.participantId:
|
||||
self._participant_frame_times.pop(
|
||||
frame.participantId, None)
|
||||
else:
|
||||
self._participant_frame_times.clear()
|
||||
elif len(b):
|
||||
self.write_frame_to_mic(bytes(b))
|
||||
b = bytearray()
|
||||
|
||||
@@ -2,6 +2,7 @@ import asyncio
|
||||
import inspect
|
||||
import logging
|
||||
import signal
|
||||
import time
|
||||
import threading
|
||||
import types
|
||||
|
||||
@@ -11,6 +12,8 @@ from typing import Any
|
||||
from dailyai.pipeline.frames import (
|
||||
ReceivedAppMessageFrame,
|
||||
TranscriptionQueueFrame,
|
||||
VideoImageFrame,
|
||||
TelestratorImageFrame
|
||||
)
|
||||
|
||||
from threading import Event
|
||||
@@ -204,11 +207,12 @@ class DailyTransportService(BaseTransportService, EventHandler):
|
||||
)
|
||||
self._my_participant_id = self.client.participants()["local"]["id"]
|
||||
|
||||
self.client.update_subscription_profiles({
|
||||
"base": {
|
||||
"camera": "unsubscribed",
|
||||
}
|
||||
})
|
||||
if not self._receive_video:
|
||||
self.client.update_subscription_profiles({
|
||||
"base": {
|
||||
"camera": "unsubscribed",
|
||||
}
|
||||
})
|
||||
|
||||
if self._token and self._start_transcription:
|
||||
self.client.start_transcription(self.transcription_settings)
|
||||
@@ -225,6 +229,31 @@ class DailyTransportService(BaseTransportService, EventHandler):
|
||||
self.client.leave()
|
||||
self.client.release()
|
||||
|
||||
def _handle_video_frame(self, participant_id, video_frame):
|
||||
"""If receive_video is true, this function is called once for each frame from each participant. We
|
||||
don't need to send every frame to the pipeline, so there are two ways to decide how to send frames:
|
||||
1. Set a greater-than-zero value for receive_video_fps. The transport will track the last send time
|
||||
for each participant and send a new frame when the requested frame rate has elapsed. This
|
||||
guarantees an image every second, for example.
|
||||
2. Set receive_video_fps less than or equal to zero to disable timed frame sending. Then, put a
|
||||
RequestVideoImageFrame in the pipeline to get a new frame for one or all participants. By
|
||||
sending a RequestVideoImageFrame immediately after successfully processing an image, you can
|
||||
ensure you don't end up queueing up frames faster than you can process them.
|
||||
"""
|
||||
send_frame = False
|
||||
if not participant_id in self._participant_frame_times:
|
||||
# then it's a new participant; send the first frame
|
||||
send_frame = True
|
||||
elif self._receive_video_fps > 0 and time.time() > self._participant_frame_times[participant_id] + 1.0/self._receive_video_fps:
|
||||
# Then it's an existing participant who is due to send a new frame
|
||||
send_frame = True
|
||||
|
||||
if send_frame:
|
||||
self._participant_frame_times[participant_id] = time.time()
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
self.receive_queue.put(
|
||||
VideoImageFrame(participant_id, video_frame)), self._loop)
|
||||
|
||||
def on_first_other_participant_joined(self):
|
||||
pass
|
||||
|
||||
@@ -248,6 +277,9 @@ class DailyTransportService(BaseTransportService, EventHandler):
|
||||
if not self._other_participant_has_joined and participant["id"] != self._my_participant_id:
|
||||
self._other_participant_has_joined = True
|
||||
self.on_first_other_participant_joined()
|
||||
if self._receive_video:
|
||||
self.client.set_video_renderer(
|
||||
participant["id"], self._handle_video_frame)
|
||||
|
||||
def on_participant_left(self, participant, reason):
|
||||
if len(self.client.participants()) < self._min_others_count + 1:
|
||||
|
||||
@@ -15,18 +15,19 @@ class ElevenLabsTTSService(TTSService):
|
||||
*,
|
||||
aiohttp_session: aiohttp.ClientSession,
|
||||
api_key,
|
||||
voice_id,
|
||||
narrator,
|
||||
model="eleven_turbo_v2",
|
||||
aggregate_sentences=True
|
||||
):
|
||||
super().__init__()
|
||||
super().__init__(aggregate_sentences)
|
||||
|
||||
self._api_key = api_key
|
||||
self._voice_id = voice_id
|
||||
self._narrator = narrator
|
||||
self._aiohttp_session = aiohttp_session
|
||||
self._model = model
|
||||
|
||||
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
|
||||
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream"
|
||||
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._narrator['narrator']['voice_id']}/stream"
|
||||
payload = {"text": sentence, "model_id": self._model}
|
||||
querystring = {
|
||||
"output_format": "pcm_16000",
|
||||
@@ -35,6 +36,7 @@ class ElevenLabsTTSService(TTSService):
|
||||
"xi-api-key": self._api_key,
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
|
||||
async with self._aiohttp_session.post(
|
||||
url, json=payload, headers=headers, params=querystring
|
||||
) as r:
|
||||
|
||||
@@ -53,4 +53,7 @@ class FalImageGenService(ImageGenService):
|
||||
async with self._aiohttp_session.get(image_url) as response:
|
||||
image_stream = io.BytesIO(await response.content.read())
|
||||
image = Image.open(image_stream)
|
||||
return (image_url, image.tobytes())
|
||||
image_bytes = image.tobytes()
|
||||
print(f"!!! fal image tobytes is:")
|
||||
print(image)
|
||||
return (image_url, image_bytes)
|
||||
|
||||
@@ -2,13 +2,22 @@ import aiohttp
|
||||
from PIL import Image
|
||||
import io
|
||||
import time
|
||||
from openai import AsyncOpenAI
|
||||
import base64
|
||||
from openai import AsyncOpenAI, AsyncStream
|
||||
|
||||
import json
|
||||
from collections.abc import AsyncGenerator
|
||||
|
||||
from dailyai.services.ai_services import LLMService, ImageGenService
|
||||
from openai.types.chat import (
|
||||
ChatCompletion,
|
||||
ChatCompletionChunk,
|
||||
ChatCompletionMessageParam,
|
||||
)
|
||||
|
||||
from daily import VideoFrame
|
||||
from dailyai.services.ai_services import LLMService, ImageGenService, VisionService
|
||||
from dailyai.services.openai_api_llm_service import BaseOpenAILLMService
|
||||
from dailyai.pipeline.frames import TextFrame
|
||||
|
||||
|
||||
class OpenAILLMService(BaseOpenAILLMService):
|
||||
@@ -50,3 +59,67 @@ class OpenAIImageGenService(ImageGenService):
|
||||
image_stream = io.BytesIO(await response.content.read())
|
||||
image = Image.open(image_stream)
|
||||
return (image_url, image.tobytes())
|
||||
|
||||
|
||||
class OpenAIVisionService(VisionService):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
model="gpt-4-vision-preview",
|
||||
api_key,
|
||||
):
|
||||
self._model = model
|
||||
self._client = AsyncOpenAI(api_key=api_key)
|
||||
|
||||
async def run_vision(self, prompt: str, image: bytes):
|
||||
if isinstance(image, VideoFrame):
|
||||
# Then it's from a daily video frame
|
||||
print("### processing daily video frame for recognition")
|
||||
IMAGE_WIDTH = image.width
|
||||
IMAGE_HEIGHT = image.height
|
||||
COLOR_FORMAT = image.color_format
|
||||
a_image = Image.frombytes(
|
||||
'RGBA', (IMAGE_WIDTH, IMAGE_HEIGHT), image.buffer)
|
||||
new_image = a_image.convert('RGB')
|
||||
else:
|
||||
# handle it as a byte stream from image gen
|
||||
new_image = Image.frombytes('RGB', (1024, 1024), image)
|
||||
# Uncomment these lines to write the frame to a jpg in the same directory.
|
||||
# current_path = os.getcwd()
|
||||
# image_path = os.path.join(current_path, "image.jpg")
|
||||
# image.save(image_path, format="JPEG")
|
||||
|
||||
jpeg_buffer = io.BytesIO()
|
||||
|
||||
new_image.save(jpeg_buffer, format='JPEG')
|
||||
|
||||
jpeg_bytes = jpeg_buffer.getvalue()
|
||||
base64_image = base64.b64encode(jpeg_bytes).decode('utf-8')
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": prompt},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image/jpeg;base64,{base64_image}"
|
||||
},
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
chunks: AsyncStream[ChatCompletionChunk] = (
|
||||
await self._client.chat.completions.create(
|
||||
model=self._model,
|
||||
stream=True,
|
||||
messages=messages,
|
||||
)
|
||||
)
|
||||
async for chunk in chunks:
|
||||
print(f"%%% chunk: {chunk}")
|
||||
if len(chunk.choices) == 0:
|
||||
continue
|
||||
if chunk.choices[0].delta.content:
|
||||
yield TextFrame(chunk.choices[0].delta.content)
|
||||
|
||||
97
src/examples/foundational/12-describe-video.py
Normal file
97
src/examples/foundational/12-describe-video.py
Normal file
@@ -0,0 +1,97 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
|
||||
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||
from dailyai.services.ai_services import FrameLogger
|
||||
from dailyai.pipeline.aggregators import (
|
||||
LLMAssistantContextAggregator,
|
||||
LLMUserContextAggregator,
|
||||
)
|
||||
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
|
||||
from examples.support.runner import configure
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
class VideoImageFrameProcessor(FrameProcessor):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, VideoImageFrame):
|
||||
yield VisionFrame("Describe the image in one sentence.", frame.image)
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class ImageRefresher(FrameProcessor):
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, LLMResponseEndFrame):
|
||||
yield RequestVideoImageFrame(participantId=None)
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransportService(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
duration_minutes=5,
|
||||
start_transcription=True,
|
||||
mic_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
camera_enabled=False,
|
||||
vad_enabled=True,
|
||||
receive_video=True,
|
||||
receive_video_fps=0
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
|
||||
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
|
||||
vifp = VideoImageFrameProcessor()
|
||||
ir = ImageRefresher()
|
||||
pipeline = Pipeline(
|
||||
processors=[
|
||||
vifp,
|
||||
vs,
|
||||
llm,
|
||||
tts,
|
||||
ir,
|
||||
],
|
||||
)
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
|
||||
|
||||
transport.transcription_settings["extra"]["endpointing"] = True
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
await transport.run(pipeline)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
@@ -124,7 +124,6 @@ async def main(room_url: str, token):
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
print(f"!!! in here, pipeline.source is {pipeline.source}")
|
||||
await pipeline.queue_frames([LLMMessagesQueueFrame(messages)])
|
||||
|
||||
async def run_conversation():
|
||||
|
||||
100
src/examples/starter-apps/telestrator/describer.py
Normal file
100
src/examples/starter-apps/telestrator/describer.py
Normal file
@@ -0,0 +1,100 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||
from dailyai.services.ai_services import FrameLogger
|
||||
from dailyai.pipeline.aggregators import (
|
||||
LLMAssistantContextAggregator,
|
||||
LLMUserContextAggregator,
|
||||
)
|
||||
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
|
||||
from examples.support.runner import configure
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
class VideoImageFrameProcessor(FrameProcessor):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, VideoImageFrame):
|
||||
yield VisionFrame("Describe the image in one sentence.", frame.image)
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class ImageRefresher(FrameProcessor):
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, LLMResponseEndFrame):
|
||||
yield RequestVideoImageFrame(participantId=None)
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransportService(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
duration_minutes=5,
|
||||
start_transcription=True,
|
||||
mic_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
camera_enabled=True,
|
||||
camera_width=1024,
|
||||
camera_height=1024,
|
||||
vad_enabled=False,
|
||||
receive_video=True,
|
||||
receive_video_fps=0
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
|
||||
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
|
||||
vifp = VideoImageFrameProcessor()
|
||||
ir = ImageRefresher()
|
||||
|
||||
pipeline = Pipeline(
|
||||
processors=[
|
||||
vifp,
|
||||
vs,
|
||||
tts,
|
||||
ir,
|
||||
],
|
||||
)
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
|
||||
|
||||
transport.transcription_settings["extra"]["endpointing"] = True
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
await transport.run(pipeline)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
112
src/examples/starter-apps/telestrator/illustrator.py
Normal file
112
src/examples/starter-apps/telestrator/illustrator.py
Normal file
@@ -0,0 +1,112 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame, TranscriptionQueueFrame, TextFrame
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||
from dailyai.services.ai_services import FrameLogger
|
||||
from dailyai.pipeline.aggregators import (
|
||||
LLMAssistantContextAggregator,
|
||||
LLMUserContextAggregator,
|
||||
)
|
||||
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
|
||||
from examples.support.runner import configure
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
class VADAggregator(FrameProcessor):
|
||||
def __init__(self):
|
||||
self.aggregating = False
|
||||
self.aggregation = ""
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, UserStartedSpeakingFrame):
|
||||
self.aggregating = True
|
||||
elif isinstance(frame, UserStoppedSpeakingFrame):
|
||||
self.aggregating = False
|
||||
# Sometimes VAD triggers quickly on and off. If we don't get any transcription,
|
||||
# it creates empty LLM message queue frames
|
||||
if len(self.aggregation) > 0:
|
||||
yield TextFrame(self.aggregation)
|
||||
|
||||
self.aggregation = ""
|
||||
yield frame
|
||||
elif isinstance(frame, TranscriptionQueueFrame) and self.aggregating:
|
||||
self.aggregation += f" {frame.text}"
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransportService(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
duration_minutes=5,
|
||||
start_transcription=True,
|
||||
mic_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
camera_enabled=True,
|
||||
camera_width=1024,
|
||||
camera_height=1024,
|
||||
vad_enabled=True,
|
||||
receive_video=True,
|
||||
receive_video_fps=0,
|
||||
vad_timeout_s=1.0
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
|
||||
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
|
||||
vad = VADAggregator()
|
||||
img = FalImageGenService(
|
||||
image_size="1024x1024",
|
||||
aiohttp_session=session,
|
||||
key_id=os.getenv("FAL_KEY_ID"),
|
||||
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||
)
|
||||
fl = FrameLogger("!!! Start")
|
||||
fl2 = FrameLogger("!!! AFTER VAD")
|
||||
fl3 = FrameLogger("!!! After img")
|
||||
pipeline = Pipeline(
|
||||
processors=[
|
||||
fl,
|
||||
vad,
|
||||
fl2,
|
||||
img,
|
||||
fl3
|
||||
],
|
||||
)
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
|
||||
|
||||
transport.transcription_settings["extra"]["endpointing"] = True
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
await transport.run(pipeline)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
210
src/examples/starter-apps/telestrator/telestrator-fuzz.py
Normal file
210
src/examples/starter-apps/telestrator/telestrator-fuzz.py
Normal file
@@ -0,0 +1,210 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame, TextFrame
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||
from dailyai.services.ai_services import FrameLogger
|
||||
from dailyai.pipeline.aggregators import (
|
||||
LLMAssistantContextAggregator,
|
||||
LLMUserContextAggregator,
|
||||
LLMFullResponseAggregator
|
||||
)
|
||||
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
|
||||
from examples.support.runner import configure
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
narrators = [{"voice_id": "wDRBdcyPzQOCeq51IxW5",
|
||||
"prompt": "Describe the image in one sentence."},
|
||||
{"voice_id": "M3bAX0o3Ptb2l6XqwQJV",
|
||||
"prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show."},
|
||||
{"voice_id": "lJm5d2ZZ3UE4qYOxl2t7",
|
||||
"prompt": "Describe the image in one sentence, in the style of Oprah Winfrey."},
|
||||
{"voice_id": "7SNUlQ8GAbnZxRO9CKOt",
|
||||
"prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England."},
|
||||
{"voice_id": "gvpBhHjzfd7M2WedYVUI",
|
||||
"prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek."},
|
||||
{"voice_id": "bnyr1EF3snReVXauGBNn",
|
||||
"prompt": "Describe the image in one sentence, in the style of Maya Angelou."}]
|
||||
|
||||
# random.shuffle(narrators)
|
||||
print(f"$$$ narrators: {narrators}")
|
||||
narrator = {"narrator": narrators[0]}
|
||||
|
||||
|
||||
class TranslationProcessor(FrameProcessor):
|
||||
def __init__(self, in_language, out_language):
|
||||
self._in_language = in_language
|
||||
self._out_language = out_language
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, TextFrame):
|
||||
context = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"You will be provided with a sentence in {self._in_language}, and your task is to translate it into {self._out_language}.",
|
||||
},
|
||||
{"role": "user", "content": frame.text},
|
||||
]
|
||||
|
||||
yield LLMMessagesQueueFrame(context)
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class NarratorShuffle(FrameProcessor):
|
||||
def __init__(self, narrator, narrators):
|
||||
self._narrator = narrator
|
||||
self._narrators = narrators
|
||||
self._i = 0
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, (ImageFrame, TelestratorImageFrame)):
|
||||
self._i += 1
|
||||
if self._i >= len(self._narrators):
|
||||
print(f"### shuffling narrators")
|
||||
random.shuffle(self._narrators)
|
||||
self._i = 0
|
||||
|
||||
self._narrator["narrator"] = self._narrators[self._i]
|
||||
print(f"### new narrator is {self._narrator}")
|
||||
yield frame
|
||||
|
||||
|
||||
class VideoImageFrameProcessor(FrameProcessor):
|
||||
def __init__(self, narrator):
|
||||
self._narrator = narrator
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, (VideoImageFrame, TelestratorImageFrame)):
|
||||
yield VisionFrame(self._narrator["narrator"]["prompt"], frame.image)
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class ImageRefresher(FrameProcessor):
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, LLMResponseEndFrame):
|
||||
yield RequestVideoImageFrame(participantId=None)
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class TelestratorImageWrapper(FrameProcessor):
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, ImageFrame):
|
||||
yield TelestratorImageFrame(None, frame.image)
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransportService(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
duration_minutes=5,
|
||||
start_transcription=True,
|
||||
mic_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
camera_enabled=True,
|
||||
camera_width=1024,
|
||||
camera_height=576,
|
||||
vad_enabled=False,
|
||||
receive_video=True,
|
||||
receive_video_fps=0
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
narrator=narrator,
|
||||
aggregate_sentences=False
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
|
||||
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
|
||||
vifp = VideoImageFrameProcessor(narrator)
|
||||
ir = ImageRefresher()
|
||||
img = FalImageGenService(
|
||||
image_size="1024x1024",
|
||||
aiohttp_session=session,
|
||||
key_id=os.getenv("FAL_KEY_ID"),
|
||||
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||
)
|
||||
tiw = TelestratorImageWrapper()
|
||||
lfra = LLMFullResponseAggregator()
|
||||
lfra1 = LLMFullResponseAggregator()
|
||||
lfra2 = LLMFullResponseAggregator()
|
||||
lfra3 = LLMFullResponseAggregator()
|
||||
lfra4 = LLMFullResponseAggregator()
|
||||
fl0 = FrameLogger("@@@ About to describe")
|
||||
fl1 = FrameLogger("!!! About to image gen")
|
||||
f4 = FrameLogger("((( partway through )))")
|
||||
f5 = FrameLogger("!!! f5")
|
||||
ns = NarratorShuffle(narrator, narrators)
|
||||
t1 = TranslationProcessor("English", "Spanish")
|
||||
t2 = TranslationProcessor("Spanish", "German")
|
||||
t3 = TranslationProcessor("German", "Japanese")
|
||||
t4 = TranslationProcessor("Japanese", "English")
|
||||
pipeline = Pipeline(
|
||||
processors=[
|
||||
fl0,
|
||||
vifp,
|
||||
vs,
|
||||
lfra,
|
||||
tts,
|
||||
f4,
|
||||
t1,
|
||||
llm,
|
||||
lfra1,
|
||||
f5,
|
||||
tts,
|
||||
|
||||
t2,
|
||||
llm,
|
||||
lfra2,
|
||||
tts,
|
||||
t3,
|
||||
llm,
|
||||
lfra3,
|
||||
tts,
|
||||
t4,
|
||||
llm,
|
||||
lfra4,
|
||||
tts,
|
||||
fl1,
|
||||
img,
|
||||
tiw,
|
||||
],
|
||||
)
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
|
||||
|
||||
transport.transcription_settings["extra"]["endpointing"] = True
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
await transport.run(pipeline)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
191
src/examples/starter-apps/telestrator/telestrator-haiku.py
Normal file
191
src/examples/starter-apps/telestrator/telestrator-haiku.py
Normal file
@@ -0,0 +1,191 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame, TextFrame
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||
from dailyai.services.ai_services import FrameLogger
|
||||
from dailyai.pipeline.aggregators import (
|
||||
LLMAssistantContextAggregator,
|
||||
LLMUserContextAggregator,
|
||||
LLMFullResponseAggregator
|
||||
)
|
||||
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
|
||||
from examples.support.runner import configure
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
narrators = [{"voice_id": "wDRBdcyPzQOCeq51IxW5",
|
||||
"prompt": "Describe the image in a haiku."},
|
||||
{"voice_id": "M3bAX0o3Ptb2l6XqwQJV",
|
||||
"prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show."},
|
||||
{"voice_id": "lJm5d2ZZ3UE4qYOxl2t7",
|
||||
"prompt": "Describe the image in one sentence, in the style of Oprah Winfrey."},
|
||||
{"voice_id": "7SNUlQ8GAbnZxRO9CKOt",
|
||||
"prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England."},
|
||||
{"voice_id": "gvpBhHjzfd7M2WedYVUI",
|
||||
"prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek."},
|
||||
{"voice_id": "bnyr1EF3snReVXauGBNn",
|
||||
"prompt": "Describe the image in one sentence, in the style of Maya Angelou."}]
|
||||
|
||||
# random.shuffle(narrators)
|
||||
print(f"$$$ narrators: {narrators}")
|
||||
narrator = {"narrator": narrators[0]}
|
||||
|
||||
|
||||
class TranslationProcessor(FrameProcessor):
|
||||
def __init__(self, in_language, out_language):
|
||||
self._in_language = in_language
|
||||
self._out_language = out_language
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, TextFrame):
|
||||
context = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"You will be provided with a sentence in {self._in_language}, and your task is to translate it into {self._out_language}.",
|
||||
},
|
||||
{"role": "user", "content": frame.text},
|
||||
]
|
||||
|
||||
yield LLMMessagesQueueFrame(context)
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class NarratorShuffle(FrameProcessor):
|
||||
def __init__(self, narrator, narrators):
|
||||
self._narrator = narrator
|
||||
self._narrators = narrators
|
||||
self._i = 0
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, (ImageFrame, TelestratorImageFrame)):
|
||||
self._i += 1
|
||||
if self._i >= len(self._narrators):
|
||||
print(f"### shuffling narrators")
|
||||
random.shuffle(self._narrators)
|
||||
self._i = 0
|
||||
|
||||
self._narrator["narrator"] = self._narrators[self._i]
|
||||
print(f"### new narrator is {self._narrator}")
|
||||
yield frame
|
||||
|
||||
|
||||
class VideoImageFrameProcessor(FrameProcessor):
|
||||
def __init__(self, narrator):
|
||||
self._narrator = narrator
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, (VideoImageFrame, TelestratorImageFrame)):
|
||||
yield VisionFrame(self._narrator["narrator"]["prompt"], frame.image)
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class ImageRefresher(FrameProcessor):
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, LLMResponseEndFrame):
|
||||
yield RequestVideoImageFrame(participantId=None)
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class TelestratorImageWrapper(FrameProcessor):
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, ImageFrame):
|
||||
yield TelestratorImageFrame(None, frame.image)
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransportService(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
duration_minutes=5,
|
||||
start_transcription=True,
|
||||
mic_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
camera_enabled=True,
|
||||
camera_width=1024,
|
||||
camera_height=1024,
|
||||
vad_enabled=False,
|
||||
receive_video=True,
|
||||
receive_video_fps=0
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
narrator=narrator,
|
||||
aggregate_sentences=False
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
|
||||
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
|
||||
vifp = VideoImageFrameProcessor(narrator)
|
||||
ir = ImageRefresher()
|
||||
img = FalImageGenService(
|
||||
image_size="1024x1024",
|
||||
aiohttp_session=session,
|
||||
key_id=os.getenv("FAL_KEY_ID"),
|
||||
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||
)
|
||||
tiw = TelestratorImageWrapper()
|
||||
lfra = LLMFullResponseAggregator()
|
||||
lfra1 = LLMFullResponseAggregator()
|
||||
lfra2 = LLMFullResponseAggregator()
|
||||
lfra3 = LLMFullResponseAggregator()
|
||||
lfra4 = LLMFullResponseAggregator()
|
||||
fl0 = FrameLogger("@@@ About to describe")
|
||||
fl1 = FrameLogger("!!! About to image gen")
|
||||
f4 = FrameLogger("((( partway through )))")
|
||||
f5 = FrameLogger("!!! f5")
|
||||
ns = NarratorShuffle(narrator, narrators)
|
||||
t1 = TranslationProcessor("English", "Spanish")
|
||||
t2 = TranslationProcessor("Spanish", "German")
|
||||
t3 = TranslationProcessor("German", "Japanese")
|
||||
t4 = TranslationProcessor("Japanese", "English")
|
||||
pipeline = Pipeline(
|
||||
processors=[
|
||||
fl0,
|
||||
vifp,
|
||||
vs,
|
||||
lfra,
|
||||
tts,
|
||||
fl1,
|
||||
img,
|
||||
tiw,
|
||||
],
|
||||
)
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
|
||||
|
||||
transport.transcription_settings["extra"]["endpointing"] = True
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
await transport.run(pipeline)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
191
src/examples/starter-apps/telestrator/telestrator-wordcount.py
Normal file
191
src/examples/starter-apps/telestrator/telestrator-wordcount.py
Normal file
@@ -0,0 +1,191 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame, TextFrame
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||
from dailyai.services.ai_services import FrameLogger
|
||||
from dailyai.pipeline.aggregators import (
|
||||
LLMAssistantContextAggregator,
|
||||
LLMUserContextAggregator,
|
||||
LLMFullResponseAggregator
|
||||
)
|
||||
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
|
||||
from examples.support.runner import configure
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
narrators = [{"voice_id": "wDRBdcyPzQOCeq51IxW5",
|
||||
"prompt": "Describe the image in nine words."},
|
||||
{"voice_id": "M3bAX0o3Ptb2l6XqwQJV",
|
||||
"prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show."},
|
||||
{"voice_id": "lJm5d2ZZ3UE4qYOxl2t7",
|
||||
"prompt": "Describe the image in one sentence, in the style of Oprah Winfrey."},
|
||||
{"voice_id": "7SNUlQ8GAbnZxRO9CKOt",
|
||||
"prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England."},
|
||||
{"voice_id": "gvpBhHjzfd7M2WedYVUI",
|
||||
"prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek."},
|
||||
{"voice_id": "bnyr1EF3snReVXauGBNn",
|
||||
"prompt": "Describe the image in one sentence, in the style of Maya Angelou."}]
|
||||
|
||||
# random.shuffle(narrators)
|
||||
print(f"$$$ narrators: {narrators}")
|
||||
narrator = {"narrator": narrators[0]}
|
||||
|
||||
|
||||
class TranslationProcessor(FrameProcessor):
|
||||
def __init__(self, in_language, out_language):
|
||||
self._in_language = in_language
|
||||
self._out_language = out_language
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, TextFrame):
|
||||
context = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": f"You will be provided with a sentence in {self._in_language}, and your task is to translate it into {self._out_language}.",
|
||||
},
|
||||
{"role": "user", "content": frame.text},
|
||||
]
|
||||
|
||||
yield LLMMessagesQueueFrame(context)
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class NarratorShuffle(FrameProcessor):
|
||||
def __init__(self, narrator, narrators):
|
||||
self._narrator = narrator
|
||||
self._narrators = narrators
|
||||
self._i = 0
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, (ImageFrame, TelestratorImageFrame)):
|
||||
self._i += 1
|
||||
if self._i >= len(self._narrators):
|
||||
print(f"### shuffling narrators")
|
||||
random.shuffle(self._narrators)
|
||||
self._i = 0
|
||||
|
||||
self._narrator["narrator"] = self._narrators[self._i]
|
||||
print(f"### new narrator is {self._narrator}")
|
||||
yield frame
|
||||
|
||||
|
||||
class VideoImageFrameProcessor(FrameProcessor):
|
||||
def __init__(self, narrator):
|
||||
self._narrator = narrator
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, (VideoImageFrame, TelestratorImageFrame)):
|
||||
yield VisionFrame(self._narrator["narrator"]["prompt"], frame.image)
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class ImageRefresher(FrameProcessor):
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, LLMResponseEndFrame):
|
||||
yield RequestVideoImageFrame(participantId=None)
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class TelestratorImageWrapper(FrameProcessor):
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, ImageFrame):
|
||||
yield TelestratorImageFrame(None, frame.image)
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransportService(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
duration_minutes=5,
|
||||
start_transcription=True,
|
||||
mic_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
camera_enabled=True,
|
||||
camera_width=1024,
|
||||
camera_height=1024,
|
||||
vad_enabled=False,
|
||||
receive_video=True,
|
||||
receive_video_fps=0
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
narrator=narrator,
|
||||
aggregate_sentences=False
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
|
||||
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
|
||||
vifp = VideoImageFrameProcessor(narrator)
|
||||
ir = ImageRefresher()
|
||||
img = FalImageGenService(
|
||||
image_size="1024x1024",
|
||||
aiohttp_session=session,
|
||||
key_id=os.getenv("FAL_KEY_ID"),
|
||||
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||
)
|
||||
tiw = TelestratorImageWrapper()
|
||||
lfra = LLMFullResponseAggregator()
|
||||
lfra1 = LLMFullResponseAggregator()
|
||||
lfra2 = LLMFullResponseAggregator()
|
||||
lfra3 = LLMFullResponseAggregator()
|
||||
lfra4 = LLMFullResponseAggregator()
|
||||
fl0 = FrameLogger("@@@ About to describe")
|
||||
fl1 = FrameLogger("!!! About to image gen")
|
||||
f4 = FrameLogger("((( partway through )))")
|
||||
f5 = FrameLogger("!!! f5")
|
||||
ns = NarratorShuffle(narrator, narrators)
|
||||
t1 = TranslationProcessor("English", "Spanish")
|
||||
t2 = TranslationProcessor("Spanish", "German")
|
||||
t3 = TranslationProcessor("German", "Japanese")
|
||||
t4 = TranslationProcessor("Japanese", "English")
|
||||
pipeline = Pipeline(
|
||||
processors=[
|
||||
fl0,
|
||||
vifp,
|
||||
vs,
|
||||
lfra,
|
||||
tts,
|
||||
fl1,
|
||||
img,
|
||||
tiw,
|
||||
],
|
||||
)
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
|
||||
|
||||
transport.transcription_settings["extra"]["endpointing"] = True
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
await transport.run(pipeline)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
162
src/examples/starter-apps/telestrator/telestrator.py
Normal file
162
src/examples/starter-apps/telestrator/telestrator.py
Normal file
@@ -0,0 +1,162 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame
|
||||
from dailyai.pipeline.pipeline import Pipeline
|
||||
from dailyai.pipeline.frame_processor import FrameProcessor
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
from dailyai.services.deepgram_ai_services import DeepgramTTSService
|
||||
from dailyai.services.ai_services import FrameLogger
|
||||
from dailyai.pipeline.aggregators import (
|
||||
LLMAssistantContextAggregator,
|
||||
LLMUserContextAggregator,
|
||||
LLMFullResponseAggregator
|
||||
)
|
||||
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
|
||||
from examples.support.runner import configure
|
||||
|
||||
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
|
||||
logger = logging.getLogger("dailyai")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
narrators = [{"voice_id": "wDRBdcyPzQOCeq51IxW5",
|
||||
"prompt": "Describe the image in one sentence, in the style of David Attenborough."},
|
||||
{"voice_id": "M3bAX0o3Ptb2l6XqwQJV",
|
||||
"prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show."},
|
||||
{"voice_id": "lJm5d2ZZ3UE4qYOxl2t7",
|
||||
"prompt": "Describe the image in one sentence, in the style of Oprah Winfrey."},
|
||||
{"voice_id": "7SNUlQ8GAbnZxRO9CKOt",
|
||||
"prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England."},
|
||||
{"voice_id": "gvpBhHjzfd7M2WedYVUI",
|
||||
"prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek."},
|
||||
{"voice_id": "bnyr1EF3snReVXauGBNn",
|
||||
"prompt": "Describe the image in one sentence, in the style of Maya Angelou."}]
|
||||
|
||||
random.shuffle(narrators)
|
||||
print(f"$$$ narrators: {narrators}")
|
||||
narrator = {"narrator": narrators[0]}
|
||||
|
||||
|
||||
class NarratorShuffle(FrameProcessor):
|
||||
def __init__(self, narrator, narrators):
|
||||
self._narrator = narrator
|
||||
self._narrators = narrators
|
||||
self._i = 0
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, (ImageFrame, TelestratorImageFrame)):
|
||||
self._i += 1
|
||||
if self._i >= len(self._narrators):
|
||||
print(f"### shuffling narrators")
|
||||
random.shuffle(self._narrators)
|
||||
self._i = 0
|
||||
|
||||
self._narrator["narrator"] = self._narrators[self._i]
|
||||
print(f"### new narrator is {self._narrator}")
|
||||
yield frame
|
||||
|
||||
|
||||
class VideoImageFrameProcessor(FrameProcessor):
|
||||
def __init__(self, narrator):
|
||||
self._narrator = narrator
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, (VideoImageFrame, TelestratorImageFrame)):
|
||||
yield VisionFrame(self._narrator["narrator"]["prompt"], frame.image)
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class ImageRefresher(FrameProcessor):
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, LLMResponseEndFrame):
|
||||
yield RequestVideoImageFrame(participantId=None)
|
||||
yield frame
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
class TelestratorImageWrapper(FrameProcessor):
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, ImageFrame):
|
||||
yield TelestratorImageFrame(None, frame.image)
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransportService(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
duration_minutes=5,
|
||||
start_transcription=True,
|
||||
mic_enabled=True,
|
||||
mic_sample_rate=16000,
|
||||
camera_enabled=True,
|
||||
camera_width=1024,
|
||||
camera_height=1024,
|
||||
vad_enabled=False,
|
||||
receive_video=True,
|
||||
receive_video_fps=0
|
||||
)
|
||||
|
||||
tts = ElevenLabsTTSService(
|
||||
aiohttp_session=session,
|
||||
api_key=os.getenv("ELEVENLABS_API_KEY"),
|
||||
narrator=narrator,
|
||||
aggregate_sentences=False
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(
|
||||
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
|
||||
model="gpt-4-turbo-preview")
|
||||
|
||||
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
|
||||
vifp = VideoImageFrameProcessor(narrator)
|
||||
ir = ImageRefresher()
|
||||
img = FalImageGenService(
|
||||
image_size="1024x1024",
|
||||
aiohttp_session=session,
|
||||
key_id=os.getenv("FAL_KEY_ID"),
|
||||
key_secret=os.getenv("FAL_KEY_SECRET"),
|
||||
)
|
||||
tiw = TelestratorImageWrapper()
|
||||
lfra = LLMFullResponseAggregator()
|
||||
fl0 = FrameLogger("@@@ About to describe")
|
||||
fl1 = FrameLogger("!!! About to image gen")
|
||||
ns = NarratorShuffle(narrator, narrators)
|
||||
pipeline = Pipeline(
|
||||
processors=[
|
||||
ns,
|
||||
fl0,
|
||||
vifp,
|
||||
vs,
|
||||
lfra,
|
||||
tts,
|
||||
fl1,
|
||||
img,
|
||||
tiw,
|
||||
],
|
||||
)
|
||||
|
||||
@transport.event_handler("on_first_other_participant_joined")
|
||||
async def on_first_other_participant_joined(transport):
|
||||
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
|
||||
|
||||
transport.transcription_settings["extra"]["endpointing"] = True
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
await transport.run(pipeline)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
(url, token) = configure()
|
||||
asyncio.run(main(url, token))
|
||||
@@ -26,7 +26,8 @@ logger.setLevel(logging.DEBUG)
|
||||
|
||||
"""
|
||||
This example looks a bit different than the chatbot example, because it isn't waiting on the user to stop talking to start translating.
|
||||
It also isn't saving what the user or bot says into the context object for use in subsequent interactions.
|
||||
It also isn't saving what the user or bot says into the context object for use in subsequent interactions. This example also sends
|
||||
the translated text back to the transport as an App Message, so clients can show subtitles.
|
||||
"""
|
||||
|
||||
|
||||
@@ -50,6 +51,20 @@ class TranslationProcessor(FrameProcessor):
|
||||
yield frame
|
||||
|
||||
|
||||
class TranslationSubtitles(FrameProcessor):
|
||||
def __init__(self, language):
|
||||
self._language = language
|
||||
|
||||
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
|
||||
if isinstance(frame, TextFrame):
|
||||
app_message = {
|
||||
"language": self._language,
|
||||
"text": frame.text
|
||||
}
|
||||
else:
|
||||
yield frame
|
||||
|
||||
|
||||
async def main(room_url: str, token):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
transport = DailyTransportService(
|
||||
@@ -73,7 +88,8 @@ async def main(room_url: str, token):
|
||||
model="gpt-4-turbo-preview")
|
||||
sa = SentenceAggregator()
|
||||
tp = TranslationProcessor("Spanish")
|
||||
pipeline = Pipeline([sa, tp, llm, tts])
|
||||
ts = TranslationSubtitles("Spanish")
|
||||
pipeline = Pipeline([sa, tp, llm, tts, ts])
|
||||
|
||||
transport.transcription_settings["extra"]["endpointing"] = True
|
||||
transport.transcription_settings["extra"]["punctuate"] = True
|
||||
|
||||
Reference in New Issue
Block a user