Compare commits

...

7 Commits

Author SHA1 Message Date
Chad Bailey
c73fb4750f added fuzz example 2024-03-22 14:20:16 +00:00
Chad Bailey
34b10cb4c7 wip 2024-03-19 22:04:47 +00:00
Chad Bailey
e726f15c4e wip: telestrator 2024-03-19 15:31:19 +00:00
Chad Bailey
25ca8b751e cleanup 2024-03-19 03:08:04 +00:00
Chad Bailey
0b4b63d2ee Working vision example 2024-03-19 01:51:36 +00:00
Chad Bailey
6c9425d66a wip: video image frames 2024-03-18 22:14:02 +00:00
Chad Bailey
6d3c52ae81 added app message 2024-03-18 19:52:31 +00:00
17 changed files with 1287 additions and 18 deletions

View File

@@ -252,9 +252,15 @@ class LLMFullResponseAggregator(FrameProcessor):
self.aggregation = ""
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if not isinstance(frame, AudioFrame):
print(f"^^^ LFRA got frame: {frame}")
if isinstance(frame, TextFrame):
self.aggregation += frame.text
print(
f"^^^ LFRA got textframe. aggregation is now {self.aggregation}")
elif isinstance(frame, LLMResponseEndFrame):
print(
f"^^^ LFRA got an llmresponseendframe. About to yield aggregation: {self.aggregation}")
yield TextFrame(self.aggregation)
yield frame
self.aggregation = ""

View File

@@ -179,3 +179,33 @@ class LLMFunctionCallFrame(Frame):
"""Emitted when the LLM has received an entire function call completion."""
function_name: str
arguments: str
@dataclass()
class VideoImageFrame(Frame):
"""Contains a still image from a partcipant's video stream."""
participantId: str
image: bytes
# def __str__(self):
# return f"{self.__class__.__name__}, participantId: {self.participantId}, image size: {len(self.image)} B"
class TelestratorImageFrame(ImageFrame):
pass
@dataclass()
class VisionFrame(Frame):
prompt: str
image: bytes
# def __str__(self):
# return f"{self.__class__.__name__}, prompt: {self.prompt}, image size: {len(self.image)} B"
@dataclass()
class RequestVideoImageFrame(Frame):
"""Send to the transport to request a new video image from a specific participant. Leave participantId
empty to request a frame from all participants."""
participantId: str | None

View File

@@ -18,6 +18,7 @@ from dailyai.pipeline.frames import (
Frame,
TextFrame,
TranscriptionQueueFrame,
VisionFrame
)
from abc import abstractmethod
@@ -61,6 +62,7 @@ class TTSService(AIService):
yield TextFrame(self.current_sentence)
if not isinstance(frame, TextFrame):
print(f"*** tts yielding non-text: {frame}")
yield frame
return
@@ -79,6 +81,7 @@ class TTSService(AIService):
# note we pass along the text frame *after* the audio, so the text
# frame is completed after the audio is processed.
print(f"*** tts yielding text: {text}")
yield TextFrame(text)
@@ -133,14 +136,36 @@ class STTService(AIService):
yield TranscriptionQueueFrame(text, "", str(time.time()))
class VisionService(AIService):
def __init__(self):
super().__init__()
# Renders the image. Returns an Image object.
# TODO-CB: return type
@abstractmethod
async def run_vision(self, prompt: str, image: bytes):
pass
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, VisionFrame):
async for frame in self.run_vision(frame.prompt, frame.image):
print(
f"&&& visionservce processframe got frame to yield: {frame}")
yield frame
yield LLMResponseEndFrame()
else:
yield frame
class FrameLogger(AIService):
def __init__(self, prefix="Frame", **kwargs):
super().__init__(**kwargs)
self.prefix = prefix
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, (AudioFrame, ImageFrame)):
self.logger.info(f"{self.prefix}: {type(frame)}")
if isinstance(frame, (AudioFrame)):
# self.logger.info(f"{self.prefix}: {type(frame)}")
pass
else:
print(f"{self.prefix}: {frame}")

View File

@@ -24,6 +24,8 @@ from dailyai.pipeline.frames import (
TextFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
RequestVideoImageFrame,
TelestratorImageFrame
)
from dailyai.pipeline.pipeline import Pipeline
from dailyai.services.ai_services import TTSService
@@ -90,7 +92,9 @@ class BaseTransportService:
self._vad_stop_s = kwargs.get("vad_stop_s") or 0.8
self._context = kwargs.get("context") or []
self._vad_enabled = kwargs.get("vad_enabled") or False
self._receive_video = kwargs.get("receive_video") or False
self._receive_video_fps = kwargs.get("receive_video_fps") or 0.0
self._participant_frame_times = {}
if self._vad_enabled and self._speaker_enabled:
raise Exception(
"Sorry, you can't use speaker_enabled and vad_enabled at the same time. Please set one to False."
@@ -441,6 +445,7 @@ class BaseTransportService:
# discard them
if not self._is_interrupted.is_set():
if frame:
if isinstance(frame, AudioFrame):
chunk = frame.data
@@ -452,6 +457,12 @@ class BaseTransportService:
self.write_frame_to_mic(
bytes(b[:truncated_length]))
b = b[truncated_length:]
elif isinstance(frame, TelestratorImageFrame):
self._set_image(frame.image)
asyncio.run_coroutine_threadsafe(
self.receive_queue.put(frame),
self._loop,
)
elif isinstance(frame, ImageFrame):
self._set_image(frame.image)
elif isinstance(frame, SpriteFrame):
@@ -459,6 +470,15 @@ class BaseTransportService:
elif isinstance(frame, SendAppMessageFrame):
self.send_app_message(
frame.message, frame.participantId)
elif isinstance(frame, RequestVideoImageFrame):
# removing one or all participant IDs from _participant_frame_times
# will cause the transport to send the next available frame from
# that participant
if frame.participantId:
self._participant_frame_times.pop(
frame.participantId, None)
else:
self._participant_frame_times.clear()
elif len(b):
self.write_frame_to_mic(bytes(b))
b = bytearray()

View File

@@ -2,6 +2,7 @@ import asyncio
import inspect
import logging
import signal
import time
import threading
import types
@@ -11,6 +12,8 @@ from typing import Any
from dailyai.pipeline.frames import (
ReceivedAppMessageFrame,
TranscriptionQueueFrame,
VideoImageFrame,
TelestratorImageFrame
)
from threading import Event
@@ -204,11 +207,12 @@ class DailyTransportService(BaseTransportService, EventHandler):
)
self._my_participant_id = self.client.participants()["local"]["id"]
self.client.update_subscription_profiles({
"base": {
"camera": "unsubscribed",
}
})
if not self._receive_video:
self.client.update_subscription_profiles({
"base": {
"camera": "unsubscribed",
}
})
if self._token and self._start_transcription:
self.client.start_transcription(self.transcription_settings)
@@ -225,6 +229,31 @@ class DailyTransportService(BaseTransportService, EventHandler):
self.client.leave()
self.client.release()
def _handle_video_frame(self, participant_id, video_frame):
"""If receive_video is true, this function is called once for each frame from each participant. We
don't need to send every frame to the pipeline, so there are two ways to decide how to send frames:
1. Set a greater-than-zero value for receive_video_fps. The transport will track the last send time
for each participant and send a new frame when the requested frame rate has elapsed. This
guarantees an image every second, for example.
2. Set receive_video_fps less than or equal to zero to disable timed frame sending. Then, put a
RequestVideoImageFrame in the pipeline to get a new frame for one or all participants. By
sending a RequestVideoImageFrame immediately after successfully processing an image, you can
ensure you don't end up queueing up frames faster than you can process them.
"""
send_frame = False
if not participant_id in self._participant_frame_times:
# then it's a new participant; send the first frame
send_frame = True
elif self._receive_video_fps > 0 and time.time() > self._participant_frame_times[participant_id] + 1.0/self._receive_video_fps:
# Then it's an existing participant who is due to send a new frame
send_frame = True
if send_frame:
self._participant_frame_times[participant_id] = time.time()
future = asyncio.run_coroutine_threadsafe(
self.receive_queue.put(
VideoImageFrame(participant_id, video_frame)), self._loop)
def on_first_other_participant_joined(self):
pass
@@ -248,6 +277,9 @@ class DailyTransportService(BaseTransportService, EventHandler):
if not self._other_participant_has_joined and participant["id"] != self._my_participant_id:
self._other_participant_has_joined = True
self.on_first_other_participant_joined()
if self._receive_video:
self.client.set_video_renderer(
participant["id"], self._handle_video_frame)
def on_participant_left(self, participant, reason):
if len(self.client.participants()) < self._min_others_count + 1:

View File

@@ -15,18 +15,19 @@ class ElevenLabsTTSService(TTSService):
*,
aiohttp_session: aiohttp.ClientSession,
api_key,
voice_id,
narrator,
model="eleven_turbo_v2",
aggregate_sentences=True
):
super().__init__()
super().__init__(aggregate_sentences)
self._api_key = api_key
self._voice_id = voice_id
self._narrator = narrator
self._aiohttp_session = aiohttp_session
self._model = model
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream"
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._narrator['narrator']['voice_id']}/stream"
payload = {"text": sentence, "model_id": self._model}
querystring = {
"output_format": "pcm_16000",
@@ -35,6 +36,7 @@ class ElevenLabsTTSService(TTSService):
"xi-api-key": self._api_key,
"Content-Type": "application/json",
}
async with self._aiohttp_session.post(
url, json=payload, headers=headers, params=querystring
) as r:

View File

@@ -53,4 +53,7 @@ class FalImageGenService(ImageGenService):
async with self._aiohttp_session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
return (image_url, image.tobytes())
image_bytes = image.tobytes()
print(f"!!! fal image tobytes is:")
print(image)
return (image_url, image_bytes)

View File

@@ -2,13 +2,22 @@ import aiohttp
from PIL import Image
import io
import time
from openai import AsyncOpenAI
import base64
from openai import AsyncOpenAI, AsyncStream
import json
from collections.abc import AsyncGenerator
from dailyai.services.ai_services import LLMService, ImageGenService
from openai.types.chat import (
ChatCompletion,
ChatCompletionChunk,
ChatCompletionMessageParam,
)
from daily import VideoFrame
from dailyai.services.ai_services import LLMService, ImageGenService, VisionService
from dailyai.services.openai_api_llm_service import BaseOpenAILLMService
from dailyai.pipeline.frames import TextFrame
class OpenAILLMService(BaseOpenAILLMService):
@@ -50,3 +59,67 @@ class OpenAIImageGenService(ImageGenService):
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
return (image_url, image.tobytes())
class OpenAIVisionService(VisionService):
def __init__(
self,
*,
model="gpt-4-vision-preview",
api_key,
):
self._model = model
self._client = AsyncOpenAI(api_key=api_key)
async def run_vision(self, prompt: str, image: bytes):
if isinstance(image, VideoFrame):
# Then it's from a daily video frame
print("### processing daily video frame for recognition")
IMAGE_WIDTH = image.width
IMAGE_HEIGHT = image.height
COLOR_FORMAT = image.color_format
a_image = Image.frombytes(
'RGBA', (IMAGE_WIDTH, IMAGE_HEIGHT), image.buffer)
new_image = a_image.convert('RGB')
else:
# handle it as a byte stream from image gen
new_image = Image.frombytes('RGB', (1024, 1024), image)
# Uncomment these lines to write the frame to a jpg in the same directory.
# current_path = os.getcwd()
# image_path = os.path.join(current_path, "image.jpg")
# image.save(image_path, format="JPEG")
jpeg_buffer = io.BytesIO()
new_image.save(jpeg_buffer, format='JPEG')
jpeg_bytes = jpeg_buffer.getvalue()
base64_image = base64.b64encode(jpeg_bytes).decode('utf-8')
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
},
},
],
}
]
chunks: AsyncStream[ChatCompletionChunk] = (
await self._client.chat.completions.create(
model=self._model,
stream=True,
messages=messages,
)
)
async for chunk in chunks:
print(f"%%% chunk: {chunk}")
if len(chunk.choices) == 0:
continue
if chunk.choices[0].delta.content:
yield TextFrame(chunk.choices[0].delta.content)

View File

@@ -0,0 +1,97 @@
import asyncio
import aiohttp
import logging
import os
from typing import AsyncGenerator
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
from dailyai.services.deepgram_ai_services import DeepgramTTSService
from dailyai.services.ai_services import FrameLogger
from dailyai.pipeline.aggregators import (
LLMAssistantContextAggregator,
LLMUserContextAggregator,
)
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
from examples.support.runner import configure
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
class VideoImageFrameProcessor(FrameProcessor):
def __init__(self):
pass
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, VideoImageFrame):
yield VisionFrame("Describe the image in one sentence.", frame.image)
else:
yield frame
class ImageRefresher(FrameProcessor):
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, LLMResponseEndFrame):
yield RequestVideoImageFrame(participantId=None)
yield frame
else:
yield frame
async def main(room_url: str, token):
async with aiohttp.ClientSession() as session:
transport = DailyTransportService(
room_url,
token,
"Respond bot",
duration_minutes=5,
start_transcription=True,
mic_enabled=True,
mic_sample_rate=16000,
camera_enabled=False,
vad_enabled=True,
receive_video=True,
receive_video_fps=0
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
model="gpt-4-turbo-preview")
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
vifp = VideoImageFrameProcessor()
ir = ImageRefresher()
pipeline = Pipeline(
processors=[
vifp,
vs,
llm,
tts,
ir,
],
)
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
transport.transcription_settings["extra"]["endpointing"] = True
transport.transcription_settings["extra"]["punctuate"] = True
await transport.run(pipeline)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))

View File

@@ -124,7 +124,6 @@ async def main(room_url: str, token):
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
print(f"!!! in here, pipeline.source is {pipeline.source}")
await pipeline.queue_frames([LLMMessagesQueueFrame(messages)])
async def run_conversation():

View File

@@ -0,0 +1,100 @@
import asyncio
import aiohttp
import logging
import os
from typing import AsyncGenerator
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
from dailyai.services.fal_ai_services import FalImageGenService
from dailyai.services.deepgram_ai_services import DeepgramTTSService
from dailyai.services.ai_services import FrameLogger
from dailyai.pipeline.aggregators import (
LLMAssistantContextAggregator,
LLMUserContextAggregator,
)
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
from examples.support.runner import configure
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
class VideoImageFrameProcessor(FrameProcessor):
def __init__(self):
pass
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, VideoImageFrame):
yield VisionFrame("Describe the image in one sentence.", frame.image)
else:
yield frame
class ImageRefresher(FrameProcessor):
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, LLMResponseEndFrame):
yield RequestVideoImageFrame(participantId=None)
yield frame
else:
yield frame
async def main(room_url: str, token):
async with aiohttp.ClientSession() as session:
transport = DailyTransportService(
room_url,
token,
"Respond bot",
duration_minutes=5,
start_transcription=True,
mic_enabled=True,
mic_sample_rate=16000,
camera_enabled=True,
camera_width=1024,
camera_height=1024,
vad_enabled=False,
receive_video=True,
receive_video_fps=0
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
model="gpt-4-turbo-preview")
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
vifp = VideoImageFrameProcessor()
ir = ImageRefresher()
pipeline = Pipeline(
processors=[
vifp,
vs,
tts,
ir,
],
)
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
transport.transcription_settings["extra"]["endpointing"] = True
transport.transcription_settings["extra"]["punctuate"] = True
await transport.run(pipeline)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))

View File

@@ -0,0 +1,112 @@
import asyncio
import aiohttp
import logging
import os
from typing import AsyncGenerator
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame, TranscriptionQueueFrame, TextFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
from dailyai.services.fal_ai_services import FalImageGenService
from dailyai.services.deepgram_ai_services import DeepgramTTSService
from dailyai.services.ai_services import FrameLogger
from dailyai.pipeline.aggregators import (
LLMAssistantContextAggregator,
LLMUserContextAggregator,
)
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
from examples.support.runner import configure
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
class VADAggregator(FrameProcessor):
def __init__(self):
self.aggregating = False
self.aggregation = ""
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, UserStartedSpeakingFrame):
self.aggregating = True
elif isinstance(frame, UserStoppedSpeakingFrame):
self.aggregating = False
# Sometimes VAD triggers quickly on and off. If we don't get any transcription,
# it creates empty LLM message queue frames
if len(self.aggregation) > 0:
yield TextFrame(self.aggregation)
self.aggregation = ""
yield frame
elif isinstance(frame, TranscriptionQueueFrame) and self.aggregating:
self.aggregation += f" {frame.text}"
else:
yield frame
async def main(room_url: str, token):
async with aiohttp.ClientSession() as session:
transport = DailyTransportService(
room_url,
token,
"Respond bot",
duration_minutes=5,
start_transcription=True,
mic_enabled=True,
mic_sample_rate=16000,
camera_enabled=True,
camera_width=1024,
camera_height=1024,
vad_enabled=True,
receive_video=True,
receive_video_fps=0,
vad_timeout_s=1.0
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
model="gpt-4-turbo-preview")
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
vad = VADAggregator()
img = FalImageGenService(
image_size="1024x1024",
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
key_secret=os.getenv("FAL_KEY_SECRET"),
)
fl = FrameLogger("!!! Start")
fl2 = FrameLogger("!!! AFTER VAD")
fl3 = FrameLogger("!!! After img")
pipeline = Pipeline(
processors=[
fl,
vad,
fl2,
img,
fl3
],
)
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
transport.transcription_settings["extra"]["endpointing"] = True
transport.transcription_settings["extra"]["punctuate"] = True
await transport.run(pipeline)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))

View File

@@ -0,0 +1,210 @@
import asyncio
import aiohttp
import logging
import os
import random
from typing import AsyncGenerator
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame, TextFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
from dailyai.services.fal_ai_services import FalImageGenService
from dailyai.services.deepgram_ai_services import DeepgramTTSService
from dailyai.services.ai_services import FrameLogger
from dailyai.pipeline.aggregators import (
LLMAssistantContextAggregator,
LLMUserContextAggregator,
LLMFullResponseAggregator
)
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
from examples.support.runner import configure
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
narrators = [{"voice_id": "wDRBdcyPzQOCeq51IxW5",
"prompt": "Describe the image in one sentence."},
{"voice_id": "M3bAX0o3Ptb2l6XqwQJV",
"prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show."},
{"voice_id": "lJm5d2ZZ3UE4qYOxl2t7",
"prompt": "Describe the image in one sentence, in the style of Oprah Winfrey."},
{"voice_id": "7SNUlQ8GAbnZxRO9CKOt",
"prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England."},
{"voice_id": "gvpBhHjzfd7M2WedYVUI",
"prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek."},
{"voice_id": "bnyr1EF3snReVXauGBNn",
"prompt": "Describe the image in one sentence, in the style of Maya Angelou."}]
# random.shuffle(narrators)
print(f"$$$ narrators: {narrators}")
narrator = {"narrator": narrators[0]}
class TranslationProcessor(FrameProcessor):
def __init__(self, in_language, out_language):
self._in_language = in_language
self._out_language = out_language
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, TextFrame):
context = [
{
"role": "system",
"content": f"You will be provided with a sentence in {self._in_language}, and your task is to translate it into {self._out_language}.",
},
{"role": "user", "content": frame.text},
]
yield LLMMessagesQueueFrame(context)
else:
yield frame
class NarratorShuffle(FrameProcessor):
def __init__(self, narrator, narrators):
self._narrator = narrator
self._narrators = narrators
self._i = 0
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, (ImageFrame, TelestratorImageFrame)):
self._i += 1
if self._i >= len(self._narrators):
print(f"### shuffling narrators")
random.shuffle(self._narrators)
self._i = 0
self._narrator["narrator"] = self._narrators[self._i]
print(f"### new narrator is {self._narrator}")
yield frame
class VideoImageFrameProcessor(FrameProcessor):
def __init__(self, narrator):
self._narrator = narrator
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, (VideoImageFrame, TelestratorImageFrame)):
yield VisionFrame(self._narrator["narrator"]["prompt"], frame.image)
else:
yield frame
class ImageRefresher(FrameProcessor):
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, LLMResponseEndFrame):
yield RequestVideoImageFrame(participantId=None)
yield frame
else:
yield frame
class TelestratorImageWrapper(FrameProcessor):
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, ImageFrame):
yield TelestratorImageFrame(None, frame.image)
else:
yield frame
async def main(room_url: str, token):
async with aiohttp.ClientSession() as session:
transport = DailyTransportService(
room_url,
token,
"Respond bot",
duration_minutes=5,
start_transcription=True,
mic_enabled=True,
mic_sample_rate=16000,
camera_enabled=True,
camera_width=1024,
camera_height=576,
vad_enabled=False,
receive_video=True,
receive_video_fps=0
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
narrator=narrator,
aggregate_sentences=False
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
model="gpt-4-turbo-preview")
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
vifp = VideoImageFrameProcessor(narrator)
ir = ImageRefresher()
img = FalImageGenService(
image_size="1024x1024",
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
key_secret=os.getenv("FAL_KEY_SECRET"),
)
tiw = TelestratorImageWrapper()
lfra = LLMFullResponseAggregator()
lfra1 = LLMFullResponseAggregator()
lfra2 = LLMFullResponseAggregator()
lfra3 = LLMFullResponseAggregator()
lfra4 = LLMFullResponseAggregator()
fl0 = FrameLogger("@@@ About to describe")
fl1 = FrameLogger("!!! About to image gen")
f4 = FrameLogger("((( partway through )))")
f5 = FrameLogger("!!! f5")
ns = NarratorShuffle(narrator, narrators)
t1 = TranslationProcessor("English", "Spanish")
t2 = TranslationProcessor("Spanish", "German")
t3 = TranslationProcessor("German", "Japanese")
t4 = TranslationProcessor("Japanese", "English")
pipeline = Pipeline(
processors=[
fl0,
vifp,
vs,
lfra,
tts,
f4,
t1,
llm,
lfra1,
f5,
tts,
t2,
llm,
lfra2,
tts,
t3,
llm,
lfra3,
tts,
t4,
llm,
lfra4,
tts,
fl1,
img,
tiw,
],
)
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
transport.transcription_settings["extra"]["endpointing"] = True
transport.transcription_settings["extra"]["punctuate"] = True
await transport.run(pipeline)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))

View File

@@ -0,0 +1,191 @@
import asyncio
import aiohttp
import logging
import os
import random
from typing import AsyncGenerator
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame, TextFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
from dailyai.services.fal_ai_services import FalImageGenService
from dailyai.services.deepgram_ai_services import DeepgramTTSService
from dailyai.services.ai_services import FrameLogger
from dailyai.pipeline.aggregators import (
LLMAssistantContextAggregator,
LLMUserContextAggregator,
LLMFullResponseAggregator
)
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
from examples.support.runner import configure
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
narrators = [{"voice_id": "wDRBdcyPzQOCeq51IxW5",
"prompt": "Describe the image in a haiku."},
{"voice_id": "M3bAX0o3Ptb2l6XqwQJV",
"prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show."},
{"voice_id": "lJm5d2ZZ3UE4qYOxl2t7",
"prompt": "Describe the image in one sentence, in the style of Oprah Winfrey."},
{"voice_id": "7SNUlQ8GAbnZxRO9CKOt",
"prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England."},
{"voice_id": "gvpBhHjzfd7M2WedYVUI",
"prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek."},
{"voice_id": "bnyr1EF3snReVXauGBNn",
"prompt": "Describe the image in one sentence, in the style of Maya Angelou."}]
# random.shuffle(narrators)
print(f"$$$ narrators: {narrators}")
narrator = {"narrator": narrators[0]}
class TranslationProcessor(FrameProcessor):
def __init__(self, in_language, out_language):
self._in_language = in_language
self._out_language = out_language
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, TextFrame):
context = [
{
"role": "system",
"content": f"You will be provided with a sentence in {self._in_language}, and your task is to translate it into {self._out_language}.",
},
{"role": "user", "content": frame.text},
]
yield LLMMessagesQueueFrame(context)
else:
yield frame
class NarratorShuffle(FrameProcessor):
def __init__(self, narrator, narrators):
self._narrator = narrator
self._narrators = narrators
self._i = 0
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, (ImageFrame, TelestratorImageFrame)):
self._i += 1
if self._i >= len(self._narrators):
print(f"### shuffling narrators")
random.shuffle(self._narrators)
self._i = 0
self._narrator["narrator"] = self._narrators[self._i]
print(f"### new narrator is {self._narrator}")
yield frame
class VideoImageFrameProcessor(FrameProcessor):
def __init__(self, narrator):
self._narrator = narrator
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, (VideoImageFrame, TelestratorImageFrame)):
yield VisionFrame(self._narrator["narrator"]["prompt"], frame.image)
else:
yield frame
class ImageRefresher(FrameProcessor):
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, LLMResponseEndFrame):
yield RequestVideoImageFrame(participantId=None)
yield frame
else:
yield frame
class TelestratorImageWrapper(FrameProcessor):
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, ImageFrame):
yield TelestratorImageFrame(None, frame.image)
else:
yield frame
async def main(room_url: str, token):
async with aiohttp.ClientSession() as session:
transport = DailyTransportService(
room_url,
token,
"Respond bot",
duration_minutes=5,
start_transcription=True,
mic_enabled=True,
mic_sample_rate=16000,
camera_enabled=True,
camera_width=1024,
camera_height=1024,
vad_enabled=False,
receive_video=True,
receive_video_fps=0
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
narrator=narrator,
aggregate_sentences=False
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
model="gpt-4-turbo-preview")
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
vifp = VideoImageFrameProcessor(narrator)
ir = ImageRefresher()
img = FalImageGenService(
image_size="1024x1024",
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
key_secret=os.getenv("FAL_KEY_SECRET"),
)
tiw = TelestratorImageWrapper()
lfra = LLMFullResponseAggregator()
lfra1 = LLMFullResponseAggregator()
lfra2 = LLMFullResponseAggregator()
lfra3 = LLMFullResponseAggregator()
lfra4 = LLMFullResponseAggregator()
fl0 = FrameLogger("@@@ About to describe")
fl1 = FrameLogger("!!! About to image gen")
f4 = FrameLogger("((( partway through )))")
f5 = FrameLogger("!!! f5")
ns = NarratorShuffle(narrator, narrators)
t1 = TranslationProcessor("English", "Spanish")
t2 = TranslationProcessor("Spanish", "German")
t3 = TranslationProcessor("German", "Japanese")
t4 = TranslationProcessor("Japanese", "English")
pipeline = Pipeline(
processors=[
fl0,
vifp,
vs,
lfra,
tts,
fl1,
img,
tiw,
],
)
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
transport.transcription_settings["extra"]["endpointing"] = True
transport.transcription_settings["extra"]["punctuate"] = True
await transport.run(pipeline)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))

View File

@@ -0,0 +1,191 @@
import asyncio
import aiohttp
import logging
import os
import random
from typing import AsyncGenerator
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame, TextFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
from dailyai.services.fal_ai_services import FalImageGenService
from dailyai.services.deepgram_ai_services import DeepgramTTSService
from dailyai.services.ai_services import FrameLogger
from dailyai.pipeline.aggregators import (
LLMAssistantContextAggregator,
LLMUserContextAggregator,
LLMFullResponseAggregator
)
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
from examples.support.runner import configure
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
narrators = [{"voice_id": "wDRBdcyPzQOCeq51IxW5",
"prompt": "Describe the image in nine words."},
{"voice_id": "M3bAX0o3Ptb2l6XqwQJV",
"prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show."},
{"voice_id": "lJm5d2ZZ3UE4qYOxl2t7",
"prompt": "Describe the image in one sentence, in the style of Oprah Winfrey."},
{"voice_id": "7SNUlQ8GAbnZxRO9CKOt",
"prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England."},
{"voice_id": "gvpBhHjzfd7M2WedYVUI",
"prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek."},
{"voice_id": "bnyr1EF3snReVXauGBNn",
"prompt": "Describe the image in one sentence, in the style of Maya Angelou."}]
# random.shuffle(narrators)
print(f"$$$ narrators: {narrators}")
narrator = {"narrator": narrators[0]}
class TranslationProcessor(FrameProcessor):
def __init__(self, in_language, out_language):
self._in_language = in_language
self._out_language = out_language
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, TextFrame):
context = [
{
"role": "system",
"content": f"You will be provided with a sentence in {self._in_language}, and your task is to translate it into {self._out_language}.",
},
{"role": "user", "content": frame.text},
]
yield LLMMessagesQueueFrame(context)
else:
yield frame
class NarratorShuffle(FrameProcessor):
def __init__(self, narrator, narrators):
self._narrator = narrator
self._narrators = narrators
self._i = 0
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, (ImageFrame, TelestratorImageFrame)):
self._i += 1
if self._i >= len(self._narrators):
print(f"### shuffling narrators")
random.shuffle(self._narrators)
self._i = 0
self._narrator["narrator"] = self._narrators[self._i]
print(f"### new narrator is {self._narrator}")
yield frame
class VideoImageFrameProcessor(FrameProcessor):
def __init__(self, narrator):
self._narrator = narrator
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, (VideoImageFrame, TelestratorImageFrame)):
yield VisionFrame(self._narrator["narrator"]["prompt"], frame.image)
else:
yield frame
class ImageRefresher(FrameProcessor):
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, LLMResponseEndFrame):
yield RequestVideoImageFrame(participantId=None)
yield frame
else:
yield frame
class TelestratorImageWrapper(FrameProcessor):
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, ImageFrame):
yield TelestratorImageFrame(None, frame.image)
else:
yield frame
async def main(room_url: str, token):
async with aiohttp.ClientSession() as session:
transport = DailyTransportService(
room_url,
token,
"Respond bot",
duration_minutes=5,
start_transcription=True,
mic_enabled=True,
mic_sample_rate=16000,
camera_enabled=True,
camera_width=1024,
camera_height=1024,
vad_enabled=False,
receive_video=True,
receive_video_fps=0
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
narrator=narrator,
aggregate_sentences=False
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
model="gpt-4-turbo-preview")
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
vifp = VideoImageFrameProcessor(narrator)
ir = ImageRefresher()
img = FalImageGenService(
image_size="1024x1024",
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
key_secret=os.getenv("FAL_KEY_SECRET"),
)
tiw = TelestratorImageWrapper()
lfra = LLMFullResponseAggregator()
lfra1 = LLMFullResponseAggregator()
lfra2 = LLMFullResponseAggregator()
lfra3 = LLMFullResponseAggregator()
lfra4 = LLMFullResponseAggregator()
fl0 = FrameLogger("@@@ About to describe")
fl1 = FrameLogger("!!! About to image gen")
f4 = FrameLogger("((( partway through )))")
f5 = FrameLogger("!!! f5")
ns = NarratorShuffle(narrator, narrators)
t1 = TranslationProcessor("English", "Spanish")
t2 = TranslationProcessor("Spanish", "German")
t3 = TranslationProcessor("German", "Japanese")
t4 = TranslationProcessor("Japanese", "English")
pipeline = Pipeline(
processors=[
fl0,
vifp,
vs,
lfra,
tts,
fl1,
img,
tiw,
],
)
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
transport.transcription_settings["extra"]["endpointing"] = True
transport.transcription_settings["extra"]["punctuate"] = True
await transport.run(pipeline)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))

View File

@@ -0,0 +1,162 @@
import asyncio
import aiohttp
import logging
import os
import random
from typing import AsyncGenerator
from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
from dailyai.services.fal_ai_services import FalImageGenService
from dailyai.services.deepgram_ai_services import DeepgramTTSService
from dailyai.services.ai_services import FrameLogger
from dailyai.pipeline.aggregators import (
LLMAssistantContextAggregator,
LLMUserContextAggregator,
LLMFullResponseAggregator
)
from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
from examples.support.runner import configure
logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)
narrators = [{"voice_id": "wDRBdcyPzQOCeq51IxW5",
"prompt": "Describe the image in one sentence, in the style of David Attenborough."},
{"voice_id": "M3bAX0o3Ptb2l6XqwQJV",
"prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show."},
{"voice_id": "lJm5d2ZZ3UE4qYOxl2t7",
"prompt": "Describe the image in one sentence, in the style of Oprah Winfrey."},
{"voice_id": "7SNUlQ8GAbnZxRO9CKOt",
"prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England."},
{"voice_id": "gvpBhHjzfd7M2WedYVUI",
"prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek."},
{"voice_id": "bnyr1EF3snReVXauGBNn",
"prompt": "Describe the image in one sentence, in the style of Maya Angelou."}]
random.shuffle(narrators)
print(f"$$$ narrators: {narrators}")
narrator = {"narrator": narrators[0]}
class NarratorShuffle(FrameProcessor):
def __init__(self, narrator, narrators):
self._narrator = narrator
self._narrators = narrators
self._i = 0
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, (ImageFrame, TelestratorImageFrame)):
self._i += 1
if self._i >= len(self._narrators):
print(f"### shuffling narrators")
random.shuffle(self._narrators)
self._i = 0
self._narrator["narrator"] = self._narrators[self._i]
print(f"### new narrator is {self._narrator}")
yield frame
class VideoImageFrameProcessor(FrameProcessor):
def __init__(self, narrator):
self._narrator = narrator
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, (VideoImageFrame, TelestratorImageFrame)):
yield VisionFrame(self._narrator["narrator"]["prompt"], frame.image)
else:
yield frame
class ImageRefresher(FrameProcessor):
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, LLMResponseEndFrame):
yield RequestVideoImageFrame(participantId=None)
yield frame
else:
yield frame
class TelestratorImageWrapper(FrameProcessor):
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, ImageFrame):
yield TelestratorImageFrame(None, frame.image)
else:
yield frame
async def main(room_url: str, token):
async with aiohttp.ClientSession() as session:
transport = DailyTransportService(
room_url,
token,
"Respond bot",
duration_minutes=5,
start_transcription=True,
mic_enabled=True,
mic_sample_rate=16000,
camera_enabled=True,
camera_width=1024,
camera_height=1024,
vad_enabled=False,
receive_video=True,
receive_video_fps=0
)
tts = ElevenLabsTTSService(
aiohttp_session=session,
api_key=os.getenv("ELEVENLABS_API_KEY"),
narrator=narrator,
aggregate_sentences=False
)
llm = OpenAILLMService(
api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
model="gpt-4-turbo-preview")
vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
vifp = VideoImageFrameProcessor(narrator)
ir = ImageRefresher()
img = FalImageGenService(
image_size="1024x1024",
aiohttp_session=session,
key_id=os.getenv("FAL_KEY_ID"),
key_secret=os.getenv("FAL_KEY_SECRET"),
)
tiw = TelestratorImageWrapper()
lfra = LLMFullResponseAggregator()
fl0 = FrameLogger("@@@ About to describe")
fl1 = FrameLogger("!!! About to image gen")
ns = NarratorShuffle(narrator, narrators)
pipeline = Pipeline(
processors=[
ns,
fl0,
vifp,
vs,
lfra,
tts,
fl1,
img,
tiw,
],
)
@transport.event_handler("on_first_other_participant_joined")
async def on_first_other_participant_joined(transport):
await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
transport.transcription_settings["extra"]["endpointing"] = True
transport.transcription_settings["extra"]["punctuate"] = True
await transport.run(pipeline)
if __name__ == "__main__":
(url, token) = configure()
asyncio.run(main(url, token))

View File

@@ -26,7 +26,8 @@ logger.setLevel(logging.DEBUG)
"""
This example looks a bit different than the chatbot example, because it isn't waiting on the user to stop talking to start translating.
It also isn't saving what the user or bot says into the context object for use in subsequent interactions.
It also isn't saving what the user or bot says into the context object for use in subsequent interactions. This example also sends
the translated text back to the transport as an App Message, so clients can show subtitles.
"""
@@ -50,6 +51,20 @@ class TranslationProcessor(FrameProcessor):
yield frame
class TranslationSubtitles(FrameProcessor):
def __init__(self, language):
self._language = language
async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
if isinstance(frame, TextFrame):
app_message = {
"language": self._language,
"text": frame.text
}
else:
yield frame
async def main(room_url: str, token):
async with aiohttp.ClientSession() as session:
transport = DailyTransportService(
@@ -73,7 +88,8 @@ async def main(room_url: str, token):
model="gpt-4-turbo-preview")
sa = SentenceAggregator()
tp = TranslationProcessor("Spanish")
pipeline = Pipeline([sa, tp, llm, tts])
ts = TranslationSubtitles("Spanish")
pipeline = Pipeline([sa, tp, llm, tts, ts])
transport.transcription_settings["extra"]["endpointing"] = True
transport.transcription_settings["extra"]["punctuate"] = True