added fuzz example

wip
wip: telestrator
2024-03-22 14:20:16 +00:00 · 2024-03-19 22:04:47 +00:00 · 2024-03-19 15:31:19 +00:00 · 2024-03-19 03:08:04 +00:00 · 2024-03-19 01:51:36 +00:00 · 2024-03-18 22:14:02 +00:00
17 changed files with 1287 additions and 18 deletions
--- a/src/dailyai/pipeline/aggregators.py
+++ b/src/dailyai/pipeline/aggregators.py
@@ -252,9 +252,15 @@ class LLMFullResponseAggregator(FrameProcessor):
        self.aggregation = ""

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if not isinstance(frame, AudioFrame):
+            print(f"^^^ LFRA got frame: {frame}")
        if isinstance(frame, TextFrame):
            self.aggregation += frame.text
+            print(
+                f"^^^ LFRA got textframe. aggregation is now {self.aggregation}")
        elif isinstance(frame, LLMResponseEndFrame):
+            print(
+                f"^^^ LFRA got an llmresponseendframe. About to yield aggregation: {self.aggregation}")
            yield TextFrame(self.aggregation)
            yield frame
            self.aggregation = ""
--- a/src/dailyai/pipeline/frames.py
+++ b/src/dailyai/pipeline/frames.py
@@ -179,3 +179,33 @@ class LLMFunctionCallFrame(Frame):
    """Emitted when the LLM has received an entire function call completion."""
    function_name: str
    arguments: str
+
+
+@dataclass()
+class VideoImageFrame(Frame):
+    """Contains a still image from a partcipant's video stream."""
+    participantId: str
+    image: bytes
+
+    # def __str__(self):
+    #     return f"{self.__class__.__name__}, participantId: {self.participantId}, image size: {len(self.image)} B"
+
+
+class TelestratorImageFrame(ImageFrame):
+    pass
+
+
+@dataclass()
+class VisionFrame(Frame):
+    prompt: str
+    image: bytes
+
+    # def __str__(self):
+    #     return f"{self.__class__.__name__}, prompt: {self.prompt}, image size: {len(self.image)} B"
+
+
+@dataclass()
+class RequestVideoImageFrame(Frame):
+    """Send to the transport to request a new video image from a specific participant. Leave participantId
+    empty to request a frame from all participants."""
+    participantId: str | None
--- a/src/dailyai/services/ai_services.py
+++ b/src/dailyai/services/ai_services.py
@@ -18,6 +18,7 @@ from dailyai.pipeline.frames import (
    Frame,
    TextFrame,
    TranscriptionQueueFrame,
+    VisionFrame
 )

 from abc import abstractmethod
@@ -61,6 +62,7 @@ class TTSService(AIService):
                yield TextFrame(self.current_sentence)

        if not isinstance(frame, TextFrame):
+            print(f"*** tts yielding non-text: {frame}")
            yield frame
            return

@@ -79,6 +81,7 @@ class TTSService(AIService):

            # note we pass along the text frame *after* the audio, so the text
            # frame is completed after the audio is processed.
+            print(f"*** tts yielding text: {text}")
            yield TextFrame(text)


@@ -133,14 +136,36 @@ class STTService(AIService):
        yield TranscriptionQueueFrame(text, "", str(time.time()))


+class VisionService(AIService):
+    def __init__(self):
+        super().__init__()
+
+    # Renders the image. Returns an Image object.
+    # TODO-CB: return type
+    @abstractmethod
+    async def run_vision(self, prompt: str, image: bytes):
+        pass
+
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, VisionFrame):
+            async for frame in self.run_vision(frame.prompt, frame.image):
+                print(
+                    f"&&& visionservce processframe got frame to yield: {frame}")
+                yield frame
+            yield LLMResponseEndFrame()
+        else:
+            yield frame
+
+
 class FrameLogger(AIService):
    def __init__(self, prefix="Frame", **kwargs):
        super().__init__(**kwargs)
        self.prefix = prefix

    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
-        if isinstance(frame, (AudioFrame, ImageFrame)):
-            self.logger.info(f"{self.prefix}: {type(frame)}")
+        if isinstance(frame, (AudioFrame)):
+            # self.logger.info(f"{self.prefix}: {type(frame)}")
+            pass
        else:
            print(f"{self.prefix}: {frame}")

--- a/src/dailyai/services/base_transport_service.py
+++ b/src/dailyai/services/base_transport_service.py
@@ -24,6 +24,8 @@ from dailyai.pipeline.frames import (
    TextFrame,
    UserStartedSpeakingFrame,
    UserStoppedSpeakingFrame,
+    RequestVideoImageFrame,
+    TelestratorImageFrame
 )
 from dailyai.pipeline.pipeline import Pipeline
 from dailyai.services.ai_services import TTSService
@@ -90,7 +92,9 @@ class BaseTransportService:
        self._vad_stop_s = kwargs.get("vad_stop_s") or 0.8
        self._context = kwargs.get("context") or []
        self._vad_enabled = kwargs.get("vad_enabled") or False
-
+        self._receive_video = kwargs.get("receive_video") or False
+        self._receive_video_fps = kwargs.get("receive_video_fps") or 0.0
+        self._participant_frame_times = {}
        if self._vad_enabled and self._speaker_enabled:
            raise Exception(
                "Sorry, you can't use speaker_enabled and vad_enabled at the same time. Please set one to False."
@@ -441,6 +445,7 @@ class BaseTransportService:
                    # discard them
                    if not self._is_interrupted.is_set():
                        if frame:
+
                            if isinstance(frame, AudioFrame):
                                chunk = frame.data

@@ -452,6 +457,12 @@ class BaseTransportService:
                                    self.write_frame_to_mic(
                                        bytes(b[:truncated_length]))
                                    b = b[truncated_length:]
+                            elif isinstance(frame, TelestratorImageFrame):
+                                self._set_image(frame.image)
+                                asyncio.run_coroutine_threadsafe(
+                                    self.receive_queue.put(frame),
+                                    self._loop,
+                                )
                            elif isinstance(frame, ImageFrame):
                                self._set_image(frame.image)
                            elif isinstance(frame, SpriteFrame):
@@ -459,6 +470,15 @@ class BaseTransportService:
                            elif isinstance(frame, SendAppMessageFrame):
                                self.send_app_message(
                                    frame.message, frame.participantId)
+                            elif isinstance(frame, RequestVideoImageFrame):
+                                # removing one or all participant IDs from _participant_frame_times
+                                # will cause the transport to send the next available frame from
+                                # that participant
+                                if frame.participantId:
+                                    self._participant_frame_times.pop(
+                                        frame.participantId, None)
+                                else:
+                                    self._participant_frame_times.clear()
                        elif len(b):
                            self.write_frame_to_mic(bytes(b))
                            b = bytearray()
--- a/src/dailyai/services/daily_transport_service.py
+++ b/src/dailyai/services/daily_transport_service.py
@@ -2,6 +2,7 @@ import asyncio
 import inspect
 import logging
 import signal
+import time
 import threading
 import types

@@ -11,6 +12,8 @@ from typing import Any
 from dailyai.pipeline.frames import (
    ReceivedAppMessageFrame,
    TranscriptionQueueFrame,
+    VideoImageFrame,
+    TelestratorImageFrame
 )

 from threading import Event
@@ -204,11 +207,12 @@ class DailyTransportService(BaseTransportService, EventHandler):
        )
        self._my_participant_id = self.client.participants()["local"]["id"]

-        self.client.update_subscription_profiles({
-            "base": {
-                "camera": "unsubscribed",
-            }
-        })
+        if not self._receive_video:
+            self.client.update_subscription_profiles({
+                "base": {
+                    "camera": "unsubscribed",
+                }
+            })

        if self._token and self._start_transcription:
            self.client.start_transcription(self.transcription_settings)
@@ -225,6 +229,31 @@ class DailyTransportService(BaseTransportService, EventHandler):
        self.client.leave()
        self.client.release()

+    def _handle_video_frame(self, participant_id, video_frame):
+        """If receive_video is true, this function is called once for each frame from each participant. We
+         don't need to send every frame to the pipeline, so there are two ways to decide how to send frames:
+         1. Set a greater-than-zero value for receive_video_fps. The transport will track the last send time
+            for each participant and send a new frame when the requested frame rate has elapsed. This
+            guarantees an image every second, for example.
+         2. Set receive_video_fps less than or equal to zero to disable timed frame sending. Then, put a
+            RequestVideoImageFrame in the pipeline to get a new frame for one or all participants. By
+            sending a RequestVideoImageFrame immediately after successfully processing an image, you can
+            ensure you don't end up queueing up frames faster than you can process them.
+            """
+        send_frame = False
+        if not participant_id in self._participant_frame_times:
+            # then it's a new participant; send the first frame
+            send_frame = True
+        elif self._receive_video_fps > 0 and time.time() > self._participant_frame_times[participant_id] + 1.0/self._receive_video_fps:
+            # Then it's an existing participant who is due to send a new frame
+            send_frame = True
+
+        if send_frame:
+            self._participant_frame_times[participant_id] = time.time()
+            future = asyncio.run_coroutine_threadsafe(
+                self.receive_queue.put(
+                    VideoImageFrame(participant_id, video_frame)), self._loop)
+
    def on_first_other_participant_joined(self):
        pass

@@ -248,6 +277,9 @@ class DailyTransportService(BaseTransportService, EventHandler):
        if not self._other_participant_has_joined and participant["id"] != self._my_participant_id:
            self._other_participant_has_joined = True
            self.on_first_other_participant_joined()
+        if self._receive_video:
+            self.client.set_video_renderer(
+                participant["id"], self._handle_video_frame)

    def on_participant_left(self, participant, reason):
        if len(self.client.participants()) < self._min_others_count + 1:
--- a/src/dailyai/services/elevenlabs_ai_service.py
+++ b/src/dailyai/services/elevenlabs_ai_service.py
@@ -15,18 +15,19 @@ class ElevenLabsTTSService(TTSService):
        *,
        aiohttp_session: aiohttp.ClientSession,
        api_key,
-        voice_id,
+        narrator,
        model="eleven_turbo_v2",
+        aggregate_sentences=True
    ):
-        super().__init__()
+        super().__init__(aggregate_sentences)

        self._api_key = api_key
-        self._voice_id = voice_id
+        self._narrator = narrator
        self._aiohttp_session = aiohttp_session
        self._model = model

    async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
-        url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._voice_id}/stream"
+        url = f"https://api.elevenlabs.io/v1/text-to-speech/{self._narrator['narrator']['voice_id']}/stream"
        payload = {"text": sentence, "model_id": self._model}
        querystring = {
            "output_format": "pcm_16000",
@@ -35,6 +36,7 @@ class ElevenLabsTTSService(TTSService):
            "xi-api-key": self._api_key,
            "Content-Type": "application/json",
        }
+
        async with self._aiohttp_session.post(
            url, json=payload, headers=headers, params=querystring
        ) as r:
--- a/src/dailyai/services/fal_ai_services.py
+++ b/src/dailyai/services/fal_ai_services.py
@@ -53,4 +53,7 @@ class FalImageGenService(ImageGenService):
        async with self._aiohttp_session.get(image_url) as response:
            image_stream = io.BytesIO(await response.content.read())
            image = Image.open(image_stream)
-            return (image_url, image.tobytes())
+            image_bytes = image.tobytes()
+            print(f"!!! fal image tobytes is:")
+            print(image)
+            return (image_url, image_bytes)
--- a/src/dailyai/services/open_ai_services.py
+++ b/src/dailyai/services/open_ai_services.py
@@ -2,13 +2,22 @@ import aiohttp
 from PIL import Image
 import io
 import time
-from openai import AsyncOpenAI
+import base64
+from openai import AsyncOpenAI, AsyncStream

 import json
 from collections.abc import AsyncGenerator

-from dailyai.services.ai_services import LLMService, ImageGenService
+from openai.types.chat import (
+    ChatCompletion,
+    ChatCompletionChunk,
+    ChatCompletionMessageParam,
+)
+
+from daily import VideoFrame
+from dailyai.services.ai_services import LLMService, ImageGenService, VisionService
 from dailyai.services.openai_api_llm_service import BaseOpenAILLMService
+from dailyai.pipeline.frames import TextFrame


 class OpenAILLMService(BaseOpenAILLMService):
@@ -50,3 +59,67 @@ class OpenAIImageGenService(ImageGenService):
            image_stream = io.BytesIO(await response.content.read())
            image = Image.open(image_stream)
            return (image_url, image.tobytes())
+
+
+class OpenAIVisionService(VisionService):
+    def __init__(
+        self,
+        *,
+        model="gpt-4-vision-preview",
+        api_key,
+    ):
+        self._model = model
+        self._client = AsyncOpenAI(api_key=api_key)
+
+    async def run_vision(self, prompt: str, image: bytes):
+        if isinstance(image, VideoFrame):
+            # Then it's from a daily video frame
+            print("### processing daily video frame for recognition")
+            IMAGE_WIDTH = image.width
+            IMAGE_HEIGHT = image.height
+            COLOR_FORMAT = image.color_format
+            a_image = Image.frombytes(
+                'RGBA', (IMAGE_WIDTH, IMAGE_HEIGHT), image.buffer)
+            new_image = a_image.convert('RGB')
+        else:
+            # handle it as a byte stream from image gen
+            new_image = Image.frombytes('RGB', (1024, 1024), image)
+            # Uncomment these lines to write the frame to a jpg in the same directory.
+            # current_path = os.getcwd()
+            # image_path = os.path.join(current_path, "image.jpg")
+            # image.save(image_path, format="JPEG")
+
+        jpeg_buffer = io.BytesIO()
+
+        new_image.save(jpeg_buffer, format='JPEG')
+
+        jpeg_bytes = jpeg_buffer.getvalue()
+        base64_image = base64.b64encode(jpeg_bytes).decode('utf-8')
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{base64_image}"
+                        },
+                    },
+                ],
+            }
+        ]
+        chunks: AsyncStream[ChatCompletionChunk] = (
+            await self._client.chat.completions.create(
+                model=self._model,
+                stream=True,
+                messages=messages,
+            )
+        )
+        async for chunk in chunks:
+            print(f"%%% chunk: {chunk}")
+            if len(chunk.choices) == 0:
+                continue
+            if chunk.choices[0].delta.content:
+                yield TextFrame(chunk.choices[0].delta.content)
--- a/src/examples/foundational/12-describe-video.py
+++ b/src/examples/foundational/12-describe-video.py
@@ -0,0 +1,97 @@
+import asyncio
+import aiohttp
+import logging
+import os
+from typing import AsyncGenerator
+
+from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame
+from dailyai.pipeline.pipeline import Pipeline
+from dailyai.pipeline.frame_processor import FrameProcessor
+from dailyai.services.daily_transport_service import DailyTransportService
+from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
+from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
+from dailyai.services.deepgram_ai_services import DeepgramTTSService
+from dailyai.services.ai_services import FrameLogger
+from dailyai.pipeline.aggregators import (
+    LLMAssistantContextAggregator,
+    LLMUserContextAggregator,
+)
+from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
+from examples.support.runner import configure
+
+logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
+logger = logging.getLogger("dailyai")
+logger.setLevel(logging.DEBUG)
+
+
+class VideoImageFrameProcessor(FrameProcessor):
+    def __init__(self):
+        pass
+
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, VideoImageFrame):
+            yield VisionFrame("Describe the image in one sentence.", frame.image)
+        else:
+            yield frame
+
+
+class ImageRefresher(FrameProcessor):
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, LLMResponseEndFrame):
+            yield RequestVideoImageFrame(participantId=None)
+            yield frame
+        else:
+            yield frame
+
+
+async def main(room_url: str, token):
+    async with aiohttp.ClientSession() as session:
+        transport = DailyTransportService(
+            room_url,
+            token,
+            "Respond bot",
+            duration_minutes=5,
+            start_transcription=True,
+            mic_enabled=True,
+            mic_sample_rate=16000,
+            camera_enabled=False,
+            vad_enabled=True,
+            receive_video=True,
+            receive_video_fps=0
+        )
+
+        tts = ElevenLabsTTSService(
+            aiohttp_session=session,
+            api_key=os.getenv("ELEVENLABS_API_KEY"),
+            voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
+        )
+
+        llm = OpenAILLMService(
+            api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
+            model="gpt-4-turbo-preview")
+
+        vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
+        vifp = VideoImageFrameProcessor()
+        ir = ImageRefresher()
+        pipeline = Pipeline(
+            processors=[
+                vifp,
+                vs,
+                llm,
+                tts,
+                ir,
+            ],
+        )
+
+        @transport.event_handler("on_first_other_participant_joined")
+        async def on_first_other_participant_joined(transport):
+            await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
+
+        transport.transcription_settings["extra"]["endpointing"] = True
+        transport.transcription_settings["extra"]["punctuate"] = True
+        await transport.run(pipeline)
+
+
+if __name__ == "__main__":
+    (url, token) = configure()
+    asyncio.run(main(url, token))
--- a/src/examples/starter-apps/chatbot.py
+++ b/src/examples/starter-apps/chatbot.py
@@ -124,7 +124,6 @@ async def main(room_url: str, token):

        @transport.event_handler("on_first_other_participant_joined")
        async def on_first_other_participant_joined(transport):
-            print(f"!!! in here, pipeline.source is {pipeline.source}")
            await pipeline.queue_frames([LLMMessagesQueueFrame(messages)])

        async def run_conversation():
--- a/src/examples/starter-apps/telestrator/describer.py
+++ b/src/examples/starter-apps/telestrator/describer.py
@@ -0,0 +1,100 @@
+import asyncio
+import aiohttp
+import logging
+import os
+from typing import AsyncGenerator
+
+from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame
+from dailyai.pipeline.pipeline import Pipeline
+from dailyai.pipeline.frame_processor import FrameProcessor
+from dailyai.services.daily_transport_service import DailyTransportService
+from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
+from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
+from dailyai.services.fal_ai_services import FalImageGenService
+from dailyai.services.deepgram_ai_services import DeepgramTTSService
+from dailyai.services.ai_services import FrameLogger
+from dailyai.pipeline.aggregators import (
+    LLMAssistantContextAggregator,
+    LLMUserContextAggregator,
+)
+from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
+from examples.support.runner import configure
+
+logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
+logger = logging.getLogger("dailyai")
+logger.setLevel(logging.DEBUG)
+
+
+class VideoImageFrameProcessor(FrameProcessor):
+    def __init__(self):
+        pass
+
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, VideoImageFrame):
+            yield VisionFrame("Describe the image in one sentence.", frame.image)
+        else:
+            yield frame
+
+
+class ImageRefresher(FrameProcessor):
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, LLMResponseEndFrame):
+            yield RequestVideoImageFrame(participantId=None)
+            yield frame
+        else:
+            yield frame
+
+
+async def main(room_url: str, token):
+    async with aiohttp.ClientSession() as session:
+        transport = DailyTransportService(
+            room_url,
+            token,
+            "Respond bot",
+            duration_minutes=5,
+            start_transcription=True,
+            mic_enabled=True,
+            mic_sample_rate=16000,
+            camera_enabled=True,
+            camera_width=1024,
+            camera_height=1024,
+            vad_enabled=False,
+            receive_video=True,
+            receive_video_fps=0
+        )
+
+        tts = ElevenLabsTTSService(
+            aiohttp_session=session,
+            api_key=os.getenv("ELEVENLABS_API_KEY"),
+            voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
+        )
+
+        llm = OpenAILLMService(
+            api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
+            model="gpt-4-turbo-preview")
+
+        vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
+        vifp = VideoImageFrameProcessor()
+        ir = ImageRefresher()
+
+        pipeline = Pipeline(
+            processors=[
+                vifp,
+                vs,
+                tts,
+                ir,
+            ],
+        )
+
+        @transport.event_handler("on_first_other_participant_joined")
+        async def on_first_other_participant_joined(transport):
+            await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
+
+        transport.transcription_settings["extra"]["endpointing"] = True
+        transport.transcription_settings["extra"]["punctuate"] = True
+        await transport.run(pipeline)
+
+
+if __name__ == "__main__":
+    (url, token) = configure()
+    asyncio.run(main(url, token))
--- a/src/examples/starter-apps/telestrator/illustrator.py
+++ b/src/examples/starter-apps/telestrator/illustrator.py
@@ -0,0 +1,112 @@
+import asyncio
+import aiohttp
+import logging
+import os
+from typing import AsyncGenerator
+
+from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame, TranscriptionQueueFrame, TextFrame
+from dailyai.pipeline.pipeline import Pipeline
+from dailyai.pipeline.frame_processor import FrameProcessor
+from dailyai.services.daily_transport_service import DailyTransportService
+from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
+from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
+from dailyai.services.fal_ai_services import FalImageGenService
+from dailyai.services.deepgram_ai_services import DeepgramTTSService
+from dailyai.services.ai_services import FrameLogger
+from dailyai.pipeline.aggregators import (
+    LLMAssistantContextAggregator,
+    LLMUserContextAggregator,
+)
+from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
+from examples.support.runner import configure
+
+logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
+logger = logging.getLogger("dailyai")
+logger.setLevel(logging.DEBUG)
+
+
+class VADAggregator(FrameProcessor):
+    def __init__(self):
+        self.aggregating = False
+        self.aggregation = ""
+
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, UserStartedSpeakingFrame):
+            self.aggregating = True
+        elif isinstance(frame, UserStoppedSpeakingFrame):
+            self.aggregating = False
+            # Sometimes VAD triggers quickly on and off. If we don't get any transcription,
+            # it creates empty LLM message queue frames
+            if len(self.aggregation) > 0:
+                yield TextFrame(self.aggregation)
+
+                self.aggregation = ""
+                yield frame
+        elif isinstance(frame, TranscriptionQueueFrame) and self.aggregating:
+            self.aggregation += f" {frame.text}"
+        else:
+            yield frame
+
+
+async def main(room_url: str, token):
+    async with aiohttp.ClientSession() as session:
+        transport = DailyTransportService(
+            room_url,
+            token,
+            "Respond bot",
+            duration_minutes=5,
+            start_transcription=True,
+            mic_enabled=True,
+            mic_sample_rate=16000,
+            camera_enabled=True,
+            camera_width=1024,
+            camera_height=1024,
+            vad_enabled=True,
+            receive_video=True,
+            receive_video_fps=0,
+            vad_timeout_s=1.0
+        )
+
+        tts = ElevenLabsTTSService(
+            aiohttp_session=session,
+            api_key=os.getenv("ELEVENLABS_API_KEY"),
+            voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
+        )
+
+        llm = OpenAILLMService(
+            api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
+            model="gpt-4-turbo-preview")
+
+        vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
+        vad = VADAggregator()
+        img = FalImageGenService(
+            image_size="1024x1024",
+            aiohttp_session=session,
+            key_id=os.getenv("FAL_KEY_ID"),
+            key_secret=os.getenv("FAL_KEY_SECRET"),
+        )
+        fl = FrameLogger("!!! Start")
+        fl2 = FrameLogger("!!! AFTER VAD")
+        fl3 = FrameLogger("!!! After img")
+        pipeline = Pipeline(
+            processors=[
+                fl,
+                vad,
+                fl2,
+                img,
+                fl3
+            ],
+        )
+
+        @transport.event_handler("on_first_other_participant_joined")
+        async def on_first_other_participant_joined(transport):
+            await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
+
+        transport.transcription_settings["extra"]["endpointing"] = True
+        transport.transcription_settings["extra"]["punctuate"] = True
+        await transport.run(pipeline)
+
+
+if __name__ == "__main__":
+    (url, token) = configure()
+    asyncio.run(main(url, token))
--- a/src/examples/starter-apps/telestrator/telestrator-fuzz.py
+++ b/src/examples/starter-apps/telestrator/telestrator-fuzz.py
@@ -0,0 +1,210 @@
+import asyncio
+import aiohttp
+import logging
+import os
+import random
+from typing import AsyncGenerator
+
+from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame, TextFrame
+from dailyai.pipeline.pipeline import Pipeline
+from dailyai.pipeline.frame_processor import FrameProcessor
+from dailyai.services.daily_transport_service import DailyTransportService
+from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
+from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
+from dailyai.services.fal_ai_services import FalImageGenService
+from dailyai.services.deepgram_ai_services import DeepgramTTSService
+from dailyai.services.ai_services import FrameLogger
+from dailyai.pipeline.aggregators import (
+    LLMAssistantContextAggregator,
+    LLMUserContextAggregator,
+    LLMFullResponseAggregator
+)
+from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
+from examples.support.runner import configure
+
+logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
+logger = logging.getLogger("dailyai")
+logger.setLevel(logging.DEBUG)
+
+narrators = [{"voice_id": "wDRBdcyPzQOCeq51IxW5",
+              "prompt": "Describe the image in one sentence."},
+             {"voice_id": "M3bAX0o3Ptb2l6XqwQJV",
+              "prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show."},
+             {"voice_id": "lJm5d2ZZ3UE4qYOxl2t7",
+              "prompt": "Describe the image in one sentence, in the style of Oprah Winfrey."},
+             {"voice_id": "7SNUlQ8GAbnZxRO9CKOt",
+              "prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England."},
+             {"voice_id": "gvpBhHjzfd7M2WedYVUI",
+              "prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek."},
+             {"voice_id": "bnyr1EF3snReVXauGBNn",
+              "prompt": "Describe the image in one sentence, in the style of Maya Angelou."}]
+
+# random.shuffle(narrators)
+print(f"$$$ narrators: {narrators}")
+narrator = {"narrator": narrators[0]}
+
+
+class TranslationProcessor(FrameProcessor):
+    def __init__(self, in_language, out_language):
+        self._in_language = in_language
+        self._out_language = out_language
+
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, TextFrame):
+            context = [
+                {
+                    "role": "system",
+                    "content": f"You will be provided with a sentence in {self._in_language}, and your task is to translate it into {self._out_language}.",
+                },
+                {"role": "user", "content": frame.text},
+            ]
+
+            yield LLMMessagesQueueFrame(context)
+        else:
+            yield frame
+
+
+class NarratorShuffle(FrameProcessor):
+    def __init__(self, narrator, narrators):
+        self._narrator = narrator
+        self._narrators = narrators
+        self._i = 0
+
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, (ImageFrame, TelestratorImageFrame)):
+            self._i += 1
+            if self._i >= len(self._narrators):
+                print(f"### shuffling narrators")
+                random.shuffle(self._narrators)
+                self._i = 0
+
+            self._narrator["narrator"] = self._narrators[self._i]
+            print(f"### new narrator is {self._narrator}")
+        yield frame
+
+
+class VideoImageFrameProcessor(FrameProcessor):
+    def __init__(self, narrator):
+        self._narrator = narrator
+
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, (VideoImageFrame, TelestratorImageFrame)):
+            yield VisionFrame(self._narrator["narrator"]["prompt"], frame.image)
+        else:
+            yield frame
+
+
+class ImageRefresher(FrameProcessor):
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, LLMResponseEndFrame):
+            yield RequestVideoImageFrame(participantId=None)
+            yield frame
+        else:
+            yield frame
+
+
+class TelestratorImageWrapper(FrameProcessor):
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, ImageFrame):
+            yield TelestratorImageFrame(None, frame.image)
+        else:
+            yield frame
+
+
+async def main(room_url: str, token):
+    async with aiohttp.ClientSession() as session:
+        transport = DailyTransportService(
+            room_url,
+            token,
+            "Respond bot",
+            duration_minutes=5,
+            start_transcription=True,
+            mic_enabled=True,
+            mic_sample_rate=16000,
+            camera_enabled=True,
+            camera_width=1024,
+            camera_height=576,
+            vad_enabled=False,
+            receive_video=True,
+            receive_video_fps=0
+        )
+
+        tts = ElevenLabsTTSService(
+            aiohttp_session=session,
+            api_key=os.getenv("ELEVENLABS_API_KEY"),
+            narrator=narrator,
+            aggregate_sentences=False
+        )
+
+        llm = OpenAILLMService(
+            api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
+            model="gpt-4-turbo-preview")
+
+        vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
+        vifp = VideoImageFrameProcessor(narrator)
+        ir = ImageRefresher()
+        img = FalImageGenService(
+            image_size="1024x1024",
+            aiohttp_session=session,
+            key_id=os.getenv("FAL_KEY_ID"),
+            key_secret=os.getenv("FAL_KEY_SECRET"),
+        )
+        tiw = TelestratorImageWrapper()
+        lfra = LLMFullResponseAggregator()
+        lfra1 = LLMFullResponseAggregator()
+        lfra2 = LLMFullResponseAggregator()
+        lfra3 = LLMFullResponseAggregator()
+        lfra4 = LLMFullResponseAggregator()
+        fl0 = FrameLogger("@@@ About to describe")
+        fl1 = FrameLogger("!!! About to image gen")
+        f4 = FrameLogger("((( partway through )))")
+        f5 = FrameLogger("!!! f5")
+        ns = NarratorShuffle(narrator, narrators)
+        t1 = TranslationProcessor("English", "Spanish")
+        t2 = TranslationProcessor("Spanish", "German")
+        t3 = TranslationProcessor("German", "Japanese")
+        t4 = TranslationProcessor("Japanese", "English")
+        pipeline = Pipeline(
+            processors=[
+                fl0,
+                vifp,
+                vs,
+                lfra,
+                tts,
+                f4,
+                t1,
+                llm,
+                lfra1,
+                f5,
+                tts,
+
+                t2,
+                llm,
+                lfra2,
+                tts,
+                t3,
+                llm,
+                lfra3,
+                tts,
+                t4,
+                llm,
+                lfra4,
+                tts,
+                fl1,
+                img,
+                tiw,
+            ],
+        )
+
+        @transport.event_handler("on_first_other_participant_joined")
+        async def on_first_other_participant_joined(transport):
+            await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
+
+        transport.transcription_settings["extra"]["endpointing"] = True
+        transport.transcription_settings["extra"]["punctuate"] = True
+        await transport.run(pipeline)
+
+
+if __name__ == "__main__":
+    (url, token) = configure()
+    asyncio.run(main(url, token))
--- a/src/examples/starter-apps/telestrator/telestrator-haiku.py
+++ b/src/examples/starter-apps/telestrator/telestrator-haiku.py
@@ -0,0 +1,191 @@
+import asyncio
+import aiohttp
+import logging
+import os
+import random
+from typing import AsyncGenerator
+
+from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame, TextFrame
+from dailyai.pipeline.pipeline import Pipeline
+from dailyai.pipeline.frame_processor import FrameProcessor
+from dailyai.services.daily_transport_service import DailyTransportService
+from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
+from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
+from dailyai.services.fal_ai_services import FalImageGenService
+from dailyai.services.deepgram_ai_services import DeepgramTTSService
+from dailyai.services.ai_services import FrameLogger
+from dailyai.pipeline.aggregators import (
+    LLMAssistantContextAggregator,
+    LLMUserContextAggregator,
+    LLMFullResponseAggregator
+)
+from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
+from examples.support.runner import configure
+
+logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
+logger = logging.getLogger("dailyai")
+logger.setLevel(logging.DEBUG)
+
+narrators = [{"voice_id": "wDRBdcyPzQOCeq51IxW5",
+              "prompt": "Describe the image in a haiku."},
+             {"voice_id": "M3bAX0o3Ptb2l6XqwQJV",
+              "prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show."},
+             {"voice_id": "lJm5d2ZZ3UE4qYOxl2t7",
+              "prompt": "Describe the image in one sentence, in the style of Oprah Winfrey."},
+             {"voice_id": "7SNUlQ8GAbnZxRO9CKOt",
+              "prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England."},
+             {"voice_id": "gvpBhHjzfd7M2WedYVUI",
+              "prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek."},
+             {"voice_id": "bnyr1EF3snReVXauGBNn",
+              "prompt": "Describe the image in one sentence, in the style of Maya Angelou."}]
+
+# random.shuffle(narrators)
+print(f"$$$ narrators: {narrators}")
+narrator = {"narrator": narrators[0]}
+
+
+class TranslationProcessor(FrameProcessor):
+    def __init__(self, in_language, out_language):
+        self._in_language = in_language
+        self._out_language = out_language
+
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, TextFrame):
+            context = [
+                {
+                    "role": "system",
+                    "content": f"You will be provided with a sentence in {self._in_language}, and your task is to translate it into {self._out_language}.",
+                },
+                {"role": "user", "content": frame.text},
+            ]
+
+            yield LLMMessagesQueueFrame(context)
+        else:
+            yield frame
+
+
+class NarratorShuffle(FrameProcessor):
+    def __init__(self, narrator, narrators):
+        self._narrator = narrator
+        self._narrators = narrators
+        self._i = 0
+
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, (ImageFrame, TelestratorImageFrame)):
+            self._i += 1
+            if self._i >= len(self._narrators):
+                print(f"### shuffling narrators")
+                random.shuffle(self._narrators)
+                self._i = 0
+
+            self._narrator["narrator"] = self._narrators[self._i]
+            print(f"### new narrator is {self._narrator}")
+        yield frame
+
+
+class VideoImageFrameProcessor(FrameProcessor):
+    def __init__(self, narrator):
+        self._narrator = narrator
+
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, (VideoImageFrame, TelestratorImageFrame)):
+            yield VisionFrame(self._narrator["narrator"]["prompt"], frame.image)
+        else:
+            yield frame
+
+
+class ImageRefresher(FrameProcessor):
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, LLMResponseEndFrame):
+            yield RequestVideoImageFrame(participantId=None)
+            yield frame
+        else:
+            yield frame
+
+
+class TelestratorImageWrapper(FrameProcessor):
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, ImageFrame):
+            yield TelestratorImageFrame(None, frame.image)
+        else:
+            yield frame
+
+
+async def main(room_url: str, token):
+    async with aiohttp.ClientSession() as session:
+        transport = DailyTransportService(
+            room_url,
+            token,
+            "Respond bot",
+            duration_minutes=5,
+            start_transcription=True,
+            mic_enabled=True,
+            mic_sample_rate=16000,
+            camera_enabled=True,
+            camera_width=1024,
+            camera_height=1024,
+            vad_enabled=False,
+            receive_video=True,
+            receive_video_fps=0
+        )
+
+        tts = ElevenLabsTTSService(
+            aiohttp_session=session,
+            api_key=os.getenv("ELEVENLABS_API_KEY"),
+            narrator=narrator,
+            aggregate_sentences=False
+        )
+
+        llm = OpenAILLMService(
+            api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
+            model="gpt-4-turbo-preview")
+
+        vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
+        vifp = VideoImageFrameProcessor(narrator)
+        ir = ImageRefresher()
+        img = FalImageGenService(
+            image_size="1024x1024",
+            aiohttp_session=session,
+            key_id=os.getenv("FAL_KEY_ID"),
+            key_secret=os.getenv("FAL_KEY_SECRET"),
+        )
+        tiw = TelestratorImageWrapper()
+        lfra = LLMFullResponseAggregator()
+        lfra1 = LLMFullResponseAggregator()
+        lfra2 = LLMFullResponseAggregator()
+        lfra3 = LLMFullResponseAggregator()
+        lfra4 = LLMFullResponseAggregator()
+        fl0 = FrameLogger("@@@ About to describe")
+        fl1 = FrameLogger("!!! About to image gen")
+        f4 = FrameLogger("((( partway through )))")
+        f5 = FrameLogger("!!! f5")
+        ns = NarratorShuffle(narrator, narrators)
+        t1 = TranslationProcessor("English", "Spanish")
+        t2 = TranslationProcessor("Spanish", "German")
+        t3 = TranslationProcessor("German", "Japanese")
+        t4 = TranslationProcessor("Japanese", "English")
+        pipeline = Pipeline(
+            processors=[
+                fl0,
+                vifp,
+                vs,
+                lfra,
+                tts,
+                fl1,
+                img,
+                tiw,
+            ],
+        )
+
+        @transport.event_handler("on_first_other_participant_joined")
+        async def on_first_other_participant_joined(transport):
+            await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
+
+        transport.transcription_settings["extra"]["endpointing"] = True
+        transport.transcription_settings["extra"]["punctuate"] = True
+        await transport.run(pipeline)
+
+
+if __name__ == "__main__":
+    (url, token) = configure()
+    asyncio.run(main(url, token))
--- a/src/examples/starter-apps/telestrator/telestrator-wordcount.py
+++ b/src/examples/starter-apps/telestrator/telestrator-wordcount.py
@@ -0,0 +1,191 @@
+import asyncio
+import aiohttp
+import logging
+import os
+import random
+from typing import AsyncGenerator
+
+from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame, TextFrame
+from dailyai.pipeline.pipeline import Pipeline
+from dailyai.pipeline.frame_processor import FrameProcessor
+from dailyai.services.daily_transport_service import DailyTransportService
+from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
+from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
+from dailyai.services.fal_ai_services import FalImageGenService
+from dailyai.services.deepgram_ai_services import DeepgramTTSService
+from dailyai.services.ai_services import FrameLogger
+from dailyai.pipeline.aggregators import (
+    LLMAssistantContextAggregator,
+    LLMUserContextAggregator,
+    LLMFullResponseAggregator
+)
+from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
+from examples.support.runner import configure
+
+logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
+logger = logging.getLogger("dailyai")
+logger.setLevel(logging.DEBUG)
+
+narrators = [{"voice_id": "wDRBdcyPzQOCeq51IxW5",
+              "prompt": "Describe the image in nine words."},
+             {"voice_id": "M3bAX0o3Ptb2l6XqwQJV",
+              "prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show."},
+             {"voice_id": "lJm5d2ZZ3UE4qYOxl2t7",
+              "prompt": "Describe the image in one sentence, in the style of Oprah Winfrey."},
+             {"voice_id": "7SNUlQ8GAbnZxRO9CKOt",
+              "prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England."},
+             {"voice_id": "gvpBhHjzfd7M2WedYVUI",
+              "prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek."},
+             {"voice_id": "bnyr1EF3snReVXauGBNn",
+              "prompt": "Describe the image in one sentence, in the style of Maya Angelou."}]
+
+# random.shuffle(narrators)
+print(f"$$$ narrators: {narrators}")
+narrator = {"narrator": narrators[0]}
+
+
+class TranslationProcessor(FrameProcessor):
+    def __init__(self, in_language, out_language):
+        self._in_language = in_language
+        self._out_language = out_language
+
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, TextFrame):
+            context = [
+                {
+                    "role": "system",
+                    "content": f"You will be provided with a sentence in {self._in_language}, and your task is to translate it into {self._out_language}.",
+                },
+                {"role": "user", "content": frame.text},
+            ]
+
+            yield LLMMessagesQueueFrame(context)
+        else:
+            yield frame
+
+
+class NarratorShuffle(FrameProcessor):
+    def __init__(self, narrator, narrators):
+        self._narrator = narrator
+        self._narrators = narrators
+        self._i = 0
+
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, (ImageFrame, TelestratorImageFrame)):
+            self._i += 1
+            if self._i >= len(self._narrators):
+                print(f"### shuffling narrators")
+                random.shuffle(self._narrators)
+                self._i = 0
+
+            self._narrator["narrator"] = self._narrators[self._i]
+            print(f"### new narrator is {self._narrator}")
+        yield frame
+
+
+class VideoImageFrameProcessor(FrameProcessor):
+    def __init__(self, narrator):
+        self._narrator = narrator
+
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, (VideoImageFrame, TelestratorImageFrame)):
+            yield VisionFrame(self._narrator["narrator"]["prompt"], frame.image)
+        else:
+            yield frame
+
+
+class ImageRefresher(FrameProcessor):
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, LLMResponseEndFrame):
+            yield RequestVideoImageFrame(participantId=None)
+            yield frame
+        else:
+            yield frame
+
+
+class TelestratorImageWrapper(FrameProcessor):
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, ImageFrame):
+            yield TelestratorImageFrame(None, frame.image)
+        else:
+            yield frame
+
+
+async def main(room_url: str, token):
+    async with aiohttp.ClientSession() as session:
+        transport = DailyTransportService(
+            room_url,
+            token,
+            "Respond bot",
+            duration_minutes=5,
+            start_transcription=True,
+            mic_enabled=True,
+            mic_sample_rate=16000,
+            camera_enabled=True,
+            camera_width=1024,
+            camera_height=1024,
+            vad_enabled=False,
+            receive_video=True,
+            receive_video_fps=0
+        )
+
+        tts = ElevenLabsTTSService(
+            aiohttp_session=session,
+            api_key=os.getenv("ELEVENLABS_API_KEY"),
+            narrator=narrator,
+            aggregate_sentences=False
+        )
+
+        llm = OpenAILLMService(
+            api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
+            model="gpt-4-turbo-preview")
+
+        vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
+        vifp = VideoImageFrameProcessor(narrator)
+        ir = ImageRefresher()
+        img = FalImageGenService(
+            image_size="1024x1024",
+            aiohttp_session=session,
+            key_id=os.getenv("FAL_KEY_ID"),
+            key_secret=os.getenv("FAL_KEY_SECRET"),
+        )
+        tiw = TelestratorImageWrapper()
+        lfra = LLMFullResponseAggregator()
+        lfra1 = LLMFullResponseAggregator()
+        lfra2 = LLMFullResponseAggregator()
+        lfra3 = LLMFullResponseAggregator()
+        lfra4 = LLMFullResponseAggregator()
+        fl0 = FrameLogger("@@@ About to describe")
+        fl1 = FrameLogger("!!! About to image gen")
+        f4 = FrameLogger("((( partway through )))")
+        f5 = FrameLogger("!!! f5")
+        ns = NarratorShuffle(narrator, narrators)
+        t1 = TranslationProcessor("English", "Spanish")
+        t2 = TranslationProcessor("Spanish", "German")
+        t3 = TranslationProcessor("German", "Japanese")
+        t4 = TranslationProcessor("Japanese", "English")
+        pipeline = Pipeline(
+            processors=[
+                fl0,
+                vifp,
+                vs,
+                lfra,
+                tts,
+                fl1,
+                img,
+                tiw,
+            ],
+        )
+
+        @transport.event_handler("on_first_other_participant_joined")
+        async def on_first_other_participant_joined(transport):
+            await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
+
+        transport.transcription_settings["extra"]["endpointing"] = True
+        transport.transcription_settings["extra"]["punctuate"] = True
+        await transport.run(pipeline)
+
+
+if __name__ == "__main__":
+    (url, token) = configure()
+    asyncio.run(main(url, token))
--- a/src/examples/starter-apps/telestrator/telestrator.py
+++ b/src/examples/starter-apps/telestrator/telestrator.py
@@ -0,0 +1,162 @@
+import asyncio
+import aiohttp
+import logging
+import os
+import random
+from typing import AsyncGenerator
+
+from dailyai.pipeline.frames import Frame, LLMMessagesQueueFrame, RequestVideoImageFrame, LLMResponseEndFrame, TelestratorImageFrame, ImageFrame
+from dailyai.pipeline.pipeline import Pipeline
+from dailyai.pipeline.frame_processor import FrameProcessor
+from dailyai.services.daily_transport_service import DailyTransportService
+from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
+from dailyai.services.open_ai_services import OpenAILLMService, OpenAIVisionService
+from dailyai.services.fal_ai_services import FalImageGenService
+from dailyai.services.deepgram_ai_services import DeepgramTTSService
+from dailyai.services.ai_services import FrameLogger
+from dailyai.pipeline.aggregators import (
+    LLMAssistantContextAggregator,
+    LLMUserContextAggregator,
+    LLMFullResponseAggregator
+)
+from dailyai.pipeline.frames import VideoImageFrame, VisionFrame
+from examples.support.runner import configure
+
+logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
+logger = logging.getLogger("dailyai")
+logger.setLevel(logging.DEBUG)
+
+narrators = [{"voice_id": "wDRBdcyPzQOCeq51IxW5",
+              "prompt": "Describe the image in one sentence, in the style of David Attenborough."},
+             {"voice_id": "M3bAX0o3Ptb2l6XqwQJV",
+              "prompt": "Describe the image in one sentence, in the style of John Oliver's Last Week Tonight show."},
+             {"voice_id": "lJm5d2ZZ3UE4qYOxl2t7",
+              "prompt": "Describe the image in one sentence, in the style of Oprah Winfrey."},
+             {"voice_id": "7SNUlQ8GAbnZxRO9CKOt",
+              "prompt": "Describe the image in one sentence, in the style of a royal pronouncement by the Queen of England."},
+             {"voice_id": "gvpBhHjzfd7M2WedYVUI",
+              "prompt": "Describe the image in one sentence, in the style of Captain Picard from Star Trek."},
+             {"voice_id": "bnyr1EF3snReVXauGBNn",
+              "prompt": "Describe the image in one sentence, in the style of Maya Angelou."}]
+
+random.shuffle(narrators)
+print(f"$$$ narrators: {narrators}")
+narrator = {"narrator": narrators[0]}
+
+
+class NarratorShuffle(FrameProcessor):
+    def __init__(self, narrator, narrators):
+        self._narrator = narrator
+        self._narrators = narrators
+        self._i = 0
+
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, (ImageFrame, TelestratorImageFrame)):
+            self._i += 1
+            if self._i >= len(self._narrators):
+                print(f"### shuffling narrators")
+                random.shuffle(self._narrators)
+                self._i = 0
+
+            self._narrator["narrator"] = self._narrators[self._i]
+            print(f"### new narrator is {self._narrator}")
+        yield frame
+
+
+class VideoImageFrameProcessor(FrameProcessor):
+    def __init__(self, narrator):
+        self._narrator = narrator
+
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, (VideoImageFrame, TelestratorImageFrame)):
+            yield VisionFrame(self._narrator["narrator"]["prompt"], frame.image)
+        else:
+            yield frame
+
+
+class ImageRefresher(FrameProcessor):
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, LLMResponseEndFrame):
+            yield RequestVideoImageFrame(participantId=None)
+            yield frame
+        else:
+            yield frame
+
+
+class TelestratorImageWrapper(FrameProcessor):
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, ImageFrame):
+            yield TelestratorImageFrame(None, frame.image)
+        else:
+            yield frame
+
+
+async def main(room_url: str, token):
+    async with aiohttp.ClientSession() as session:
+        transport = DailyTransportService(
+            room_url,
+            token,
+            "Respond bot",
+            duration_minutes=5,
+            start_transcription=True,
+            mic_enabled=True,
+            mic_sample_rate=16000,
+            camera_enabled=True,
+            camera_width=1024,
+            camera_height=1024,
+            vad_enabled=False,
+            receive_video=True,
+            receive_video_fps=0
+        )
+
+        tts = ElevenLabsTTSService(
+            aiohttp_session=session,
+            api_key=os.getenv("ELEVENLABS_API_KEY"),
+            narrator=narrator,
+            aggregate_sentences=False
+        )
+
+        llm = OpenAILLMService(
+            api_key=os.getenv("OPENAI_CHATGPT_API_KEY"),
+            model="gpt-4-turbo-preview")
+
+        vs = OpenAIVisionService(api_key=os.getenv("OPENAI_CHATGPT_API_KEY"))
+        vifp = VideoImageFrameProcessor(narrator)
+        ir = ImageRefresher()
+        img = FalImageGenService(
+            image_size="1024x1024",
+            aiohttp_session=session,
+            key_id=os.getenv("FAL_KEY_ID"),
+            key_secret=os.getenv("FAL_KEY_SECRET"),
+        )
+        tiw = TelestratorImageWrapper()
+        lfra = LLMFullResponseAggregator()
+        fl0 = FrameLogger("@@@ About to describe")
+        fl1 = FrameLogger("!!! About to image gen")
+        ns = NarratorShuffle(narrator, narrators)
+        pipeline = Pipeline(
+            processors=[
+                ns,
+                fl0,
+                vifp,
+                vs,
+                lfra,
+                tts,
+                fl1,
+                img,
+                tiw,
+            ],
+        )
+
+        @transport.event_handler("on_first_other_participant_joined")
+        async def on_first_other_participant_joined(transport):
+            await pipeline.queue_frames([RequestVideoImageFrame(participantId=None)])
+
+        transport.transcription_settings["extra"]["endpointing"] = True
+        transport.transcription_settings["extra"]["punctuate"] = True
+        await transport.run(pipeline)
+
+
+if __name__ == "__main__":
+    (url, token) = configure()
+    asyncio.run(main(url, token))
--- a/src/examples/starter-apps/translator.py
+++ b/src/examples/starter-apps/translator.py
@@ -26,7 +26,8 @@ logger.setLevel(logging.DEBUG)

 """
 This example looks a bit different than the chatbot example, because it isn't waiting on the user to stop talking to start translating.
-It also isn't saving what the user or bot says into the context object for use in subsequent interactions.
+It also isn't saving what the user or bot says into the context object for use in subsequent interactions. This example also sends
+the translated text back to the transport as an App Message, so clients can show subtitles.
 """


@@ -50,6 +51,20 @@ class TranslationProcessor(FrameProcessor):
            yield frame


+class TranslationSubtitles(FrameProcessor):
+    def __init__(self, language):
+        self._language = language
+
+    async def process_frame(self, frame: Frame) -> AsyncGenerator[Frame, None]:
+        if isinstance(frame, TextFrame):
+            app_message = {
+                "language": self._language,
+                "text": frame.text
+            }
+        else:
+            yield frame
+
+
 async def main(room_url: str, token):
    async with aiohttp.ClientSession() as session:
        transport = DailyTransportService(
@@ -73,7 +88,8 @@ async def main(room_url: str, token):
            model="gpt-4-turbo-preview")
        sa = SentenceAggregator()
        tp = TranslationProcessor("Spanish")
-        pipeline = Pipeline([sa, tp, llm, tts])
+        ts = TranslationSubtitles("Spanish")
+        pipeline = Pipeline([sa, tp, llm, tts, ts])

        transport.transcription_settings["extra"]["endpointing"] = True
        transport.transcription_settings["extra"]["punctuate"] = True
Author	SHA1	Message	Date
Chad Bailey	c73fb4750f	added fuzz example	2024-03-22 14:20:16 +00:00
Chad Bailey	34b10cb4c7	wip	2024-03-19 22:04:47 +00:00
Chad Bailey	e726f15c4e	wip: telestrator	2024-03-19 15:31:19 +00:00
Chad Bailey	25ca8b751e	cleanup	2024-03-19 03:08:04 +00:00
Chad Bailey	0b4b63d2ee	Working vision example	2024-03-19 01:51:36 +00:00
Chad Bailey	6c9425d66a	wip: video image frames	2024-03-18 22:14:02 +00:00
Chad Bailey	6d3c52ae81	added app message	2024-03-18 19:52:31 +00:00