allow interrupt

working with summary
more variables
2024-11-02 16:12:29 -07:00 · 2024-11-02 15:33:03 -07:00 · 2024-11-02 14:05:19 -07:00 · 2024-11-02 13:46:28 -07:00 · 2024-11-02 13:37:35 -07:00 · 2024-11-02 13:27:08 -07:00
2 changed files with 303 additions and 2 deletions
--- a/examples/foundational/99-anthropic-hackathon.py
+++ b/examples/foundational/99-anthropic-hackathon.py
@@ -0,0 +1,298 @@
+#
+# Copyright (c) 2024, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import asyncio
+import base64
+import io
+import os
+import sys
+from collections import deque
+
+import aiohttp
+from dotenv import load_dotenv
+from loguru import logger
+from PIL import Image
+from runner import configure
+
+from pipecat.audio.vad.silero import SileroVADAnalyzer
+from pipecat.frames.frames import (
+    BotInterruptionFrame,
+    Frame,
+    ImageRawFrame,
+    LLMFullResponseEndFrame,
+    LLMMessagesFrame,
+    TextFrame,
+    TranscriptionFrame,
+)
+from pipecat.pipeline.parallel_pipeline import ParallelPipeline
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.openai_llm_context import (
+    OpenAILLMContext,
+    OpenAILLMContextFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+from pipecat.processors.frameworks.rtvi import (
+    RTVIBotTranscriptionProcessor,
+    RTVIUserTranscriptionProcessor,
+)
+from pipecat.services.anthropic import AnthropicLLMContext, AnthropicLLMService
+from pipecat.services.cartesia import CartesiaTTSService
+from pipecat.transports.services.daily import DailyParams, DailyTransport
+
+load_dotenv(override=True)
+# Replace loguru's default handler (id 0) with a DEBUG-level stderr sink.
+logger.remove(0)
+logger.add(sys.stderr, level="DEBUG")
+
+MAX_FRAMES = 5  # capacity of the recent_image_frames buffer
+FRAMES_PER_SECOND = 0.2  # framerate passed to capture_participant_video() in main()
+
+# Module-level state shared by the frame processors, main() and add_message_with_images().
+video_participant_id = None  # set when the first participant joins
+anthropic_context = None  # assigned in main(); the context that receives image messages
+recent_image_frames = deque(maxlen=MAX_FRAMES)  # newest image frames; oldest dropped when full
+most_recent_image_summary = ""  # latest summary text, written by MessageFrameCatcher2
+
+
+class ImageFrameCatcher(FrameProcessor):
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        global recent_image_frames
+
+        await super().process_frame(frame, direction)
+        if isinstance(frame, ImageRawFrame):
+            recent_image_frames.append(frame)
+        else:
+            await self.push_frame(frame, direction)
+
+
+class TranscriptFrameCatcher(FrameProcessor):
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        await super().process_frame(frame, direction)
+        if isinstance(frame, TranscriptionFrame):
+            logger.debug(
+                f"TranscriptLogger: {frame}, num frames: {len(recent_image_frames)}, anthropic context: {anthropic_context}"
+            )
+            if anthropic_context:
+                add_message_with_images(
+                    anthropic_context, frame.text, frames=list(recent_image_frames)
+                )
+        await self.push_frame(frame, direction)
+
+
+class MessageFrameCatcher(FrameProcessor):
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        await super().process_frame(frame, direction)
+        if isinstance(frame, OpenAILLMContextFrame):
+            last_message = frame.context.messages[-1]
+
+            system_message = """
+Give me a concise summary of the images supplied.
+            """
+            frame = LLMMessagesFrame(
+                messages=[
+                    {
+                        "role": "system",
+                        "content": system_message,
+                    },
+                    last_message,
+                ],
+            )
+            await self.push_frame(frame, direction)
+            return
+
+
+class MessageFrameCatcher2(FrameProcessor):
+    def __init__(self):
+        super().__init__()
+        self.text_blob = ""
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        global most_recent_image_summary
+        await super().process_frame(frame, direction)
+        if isinstance(frame, TextFrame):
+            self.text_blob += f" {frame.text}"
+
+        if isinstance(frame, LLMFullResponseEndFrame):
+            logger.debug(f"MessageFrameCatcher2: {self.text_blob}")
+            most_recent_image_summary = self.text_blob
+            self.text_blob = ""
+
+        await self.push_frame(frame, direction)
+
+
+async def main():
+    global llm
+    global anthropic_context
+
+    async with aiohttp.ClientSession() as session:
+        (room_url, token) = await configure(session)
+
+        transport = DailyTransport(
+            room_url,
+            token,
+            "Respond bot",
+            DailyParams(
+                audio_out_enabled=True,
+                transcription_enabled=True,
+                vad_enabled=True,
+                vad_analyzer=SileroVADAnalyzer(),
+            ),
+        )
+
+        tts = CartesiaTTSService(
+            api_key=os.getenv("CARTESIA_API_KEY"),
+            voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22",  # British Lady
+        )
+
+        llm = AnthropicLLMService(
+            api_key=os.getenv("ANTHROPIC_API_KEY"),
+            model="claude-3-5-sonnet-20240620",
+            enable_prompt_caching_beta=True,
+        )
+
+        vision_llm = AnthropicLLMService(
+            api_key=os.getenv("ANTHROPIC_API_KEY"),
+            model="claude-3-5-sonnet-20240620",
+            enable_prompt_caching_beta=True,
+        )
+
+        # todo: test with very short initial user message
+
+        system_prompt = """\
+You are a helpful assistant who converses with a user and answers questions. Respond concisely to general questions. Keep
+your answers brief unless explicitly asked for more information.
+
+Your response will be turned into speech so use only simple words and punctuation.
+        """
+
+        messages = [
+            {
+                "role": "system",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": system_prompt,
+                    }
+                ],
+            },
+            {"role": "user", "content": "Start the conversation by saying 'hello'."},
+        ]
+
+        context = OpenAILLMContext(messages)
+        anthropic_context = AnthropicLLMContext.upgrade_to_anthropic(context)
+        context_aggregator = llm.create_context_aggregator(context)
+
+        rtvi_user_transcription = RTVIUserTranscriptionProcessor()
+        rtvi_bot_transcription = RTVIBotTranscriptionProcessor()
+
+        pipeline = Pipeline(
+            [
+                transport.input(),  # Transport user input
+                ImageFrameCatcher(),
+                TranscriptFrameCatcher(),
+                rtvi_user_transcription,
+                context_aggregator.user(),  # User speech to text
+                ParallelPipeline(
+                    [
+                        llm,  # LLM
+                        rtvi_bot_transcription,
+                        tts,  # TTS
+                        transport.output(),  # Transport bot output
+                        context_aggregator.assistant(),  # Assistant spoken responses and tool context
+                    ],
+                    [MessageFrameCatcher(), vision_llm, MessageFrameCatcher2()],
+                ),
+            ],
+        )
+
+        task = PipelineTask(pipeline, PipelineParams(allow_interruptions=True, enable_metrics=True))
+
+        @transport.event_handler("on_first_participant_joined")
+        async def on_first_participant_joined(transport, participant):
+            global video_participant_id
+            video_participant_id = participant["id"]
+            await transport.capture_participant_transcription(video_participant_id)
+            await transport.capture_participant_video(
+                video_participant_id, framerate=FRAMES_PER_SECOND, video_source="screenVideo"
+            )
+            # Kick off the conversation.
+            await task.queue_frames([context_aggregator.user().get_context_frame()])
+
+        @transport.event_handler("on_app_message")
+        async def on_app_message(transport, message, sender):
+            logger.debug(f"Received app message: {message} - {context}")
+
+            if not recent_image_frames:
+                logger.debug("No image frames to send")
+                return
+
+            add_message_with_images(
+                anthropic_context, message["message"], frames=list(recent_image_frames)
+            )
+
+            interrupt_message = "STOP"
+
+            if interrupt_message == message["message"]:
+                logger.debug("Interrupting")
+                await task.queue_frames([BotInterruptionFrame()])
+            else:
+                await task.queue_frames([context_aggregator.user().get_context_frame()])
+
+        runner = PipelineRunner()
+        await runner.run(task)
+
+
+def add_message_with_images(c, message, frames=None):
+    if frames is None:
+        frames = list(recent_image_frames)
+
+    if not frames:
+        logger.debug("No image frames to send")
+        return
+
+    # Create content list starting with all images
+    content = []
+    for frame in frames:
+        buffer = io.BytesIO()
+        Image.frombytes(frame.format, frame.size, frame.image).save(buffer, format="JPEG")
+        encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
+
+        content.append(
+            {
+                "type": "image",
+                "source": {
+                    "type": "base64",
+                    "media_type": "image/jpeg",
+                    "data": encoded_image,
+                },
+            }
+        )
+
+    # Add text message at the end if provided
+    if message:
+        content.append({"type": "text", "text": message})
+
+    # Go through all messages and replace user messages containing images
+    if c.messages:
+        for i, msg in enumerate(c.messages):
+            if (
+                msg["role"] == "user"
+                and isinstance(msg["content"], list)
+                and len(msg["content"]) > 0
+            ):
+                if msg["content"][0].get("type") == "image":
+                    logger.debug(
+                        f"Replacing user message {i} containing images with summary: {most_recent_image_summary}"
+                    )
+                    c.messages[i] = {"role": "user", "content": most_recent_image_summary}
+
+    c.add_message({"role": "user", "content": content})
+
+
+if __name__ == "__main__":  # start the bot only when this file is run as a script
+    asyncio.run(main())
--- a/src/pipecat/transports/services/daily.py
+++ b/src/pipecat/transports/services/daily.py
@@ -495,9 +495,12 @@ class DailyTransportClient(EventHandler):
        video_source: str = "camera",
        color_format: str = "RGB",
    ):
-        # Only enable camera subscription on this participant
+        # Try to enable camera and screen subscription on this participant
        await self.update_subscriptions(
-            participant_settings={participant_id: {"media": "subscribed"}}
+            # participant_settings={participant_id: {"media": "subscribed"}}
+            participant_settings={
+                participant_id: {"media": {"camera": "subscribed", "screenVideo": "subscribed"}}
+            }
        )

        self._video_renderers[participant_id] = callback
Author	SHA1	Message	Date
Nikita Gamolsky	0265c1d3ef	allow interrupt	2024-11-02 16:12:29 -07:00
Nikita Gamolsky	ffa0e5a122	working with summary	2024-11-02 15:33:03 -07:00
Nikita Gamolsky	cdeab597b3	more variables	2024-11-02 14:05:19 -07:00
Nikita Gamolsky	abd486025b	more updates	2024-11-02 13:46:28 -07:00
Nikita Gamolsky	c4cdb2d809	update to use context global	2024-11-02 13:37:35 -07:00
Nikita Gamolsky	05ba10c969	update	2024-11-02 13:27:08 -07:00
Kwindla Hultman Kramer	2f80683dc4	initial commit of screen capture in 99-anthropic-hackathon.py	2024-11-02 10:42:31 -07:00