From 1c27f77f1ab326bdbbcdeb7e09610ea20d427df9 Mon Sep 17 00:00:00 2001 From: Moishe Lettvin Date: Mon, 18 Mar 2024 08:39:50 -0400 Subject: [PATCH 1/3] drafty architecture doc --- docs/architecture.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/architecture.md b/docs/architecture.md index 5566a29ba..1c5946c9c 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -1,2 +1,17 @@ # Daily AI SDK Architecture Guide +## Frames + +Frames can represent discrete chunks of data, for instance a chunk of text, a chunk of audio, or an image. They can also be used for control flow, for instance a frame that indicates that there is no more data available, or that a user started or stopped talking. They can also represent more complex data structures, such as a message array used for an LLM completion. + +## FrameProcessors + +Frame processors operate on frames. Every frame processor implements a `process_frame` method that consumes one frame and produces zero or more frames. Frame processors can do simple transforms, such as concatenating text fragments into sentences, or they can treat frames as input for an AI Service, and emit chat completions based on message arrays or transform text into audio or images. + +## Pipelines + +Pipelines are lists of frame processors that read from a source queue and send the processed frames to a sink queue. A very simple pipeline might chain an LLM frame processor to a text-to-speech frame processor, with a transport's send queue as its sink. Placing LLM message frames on the pipeline's source queue will cause the LLM's response to be spoken. See example #2 for an implementation of this. + +## Transports + +Transports provide a receive queue, which is input from "the outside world", and a sink queue, which is data that will be sent "to the outside world". The `LocalTransportService` does this with the local camera, mic, display and speaker. 
The `DailyTransportService` does this with a WebRTC session joined to a Daily.co room. From 99b836c2277b5481df1465385d7817f7f14945fb Mon Sep 17 00:00:00 2001 From: Moishe Lettvin Date: Mon, 18 Mar 2024 09:08:12 -0400 Subject: [PATCH 2/3] added docstrings to frames. --- src/dailyai/pipeline/frames.py | 43 +++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/src/dailyai/pipeline/frames.py b/src/dailyai/pipeline/frames.py index d0d279bd9..bb2bf5a10 100644 --- a/src/dailyai/pipeline/frames.py +++ b/src/dailyai/pipeline/frames.py @@ -17,14 +17,23 @@ class ControlFrame(Frame): class StartFrame(ControlFrame): + """Used (but not required) to start a pipeline, and is also used to + indicate that an interruption has ended and the transport should start + processing frames again.""" pass class EndFrame(ControlFrame): + """Indicates that a pipeline has ended and frame processors and pipelines + should be shut down. If the transport receives this frame, it will stop + sending frames to its output channel(s) and close all its threads.""" pass class EndPipeFrame(ControlFrame): + """Indicates that a pipeline has ended but that the transport should + continue processing. This frame is used in parallel pipelines and other + sub-pipelines.""" pass @@ -39,15 +48,20 @@ class PipelineStartedFrame(ControlFrame): class LLMResponseStartFrame(ControlFrame): + """Used to indicate the beginning of an LLM response. Following TextFrames + are part of the LLM response until an LLMResponseEndFrame""" pass class LLMResponseEndFrame(ControlFrame): + """Indicates the end of an LLM response.""" pass @dataclass() class AudioFrame(Frame): + """A chunk of audio. Will be played by the transport if the transport's mic + has been enabled.""" data: bytes def __str__(self): @@ -56,6 +70,8 @@ class AudioFrame(Frame): @dataclass() class ImageFrame(Frame): + """An image. 
Will be shown by the transport if the transport's camera is + enabled.""" url: str | None image: bytes @@ -65,14 +81,19 @@ class ImageFrame(Frame): @dataclass() class SpriteFrame(Frame): + """An animated sprite. Will be shown by the transport if the transport's + camera is enabled. Will play at the framerate specified in the transport's + `fps` constructor parameter.""" images: list[bytes] def __str__(self): - return f"{self.__class__.name__}, list size: {len(self.images)}" + return f"{self.__class__.__name__}, list size: {len(self.images)}" @dataclass() class TextFrame(Frame): + """A chunk of text. Emitted by LLM services, consumed by TTS services, can + be used to send text through pipelines.""" text: str def __str__(self): @@ -81,17 +102,27 @@ class TextFrame(Frame): @dataclass() class TranscriptionQueueFrame(TextFrame): + """A text frame with transcription-specific data. Will be placed in the + transport's receive queue when a participant speaks.""" participantId: str timestamp: str @dataclass() class LLMMessagesQueueFrame(Frame): + """A frame containing a list of LLM messages. Used to signal that an LLM + service should run a chat completion and emit an LLMResponseStartFrame, + TextFrames and an LLMResponseEndFrame. + Note that the messages property on this class is mutable, and will be + updated by various ResponseAggregator frame processors.""" messages: List[dict] @dataclass() class OpenAILLMContextFrame(Frame): + """Like an LLMMessagesQueueFrame, but with extra context specific to the + OpenAI API. The context in this message is also mutable, and will be + changed by the OpenAIContextAggregator frame processor.""" context: OpenAILLMContext @@ -101,10 +132,15 @@ class AppMessageQueueFrame(Frame): class UserStartedSpeakingFrame(Frame): + """Emitted by VAD to indicate that a participant has started speaking. 
+ This can be used for interruptions or other times when detecting that + someone is speaking is more important than knowing what they're saying + (as you will with a TranscriptionQueueFrame)""" pass class UserStoppedSpeakingFrame(Frame): + """Emitted by the VAD to indicate that a user stopped speaking.""" pass @@ -118,10 +154,15 @@ class BotStoppedSpeakingFrame(Frame): @dataclass() class LLMFunctionStartFrame(Frame): + """Emitted when the LLM receives the beginngin of a function call + completion. A frame processor can use this frame to indicate that it should + start preparing to make a function call, if it can do so in the absence of + any arguments.""" function_name: str @dataclass() class LLMFunctionCallFrame(Frame): + """Emitted when the LLM has received an entire function call completions.""" function_name: str arguments: str From 3b61d0b41a217ae5499d7b0aa9b5801b64e760c5 Mon Sep 17 00:00:00 2001 From: Moishe Lettvin Date: Mon, 18 Mar 2024 13:38:00 -0400 Subject: [PATCH 3/3] fix typos --- src/dailyai/pipeline/frames.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dailyai/pipeline/frames.py b/src/dailyai/pipeline/frames.py index bb2bf5a10..194de910c 100644 --- a/src/dailyai/pipeline/frames.py +++ b/src/dailyai/pipeline/frames.py @@ -154,7 +154,7 @@ class BotStoppedSpeakingFrame(Frame): @dataclass() class LLMFunctionStartFrame(Frame): - """Emitted when the LLM receives the beginngin of a function call + """Emitted when the LLM receives the beginning of a function call completion. A frame processor can use this frame to indicate that it should start preparing to make a function call, if it can do so in the absence of any arguments.""" @@ -163,6 +163,6 @@ class LLMFunctionStartFrame(Frame): @dataclass() class LLMFunctionCallFrame(Frame): - """Emitted when the LLM has received an entire function call completions.""" + """Emitted when the LLM has received an entire function call completion.""" function_name: str arguments: str