Merge pull request #64 from daily-co/docs

Some docs
This commit is contained in:
Moishe Lettvin
2024-03-18 13:38:32 -04:00
committed by GitHub
2 changed files with 57 additions and 1 deletions

View File

@@ -1,2 +1,17 @@
# Daily AI SDK Architecture Guide
## Frames
Frames can represent discrete chunks of data, for instance a chunk of text, a chunk of audio, or an image. They can also be used for control flow, for instance a frame that indicates that there is no more data available, or that a user started or stopped talking. They can also represent more complex data structures, such as a message array used for an LLM completion.
## FrameProcessors
Frame processors operate on frames. Every frame processor implements a `process_frame` method that consumes one frame and produces zero or more frames. Frame processors can do simple transforms, such as concatenating text fragments into sentences, or they can treat frames as input for an AI Service, and emit chat completions based on message arrays or transform text into audio or images.
## Pipelines
Pipelines are lists of frame processors that read from a source queue and send the processed frames to a sink queue. A very simple pipeline might chain an LLM frame processor to a text-to-speech frame processor, with a transport's send queue as its sink. Placing LLM message frames on the pipeline's source queue will cause the LLM's response to be spoken. See example #2 for an implementation of this.
## Transports
Transports provide a receive queue, which is input from "the outside world", and a sink queue, which is data that will be sent "to the outside world". The `LocalTransportService` does this with the local camera, mic, display and speaker. The `DailyTransportService` does this with a WebRTC session joined to a Daily.co room.

View File

@@ -17,14 +17,23 @@ class ControlFrame(Frame):
class StartFrame(ControlFrame):
    """Control frame that can (optionally) be used to start a pipeline.

    It also signals that an interruption is over and that the transport
    should resume processing frames.
    """
class EndFrame(ControlFrame):
    """Signal that a pipeline has ended.

    Frame processors and pipelines should shut down in response. A transport
    that receives this frame stops sending frames to its output channel(s)
    and closes all of its threads.
    """
class EndPipeFrame(ControlFrame):
    """Signal that a pipeline has ended while the transport keeps processing.

    Used in parallel pipelines and other sub-pipelines.
    """
@@ -39,15 +48,20 @@ class PipelineStartedFrame(ControlFrame):
class LLMResponseStartFrame(ControlFrame):
    """Mark the beginning of an LLM response.

    The TextFrames that follow belong to that response, up to the next
    LLMResponseEndFrame.
    """
class LLMResponseEndFrame(ControlFrame):
    """Mark the end of an LLM response."""
@dataclass()
class AudioFrame(Frame):
"""A chunk of audio. Will be played by the transport if the transport's mic
has been enabled."""
data: bytes
def __str__(self):
@@ -56,6 +70,8 @@ class AudioFrame(Frame):
@dataclass()
class ImageFrame(Frame):
"""An image. Will be shown by the transport if the transport's camera is
enabled."""
url: str | None
image: bytes
@@ -65,14 +81,19 @@ class ImageFrame(Frame):
@dataclass()
class SpriteFrame(Frame):
    """An animated sprite, held as a list of images.

    Will be shown by the transport if the transport's camera is enabled, and
    will play at the framerate specified in the transport's `fps` constructor
    parameter.
    """

    # The individual images of the animation, each as raw bytes.
    images: list[bytes]

    def __str__(self) -> str:
        """Return the class name and the number of images in the sprite."""
        # The class-name attribute is `__name__` (double underscores on both
        # sides); `self.__class__.name__` does not exist and would raise
        # AttributeError.
        return f"{self.__class__.__name__}, list size: {len(self.images)}"
@dataclass()
class TextFrame(Frame):
"""A chunk of text. Emitted by LLM services, consumed by TTS services, can
be used to send text through pipelines."""
text: str
def __str__(self):
@@ -81,17 +102,27 @@ class TextFrame(Frame):
@dataclass()
class TranscriptionQueueFrame(TextFrame):
    """Text frame that carries transcription-specific data.

    Placed in the transport's receive queue when a participant speaks.
    """

    # Presumably identifies the participant who spoke; verify against the
    # transport that builds this frame.
    participantId: str
    # Timestamp of the transcription, as a string; format is not specified
    # here.
    timestamp: str
@dataclass()
class LLMMessagesQueueFrame(Frame):
    """A frame containing a list of LLM messages.

    Used to signal that an LLM service should run a chat completion and emit
    an LLMResponseStartFrame, TextFrames and an LLMResponseEndFrame.

    Note that the `messages` property on this class is mutable, and will be
    updated by various ResponseAggregator frame processors.
    """

    # Mutable on purpose: aggregators update this list in place.
    messages: list[dict]
@dataclass()
class OpenAILLMContextFrame(Frame):
    """Counterpart of LLMMessagesQueueFrame with OpenAI-specific context.

    The context carried by this frame is mutable as well; the
    OpenAIContextAggregator frame processor changes it.
    """

    # Extra context specific to the OpenAI API.
    context: OpenAILLMContext
@@ -114,10 +145,15 @@ class SendAppMessageFrame(Frame):
class UserStartedSpeakingFrame(Frame):
    """Emitted by VAD to indicate that a participant has started speaking.

    Useful for interruptions, or any other case where detecting that someone
    is speaking matters more than knowing what they are saying (which is
    what a TranscriptionQueueFrame provides).
    """
class UserStoppedSpeakingFrame(Frame):
    """Emitted by the VAD to indicate that a user has stopped speaking."""
@@ -131,10 +167,15 @@ class BotStoppedSpeakingFrame(Frame):
@dataclass()
class LLMFunctionStartFrame(Frame):
    """Emitted when the LLM receives the start of a function call completion.

    A frame processor can treat this frame as a cue to begin preparing the
    function call, provided it can do so without any arguments.
    """

    # Name of the function the LLM has started to call.
    function_name: str
@dataclass()
class LLMFunctionCallFrame(Frame):
    """Emitted once the LLM has received a complete function call completion."""

    # Name of the function to call.
    function_name: str
    # The call's arguments as a string; the encoding is not specified here.
    arguments: str