some more changes

hackathon code
more tweaks
2024-02-25 21:51:08 -08:00 · 2024-02-25 21:41:55 -08:00 · 2024-02-22 22:18:06 +00:00 · 2024-02-22 16:14:36 -06:00 · 2024-02-22 15:39:21 -06:00 · 2024-02-22 14:45:38 -06:00
14 changed files with 804 additions and 58 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,11 +12,16 @@ dependencies = [
    "daily-python",
    "fal",
    "faster_whisper",
+    "groq",
    "google-cloud-texttospeech",
+    "numpy",
    "openai",
    "Pillow",
    "pyht",
    "python-dotenv",
+    "torch",
+    "torchaudio",
+    "pyaudio",
    "typing-extensions"
 ]

--- a/src/dailyai/queue_frame.py
+++ b/src/dailyai/queue_frame.py
@@ -23,6 +23,14 @@ class LLMResponseEndQueueFrame(QueueFrame):
    pass


+class UserStartedSpeakingFrame(QueueFrame):
+    """Marker frame with no payload, queued on a transport's receive queue
+    when the user is judged to have started speaking."""
+    pass
+
+
+class UserStoppedSpeakingFrame(QueueFrame):
+    """Marker frame with no payload, queued on a transport's receive queue
+    when the user is judged to have stopped speaking. LLMService.process_frame
+    treats it as the trigger to generate a response."""
+    pass
+
+
@dataclass()
 class AudioQueueFrame(QueueFrame):
    data: bytes
@@ -44,6 +52,17 @@ class TextQueueFrame(QueueFrame):
    text: str


+@dataclass()
+class TextQueueOutOfBandFrame(TextQueueFrame):
+    """Text frame flagged as out-of-band.
+
+    TTSService copies the flag onto the TTSCompletedFrame it emits, and the
+    transport does not append out-of-band completions to the conversation
+    context.
+    """
+    # Defaults to True so that constructing this type is enough to set the flag
+    outOfBand: bool = True
+
+
+@dataclass()
+class TTSCompletedFrame(QueueFrame):
+    """Emitted by TTSService after the audio frames for ``text`` have been yielded."""
+    # The text that was synthesized
+    text: str
+    # True when the source text frame was out-of-band; the transport then
+    # leaves this text out of the conversation context
+    outOfBand: bool = False
+
+
@dataclass()
 class TranscriptionQueueFrame(TextQueueFrame):
    participantId: str
--- a/src/dailyai/services/ai_services.py
+++ b/src/dailyai/services/ai_services.py
@@ -2,9 +2,11 @@ import asyncio
 import io
 import logging
 import time
+import datetime
 import wave

 from dailyai.queue_frame import (
+    QueueFrame,
    AudioQueueFrame,
    ControlQueueFrame,
    EndStreamQueueFrame,
@@ -13,7 +15,9 @@ from dailyai.queue_frame import (
    LLMResponseEndQueueFrame,
    QueueFrame,
    TextQueueFrame,
+    TTSCompletedFrame,
    TranscriptionQueueFrame,
+    UserStoppedSpeakingFrame
 )

 from abc import abstractmethod
@@ -80,6 +84,11 @@ class AIService:


 class LLMService(AIService):
+
+    def __init__(self, context):
+        """Keep a reference to the conversation context.
+
+        context: message collection that process_frame passes unchanged to
+            run_llm_async whenever a UserStoppedSpeakingFrame arrives
+        """
+        super().__init__()
+        self._context = context
+
    @abstractmethod
    async def run_llm_async(self, messages) -> AsyncGenerator[str, None]:
        yield ""
@@ -89,9 +98,20 @@ class LLMService(AIService):
        pass

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
-        if isinstance(frame, LLMMessagesQueueFrame):
-            async for text_chunk in self.run_llm_async(frame.messages):
-                yield TextQueueFrame(text_chunk)
+        # A UserStoppedSpeakingFrame triggers an LLM run over self._context; the
+        # trigger frame itself is consumed. Every other frame passes through
+        # unchanged.
+        print(f"##### process frame got a frame, {type(frame)}")
+        if isinstance(frame, UserStoppedSpeakingFrame):
+            print(
+                f"### Got a user stopped speaking frame, context is {self._context}")
+            async for chunk in self.run_llm_async(self._context):
+                # if we get a string, wrap it in a frame
+                if isinstance(chunk, str):
+                    yield TextQueueFrame(chunk)
+                # if we get a frame, pass it through
+                elif isinstance(chunk, QueueFrame):
+                    print(f"### Got a frame chunk: {chunk}")
+                    yield chunk
+                else:
+                    # Anything else is only reported, never forwarded downstream
+                    print(f"### Got an unknown chunk: {chunk}")
            yield LLMResponseEndQueueFrame()
        else:
            yield frame
@@ -116,6 +136,12 @@ class TTSService(AIService):

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        if not isinstance(frame, TextQueueFrame):
+            # Not a text frame: nothing to speak, so pass it through unchanged
+            yield frame
+            return
+
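+        # TranscriptionQueueFrame subclasses TextQueueFrame, so it gets past the
+        # check above; transcriptions are passed through rather than spoken.
+        # TODO-CB: Clean this up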
+        if isinstance(frame, TranscriptionQueueFrame):
            yield frame
            return

@@ -130,7 +156,11 @@ class TTSService(AIService):

        if text:
            async for audio_chunk in self.run_tts(text):
-                yield AudioQueueFrame(audio_chunk)
+                size = 8000
+                for i in range(0, len(audio_chunk), size):
+                    yield AudioQueueFrame(audio_chunk[i: i+size])
+            print("### ABOUT TO YIELD TTS COMPLETED FRAME", frame)
+            yield TTSCompletedFrame(text, hasattr(frame, 'outOfBand') and frame.outOfBand)

    async def finalize(self):
        if self.current_sentence:
@@ -200,8 +230,9 @@ class FrameLogger(AIService):

    async def process_frame(self, frame: QueueFrame) -> AsyncGenerator[QueueFrame, None]:
        if isinstance(frame, (AudioQueueFrame, ImageQueueFrame)):
-            self.logger.info(f"{self.prefix}: {type(frame)}")
+            self.logger.info(
+                f"{datetime.datetime.utcnow().isoformat()} {self.prefix}: {type(frame)}")
        else:
-            print(f"{self.prefix}: {frame}")
+            print(f"{datetime.datetime.utcnow().isoformat()} {self.prefix}: {frame}")

        yield frame
--- a/src/dailyai/services/azure_ai_services.py
+++ b/src/dailyai/services/azure_ai_services.py
@@ -42,14 +42,16 @@ class AzureTTSService(TTSService):
            yield result.audio_data[44:]
        elif result.reason == ResultReason.Canceled:
            cancellation_details = result.cancellation_details
-            self.logger.info("Speech synthesis canceled: {}".format(cancellation_details.reason))
+            self.logger.info("Speech synthesis canceled: {}".format(
+                cancellation_details.reason))
            if cancellation_details.reason == CancellationReason.Error:
-                self.logger.info("Error details: {}".format(cancellation_details.error_details))
+                self.logger.info("Error details: {}".format(
+                    cancellation_details.error_details))


 class AzureLLMService(LLMService):
-    def __init__(self, *, api_key, endpoint, api_version="2023-12-01-preview", model):
-        super().__init__()
+    def __init__(self, *, api_key, endpoint, api_version="2023-12-01-preview", model, context):
+        super().__init__(context)
        self._model: str = model

        self._client = AsyncAzureOpenAI(
@@ -102,7 +104,8 @@ class AzureImageGenServiceREST(ImageGenService):

    async def run_image_gen(self, sentence) -> tuple[str, bytes]:
        url = f"{self._azure_endpoint}openai/images/generations:submit?api-version={self._api_version}"
-        headers = {"api-key": self._api_key, "Content-Type": "application/json"}
+        headers = {"api-key": self._api_key,
+                   "Content-Type": "application/json"}
        body = {
            # Enter your prompt text here
            "prompt": sentence,
--- a/src/dailyai/services/base_transport_service.py
+++ b/src/dailyai/services/base_transport_service.py
@@ -1,11 +1,23 @@
 from abc import abstractmethod
 import asyncio
+import copy
+import functools
 import itertools
 import logging
 import queue
 import threading
 import time
 from typing import AsyncGenerator
+import numpy as np
+import pyaudio
+import torch
+import torchaudio
+from enum import Enum
+import datetime
+import traceback
+
+from typing import AsyncGenerator, AsyncIterable, BinaryIO, Iterable
+from dailyai.queue_aggregators import LLMAssistantContextAggregator, LLMUserContextAggregator

 from dailyai.queue_frame import (
    AudioQueueFrame,
@@ -14,8 +26,59 @@ from dailyai.queue_frame import (
    QueueFrame,
    SpriteQueueFrame,
    StartStreamQueueFrame,
+    TranscriptionQueueFrame,
+    TTSCompletedFrame,
+    UserStartedSpeakingFrame,
+    UserStoppedSpeakingFrame
 )

+torch.set_num_threads(1)
+
+model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
+                              model='silero_vad',
+                              force_reload=False)
+
+(get_speech_timestamps,
+ save_audio,
+ read_audio,
+ VADIterator,
+ collect_chunks) = utils
+
+# Taken from utils_vad.py
+
+
+def validate(model, inputs: torch.Tensor):
+    """Call ``model`` on ``inputs`` with gradient tracking disabled.
+
+    Returns whatever the model call returns.
+    """
+    with torch.no_grad():
+        return model(inputs)
+
+# Provided by Alexander Veysov
+
+
+def int2float(sound):
+    """Convert int16 PCM samples to float32 scaled by 1/32768.
+
+    sound: numpy array of int16 samples
+    returns: float32 array with size-1 axes squeezed out
+
+    The scale is applied unconditionally. Guarding on ``np.abs(sound).max()``
+    raises ValueError for an empty buffer, and because abs(-32768) overflows
+    int16 it leaves a buffer holding only 0 and -32768 samples unscaled.
+    Scaling silence is a no-op, so no guard is needed.
+    """
+    sound = np.asarray(sound).astype('float32')
+    sound *= 1/32768
+    return sound.squeeze()  # depends on the use case
+
+
+FORMAT = pyaudio.paInt16
+CHANNELS = 1
+SAMPLE_RATE = 16000
+CHUNK = int(SAMPLE_RATE / 10)
+
+audio = pyaudio.PyAudio()
+
+
+class VADState(Enum):
+    # States of the debounce state machine stepped by BaseTransportService._vad
+    QUIET = 1  # no speech in progress
+    STARTING = 2  # speech seen; counting speech frames before confirming a start
+    SPEAKING = 3  # start confirmed
+    STOPPING = 4  # silence seen while speaking; counting frames before confirming a stop
+

 class BaseTransportService():

@@ -31,6 +94,17 @@ class BaseTransportService():
        self._speaker_enabled = kwargs.get("speaker_enabled") or False
        self._speaker_sample_rate = kwargs.get("speaker_sample_rate") or 16000
        self._fps = kwargs.get("fps") or 8
+        self._vad_start_s = kwargs.get("vad_start_s") or 0.2
+        self._vad_stop_s = kwargs.get("vad_stop_s") or 0.5
+        self._context = kwargs.get("context") or []
+
+        self._vad_samples = 1536
+        vad_frame_s = self._vad_samples / SAMPLE_RATE
+        self._vad_start_frames = round(self._vad_start_s / vad_frame_s)
+        self._vad_stop_frames = round(self._vad_stop_s / vad_frame_s)
+        self._vad_starting_count = 0
+        self._vad_stopping_count = 0
+        self._vad_state = VADState.QUIET

        duration_minutes = kwargs.get("duration_minutes") or 10
        self._expiration = time.time() + duration_minutes * 60
@@ -41,6 +115,8 @@ class BaseTransportService():
        self._threadsafe_send_queue = queue.Queue()

        self._images = None
+        self._user_is_speaking = False
+        self._current_phrase = ""

        try:
            self._loop: asyncio.AbstractEventLoop | None = asyncio.get_running_loop()
@@ -52,20 +128,94 @@ class BaseTransportService():

        self._logger: logging.Logger = logging.getLogger()

+    def update_messages(self, new_context: list[dict[str, str]], task: asyncio.Task | None):
+        """Done-callback for a response task: commit the context when it finishes.
+
+        new_context: context to install as self._context
+        task: the finished task; nothing happens when it is None or was
+            cancelled (a cancelled response must not reset the phrase/context)
+
+        A failure inside the task is logged here. run_conversation never awaits
+        the task, so this callback is the one place its exception can be
+        retrieved and reported.
+        """
+        if not task or task.cancelled():
+            return
+
+        error = task.exception()
+        if error is not None:
+            self._logger.error(f"Exception in response pipeline: {error!r}")
+
+        self._current_phrase = ""
+        self._context = new_context
+
+    def append_to_context(self, role, chunk_or_text):
+        """Add a message, or more text for the latest message, to self._context.
+
+        role: role to record for plain text ("user", "assistant", ...)
+        chunk_or_text: either a str, which is merged into the last message when
+            that message has the same role and otherwise starts a new message;
+            or a ready-made message dict, which is appended as-is unless its
+            "outOfBand" entry is True
+        """
+        print("IN APPEND", chunk_or_text)
+        # if we get a non-string, append it to the context without further error checking
+        # unless the outOfBand property is True
+        if not isinstance(chunk_or_text, str):
+
+            if not chunk_or_text.get("outOfBand") == True:
+                self._context.append(chunk_or_text)
+            return
+
+        text = chunk_or_text
+        # The context can be empty (it defaults to []), so there may be no
+        # previous message to merge into; indexing [-1] blindly would raise.
+        last_context_item = self._context[-1] if self._context else None
+
+        print("TEXT", text)
+        print("LAST CONTEXT ITEM", last_context_item)
+        traceback.print_stack()
+
+        if last_context_item and last_context_item['role'] == role:
+            last_context_item['content'] += f" {text}"
+        else:
+            self._context.append({"role": role, "content": text})
+
+    async def run_pipeline(self, frame):
+        """Call the runner saved by run_conversation with ``frame`` and await it."""
+        print(f"starting to speak_after_delay, {frame}")
+        # TODO-CB: This exception for missing class gets eaten!
+        # NOTE(review): self._runner is assigned in run_conversation; presumably
+        # this is never called before that - confirm.
+        await self._runner(frame)
+
+    async def run_conversation(self, runner: Iterable[QueueFrame]
+                               | AsyncIterable[QueueFrame]
+                               | asyncio.Queue[QueueFrame],
+                               ) -> AsyncGenerator[QueueFrame, None]:
+        """Start a response task each time the user stops speaking.
+
+        Consumes get_receive_frames() until an EndStreamQueueFrame arrives.
+        Only UserStoppedSpeakingFrame does anything: any response task already
+        started is cancelled and the transport interrupted, then a new task
+        running run_pipeline(frame) is created.
+
+        NOTE(review): the annotations do not match the body - ``runner`` is
+        awaited as a callable in run_pipeline, and this coroutine never yields.
+        """
+        current_response_task = None
+        self._runner = runner
+
+        async for frame in self.get_receive_frames():
+            print(f"got frame of type: {type(frame)}, {frame}")
+            if isinstance(frame, EndStreamQueueFrame):
+                break
+            # elif not isinstance(frame, TranscriptionQueueFrame):
+                # continue
+            # TODO-CB: Verify this is an accurate replacement
+            # if hasattr(frame, 'participantId') and frame.participantId == self._my_participant_id:
+            if not isinstance(frame, UserStoppedSpeakingFrame):
+                continue
+
+            if current_response_task:
+                # TODO-CB: Maybe not always interrupt? Are there frame types we can pass through?
+                current_response_task.cancel()
+                self.interrupt()
+
+            # self._current_phrase += " " + frame.text
+           # current_llm_context = copy.deepcopy(self._context)
+            current_response_task = asyncio.create_task(
+                self.run_pipeline(
+                    frame)
+            )
+            # The callback receives the same list object as self._context (no
+            # copy is taken), so update_messages re-installs that list.
+            current_response_task.add_done_callback(
+                functools.partial(self.update_messages, self._context)
+            )
+
    async def run(self):
        self._prerun()

-        async_output_queue_marshal_task = asyncio.create_task(self._marshal_frames())
+        async_output_queue_marshal_task = asyncio.create_task(
+            self._marshal_frames())

-        self._camera_thread = threading.Thread(target=self._run_camera, daemon=True)
+        self._camera_thread = threading.Thread(
+            target=self._run_camera, daemon=True)
        self._camera_thread.start()

-        self._frame_consumer_thread = threading.Thread(target=self._frame_consumer, daemon=True)
+        self._frame_consumer_thread = threading.Thread(
+            target=self._frame_consumer, daemon=True)
        self._frame_consumer_thread.start()

        if self._speaker_enabled:
-            self._receive_audio_thread = threading.Thread(target=self._receive_audio, daemon=True)
-            self._receive_audio_thread.start()
+            # TODO-CB: This is interesting
+            # self._receive_audio_thread = threading.Thread(
+            #     target=self._receive_audio, daemon=True)
+            # self._receive_audio_thread.start()
+
+            self._vad_thread = threading.Thread(target=self._vad, daemon=True)
+            self._vad_thread.start()

        try:
            while (
@@ -122,6 +272,61 @@ class BaseTransportService():
    def _prerun(self):
        pass

+    def _vad(self):
+        """Thread body: run voice activity detection on incoming audio.
+
+        Loops until self._stop_threads is set. Each pass reads
+        self._vad_samples samples, scores them with the module-level silero
+        ``model``, and steps the VADState machine. A start is confirmed after
+        self._vad_start_frames speech chunks in a row and a stop after
+        self._vad_stop_frames silent chunks in a row; each confirmation puts a
+        UserStartedSpeakingFrame / UserStoppedSpeakingFrame on receive_queue.
+        """
+        # CB: Starting silero VAD stuff
+        # TODO-CB: Probably need to force virtual speaker creation if we're
+        # going to build this in?
+        # TODO-CB: pyaudio installation
+        while not self._stop_threads.is_set():
+            audio_chunk = self.read_audio_frames(self._vad_samples)
+            audio_int16 = np.frombuffer(audio_chunk, np.int16)
+            audio_float32 = int2float(audio_int16)
+            # NOTE(review): the literal 16000 duplicates SAMPLE_RATE - confirm
+            # the two are meant to stay in step.
+            new_confidence = model(
+                torch.from_numpy(audio_float32), 16000).item()
+            # Chunks scoring above 0.5 count as speech
+            speaking = new_confidence > 0.5
+
+            if speaking:
+                match self._vad_state:
+                    case VADState.QUIET:
+                        self._vad_state = VADState.STARTING
+                        self._vad_starting_count = 1
+                    case VADState.STARTING:
+                        self._vad_starting_count += 1
+                    case VADState.STOPPING:
+                        # Speech resumed before the stop was confirmed
+                        self._vad_state = VADState.SPEAKING
+                        self._vad_stopping_count = 0
+            else:
+                match self._vad_state:
+                    case VADState.STARTING:
+                        # Silence before the start was confirmed: discard the attempt
+                        self._vad_state = VADState.QUIET
+                        self._vad_starting_count = 0
+                    case VADState.SPEAKING:
+                        self._vad_state = VADState.STOPPING
+                        self._vad_stopping_count = 1
+                    case VADState.STOPPING:
+                        self._vad_stopping_count += 1
+
+            if self._vad_state == VADState.STARTING and self._vad_starting_count >= self._vad_start_frames:
+                print(
+                    f'!!! {datetime.datetime.utcnow().isoformat()} queueing start frame')
+                # This runs outside the event loop's thread, so the put is
+                # scheduled onto self._loop instead of being awaited here.
+                asyncio.run_coroutine_threadsafe(
+                    self.receive_queue.put(
+                        UserStartedSpeakingFrame()), self._loop
+                )
+                print(f"!!! VAD started, calling interrupt")
+                self.interrupt()
+                self._vad_state = VADState.SPEAKING
+                self._vad_starting_count = 0
+            if self._vad_state == VADState.STOPPING and self._vad_stopping_count >= self._vad_stop_frames:
+                print(
+                    f'!!! {datetime.datetime.utcnow().isoformat()} queueing stop frame')
+                asyncio.run_coroutine_threadsafe(
+                    self.receive_queue.put(
+                        UserStoppedSpeakingFrame()), self._loop
+                )
+                self._vad_state = VADState.QUIET
+                self._vad_stopping_count = 0
+
    async def _marshal_frames(self):
        while True:
            frame: QueueFrame | list = await self.send_queue.get()
@@ -131,6 +336,7 @@ class BaseTransportService():
                break

    def interrupt(self):
+        print(f"!!! setting interrupt")
        self._is_interrupted.set()

    async def get_receive_frames(self) -> AsyncGenerator[QueueFrame, None]:
@@ -205,7 +411,6 @@ class BaseTransportService():
                        if frame:
                            if isinstance(frame, AudioQueueFrame):
                                chunk = frame.data
-
                                all_audio_frames.extend(chunk)

                                b.extend(chunk)
@@ -213,21 +418,27 @@ class BaseTransportService():
                                    len(b) % smallest_write_size
                                )
                                if truncated_length:
-                                    self.write_frame_to_mic(bytes(b[:truncated_length]))
+                                    self.write_frame_to_mic(
+                                        bytes(b[:truncated_length]))
                                    b = b[truncated_length:]
                            elif isinstance(frame, ImageQueueFrame):
                                self._set_image(frame.image)
                            elif isinstance(frame, SpriteQueueFrame):
                                self._set_images(frame.images)
+                            elif isinstance(frame, TTSCompletedFrame) and not frame.outOfBand:
+                                self.append_to_context(
+                                    "assistant", frame.text)
                        elif len(b):
                            self.write_frame_to_mic(bytes(b))
                            b = bytearray()
                    else:
                        # if there are leftover audio bytes, write them now; failing to do so
                        # can cause static in the audio stream.
+                        print(f"!!! interrupted, flushing audio")
                        if len(b):
                            truncated_length = len(b) - (len(b) % 160)
-                            self.write_frame_to_mic(bytes(b[:truncated_length]))
+                            self.write_frame_to_mic(
+                                bytes(b[:truncated_length]))
                            b = bytearray()

                        if isinstance(frame, StartStreamQueueFrame):
@@ -240,5 +451,6 @@ class BaseTransportService():

                b = bytearray()
            except Exception as e:
-                self._logger.error(f"Exception in frame_consumer: {e}, {len(b)}")
+                self._logger.error(
+                    f"Exception in frame_consumer: {e}, {len(b)}")
                raise e
--- a/src/dailyai/services/daily_transport_service.py
+++ b/src/dailyai/services/daily_transport_service.py
@@ -1,18 +1,4 @@
-import asyncio
-import inspect
-import logging
-import signal
-import threading
-import types
-
-from functools import partial
-
-from dailyai.queue_frame import (
-    TranscriptionQueueFrame,
-)
-
-from threading import Event
-
+from dailyai.services.base_transport_service import BaseTransportService
 from daily import (
    EventHandler,
    CallClient,
@@ -21,8 +7,61 @@ from daily import (
    VirtualMicrophoneDevice,
    VirtualSpeakerDevice,
 )
+from threading import Event
+from dailyai.queue_frame import (
+    TranscriptionQueueFrame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame
+)
+from functools import partial
+import types
+import pyaudio
+import torchaudio
+import asyncio
+import inspect
+import io
+import logging
+import numpy as np
+import signal
+import threading
+import torch
+torch.set_num_threads(1)

-from dailyai.services.base_transport_service import BaseTransportService
+model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
+                              model='silero_vad',
+                              force_reload=False)
+
+(get_speech_timestamps,
+ save_audio,
+ read_audio,
+ VADIterator,
+ collect_chunks) = utils
+
+# Taken from utils_vad.py
+
+
+def validate(model, inputs: torch.Tensor):
+    """Call ``model`` on ``inputs`` with gradient tracking disabled.
+
+    Returns whatever the model call returns.
+    """
+    with torch.no_grad():
+        return model(inputs)
+
+# Provided by Alexander Veysov
+
+
+def int2float(sound):
+    """Convert int16 PCM samples to float32 scaled by 1/32768.
+
+    sound: numpy array of int16 samples
+    returns: float32 array with size-1 axes squeezed out
+
+    The scale is applied unconditionally. Guarding on ``np.abs(sound).max()``
+    raises ValueError for an empty buffer, and because abs(-32768) overflows
+    int16 it leaves a buffer holding only 0 and -32768 samples unscaled.
+    Scaling silence is a no-op, so no guard is needed.
+    """
+    sound = np.asarray(sound).astype('float32')
+    sound *= 1/32768
+    return sound.squeeze()  # depends on the use case
+
+
+FORMAT = pyaudio.paInt16
+CHANNELS = 1
+SAMPLE_RATE = 16000
+CHUNK = int(SAMPLE_RATE / 10)
+
+audio = pyaudio.PyAudio()


 class DailyTransportService(BaseTransportService, EventHandler):
@@ -45,7 +84,8 @@ class DailyTransportService(BaseTransportService, EventHandler):
        start_transcription: bool = False,
        **kwargs,
    ):
-        super().__init__(**kwargs)  # This will call BaseTransportService.__init__ method, not EventHandler
+        # This will call BaseTransportService.__init__ method, not EventHandler
+        super().__init__(**kwargs)

        self._room_url: str = room_url
        self._bot_name: str = bot_name
@@ -80,7 +120,8 @@ class DailyTransportService(BaseTransportService, EventHandler):
            for handler in self._event_handlers[event_name]:
                if inspect.iscoroutinefunction(handler):
                    if self._loop:
-                        asyncio.run_coroutine_threadsafe(handler(*args, **kwargs), self._loop)
+                        asyncio.run_coroutine_threadsafe(
+                            handler(*args, **kwargs), self._loop)
                    else:
                        raise Exception(
                            "No event loop to run coroutine. In order to use async event handlers, you must run the DailyTransportService in an asyncio event loop.")
@@ -92,7 +133,8 @@ class DailyTransportService(BaseTransportService, EventHandler):

    def add_event_handler(self, event_name: str, handler):
        if not event_name.startswith("on_"):
-            raise Exception(f"Event handler {event_name} must start with 'on_'")
+            raise Exception(
+                f"Event handler {event_name} must start with 'on_'")

        methods = inspect.getmembers(self, predicate=inspect.ismethod)
        if event_name not in [method[0] for method in methods]:
@@ -105,7 +147,8 @@ class DailyTransportService(BaseTransportService, EventHandler):
                    handler, self)]
            setattr(self, event_name, partial(self._patch_method, event_name))
        else:
-            self._event_handlers[event_name].append(types.MethodType(handler, self))
+            self._event_handlers[event_name].append(
+                types.MethodType(handler, self))

    def event_handler(self, event_name: str):
        def decorator(handler):
@@ -149,7 +192,8 @@ class DailyTransportService(BaseTransportService, EventHandler):
            Daily.select_speaker_device("speaker")

        self.client.set_user_name(self._bot_name)
-        self.client.join(self._room_url, self._token, completion=self.call_joined)
+        self.client.join(self._room_url, self._token,
+                         completion=self.call_joined)
        self._my_participant_id = self.client.participants()["local"]["id"]

        self.client.update_inputs(
@@ -232,18 +276,41 @@ class DailyTransportService(BaseTransportService, EventHandler):
        if len(self.client.participants()) < self._min_others_count + 1:
            self._stop_threads.set()

+    async def insert_speech(self, text, sender, date):
+        """Inject ``text`` into the conversation as if ``sender`` had spoken it.
+
+        Queues a UserStartedSpeakingFrame, feeds a synthetic transcription
+        message through on_transcription_message, then queues a
+        UserStoppedSpeakingFrame, pausing 0.3 s between the steps.
+
+        text: the words to treat as spoken
+        sender: participant id to attribute the words to
+        date: value used as the transcription timestamp
+        """
+        await self.receive_queue.put(UserStartedSpeakingFrame())
+        await asyncio.sleep(0.3)
+
+        # Attribute the text to the actual sender instead of a fixed id, so
+        # on_transcription_message can compare it against the bot's own id.
+        self.on_transcription_message({
+            "text": text,
+            "participantId": sender,
+            "timestamp": date
+        })
+
+        await asyncio.sleep(0.3)
+        await self.receive_queue.put(UserStoppedSpeakingFrame())
+
    def on_app_message(self, message, sender):
-        pass
+        # Treat an app message as injected speech: schedule insert_speech on
+        # the event loop. Does nothing when there is no loop.
+        # NOTE(review): assumes ``message`` has "message" and "date" keys; a
+        # missing key raises KeyError here - confirm against senders.
+        if self._loop:
+            print("APP MESSAGE", message)
+            asyncio.run_coroutine_threadsafe(
+                self.insert_speech(message["message"], sender, message["date"]), self._loop)

    def on_transcription_message(self, message: dict):
        if self._loop:
+            print(f"transcription: {message}")
            participantId = ""
            if "participantId" in message:
                participantId = message["participantId"]
            elif "session_id" in message:
                participantId = message["session_id"]
-            frame = TranscriptionQueueFrame(message["text"], participantId, message["timestamp"])
-            asyncio.run_coroutine_threadsafe(self.receive_queue.put(frame), self._loop)
+            frame = TranscriptionQueueFrame(
+                message["text"], participantId, message["timestamp"])
+            # Record the text as a "user" turn unless it came from this
+            # transport's own participant id (or that id is not set yet).
+            if self._my_participant_id and participantId != self._my_participant_id:
+                self.append_to_context("user", message["text"])
+            # The frame is queued regardless of who spoke
+            asyncio.run_coroutine_threadsafe(
+                self.receive_queue.put(frame), self._loop)

    def on_transcription_stopped(self, stopped_by, stopped_by_error):
        pass
--- a/src/dailyai/services/fal_ai_services.py
+++ b/src/dailyai/services/fal_ai_services.py
@@ -32,7 +32,8 @@ class FalImageGenService(ImageGenService):
            handler = fal.apps.submit(
                "110602490-fast-sdxl",
                arguments={
-                    "prompt": sentence
+                    "prompt": sentence,
+                    "seed": 23
                },
            )
            for event in handler.iter_events():
--- a/src/dailyai/services/fireworks_ai_services.py
+++ b/src/dailyai/services/fireworks_ai_services.py
@@ -0,0 +1,122 @@
+import aiohttp
+from PIL import Image
+import io
+from openai import AsyncOpenAI
+
+import asyncio
+import json
+from collections.abc import AsyncGenerator
+
+from dailyai.services.ai_services import LLMService, ImageGenService
+
+from dailyai.queue_frame import (TextQueueFrame, TextQueueOutOfBandFrame)
+
+
+class FireworksLLMService(LLMService):
+    """LLM service that talks to Fireworks through the OpenAI async client.
+
+    Streams text from the chat completions endpoint and handles a tool call
+    by recording it in the transport's context and starting the
+    ``change_appearance`` coroutine in the background.
+    """
+
+    def __init__(self, *, api_key, model="", tools=None, context, change_appearance, transport=""):
+        """Create the client.
+
+        api_key: Fireworks API key
+        model: model name sent with every completion request
+        tools: tool definitions sent with completion requests; defaults to
+            an empty list
+        context: conversation context, stored by LLMService
+        change_appearance: coroutine function called with the "appearance"
+            argument of a tool call
+        transport: object whose append_to_context is called when a tool call
+            completes
+        """
+        super().__init__(context)
+        self._model = model
+        # Build the default per instance; a ``[]`` default in the signature
+        # would be one list shared by every service created without tools.
+        self._tools = [] if tools is None else tools
+        self._change_appearance = change_appearance
+        self._transport = transport
+        # Strong references to fire-and-forget tasks. The event loop keeps only
+        # weak references, so an unreferenced task can be collected mid-run.
+        self._background_tasks = set()
+        self._client = AsyncOpenAI(
+            api_key=api_key,
+            base_url="https://api.fireworks.ai/inference/v1"
+        )
+
+    async def get_response(self, messages, stream):
+        """Request a chat completion for ``messages`` and return the client's result."""
+        print("GET RESPONSE ... WHEN DO WE EXPECT THIS TO BE CALLED?")
+        return await self._client.chat.completions.create(
+            stream=stream,
+            messages=messages,
+            model=self._model,
+            temperature=0.1,
+            tools=self._tools
+        )
+
+    async def run_llm_async(self, messages) -> AsyncGenerator[str | TextQueueOutOfBandFrame, None]:
+        """Stream a completion for ``messages``.
+
+        Yields text content as str chunks. If the stream finishes with an
+        accumulated tool call, records the call in the transport's context,
+        starts ``change_appearance`` in the background and yields one
+        TextQueueOutOfBandFrame acknowledgement.
+        """
+        print("IN ASYNC")
+        messages_for_log = json.dumps(messages)
+        self.logger.debug(f"Generating chat via openai: {messages_for_log}")
+
+        chunks = await self._client.chat.completions.create(
+            model=self._model,
+            stream=True,  # BLARGH
+            messages=messages,
+            temperature=0.1,
+            tools=self._tools
+        )
+
+        # Accumulates id, name and argument text of the (single) streamed tool call
+        tool_call = {}
+
+        async for chunk in chunks:
+            print(f"CHUNK: {chunk}")
+            if len(chunk.choices) == 0:
+                continue
+
+            if chunk.choices[0].delta.content:
+                yield chunk.choices[0].delta.content
+
+            if chunk.choices[0].delta.tool_calls:
+                print(f"TOOL CALLS: {chunk.choices[0].delta.tool_calls[0]}")
+                # A delta carrying a function name starts a new tool call;
+                # later deltas only add argument text.
+                if chunk.choices[0].delta.tool_calls[0].function.name:
+                    tool_call["id"] = chunk.choices[0].delta.tool_calls[0].id
+                    tool_call["name"] = chunk.choices[0].delta.tool_calls[0].function.name
+                    tool_call["arguments"] = ''
+                if chunk.choices[0].delta.tool_calls[0].function.arguments:
+                    tool_call["arguments"] += chunk.choices[0].delta.tool_calls[0].function.arguments
+
+            if chunk.choices[0].finish_reason:
+                print(f"TOOL CALLS ACCUM -- {tool_call}")
+                if tool_call.get("name"):
+                    # hard coding tool call action for now. we should assemble the tool call
+                    # from the streaming response, then yield it to the pipeline.
+                    # this approach works for the first few change appearance requests but
+                    # then the model starts refusing. need to read more about function
+                    # calling, try this with the OpenAI APIs, and talk to the Fireworks people.
+                    self._transport.append_to_context("assistant", {
+                        # pipeline will append the content to this context after it goes
+                        # through tts. we need to manually append the tool call, though
+                        "content": "",
+                        "role": "assistant",
+                        "tool_calls": [
+                            {
+                                "id": tool_call["id"],
+                                "type": "function",
+                                "index": 0,
+                                "function": {
+                                    "name": tool_call["name"],
+                                    "arguments": tool_call["arguments"]
+                                },
+                            }
+                        ],
+                    })
+                    self._transport.append_to_context("tool", {
+                        "content": "image generated by prompt arguments: " + tool_call["arguments"],
+                        "role": "tool",
+                        "tool_call_id": tool_call["id"]
+                    })
+                    self._transport.append_to_context("assistant", {
+                        "content": f"call to {tool_call['name']} function succeeded",
+                        "role": "assistant",
+                    })
+                    print("APPENDED TO CONTEXT")
+                    image_prompt = json.loads(
+                        tool_call["arguments"]).get("appearance")
+                    print("IMAGE PROMPT", image_prompt)
+                    # Hold a reference until the task finishes, then drop it
+                    appearance_task = asyncio.create_task(
+                        self._change_appearance(image_prompt))
+                    self._background_tasks.add(appearance_task)
+                    appearance_task.add_done_callback(
+                        self._background_tasks.discard)
+                    yield TextQueueOutOfBandFrame("Sure, let me work on that for you!")
+                    # yield {"content": "Sure, let me work on that for you!"}
+                    # yield "Sure, let me work on that for you!"
+
+    async def run_llm(self, messages) -> str | None:
+        """Return the first choice's message content of a non-streaming
+        completion, or None when no choices come back."""
+        print("--> IN SYNC ... WHEN DO WE EXPECT THIS TO BE CALLED?")
+        messages_for_log = json.dumps(messages)
+        self.logger.debug(f"Generating chat via openai: {messages_for_log}")
+
+        response = await self._client.chat.completions.create(model=self._model, stream=False, messages=messages)
+        if response and len(response.choices) > 0:
+            return response.choices[0].message.content
+        else:
+            return None
--- a/src/dailyai/services/groq_ai_services.py
+++ b/src/dailyai/services/groq_ai_services.py
@@ -0,0 +1,33 @@
+import os
+import groq
+from groq import AsyncGroq
+from dailyai.services.ai_services import LLMService
+from collections.abc import AsyncGenerator
+
+
+class GroqLLMService(LLMService):
+    def __init__(self, *, api_key, model="mixtral-8x7b-32768", context):
+        super().__init__(context)
+        self._model = model
+        # Hand the key to the client; a bare AsyncGroq() ignores `api_key` and only reads GROQ_API_KEY.
+        self._client = AsyncGroq(api_key=api_key)
+
+    async def run_llm_async(self, messages) -> AsyncGenerator[str, None]:
+        """Yield the whole (non-streamed) completion text once; Groq API errors are printed, not raised."""
+        print(f"messages are {messages}")
+        try:
+            resp = await self._client.chat.completions.create(messages=messages, model=self._model)
+            print(f"got chunks from groq: {resp}")
+
+            if resp.choices and resp.choices[0].message.content:
+                yield resp.choices[0].message.content
+        except groq.APIConnectionError as e:
+            print("The server could not be reached")
+            print(e.__cause__)  # an underlying Exception, likely raised within httpx.
+        except groq.RateLimitError:
+            print("A 429 status code was received; we should back off a bit.")
+        except groq.APIStatusError as e:
+            print("Another non-200-range status code was received")
+            print(e.status_code)
+            print(e.response)
+    
--- a/src/dailyai/services/open_ai_services.py
+++ b/src/dailyai/services/open_ai_services.py
@@ -10,8 +10,8 @@ from dailyai.services.ai_services import LLMService, ImageGenService


 class OpenAILLMService(LLMService):
-    def __init__(self, *, api_key, model="gpt-4"):
-        super().__init__()
+    def __init__(self, *, api_key, model="gpt-4-turbo-preview", context):
+        super().__init__(context)
        self._model = model
        self._client = AsyncOpenAI(api_key=api_key)

--- a/src/examples/foundational/02-llm-say-one-thing.py
+++ b/src/examples/foundational/02-llm-say-one-thing.py
@@ -20,7 +20,8 @@ async def main(room_url):
            None,
            "Say One Thing From an LLM",
            duration_minutes=meeting_duration_minutes,
-            mic_enabled=True
+            mic_enabled=True,
+            speaker_enabled=True
        )

        tts = ElevenLabsTTSService(
--- a/src/examples/foundational/06-listen-and-respond.py
+++ b/src/examples/foundational/06-listen-and-respond.py
@@ -5,9 +5,16 @@ from dailyai.services.daily_transport_service import DailyTransportService
 from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
 from dailyai.queue_aggregators import LLMAssistantContextAggregator, LLMContextAggregator, LLMUserContextAggregator
 from examples.foundational.support.runner import configure
+from dailyai.services.ai_services import FrameLogger


 async def main(room_url: str, token):
+    context = [
+        {
+            "role": "system",
+            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
+        },
+    ]
    transport = DailyTransportService(
        room_url,
        token,
@@ -16,7 +23,9 @@ async def main(room_url: str, token):
        start_transcription=True,
        mic_enabled=True,
        mic_sample_rate=16000,
-        camera_enabled=False
+        camera_enabled=False,
+        speaker_enabled=True,
+        context=context
    )

    llm = AzureLLMService(
@@ -26,33 +35,33 @@ async def main(room_url: str, token):
    tts = AzureTTSService(
        api_key=os.getenv("AZURE_SPEECH_API_KEY"),
        region=os.getenv("AZURE_SPEECH_REGION"))
+    fl = FrameLogger("transport")

    @transport.event_handler("on_first_other_participant_joined")
    async def on_first_other_participant_joined(transport):
        await tts.say("Hi, I'm listening!", transport.send_queue)

    async def handle_transcriptions():
-        messages = [
-            {
-                "role": "system",
-                "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
-            },
-        ]

-        tma_in = LLMUserContextAggregator(messages, transport._my_participant_id)
-        tma_out = LLMAssistantContextAggregator(messages, transport._my_participant_id)
+        tma_in = LLMUserContextAggregator(
+            context, transport._my_participant_id)
+        tma_out = LLMAssistantContextAggregator(
+            context, transport._my_participant_id)
        await tts.run_to_queue(
            transport.send_queue,
            tma_out.run(
                llm.run(
                    tma_in.run(
-                        transport.get_receive_frames()
+                        fl.run(
+                            transport.get_receive_frames()
+                        )
                    )
                )
            )
        )

    transport.transcription_settings["extra"]["punctuate"] = True
+    transport.transcription_settings["extra"]["endpointing"] = True
    await asyncio.gather(transport.run(), handle_transcriptions())


--- a/src/examples/foundational/06c-listen-respond-interruptible-refactor.py
+++ b/src/examples/foundational/06c-listen-respond-interruptible-refactor.py
@@ -0,0 +1,83 @@
+import asyncio
+import aiohttp
+import os
+from dailyai.conversation_wrappers import InterruptibleConversationWrapper
+
+from dailyai.queue_frame import StartStreamQueueFrame, TextQueueFrame
+from dailyai.services.daily_transport_service import DailyTransportService
+from dailyai.services.azure_ai_services import AzureLLMService, AzureTTSService
+from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
+from dailyai.services.open_ai_services import OpenAILLMService
+from dailyai.services.deepgram_ai_services import DeepgramTTSService
+from dailyai.services.ai_services import FrameLogger
+from dailyai.services.groq_ai_services import GroqLLMService
+
+from examples.foundational.support.runner import configure
+
+
+async def main(room_url: str, token):
+    """Wire the Daily transport, LLM and TTS together and run the conversation."""
+    async with aiohttp.ClientSession() as http_session:
+        system_message = {
+            "role": "system",
+            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio. Respond to what the user said in a creative and helpful way.",
+        }
+        context = [system_message]
+
+        # The same context list is handed to both the transport and the LLM.
+        transport = DailyTransportService(
+            room_url, token, "Respond bot",
+            duration_minutes=5,
+            start_transcription=True,
+            mic_enabled=True,
+            mic_sample_rate=16000,
+            camera_enabled=False,
+            # TODO-CB: Should this be VAD enabled or something?
+            speaker_enabled=True,
+            context=context,
+        )
+
+        # llm = AzureLLMService(
+        #     api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
+        #     endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
+        #     model=os.getenv("AZURE_CHATGPT_MODEL"),
+        #     context=context)
+        openai_key = os.getenv("OPENAI_CHATGPT_API_KEY")
+        llm = OpenAILLMService(context=context, api_key=openai_key)
+        # llm = GroqLLMService(api_key=os.getenv("GROQ_API_KEY"), context=context)
+        # tts = AzureTTSService(
+        #     api_key=os.getenv("AZURE_SPEECH_API_KEY"),
+        #     region=os.getenv("AZURE_SPEECH_REGION"))
+        tts = ElevenLabsTTSService(
+            aiohttp_session=http_session,
+            api_key=os.getenv("ELEVENLABS_API_KEY"),
+            voice_id=os.getenv("ELEVENLABS_VOICE_ID"))
+        # tts = DeepgramTTSService(aiohttp_session=session, api_key=os.getenv("DEEPGRAM_API_KEY"), voice=os.getenv("DEEPGRAM_VOICE"))
+        frame_logger = FrameLogger("just outside the innermost layer")
+
+        async def run_response(in_frame):
+            """Feed one incoming frame through the LLM and queue the TTS output.
+
+            Pipeline: frame logger -> LLM -> TTS -> transport send queue. The
+            tma_in / tma_out context aggregators are bypassed here.
+            """
+            send_queue = transport.send_queue
+            logged_frames = frame_logger.run(
+                [StartStreamQueueFrame(), in_frame])
+            llm_frames = llm.run(logged_frames)
+            await tts.run_to_queue(send_queue, llm_frames)
+
+        # Greet when the transport reports the first other participant.
+        @transport.event_handler("on_first_other_participant_joined")
+        async def on_first_other_participant_joined(transport):
+            await tts.say("Hi, I'm listening!", transport.send_queue)
+
+        extra_settings = transport.transcription_settings["extra"]
+        extra_settings["endpointing"] = True
+        extra_settings["punctuate"] = True
+        await asyncio.gather(transport.run(), transport.run_conversation(run_response))
+
+
+if __name__ == "__main__":
+    room_url, room_token = configure()
+    asyncio.run(main(room_url, room_token))
--- a/src/khk-hackathon/06d-listen.py
+++ b/src/khk-hackathon/06d-listen.py
@@ -0,0 +1,160 @@
+from datetime import datetime
+import asyncio
+import aiohttp
+import os
+import sys
+from dailyai.conversation_wrappers import InterruptibleConversationWrapper
+
+from dailyai.queue_frame import StartStreamQueueFrame, TranscriptionQueueFrame, TextQueueFrame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame
+from dailyai.services.daily_transport_service import DailyTransportService
+from dailyai.services.fireworks_ai_services import FireworksLLMService
+from dailyai.services.deepgram_ai_services import DeepgramTTSService
+from dailyai.services.ai_services import FrameLogger
+
+from dailyai.services.fal_ai_services import FalImageGenService
+
+from examples.foundational.support.runner import configure
+
+
+# All command-line arguments, joined into a single prompt string.
+command_line_prompt = ' '.join(sys.argv[1:])
+
+system_prompt = """
+You are a friendly robot character with a cartoon body with head, torso, arms, feet,
+and legs.
+
+You can change your appearance using the `change_appearance` function call.
+You can add or remove items from your body, change
+your color, and more. You can use function calling to change your appearance.
+
+When changing your appearance, please create a prompt as an argument to the function.
+The prompt will help the image generation model
+create a new appearance for you. Include as much detail as possible. Include the
+keywords "robot", "friendly", "cartoon", "smiling", "happy", "animated". 
+The initial image prompt you are adding to or changing is
+"A friendly cartoon robot, smiling and happy, animated."
+
+Do not include the image model prompt in your response. The prompt must be passed to the function
+as a parameter. 
+"""
+
+# Tool schema for the `change_appearance` function call.
+change_appearance_function = {
+    "name": "change_appearance",
+    "description": "Call this function when the users want you to change your appearance.",
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "appearance": {
+                "type": "string",
+                "description": "The new appearance for the robot, in the form of a prompt for an generative AI diffusion model."
+            }
+        },
+        # Without a prompt the call is useless, so make the argument mandatory.
+        "required": ["appearance"],
+    }
+}
+
+# Tool list handed to the LLM service (see `tools=` in main).
+tools = [{"type": "function", "function": change_appearance_function}]
+
+
+async def main(room_url: str, token):  # wires transport, image gen, LLM and TTS, then runs the conversation
+    async with aiohttp.ClientSession() as session:
+        context = [
+            {
+                "role": "system",
+                "content": system_prompt,
+            },
+        ]
+        transport = DailyTransportService(
+            room_url,
+            token,
+            "Respond bot",
+            duration_minutes=30,
+            start_transcription=True,
+            mic_enabled=True,
+            mic_sample_rate=16000,
+            camera_enabled=True,
+            camera_width=1024,
+            camera_height=1024,
+            # TODO-CB: Should this be VAD enabled or something?
+            speaker_enabled=True,
+            context=context
+        )
+        # Image generator that change_appearance() below feeds prompts to.
+        imagegen = FalImageGenService(
+            image_size="512x512",
+            aiohttp_session=session,
+            key_id=os.getenv("FAL_KEY_ID"),
+            key_secret=os.getenv("FAL_KEY_SECRET"))
+
+        async def change_appearance(appearance):  # callback handed to the LLM service below
+            await asyncio.create_task(  # NOTE(review): awaited at once, so the task wrapper looks redundant
+                imagegen.run_to_queue(
+                    transport.send_queue, [
+                        TextQueueFrame(appearance)]))
+        # LLM service: gets the tool schema, the change_appearance callback and the transport.
+        llm = FireworksLLMService(
+            context=context,
+            api_key=os.getenv("FIREWORKS_API_KEY"),
+            model="accounts/fireworks/models/firefunction-v1",
+            # TODO - how can we modify tools list on the fly?
+            tools=tools,
+            change_appearance=change_appearance,
+            transport=transport
+        )
+        tts = DeepgramTTSService(aiohttp_session=session, api_key=os.getenv(
+            "DEEPGRAM_API_KEY"), voice=os.getenv("DEEPGRAM_VOICE"))
+        fl = FrameLogger("just outside the innermost layer")
+
+        async def run_response(in_frame):  # passed to transport.run_conversation() below
+            await tts.run_to_queue(
+                transport.send_queue,
+                # tma_out.run(
+                llm.run(
+                    # tma_in.run(
+                    fl.run(
+                        [StartStreamQueueFrame(), in_frame]
+                    )
+                    # )
+                )
+                # ),
+            )
+
+        @transport.event_handler("on_first_other_participant_joined")
+        async def on_first_other_participant_joined(transport):
+            await change_appearance("A friendly cartoon robot, smiling and happy, animated.")
+            return
+            # NOTE(review): the `return` above makes the rest of this handler unreachable; it looks like a scripted-input test that was switched off - confirm before deleting.
+            await tts.say("Hi, I'm listening!", transport.send_queue)
+            await asyncio.sleep(1)
+
+            await transport.receive_queue.put(UserStartedSpeakingFrame())
+            await asyncio.sleep(0.1)
+
+            transport.on_transcription_message({
+                "text": command_line_prompt,
+                "participantId":  "cb65b845-aac0-4fc8-987d-2e7ce3c7d8f0",
+                "timestamp": datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
+            })
+# putting the frame into the queue directly doesn't seem to work
+#            await transport.receive_queue.put(
+#                TranscriptionQueueFrame(
+#                    "tell me a joke.",
+#                    "cb65b845-aac0-4fc8-987d-2e7ce3c7d8f0",
+#                    datetime.utcnow().strftime(
+#                        '%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
+#                ))
+            await asyncio.sleep(0.1)
+            await transport.receive_queue.put(UserStoppedSpeakingFrame())
+        # Extra transcription options: endpointing and punctuation switched on.
+        transport.transcription_settings["extra"]["endpointing"] = True
+        transport.transcription_settings["extra"]["punctuate"] = True
+
+        await asyncio.gather(transport.run(), transport.run_conversation(run_response))
+
+
+if __name__ == "__main__":
+    room_url, room_token = configure()
+    asyncio.run(main(room_url, room_token))
Author	SHA1	Message	Date
Kwindla Hultman Kramer	5d6d674ff6	some more changes	2024-02-25 21:51:08 -08:00
Kwindla Hultman Kramer	1e552958aa	hackathon code	2024-02-25 21:41:55 -08:00
Chad Bailey	17edfe98bd	more tweaks	2024-02-22 22:18:06 +00:00
Chad Bailey	5100a7599b	0.5s VAD is interesting	2024-02-22 16:14:36 -06:00
Chad Bailey	18c2b37358	groq worqs	2024-02-22 15:39:21 -06:00
Chad Bailey	0244f358d2	Added better interruptability	2024-02-22 14:45:38 -06:00
Chad Bailey	85fe6c0580	more wip	2024-02-22 16:22:41 +00:00
Chad Bailey	ae7482ed18	wip: interruptions in the base transport	2024-02-22 16:08:01 +00:00
Chad Bailey	90d928be99	first commit of transport conversation runner	2024-02-21 18:57:06 +00:00
Chad Bailey	0703b926a3	adding silero VAD	2024-02-16 20:09:02 +00:00