Compare commits

...

11 Commits

Author SHA1 Message Date
Aleix Conchillo Flaqué
a46eaa838b Merge pull request #641 from pipecat-ai/aleix/prepare-0.0.47
prepare 0.0.47
2024-10-22 10:30:42 -07:00
Aleix Conchillo Flaqué
7c432499db update CHANGELOG for 0.0.47 2024-10-22 10:02:50 -07:00
Aleix Conchillo Flaqué
8d75fcc9f0 use warnings package to report deprecated code 2024-10-22 10:02:21 -07:00
Aleix Conchillo Flaqué
61d73f81ae Merge pull request #639 from pipecat-ai/aleix/daily-transcription-model
transport(daily): use "nova-2-general" for transcription
2024-10-22 09:40:43 -07:00
Aleix Conchillo Flaqué
951255def9 transport(daily): use "nova-2-general" for transcription 2024-10-22 09:40:03 -07:00
Mark Backman
e556f34094 Merge pull request #638 from pipecat-ai/mb/fix-silero-vad-import
Fix Silero VAD import issue
2024-10-21 20:48:06 -04:00
Mark Backman
ccc3691620 Fix Silero VAD import issue 2024-10-21 20:39:20 -04:00
Vanessa Pyne
5321affda7 Merge pull request #588 from Allenmylath/patch-11
Update README.md
2024-10-21 11:20:05 -05:00
Mark Backman
e5ad8dc67b Merge pull request #627 from pipecat-ai/mb/upgrade-gladia-to-v2-api
Update GladiaSTTService to use the Gladia V2 API
2024-10-21 12:01:20 -04:00
Mark Backman
46927805bc Update GladiaSTTService to use the Gladia V2 API 2024-10-21 07:10:38 -04:00
allenmylath
b999b76f70 Update README.md
readme description still shows simple-chatbot definition hence made more accurate description
2024-10-15 08:14:43 +05:30
8 changed files with 181 additions and 136 deletions

View File

@@ -5,7 +5,7 @@ All notable changes to **Pipecat** will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
## [0.0.47] - 2024-10-22
### Added
@@ -15,8 +15,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added a foundational example for Gladia transcription:
`13c-gladia-transcription.py`
### Changed
- Updated `GladiaSTTService` to use the V2 API.
- Changed `DailyTransport` transcription model to `nova-2-general`.
### Fixed
- Fixed an issue that would cause an import error when importing
`SileroVADAnalyzer` from the old package `pipecat.vad.silero`.
- Fixed `enable_usage_metrics` to control LLM/TTS usage metrics separately
from `enable_metrics`.
@@ -32,6 +41,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed
- Changed `DeepgramSTTService` model to `nova-2-general`.
- Moved `SileroVAD` audio processor to `processors.audio.vad`.
- Module `utils.audio` is now `audio.utils`. A new `resample_audio` function has

View File

@@ -1,12 +1,43 @@
# Simple Chatbot
# Chatbot with canonical-metrics
<img src="image.png" width="420px">
This project implements a chatbot using a pipeline architecture that integrates audio processing, transcription, and a language model for conversational interactions. The chatbot operates within a daily communication environment, utilizing various services for text-to-speech and language model responses.
This app connects you to a chatbot powered by GPT-4, complete with animations generated by Stable Video Diffusion.
## Features
See a video of it in action: https://x.com/kwindla/status/1778628911817183509
- **Audio Input and Output**: Captures microphone input and plays back audio responses.
- **Voice Activity Detection**: Utilizes Silero VAD to manage audio input intelligently.
- **Text-to-Speech**: Integrates ElevenLabs TTS service to convert text responses into audio.
- **Language Model Interaction**: Uses OpenAI's GPT-4 model to generate responses based on user input.
- **Transcription Services**: Captures and transcribes participant speech for analytics.
- **Metrics Collection**: Sends audio data for analysis via Canonical Metrics Service.
## Requirements
- Python 3.7+
- `aiohttp`
- `loguru`
- `python-dotenv`
- Additional libraries from the `pipecat` package.
## Setup
1. Clone the repository.
2. Install the required packages.
3. Set up environment variables for API keys:
- `OPENAI_API_KEY`
- `ELEVENLABS_API_KEY`
- `CANONICAL_API_KEY`
- `CANONICAL_API_URL`
4. Run the script.
## Usage
The chatbot introduces itself and engages in conversations, providing brief and creative responses. Designed for flexibility, it can support multiple languages with appropriate configuration.
## Events
- Participants joining or leaving the call are handled dynamically, adjusting the chatbot's behavior accordingly.
And a quick video walkthrough of the code: https://www.loom.com/share/13df1967161f4d24ade054e7f8753416
The first time, things might take extra time to get started since VAD (Voice Activity Detection) model needs to be downloaded.

View File

@@ -5,12 +5,16 @@
#
import asyncio
import aiohttp
import os
import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.frames.frames import EndFrame, LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -20,12 +24,6 @@ from pipecat.services.gladia import GladiaSTTService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from runner import configure
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
@@ -90,6 +88,11 @@ async def main():
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
# Register an event handler to exit the application when the user leaves.
@transport.event_handler("on_participant_left")
async def on_participant_left(transport, participant, reason):
await task.queue_frame(EndFrame())
runner = PipelineRunner()
await runner.run(task)

View File

@@ -21,9 +21,9 @@ classifiers = [
]
dependencies = [
"aiohttp~=3.10.3",
"loguru~=0.7.2",
"Markdown~=3.7",
"numpy~=1.26.4",
"loguru~=0.7.2",
"Pillow~=10.4.0",
"protobuf~=4.25.4",
"pydantic~=2.8.2",

View File

@@ -8,6 +8,7 @@ import base64
import json
from typing import AsyncGenerator, Optional
import aiohttp
from loguru import logger
from pydantic.main import BaseModel
@@ -23,7 +24,6 @@ from pipecat.services.ai_services import STTService
from pipecat.transcriptions.language import Language
from pipecat.utils.time import time_now_iso8601
# See .env.example for Gladia configuration needed
try:
import websockets
except ModuleNotFoundError as e:
@@ -38,15 +38,16 @@ class GladiaSTTService(STTService):
class InputParams(BaseModel):
sample_rate: Optional[int] = 16000
language: Optional[Language] = Language.EN
transcription_hint: Optional[str] = None
endpointing: Optional[int] = 200
prosody: Optional[bool] = None
endpointing: Optional[float] = 0.2
maximum_duration_without_endpointing: Optional[int] = 10
audio_enhancer: Optional[bool] = None
words_accurate_timestamps: Optional[bool] = None
def __init__(
self,
*,
api_key: str,
url: str = "wss://api.gladia.io/audio/text/audio-transcription",
url: str = "https://api.gladia.io/v2/live",
confidence: float = 0.5,
params: InputParams = InputParams(),
**kwargs,
@@ -56,101 +57,82 @@ class GladiaSTTService(STTService):
self._api_key = api_key
self._url = url
self._settings = {
"encoding": "wav/pcm",
"bit_depth": 16,
"sample_rate": params.sample_rate,
"language": self.language_to_service_language(params.language)
if params.language
else Language.EN,
"transcription_hint": params.transcription_hint,
"channels": 1,
"language_config": {
"languages": [self.language_to_service_language(params.language)]
if params.language
else [],
"code_switching": False,
},
"endpointing": params.endpointing,
"prosody": params.prosody,
"maximum_duration_without_endpointing": params.maximum_duration_without_endpointing,
"pre_processing": {
"audio_enhancer": params.audio_enhancer,
},
"realtime_processing": {
"words_accurate_timestamps": params.words_accurate_timestamps,
},
}
self._confidence = confidence
def language_to_service_language(self, language: Language) -> str | None:
match language:
case Language.BG:
return "bulgarian"
case Language.CA:
return "catalan"
case Language.ZH:
return "chinese"
case Language.CS:
return "czech"
case Language.DA:
return "danish"
case Language.NL:
return "dutch"
case (
Language.EN
| Language.EN_US
| Language.EN_AU
| Language.EN_GB
| Language.EN_NZ
| Language.EN_IN
):
return "english"
case Language.ET:
return "estonian"
case Language.FI:
return "finnish"
case Language.FR | Language.FR_CA:
return "french"
case Language.DE | Language.DE_CH:
return "german"
case Language.EL:
return "greek"
case Language.HI:
return "hindi"
case Language.HU:
return "hungarian"
case Language.ID:
return "indonesian"
case Language.IT:
return "italian"
case Language.JA:
return "japanese"
case Language.KO:
return "korean"
case Language.LV:
return "latvian"
case Language.LT:
return "lithuanian"
case Language.MS:
return "malay"
case Language.NO:
return "norwegian"
case Language.PL:
return "polish"
case Language.PT | Language.PT_BR:
return "portuguese"
case Language.RO:
return "romanian"
case Language.RU:
return "russian"
case Language.SK:
return "slovak"
case Language.ES:
return "spanish"
case Language.SV:
return "slovenian"
case Language.TH:
return "thai"
case Language.TR:
return "turkish"
case Language.UK:
return "ukrainian"
case Language.VI:
return "vietnamese"
return None
language_map = {
Language.BG: "bg",
Language.CA: "ca",
Language.ZH: "zh",
Language.CS: "cs",
Language.DA: "da",
Language.NL: "nl",
Language.EN: "en",
Language.EN_US: "en",
Language.EN_AU: "en",
Language.EN_GB: "en",
Language.EN_NZ: "en",
Language.EN_IN: "en",
Language.ET: "et",
Language.FI: "fi",
Language.FR: "fr",
Language.FR_CA: "fr",
Language.DE: "de",
Language.DE_CH: "de",
Language.EL: "el",
Language.HI: "hi",
Language.HU: "hu",
Language.ID: "id",
Language.IT: "it",
Language.JA: "ja",
Language.KO: "ko",
Language.LV: "lv",
Language.LT: "lt",
Language.MS: "ms",
Language.NO: "no",
Language.PL: "pl",
Language.PT: "pt",
Language.PT_BR: "pt",
Language.RO: "ro",
Language.RU: "ru",
Language.SK: "sk",
Language.ES: "es",
Language.SV: "sv",
Language.TH: "th",
Language.TR: "tr",
Language.UK: "uk",
Language.VI: "vi",
}
return language_map.get(language)
async def start(self, frame: StartFrame):
await super().start(frame)
self._websocket = await websockets.connect(self._url)
response = await self._setup_gladia()
self._websocket = await websockets.connect(response["url"])
self._receive_task = self.get_event_loop().create_task(self._receive_task_handler())
await self._setup_gladia()
async def stop(self, frame: EndFrame):
await super().stop(frame)
await self._send_stop_recording()
await self._websocket.close()
async def cancel(self, frame: CancelFrame):
@@ -164,39 +146,37 @@ class GladiaSTTService(STTService):
yield None
async def _setup_gladia(self):
configuration = {
"x_gladia_key": self._api_key,
"encoding": "WAV/PCM",
"model_type": "fast",
"language_behaviour": "manual",
"sample_rate": self._settings["sample_rate"],
"language": self._settings["language"],
"transcription_hint": self._settings["transcription_hint"],
"endpointing": self._settings["endpointing"],
"prosody": self._settings["prosody"],
}
await self._websocket.send(json.dumps(configuration))
async with aiohttp.ClientSession() as session:
async with session.post(
self._url,
headers={"X-Gladia-Key": self._api_key, "Content-Type": "application/json"},
json=self._settings,
) as response:
if response.ok:
return await response.json()
else:
logger.error(
f"Gladia error: {response.status}: {response.text or response.reason}"
)
raise Exception(f"Failed to initialize Gladia session: {response.status}")
async def _send_audio(self, audio: bytes):
message = {"frames": base64.b64encode(audio).decode("utf-8")}
data = base64.b64encode(audio).decode("utf-8")
message = {"type": "audio_chunk", "data": {"chunk": data}}
await self._websocket.send(json.dumps(message))
async def _send_stop_recording(self):
await self._websocket.send(json.dumps({"type": "stop_recording"}))
async def _receive_task_handler(self):
async for message in self._websocket:
utterance = json.loads(message)
if not utterance:
continue
if "error" in utterance:
message = utterance["message"]
logger.error(f"Gladia error: {message}")
elif "confidence" in utterance:
type = utterance["type"]
confidence = utterance["confidence"]
transcript = utterance["transcription"]
content = json.loads(message)
if content["type"] == "transcript":
utterance = content["data"]["utterance"]
confidence = utterance.get("confidence", 0)
transcript = utterance["text"]
if confidence >= self._confidence:
if type == "final":
if content["data"]["is_final"]:
await self.push_frame(
TranscriptionFrame(transcript, "", time_now_iso8601())
)

View File

@@ -6,6 +6,7 @@
import asyncio
import time
import warnings
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from typing import Any, Awaitable, Callable, Mapping, Optional
@@ -20,7 +21,7 @@ from daily import (
VirtualSpeakerDevice,
)
from loguru import logger
from pydantic.main import BaseModel
from pydantic import BaseModel, model_validator
from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADParams
from pipecat.frames.frames import (
@@ -93,8 +94,8 @@ class DailyDialinSettings(BaseModel):
class DailyTranscriptionSettings(BaseModel):
language: str = "en"
tier: str = "nova"
model: str = "2-conversationalai"
tier: Optional[str] = None
model: str = "nova-2-general"
profanity_filter: bool = True
redact: bool = False
endpointing: bool = True
@@ -102,6 +103,16 @@ class DailyTranscriptionSettings(BaseModel):
includeRawResponse: bool = True
extra: Mapping[str, Any] = {"interim_results": True}
@model_validator(mode="before")
def check_deprecated_fields(cls, values):
with warnings.catch_warnings():
warnings.simplefilter("always")
if "tier" in values:
warnings.warn(
"Field 'tier' is deprecated, use 'model' instead.", DeprecationWarning
)
return values
class DailyParams(TransportParams):
api_url: str = "https://api.daily.co/v1"

View File

@@ -4,8 +4,13 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
from loguru import logger
import warnings
logger.warning("DEPRECATED: Package `pipecat.vad` is deprecated, use `pipecat.audio.vad` instead.")
with warnings.catch_warnings():
warnings.simplefilter("always")
warnings.warn(
"Package `pipecat.vad` is deprecated, use `pipecat.audio.vad` instead", DeprecationWarning
)
from ..audio.vad.silero import SileroVAD, SileroVADAnalyzer
from ..audio.vad.silero import SileroVADAnalyzer
from ..processors.audio.vad.silero import SileroVAD

View File

@@ -4,8 +4,12 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
from loguru import logger
import warnings
logger.warning("DEPRECATED: Package `pipecat.vad` is deprecated, use `pipecat.audio.vad` instead.")
with warnings.catch_warnings():
warnings.simplefilter("always")
warnings.warn(
"Package `pipecat.vad` is deprecated, use `pipecat.audio.vad` instead", DeprecationWarning
)
from ..audio.vad.vad_analyzer import VADAnalyzer, VADParams, VADState