diff --git a/TESTING_CHECKLIST.md b/TESTING_CHECKLIST.md deleted file mode 100644 index 8d2d147d4..000000000 --- a/TESTING_CHECKLIST.md +++ /dev/null @@ -1,273 +0,0 @@ -# AssemblyAI u3-rt-pro Testing Checklist - -## Test Environment Setup -- [ ] Install dependencies: `uv sync --group dev --all-extras` -- [ ] Set up `.env` file with API keys -- [ ] Verify LiveKit connection -- [ ] Run basic voice agent test - ---- - -## Feature Testing Checklist - -### ✅ Basic Configuration Tests - -#### Test 1: Default u3-rt-pro Configuration -- [ ] **Setup:** Create service with default params -- [ ] **Expected:** No errors, uses u3-rt-pro model with 100ms min/max -- [ ] **Verify:** Check logs for connection confirmation - -#### Test 2: Custom min_turn_silence -- [ ] **Setup:** Set `min_turn_silence=200` -- [ ] **Expected:** Both min and max set to 200ms -- [ ] **Verify:** Speak short phrases, observe turn detection timing - -#### Test 3: User sets max_turn_silence (Warning Test) -- [ ] **Setup:** Set `max_turn_silence=500` in connection params -- [ ] **Expected:** Warning logged, value overridden to match min -- [ ] **Verify:** Check logs for warning message - ---- - -### ✅ Prompting Tests - -#### Test 4: No Prompt (Default - Recommended) -- [ ] **Setup:** Don't set prompt parameter -- [ ] **Expected:** Uses default prompt, 88% accuracy, no warnings -- [ ] **Verify:** Transcription quality is good - -#### Test 5: Custom Prompt (Warning Test) -- [ ] **Setup:** Set custom prompt in connection params -- [ ] **Expected:** Warning logged about testing without prompt first -- [ ] **Verify:** Check logs for prompt warning - -#### Test 6: Prompt + Keyterms Conflict (Error Test) -- [ ] **Setup:** Set both `prompt` and `keyterms_prompt` at init -- [ ] **Expected:** ValueError raised with helpful error message -- [ ] **Verify:** Service fails to initialize with clear error - ---- - -### ✅ Keyterms Prompting Tests - -#### Test 7: Basic Keyterms at Init -- [ ] **Setup:** Set 
`keyterms_prompt=["Pipecat", "AssemblyAI", "Universal-3"]` -- [ ] **Expected:** Terms are boosted in recognition -- [ ] **Verify:** Say the boosted terms, check accuracy - -#### Test 8: Empty Keyterms (No Boosting) -- [ ] **Setup:** Set `keyterms_prompt=[]` -- [ ] **Expected:** No boosting, default behavior -- [ ] **Verify:** Normal transcription - ---- - -### ✅ Diarization Tests - -#### Test 9: Diarization Disabled (Default) -- [ ] **Setup:** Don't set `speaker_labels` parameter -- [ ] **Expected:** No speaker info in transcripts -- [ ] **Verify:** TranscriptionFrame.user_id is default user_id - -#### Test 10: Diarization Enabled (No Formatting) -- [ ] **Setup:** Set `speaker_labels=True` -- [ ] **Expected:** Speaker ID in user_id field, plain text -- [ ] **Verify:** Multiple speakers show different IDs (Speaker A, Speaker B) - -#### Test 11: Diarization with XML Formatting -- [ ] **Setup:** Set `speaker_labels=True`, `speaker_format="<{speaker}>{text}"` -- [ ] **Expected:** Text includes speaker tags: `<Speaker A>Hello` -- [ ] **Verify:** Formatted text in transcript, speaker ID in user_id - -#### Test 12: Diarization with Colon Prefix -- [ ] **Setup:** Set `speaker_labels=True`, `speaker_format="{speaker}: {text}"` -- [ ] **Expected:** Text includes prefix: `Speaker A: Hello` -- [ ] **Verify:** Formatted text, multiple speakers distinguishable - ---- - -### ✅ Dynamic Updates Tests - -#### Test 13: Dynamic Keyterms Update (Stage 1 → Stage 2) -- [ ] **Setup:** Start with empty keyterms, update mid-conversation -- [ ] **Expected:** New keyterms take effect immediately -- [ ] **Test Steps:** - 1. Start conversation with no keyterms - 2. Send update frame with `keyterms_prompt=["cardiology", "Dr. Smith"]` - 3. Say the new terms -- [ ] **Verify:** Improved recognition after update - -#### Test 14: Clear Keyterms (Reset Context) -- [ ] **Setup:** Start with keyterms, clear them mid-stream -- [ ] **Expected:** Context biasing removed -- [ ] **Test Steps:** - 1. 
Start with `keyterms_prompt=["test", "words"]` - 2. Send update frame with `keyterms_prompt=[]` -- [ ] **Verify:** No more boosting after clear - -#### Test 15: Dynamic Silence Parameters -- [ ] **Setup:** Update `max_turn_silence` mid-stream -- [ ] **Expected:** Turn detection timing changes -- [ ] **Test Steps:** - 1. Start with default (1200ms) - 2. Update to `max_turn_silence=5000` (for reading numbers) - 3. Pause longer between words - 4. Update back to `max_turn_silence=1200` -- [ ] **Verify:** Longer pauses tolerated when increased - -#### Test 16: Dynamic Prompt Update -- [ ] **Setup:** Update prompt mid-stream -- [ ] **Expected:** New instructions take effect -- [ ] **Test Steps:** - 1. Start with default prompt - 2. Send update with custom prompt -- [ ] **Verify:** Behavior changes according to new prompt - -#### Test 17: Multiple Parameters at Once -- [ ] **Setup:** Update keyterms, max_turn_silence, and min_end_of_turn together -- [ ] **Expected:** All parameters updated in single WebSocket message -- [ ] **Verify:** Check logs for single UpdateConfiguration message - -#### Test 18: Dynamic Update - Prompt + Keyterms Conflict (Error) -- [ ] **Setup:** Try to update both prompt and keyterms_prompt in same update -- [ ] **Expected:** ValueError raised -- [ ] **Verify:** Update fails with clear error message - ---- - -### ✅ Turn Detection Mode Tests - -#### Test 19: Pipecat Mode (vad_force_turn_endpoint=True) - Default -- [ ] **Setup:** Use default settings (Pipecat mode) -- [ ] **Expected:** - - ForceEndpoint sent on VAD stop - - Smart Turn Analyzer makes decisions - - min=max=100ms for u3-rt-pro -- [ ] **Verify:** Fast finals, Smart Turn handles completeness - -#### Test 20: STT Mode (vad_force_turn_endpoint=False) - u3-rt-pro only -- [ ] **Setup:** Set `vad_force_turn_endpoint=False` with u3-rt-pro -- [ ] **Expected:** - - AssemblyAI controls turn endings - - SpeechStarted message triggers interruptions - - UserStarted/StoppedSpeakingFrame emitted -- [ 
] **Verify:** Turn detection from AssemblyAI model - -#### Test 21: STT Mode with universal-streaming (Error Test) -- [ ] **Setup:** Set `vad_force_turn_endpoint=False` with universal-streaming -- [ ] **Expected:** ValueError raised (requires u3-rt-pro) -- [ ] **Verify:** Service fails with clear error - ---- - -### ✅ Language Detection Tests (If Multilingual Model) - -#### Test 22: Language Detection Enabled -- [ ] **Setup:** Use `universal-streaming-multilingual` with `language_detection=True` -- [ ] **Expected:** Language codes in transcripts -- [ ] **Verify:** Speak different languages, check language_code field - -#### Test 23: Language Confidence Threshold -- [ ] **Setup:** Enable language detection -- [ ] **Expected:** High confidence (≥0.7) → detected language, Low → fallback to English -- [ ] **Verify:** Check logs for confidence warnings - ---- - -### ✅ Edge Cases & Error Handling - -#### Test 24: WebSocket Disconnect During Update -- [ ] **Setup:** Simulate disconnect, try update -- [ ] **Expected:** Error logged, update queued for reconnection -- [ ] **Verify:** Graceful handling, no crash - -#### Test 25: Invalid Parameter Types -- [ ] **Setup:** Send update with wrong type (e.g., keyterms_prompt as string) -- [ ] **Expected:** Warning logged, parameter skipped -- [ ] **Verify:** Service continues, invalid param ignored - -#### Test 26: Unknown Parameter in Update -- [ ] **Setup:** Send update with unsupported parameter (e.g., `language`) -- [ ] **Expected:** Warning logged about parameter -- [ ] **Verify:** Other valid params still updated - ---- - -### ✅ Integration Tests - -#### Test 27: Full Voice Agent Flow (Multi-Stage) -- [ ] **Setup:** Complete voice agent with stage transitions -- [ ] **Test Steps:** - 1. Greeting stage (general keyterms) - 2. Name collection stage (name keyterms) - 3. Account number stage (number keyterms, longer silence) - 4. Medical info stage (medical keyterms) - 5. 
Closing stage (goodbye keyterms) -- [ ] **Verify:** Each stage has appropriate keyterms and timing - -#### Test 28: Diarization + Dynamic Updates -- [ ] **Setup:** Enable diarization, update keyterms mid-stream -- [ ] **Expected:** Both features work together -- [ ] **Verify:** Speaker IDs persist, keyterms update correctly - -#### Test 29: Interruption Handling -- [ ] **Setup:** Bot speaking, user interrupts -- [ ] **Expected:** - - Pipecat mode: VAD + Smart Turn handles - - STT mode: SpeechStarted triggers interrupt -- [ ] **Verify:** Bot stops, user speech processed - ---- - -## Testing Results Template - -``` -| Test # | Feature | Status | Notes | -|--------|---------|--------|-------| -| 1 | Default Config | ✅ PASS | | -| 2 | Custom min_silence | ✅ PASS | | -| 3 | max_silence Warning | ✅ PASS | | -| ... | ... | ... | ... | -``` - ---- - -## Expected Outcomes Summary - -### ✅ Should Work (No Errors) -- Default configuration -- Custom min_turn_silence -- Keyterms prompting -- Diarization with/without formatting -- Dynamic updates (one parameter or multiple) -- Pipecat mode turn detection - -### ⚠️ Should Warn (Logs Warning, Continues) -- Custom prompt set at init -- max_turn_silence set (overridden) -- Invalid parameter types in updates -- Language update attempted -- Prompt used with universal-streaming - -### ❌ Should Error (Raises Exception, Stops) -- prompt + keyterms_prompt at init -- prompt + keyterms_prompt in same update -- vad_force_turn_endpoint=False with universal-streaming - ---- - -## Quick Test Commands - -```bash -# Run basic test -python test_assemblyai_u3pro.py --test basic - -# Run specific test -python test_assemblyai_u3pro.py --test diarization - -# Run all tests -python test_assemblyai_u3pro.py --test all - -# Interactive mode -python test_assemblyai_u3pro.py --interactive -``` diff --git a/TESTING_SETUP.md b/TESTING_SETUP.md deleted file mode 100644 index fa1dca462..000000000 --- a/TESTING_SETUP.md +++ /dev/null @@ -1,310 +0,0 @@ -# 
AssemblyAI u3-rt-pro Testing Setup Guide - -## Quick Start - -### 1. Setup Environment - -```bash -# Copy API keys -cp .env.testing .env - -# Install dependencies -uv sync --group dev --all-extras --no-extra gstreamer --no-extra krisp - -# Make test script executable -chmod +x test_assemblyai_u3pro.py -``` - -### 2. Ensure Audio Devices - -Make sure you have: -- **Microphone** enabled and working -- **Speakers/headphones** connected -- Audio permissions granted (macOS will prompt on first run) - -### 3. Run Tests - -```bash -# Run a specific test -python test_assemblyai_u3pro.py --test basic - -# Interactive mode (choose from menu) -python test_assemblyai_u3pro.py --interactive - -# Run all tests sequentially -python test_assemblyai_u3pro.py --test all -``` - ---- - -## Available Tests - -### Basic Configuration Tests -```bash -# Test 1: Default configuration (min=max=100ms) -python test_assemblyai_u3pro.py --test basic - -# Test 2: Custom min_turn_silence -python test_assemblyai_u3pro.py --test custom_min - -# Test 3: max_turn_silence warning (should be overridden) -python test_assemblyai_u3pro.py --test max_warning -``` - -### Prompting Tests -```bash -# Test 5: Custom prompt warning -python test_assemblyai_u3pro.py --test prompt_warning - -# Test 6: Prompt + keyterms conflict (should error) -python test_assemblyai_u3pro.py --test prompt_keyterms_conflict - -# Test 7: Basic keyterms prompting -python test_assemblyai_u3pro.py --test keyterms -``` - -### Diarization Tests -```bash -# Test 10: Diarization without formatting -python test_assemblyai_u3pro.py --test diarization - -# Test 11: Diarization with XML formatting -python test_assemblyai_u3pro.py --test diarization_xml -``` - -### Dynamic Updates Tests -```bash -# Test 13: Dynamic keyterms (multi-stage) -python test_assemblyai_u3pro.py --test dynamic_keyterms - -# Test 15: Dynamic silence parameters -python test_assemblyai_u3pro.py --test dynamic_silence - -# Test 17: Multiple parameters at once -python 
test_assemblyai_u3pro.py --test multi_param -``` - ---- - -## Test Execution Flow - -### For Each Test: - -1. **Start the test script** - ```bash - python test_assemblyai_u3pro.py --test <test_name> - ``` - -2. **Wait for "started" message** indicating the bot is ready - -3. **Speak into your microphone** to test - the bot will: - - Transcribe your speech (you'll see `📝 TRANSCRIPTION:` logs) - - Process through the LLM - - Respond with voice through your speakers - -4. **Observe logs** for: - - ✅ Success indicators - - ⚠️ Warning messages - - ❌ Error messages - - 📝 Transcription output - -5. **Verify expected behavior** against checklist - -6. **Stop test** with Ctrl+C - ---- - -## Expected Test Outcomes - -### Should Pass (✅) -- Basic configuration creates service -- Custom parameters are applied -- Keyterms boost recognition -- Diarization shows speaker IDs -- Dynamic updates work without errors - -### Should Warn (⚠️) -Check logs for warnings: -- "We recommend testing at first with no prompt" -- "max_turn_silence is not used in Pipecat mode" -- "Unknown setting for AssemblyAI STT service" - -### Should Error (❌) -Should raise ValueError and fail to start: -- Both prompt and keyterms_prompt set at init -- Both prompt and keyterms_prompt in same update -- vad_force_turn_endpoint=False with universal-streaming - ---- - -## Debugging Tips - -### Check Logs -```bash -# Run with verbose logging -LOGURU_LEVEL=DEBUG python test_assemblyai_u3pro.py --test <test_name> -``` - -### Common Issues - -**Issue: "WebSocket connection failed"** -- Check ASSEMBLYAI_API_KEY is correct -- Verify network connection -- Check firewall settings - -**Issue: "No audio input/output"** -- Verify microphone permissions (System Preferences → Security & Privacy → Microphone) -- Check default audio devices in System Preferences → Sound -- Test microphone with another app first -- Make sure no other app is using the microphone - -**Issue: "No transcriptions appearing"** -- Verify microphone permissions -- Check audio 
levels (speak louder or move closer to mic) -- Speak clearly and wait for VAD to detect -- Check if microphone is muted - -**Issue: "Can't hear bot responses"** -- Check speaker/headphone volume -- Verify correct output device is selected -- Check terminal for TTS errors - -**Issue: "Service fails to start"** -- Check all API keys in .env -- Run `uv sync` to ensure dependencies installed -- Check Python version (3.10+) - ---- - -## Manual Testing Checklist - -After running automated tests, manually verify: - -### ✅ Audio Quality -- [ ] Transcriptions are accurate -- [ ] No distortion or dropouts -- [ ] Latency is acceptable - -### ✅ Turn Detection -- [ ] Bot waits for user to finish speaking -- [ ] No premature cutoffs -- [ ] Handles natural pauses correctly - -### ✅ Interruptions -- [ ] Can interrupt bot mid-sentence -- [ ] Interruption is smooth -- [ ] Bot stops speaking immediately - -### ✅ Diarization (if enabled) -- [ ] Multiple speakers detected correctly -- [ ] Speaker IDs consistent -- [ ] Speaker formatting works - -### ✅ Dynamic Updates -- [ ] Keyterms update without disconnection -- [ ] Turn detection timing changes work -- [ ] Updates logged correctly - ---- - -## Test Results Recording - -### Use this template: - -```markdown -## Test Run: YYYY-MM-DD - -| Test # | Test Name | Status | Notes | -|--------|-----------|--------|-------| -| 1 | basic | ✅ PASS | Transcriptions working | -| 2 | custom_min | ✅ PASS | Turn timing changed | -| 3 | max_warning | ✅ PASS | Warning logged | -| 5 | prompt_warning | ✅ PASS | Warning shown | -| 6 | prompt_keyterms_conflict | ✅ PASS | ValueError raised | -| 7 | keyterms | ✅ PASS | Terms boosted | -| 10 | diarization | ✅ PASS | Speaker IDs correct | -| 11 | diarization_xml | ✅ PASS | XML tags shown | -| 13 | dynamic_keyterms | ✅ PASS | Updates worked | -| 15 | dynamic_silence | ✅ PASS | Timing adjusted | -| 17 | multi_param | ✅ PASS | All params updated | - -### Issues Found: -- None - -### Notes: -- All tests passed 
successfully -- Latency is excellent (sub-300ms) -- Diarization accuracy is good -``` - ---- - -## Advanced Testing - -### Custom Test Scenarios - -Create custom tests by modifying `test_assemblyai_u3pro.py`: - -```python -async def test_my_custom_scenario(): - """My custom test scenario.""" - logger.info("Testing my specific use case") - - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - # Your custom params here - ) - - task, transport = await create_basic_voice_agent(connection_params) - - # Your test logic here - - runner = PipelineRunner() - await runner.run(task) -``` - -### Stress Testing - -Test with: -- Multiple simultaneous speakers -- Long conversations (30+ minutes) -- Rapid speech -- Heavy accents -- Background noise -- Poor network conditions - ---- - -## Reporting Issues - -When reporting issues, include: - -1. **Test name and number** -2. **Full error message and stack trace** -3. **Relevant log output** (use LOGURU_LEVEL=DEBUG) -4. **Configuration used** (connection_params) -5. **Expected vs actual behavior** -6. **Steps to reproduce** - ---- - -## Next Steps - -After testing: - -1. ✅ Mark completed tests in `TESTING_CHECKLIST.md` -2. 📝 Document any issues found -3. 🐛 Create GitHub issues for bugs -4. ✨ Suggest improvements -5. 📊 Share results with team - ---- - -## Contact - -Questions? Issues? -- Check `TESTING_CHECKLIST.md` for detailed test descriptions -- Review logs with `LOGURU_LEVEL=DEBUG` -- Reach out to the team with your findings - -Happy testing! 🎯 diff --git a/test_assemblyai_custom.py b/test_assemblyai_custom.py deleted file mode 100755 index e8e0a28d2..000000000 --- a/test_assemblyai_custom.py +++ /dev/null @@ -1,240 +0,0 @@ -#!/usr/bin/env python3 -"""Custom AssemblyAI u3-rt-pro Test Script -Easy parameter tweaking for experimentation - -Edit the CONFIGURATION section below to test different settings! 
-""" - -import asyncio -import os -import sys - -from dotenv import load_dotenv -from loguru import logger - -from pipecat.audio.vad.silero import SileroVADAnalyzer -from pipecat.frames.frames import LLMRunFrame -from pipecat.pipeline.pipeline import Pipeline -from pipecat.pipeline.runner import PipelineRunner -from pipecat.pipeline.task import PipelineParams, PipelineTask -from pipecat.processors.aggregators.llm_context import LLMContext -from pipecat.processors.aggregators.llm_response_universal import ( - LLMContextAggregatorPair, - LLMUserAggregatorParams, -) -from pipecat.services.assemblyai.models import AssemblyAIConnectionParams -from pipecat.services.assemblyai.stt import AssemblyAISTTService -from pipecat.services.cartesia.tts import CartesiaTTSService -from pipecat.services.openai.llm import OpenAILLMService -from pipecat.transports.local.audio import LocalAudioTransport, LocalAudioTransportParams - -load_dotenv(override=True) - -# ============================================================================ -# CONFIGURATION -# ============================================================================ - -# Log Level: "DEBUG" for detailed logs, "INFO" for normal operation -LOG_LEVEL = "INFO" - -# ============================================================================ -# BOT IMPLEMENTATION -# ============================================================================ - - -async def main(): - """Run the custom test bot with your configured parameters.""" - # Setup logging - logger.remove(0) - logger.add(sys.stderr, level=LOG_LEVEL) - - logger.info("=" * 80) - logger.info("AssemblyAI u3-rt-pro Custom Test") - logger.info("=" * 80) - logger.info("Starting bot... 
Speak after you hear the greeting!") - logger.info("=" * 80) - - # Create local audio transport - transport = LocalAudioTransport( - LocalAudioTransportParams( - audio_in_enabled=True, - audio_out_enabled=True, - ) - ) - - # ======================================================================== - # EDIT PARAMETERS HERE - # ======================================================================== - - # Build connection params - connection_params = AssemblyAIConnectionParams( - # ==================================================================== - # Model Selection - # ==================================================================== - speech_model="u3-rt-pro", - # speech_model="universal-streaming-english", - # speech_model="universal-streaming-multilingual", - # ==================================================================== - # Turn Detection Timing - # ==================================================================== - # Minimum silence when confident about end of turn (milliseconds) - # Default: 100ms | Higher = more patient | Lower = faster responses - # Only used in Pipecat mode (vad_force_turn_endpoint=True) - min_turn_silence=100000, - # min_turn_silence=200, - # min_turn_silence=300, - # Maximum turn silence (milliseconds) - # WARNING: In Pipecat mode (vad_force_turn_endpoint=True), this is - # automatically set equal to min_turn_silence - # to avoid double turn detection. Only used as-is in STT mode. - max_turn_silence=500, - # End of turn confidence threshold (0.0 to 1.0) - # Higher = requires more confidence before ending turn - # end_of_turn_confidence_threshold=0.8, - # ==================================================================== - # Prompting & Boosting - # ==================================================================== - # Custom Prompt (WARNING: test carefully, default is optimized!) 
- # None = Use AssemblyAI's optimized default (recommended for 88% accuracy) - prompt=None, - # prompt="Transcribe speech with focus on technical terms.", - # prompt="Context: Medical conversation. Transcribe accurately.", - # Keyterms Prompting (boosts recognition for specific words) - # NOTE: Cannot use both prompt and keyterms_prompt! - keyterms_prompt=None, - # keyterms_prompt=["Pipecat", "AssemblyAI", "OpenAI", "Cartesia"], - # keyterms_prompt=["Python", "JavaScript", "TypeScript", "API"], - # ==================================================================== - # Diarization (Speaker Identification) - # ==================================================================== - # Enable speaker labels (identifies different speakers) - speaker_labels=None, # None or True - # speaker_labels=True, - # ==================================================================== - # Audio Configuration - # ==================================================================== - # Audio sample rate (Hz) - # sample_rate=16000, - # sample_rate=8000, - # Audio encoding format - # encoding="pcm_s16le", # Default: 16-bit PCM - # encoding="pcm_mulaw", # μ-law encoding (telephony) - # ==================================================================== - # Other Options - # ==================================================================== - # Format transcript turns (applies formatting rules) - # format_turns=True, # Default - # format_turns=False, - # Language detection (only for universal-streaming-multilingual) - # language_detection=True, - ) - - # Log connection parameters for debugging - logger.info("=" * 80) - logger.info("CONNECTION PARAMETERS:") - logger.info(f" speech_model: {connection_params.speech_model}") - logger.info(f" min_turn_silence: {connection_params.min_turn_silence}") - logger.info(f" max_turn_silence: {connection_params.max_turn_silence}") - logger.info(f" sample_rate: {connection_params.sample_rate}") - logger.info(f" encoding: 
{connection_params.encoding}") - logger.info(f" prompt: {connection_params.prompt}") - logger.info(f" keyterms_prompt: {connection_params.keyterms_prompt}") - logger.info(f" speaker_labels: {connection_params.speaker_labels}") - logger.info(f" format_turns: {connection_params.format_turns}") - logger.info( - f" end_of_turn_confidence_threshold: {connection_params.end_of_turn_confidence_threshold}" - ) - logger.info(f" language_detection: {connection_params.language_detection}") - logger.info("=" * 80) - - # AssemblyAI Speech-to-Text Service - stt = AssemblyAISTTService( - api_key=os.getenv("ASSEMBLYAI_API_KEY"), - connection_params=connection_params, - # Turn Detection Mode - # True = Pipecat mode (VAD + Smart Turn controls turns) - # False = STT mode (u3-rt-pro model controls turns) - vad_force_turn_endpoint=True, - # Speaker Formatting (only used if speaker_labels=True) - # None = Just log speaker IDs, don't modify transcript - speaker_format=None, - # speaker_format="{text}", - # speaker_format="{speaker}: {text}", - # speaker_format="[{speaker}] {text}", - # Additional available parameters (uncomment to use): - # should_interrupt=True, # Only for STT mode - ) - - # ======================================================================== - - # Text-to-Speech - tts = CartesiaTTSService( - api_key=os.getenv("CARTESIA_API_KEY"), - voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Conversational English - ) - - # LLM - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4", - ) - - # Conversation context - messages = [ - { - "role": "system", - "content": ( - "You are a helpful voice assistant testing the AssemblyAI u3-rt-pro model. " - "Keep responses very brief (1-2 sentences). " - "Start by introducing yourself briefly and asking the user to speak." 
- ), - }, - ] - - context = LLMContext(messages) - - # Configure aggregator based on mode - # In STT mode, don't use VAD (model handles turn detection) - # In Pipecat mode, use VAD + Smart Turn - vad_force_turn_endpoint = True # Must match the value in stt configuration above - user_params = None - if vad_force_turn_endpoint: - user_params = LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()) - - user_aggregator, assistant_aggregator = LLMContextAggregatorPair( - context, - user_params=user_params, - ) - - # Pipeline - pipeline = Pipeline( - [ - transport.input(), - stt, - user_aggregator, - llm, - tts, - transport.output(), - assistant_aggregator, - ] - ) - - # Task - task = PipelineTask( - pipeline, - params=PipelineParams( - enable_metrics=True, - enable_usage_metrics=True, - ), - ) - - # Start the conversation - await task.queue_frames([LLMRunFrame()]) - - # Run - runner = PipelineRunner() - await runner.run(task) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/test_assemblyai_interactive.py b/test_assemblyai_interactive.py deleted file mode 100755 index c5ec0b429..000000000 --- a/test_assemblyai_interactive.py +++ /dev/null @@ -1,749 +0,0 @@ -#!/usr/bin/env python3 -"""Interactive AssemblyAI u3-rt-pro Comprehensive Test Suite - -Tests all features with detailed scenarios: -- Basic configuration variations -- Prompting and keyterms with difficult names -- Diarization -- Dynamic parameter updates (single and multiple) -- Mode comparisons -- STT mode timing experiments (testing silence parameters) -- Edge cases - -Usage: - python test_assemblyai_interactive.py -""" - -import asyncio -import os -import sys -from typing import Optional - -from dotenv import load_dotenv -from loguru import logger - -from pipecat.audio.vad.silero import SileroVADAnalyzer -from pipecat.frames.frames import LLMRunFrame, STTUpdateSettingsFrame -from pipecat.pipeline.pipeline import Pipeline -from pipecat.pipeline.runner import PipelineRunner -from 
pipecat.pipeline.task import PipelineParams, PipelineTask -from pipecat.processors.aggregators.llm_context import LLMContext -from pipecat.processors.aggregators.llm_response_universal import ( - LLMContextAggregatorPair, - LLMUserAggregatorParams, -) -from pipecat.services.assemblyai.models import AssemblyAIConnectionParams -from pipecat.services.assemblyai.stt import AssemblyAISTTService, AssemblyAISTTSettings -from pipecat.services.cartesia.tts import CartesiaTTSService -from pipecat.services.openai.llm import OpenAILLMService -from pipecat.transports.local.audio import LocalAudioTransport, LocalAudioTransportParams - -load_dotenv(override=True) - -logger.remove(0) -logger.add(sys.stderr, level="INFO") - - -async def run_bot( - connection_params: AssemblyAIConnectionParams, - test_name: str, - vad_force_turn_endpoint: bool = True, - speaker_format: Optional[str] = None, - test_dynamic_updates: Optional[callable] = None, -): - """Run the voice bot with specified configuration.""" - logger.info("=" * 80) - logger.info(f"TEST: {test_name}") - logger.info("=" * 80) - logger.info("Starting bot... Speak into your microphone after you hear the greeting!") - logger.info("=" * 80) - - # Create local audio transport - transport = LocalAudioTransport( - LocalAudioTransportParams( - audio_in_enabled=True, - audio_out_enabled=True, - ) - ) - - # AssemblyAI Speech-to-Text - stt = AssemblyAISTTService( - api_key=os.getenv("ASSEMBLYAI_API_KEY"), - connection_params=connection_params, - vad_force_turn_endpoint=vad_force_turn_endpoint, - speaker_format=speaker_format, - ) - - # Text-to-Speech - tts = CartesiaTTSService( - api_key=os.getenv("CARTESIA_API_KEY"), - voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", - ) - - # LLM - llm = OpenAILLMService( - api_key=os.getenv("OPENAI_API_KEY"), - model="gpt-4", - ) - - # Conversation context - messages = [ - { - "role": "system", - "content": ( - "You are a helpful voice assistant testing the AssemblyAI u3-rt-pro model. 
" - "Keep responses very brief (1-2 sentences). " - "Start by introducing yourself briefly and asking the user to speak." - ), - }, - ] - - context = LLMContext(messages) - - # Configure aggregator based on mode - user_params = None - if vad_force_turn_endpoint: - user_params = LLMUserAggregatorParams(vad_analyzer=SileroVADAnalyzer()) - - user_aggregator, assistant_aggregator = LLMContextAggregatorPair( - context, - user_params=user_params, - ) - - # Pipeline - pipeline = Pipeline( - [ - transport.input(), - stt, - user_aggregator, - llm, - tts, - transport.output(), - assistant_aggregator, - ] - ) - - # Task - task = PipelineTask( - pipeline, - params=PipelineParams( - enable_metrics=True, - enable_usage_metrics=True, - ), - ) - - # Handle dynamic updates if provided - if test_dynamic_updates: - asyncio.create_task(test_dynamic_updates(task)) - - # Start the conversation - await task.queue_frames([LLMRunFrame()]) - - # Run - runner = PipelineRunner() - await runner.run(task) - - -# ============================================================================ -# Test Configurations -# ============================================================================ - -# === BASIC CONFIGURATION (1-3) === - - -async def test_01_basic_100ms(): - """Test 1: Basic default configuration (100ms).""" - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - min_turn_silence=100, - ) - await run_bot(connection_params, "Basic Default Configuration (100ms)") - - -async def test_02_custom_200ms(): - """Test 2: Custom min_end_of_turn_silence (200ms).""" - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - min_turn_silence=200, - ) - await run_bot(connection_params, "Custom Turn Silence (200ms)") - - -async def test_03_custom_500ms(): - """Test 3: Longer silence threshold (500ms).""" - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - min_turn_silence=500, - ) - await run_bot(connection_params, "Longer Turn 
Silence (500ms)") - - -# === PROMPTING & WARNINGS (4-7) === - - -async def test_04_max_warning(): - """Test 4: max_turn_silence warning (should be overridden).""" - logger.warning("⚠️ EXPECT WARNING: max_turn_silence will be overridden") - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - max_turn_silence=500, - ) - await run_bot(connection_params, "max_turn_silence Override Warning") - - -async def test_05_prompt_warning(): - """Test 5: Custom prompt warning.""" - logger.warning("⚠️ EXPECT WARNING: Custom prompts should be tested carefully") - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - prompt="Transcribe speech accurately with proper punctuation.", - ) - await run_bot(connection_params, "Custom Prompt Warning Test") - - -async def test_06_prompt_keyterms_conflict(): - """Test 6: Prompt + keyterms conflict (should error).""" - logger.error("❌ EXPECT ERROR: Cannot use both prompt and keyterms_prompt") - try: - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - prompt="Custom prompt", - keyterms_prompt=["test"], - ) - await run_bot(connection_params, "Prompt + Keyterms Conflict (ERROR)") - except ValueError as e: - logger.error(f"✅ EXPECTED ERROR: {e}") - input("\nPress Enter to continue...") - return - - -async def test_07_keyterms_difficult(): - """Test 7: Keyterms with difficult/unusual names.""" - # Use names that STT wouldn't normally get right - keyterms = ["Xiomara", "Saoirse", "Krzystof", "Nguyen", "Pipecat", "AssemblyAI"] - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - keyterms_prompt=keyterms, - ) - logger.info("🎯 Boosted terms: Xiomara, Saoirse, Krzystof, Nguyen, Pipecat, AssemblyAI") - logger.info(" Try saying these difficult names to test boosting!") - await run_bot(connection_params, "Keyterms with Difficult Names") - - -# === DIARIZATION (8-9) === - - -async def test_08_diarization_basic(): - """Test 8: Basic diarization (speaker 
IDs logged).""" - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - speaker_labels=True, - ) - logger.info("🎤 Diarization enabled - speaker IDs will be logged") - logger.info(" Try having multiple people speak!") - await run_bot(connection_params, "Diarization - Basic") - - -async def test_09_diarization_xml(): - """Test 9: Diarization with XML formatting.""" - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - speaker_labels=True, - ) - logger.info("🎤 Diarization with XML tags") - logger.info(" Transcripts will include text") - await run_bot( - connection_params, - "Diarization - XML Formatting", - speaker_format="{text}", - ) - - -# === DYNAMIC UPDATES - SINGLE PARAMETER (10-13) === - - -async def test_10_dynamic_keyterms(): - """Test 10: Dynamic keyterms update with difficult names.""" - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - ) - - async def dynamic_update(task): - logger.info("\n" + "=" * 80) - logger.info("PHASE 1: No keyterms boosting") - logger.info(" Try saying: Xiomara, Saoirse, Krzystof") - logger.info(" (May not transcribe correctly)") - logger.info("=" * 80) - await asyncio.sleep(15) - - logger.info("\n" + "=" * 80) - logger.info("🔄 UPDATING: Adding keyterms boost") - logger.info("=" * 80) - await task.queue_frame( - STTUpdateSettingsFrame( - delta=AssemblyAISTTSettings( - connection_params=AssemblyAIConnectionParams( - keyterms_prompt=["Xiomara", "Saoirse", "Krzystof", "Nguyen"] - ) - ) - ) - ) - logger.info("\n" + "=" * 80) - logger.info("PHASE 2: Keyterms NOW boosted") - logger.info(" Say the same names again: Xiomara, Saoirse, Krzystof") - logger.info(" (Should transcribe better now!)") - logger.info("=" * 80) - - logger.info("🔄 This test has 2 phases:") - logger.info(" Phase 1 (15s): No boosting - names may be wrong") - logger.info(" Phase 2: Keyterms added - names should improve") - await run_bot( - connection_params, - "Dynamic Keyterms Update 
(Before/After)", - test_dynamic_updates=dynamic_update, - ) - - -async def test_11_dynamic_silence(): - """Test 11: Dynamic silence parameter update (dramatic change).""" - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - min_turn_silence=100, - ) - - async def dynamic_update(task): - logger.info("\n" + "=" * 80) - logger.info("PHASE 1: Quick responses (100ms silence threshold)") - logger.info(" Speak normally - bot responds quickly") - logger.info("=" * 80) - await asyncio.sleep(10) - - logger.info("\n" + "=" * 80) - logger.info("🔄 UPDATING: Changing silence from 100ms → 3000ms (3 seconds!)") - logger.info("=" * 80) - await task.queue_frame( - STTUpdateSettingsFrame( - delta=AssemblyAISTTSettings( - connection_params=AssemblyAIConnectionParams(min_turn_silence=3000) - ) - ) - ) - logger.info("\n" + "=" * 80) - logger.info("PHASE 2: Patient responses (3 second silence threshold)") - logger.info(" Bot will wait 3 full seconds before responding") - logger.info(" Try pausing mid-sentence - bot should NOT interrupt") - logger.info("=" * 80) - - logger.info("🔄 Dramatic change: 100ms → 3000ms after 10 seconds") - await run_bot( - connection_params, - "Dynamic Silence Update (100ms → 3s)", - test_dynamic_updates=dynamic_update, - ) - - -async def test_12_dynamic_prompt(): - """Test 12: Dynamic prompt update with keyterms in prompt.""" - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - ) - - async def dynamic_update(task): - logger.info("\n" + "=" * 80) - logger.info("PHASE 1: Default prompt (no keyterms)") - logger.info(" Try saying: Xiomara, Saoirse, Krzystof") - logger.info(" (May not transcribe correctly)") - logger.info("=" * 80) - await asyncio.sleep(15) - - logger.info("\n" + "=" * 80) - logger.info("🔄 UPDATING: Adding custom prompt with keyterms") - logger.info("=" * 80) - custom_prompt = """Transcribe verbatim. Rules: -1) Always include punctuation in output. 
-2) Use period/question mark ONLY for complete sentences. -3) Use comma for mid-sentence pauses. -4) Use no punctuation for incomplete trailing speech. -5) Filler words (um, uh, so, like) indicate speaker will continue. - -Pay special attention to these names and transcribe them exactly: Xiomara, Saoirse, Krzystof, Nguyen.""" - await task.queue_frame( - STTUpdateSettingsFrame( - delta=AssemblyAISTTSettings( - connection_params=AssemblyAIConnectionParams(prompt=custom_prompt) - ) - ) - ) - logger.info("\n" + "=" * 80) - logger.info("PHASE 2: Prompt with keyterms NOW active") - logger.info(" Say the same names again: Xiomara, Saoirse, Krzystof") - logger.info(" (Should transcribe better now!)") - logger.info("=" * 80) - - logger.info("🔄 This test has 2 phases:") - logger.info(" Phase 1 (15s): Default prompt - names may be wrong") - logger.info(" Phase 2: Custom prompt with keyterms - names should improve") - await run_bot( - connection_params, - "Dynamic Prompt Update (with keyterms)", - test_dynamic_updates=dynamic_update, - ) - - -async def test_13_dynamic_clear_keyterms(): - """Test 13: Clear keyterms dynamically.""" - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - keyterms_prompt=["Pipecat", "AssemblyAI"], - ) - - async def dynamic_update(task): - await asyncio.sleep(10) - logger.info("🔄 UPDATING: Clearing keyterms (empty array)") - await task.queue_frame( - STTUpdateSettingsFrame( - delta=AssemblyAISTTSettings( - connection_params=AssemblyAIConnectionParams(keyterms_prompt=[]) - ) - ) - ) - - logger.info("🎯 Initial: Pipecat, AssemblyAI boosted") - logger.info("🔄 After 10s: Keyterms will be cleared") - await run_bot( - connection_params, - "Dynamic Clear Keyterms", - test_dynamic_updates=dynamic_update, - ) - - -# === DYNAMIC UPDATES - MULTIPLE PARAMETERS (14-15) === - - -async def test_14_multi_param_update(): - """Test 14: Update multiple parameters at once.""" - connection_params = AssemblyAIConnectionParams( - 
speech_model="u3-rt-pro", - min_turn_silence=100, - ) - - async def dynamic_update(task): - await asyncio.sleep(10) - logger.info("🔄 UPDATING MULTIPLE: keyterms + silence") - await task.queue_frame( - STTUpdateSettingsFrame( - delta=AssemblyAISTTSettings( - connection_params=AssemblyAIConnectionParams( - keyterms_prompt=["Xiomara", "Pipecat"], - min_turn_silence=250, - ) - ) - ) - ) - - logger.info("🔄 After 10s: Will update BOTH keyterms AND silence threshold") - await run_bot( - connection_params, - "Multiple Parameter Update", - test_dynamic_updates=dynamic_update, - ) - - -async def test_15_complex_sequence(): - """Test 15: Complex multi-stage update sequence.""" - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - ) - - async def dynamic_update(task): - logger.info("Stage 1: Initial (10s)") - await asyncio.sleep(10) - - logger.info("🔄 Stage 2: Add keyterms") - await task.queue_frame( - STTUpdateSettingsFrame( - delta=AssemblyAISTTSettings( - connection_params=AssemblyAIConnectionParams(keyterms_prompt=["Pipecat"]) - ) - ) - ) - await asyncio.sleep(10) - - logger.info("🔄 Stage 3: Change silence") - await task.queue_frame( - STTUpdateSettingsFrame( - delta=AssemblyAISTTSettings( - connection_params=AssemblyAIConnectionParams(min_turn_silence=200) - ) - ) - ) - await asyncio.sleep(10) - - logger.info("🔄 Stage 4: Update both") - await task.queue_frame( - STTUpdateSettingsFrame( - delta=AssemblyAISTTSettings( - connection_params=AssemblyAIConnectionParams( - keyterms_prompt=["AssemblyAI", "OpenAI"], - min_turn_silence=150, - ) - ) - ) - ) - - logger.info("🔄 Multi-stage: 4 configuration changes over 30 seconds") - await run_bot( - connection_params, - "Complex Update Sequence (4 stages)", - test_dynamic_updates=dynamic_update, - ) - - -# === MODE COMPARISON (16-17) === - - -async def test_16_pipecat_mode(): - """Test 16: Pipecat mode (VAD + Smart Turn controls turns).""" - connection_params = AssemblyAIConnectionParams( - 
speech_model="u3-rt-pro", - min_turn_silence=100, - ) - logger.info("🎯 Pipecat Mode: VAD + Smart Turn control turn detection") - logger.info(" Your min_end_of_turn_silence is sent but ForceEndpoint overrides it") - await run_bot( - connection_params, - "Pipecat Mode (VAD + Smart Turn)", - vad_force_turn_endpoint=True, - ) - - -async def test_17_stt_mode(): - """Test 17: STT mode (model controls turns).""" - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - min_turn_silence=100, - ) - logger.info("🎯 STT Mode: u3-rt-pro model controls turn detection") - logger.info(" No ForceEndpoint - parameters are respected") - await run_bot( - connection_params, - "STT Mode (Model Turn Detection)", - vad_force_turn_endpoint=False, - ) - - -# === STT MODE TIMING EXPERIMENTS (18-20) === - - -async def test_18_stt_long_max_short_min(): - """Test 18: STT mode - Long max_turn_silence + Short min (5000ms + 100ms).""" - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - min_turn_silence=100, # Short - quick confident turns - max_turn_silence=5000, # Long - allows pauses up to 5 seconds - ) - logger.info("🎯 STT Mode: Testing max/min parameter interaction") - logger.info(" min_turn_silence: 100ms (quick when confident)") - logger.info(" max_turn_silence: 5000ms (allows up to 5 second pauses)") - logger.info(" Try: Quick sentences (should respond fast) + Long pauses mid-thought") - await run_bot( - connection_params, - "STT: Long Max (5s) + Short Min (100ms)", - vad_force_turn_endpoint=False, - ) - - -async def test_19_stt_long_min(): - """Test 19: STT mode - Long min_turn_silence (3000ms).""" - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - min_turn_silence=3000, # 3 seconds - max_turn_silence=5000, # 5 seconds - ) - logger.info("🎯 STT Mode: Testing long minimum silence requirement") - logger.info(" min_turn_silence: 3000ms") - logger.info(" max_turn_silence: 5000ms") - logger.info(" Bot will wait 3 full 
seconds of silence before responding!") - logger.info(" Try: Speaking with short pauses - bot should NOT interrupt") - await run_bot( - connection_params, - "STT: Long Min (3s)", - vad_force_turn_endpoint=False, - ) - - -async def test_20_stt_both_short(): - """Test 20: STT mode - Both short (max=300ms, min=100ms).""" - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - min_turn_silence=100, # 100ms - max_turn_silence=300, # 300ms - ) - logger.info("🎯 STT Mode: Testing aggressive/quick response timing") - logger.info(" min_turn_silence: 100ms") - logger.info(" max_turn_silence: 300ms") - logger.info(" Bot will respond VERY quickly to any pause!") - logger.info(" Try: Speaking with natural pauses - expect quick responses") - await run_bot( - connection_params, - "STT: Both Short (300ms/100ms)", - vad_force_turn_endpoint=False, - ) - - -# === EDGE CASES (21-23) === - - -async def test_21_very_long_silence(): - """Test 21: Very long silence threshold (STT mode only).""" - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - min_turn_silence=10000, # 10 seconds - ) - logger.warning("⚠️ STT Mode with 10 second silence threshold") - logger.info(" Bot will wait 10 seconds of silence before responding!") - await run_bot( - connection_params, - "Very Long Silence (10s) - STT Mode", - vad_force_turn_endpoint=False, - ) - - -async def test_22_very_short_silence(): - """Test 22: Very short silence threshold (50ms).""" - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - min_turn_silence=50, - ) - logger.info("⚡ Very short silence threshold (50ms)") - logger.info(" Bot will respond very quickly!") - await run_bot(connection_params, "Very Short Silence (50ms)") - - -async def test_23_keyterms_plus_diarization(): - """Test 23: Keyterms + Diarization combined.""" - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - keyterms_prompt=["Xiomara", "Saoirse", "Pipecat"], - 
speaker_labels=True, - ) - logger.info("🎯 Keyterms + 🎤 Diarization both enabled") - logger.info(" Try multiple speakers saying difficult names!") - await run_bot( - connection_params, - "Keyterms + Diarization Combined", - speaker_format="[{speaker}] {text}", - ) - - -# ============================================================================ -# Interactive Menu -# ============================================================================ - - -def show_menu(): - """Display the comprehensive test menu.""" - print("\n" + "=" * 80) - print("AssemblyAI u3-rt-pro Comprehensive Test Suite") - print("=" * 80) - print("\n📋 BASIC CONFIGURATION (1-3)") - print(" 1. Basic Default (100ms)") - print(" 2. Custom Silence (200ms)") - print(" 3. Longer Silence (500ms)") - - print("\n⚠️ PROMPTING & WARNINGS (4-7)") - print(" 4. max_turn_silence Warning") - print(" 5. Custom Prompt Warning") - print(" 6. Prompt + Keyterms Conflict (ERROR)") - print(" 7. Keyterms with Difficult Names") - - print("\n🎤 DIARIZATION (8-9)") - print(" 8. Diarization - Basic") - print(" 9. Diarization - XML Formatting") - - print("\n🔄 DYNAMIC UPDATES - SINGLE (10-13)") - print(" 10. Dynamic Keyterms (Before/After with difficult names)") - print(" 11. Dynamic Silence (100ms → 3s DRAMATIC)") - print(" 12. Dynamic Prompt with Keyterms (Before/After)") - print(" 13. Dynamic Clear Keyterms") - - print("\n🔄 DYNAMIC UPDATES - MULTIPLE (14-15)") - print(" 14. Multiple Parameters at Once") - print(" 15. Complex Update Sequence (4 stages)") - - print("\n⚖️ MODE COMPARISON (16-17)") - print(" 16. Pipecat Mode (VAD + Smart Turn)") - print(" 17. STT Mode (Model Turn Detection)") - - print("\n⏱️ STT MODE TIMING EXPERIMENTS (18-20)") - print(" 18. STT: Long Max (5s) + Short Min (100ms)") - print(" 19. STT: Long Min (3s)") - print(" 20. STT: Both Short (300ms/100ms)") - - print("\n🎯 EDGE CASES (21-23)") - print(" 21. Very Long Silence (10s - STT Mode)") - print(" 22. Very Short Silence (50ms)") - print(" 23. 
Keyterms + Diarization Combined") - - print("\n 0. Exit") - print("\n" + "=" * 80) - - -async def main(): - """Main interactive menu.""" - tests = { - "1": test_01_basic_100ms, - "2": test_02_custom_200ms, - "3": test_03_custom_500ms, - "4": test_04_max_warning, - "5": test_05_prompt_warning, - "6": test_06_prompt_keyterms_conflict, - "7": test_07_keyterms_difficult, - "8": test_08_diarization_basic, - "9": test_09_diarization_xml, - "10": test_10_dynamic_keyterms, - "11": test_11_dynamic_silence, - "12": test_12_dynamic_prompt, - "13": test_13_dynamic_clear_keyterms, - "14": test_14_multi_param_update, - "15": test_15_complex_sequence, - "16": test_16_pipecat_mode, - "17": test_17_stt_mode, - "18": test_18_stt_long_max_short_min, - "19": test_19_stt_long_min, - "20": test_20_stt_both_short, - "21": test_21_very_long_silence, - "22": test_22_very_short_silence, - "23": test_23_keyterms_plus_diarization, - } - - while True: - show_menu() - choice = input("Enter test number (or 0 to exit): ").strip() - - if choice == "0": - print("\n👋 Goodbye!") - break - - if choice in tests: - try: - await tests[choice]() - except KeyboardInterrupt: - print("\n\n⚠️ Test interrupted by user") - except Exception as e: - logger.error(f"Test failed with error: {e}") - import traceback - - traceback.print_exc() - - input("\n\nPress Enter to return to menu...") - else: - print(f"\n❌ Invalid choice: {choice}") - input("Press Enter to continue...") - - -if __name__ == "__main__": - try: - asyncio.run(main()) - except KeyboardInterrupt: - print("\n\n👋 Goodbye!") diff --git a/test_assemblyai_u3pro.py b/test_assemblyai_u3pro.py deleted file mode 100644 index 236ab9b50..000000000 --- a/test_assemblyai_u3pro.py +++ /dev/null @@ -1,582 +0,0 @@ -#!/usr/bin/env python3 -"""AssemblyAI u3-rt-pro Comprehensive Test Script - -Tests all features: -- Basic configuration -- Prompting and keyterms -- Diarization -- Dynamic updates -- Turn detection modes - -Usage: - python test_assemblyai_u3pro.py --test 
- python test_assemblyai_u3pro.py --interactive -""" - -import argparse -import asyncio -import os -import sys -from typing import List - -from dotenv import load_dotenv -from loguru import logger - -# Add src to path -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src")) - -from pipecat.audio.vad.silero import SileroVADAnalyzer -from pipecat.frames.frames import ( - EndFrame, - Frame, - LLMRunFrame, - STTUpdateSettingsFrame, - TranscriptionFrame, -) -from pipecat.pipeline.pipeline import Pipeline -from pipecat.pipeline.runner import PipelineRunner -from pipecat.pipeline.task import PipelineTask -from pipecat.processors.aggregators.llm_context import LLMContext -from pipecat.processors.aggregators.llm_response_universal import ( - LLMContextAggregatorPair, - LLMUserAggregatorParams, -) -from pipecat.processors.frame_processor import FrameDirection, FrameProcessor -from pipecat.services.assemblyai.models import AssemblyAIConnectionParams -from pipecat.services.assemblyai.stt import AssemblyAISTTService -from pipecat.services.cartesia.tts import CartesiaTTSService -from pipecat.services.openai.llm import OpenAILLMService -from pipecat.transports.local.audio import LocalAudioTransport, LocalAudioTransportParams - -load_dotenv() - - -# Test configuration -class TestConfig: - """Centralized test configuration.""" - - ASSEMBLYAI_API_KEY = os.getenv("ASSEMBLYAI_API_KEY") - OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") - CARTESIA_API_KEY = os.getenv("CARTESIA_API_KEY") - - @classmethod - def validate(cls): - """Validate all required API keys are set.""" - missing = [] - if not cls.ASSEMBLYAI_API_KEY: - missing.append("ASSEMBLYAI_API_KEY") - if not cls.OPENAI_API_KEY: - missing.append("OPENAI_API_KEY") - if not cls.CARTESIA_API_KEY: - missing.append("CARTESIA_API_KEY") - - if missing: - logger.error(f"Missing required environment variables: {', '.join(missing)}") - return False - return True - - -class TranscriptionLogger(FrameProcessor): - """Log transcriptions 
for test verification.""" - - async def process_frame(self, frame: Frame, direction: FrameDirection): - if isinstance(frame, TranscriptionFrame): - logger.info(f"📝 TRANSCRIPTION: {frame.text}") - logger.info(f" Speaker: {frame.user_id}") - logger.info(f" Finalized: {frame.finalized}") - if hasattr(frame, "result") and frame.result: - if hasattr(frame.result, "speaker"): - logger.info(f" Diarization: {frame.result.speaker}") - - await self.push_frame(frame, direction) - - -async def create_basic_voice_agent( - connection_params: AssemblyAIConnectionParams, - vad_force_turn_endpoint: bool = True, - speaker_format: str = None, -) -> tuple[PipelineTask, LocalAudioTransport]: - """Create a basic voice agent for testing. - - Args: - connection_params: AssemblyAI connection parameters - vad_force_turn_endpoint: Turn detection mode - speaker_format: Optional speaker formatting string - - Returns: - Tuple of (PipelineTask, LocalAudioTransport) - """ - # Create local audio transport (uses your microphone and speakers) - transport = LocalAudioTransport( - params=LocalAudioTransportParams( - audio_in_enabled=True, - audio_out_enabled=True, - ) - ) - - # Create STT - stt = AssemblyAISTTService( - api_key=TestConfig.ASSEMBLYAI_API_KEY, - connection_params=connection_params, - vad_force_turn_endpoint=vad_force_turn_endpoint, - speaker_format=speaker_format, - ) - - # Create TTS - tts = CartesiaTTSService( - api_key=TestConfig.CARTESIA_API_KEY, - voice_id="a0e99841-438c-4a64-b679-ae501e7d6091", # Conversational English - ) - - # Create LLM context and service - messages = [ - { - "role": "system", - "content": ( - "You are a helpful voice assistant. Keep responses brief and natural. " - "If you see speaker tags like text, acknowledge " - "that you understand multiple speakers are present." 
- ), - } - ] - - context = LLMContext(messages) - llm = OpenAILLMService(api_key=TestConfig.OPENAI_API_KEY, model="gpt-4") - - # Create aggregators with VAD - user_aggregator, assistant_aggregator = LLMContextAggregatorPair( - context, - user_params=LLMUserAggregatorParams( - vad_analyzer=SileroVADAnalyzer(), - ), - ) - - # Create transcription logger - transcription_logger = TranscriptionLogger() - - # Create pipeline - pipeline = Pipeline( - [ - transport.input(), - stt, - transcription_logger, - user_aggregator, - llm, - tts, - transport.output(), - assistant_aggregator, - ] - ) - - # Create task - task = PipelineTask(pipeline) - - return task, transport - - -# ============================================================================ -# Test Functions -# ============================================================================ - - -async def test_basic_config(): - """Test 1: Basic default configuration.""" - logger.info("=" * 80) - logger.info("TEST 1: Basic Default Configuration") - logger.info("=" * 80) - - connection_params = AssemblyAIConnectionParams(speech_model="u3-rt-pro") - - task, transport = await create_basic_voice_agent(connection_params) - - logger.info("✅ Service created successfully with default params") - logger.info("Expected: min=max=100ms, u3-rt-pro model") - logger.info("Speak into your microphone to test transcription") - - # Trigger initial bot greeting - await task.queue_frames([LLMRunFrame()]) - - runner = PipelineRunner() - await runner.run(task) - - -async def test_custom_min_silence(): - """Test 2: Custom min_turn_silence.""" - logger.info("=" * 80) - logger.info("TEST 2: Custom min_turn_silence") - logger.info("=" * 80) - - connection_params = AssemblyAIConnectionParams(speech_model="u3-rt-pro", min_turn_silence=200) - - task, transport = await create_basic_voice_agent(connection_params) - - logger.info("✅ Service created with min=200ms") - logger.info("Expected: Both min and max set to 200ms") - logger.info("Speak short 
phrases and observe turn detection timing") - - runner = PipelineRunner() - await runner.run(task) - - -async def test_max_silence_warning(): - """Test 3: Setting max_turn_silence should trigger warning.""" - logger.info("=" * 80) - logger.info("TEST 3: max_turn_silence Warning") - logger.info("=" * 80) - - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - min_turn_silence=100, - max_turn_silence=500, # Should trigger warning - ) - - task, transport = await create_basic_voice_agent(connection_params) - - logger.info("⚠️ Check logs above for warning about max_turn_silence being overridden") - logger.info("Expected: Warning logged, max set to 100ms (same as min)") - - runner = PipelineRunner() - await runner.run(task) - - -async def test_custom_prompt_warning(): - """Test 5: Custom prompt should trigger warning.""" - logger.info("=" * 80) - logger.info("TEST 5: Custom Prompt Warning") - logger.info("=" * 80) - - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - prompt="Transcribe verbatim. 
Always include punctuation.", - ) - - task, transport = await create_basic_voice_agent(connection_params) - - logger.info("⚠️ Check logs above for warning about testing without prompt first") - logger.info("Expected: Warning logged, service continues with custom prompt") - - runner = PipelineRunner() - await runner.run(task) - - -async def test_prompt_keyterms_conflict(): - """Test 6: Prompt + keyterms_prompt should raise error.""" - logger.info("=" * 80) - logger.info("TEST 6: Prompt + Keyterms Conflict (Error)") - logger.info("=" * 80) - - try: - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - prompt="Custom prompt", - keyterms_prompt=["test", "words"], - ) - - task, transport = await create_basic_voice_agent(connection_params) - logger.error("❌ TEST FAILED: Should have raised ValueError") - except ValueError as e: - logger.info(f"✅ TEST PASSED: ValueError raised as expected") - logger.info(f" Error message: {e}") - - -async def test_keyterms_basic(): - """Test 7: Basic keyterms at initialization.""" - logger.info("=" * 80) - logger.info("TEST 7: Basic Keyterms Prompting") - logger.info("=" * 80) - - connection_params = AssemblyAIConnectionParams( - speech_model="u3-rt-pro", - keyterms_prompt=["Pipecat", "AssemblyAI", "Universal-3", "streaming"], - ) - - task, transport = await create_basic_voice_agent(connection_params) - - logger.info("✅ Service created with keyterms: Pipecat, AssemblyAI, Universal-3, streaming") - logger.info("Expected: Boosted recognition for these terms") - logger.info("Try saying: 'I'm testing Pipecat with AssemblyAI Universal-3 for streaming'") - - runner = PipelineRunner() - await runner.run(task) - - -async def test_diarization_no_format(): - """Test 10: Diarization enabled without formatting.""" - logger.info("=" * 80) - logger.info("TEST 10: Diarization Enabled (No Formatting)") - logger.info("=" * 80) - - connection_params = AssemblyAIConnectionParams(speech_model="u3-rt-pro", speaker_labels=True) - - 
task, transport = await create_basic_voice_agent(connection_params) - - logger.info("✅ Service created with speaker_labels=True") - logger.info("Expected: Speaker IDs in user_id field, plain text in transcript") - logger.info("Have multiple people speak to see different speaker labels") - - runner = PipelineRunner() - await runner.run(task) - - -async def test_diarization_xml_format(): - """Test 11: Diarization with XML formatting.""" - logger.info("=" * 80) - logger.info("TEST 11: Diarization with XML Formatting") - logger.info("=" * 80) - - connection_params = AssemblyAIConnectionParams(speech_model="u3-rt-pro", speaker_labels=True) - - task, transport = await create_basic_voice_agent( - connection_params, speaker_format="<{speaker}>{text}" - ) - - logger.info("✅ Service created with XML speaker formatting") - logger.info("Expected: Text like 'Hello'") - logger.info("Have multiple people speak to see formatted speaker tags") - - runner = PipelineRunner() - await runner.run(task) - - -async def test_dynamic_keyterms(): - """Test 13: Dynamic keyterms updates.""" - logger.info("=" * 80) - logger.info("TEST 13: Dynamic Keyterms Updates") - logger.info("=" * 80) - - connection_params = AssemblyAIConnectionParams(speech_model="u3-rt-pro") - - task, transport = await create_basic_voice_agent(connection_params) - - async def update_keyterms_stages(): - """Simulate multi-stage conversation with keyterms updates.""" - await asyncio.sleep(5) # Wait for connection - - # Stage 1: Greeting - logger.info("🔄 STAGE 1: Greeting (general terms)") - update1 = STTUpdateSettingsFrame( - settings={"keyterms_prompt": ["hello", "hi", "good morning", "welcome"]} - ) - await task.queue_frames([update1]) - - await asyncio.sleep(10) - - # Stage 2: Name collection - logger.info("🔄 STAGE 2: Name Collection") - update2 = STTUpdateSettingsFrame( - settings={ - "keyterms_prompt": [ - "first name", - "last name", - "John", - "Jane", - "Smith", - "Johnson", - ] - } - ) - await 
task.queue_frames([update2]) - - await asyncio.sleep(10) - - # Stage 3: Medical info - logger.info("🔄 STAGE 3: Medical Information") - update3 = STTUpdateSettingsFrame( - settings={ - "keyterms_prompt": [ - "cardiology", - "echocardiogram", - "blood pressure", - "Dr. Smith", - "metoprolol", - ] - } - ) - await task.queue_frames([update3]) - - await asyncio.sleep(10) - - # Stage 4: Clear keyterms - logger.info("🔄 STAGE 4: Clear Keyterms") - update4 = STTUpdateSettingsFrame(settings={"keyterms_prompt": []}) - await task.queue_frames([update4]) - - # Start update task - asyncio.create_task(update_keyterms_stages()) - - logger.info("✅ Service created, will update keyterms every 10 seconds") - logger.info("Expected: Different keyterms at each stage") - logger.info("Watch logs for 'STAGE X' messages and test relevant terms") - - runner = PipelineRunner() - await runner.run(task) - - -async def test_dynamic_silence_params(): - """Test 15: Dynamic silence parameter updates.""" - logger.info("=" * 80) - logger.info("TEST 15: Dynamic Silence Parameters") - logger.info("=" * 80) - - connection_params = AssemblyAIConnectionParams(speech_model="u3-rt-pro") - - task, transport = await create_basic_voice_agent(connection_params) - - async def update_silence_params(): - """Update silence parameters for different scenarios.""" - await asyncio.sleep(5) - - # Normal conversation - logger.info("🔄 PHASE 1: Normal conversation (default timing)") - await asyncio.sleep(10) - - # Reading credit card - logger.info("🔄 PHASE 2: Reading numbers (longer silence tolerance)") - update1 = STTUpdateSettingsFrame( - settings={ - "max_turn_silence": 5000, - "min_turn_silence": 300, - } - ) - await task.queue_frames([update1]) - - await asyncio.sleep(15) - - # Back to normal - logger.info("🔄 PHASE 3: Back to normal conversation") - update2 = STTUpdateSettingsFrame( - settings={ - "max_turn_silence": 1200, - "min_turn_silence": 100, - } - ) - await task.queue_frames([update2]) - - 
asyncio.create_task(update_silence_params()) - - logger.info("✅ Service will update silence parameters during conversation") - logger.info("Expected: Longer pauses tolerated in Phase 2") - logger.info("Try pausing between words to test") - - runner = PipelineRunner() - await runner.run(task) - - -async def test_multi_param_update(): - """Test 17: Update multiple parameters at once.""" - logger.info("=" * 80) - logger.info("TEST 17: Multiple Parameter Update") - logger.info("=" * 80) - - connection_params = AssemblyAIConnectionParams(speech_model="u3-rt-pro") - - task, transport = await create_basic_voice_agent(connection_params) - - async def multi_update(): - await asyncio.sleep(5) - - logger.info("🔄 Updating multiple parameters together") - update = STTUpdateSettingsFrame( - settings={ - "keyterms_prompt": ["account", "routing", "number"], - "max_turn_silence": 3000, - "min_turn_silence": 200, - } - ) - await task.queue_frames([update]) - - logger.info("✅ Check logs for single UpdateConfiguration message") - - asyncio.create_task(multi_update()) - - logger.info("Expected: All params updated in single WebSocket message") - - runner = PipelineRunner() - await runner.run(task) - - -# ============================================================================ -# Main Test Runner -# ============================================================================ - - -def main(): - """Main test runner.""" - parser = argparse.ArgumentParser(description="Test AssemblyAI u3-rt-pro integration") - parser.add_argument( - "--test", - type=str, - default="basic", - help="Test to run (basic, custom_min, max_warning, prompt_warning, " - "prompt_keyterms_conflict, keyterms, diarization, diarization_xml, " - "dynamic_keyterms, dynamic_silence, multi_param, all)", - ) - parser.add_argument("--interactive", action="store_true", help="Run in interactive mode") - - args = parser.parse_args() - - # Validate environment - if not TestConfig.validate(): - logger.error("Please set all 
required environment variables in .env") - sys.exit(1) - - # Test mapping - tests = { - "basic": test_basic_config, - "custom_min": test_custom_min_silence, - "max_warning": test_max_silence_warning, - "prompt_warning": test_custom_prompt_warning, - "prompt_keyterms_conflict": test_prompt_keyterms_conflict, - "keyterms": test_keyterms_basic, - "diarization": test_diarization_no_format, - "diarization_xml": test_diarization_xml_format, - "dynamic_keyterms": test_dynamic_keyterms, - "dynamic_silence": test_dynamic_silence_params, - "multi_param": test_multi_param_update, - } - - if args.interactive: - logger.info("Interactive mode - select test to run:") - for i, (name, _) in enumerate(tests.items(), 1): - logger.info(f"{i}. {name}") - logger.info(f"{len(tests) + 1}. Run all tests") - - choice = input("\nEnter test number: ") - try: - choice_num = int(choice) - if choice_num == len(tests) + 1: - args.test = "all" - else: - args.test = list(tests.keys())[choice_num - 1] - except (ValueError, IndexError): - logger.error("Invalid choice") - sys.exit(1) - - # Run test(s) - if args.test == "all": - logger.info("Running all tests sequentially...") - for test_name, test_func in tests.items(): - try: - asyncio.run(test_func()) - except KeyboardInterrupt: - logger.info(f"Test '{test_name}' interrupted") - break - except Exception as e: - logger.error(f"Test '{test_name}' failed: {e}") - else: - if args.test not in tests: - logger.error(f"Unknown test: {args.test}") - logger.info(f"Available tests: {', '.join(tests.keys())}") - sys.exit(1) - - try: - asyncio.run(tests[args.test]()) - except KeyboardInterrupt: - logger.info("Test interrupted") - except Exception as e: - logger.error(f"Test failed: {e}") - raise - - -if __name__ == "__main__": - main()