Compare commits

...

1 Commit

Author SHA1 Message Date
James Hush
3ec870835f feat(cartesia): Add word-level timestamp support to STT service
- Add include_timestamps parameter to CartesiaLiveOptions (defaults to True)
- Store word timestamp data in TranscriptionFrame.result field
- Add comprehensive documentation for word timestamp structure
- Enhance logging to differentiate transcripts with/without timestamps
- Update example to demonstrate accessing word-level timing data
- Align implementation with ElevenLabs STT approach
2025-11-25 13:58:35 +01:00
2 changed files with 61 additions and 2 deletions

View File

@@ -16,7 +16,7 @@ from pipecat.pipeline.task import PipelineTask
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia.stt import CartesiaSTTService
from pipecat.services.cartesia.stt import CartesiaLiveOptions, CartesiaSTTService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
@@ -30,6 +30,15 @@ class TranscriptionLogger(FrameProcessor):
if isinstance(frame, TranscriptionFrame):
print(f"Transcription: {frame.text}")
# Access word-level timestamps if available
if frame.result and "words" in frame.result:
words = frame.result["words"]
print(f" Word-level timestamps ({len(words)} words):")
for word_data in words:
word = word_data.get("word", "")
start = word_data.get("start", 0)
end = word_data.get("end", 0)
print(f" '{word}' [{start:.3f}s - {end:.3f}s]")
# Push all frames through
await self.push_frame(frame, direction)
@@ -48,7 +57,16 @@ transport_params = {
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
stt = CartesiaSTTService(api_key=os.getenv("CARTESIA_API_KEY"))
# Configure Cartesia STT with word-level timestamps enabled (default is True)
live_options = CartesiaLiveOptions(
model="ink-whisper",
include_timestamps=True, # Enable word-level timestamps
)
stt = CartesiaSTTService(
api_key=os.getenv("CARTESIA_API_KEY"),
live_options=live_options,
)
tl = TranscriptionLogger()

View File

@@ -57,6 +57,7 @@ class CartesiaLiveOptions:
language: str = Language.EN.value,
encoding: str = "pcm_s16le",
sample_rate: int = 16000,
include_timestamps: bool = True,
**kwargs,
):
"""Initialize CartesiaLiveOptions with default or provided parameters.
@@ -66,12 +67,15 @@ class CartesiaLiveOptions:
language: Target language for transcription. Defaults to English.
encoding: Audio encoding format. Defaults to "pcm_s16le".
sample_rate: Audio sample rate in Hz. Defaults to 16000.
include_timestamps: Whether to include word-level timestamps in transcripts.
Defaults to True.
**kwargs: Additional parameters for the transcription service.
"""
self.model = model
self.language = language
self.encoding = encoding
self.sample_rate = sample_rate
self.include_timestamps = include_timestamps
self.additional_params = kwargs
def to_dict(self):
@@ -85,6 +89,7 @@ class CartesiaLiveOptions:
"language": self.language if isinstance(self.language, str) else self.language.value,
"encoding": self.encoding,
"sample_rate": str(self.sample_rate),
"include_timestamps": "true" if self.include_timestamps else "false",
}
return params
@@ -176,6 +181,7 @@ class CartesiaSTTService(WebsocketSTTService):
self.set_model_name(merged_options.model)
self._api_key = api_key
self._base_url = base_url or "api.cartesia.ai"
self._include_timestamps = merged_options.include_timestamps
self._receive_task = None
def can_generate_metrics(self) -> bool:
@@ -330,6 +336,20 @@ class CartesiaSTTService(WebsocketSTTService):
pass
async def _on_transcript(self, data):
"""Handle transcript message from Cartesia API.
When include_timestamps is enabled, the result data includes:
- text: The transcribed text
- is_final: Whether this is a final or interim transcript
- language: Detected language (if available)
- words: Array of word objects with timing information (if timestamps enabled):
- word: The word text
- start: Start time in seconds
- end: End time in seconds
Args:
data: Transcript data from Cartesia API.
"""
if "text" not in data:
return
@@ -343,26 +363,47 @@ class CartesiaSTTService(WebsocketSTTService):
except (ValueError, KeyError):
pass
# Extract word-level timestamps if available and enabled
result = None
has_timestamps = False
if self._include_timestamps and "words" in data:
# Store the entire response data including word-level timestamps
result = data
has_timestamps = True
if len(transcript) > 0:
await self.stop_ttfb_metrics()
if is_final:
# Log with timestamp indicator
if has_timestamps:
logger.debug(f"Committed transcript with word timestamps: [{transcript}]")
else:
logger.debug(f"Committed transcript: [{transcript}]")
await self.push_frame(
TranscriptionFrame(
transcript,
self._user_id,
time_now_iso8601(),
language,
result,
)
)
await self._handle_transcription(transcript, is_final, language)
await self.stop_processing_metrics()
else:
# For interim transcriptions, just push the frame without tracing
if has_timestamps:
logger.trace(f"Partial transcript with word timestamps: [{transcript}]")
else:
logger.trace(f"Partial transcript: [{transcript}]")
await self.push_frame(
InterimTranscriptionFrame(
transcript,
self._user_id,
time_now_iso8601(),
language,
result,
)
)