Update NVIDIA STT services for Nemotron Speech defaults and config parity (#4269)
* Update NVIDIA STT services for Nemotron Speech defaults and config parity * Add changelog entry for PR #4269 * initialize boosted LM settings defaults in streaming STT * Align NVIDIA STT language handling with other STT services * add finalised flag to Nvidia stt final transcripts, remove processing latency logs * Changing interim transcription logging to tracing. --------- Co-authored-by: sathwika <geereddysath@nvidia.com> Co-authored-by: filipi87 <filipi87@gmail.com>
This commit is contained in:
committed by
GitHub
parent
4d14251f4a
commit
21f6c2afa5
2
changelog/4269.changed.md
Normal file
2
changelog/4269.changed.md
Normal file
@@ -0,0 +1,2 @@
|
||||
- Updated NVIDIA STT services to align with Nemotron Speech defaults and configuration: `api_key` is now optional for local deployments, additional recognition settings are available (including alternatives, word offsets, and diarization), and streaming/segmented docs now reflect Nemotron Speech APIs.
|
||||
- NVIDIA streaming STT now sets `TranscriptionFrame.finalized=True` when the provider marks a result as final, and preserves `language` on both `TranscriptionFrame` and `InterimTranscriptionFrame`.
|
||||
@@ -2,9 +2,15 @@
|
||||
# Copyright (c) 2024-2026, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
|
||||
#
|
||||
|
||||
"""NVIDIA Riva Speech-to-Text service implementations for real-time and batch transcription."""
|
||||
"""NVIDIA Nemotron Speech-to-Text service implementations for real-time and batch transcription.
|
||||
|
||||
Refer to the NVIDIA ASR NIM documentation for usage, customization,
|
||||
and local deployment steps:
|
||||
https://docs.nvidia.com/nim/speech/latest/asr/
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from collections.abc import AsyncGenerator, Mapping
|
||||
@@ -32,25 +38,28 @@ from pipecat.utils.time import time_now_iso8601
|
||||
from pipecat.utils.tracing.service_decorators import traced_stt
|
||||
|
||||
try:
|
||||
import grpc
|
||||
import riva.client
|
||||
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error("In order to use NVIDIA Riva STT, you need to `pip install pipecat-ai[nvidia]`.")
|
||||
logger.error(
|
||||
"In order to use NVIDIA Nemotron Speech STT, you need to `pip install pipecat-ai[nvidia]`."
|
||||
)
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
def language_to_nvidia_riva_language(language: Language) -> str | None:
|
||||
"""Maps Language enum to NVIDIA Riva ASR language codes.
|
||||
def language_to_nvidia_nemotron_speech_language(language: Language) -> str | None:
|
||||
"""Maps Language enum to NVIDIA Nemotron Speech ASR language codes.
|
||||
|
||||
Source:
|
||||
https://docs.nvidia.com/deeplearning/riva/user-guide/docs/asr/asr-riva-build-table.html?highlight=fr%20fr
|
||||
https://docs.nvidia.com/nim/speech/latest/reference/support-matrix/asr.html#supported-languages-by-model-type
|
||||
|
||||
Args:
|
||||
language: Language enum value.
|
||||
|
||||
Returns:
|
||||
Optional[str]: NVIDIA Riva language code or None if not supported.
|
||||
str | None: NVIDIA Nemotron Speech language code or None if not supported.
|
||||
"""
|
||||
LANGUAGE_MAP = {
|
||||
# Arabic
|
||||
@@ -93,15 +102,8 @@ def language_to_nvidia_riva_language(language: Language) -> str | None:
|
||||
|
||||
|
||||
@dataclass
|
||||
class NvidiaSTTSettings(STTSettings):
|
||||
"""Settings for NvidiaSTTService."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class NvidiaSegmentedSTTSettings(STTSettings):
|
||||
"""Settings for NvidiaSegmentedSTTService.
|
||||
class _NvidiaBaseSTTSettings(STTSettings):
|
||||
"""Shared settings for NVIDIA Nemotron Speech STT services.
|
||||
|
||||
Parameters:
|
||||
profanity_filter: Whether to filter profanity from results.
|
||||
@@ -109,6 +111,10 @@ class NvidiaSegmentedSTTSettings(STTSettings):
|
||||
verbatim_transcripts: Whether to return verbatim transcripts.
|
||||
boosted_lm_words: List of words to boost in language model.
|
||||
boosted_lm_score: Score boost for specified words.
|
||||
max_alternatives: Maximum number of recognition alternatives.
|
||||
word_time_offsets: Whether to include word-level time offsets.
|
||||
speaker_diarization: Whether to enable speaker diarization.
|
||||
diarization_max_speakers: Maximum number of speakers for diarization.
|
||||
"""
|
||||
|
||||
profanity_filter: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
@@ -116,12 +122,34 @@ class NvidiaSegmentedSTTSettings(STTSettings):
|
||||
verbatim_transcripts: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
boosted_lm_words: list[str] | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
boosted_lm_score: float | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
max_alternatives: int | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
word_time_offsets: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
speaker_diarization: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
diarization_max_speakers: int | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
|
||||
|
||||
@dataclass
|
||||
class NvidiaSTTSettings(_NvidiaBaseSTTSettings):
|
||||
"""Settings for NvidiaSTTService.
|
||||
|
||||
Parameters:
|
||||
interim_results: Whether to return interim (partial) results.
|
||||
"""
|
||||
|
||||
interim_results: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
||||
|
||||
|
||||
@dataclass
|
||||
class NvidiaSegmentedSTTSettings(_NvidiaBaseSTTSettings):
|
||||
"""Settings for NvidiaSegmentedSTTService."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class NvidiaSTTService(STTService):
|
||||
"""Real-time speech-to-text service using NVIDIA Riva streaming ASR.
|
||||
"""Real-time speech-to-text service using NVIDIA Nemotron Speech streaming ASR.
|
||||
|
||||
Provides real-time transcription capabilities using NVIDIA's Riva ASR models
|
||||
Provides real-time transcription capabilities using NVIDIA's Nemotron Speech ASR models
|
||||
through streaming recognition. Supports interim results and continuous audio
|
||||
processing for low-latency applications.
|
||||
"""
|
||||
@@ -130,7 +158,7 @@ class NvidiaSTTService(STTService):
|
||||
_settings: Settings
|
||||
|
||||
class InputParams(BaseModel):
|
||||
"""Configuration parameters for NVIDIA Riva STT service.
|
||||
"""Configuration parameters for NVIDIA Nemotron Speech STT service.
|
||||
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=NvidiaSTTService.Settings(...)`` instead.
|
||||
@@ -144,32 +172,52 @@ class NvidiaSTTService(STTService):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
api_key: str | None = None,
|
||||
server: str = "grpc.nvcf.nvidia.com:443",
|
||||
model_function_map: Mapping[str, str] = {
|
||||
"function_id": "1598d209-5e27-4d3c-8079-4751568b1081",
|
||||
"model_name": "parakeet-ctc-1.1b-asr",
|
||||
"function_id": "bb0837de-8c7b-481f-9ec8-ef5663e9c1fa",
|
||||
"model_name": "nemotron-asr-streaming",
|
||||
},
|
||||
sample_rate: int | None = None,
|
||||
params: InputParams | None = None,
|
||||
use_ssl: bool = True,
|
||||
audio_channel_count: int = 1,
|
||||
start_history: int = -1,
|
||||
start_threshold: float = -1.0,
|
||||
stop_history: int = 320,
|
||||
stop_threshold: float = -1.0,
|
||||
stop_history_eou: int = -1,
|
||||
stop_threshold_eou: float = -1.0,
|
||||
custom_configuration: str = "",
|
||||
settings: Settings | None = None,
|
||||
ttfs_p99_latency: float | None = NVIDIA_TTFS_P99,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the NVIDIA Riva STT service.
|
||||
"""Initialize the NVIDIA Nemotron Speech STT service.
|
||||
|
||||
Args:
|
||||
api_key: NVIDIA API key for authentication.
|
||||
server: NVIDIA Riva server address. Defaults to NVIDIA Cloud Function endpoint.
|
||||
api_key: NVIDIA API key for authentication. Required when using the
|
||||
cloud endpoint. Not needed for local deployments.
|
||||
server: NVIDIA Nemotron Speech server address. Defaults to NVIDIA Cloud Function endpoint.
|
||||
For local deployments, pass the local address (e.g. ``localhost:50051``).
|
||||
model_function_map: Mapping containing 'function_id' and 'model_name' for the ASR model.
|
||||
sample_rate: Audio sample rate in Hz. If None, uses pipeline default.
|
||||
params: Additional configuration parameters for NVIDIA Riva.
|
||||
params: Additional configuration parameters for NVIDIA Nemotron Speech.
|
||||
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=NvidiaSTTService.Settings(...)`` instead.
|
||||
|
||||
use_ssl: Whether to use SSL for the NVIDIA Riva server. Defaults to True.
|
||||
use_ssl: Whether to use SSL for the gRPC connection. Defaults to True
|
||||
for the NVIDIA cloud endpoint. Set to False for local deployments.
|
||||
audio_channel_count: Number of audio channels.
|
||||
start_history: VAD start history in frames. Use -1 for Nemotron Speech default.
|
||||
start_threshold: VAD start threshold. Use -1.0 for Nemotron Speech default.
|
||||
stop_history: VAD stop history in frames. Use -1 for Nemotron Speech default.
|
||||
stop_threshold: VAD stop threshold. Use -1.0 for Nemotron Speech default.
|
||||
stop_history_eou: End-of-utterance stop history in frames. Use -1 for Nemotron Speech default.
|
||||
stop_threshold_eou: End-of-utterance stop threshold. Use -1.0 for Nemotron Speech default.
|
||||
custom_configuration: Custom Nemotron Speech configuration string
|
||||
(e.g. ``"enable_vad_endpointing:true,neural_vad.onset:0.65"``).
|
||||
settings: Runtime-updatable settings. When provided alongside deprecated
|
||||
parameters, ``settings`` values take precedence.
|
||||
ttfs_p99_latency: P99 latency from speech end to final transcript in seconds.
|
||||
@@ -180,6 +228,16 @@ class NvidiaSTTService(STTService):
|
||||
default_settings = self.Settings(
|
||||
model=model_function_map.get("model_name"),
|
||||
language=Language.EN_US,
|
||||
profanity_filter=False,
|
||||
automatic_punctuation=True,
|
||||
verbatim_transcripts=True,
|
||||
boosted_lm_words=None,
|
||||
boosted_lm_score=4.0,
|
||||
max_alternatives=1,
|
||||
interim_results=True,
|
||||
word_time_offsets=False,
|
||||
speaker_diarization=False,
|
||||
diarization_max_speakers=0,
|
||||
)
|
||||
|
||||
# 2. (no deprecated direct args for this service)
|
||||
@@ -204,13 +262,14 @@ class NvidiaSTTService(STTService):
|
||||
self._server = server
|
||||
self._api_key = api_key
|
||||
self._use_ssl = use_ssl
|
||||
self._start_history = -1
|
||||
self._start_threshold = -1.0
|
||||
self._stop_history = -1
|
||||
self._stop_threshold = -1.0
|
||||
self._stop_history_eou = -1
|
||||
self._stop_threshold_eou = -1.0
|
||||
self._custom_configuration = ""
|
||||
self._audio_channel_count = audio_channel_count
|
||||
self._start_history = start_history
|
||||
self._start_threshold = start_threshold
|
||||
self._stop_history = stop_history
|
||||
self._stop_threshold = stop_threshold
|
||||
self._stop_history_eou = stop_history_eou
|
||||
self._stop_threshold_eou = stop_threshold_eou
|
||||
self._custom_configuration = custom_configuration
|
||||
self._function_id = model_function_map.get("function_id")
|
||||
|
||||
self._asr_service = None
|
||||
@@ -219,31 +278,38 @@ class NvidiaSTTService(STTService):
|
||||
self._thread_task = None
|
||||
|
||||
def _initialize_client(self):
|
||||
metadata = [
|
||||
["function-id", self._function_id],
|
||||
["authorization", f"Bearer {self._api_key}"],
|
||||
]
|
||||
"""Initialize the NVIDIA Nemotron Speech ASR client with authentication metadata."""
|
||||
metadata = []
|
||||
if self._function_id:
|
||||
metadata.append(["function-id", self._function_id])
|
||||
if self._api_key:
|
||||
metadata.append(["authorization", f"Bearer {self._api_key}"])
|
||||
auth = riva.client.Auth(None, self._use_ssl, self._server, metadata)
|
||||
|
||||
self._asr_service = riva.client.ASRService(auth)
|
||||
|
||||
def _create_recognition_config(self):
|
||||
"""Create the NVIDIA Riva ASR recognition configuration."""
|
||||
"""Create the NVIDIA Nemotron Speech ASR recognition configuration."""
|
||||
s = self._settings
|
||||
config = riva.client.StreamingRecognitionConfig(
|
||||
config=riva.client.RecognitionConfig(
|
||||
encoding=riva.client.AudioEncoding.LINEAR_PCM,
|
||||
language_code=self._settings.language,
|
||||
language_code=s.language,
|
||||
model="",
|
||||
max_alternatives=1,
|
||||
profanity_filter=False,
|
||||
enable_automatic_punctuation=True,
|
||||
verbatim_transcripts=True,
|
||||
max_alternatives=s.max_alternatives,
|
||||
profanity_filter=s.profanity_filter,
|
||||
enable_automatic_punctuation=s.automatic_punctuation,
|
||||
verbatim_transcripts=s.verbatim_transcripts,
|
||||
sample_rate_hertz=self.sample_rate,
|
||||
audio_channel_count=1,
|
||||
audio_channel_count=self._audio_channel_count,
|
||||
enable_word_time_offsets=s.word_time_offsets,
|
||||
),
|
||||
interim_results=True,
|
||||
interim_results=s.interim_results,
|
||||
)
|
||||
|
||||
if s.boosted_lm_words:
|
||||
riva.client.add_word_boosting_to_config(config, s.boosted_lm_words, s.boosted_lm_score)
|
||||
|
||||
riva.client.add_endpoint_parameters_to_config(
|
||||
config,
|
||||
self._start_history,
|
||||
@@ -253,7 +319,14 @@ class NvidiaSTTService(STTService):
|
||||
self._stop_threshold,
|
||||
self._stop_threshold_eou,
|
||||
)
|
||||
riva.client.add_custom_configuration_to_config(config, self._custom_configuration)
|
||||
|
||||
if self._custom_configuration:
|
||||
riva.client.add_custom_configuration_to_config(config, self._custom_configuration)
|
||||
|
||||
if s.speaker_diarization:
|
||||
riva.client.add_speaker_diarization_to_config(
|
||||
config, s.speaker_diarization, s.diarization_max_speakers
|
||||
)
|
||||
|
||||
return config
|
||||
|
||||
@@ -261,15 +334,31 @@ class NvidiaSTTService(STTService):
|
||||
"""Check if this service can generate processing metrics.
|
||||
|
||||
Returns:
|
||||
False - this service does not support metrics generation.
|
||||
True - this service supports metrics generation.
|
||||
"""
|
||||
return False
|
||||
return True
|
||||
|
||||
async def _update_settings(self, delta: STTSettings) -> dict[str, Any]:
|
||||
"""Apply a settings delta and sync internal state.
|
||||
|
||||
Args:
|
||||
delta: A :class:`STTSettings` (or ``NvidiaSTTService.Settings``) delta.
|
||||
|
||||
Returns:
|
||||
Dict mapping changed field names to their previous values.
|
||||
"""
|
||||
changed = await super()._update_settings(delta)
|
||||
|
||||
if changed and self._config is not None:
|
||||
self._config = self._create_recognition_config()
|
||||
|
||||
return changed
|
||||
|
||||
async def set_model(self, model: str):
|
||||
"""Set the ASR model for transcription.
|
||||
|
||||
.. deprecated:: 0.0.104
|
||||
Model cannot be changed after initialization for NVIDIA Riva streaming STT.
|
||||
Model cannot be changed after initialization for NVIDIA Nemotron Speech streaming STT.
|
||||
Set model and function id in the constructor instead.
|
||||
|
||||
Example::
|
||||
@@ -288,7 +377,7 @@ class NvidiaSTTService(STTService):
|
||||
warnings.simplefilter("always")
|
||||
warnings.warn(
|
||||
"'set_model' is deprecated. Model cannot be changed after initialization"
|
||||
" for NVIDIA Riva streaming STT. Set model and function id in the"
|
||||
" for NVIDIA Nemotron Speech streaming STT. Set model and function id in the"
|
||||
" constructor instead, e.g.:"
|
||||
" NvidiaSTTService(api_key=..., model_function_map="
|
||||
"{'function_id': '<UUID>', 'model_name': '<model_name>'})",
|
||||
@@ -297,7 +386,7 @@ class NvidiaSTTService(STTService):
|
||||
)
|
||||
|
||||
async def start(self, frame: StartFrame):
|
||||
"""Start the NVIDIA Riva STT service and initialize streaming configuration.
|
||||
"""Start the NVIDIA Nemotron Speech STT service and initialize streaming configuration.
|
||||
|
||||
Args:
|
||||
frame: StartFrame indicating pipeline start.
|
||||
@@ -314,7 +403,7 @@ class NvidiaSTTService(STTService):
|
||||
logger.debug(f"Initialized NvidiaSTTService with model: {self._settings.model}")
|
||||
|
||||
async def stop(self, frame: EndFrame):
|
||||
"""Stop the NVIDIA Riva STT service and clean up resources.
|
||||
"""Stop the NVIDIA Nemotron Speech STT service and clean up resources.
|
||||
|
||||
Args:
|
||||
frame: EndFrame indicating pipeline stop.
|
||||
@@ -323,7 +412,7 @@ class NvidiaSTTService(STTService):
|
||||
await self._stop_tasks()
|
||||
|
||||
async def cancel(self, frame: CancelFrame):
|
||||
"""Cancel the NVIDIA Riva STT service operation.
|
||||
"""Cancel the NVIDIA Nemotron Speech STT service operation.
|
||||
|
||||
Args:
|
||||
frame: CancelFrame indicating operation cancellation.
|
||||
@@ -337,14 +426,25 @@ class NvidiaSTTService(STTService):
|
||||
self._thread_task = None
|
||||
|
||||
def _response_handler(self):
|
||||
responses = self._asr_service.streaming_response_generator(
|
||||
audio_chunks=self,
|
||||
streaming_config=self._config,
|
||||
)
|
||||
for response in responses:
|
||||
if not response.results:
|
||||
continue
|
||||
asyncio.run_coroutine_threadsafe(self._handle_response(response), self.get_event_loop())
|
||||
try:
|
||||
responses = self._asr_service.streaming_response_generator(
|
||||
audio_chunks=self,
|
||||
streaming_config=self._config,
|
||||
)
|
||||
for response in responses:
|
||||
if not response.results:
|
||||
continue
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
self._handle_response(response), self.get_event_loop()
|
||||
)
|
||||
except grpc.RpcError as e:
|
||||
status = e.code().name if hasattr(e, "code") else "UNKNOWN"
|
||||
details = e.details() if hasattr(e, "details") else str(e)
|
||||
logger.error(f"{self} gRPC streaming error ({status}): {details}")
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
self.push_error(f"{self} STT streaming failed (gRPC {status}): {details}"),
|
||||
self.get_event_loop(),
|
||||
)
|
||||
|
||||
async def _thread_task_handler(self):
|
||||
try:
|
||||
@@ -370,6 +470,7 @@ class NvidiaSTTService(STTService):
|
||||
if transcript and len(transcript) > 0:
|
||||
if result.is_final:
|
||||
await self.stop_processing_metrics()
|
||||
logger.debug(f"Transcription: [{transcript}]")
|
||||
await self.push_frame(
|
||||
TranscriptionFrame(
|
||||
transcript,
|
||||
@@ -377,6 +478,7 @@ class NvidiaSTTService(STTService):
|
||||
time_now_iso8601(),
|
||||
self._settings.language,
|
||||
result=result,
|
||||
finalized=True,
|
||||
)
|
||||
)
|
||||
await self._handle_transcription(
|
||||
@@ -394,6 +496,7 @@ class NvidiaSTTService(STTService):
|
||||
result=result,
|
||||
)
|
||||
)
|
||||
logger.trace(f"Interim Transcription: [{transcript}]")
|
||||
|
||||
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
|
||||
"""Process audio data for speech-to-text transcription.
|
||||
@@ -409,7 +512,7 @@ class NvidiaSTTService(STTService):
|
||||
yield None
|
||||
|
||||
def __next__(self) -> bytes:
|
||||
"""Get the next audio chunk for NVIDIA Riva processing.
|
||||
"""Get the next audio chunk for NVIDIA Nemotron Speech processing.
|
||||
|
||||
Returns:
|
||||
Audio bytes from the queue.
|
||||
@@ -422,7 +525,8 @@ class NvidiaSTTService(STTService):
|
||||
|
||||
try:
|
||||
future = asyncio.run_coroutine_threadsafe(self._queue.get(), self.get_event_loop())
|
||||
return future.result()
|
||||
audio = future.result()
|
||||
return audio
|
||||
except FuturesCancelledError:
|
||||
raise StopIteration
|
||||
|
||||
@@ -436,9 +540,9 @@ class NvidiaSTTService(STTService):
|
||||
|
||||
|
||||
class NvidiaSegmentedSTTService(SegmentedSTTService):
|
||||
"""Speech-to-text service using NVIDIA Riva's offline/batch models.
|
||||
"""Speech-to-text service using NVIDIA Nemotron Speech's offline/batch models.
|
||||
|
||||
By default, his service uses NVIDIA's Riva Canary ASR API to perform speech-to-text
|
||||
By default, this service uses NVIDIA's Nemotron Speech Canary ASR API to perform speech-to-text
|
||||
transcription on audio segments. It inherits from SegmentedSTTService to handle
|
||||
audio buffering and speech detection.
|
||||
"""
|
||||
@@ -447,7 +551,7 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
|
||||
_settings: Settings
|
||||
|
||||
class InputParams(BaseModel):
|
||||
"""Configuration parameters for NVIDIA Riva segmented STT service.
|
||||
"""Configuration parameters for NVIDIA Nemotron Speech segmented STT service.
|
||||
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=NvidiaSegmentedSTTService.Settings(...)`` instead.
|
||||
@@ -471,7 +575,7 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
api_key: str,
|
||||
api_key: str | None = None,
|
||||
server: str = "grpc.nvcf.nvidia.com:443",
|
||||
model_function_map: Mapping[str, str] = {
|
||||
"function_id": "ee8dc628-76de-4acc-8595-1836e7e857bd",
|
||||
@@ -480,28 +584,34 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
|
||||
sample_rate: int | None = None,
|
||||
params: InputParams | None = None,
|
||||
use_ssl: bool = True,
|
||||
custom_configuration: str = "",
|
||||
settings: Settings | None = None,
|
||||
ttfs_p99_latency: float | None = NVIDIA_TTFS_P99,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize the NVIDIA Riva segmented STT service.
|
||||
"""Initialize the NVIDIA Nemotron Speech segmented STT service.
|
||||
|
||||
Args:
|
||||
api_key: NVIDIA API key for authentication
|
||||
server: NVIDIA Riva server address (defaults to NVIDIA Cloud Function endpoint)
|
||||
model_function_map: Mapping of model name and its corresponding NVIDIA Cloud Function ID
|
||||
sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate
|
||||
params: Additional configuration parameters for NVIDIA Riva
|
||||
api_key: NVIDIA API key for authentication. Required when using the
|
||||
cloud endpoint. Not needed for local deployments.
|
||||
server: NVIDIA Nemotron Speech server address. Defaults to NVIDIA Cloud Function endpoint.
|
||||
For local deployments, pass the local address (e.g. ``localhost:50051``).
|
||||
model_function_map: Mapping of model name and its corresponding NVIDIA Cloud Function ID.
|
||||
sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
|
||||
params: Additional configuration parameters for NVIDIA Nemotron Speech.
|
||||
|
||||
.. deprecated:: 0.0.105
|
||||
Use ``settings=NvidiaSegmentedSTTService.Settings(...)`` instead.
|
||||
|
||||
use_ssl: Whether to use SSL for the NVIDIA Riva server. Defaults to True.
|
||||
use_ssl: Whether to use SSL for the gRPC connection. Defaults to True
|
||||
for the NVIDIA cloud endpoint. Set to False for local deployments.
|
||||
custom_configuration: Custom Nemotron Speech configuration string
|
||||
(e.g. ``"enable_vad_endpointing:true,neural_vad.onset:0.65"``).
|
||||
settings: Runtime-updatable settings. When provided alongside deprecated
|
||||
parameters, ``settings`` values take precedence.
|
||||
ttfs_p99_latency: P99 latency from speech end to final transcript in seconds.
|
||||
Override for your deployment. See https://github.com/pipecat-ai/stt-benchmark
|
||||
**kwargs: Additional arguments passed to SegmentedSTTService
|
||||
**kwargs: Additional arguments passed to SegmentedSTTService.
|
||||
"""
|
||||
# 1. Initialize default_settings with hardcoded defaults
|
||||
default_settings = self.Settings(
|
||||
@@ -512,6 +622,8 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
|
||||
verbatim_transcripts=False,
|
||||
boosted_lm_words=None,
|
||||
boosted_lm_score=4.0,
|
||||
max_alternatives=1,
|
||||
word_time_offsets=False,
|
||||
)
|
||||
|
||||
# 2. (no deprecated direct args for this service)
|
||||
@@ -538,81 +650,64 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
# Initialize NVIDIA Riva settings
|
||||
# Initialize NVIDIA Nemotron Speech settings
|
||||
self._api_key = api_key
|
||||
self._server = server
|
||||
self._use_ssl = use_ssl
|
||||
self._function_id = model_function_map.get("function_id")
|
||||
|
||||
# Voice activity detection thresholds (use NVIDIA Riva defaults)
|
||||
self._start_history = -1
|
||||
self._start_threshold = -1.0
|
||||
self._stop_history = -1
|
||||
self._stop_threshold = -1.0
|
||||
self._stop_history_eou = -1
|
||||
self._stop_threshold_eou = -1.0
|
||||
self._custom_configuration = ""
|
||||
self._custom_configuration = custom_configuration
|
||||
|
||||
self._config = None
|
||||
self._asr_service = None
|
||||
|
||||
def language_to_service_language(self, language: Language) -> str | None:
|
||||
"""Convert pipecat Language enum to NVIDIA Riva's language code.
|
||||
"""Convert pipecat Language enum to NVIDIA Nemotron Speech's language code.
|
||||
|
||||
Args:
|
||||
language: Language enum value.
|
||||
|
||||
Returns:
|
||||
NVIDIA Riva language code or None if not supported.
|
||||
NVIDIA Nemotron Speech language code or None if not supported.
|
||||
"""
|
||||
return language_to_nvidia_riva_language(language)
|
||||
return language_to_nvidia_nemotron_speech_language(language)
|
||||
|
||||
def _initialize_client(self):
|
||||
"""Initialize the NVIDIA Riva ASR client with authentication metadata."""
|
||||
"""Initialize the NVIDIA Nemotron Speech ASR client with authentication metadata."""
|
||||
if self._asr_service is not None:
|
||||
return
|
||||
|
||||
# Set up authentication metadata for NVIDIA Cloud Functions
|
||||
metadata = [
|
||||
["function-id", self._function_id],
|
||||
["authorization", f"Bearer {self._api_key}"],
|
||||
]
|
||||
metadata = []
|
||||
if self._function_id:
|
||||
metadata.append(["function-id", self._function_id])
|
||||
if self._api_key:
|
||||
metadata.append(["authorization", f"Bearer {self._api_key}"])
|
||||
|
||||
# Create authenticated client
|
||||
auth = riva.client.Auth(None, self._use_ssl, self._server, metadata)
|
||||
self._asr_service = riva.client.ASRService(auth)
|
||||
|
||||
def _get_language_code(self) -> str:
|
||||
"""Get the current NVIDIA Riva language code string."""
|
||||
"""Get the current NVIDIA Nemotron Speech language code string."""
|
||||
return self._settings.language or "en-US"
|
||||
|
||||
def _create_recognition_config(self):
|
||||
"""Create the NVIDIA Riva ASR recognition configuration."""
|
||||
"""Create the NVIDIA Nemotron Speech ASR recognition configuration."""
|
||||
# Create base configuration
|
||||
s = self._settings
|
||||
config = riva.client.RecognitionConfig(
|
||||
language_code=self._get_language_code(),
|
||||
max_alternatives=1,
|
||||
max_alternatives=s.max_alternatives,
|
||||
profanity_filter=s.profanity_filter,
|
||||
enable_automatic_punctuation=s.automatic_punctuation,
|
||||
verbatim_transcripts=s.verbatim_transcripts,
|
||||
enable_word_time_offsets=s.word_time_offsets,
|
||||
)
|
||||
|
||||
# Add word boosting if specified
|
||||
if s.boosted_lm_words:
|
||||
riva.client.add_word_boosting_to_config(config, s.boosted_lm_words, s.boosted_lm_score)
|
||||
|
||||
# Add voice activity detection parameters
|
||||
riva.client.add_endpoint_parameters_to_config(
|
||||
config,
|
||||
self._start_history,
|
||||
self._start_threshold,
|
||||
self._stop_history,
|
||||
self._stop_history_eou,
|
||||
self._stop_threshold,
|
||||
self._stop_threshold_eou,
|
||||
)
|
||||
|
||||
# Add any custom configuration
|
||||
if self._custom_configuration:
|
||||
riva.client.add_custom_configuration_to_config(config, self._custom_configuration)
|
||||
@@ -676,7 +771,7 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
|
||||
|
||||
await self.start_processing_metrics()
|
||||
|
||||
# Process audio with NVIDIA Riva ASR - explicitly request non-future response
|
||||
# Process audio with NVIDIA Nemotron Speech ASR - explicitly request non-future response
|
||||
raw_response = self._asr_service.offline_recognize(audio, self._config, future=False)
|
||||
|
||||
await self.stop_processing_metrics()
|
||||
@@ -712,10 +807,14 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
|
||||
await self._handle_transcription(text, True, self._settings.language)
|
||||
|
||||
if not transcription_found:
|
||||
logger.debug(f"{self}: No transcription results found in NVIDIA Riva response")
|
||||
logger.debug(
|
||||
f"{self}: No transcription results found in NVIDIA Nemotron Speech response"
|
||||
)
|
||||
except AttributeError as ae:
|
||||
logger.error(f"{self}: Unexpected response structure from NVIDIA Riva: {ae}")
|
||||
yield ErrorFrame(f"{self}: Unexpected NVIDIA Riva response format: {str(ae)}")
|
||||
logger.error(f"{self}: Unexpected response structure from NVIDIA Nemotron Speech: {ae}")
|
||||
yield ErrorFrame(
|
||||
error=f"{self}: Unexpected NVIDIA Nemotron Speech response format: {str(ae)}"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"{self} exception: {e}")
|
||||
yield ErrorFrame(error=f"{self} error: {e}")
|
||||
|
||||
Reference in New Issue
Block a user