From 21f6c2afa59fb90a9dfd2ae06eeb815c14a78837 Mon Sep 17 00:00:00 2001 From: Sathwika Reddy Geereddy Date: Thu, 23 Apr 2026 18:31:27 +0530 Subject: [PATCH] Update NVIDIA STT services for Nemotron Speech defaults and config parity (#4269) * Update NVIDIA STT services for Nemotron Speech defaults and config parity * Add changelog entry for PR #4269 * initialize boosted LM settings defaults in streaming STT * Align NVIDIA STT language handling with other STT services * add finalised flag to Nvidia stt final transcripts, remove processing latency logs * Changing interim transcription logging to tracing. --------- Co-authored-by: sathwika Co-authored-by: filipi87 --- changelog/4269.changed.md | 2 + src/pipecat/services/nvidia/stt.py | 321 +++++++++++++++++++---------- 2 files changed, 212 insertions(+), 111 deletions(-) create mode 100644 changelog/4269.changed.md diff --git a/changelog/4269.changed.md b/changelog/4269.changed.md new file mode 100644 index 000000000..e88c45b2d --- /dev/null +++ b/changelog/4269.changed.md @@ -0,0 +1,2 @@ +- Updated NVIDIA STT services to align with Nemotron Speech defaults and configuration: `api_key` is now optional for local deployments, additional recognition settings are available (including alternatives, word offsets, and diarization), and streaming/segmented docs now reflect Nemotron Speech APIs. +- NVIDIA streaming STT now sets `TranscriptionFrame.finalized=True` when the provider marks a result as final, and preserves `language` on both `TranscriptionFrame` and `InterimTranscriptionFrame`. diff --git a/src/pipecat/services/nvidia/stt.py b/src/pipecat/services/nvidia/stt.py index 3181b53ba..d8b87aab7 100644 --- a/src/pipecat/services/nvidia/stt.py +++ b/src/pipecat/services/nvidia/stt.py @@ -2,9 +2,15 @@ # Copyright (c) 2024-2026, Daily # # SPDX-License-Identifier: BSD 2-Clause License +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. # -"""NVIDIA Riva Speech-to-Text service implementations for real-time and batch transcription.""" +"""NVIDIA Nemotron Speech-to-Text service implementations for real-time and batch transcription. + +Refer to the NVIDIA ASR NIM documentation for usage, customization, +and local deployment steps: +https://docs.nvidia.com/nim/speech/latest/asr/ +""" import asyncio from collections.abc import AsyncGenerator, Mapping @@ -32,25 +38,28 @@ from pipecat.utils.time import time_now_iso8601 from pipecat.utils.tracing.service_decorators import traced_stt try: + import grpc import riva.client except ModuleNotFoundError as e: logger.error(f"Exception: {e}") - logger.error("In order to use NVIDIA Riva STT, you need to `pip install pipecat-ai[nvidia]`.") + logger.error( + "In order to use NVIDIA Nemotron Speech STT, you need to `pip install pipecat-ai[nvidia]`." + ) raise Exception(f"Missing module: {e}") -def language_to_nvidia_riva_language(language: Language) -> str | None: - """Maps Language enum to NVIDIA Riva ASR language codes. +def language_to_nvidia_nemotron_speech_language(language: Language) -> str | None: + """Maps Language enum to NVIDIA Nemotron Speech ASR language codes. Source: - https://docs.nvidia.com/deeplearning/riva/user-guide/docs/asr/asr-riva-build-table.html?highlight=fr%20fr + https://docs.nvidia.com/nim/speech/latest/reference/support-matrix/asr.html#supported-languages-by-model-type Args: language: Language enum value. Returns: - Optional[str]: NVIDIA Riva language code or None if not supported. + str | None: NVIDIA Nemotron Speech language code or None if not supported. """ LANGUAGE_MAP = { # Arabic @@ -93,15 +102,8 @@ def language_to_nvidia_riva_language(language: Language) -> str | None: @dataclass -class NvidiaSTTSettings(STTSettings): - """Settings for NvidiaSTTService.""" - - pass - - -@dataclass -class NvidiaSegmentedSTTSettings(STTSettings): - """Settings for NvidiaSegmentedSTTService. +class _NvidiaBaseSTTSettings(STTSettings): + """Shared settings for NVIDIA Nemotron Speech STT services. Parameters: profanity_filter: Whether to filter profanity from results. @@ -109,6 +111,10 @@ class NvidiaSegmentedSTTSettings(STTSettings): verbatim_transcripts: Whether to return verbatim transcripts. boosted_lm_words: List of words to boost in language model. boosted_lm_score: Score boost for specified words. + max_alternatives: Maximum number of recognition alternatives. + word_time_offsets: Whether to include word-level time offsets. + speaker_diarization: Whether to enable speaker diarization. + diarization_max_speakers: Maximum number of speakers for diarization. """ profanity_filter: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN) @@ -116,12 +122,34 @@ class NvidiaSegmentedSTTSettings(STTSettings): verbatim_transcripts: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN) boosted_lm_words: list[str] | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN) boosted_lm_score: float | _NotGiven = field(default_factory=lambda: NOT_GIVEN) + max_alternatives: int | _NotGiven = field(default_factory=lambda: NOT_GIVEN) + word_time_offsets: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN) + speaker_diarization: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN) + diarization_max_speakers: int | _NotGiven = field(default_factory=lambda: NOT_GIVEN) + + +@dataclass +class NvidiaSTTSettings(_NvidiaBaseSTTSettings): + """Settings for NvidiaSTTService. + + Parameters: + interim_results: Whether to return interim (partial) results. + """ + + interim_results: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN) + + +@dataclass +class NvidiaSegmentedSTTSettings(_NvidiaBaseSTTSettings): + """Settings for NvidiaSegmentedSTTService.""" + + pass class NvidiaSTTService(STTService): - """Real-time speech-to-text service using NVIDIA Riva streaming ASR. + """Real-time speech-to-text service using NVIDIA Nemotron Speech streaming ASR. - Provides real-time transcription capabilities using NVIDIA's Riva ASR models + Provides real-time transcription capabilities using NVIDIA's Nemotron Speech ASR models through streaming recognition. Supports interim results and continuous audio processing for low-latency applications. """ @@ -130,7 +158,7 @@ class NvidiaSTTService(STTService): _settings: Settings class InputParams(BaseModel): - """Configuration parameters for NVIDIA Riva STT service. + """Configuration parameters for NVIDIA Nemotron Speech STT service. .. deprecated:: 0.0.105 Use ``settings=NvidiaSTTService.Settings(...)`` instead. @@ -144,32 +172,52 @@ class NvidiaSTTService(STTService): def __init__( self, *, - api_key: str, + api_key: str | None = None, server: str = "grpc.nvcf.nvidia.com:443", model_function_map: Mapping[str, str] = { - "function_id": "1598d209-5e27-4d3c-8079-4751568b1081", - "model_name": "parakeet-ctc-1.1b-asr", + "function_id": "bb0837de-8c7b-481f-9ec8-ef5663e9c1fa", + "model_name": "nemotron-asr-streaming", }, sample_rate: int | None = None, params: InputParams | None = None, use_ssl: bool = True, + audio_channel_count: int = 1, + start_history: int = -1, + start_threshold: float = -1.0, + stop_history: int = 320, + stop_threshold: float = -1.0, + stop_history_eou: int = -1, + stop_threshold_eou: float = -1.0, + custom_configuration: str = "", settings: Settings | None = None, ttfs_p99_latency: float | None = NVIDIA_TTFS_P99, **kwargs, ): - """Initialize the NVIDIA Riva STT service. + """Initialize the NVIDIA Nemotron Speech STT service. Args: - api_key: NVIDIA API key for authentication. - server: NVIDIA Riva server address. Defaults to NVIDIA Cloud Function endpoint. + api_key: NVIDIA API key for authentication. Required when using the + cloud endpoint. Not needed for local deployments. + server: NVIDIA Nemotron Speech server address. Defaults to NVIDIA Cloud Function endpoint. + For local deployments, pass the local address (e.g. ``localhost:50051``). model_function_map: Mapping containing 'function_id' and 'model_name' for the ASR model. sample_rate: Audio sample rate in Hz. If None, uses pipeline default. - params: Additional configuration parameters for NVIDIA Riva. + params: Additional configuration parameters for NVIDIA Nemotron Speech. .. deprecated:: 0.0.105 Use ``settings=NvidiaSTTService.Settings(...)`` instead. - use_ssl: Whether to use SSL for the NVIDIA Riva server. Defaults to True. + use_ssl: Whether to use SSL for the gRPC connection. Defaults to True + for the NVIDIA cloud endpoint. Set to False for local deployments. + audio_channel_count: Number of audio channels. + start_history: VAD start history in frames. Use -1 for Nemotron Speech default. + start_threshold: VAD start threshold. Use -1.0 for Nemotron Speech default. + stop_history: VAD stop history in frames. Use -1 for Nemotron Speech default. + stop_threshold: VAD stop threshold. Use -1.0 for Nemotron Speech default. + stop_history_eou: End-of-utterance stop history in frames. Use -1 for Nemotron Speech default. + stop_threshold_eou: End-of-utterance stop threshold. Use -1.0 for Nemotron Speech default. + custom_configuration: Custom Nemotron Speech configuration string + (e.g. ``"enable_vad_endpointing:true,neural_vad.onset:0.65"``). settings: Runtime-updatable settings. When provided alongside deprecated parameters, ``settings`` values take precedence. ttfs_p99_latency: P99 latency from speech end to final transcript in seconds. @@ -180,6 +228,16 @@ class NvidiaSTTService(STTService): default_settings = self.Settings( model=model_function_map.get("model_name"), language=Language.EN_US, + profanity_filter=False, + automatic_punctuation=True, + verbatim_transcripts=True, + boosted_lm_words=None, + boosted_lm_score=4.0, + max_alternatives=1, + interim_results=True, + word_time_offsets=False, + speaker_diarization=False, + diarization_max_speakers=0, ) # 2. (no deprecated direct args for this service) @@ -204,13 +262,14 @@ class NvidiaSTTService(STTService): self._server = server self._api_key = api_key self._use_ssl = use_ssl - self._start_history = -1 - self._start_threshold = -1.0 - self._stop_history = -1 - self._stop_threshold = -1.0 - self._stop_history_eou = -1 - self._stop_threshold_eou = -1.0 - self._custom_configuration = "" + self._audio_channel_count = audio_channel_count + self._start_history = start_history + self._start_threshold = start_threshold + self._stop_history = stop_history + self._stop_threshold = stop_threshold + self._stop_history_eou = stop_history_eou + self._stop_threshold_eou = stop_threshold_eou + self._custom_configuration = custom_configuration self._function_id = model_function_map.get("function_id") self._asr_service = None @@ -219,31 +278,38 @@ class NvidiaSTTService(STTService): self._thread_task = None def _initialize_client(self): - metadata = [ - ["function-id", self._function_id], - ["authorization", f"Bearer {self._api_key}"], - ] + """Initialize the NVIDIA Nemotron Speech ASR client with authentication metadata.""" + metadata = [] + if self._function_id: + metadata.append(["function-id", self._function_id]) + if self._api_key: + metadata.append(["authorization", f"Bearer {self._api_key}"]) auth = riva.client.Auth(None, self._use_ssl, self._server, metadata) self._asr_service = riva.client.ASRService(auth) def _create_recognition_config(self): - """Create the NVIDIA Riva ASR recognition configuration.""" + """Create the NVIDIA Nemotron Speech ASR recognition configuration.""" + s = self._settings config = riva.client.StreamingRecognitionConfig( config=riva.client.RecognitionConfig( encoding=riva.client.AudioEncoding.LINEAR_PCM, - language_code=self._settings.language, + language_code=s.language, model="", - max_alternatives=1, - profanity_filter=False, - enable_automatic_punctuation=True, - verbatim_transcripts=True, + max_alternatives=s.max_alternatives, + profanity_filter=s.profanity_filter, + enable_automatic_punctuation=s.automatic_punctuation, + verbatim_transcripts=s.verbatim_transcripts, sample_rate_hertz=self.sample_rate, - audio_channel_count=1, + audio_channel_count=self._audio_channel_count, + enable_word_time_offsets=s.word_time_offsets, ), - interim_results=True, + interim_results=s.interim_results, ) + if s.boosted_lm_words: + riva.client.add_word_boosting_to_config(config, s.boosted_lm_words, s.boosted_lm_score) + riva.client.add_endpoint_parameters_to_config( config, self._start_history, @@ -253,7 +319,14 @@ class NvidiaSTTService(STTService): self._stop_threshold, self._stop_threshold_eou, ) - riva.client.add_custom_configuration_to_config(config, self._custom_configuration) + + if self._custom_configuration: + riva.client.add_custom_configuration_to_config(config, self._custom_configuration) + + if s.speaker_diarization: + riva.client.add_speaker_diarization_to_config( + config, s.speaker_diarization, s.diarization_max_speakers + ) return config @@ -261,15 +334,31 @@ class NvidiaSTTService(STTService): """Check if this service can generate processing metrics. Returns: - False - this service does not support metrics generation. + True - this service supports metrics generation. """ - return False + return True + + async def _update_settings(self, delta: STTSettings) -> dict[str, Any]: + """Apply a settings delta and sync internal state. + + Args: + delta: A :class:`STTSettings` (or ``NvidiaSTTService.Settings``) delta. + + Returns: + Dict mapping changed field names to their previous values. + """ + changed = await super()._update_settings(delta) + + if changed and self._config is not None: + self._config = self._create_recognition_config() + + return changed async def set_model(self, model: str): """Set the ASR model for transcription. .. deprecated:: 0.0.104 - Model cannot be changed after initialization for NVIDIA Riva streaming STT. + Model cannot be changed after initialization for NVIDIA Nemotron Speech streaming STT. Set model and function id in the constructor instead. Example:: @@ -288,7 +377,7 @@ class NvidiaSTTService(STTService): warnings.simplefilter("always") warnings.warn( "'set_model' is deprecated. Model cannot be changed after initialization" - " for NVIDIA Riva streaming STT. Set model and function id in the" + " for NVIDIA Nemotron Speech streaming STT. Set model and function id in the" " constructor instead, e.g.:" " NvidiaSTTService(api_key=..., model_function_map=" "{'function_id': '', 'model_name': ''})", @@ -297,7 +386,7 @@ class NvidiaSTTService(STTService): ) async def start(self, frame: StartFrame): - """Start the NVIDIA Riva STT service and initialize streaming configuration. + """Start the NVIDIA Nemotron Speech STT service and initialize streaming configuration. Args: frame: StartFrame indicating pipeline start. @@ -314,7 +403,7 @@ class NvidiaSTTService(STTService): logger.debug(f"Initialized NvidiaSTTService with model: {self._settings.model}") async def stop(self, frame: EndFrame): - """Stop the NVIDIA Riva STT service and clean up resources. + """Stop the NVIDIA Nemotron Speech STT service and clean up resources. Args: frame: EndFrame indicating pipeline stop. @@ -323,7 +412,7 @@ class NvidiaSTTService(STTService): await self._stop_tasks() async def cancel(self, frame: CancelFrame): - """Cancel the NVIDIA Riva STT service operation. + """Cancel the NVIDIA Nemotron Speech STT service operation. Args: frame: CancelFrame indicating operation cancellation. @@ -337,14 +426,25 @@ class NvidiaSTTService(STTService): self._thread_task = None def _response_handler(self): - responses = self._asr_service.streaming_response_generator( - audio_chunks=self, - streaming_config=self._config, - ) - for response in responses: - if not response.results: - continue - asyncio.run_coroutine_threadsafe(self._handle_response(response), self.get_event_loop()) + try: + responses = self._asr_service.streaming_response_generator( + audio_chunks=self, + streaming_config=self._config, + ) + for response in responses: + if not response.results: + continue + asyncio.run_coroutine_threadsafe( + self._handle_response(response), self.get_event_loop() + ) + except grpc.RpcError as e: + status = e.code().name if hasattr(e, "code") else "UNKNOWN" + details = e.details() if hasattr(e, "details") else str(e) + logger.error(f"{self} gRPC streaming error ({status}): {details}") + asyncio.run_coroutine_threadsafe( + self.push_error(f"{self} STT streaming failed (gRPC {status}): {details}"), + self.get_event_loop(), + ) async def _thread_task_handler(self): try: @@ -370,6 +470,7 @@ class NvidiaSTTService(STTService): if transcript and len(transcript) > 0: if result.is_final: await self.stop_processing_metrics() + logger.debug(f"Transcription: [{transcript}]") await self.push_frame( TranscriptionFrame( transcript, @@ -377,6 +478,7 @@ class NvidiaSTTService(STTService): time_now_iso8601(), self._settings.language, result=result, + finalized=True, ) ) await self._handle_transcription( @@ -394,6 +496,7 @@ class NvidiaSTTService(STTService): result=result, ) ) + logger.trace(f"Interim Transcription: [{transcript}]") async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]: """Process audio data for speech-to-text transcription. @@ -409,7 +512,7 @@ class NvidiaSTTService(STTService): yield None def __next__(self) -> bytes: - """Get the next audio chunk for NVIDIA Riva processing. + """Get the next audio chunk for NVIDIA Nemotron Speech processing. Returns: Audio bytes from the queue. @@ -422,7 +525,8 @@ class NvidiaSTTService(STTService): try: future = asyncio.run_coroutine_threadsafe(self._queue.get(), self.get_event_loop()) - return future.result() + audio = future.result() + return audio except FuturesCancelledError: raise StopIteration @@ -436,9 +540,9 @@ class NvidiaSTTService(STTService): class NvidiaSegmentedSTTService(SegmentedSTTService): - """Speech-to-text service using NVIDIA Riva's offline/batch models. + """Speech-to-text service using NVIDIA Nemotron Speech's offline/batch models. - By default, his service uses NVIDIA's Riva Canary ASR API to perform speech-to-text + By default, this service uses NVIDIA's Nemotron Speech Canary ASR API to perform speech-to-text transcription on audio segments. It inherits from SegmentedSTTService to handle audio buffering and speech detection. """ @@ -447,7 +551,7 @@ class NvidiaSegmentedSTTService(SegmentedSTTService): _settings: Settings class InputParams(BaseModel): - """Configuration parameters for NVIDIA Riva segmented STT service. + """Configuration parameters for NVIDIA Nemotron Speech segmented STT service. .. deprecated:: 0.0.105 Use ``settings=NvidiaSegmentedSTTService.Settings(...)`` instead. @@ -471,7 +575,7 @@ class NvidiaSegmentedSTTService(SegmentedSTTService): def __init__( self, *, - api_key: str, + api_key: str | None = None, server: str = "grpc.nvcf.nvidia.com:443", model_function_map: Mapping[str, str] = { "function_id": "ee8dc628-76de-4acc-8595-1836e7e857bd", @@ -480,28 +584,34 @@ class NvidiaSegmentedSTTService(SegmentedSTTService): sample_rate: int | None = None, params: InputParams | None = None, use_ssl: bool = True, + custom_configuration: str = "", settings: Settings | None = None, ttfs_p99_latency: float | None = NVIDIA_TTFS_P99, **kwargs, ): - """Initialize the NVIDIA Riva segmented STT service. + """Initialize the NVIDIA Nemotron Speech segmented STT service. Args: - api_key: NVIDIA API key for authentication - server: NVIDIA Riva server address (defaults to NVIDIA Cloud Function endpoint) - model_function_map: Mapping of model name and its corresponding NVIDIA Cloud Function ID - sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate - params: Additional configuration parameters for NVIDIA Riva + api_key: NVIDIA API key for authentication. Required when using the + cloud endpoint. Not needed for local deployments. + server: NVIDIA Nemotron Speech server address. Defaults to NVIDIA Cloud Function endpoint. + For local deployments, pass the local address (e.g. ``localhost:50051``). + model_function_map: Mapping of model name and its corresponding NVIDIA Cloud Function ID. + sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate. + params: Additional configuration parameters for NVIDIA Nemotron Speech. .. deprecated:: 0.0.105 Use ``settings=NvidiaSegmentedSTTService.Settings(...)`` instead. - use_ssl: Whether to use SSL for the NVIDIA Riva server. Defaults to True. + use_ssl: Whether to use SSL for the gRPC connection. Defaults to True + for the NVIDIA cloud endpoint. Set to False for local deployments. + custom_configuration: Custom Nemotron Speech configuration string + (e.g. ``"enable_vad_endpointing:true,neural_vad.onset:0.65"``). settings: Runtime-updatable settings. When provided alongside deprecated parameters, ``settings`` values take precedence. ttfs_p99_latency: P99 latency from speech end to final transcript in seconds. Override for your deployment. See https://github.com/pipecat-ai/stt-benchmark - **kwargs: Additional arguments passed to SegmentedSTTService + **kwargs: Additional arguments passed to SegmentedSTTService. """ # 1. Initialize default_settings with hardcoded defaults default_settings = self.Settings( @@ -512,6 +622,8 @@ class NvidiaSegmentedSTTService(SegmentedSTTService): verbatim_transcripts=False, boosted_lm_words=None, boosted_lm_score=4.0, + max_alternatives=1, + word_time_offsets=False, ) # 2. (no deprecated direct args for this service) @@ -538,81 +650,64 @@ class NvidiaSegmentedSTTService(SegmentedSTTService): **kwargs, ) - # Initialize NVIDIA Riva settings + # Initialize NVIDIA Nemotron Speech settings self._api_key = api_key self._server = server self._use_ssl = use_ssl self._function_id = model_function_map.get("function_id") - - # Voice activity detection thresholds (use NVIDIA Riva defaults) - self._start_history = -1 - self._start_threshold = -1.0 - self._stop_history = -1 - self._stop_threshold = -1.0 - self._stop_history_eou = -1 - self._stop_threshold_eou = -1.0 - self._custom_configuration = "" + self._custom_configuration = custom_configuration self._config = None self._asr_service = None def language_to_service_language(self, language: Language) -> str | None: - """Convert pipecat Language enum to NVIDIA Riva's language code. + """Convert pipecat Language enum to NVIDIA Nemotron Speech's language code. Args: language: Language enum value. Returns: - NVIDIA Riva language code or None if not supported. + NVIDIA Nemotron Speech language code or None if not supported. """ - return language_to_nvidia_riva_language(language) + return language_to_nvidia_nemotron_speech_language(language) def _initialize_client(self): - """Initialize the NVIDIA Riva ASR client with authentication metadata.""" + """Initialize the NVIDIA Nemotron Speech ASR client with authentication metadata.""" if self._asr_service is not None: return # Set up authentication metadata for NVIDIA Cloud Functions - metadata = [ - ["function-id", self._function_id], - ["authorization", f"Bearer {self._api_key}"], - ] + metadata = [] + if self._function_id: + metadata.append(["function-id", self._function_id]) + if self._api_key: + metadata.append(["authorization", f"Bearer {self._api_key}"]) # Create authenticated client auth = riva.client.Auth(None, self._use_ssl, self._server, metadata) self._asr_service = riva.client.ASRService(auth) def _get_language_code(self) -> str: - """Get the current NVIDIA Riva language code string.""" + """Get the current NVIDIA Nemotron Speech language code string.""" return self._settings.language or "en-US" def _create_recognition_config(self): - """Create the NVIDIA Riva ASR recognition configuration.""" + """Create the NVIDIA Nemotron Speech ASR recognition configuration.""" # Create base configuration s = self._settings config = riva.client.RecognitionConfig( language_code=self._get_language_code(), - max_alternatives=1, + max_alternatives=s.max_alternatives, profanity_filter=s.profanity_filter, enable_automatic_punctuation=s.automatic_punctuation, verbatim_transcripts=s.verbatim_transcripts, + enable_word_time_offsets=s.word_time_offsets, ) # Add word boosting if specified if s.boosted_lm_words: riva.client.add_word_boosting_to_config(config, s.boosted_lm_words, s.boosted_lm_score) - # Add voice activity detection parameters - riva.client.add_endpoint_parameters_to_config( - config, - self._start_history, - self._start_threshold, - self._stop_history, - self._stop_history_eou, - self._stop_threshold, - self._stop_threshold_eou, - ) - # Add any custom configuration if self._custom_configuration: riva.client.add_custom_configuration_to_config(config, self._custom_configuration) @@ -676,7 +771,7 @@ class NvidiaSegmentedSTTService(SegmentedSTTService): await self.start_processing_metrics() - # Process audio with NVIDIA Riva ASR - explicitly request non-future response + # Process audio with NVIDIA Nemotron Speech ASR - explicitly request non-future response raw_response = self._asr_service.offline_recognize(audio, self._config, future=False) await self.stop_processing_metrics() @@ -712,10 +807,14 @@ class NvidiaSegmentedSTTService(SegmentedSTTService): await self._handle_transcription(text, True, self._settings.language) if not transcription_found: - logger.debug(f"{self}: No transcription results found in NVIDIA Riva response") + logger.debug( + f"{self}: No transcription results found in NVIDIA Nemotron Speech response" + ) except AttributeError as ae: - logger.error(f"{self}: Unexpected response structure from NVIDIA Riva: {ae}") - yield ErrorFrame(f"{self}: Unexpected NVIDIA Riva response format: {str(ae)}") + logger.error(f"{self}: Unexpected response structure from NVIDIA Nemotron Speech: {ae}") + yield ErrorFrame( + error=f"{self}: Unexpected NVIDIA Nemotron Speech response format: {str(ae)}" + ) except Exception as e: logger.error(f"{self} exception: {e}") yield ErrorFrame(error=f"{self} error: {e}")