Update NVIDIA STT services for Nemotron Speech defaults and config parity (#4269)

* Update NVIDIA STT services for Nemotron Speech defaults and config parity
* Add changelog entry for PR #4269
* Initialize boosted LM settings defaults in streaming STT
* Align NVIDIA STT language handling with other STT services
* Add finalized flag to NVIDIA STT final transcripts, remove processing latency logs
* Change interim transcription logging to trace level

---------

Co-authored-by: sathwika <geereddysath@nvidia.com>
Co-authored-by: filipi87 <filipi87@gmail.com>
2026-04-23 18:31:27 +05:30
parent 4d14251f4a
commit 21f6c2afa5
2 changed files with 212 additions and 111 deletions
--- a/changelog/4269.changed.md
+++ b/changelog/4269.changed.md
@@ -0,0 +1,2 @@
+- Updated NVIDIA STT services to align with Nemotron Speech defaults and configuration: `api_key` is now optional for local deployments, additional recognition settings are available (including alternatives, word offsets, and diarization), and streaming/segmented docs now reflect Nemotron Speech APIs.
+- NVIDIA streaming STT now sets `TranscriptionFrame.finalized=True` when the provider marks a result as final, and preserves `language` on both `TranscriptionFrame` and `InterimTranscriptionFrame`.
--- a/src/pipecat/services/nvidia/stt.py
+++ b/src/pipecat/services/nvidia/stt.py
@@ -2,9 +2,15 @@
 # Copyright (c) 2024-2026, Daily
 #
 # SPDX-License-Identifier: BSD 2-Clause License
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
 #

-"""NVIDIA Riva Speech-to-Text service implementations for real-time and batch transcription."""
+"""NVIDIA Nemotron Speech-to-Text service implementations for real-time and batch transcription.
+
+Refer to the NVIDIA ASR NIM documentation for usage, customization,
+and local deployment steps:
+https://docs.nvidia.com/nim/speech/latest/asr/
+"""

 import asyncio
 from collections.abc import AsyncGenerator, Mapping
@@ -32,25 +38,28 @@ from pipecat.utils.time import time_now_iso8601
 from pipecat.utils.tracing.service_decorators import traced_stt

 try:
+    import grpc
    import riva.client

 except ModuleNotFoundError as e:
    logger.error(f"Exception: {e}")
-    logger.error("In order to use NVIDIA Riva STT, you need to `pip install pipecat-ai[nvidia]`.")
+    logger.error(
+        "In order to use NVIDIA Nemotron Speech STT, you need to `pip install pipecat-ai[nvidia]`."
+    )
    raise Exception(f"Missing module: {e}")


-def language_to_nvidia_riva_language(language: Language) -> str | None:
-    """Maps Language enum to NVIDIA Riva ASR language codes.
+def language_to_nvidia_nemotron_speech_language(language: Language) -> str | None:
+    """Maps Language enum to NVIDIA Nemotron Speech ASR language codes.

    Source:
-    https://docs.nvidia.com/deeplearning/riva/user-guide/docs/asr/asr-riva-build-table.html?highlight=fr%20fr
+    https://docs.nvidia.com/nim/speech/latest/reference/support-matrix/asr.html#supported-languages-by-model-type

    Args:
        language: Language enum value.

    Returns:
-        Optional[str]: NVIDIA Riva language code or None if not supported.
+        str | None: NVIDIA Nemotron Speech language code or None if not supported.
    """
    LANGUAGE_MAP = {
        # Arabic
@@ -93,15 +102,8 @@ def language_to_nvidia_riva_language(language: Language) -> str | None:


@dataclass
-class NvidiaSTTSettings(STTSettings):
-    """Settings for NvidiaSTTService."""
-
-    pass
-
-
-@dataclass
-class NvidiaSegmentedSTTSettings(STTSettings):
-    """Settings for NvidiaSegmentedSTTService.
+class _NvidiaBaseSTTSettings(STTSettings):
+    """Shared settings for NVIDIA Nemotron Speech STT services.

    Parameters:
        profanity_filter: Whether to filter profanity from results.
@@ -109,6 +111,10 @@ class NvidiaSegmentedSTTSettings(STTSettings):
        verbatim_transcripts: Whether to return verbatim transcripts.
        boosted_lm_words: List of words to boost in language model.
        boosted_lm_score: Score boost for specified words.
+        max_alternatives: Maximum number of recognition alternatives.
+        word_time_offsets: Whether to include word-level time offsets.
+        speaker_diarization: Whether to enable speaker diarization.
+        diarization_max_speakers: Maximum number of speakers for diarization.
    """

    profanity_filter: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
@@ -116,12 +122,34 @@ class NvidiaSegmentedSTTSettings(STTSettings):
    verbatim_transcripts: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    boosted_lm_words: list[str] | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    boosted_lm_score: float | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
+    max_alternatives: int | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
+    word_time_offsets: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
+    speaker_diarization: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
+    diarization_max_speakers: int | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
+
+
+@dataclass
+class NvidiaSTTSettings(_NvidiaBaseSTTSettings):
+    """Settings for NvidiaSTTService.
+
+    Parameters:
+        interim_results: Whether to return interim (partial) results.
+    """
+
+    interim_results: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
+
+
+@dataclass
+class NvidiaSegmentedSTTSettings(_NvidiaBaseSTTSettings):
+    """Settings for NvidiaSegmentedSTTService."""
+
+    pass


 class NvidiaSTTService(STTService):
-    """Real-time speech-to-text service using NVIDIA Riva streaming ASR.
+    """Real-time speech-to-text service using NVIDIA Nemotron Speech streaming ASR.

-    Provides real-time transcription capabilities using NVIDIA's Riva ASR models
+    Provides real-time transcription capabilities using NVIDIA's Nemotron Speech ASR models
    through streaming recognition. Supports interim results and continuous audio
    processing for low-latency applications.
    """
@@ -130,7 +158,7 @@ class NvidiaSTTService(STTService):
    _settings: Settings

    class InputParams(BaseModel):
-        """Configuration parameters for NVIDIA Riva STT service.
+        """Configuration parameters for NVIDIA Nemotron Speech STT service.

        .. deprecated:: 0.0.105
            Use ``settings=NvidiaSTTService.Settings(...)`` instead.
@@ -144,32 +172,52 @@ class NvidiaSTTService(STTService):
    def __init__(
        self,
        *,
-        api_key: str,
+        api_key: str | None = None,
        server: str = "grpc.nvcf.nvidia.com:443",
        model_function_map: Mapping[str, str] = {
-            "function_id": "1598d209-5e27-4d3c-8079-4751568b1081",
-            "model_name": "parakeet-ctc-1.1b-asr",
+            "function_id": "bb0837de-8c7b-481f-9ec8-ef5663e9c1fa",
+            "model_name": "nemotron-asr-streaming",
        },
        sample_rate: int | None = None,
        params: InputParams | None = None,
        use_ssl: bool = True,
+        audio_channel_count: int = 1,
+        start_history: int = -1,
+        start_threshold: float = -1.0,
+        stop_history: int = 320,
+        stop_threshold: float = -1.0,
+        stop_history_eou: int = -1,
+        stop_threshold_eou: float = -1.0,
+        custom_configuration: str = "",
        settings: Settings | None = None,
        ttfs_p99_latency: float | None = NVIDIA_TTFS_P99,
        **kwargs,
    ):
-        """Initialize the NVIDIA Riva STT service.
+        """Initialize the NVIDIA Nemotron Speech STT service.

        Args:
-            api_key: NVIDIA API key for authentication.
-            server: NVIDIA Riva server address. Defaults to NVIDIA Cloud Function endpoint.
+            api_key: NVIDIA API key for authentication. Required when using the
+                cloud endpoint. Not needed for local deployments.
+            server: NVIDIA Nemotron Speech server address. Defaults to NVIDIA Cloud Function endpoint.
+                For local deployments, pass the local address (e.g. ``localhost:50051``).
            model_function_map: Mapping containing 'function_id' and 'model_name' for the ASR model.
            sample_rate: Audio sample rate in Hz. If None, uses pipeline default.
-            params: Additional configuration parameters for NVIDIA Riva.
+            params: Additional configuration parameters for NVIDIA Nemotron Speech.

                .. deprecated:: 0.0.105
                    Use ``settings=NvidiaSTTService.Settings(...)`` instead.

-            use_ssl: Whether to use SSL for the NVIDIA Riva server. Defaults to True.
+            use_ssl: Whether to use SSL for the gRPC connection. Defaults to True
+                for the NVIDIA cloud endpoint. Set to False for local deployments.
+            audio_channel_count: Number of audio channels.
+            start_history: VAD start history in frames. Use -1 for Nemotron Speech default.
+            start_threshold: VAD start threshold. Use -1.0 for Nemotron Speech default.
+            stop_history: VAD stop history in frames. Defaults to 320; use -1 for Nemotron Speech default.
+            stop_threshold: VAD stop threshold. Use -1.0 for Nemotron Speech default.
+            stop_history_eou: End-of-utterance stop history in frames. Use -1 for Nemotron Speech default.
+            stop_threshold_eou: End-of-utterance stop threshold. Use -1.0 for Nemotron Speech default.
+            custom_configuration: Custom Nemotron Speech configuration string
+                (e.g. ``"enable_vad_endpointing:true,neural_vad.onset:0.65"``).
            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            ttfs_p99_latency: P99 latency from speech end to final transcript in seconds.
@@ -180,6 +228,16 @@ class NvidiaSTTService(STTService):
        default_settings = self.Settings(
            model=model_function_map.get("model_name"),
            language=Language.EN_US,
+            profanity_filter=False,
+            automatic_punctuation=True,
+            verbatim_transcripts=True,
+            boosted_lm_words=None,
+            boosted_lm_score=4.0,
+            max_alternatives=1,
+            interim_results=True,
+            word_time_offsets=False,
+            speaker_diarization=False,
+            diarization_max_speakers=0,
        )

        # 2. (no deprecated direct args for this service)
@@ -204,13 +262,14 @@ class NvidiaSTTService(STTService):
        self._server = server
        self._api_key = api_key
        self._use_ssl = use_ssl
-        self._start_history = -1
-        self._start_threshold = -1.0
-        self._stop_history = -1
-        self._stop_threshold = -1.0
-        self._stop_history_eou = -1
-        self._stop_threshold_eou = -1.0
-        self._custom_configuration = ""
+        self._audio_channel_count = audio_channel_count
+        self._start_history = start_history
+        self._start_threshold = start_threshold
+        self._stop_history = stop_history
+        self._stop_threshold = stop_threshold
+        self._stop_history_eou = stop_history_eou
+        self._stop_threshold_eou = stop_threshold_eou
+        self._custom_configuration = custom_configuration
        self._function_id = model_function_map.get("function_id")

        self._asr_service = None
@@ -219,31 +278,38 @@ class NvidiaSTTService(STTService):
        self._thread_task = None

    def _initialize_client(self):
-        metadata = [
-            ["function-id", self._function_id],
-            ["authorization", f"Bearer {self._api_key}"],
-        ]
+        """Initialize the NVIDIA Nemotron Speech ASR client with authentication metadata."""
+        metadata = []
+        if self._function_id:
+            metadata.append(["function-id", self._function_id])
+        if self._api_key:
+            metadata.append(["authorization", f"Bearer {self._api_key}"])
        auth = riva.client.Auth(None, self._use_ssl, self._server, metadata)

        self._asr_service = riva.client.ASRService(auth)

    def _create_recognition_config(self):
-        """Create the NVIDIA Riva ASR recognition configuration."""
+        """Create the NVIDIA Nemotron Speech ASR recognition configuration."""
+        s = self._settings
        config = riva.client.StreamingRecognitionConfig(
            config=riva.client.RecognitionConfig(
                encoding=riva.client.AudioEncoding.LINEAR_PCM,
-                language_code=self._settings.language,
+                language_code=s.language,
                model="",
-                max_alternatives=1,
-                profanity_filter=False,
-                enable_automatic_punctuation=True,
-                verbatim_transcripts=True,
+                max_alternatives=s.max_alternatives,
+                profanity_filter=s.profanity_filter,
+                enable_automatic_punctuation=s.automatic_punctuation,
+                verbatim_transcripts=s.verbatim_transcripts,
                sample_rate_hertz=self.sample_rate,
-                audio_channel_count=1,
+                audio_channel_count=self._audio_channel_count,
+                enable_word_time_offsets=s.word_time_offsets,
            ),
-            interim_results=True,
+            interim_results=s.interim_results,
        )

+        if s.boosted_lm_words:
+            riva.client.add_word_boosting_to_config(config, s.boosted_lm_words, s.boosted_lm_score)
+
        riva.client.add_endpoint_parameters_to_config(
            config,
            self._start_history,
@@ -253,7 +319,14 @@ class NvidiaSTTService(STTService):
            self._stop_threshold,
            self._stop_threshold_eou,
        )
-        riva.client.add_custom_configuration_to_config(config, self._custom_configuration)
+
+        if self._custom_configuration:
+            riva.client.add_custom_configuration_to_config(config, self._custom_configuration)
+
+        if s.speaker_diarization:
+            riva.client.add_speaker_diarization_to_config(
+                config, s.speaker_diarization, s.diarization_max_speakers
+            )

        return config

@@ -261,15 +334,31 @@ class NvidiaSTTService(STTService):
        """Check if this service can generate processing metrics.

        Returns:
-            False - this service does not support metrics generation.
+            True - this service supports metrics generation.
        """
-        return False
+        return True
+
+    async def _update_settings(self, delta: STTSettings) -> dict[str, Any]:
+        """Apply a settings delta and sync internal state.
+
+        Args:
+            delta: A :class:`STTSettings` (or ``NvidiaSTTService.Settings``) delta.
+
+        Returns:
+            Dict mapping changed field names to their previous values.
+        """
+        changed = await super()._update_settings(delta)
+
+        if changed and self._config is not None:
+            self._config = self._create_recognition_config()
+
+        return changed

    async def set_model(self, model: str):
        """Set the ASR model for transcription.

        .. deprecated:: 0.0.104
-            Model cannot be changed after initialization for NVIDIA Riva streaming STT.
+            Model cannot be changed after initialization for NVIDIA Nemotron Speech streaming STT.
            Set model and function id in the constructor instead.

            Example::
@@ -288,7 +377,7 @@ class NvidiaSTTService(STTService):
            warnings.simplefilter("always")
            warnings.warn(
                "'set_model' is deprecated. Model cannot be changed after initialization"
-                " for NVIDIA Riva streaming STT. Set model and function id in the"
+                " for NVIDIA Nemotron Speech streaming STT. Set model and function id in the"
                " constructor instead, e.g.:"
                " NvidiaSTTService(api_key=..., model_function_map="
                "{'function_id': '<UUID>', 'model_name': '<model_name>'})",
@@ -297,7 +386,7 @@ class NvidiaSTTService(STTService):
            )

    async def start(self, frame: StartFrame):
-        """Start the NVIDIA Riva STT service and initialize streaming configuration.
+        """Start the NVIDIA Nemotron Speech STT service and initialize streaming configuration.

        Args:
            frame: StartFrame indicating pipeline start.
@@ -314,7 +403,7 @@ class NvidiaSTTService(STTService):
        logger.debug(f"Initialized NvidiaSTTService with model: {self._settings.model}")

    async def stop(self, frame: EndFrame):
-        """Stop the NVIDIA Riva STT service and clean up resources.
+        """Stop the NVIDIA Nemotron Speech STT service and clean up resources.

        Args:
            frame: EndFrame indicating pipeline stop.
@@ -323,7 +412,7 @@ class NvidiaSTTService(STTService):
        await self._stop_tasks()

    async def cancel(self, frame: CancelFrame):
-        """Cancel the NVIDIA Riva STT service operation.
+        """Cancel the NVIDIA Nemotron Speech STT service operation.

        Args:
            frame: CancelFrame indicating operation cancellation.
@@ -337,14 +426,25 @@ class NvidiaSTTService(STTService):
            self._thread_task = None

    def _response_handler(self):
-        responses = self._asr_service.streaming_response_generator(
-            audio_chunks=self,
-            streaming_config=self._config,
-        )
-        for response in responses:
-            if not response.results:
-                continue
-            asyncio.run_coroutine_threadsafe(self._handle_response(response), self.get_event_loop())
+        try:
+            responses = self._asr_service.streaming_response_generator(
+                audio_chunks=self,
+                streaming_config=self._config,
+            )
+            for response in responses:
+                if not response.results:
+                    continue
+                asyncio.run_coroutine_threadsafe(
+                    self._handle_response(response), self.get_event_loop()
+                )
+        except grpc.RpcError as e:
+            status = e.code().name if hasattr(e, "code") else "UNKNOWN"
+            details = e.details() if hasattr(e, "details") else str(e)
+            logger.error(f"{self} gRPC streaming error ({status}): {details}")
+            asyncio.run_coroutine_threadsafe(
+                self.push_error(f"{self} STT streaming failed (gRPC {status}): {details}"),
+                self.get_event_loop(),
+            )

    async def _thread_task_handler(self):
        try:
@@ -370,6 +470,7 @@ class NvidiaSTTService(STTService):
            if transcript and len(transcript) > 0:
                if result.is_final:
                    await self.stop_processing_metrics()
+                    logger.debug(f"Transcription: [{transcript}]")
                    await self.push_frame(
                        TranscriptionFrame(
                            transcript,
@@ -377,6 +478,7 @@ class NvidiaSTTService(STTService):
                            time_now_iso8601(),
                            self._settings.language,
                            result=result,
+                            finalized=True,
                        )
                    )
                    await self._handle_transcription(
@@ -394,6 +496,7 @@ class NvidiaSTTService(STTService):
                            result=result,
                        )
                    )
+                    logger.trace(f"Interim Transcription: [{transcript}]")

    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
        """Process audio data for speech-to-text transcription.
@@ -409,7 +512,7 @@ class NvidiaSTTService(STTService):
        yield None

    def __next__(self) -> bytes:
-        """Get the next audio chunk for NVIDIA Riva processing.
+        """Get the next audio chunk for NVIDIA Nemotron Speech processing.

        Returns:
            Audio bytes from the queue.
@@ -422,7 +525,8 @@ class NvidiaSTTService(STTService):

        try:
            future = asyncio.run_coroutine_threadsafe(self._queue.get(), self.get_event_loop())
-            return future.result()
+            audio = future.result()
+            return audio
        except FuturesCancelledError:
            raise StopIteration

@@ -436,9 +540,9 @@ class NvidiaSTTService(STTService):


 class NvidiaSegmentedSTTService(SegmentedSTTService):
-    """Speech-to-text service using NVIDIA Riva's offline/batch models.
+    """Speech-to-text service using NVIDIA Nemotron Speech's offline/batch models.

-    By default, his service uses NVIDIA's Riva Canary ASR API to perform speech-to-text
+    By default, this service uses NVIDIA's Nemotron Speech Canary ASR API to perform speech-to-text
    transcription on audio segments. It inherits from SegmentedSTTService to handle
    audio buffering and speech detection.
    """
@@ -447,7 +551,7 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
    _settings: Settings

    class InputParams(BaseModel):
-        """Configuration parameters for NVIDIA Riva segmented STT service.
+        """Configuration parameters for NVIDIA Nemotron Speech segmented STT service.

        .. deprecated:: 0.0.105
            Use ``settings=NvidiaSegmentedSTTService.Settings(...)`` instead.
@@ -471,7 +575,7 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
    def __init__(
        self,
        *,
-        api_key: str,
+        api_key: str | None = None,
        server: str = "grpc.nvcf.nvidia.com:443",
        model_function_map: Mapping[str, str] = {
            "function_id": "ee8dc628-76de-4acc-8595-1836e7e857bd",
@@ -480,28 +584,34 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
        sample_rate: int | None = None,
        params: InputParams | None = None,
        use_ssl: bool = True,
+        custom_configuration: str = "",
        settings: Settings | None = None,
        ttfs_p99_latency: float | None = NVIDIA_TTFS_P99,
        **kwargs,
    ):
-        """Initialize the NVIDIA Riva segmented STT service.
+        """Initialize the NVIDIA Nemotron Speech segmented STT service.

        Args:
-            api_key: NVIDIA API key for authentication
-            server: NVIDIA Riva server address (defaults to NVIDIA Cloud Function endpoint)
-            model_function_map: Mapping of model name and its corresponding NVIDIA Cloud Function ID
-            sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate
-            params: Additional configuration parameters for NVIDIA Riva
+            api_key: NVIDIA API key for authentication. Required when using the
+                cloud endpoint. Not needed for local deployments.
+            server: NVIDIA Nemotron Speech server address. Defaults to NVIDIA Cloud Function endpoint.
+                For local deployments, pass the local address (e.g. ``localhost:50051``).
+            model_function_map: Mapping of model name and its corresponding NVIDIA Cloud Function ID.
+            sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
+            params: Additional configuration parameters for NVIDIA Nemotron Speech.

                .. deprecated:: 0.0.105
                    Use ``settings=NvidiaSegmentedSTTService.Settings(...)`` instead.

-            use_ssl: Whether to use SSL for the NVIDIA Riva server. Defaults to True.
+            use_ssl: Whether to use SSL for the gRPC connection. Defaults to True
+                for the NVIDIA cloud endpoint. Set to False for local deployments.
+            custom_configuration: Custom Nemotron Speech configuration string
+                (e.g. ``"enable_vad_endpointing:true,neural_vad.onset:0.65"``).
            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            ttfs_p99_latency: P99 latency from speech end to final transcript in seconds.
                Override for your deployment. See https://github.com/pipecat-ai/stt-benchmark
-            **kwargs: Additional arguments passed to SegmentedSTTService
+            **kwargs: Additional arguments passed to SegmentedSTTService.
        """
        # 1. Initialize default_settings with hardcoded defaults
        default_settings = self.Settings(
@@ -512,6 +622,8 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
            verbatim_transcripts=False,
            boosted_lm_words=None,
            boosted_lm_score=4.0,
+            max_alternatives=1,
+            word_time_offsets=False,
        )

        # 2. (no deprecated direct args for this service)
@@ -538,81 +650,64 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
            **kwargs,
        )

-        # Initialize NVIDIA Riva settings
+        # Initialize NVIDIA Nemotron Speech settings
        self._api_key = api_key
        self._server = server
        self._use_ssl = use_ssl
        self._function_id = model_function_map.get("function_id")
-
-        # Voice activity detection thresholds (use NVIDIA Riva defaults)
-        self._start_history = -1
-        self._start_threshold = -1.0
-        self._stop_history = -1
-        self._stop_threshold = -1.0
-        self._stop_history_eou = -1
-        self._stop_threshold_eou = -1.0
-        self._custom_configuration = ""
+        self._custom_configuration = custom_configuration

        self._config = None
        self._asr_service = None

    def language_to_service_language(self, language: Language) -> str | None:
-        """Convert pipecat Language enum to NVIDIA Riva's language code.
+        """Convert pipecat Language enum to NVIDIA Nemotron Speech's language code.

        Args:
            language: Language enum value.

        Returns:
-            NVIDIA Riva language code or None if not supported.
+            NVIDIA Nemotron Speech language code or None if not supported.
        """
-        return language_to_nvidia_riva_language(language)
+        return language_to_nvidia_nemotron_speech_language(language)

    def _initialize_client(self):
-        """Initialize the NVIDIA Riva ASR client with authentication metadata."""
+        """Initialize the NVIDIA Nemotron Speech ASR client with authentication metadata."""
        if self._asr_service is not None:
            return

        # Set up authentication metadata for NVIDIA Cloud Functions
-        metadata = [
-            ["function-id", self._function_id],
-            ["authorization", f"Bearer {self._api_key}"],
-        ]
+        metadata = []
+        if self._function_id:
+            metadata.append(["function-id", self._function_id])
+        if self._api_key:
+            metadata.append(["authorization", f"Bearer {self._api_key}"])

        # Create authenticated client
        auth = riva.client.Auth(None, self._use_ssl, self._server, metadata)
        self._asr_service = riva.client.ASRService(auth)

    def _get_language_code(self) -> str:
-        """Get the current NVIDIA Riva language code string."""
+        """Get the current NVIDIA Nemotron Speech language code string."""
        return self._settings.language or "en-US"

    def _create_recognition_config(self):
-        """Create the NVIDIA Riva ASR recognition configuration."""
+        """Create the NVIDIA Nemotron Speech ASR recognition configuration."""
        # Create base configuration
        s = self._settings
        config = riva.client.RecognitionConfig(
            language_code=self._get_language_code(),
-            max_alternatives=1,
+            max_alternatives=s.max_alternatives,
            profanity_filter=s.profanity_filter,
            enable_automatic_punctuation=s.automatic_punctuation,
            verbatim_transcripts=s.verbatim_transcripts,
+            enable_word_time_offsets=s.word_time_offsets,
        )

        # Add word boosting if specified
        if s.boosted_lm_words:
            riva.client.add_word_boosting_to_config(config, s.boosted_lm_words, s.boosted_lm_score)

-        # Add voice activity detection parameters
-        riva.client.add_endpoint_parameters_to_config(
-            config,
-            self._start_history,
-            self._start_threshold,
-            self._stop_history,
-            self._stop_history_eou,
-            self._stop_threshold,
-            self._stop_threshold_eou,
-        )
-
        # Add any custom configuration
        if self._custom_configuration:
            riva.client.add_custom_configuration_to_config(config, self._custom_configuration)
@@ -676,7 +771,7 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):

            await self.start_processing_metrics()

-            # Process audio with NVIDIA Riva ASR - explicitly request non-future response
+            # Process audio with NVIDIA Nemotron Speech ASR - explicitly request non-future response
            raw_response = self._asr_service.offline_recognize(audio, self._config, future=False)

            await self.stop_processing_metrics()
@@ -712,10 +807,14 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
                        await self._handle_transcription(text, True, self._settings.language)

            if not transcription_found:
-                logger.debug(f"{self}: No transcription results found in NVIDIA Riva response")
+                logger.debug(
+                    f"{self}: No transcription results found in NVIDIA Nemotron Speech response"
+                )
        except AttributeError as ae:
-            logger.error(f"{self}: Unexpected response structure from NVIDIA Riva: {ae}")
-            yield ErrorFrame(f"{self}: Unexpected NVIDIA Riva response format: {str(ae)}")
+            logger.error(f"{self}: Unexpected response structure from NVIDIA Nemotron Speech: {ae}")
+            yield ErrorFrame(
+                error=f"{self}: Unexpected NVIDIA Nemotron Speech response format: {str(ae)}"
+            )
        except Exception as e:
            logger.error(f"{self} exception: {e}")
            yield ErrorFrame(error=f"{self} error: {e}")