Update NVIDIA STT services for Nemotron Speech defaults and config parity (#4269)

* Update NVIDIA STT services for Nemotron Speech defaults and config parity

* Add changelog entry for PR #4269

* Initialize boosted LM settings defaults in streaming STT

* Align NVIDIA STT language handling with other STT services

* Add `finalized` flag to NVIDIA STT final transcripts; remove processing latency logs

* Change interim transcription logging to trace level.

---------

Co-authored-by: sathwika <geereddysath@nvidia.com>
Co-authored-by: filipi87 <filipi87@gmail.com>
This commit is contained in:
Sathwika Reddy Geereddy
2026-04-23 18:31:27 +05:30
committed by GitHub
parent 4d14251f4a
commit 21f6c2afa5
2 changed files with 212 additions and 111 deletions

View File

@@ -0,0 +1,2 @@
- Updated NVIDIA STT services to align with Nemotron Speech defaults and configuration: `api_key` is now optional for local deployments, additional recognition settings are available (including alternatives, word offsets, and diarization), and streaming/segmented docs now reflect Nemotron Speech APIs.
- NVIDIA streaming STT now sets `TranscriptionFrame.finalized=True` when the provider marks a result as final, and preserves `language` on both `TranscriptionFrame` and `InterimTranscriptionFrame`.

View File

@@ -2,9 +2,15 @@
# Copyright (c) 2024-2026, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
#
"""NVIDIA Riva Speech-to-Text service implementations for real-time and batch transcription."""
"""NVIDIA Nemotron Speech-to-Text service implementations for real-time and batch transcription.
Refer to the NVIDIA ASR NIM documentation for usage, customization,
and local deployment steps:
https://docs.nvidia.com/nim/speech/latest/asr/
"""
import asyncio
from collections.abc import AsyncGenerator, Mapping
@@ -32,25 +38,28 @@ from pipecat.utils.time import time_now_iso8601
from pipecat.utils.tracing.service_decorators import traced_stt
try:
import grpc
import riva.client
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error("In order to use NVIDIA Riva STT, you need to `pip install pipecat-ai[nvidia]`.")
logger.error(
"In order to use NVIDIA Nemotron Speech STT, you need to `pip install pipecat-ai[nvidia]`."
)
raise Exception(f"Missing module: {e}")
def language_to_nvidia_riva_language(language: Language) -> str | None:
"""Maps Language enum to NVIDIA Riva ASR language codes.
def language_to_nvidia_nemotron_speech_language(language: Language) -> str | None:
"""Maps Language enum to NVIDIA Nemotron Speech ASR language codes.
Source:
https://docs.nvidia.com/deeplearning/riva/user-guide/docs/asr/asr-riva-build-table.html?highlight=fr%20fr
https://docs.nvidia.com/nim/speech/latest/reference/support-matrix/asr.html#supported-languages-by-model-type
Args:
language: Language enum value.
Returns:
Optional[str]: NVIDIA Riva language code or None if not supported.
str | None: NVIDIA Nemotron Speech language code or None if not supported.
"""
LANGUAGE_MAP = {
# Arabic
@@ -93,15 +102,8 @@ def language_to_nvidia_riva_language(language: Language) -> str | None:
@dataclass
class NvidiaSTTSettings(STTSettings):
"""Settings for NvidiaSTTService."""
pass
@dataclass
class NvidiaSegmentedSTTSettings(STTSettings):
"""Settings for NvidiaSegmentedSTTService.
class _NvidiaBaseSTTSettings(STTSettings):
"""Shared settings for NVIDIA Nemotron Speech STT services.
Parameters:
profanity_filter: Whether to filter profanity from results.
@@ -109,6 +111,10 @@ class NvidiaSegmentedSTTSettings(STTSettings):
verbatim_transcripts: Whether to return verbatim transcripts.
boosted_lm_words: List of words to boost in language model.
boosted_lm_score: Score boost for specified words.
max_alternatives: Maximum number of recognition alternatives.
word_time_offsets: Whether to include word-level time offsets.
speaker_diarization: Whether to enable speaker diarization.
diarization_max_speakers: Maximum number of speakers for diarization.
"""
profanity_filter: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
@@ -116,12 +122,34 @@ class NvidiaSegmentedSTTSettings(STTSettings):
verbatim_transcripts: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
boosted_lm_words: list[str] | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
boosted_lm_score: float | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
max_alternatives: int | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
word_time_offsets: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
speaker_diarization: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
diarization_max_speakers: int | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
@dataclass
class NvidiaSTTSettings(_NvidiaBaseSTTSettings):
    """Runtime settings for the streaming ``NvidiaSTTService``.

    Extends ``_NvidiaBaseSTTSettings`` with one additional field.

    Parameters:
        interim_results: Whether to return interim (partial) results.
    """

    # Left as NOT_GIVEN unless the caller sets it explicitly.
    interim_results: bool | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
@dataclass
class NvidiaSegmentedSTTSettings(_NvidiaBaseSTTSettings):
    """Runtime settings for ``NvidiaSegmentedSTTService``.

    Declares no fields of its own; everything is inherited from
    ``_NvidiaBaseSTTSettings``.
    """
class NvidiaSTTService(STTService):
"""Real-time speech-to-text service using NVIDIA Riva streaming ASR.
"""Real-time speech-to-text service using NVIDIA Nemotron Speech streaming ASR.
Provides real-time transcription capabilities using NVIDIA's Riva ASR models
Provides real-time transcription capabilities using NVIDIA's Nemotron Speech ASR models
through streaming recognition. Supports interim results and continuous audio
processing for low-latency applications.
"""
@@ -130,7 +158,7 @@ class NvidiaSTTService(STTService):
_settings: Settings
class InputParams(BaseModel):
"""Configuration parameters for NVIDIA Riva STT service.
"""Configuration parameters for NVIDIA Nemotron Speech STT service.
.. deprecated:: 0.0.105
Use ``settings=NvidiaSTTService.Settings(...)`` instead.
@@ -144,32 +172,52 @@ class NvidiaSTTService(STTService):
def __init__(
self,
*,
api_key: str,
api_key: str | None = None,
server: str = "grpc.nvcf.nvidia.com:443",
model_function_map: Mapping[str, str] = {
"function_id": "1598d209-5e27-4d3c-8079-4751568b1081",
"model_name": "parakeet-ctc-1.1b-asr",
"function_id": "bb0837de-8c7b-481f-9ec8-ef5663e9c1fa",
"model_name": "nemotron-asr-streaming",
},
sample_rate: int | None = None,
params: InputParams | None = None,
use_ssl: bool = True,
audio_channel_count: int = 1,
start_history: int = -1,
start_threshold: float = -1.0,
stop_history: int = 320,
stop_threshold: float = -1.0,
stop_history_eou: int = -1,
stop_threshold_eou: float = -1.0,
custom_configuration: str = "",
settings: Settings | None = None,
ttfs_p99_latency: float | None = NVIDIA_TTFS_P99,
**kwargs,
):
"""Initialize the NVIDIA Riva STT service.
"""Initialize the NVIDIA Nemotron Speech STT service.
Args:
api_key: NVIDIA API key for authentication.
server: NVIDIA Riva server address. Defaults to NVIDIA Cloud Function endpoint.
api_key: NVIDIA API key for authentication. Required when using the
cloud endpoint. Not needed for local deployments.
server: NVIDIA Nemotron Speech server address. Defaults to NVIDIA Cloud Function endpoint.
For local deployments, pass the local address (e.g. ``localhost:50051``).
model_function_map: Mapping containing 'function_id' and 'model_name' for the ASR model.
sample_rate: Audio sample rate in Hz. If None, uses pipeline default.
params: Additional configuration parameters for NVIDIA Riva.
params: Additional configuration parameters for NVIDIA Nemotron Speech.
.. deprecated:: 0.0.105
Use ``settings=NvidiaSTTService.Settings(...)`` instead.
use_ssl: Whether to use SSL for the NVIDIA Riva server. Defaults to True.
use_ssl: Whether to use SSL for the gRPC connection. Defaults to True
for the NVIDIA cloud endpoint. Set to False for local deployments.
audio_channel_count: Number of audio channels.
start_history: VAD start history in frames. Use -1 for Nemotron Speech default.
start_threshold: VAD start threshold. Use -1.0 for Nemotron Speech default.
stop_history: VAD stop history in frames. Use -1 for Nemotron Speech default.
stop_threshold: VAD stop threshold. Use -1.0 for Nemotron Speech default.
stop_history_eou: End-of-utterance stop history in frames. Use -1 for Nemotron Speech default.
stop_threshold_eou: End-of-utterance stop threshold. Use -1.0 for Nemotron Speech default.
custom_configuration: Custom Nemotron Speech configuration string
(e.g. ``"enable_vad_endpointing:true,neural_vad.onset:0.65"``).
settings: Runtime-updatable settings. When provided alongside deprecated
parameters, ``settings`` values take precedence.
ttfs_p99_latency: P99 latency from speech end to final transcript in seconds.
@@ -180,6 +228,16 @@ class NvidiaSTTService(STTService):
default_settings = self.Settings(
model=model_function_map.get("model_name"),
language=Language.EN_US,
profanity_filter=False,
automatic_punctuation=True,
verbatim_transcripts=True,
boosted_lm_words=None,
boosted_lm_score=4.0,
max_alternatives=1,
interim_results=True,
word_time_offsets=False,
speaker_diarization=False,
diarization_max_speakers=0,
)
# 2. (no deprecated direct args for this service)
@@ -204,13 +262,14 @@ class NvidiaSTTService(STTService):
self._server = server
self._api_key = api_key
self._use_ssl = use_ssl
self._start_history = -1
self._start_threshold = -1.0
self._stop_history = -1
self._stop_threshold = -1.0
self._stop_history_eou = -1
self._stop_threshold_eou = -1.0
self._custom_configuration = ""
self._audio_channel_count = audio_channel_count
self._start_history = start_history
self._start_threshold = start_threshold
self._stop_history = stop_history
self._stop_threshold = stop_threshold
self._stop_history_eou = stop_history_eou
self._stop_threshold_eou = stop_threshold_eou
self._custom_configuration = custom_configuration
self._function_id = model_function_map.get("function_id")
self._asr_service = None
@@ -219,31 +278,38 @@ class NvidiaSTTService(STTService):
self._thread_task = None
def _initialize_client(self):
metadata = [
["function-id", self._function_id],
["authorization", f"Bearer {self._api_key}"],
]
"""Initialize the NVIDIA Nemotron Speech ASR client with authentication metadata."""
metadata = []
if self._function_id:
metadata.append(["function-id", self._function_id])
if self._api_key:
metadata.append(["authorization", f"Bearer {self._api_key}"])
auth = riva.client.Auth(None, self._use_ssl, self._server, metadata)
self._asr_service = riva.client.ASRService(auth)
def _create_recognition_config(self):
"""Create the NVIDIA Riva ASR recognition configuration."""
"""Create the NVIDIA Nemotron Speech ASR recognition configuration."""
s = self._settings
config = riva.client.StreamingRecognitionConfig(
config=riva.client.RecognitionConfig(
encoding=riva.client.AudioEncoding.LINEAR_PCM,
language_code=self._settings.language,
language_code=s.language,
model="",
max_alternatives=1,
profanity_filter=False,
enable_automatic_punctuation=True,
verbatim_transcripts=True,
max_alternatives=s.max_alternatives,
profanity_filter=s.profanity_filter,
enable_automatic_punctuation=s.automatic_punctuation,
verbatim_transcripts=s.verbatim_transcripts,
sample_rate_hertz=self.sample_rate,
audio_channel_count=1,
audio_channel_count=self._audio_channel_count,
enable_word_time_offsets=s.word_time_offsets,
),
interim_results=True,
interim_results=s.interim_results,
)
if s.boosted_lm_words:
riva.client.add_word_boosting_to_config(config, s.boosted_lm_words, s.boosted_lm_score)
riva.client.add_endpoint_parameters_to_config(
config,
self._start_history,
@@ -253,7 +319,14 @@ class NvidiaSTTService(STTService):
self._stop_threshold,
self._stop_threshold_eou,
)
riva.client.add_custom_configuration_to_config(config, self._custom_configuration)
if self._custom_configuration:
riva.client.add_custom_configuration_to_config(config, self._custom_configuration)
if s.speaker_diarization:
riva.client.add_speaker_diarization_to_config(
config, s.speaker_diarization, s.diarization_max_speakers
)
return config
@@ -261,15 +334,31 @@ class NvidiaSTTService(STTService):
"""Check if this service can generate processing metrics.
Returns:
False - this service does not support metrics generation.
True - this service supports metrics generation.
"""
return False
return True
async def _update_settings(self, delta: STTSettings) -> dict[str, Any]:
    """Apply a settings delta and refresh the cached recognition config.

    The base class applies the delta first. If anything changed and a
    recognition config has already been built, it is rebuilt from the
    updated settings.

    Args:
        delta: A :class:`STTSettings` (or ``NvidiaSTTService.Settings``) delta.

    Returns:
        Dict mapping changed field names to their previous values.
    """
    previous_values = await super()._update_settings(delta)

    # Nothing changed, or no config has been created yet: nothing to rebuild.
    if not previous_values or self._config is None:
        return previous_values

    self._config = self._create_recognition_config()
    return previous_values
async def set_model(self, model: str):
"""Set the ASR model for transcription.
.. deprecated:: 0.0.104
Model cannot be changed after initialization for NVIDIA Riva streaming STT.
Model cannot be changed after initialization for NVIDIA Nemotron Speech streaming STT.
Set model and function id in the constructor instead.
Example::
@@ -288,7 +377,7 @@ class NvidiaSTTService(STTService):
warnings.simplefilter("always")
warnings.warn(
"'set_model' is deprecated. Model cannot be changed after initialization"
" for NVIDIA Riva streaming STT. Set model and function id in the"
" for NVIDIA Nemotron Speech streaming STT. Set model and function id in the"
" constructor instead, e.g.:"
" NvidiaSTTService(api_key=..., model_function_map="
"{'function_id': '<UUID>', 'model_name': '<model_name>'})",
@@ -297,7 +386,7 @@ class NvidiaSTTService(STTService):
)
async def start(self, frame: StartFrame):
"""Start the NVIDIA Riva STT service and initialize streaming configuration.
"""Start the NVIDIA Nemotron Speech STT service and initialize streaming configuration.
Args:
frame: StartFrame indicating pipeline start.
@@ -314,7 +403,7 @@ class NvidiaSTTService(STTService):
logger.debug(f"Initialized NvidiaSTTService with model: {self._settings.model}")
async def stop(self, frame: EndFrame):
"""Stop the NVIDIA Riva STT service and clean up resources.
"""Stop the NVIDIA Nemotron Speech STT service and clean up resources.
Args:
frame: EndFrame indicating pipeline stop.
@@ -323,7 +412,7 @@ class NvidiaSTTService(STTService):
await self._stop_tasks()
async def cancel(self, frame: CancelFrame):
"""Cancel the NVIDIA Riva STT service operation.
"""Cancel the NVIDIA Nemotron Speech STT service operation.
Args:
frame: CancelFrame indicating operation cancellation.
@@ -337,14 +426,25 @@ class NvidiaSTTService(STTService):
self._thread_task = None
def _response_handler(self):
responses = self._asr_service.streaming_response_generator(
audio_chunks=self,
streaming_config=self._config,
)
for response in responses:
if not response.results:
continue
asyncio.run_coroutine_threadsafe(self._handle_response(response), self.get_event_loop())
try:
responses = self._asr_service.streaming_response_generator(
audio_chunks=self,
streaming_config=self._config,
)
for response in responses:
if not response.results:
continue
asyncio.run_coroutine_threadsafe(
self._handle_response(response), self.get_event_loop()
)
except grpc.RpcError as e:
status = e.code().name if hasattr(e, "code") else "UNKNOWN"
details = e.details() if hasattr(e, "details") else str(e)
logger.error(f"{self} gRPC streaming error ({status}): {details}")
asyncio.run_coroutine_threadsafe(
self.push_error(f"{self} STT streaming failed (gRPC {status}): {details}"),
self.get_event_loop(),
)
async def _thread_task_handler(self):
try:
@@ -370,6 +470,7 @@ class NvidiaSTTService(STTService):
if transcript and len(transcript) > 0:
if result.is_final:
await self.stop_processing_metrics()
logger.debug(f"Transcription: [{transcript}]")
await self.push_frame(
TranscriptionFrame(
transcript,
@@ -377,6 +478,7 @@ class NvidiaSTTService(STTService):
time_now_iso8601(),
self._settings.language,
result=result,
finalized=True,
)
)
await self._handle_transcription(
@@ -394,6 +496,7 @@ class NvidiaSTTService(STTService):
result=result,
)
)
logger.trace(f"Interim Transcription: [{transcript}]")
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
"""Process audio data for speech-to-text transcription.
@@ -409,7 +512,7 @@ class NvidiaSTTService(STTService):
yield None
def __next__(self) -> bytes:
"""Get the next audio chunk for NVIDIA Riva processing.
"""Get the next audio chunk for NVIDIA Nemotron Speech processing.
Returns:
Audio bytes from the queue.
@@ -422,7 +525,8 @@ class NvidiaSTTService(STTService):
try:
future = asyncio.run_coroutine_threadsafe(self._queue.get(), self.get_event_loop())
return future.result()
audio = future.result()
return audio
except FuturesCancelledError:
raise StopIteration
@@ -436,9 +540,9 @@ class NvidiaSTTService(STTService):
class NvidiaSegmentedSTTService(SegmentedSTTService):
"""Speech-to-text service using NVIDIA Riva's offline/batch models.
"""Speech-to-text service using NVIDIA Nemotron Speech's offline/batch models.
By default, his service uses NVIDIA's Riva Canary ASR API to perform speech-to-text
By default, this service uses NVIDIA's Nemotron Speech Canary ASR API to perform speech-to-text
transcription on audio segments. It inherits from SegmentedSTTService to handle
audio buffering and speech detection.
"""
@@ -447,7 +551,7 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
_settings: Settings
class InputParams(BaseModel):
"""Configuration parameters for NVIDIA Riva segmented STT service.
"""Configuration parameters for NVIDIA Nemotron Speech segmented STT service.
.. deprecated:: 0.0.105
Use ``settings=NvidiaSegmentedSTTService.Settings(...)`` instead.
@@ -471,7 +575,7 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
def __init__(
self,
*,
api_key: str,
api_key: str | None = None,
server: str = "grpc.nvcf.nvidia.com:443",
model_function_map: Mapping[str, str] = {
"function_id": "ee8dc628-76de-4acc-8595-1836e7e857bd",
@@ -480,28 +584,34 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
sample_rate: int | None = None,
params: InputParams | None = None,
use_ssl: bool = True,
custom_configuration: str = "",
settings: Settings | None = None,
ttfs_p99_latency: float | None = NVIDIA_TTFS_P99,
**kwargs,
):
"""Initialize the NVIDIA Riva segmented STT service.
"""Initialize the NVIDIA Nemotron Speech segmented STT service.
Args:
api_key: NVIDIA API key for authentication
server: NVIDIA Riva server address (defaults to NVIDIA Cloud Function endpoint)
model_function_map: Mapping of model name and its corresponding NVIDIA Cloud Function ID
sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate
params: Additional configuration parameters for NVIDIA Riva
api_key: NVIDIA API key for authentication. Required when using the
cloud endpoint. Not needed for local deployments.
server: NVIDIA Nemotron Speech server address. Defaults to NVIDIA Cloud Function endpoint.
For local deployments, pass the local address (e.g. ``localhost:50051``).
model_function_map: Mapping of model name and its corresponding NVIDIA Cloud Function ID.
sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
params: Additional configuration parameters for NVIDIA Nemotron Speech.
.. deprecated:: 0.0.105
Use ``settings=NvidiaSegmentedSTTService.Settings(...)`` instead.
use_ssl: Whether to use SSL for the NVIDIA Riva server. Defaults to True.
use_ssl: Whether to use SSL for the gRPC connection. Defaults to True
for the NVIDIA cloud endpoint. Set to False for local deployments.
custom_configuration: Custom Nemotron Speech configuration string
(e.g. ``"enable_vad_endpointing:true,neural_vad.onset:0.65"``).
settings: Runtime-updatable settings. When provided alongside deprecated
parameters, ``settings`` values take precedence.
ttfs_p99_latency: P99 latency from speech end to final transcript in seconds.
Override for your deployment. See https://github.com/pipecat-ai/stt-benchmark
**kwargs: Additional arguments passed to SegmentedSTTService
**kwargs: Additional arguments passed to SegmentedSTTService.
"""
# 1. Initialize default_settings with hardcoded defaults
default_settings = self.Settings(
@@ -512,6 +622,8 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
verbatim_transcripts=False,
boosted_lm_words=None,
boosted_lm_score=4.0,
max_alternatives=1,
word_time_offsets=False,
)
# 2. (no deprecated direct args for this service)
@@ -538,81 +650,64 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
**kwargs,
)
# Initialize NVIDIA Riva settings
# Initialize NVIDIA Nemotron Speech settings
self._api_key = api_key
self._server = server
self._use_ssl = use_ssl
self._function_id = model_function_map.get("function_id")
# Voice activity detection thresholds (use NVIDIA Riva defaults)
self._start_history = -1
self._start_threshold = -1.0
self._stop_history = -1
self._stop_threshold = -1.0
self._stop_history_eou = -1
self._stop_threshold_eou = -1.0
self._custom_configuration = ""
self._custom_configuration = custom_configuration
self._config = None
self._asr_service = None
def language_to_service_language(self, language: Language) -> str | None:
"""Convert pipecat Language enum to NVIDIA Riva's language code.
"""Convert pipecat Language enum to NVIDIA Nemotron Speech's language code.
Args:
language: Language enum value.
Returns:
NVIDIA Riva language code or None if not supported.
NVIDIA Nemotron Speech language code or None if not supported.
"""
return language_to_nvidia_riva_language(language)
return language_to_nvidia_nemotron_speech_language(language)
def _initialize_client(self):
"""Initialize the NVIDIA Riva ASR client with authentication metadata."""
"""Initialize the NVIDIA Nemotron Speech ASR client with authentication metadata."""
if self._asr_service is not None:
return
# Set up authentication metadata for NVIDIA Cloud Functions
metadata = [
["function-id", self._function_id],
["authorization", f"Bearer {self._api_key}"],
]
metadata = []
if self._function_id:
metadata.append(["function-id", self._function_id])
if self._api_key:
metadata.append(["authorization", f"Bearer {self._api_key}"])
# Create authenticated client
auth = riva.client.Auth(None, self._use_ssl, self._server, metadata)
self._asr_service = riva.client.ASRService(auth)
def _get_language_code(self) -> str:
"""Get the current NVIDIA Riva language code string."""
"""Get the current NVIDIA Nemotron Speech language code string."""
return self._settings.language or "en-US"
def _create_recognition_config(self):
"""Create the NVIDIA Riva ASR recognition configuration."""
"""Create the NVIDIA Nemotron Speech ASR recognition configuration."""
# Create base configuration
s = self._settings
config = riva.client.RecognitionConfig(
language_code=self._get_language_code(),
max_alternatives=1,
max_alternatives=s.max_alternatives,
profanity_filter=s.profanity_filter,
enable_automatic_punctuation=s.automatic_punctuation,
verbatim_transcripts=s.verbatim_transcripts,
enable_word_time_offsets=s.word_time_offsets,
)
# Add word boosting if specified
if s.boosted_lm_words:
riva.client.add_word_boosting_to_config(config, s.boosted_lm_words, s.boosted_lm_score)
# Add voice activity detection parameters
riva.client.add_endpoint_parameters_to_config(
config,
self._start_history,
self._start_threshold,
self._stop_history,
self._stop_history_eou,
self._stop_threshold,
self._stop_threshold_eou,
)
# Add any custom configuration
if self._custom_configuration:
riva.client.add_custom_configuration_to_config(config, self._custom_configuration)
@@ -676,7 +771,7 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
await self.start_processing_metrics()
# Process audio with NVIDIA Riva ASR - explicitly request non-future response
# Process audio with NVIDIA Nemotron Speech ASR - explicitly request non-future response
raw_response = self._asr_service.offline_recognize(audio, self._config, future=False)
await self.stop_processing_metrics()
@@ -712,10 +807,14 @@ class NvidiaSegmentedSTTService(SegmentedSTTService):
await self._handle_transcription(text, True, self._settings.language)
if not transcription_found:
logger.debug(f"{self}: No transcription results found in NVIDIA Riva response")
logger.debug(
f"{self}: No transcription results found in NVIDIA Nemotron Speech response"
)
except AttributeError as ae:
logger.error(f"{self}: Unexpected response structure from NVIDIA Riva: {ae}")
yield ErrorFrame(f"{self}: Unexpected NVIDIA Riva response format: {str(ae)}")
logger.error(f"{self}: Unexpected response structure from NVIDIA Nemotron Speech: {ae}")
yield ErrorFrame(
error=f"{self}: Unexpected NVIDIA Nemotron Speech response format: {str(ae)}"
)
except Exception as e:
logger.error(f"{self} exception: {e}")
yield ErrorFrame(error=f"{self} error: {e}")