Compare commits

...

2 Commits

Author SHA1 Message Date
James Hush
b6da5c18b7 Add changelog for #4389 2026-04-30 14:38:30 +08:00
James Hush
4b6881b81d fix(aws): surface fatal errors on missing/invalid credentials
AWS services were silently failing on bad credentials. Nova Sonic was the
worst offender: no audio, no clear log, and an InvalidStateError from
awscrt at shutdown that masked the real cause.

Changes:
- Nova Sonic: connect failure now pushes a fatal ErrorFrame with a
  "check AWS credentials and region" hint. _disconnect wraps stream and
  session-end cleanup so a partially-initialized stream no longer raises
  InvalidStateError on top of the real error.
- Bedrock LLM and Polly TTS: branch on botocore ClientError. Auth-class
  codes (UnrecognizedClientException, InvalidSignatureException,
  AccessDeniedException, ExpiredTokenException, InvalidAccessKeyId,
  SignatureDoesNotMatch, MissingAuthenticationTokenException, AuthFailure)
  push fatal errors. Other client errors stay non-fatal (transient).
- Transcribe STT: _connect_websocket catch-all is now fatal, since
  presigned URL and websocket connect failures don't recover on retry.
2026-04-30 14:36:44 +08:00
5 changed files with 115 additions and 10 deletions

9
changelog/4389.fixed.md Normal file
View File

@@ -0,0 +1,9 @@
- Fixed AWS services failing silently on missing or invalid credentials.
`AWSBedrockLLMService` and `AWSPollyTTSService` (on auth-class error codes)
and `AWSNovaSonicLLMService` and `AWSTranscribeSTTService` (on any connect
failure) now push a fatal `ErrorFrame` with a "check AWS credentials and
region" hint, so the pipeline cancels promptly instead of running with no output.
- Fixed `AWSNovaSonicLLMService._disconnect` raising `InvalidStateError`
from `awscrt/aio/http.py` when cleanup ran on a stream from a failed
`invoke_model_with_bidirectional_stream` call. The error was masking
the real connect-time auth failure in the logs.

View File

@@ -43,7 +43,7 @@ from pipecat.utils.tracing.service_decorators import traced_llm
try:
import aioboto3
from botocore.config import Config
from botocore.exceptions import ReadTimeoutError
from botocore.exceptions import ClientError, ReadTimeoutError
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error(
@@ -52,6 +52,23 @@ except ModuleNotFoundError as e:
raise Exception(f"Missing module: {e}")
# Error codes AWS returns when a request is rejected for credential or
# authorization reasons. The service cannot work until the credentials or
# region are corrected, so a match is reported as a fatal error and the
# pipeline stops instead of degrading silently.
_AWS_AUTH_ERROR_CODES = frozenset(
    (
        "AccessDeniedException",
        "AuthFailure",
        "ExpiredTokenException",
        "InvalidAccessKeyId",
        "InvalidSignatureException",
        "MissingAuthenticationTokenException",
        "SignatureDoesNotMatch",
        "UnrecognizedClientException",
    )
)
@dataclass
class AWSBedrockLLMSettings(LLMSettings):
"""Settings for AWSBedrockLLMService.
@@ -555,6 +572,20 @@ class AWSBedrockLLMService(LLMService):
raise
except (TimeoutError, ReadTimeoutError):
await self._call_event_handler("on_completion_timeout")
except ClientError as e:
error_code = e.response.get("Error", {}).get("Code", "")
if error_code in _AWS_AUTH_ERROR_CODES:
await self.push_error(
error_msg=(
"AWS Bedrock authentication failed. "
"Check AWS credentials and region. "
f"Underlying error: {e}"
),
exception=e,
fatal=True,
)
else:
await self.push_error(error_msg=f"AWS Bedrock client error: {e}", exception=e)
except Exception as e:
await self.push_error(error_msg=f"Unknown error occurred: {e}", exception=e)
finally:

View File

@@ -602,7 +602,19 @@ class AWSNovaSonicLLMService(LLMService):
self._ready_to_send_context = True
await self._finish_connecting_if_context_available()
except Exception as e:
await self.push_error(error_msg=f"Initialization error: {e}", exception=e)
# Connect-time failures (most commonly bad/missing AWS credentials or
# an unsupported region) leave the bidirectional stream in a partial
# state and produce no audio output. Treat them as fatal so the
# pipeline cancels with a clear ERROR rather than continuing silently.
await self.push_error(
error_msg=(
"AWS Nova Sonic failed to start. "
"Check AWS credentials and region. "
f"Underlying error: {e}"
),
exception=e,
fatal=True,
)
await self._disconnect()
async def _process_completed_function_calls(self, send_new_results: bool):
@@ -703,17 +715,28 @@ class AWSNovaSonicLLMService(LLMService):
# NOTE: see explanation of HACK, below
self._disconnecting = True
# Clean up client
# Clean up client. If connect failed (e.g. bad credentials), the
# session may not have started, so end events can fail. Don't let
# that mask the real error or block cleanup.
if self._client:
await self._send_session_end_events()
try:
await self._send_session_end_events()
except Exception as e:
logger.debug(f"Ignoring error while sending session-end events: {e}")
self._client = None
# Clean up context
self._context = None
# Clean up stream
# Clean up stream. A stream from a failed
# invoke_model_with_bidirectional_stream call has an already-
# cancelled awscrt future; closing it raises InvalidStateError that
# otherwise drowns out the real connect error in the logs.
if self._stream:
await self._stream.close()
try:
await self._stream.close()
except Exception as e:
logger.debug(f"Ignoring error while closing partial stream: {e}")
self._stream = None
# NOTE: see explanation of HACK, below

View File

@@ -323,8 +323,18 @@ class AWSTranscribeSTTService(WebsocketSTTService):
await self._call_event_handler("on_connected")
logger.info(f"{self} Successfully connected to AWS Transcribe")
except Exception as e:
# Connect-time failures (most commonly bad/missing AWS credentials,
# an unsupported region, or a 403 from the presigned URL) won't
# recover on retry. Treat them as fatal so the pipeline cancels
# with a clear ERROR rather than silently producing no transcripts.
await self.push_error(
error_msg=f"Unable to connect to AWS Transcribe: {e}", exception=e
error_msg=(
"Unable to connect to AWS Transcribe. "
"Check AWS credentials and region. "
f"Underlying error: {e}"
),
exception=e,
fatal=True,
)
raise

View File

@@ -37,6 +37,23 @@ except ModuleNotFoundError as e:
raise Exception(f"Missing module: {e}")
# Error codes AWS returns when a request is rejected for credential or
# authorization reasons. The service cannot work until the credentials or
# region are corrected, so a match is reported as a fatal error and the
# pipeline stops instead of degrading silently.
_AWS_AUTH_ERROR_CODES = frozenset(
    (
        "AccessDeniedException",
        "AuthFailure",
        "ExpiredTokenException",
        "InvalidAccessKeyId",
        "InvalidSignatureException",
        "MissingAuthenticationTokenException",
        "SignatureDoesNotMatch",
        "UnrecognizedClientException",
    )
)
def language_to_aws_language(language: Language) -> str | None:
"""Convert a Language enum to AWS Polly language code.
@@ -366,6 +383,21 @@ class AWSPollyTTSService(TTSService):
frame = TTSAudioRawFrame(chunk, self.sample_rate, 1, context_id=context_id)
yield frame
except (BotoCoreError, ClientError) as error:
error_message = f"AWS Polly TTS error: {str(error)}"
yield ErrorFrame(error=error_message)
except ClientError as error:
error_code = error.response.get("Error", {}).get("Code", "")
if error_code in _AWS_AUTH_ERROR_CODES:
# Bad/missing credentials won't fix themselves between calls.
# Stop the pipeline so the failure surfaces clearly.
await self.push_error(
error_msg=(
"AWS Polly authentication failed. "
"Check AWS credentials and region. "
f"Underlying error: {error}"
),
exception=error,
fatal=True,
)
else:
yield ErrorFrame(error=f"AWS Polly TTS error: {error}")
except BotoCoreError as error:
yield ErrorFrame(error=f"AWS Polly TTS error: {error}")