diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1b5561e8f..ae7e3dcb1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- `OSError` (including `PermissionError`) is now caught and logged if NLTK's
+  `punkt_tab` can't be downloaded.
+
 - Added `HumeTTSService` for text-to-speech synthesis using Hume AI's expressive
   voice models. Provides high-quality, emotionally expressive speech synthesis
   with support for various voice models. Includes example in
diff --git a/src/pipecat/utils/string.py b/src/pipecat/utils/string.py
index 592d7c2a5..c9cb05142 100644
--- a/src/pipecat/utils/string.py
+++ b/src/pipecat/utils/string.py
@@ -21,13 +21,26 @@ import re
 from typing import FrozenSet, Optional, Sequence, Tuple
 
 import nltk
+from loguru import logger
 from nltk.tokenize import sent_tokenize
 
 # Ensure punkt_tab tokenizer data is available
 try:
     nltk.data.find("tokenizers/punkt_tab")
 except LookupError:
-    nltk.download("punkt_tab", quiet=True)
+    try:
+        nltk.download("punkt_tab", quiet=True)
+    except OSError as e:
+        # PermissionError is a subclass of OSError, so one clause covers both. Log instead of
+        # raising so that importing this module still succeeds when the data can't be fetched.
+        logger.error(
+            f"Failed to download NLTK 'punkt_tab' tokenizer data: {e}. "
+            "This data is required for sentence tokenization features. "
+            "The download most likely failed due to filesystem permissions. "
+            "To resolve: pre-install the data in a location with appropriate read permissions, "
+            "or set the NLTK_DATA environment variable to point to a writable directory. "
+            "See https://www.nltk.org/data.html for more information."
+        )
 
 SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = frozenset(
     {