Catch PermissionError when NLTK data can't be downloaded

This commit is contained in:
Mark Backman
2025-10-04 08:41:32 -04:00
parent 64ceef9cf0
commit 49f44aa7c8
2 changed files with 14 additions and 1 deletions

View File

@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- `PermissionError` is now caught and logged if NLTK's `punkt_tab` tokenizer
  data can't be downloaded.
- Added `HumeTTSService` for text-to-speech synthesis using Hume AI's
expressive voice models. Provides high-quality, emotionally expressive speech
synthesis with support for various voice models. Includes example in

View File

@@ -21,13 +21,24 @@ import re
from typing import FrozenSet, Optional, Sequence, Tuple
import nltk
from loguru import logger
from nltk.tokenize import sent_tokenize
# Ensure the NLTK "punkt_tab" tokenizer data is available when this module is
# imported. A download is attempted only when the lookup below reports the
# resource as missing (LookupError).
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    # The download is attempted exactly once and always inside the guard, so
    # a filesystem failure is logged instead of propagating out of this
    # module-level setup code.
    try:
        nltk.download("punkt_tab", quiet=True)
    except OSError as e:  # PermissionError is a subclass of OSError
        logger.error(
            f"Failed to download NLTK 'punkt_tab' tokenizer data: {e}. "
            "This data is required for sentence tokenization features. "
            "The download failed due to filesystem permissions. "
            "To resolve: pre-install the data in a location with appropriate read permissions, "
            "or set the NLTK_DATA environment variable to point to a writable directory. "
            "See https://www.nltk.org/data.html for more information."
        )
SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = frozenset(
{