Catch PermissionError when NLTK data can't be downloaded

2025-10-04 08:41:32 -04:00
parent 64ceef9cf0
commit 49f44aa7c8
2 changed files with 14 additions and 1 deletion
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Added

+- `PermissionError` (or any other `OSError`) is now caught and logged if NLTK's `punkt_tab` tokenizer data can't be downloaded.
+
 - Added `HumeTTSService` for text-to-speech synthesis using Hume AI's
  expressive voice models. Provides high-quality, emotionally expressive speech
  synthesis with support for various voice models. Includes example in
--- a/src/pipecat/utils/string.py
+++ b/src/pipecat/utils/string.py
@@ -21,13 +21,24 @@ import re
 from typing import FrozenSet, Optional, Sequence, Tuple

 import nltk
+from loguru import logger
 from nltk.tokenize import sent_tokenize

 # Ensure punkt_tab tokenizer data is available
 try:
    nltk.data.find("tokenizers/punkt_tab")
 except LookupError:
-    nltk.download("punkt_tab", quiet=True)
+    try:
+        nltk.download("punkt_tab", quiet=True)
+    except (OSError, PermissionError) as e:
+        logger.error(
+            f"Failed to download NLTK 'punkt_tab' tokenizer data: {e}. "
+            "This data is required for sentence tokenization features. "
+            "The download failed due to filesystem permissions. "
+            "To resolve: pre-install the data in a location with appropriate read permissions, "
+            "or set the NLTK_DATA environment variable to point to a writable directory. "
+            "See https://www.nltk.org/data.html for more information."
+        )

 SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = frozenset(
    {