"""Shared text chunking helpers for streaming TTS."""

from typing import Optional


def is_non_sentence_period(text: str, idx: int) -> bool:
    """Check whether '.' should NOT be treated as a sentence delimiter.

    Two patterns are recognised:

    * a period with a digit on both sides (``1.2``, ``v1.2.3``);
    * a period that directly follows the word ``no`` (any letter case) and
      is followed, after optional whitespace, by a digit (``No.1``,
      ``No. 1``).

    Args:
        text: The text being scanned.
        idx: Index of the character to inspect.

    Returns:
        True when ``text[idx]`` is a period matching one of the patterns
        above. False otherwise, including when ``idx`` is out of range or
        ``text[idx]`` is not a period.
    """
    if idx < 0 or idx >= len(text) or text[idx] != ".":
        return False

    # Decimal/version segment: 1.2, v1.2.3
    if 0 < idx < len(text) - 1 and text[idx - 1].isdigit() and text[idx + 1].isdigit():
        return True

    # Number abbreviations: No.1 / No. 1
    # Walk left over the run of letters that ends right before the period so
    # the whole word is compared ("casino." must not match "no").
    left_start = idx - 1
    while left_start >= 0 and text[left_start].isalpha():
        left_start -= 1
    left_token = text[left_start + 1:idx].lower()
    if left_token == "no":
        j = idx + 1
        while j < len(text) and text[j].isspace():
            j += 1
        # Nothing after the period (j == len(text)) is not an abbreviation.
        if j < len(text) and text[j].isdigit():
            return True

    return False


def has_spoken_content(text: str) -> bool:
    """Check whether text contains pronounceable content (not punctuation-only).

    Returns:
        True if at least one character of ``text`` is alphanumeric.
    """
    return any(char.isalnum() for char in text)


def extract_tts_sentence(
    text_buffer: str,
    *,
    end_chars: frozenset[str],
    trailing_chars: frozenset[str],
    closers: frozenset[str],
    min_split_spoken_chars: int = 0,
    hold_trailing_at_buffer_end: bool = False,
    force: bool = False,
) -> Optional[tuple[str, str]]:
    """Extract one TTS sentence from text buffer.

    The buffer is scanned for the first character in ``end_chars``; periods
    rejected by :func:`is_non_sentence_period` are skipped. The cut point is
    then extended over a run of ``trailing_chars`` followed by a run of
    ``closers``.

    Args:
        text_buffer: Text accumulated so far.
        end_chars: Characters that terminate a sentence.
        trailing_chars: Characters absorbed into the sentence when they
            immediately follow the terminator.
        closers: Characters absorbed into the sentence when they follow the
            terminator and any trailing characters.
        min_split_spoken_chars: When greater than 0, a candidate sentence
            that has at least one but fewer than this many alphanumeric
            characters is not split off while more text follows it; the
            search continues at the next terminator, so the short fragment
            ends up inside a longer sentence.
        hold_trailing_at_buffer_end: When True, a candidate whose cut point
            is the very end of the buffer is not returned.
        force: Disables both ``hold_trailing_at_buffer_end`` and the
            ``min_split_spoken_chars`` rule. It does not make the function
            return text that contains no terminator.

    Returns:
        ``(sentence, remainder)`` where ``sentence`` is the extracted text
        with surrounding whitespace stripped and ``remainder`` is the
        untouched rest of the buffer. None when the buffer is empty, when no
        usable terminator is found, or when the candidate is held back by
        ``hold_trailing_at_buffer_end``.
    """
    if not text_buffer:
        return None

    search_start = 0
    while True:
        split_idx = -1
        for idx in range(search_start, len(text_buffer)):
            char = text_buffer[idx]
            if char == "." and is_non_sentence_period(text_buffer, idx):
                continue
            if char in end_chars:
                split_idx = idx
                break

        if split_idx == -1:
            return None

        # Absorb punctuation that belongs to the sentence just ended:
        # first the trailing run, then the closers run.
        end_idx = split_idx + 1
        while end_idx < len(text_buffer) and text_buffer[end_idx] in trailing_chars:
            end_idx += 1
        while end_idx < len(text_buffer) and text_buffer[end_idx] in closers:
            end_idx += 1

        # The terminator run reaches the end of the buffer: hold it back
        # when asked to, unless the caller forces extraction.
        if hold_trailing_at_buffer_end and not force and end_idx >= len(text_buffer):
            return None

        # The sentence always starts at the beginning of the buffer, so any
        # short fragments skipped on earlier iterations are included.
        sentence = text_buffer[:end_idx].strip()
        spoken_chars = sum(1 for ch in sentence if ch.isalnum())
        if (
            not force
            and min_split_spoken_chars > 0
            and 0 < spoken_chars < min_split_spoken_chars
            and end_idx < len(text_buffer)
        ):
            # Too short to split off on its own: look for the next terminator.
            search_start = end_idx
            continue

        remainder = text_buffer[end_idx:]
        return sentence, remainder