From ab1d2dbe6a2186f8dcd834512922e301bfafa805 Mon Sep 17 00:00:00 2001
From: Mark Backman <mark@daily.co>
Date: Wed, 25 Jun 2025 16:24:44 -0400
Subject: [PATCH] Add STTService docstrings

---
 src/pipecat/services/stt_service.py | 99 ++++++++++++++++++++++++++---
 1 file changed, 89 insertions(+), 10 deletions(-)

diff --git a/src/pipecat/services/stt_service.py b/src/pipecat/services/stt_service.py
index 5e57b3104..e659b403b 100644
--- a/src/pipecat/services/stt_service.py
+++ b/src/pipecat/services/stt_service.py
@@ -4,6 +4,8 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""Base classes for Speech-to-Text services with continuous and segmented processing."""
+
 import io
 import wave
 from abc import abstractmethod
@@ -26,7 +28,19 @@ from pipecat.transcriptions.language import Language
 
 
 class STTService(AIService):
-    """STTService is a base class for speech-to-text services."""
+    """Base class for speech-to-text services.
+
+    Provides common functionality for STT services including audio passthrough,
+    muting, settings management, and audio processing. Subclasses must implement
+    the run_stt method to provide actual speech recognition.
+
+    Args:
+        audio_passthrough: Whether to pass audio frames downstream after processing.
+            Defaults to True.
+        sample_rate: The sample rate for audio input. If None, will be determined
+            from the start frame.
+        **kwargs: Additional arguments passed to the parent AIService.
+    """
 
     def __init__(
         self,
@@ -44,25 +58,59 @@ class STTService(AIService):
 
     @property
     def is_muted(self) -> bool:
-        """Returns whether the STT service is currently muted."""
+        """Check if the STT service is currently muted.
+
+        Returns:
+            True if the service is muted and will not process audio.
+        """
         return self._muted
 
     @property
     def sample_rate(self) -> int:
+        """Get the current sample rate for audio processing.
+
+        Returns:
+            The sample rate in Hz.
+        """
         return self._sample_rate
 
     async def set_model(self, model: str):
+        """Set the speech recognition model.
+
+        Args:
+            model: The name of the model to use for speech recognition.
+        """
         self.set_model_name(model)
 
     async def set_language(self, language: Language):
+        """Set the language for speech recognition.
+
+        Args:
+            language: The language to use for speech recognition.
+        """
         pass
 
     @abstractmethod
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
-        """Returns transcript as a string"""
+        """Run speech-to-text on the provided audio data.
+
+        This method must be implemented by subclasses to provide actual speech
+        recognition functionality.
+
+        Args:
+            audio: Raw audio bytes to transcribe.
+
+        Yields:
+            Frame: Frames containing transcription results (typically TextFrame).
+        """
         pass
 
     async def start(self, frame: StartFrame):
+        """Start the STT service.
+
+        Args:
+            frame: The start frame containing initialization parameters.
+        """
         await super().start(frame)
         self._sample_rate = self._init_sample_rate or frame.audio_in_sample_rate
 
@@ -80,13 +128,24 @@ class STTService(AIService):
                 logger.warning(f"Unknown setting for STT service: {key}")
 
     async def process_audio_frame(self, frame: AudioRawFrame, direction: FrameDirection):
+        """Process an audio frame for speech recognition.
+
+        Args:
+            frame: The audio frame to process.
+            direction: The direction of frame processing.
+        """
         if self._muted:
             return
 
         await self.process_generator(self.run_stt(frame.audio))
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
-        """Processes a frame of audio data, either buffering or transcribing it."""
+        """Process a frame, handling audio frames for speech recognition.
+
+        Args:
+            frame: The frame to process.
+            direction: The direction of frame processing.
+        """
         await super().process_frame(frame, direction)
 
         if isinstance(frame, AudioRawFrame):
@@ -106,14 +165,19 @@ class STTService(AIService):
 
 
 class SegmentedSTTService(STTService):
-    """SegmentedSTTService is an STTService that uses VAD events to detect
-    speech and will run speech-to-text on speech segments only, instead of a
-    continous stream. Since it uses VAD it means that VAD needs to be enabled in
-    the pipeline.
+    """STT service that processes speech in segments using VAD events.
 
-    This service always keeps a small audio buffer to take into account that VAD
-    events are delayed from when the user speech really starts.
+    Uses Voice Activity Detection (VAD) events to detect speech segments and runs
+    speech-to-text only on those segments, rather than continuously.
 
+    Requires VAD to be enabled in the pipeline to function properly. Maintains a
+    small audio buffer to account for the delay between actual speech start and
+    VAD detection.
+
+    Args:
+        sample_rate: The sample rate for audio input. If None, will be determined
+            from the start frame.
+        **kwargs: Additional arguments passed to the parent STTService.
     """
 
     def __init__(self, *, sample_rate: Optional[int] = None, **kwargs):
@@ -125,10 +189,16 @@ class SegmentedSTTService(STTService):
         self._user_speaking = False
 
     async def start(self, frame: StartFrame):
+        """Start the segmented STT service and compute the audio buffer size.
+
+        Args:
+            frame: The start frame containing initialization parameters.
+        """
         await super().start(frame)
         self._audio_buffer_size_1s = self.sample_rate * 2
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process frames, handling VAD events and audio segmentation."""
         await super().process_frame(frame, direction)
 
         if isinstance(frame, UserStartedSpeakingFrame):
@@ -162,6 +232,15 @@ class SegmentedSTTService(STTService):
         self._audio_buffer.clear()
 
     async def process_audio_frame(self, frame: AudioRawFrame, direction: FrameDirection):
+        """Process audio frames by buffering them for segmented transcription.
+
+        Continuously buffers audio, growing the buffer while the user is speaking
+        and maintaining a small buffer when not speaking to account for VAD delay.
+
+        Args:
+            frame: The audio frame to process.
+            direction: The direction of frame processing.
+        """
         # If the user is speaking the audio buffer will keep growing.
         self._audio_buffer += frame.audio