From dfe7815dc5e91e03a211b3d0963f1caacde39ddc Mon Sep 17 00:00:00 2001 From: marcus-daily <111281783+marcus-daily@users.noreply.github.com> Date: Tue, 16 Sep 2025 15:30:06 +0100 Subject: [PATCH] Smart Turn v3: removing torch and torchaudio deps --- CHANGELOG.md | 5 +++++ pyproject.toml | 2 +- src/pipecat/audio/turn/smart_turn/local_smart_turn_v3.py | 6 +++--- uv.lock | 4 ---- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c56f58491..4365a268a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 async def on_pipeline_finished(task: PipelineTask, frame: Frame): ... ``` + +### Changed + +- `torch` and `torchaudio` are no longer required for running Smart Turn + locally. This avoids gigabytes of dependencies being installed. ### Deprecated diff --git a/pyproject.toml b/pyproject.toml index 8c86cbf1c..1317add97 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,7 +95,7 @@ sambanova = [] sarvam = [ "websockets>=13.1,<15.0" ] sentry = [ "sentry-sdk~=2.23.1" ] local-smart-turn = [ "coremltools>=8.0", "transformers", "torch>=2.5.0,<3", "torchaudio>=2.5.0,<3" ] -local-smart-turn-v3 = [ "transformers", "torch>=2.5.0,<3", "torchaudio>=2.5.0,<3", "onnxruntime>=1.20.1, <2" ] +local-smart-turn-v3 = [ "transformers", "onnxruntime>=1.20.1, <2" ] remote-smart-turn = [] silero = [ "onnxruntime>=1.20.1, <2" ] simli = [ "simli-ai~=0.1.10"] diff --git a/src/pipecat/audio/turn/smart_turn/local_smart_turn_v3.py b/src/pipecat/audio/turn/smart_turn/local_smart_turn_v3.py index e9d06ba1c..07e710da6 100644 --- a/src/pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +++ b/src/pipecat/audio/turn/smart_turn/local_smart_turn_v3.py @@ -98,15 +98,15 @@ class LocalSmartTurnAnalyzerV3(BaseSmartTurn): inputs = self._feature_extractor( audio_array, sampling_rate=16000, - return_tensors="pt", + return_tensors="np", padding="max_length", max_length=8 * 16000, truncation=True, do_normalize=True, ) - # Convert to numpy and ensure correct shape for ONNX - input_features = inputs.input_features.squeeze(0).numpy().astype(np.float32) + # Extract features and ensure correct shape for ONNX + input_features = inputs.input_features.squeeze(0).astype(np.float32) input_features = np.expand_dims(input_features, axis=0) # Add batch dimension # Run ONNX inference diff --git a/uv.lock b/uv.lock index 34ede6c3b..a629d98d4 100644 --- a/uv.lock +++ b/uv.lock @@ -4309,8 +4309,6 @@ local-smart-turn = [ ] local-smart-turn-v3 = [ { name = "onnxruntime" }, - { name = "torch" }, - { name = "torchaudio" }, { name = "transformers" }, ] mcp = [ @@ -4495,9 +4493,7 @@ requires-dist = [ { name = "tenacity", marker = "extra == 'livekit'", specifier = ">=8.2.3,<10.0.0" }, { name = "timm", marker = "extra == 'moondream'", specifier = "~=1.0.13" }, { name = "torch", marker = "extra == 'local-smart-turn'", specifier = ">=2.5.0,<3" }, - { name = "torch", marker = "extra == 'local-smart-turn-v3'", specifier = ">=2.5.0,<3" }, { name = "torchaudio", marker = "extra == 'local-smart-turn'", specifier = ">=2.5.0,<3" }, - { name = "torchaudio", marker = "extra == 'local-smart-turn-v3'", specifier = ">=2.5.0,<3" }, { name = "transformers", marker = "extra == 'local-smart-turn'" }, { name = "transformers", marker = "extra == 'local-smart-turn-v3'" }, { name = "transformers", marker = "extra == 'moondream'", specifier = ">=4.48.0" },