Add input params to Azure TTS

2024-09-23 10:07:05 -04:00
parent c262b272fa
commit 8edee8155d
1 changed files with 75 additions and 38 deletions
--- a/src/pipecat/services/azure.py
+++ b/src/pipecat/services/azure.py
@@ -4,45 +4,34 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #

-import aiohttp
 import asyncio
 import io
+from typing import AsyncGenerator, Optional

+import aiohttp
+from loguru import logger
 from PIL import Image
-from typing import AsyncGenerator
+from pydantic import BaseModel

-from pipecat.frames.frames import (
-    CancelFrame,
-    EndFrame,
-    ErrorFrame,
-    Frame,
-    StartFrame,
-    TTSAudioRawFrame,
-    TTSStartedFrame,
-    TTSStoppedFrame,
-    TranscriptionFrame,
-    URLImageRawFrame,
-)
-from pipecat.metrics.metrics import TTSUsageMetricsData
-from pipecat.processors.frame_processor import FrameDirection
-from pipecat.services.ai_services import STTService, TTSService, ImageGenService
+from pipecat.frames.frames import (CancelFrame, EndFrame, ErrorFrame, Frame,
+                                   StartFrame, TranscriptionFrame,
+                                   TTSAudioRawFrame, TTSStartedFrame,
+                                   TTSStoppedFrame, URLImageRawFrame)
+from pipecat.services.ai_services import (ImageGenService, STTService,
+                                          TTSService)
 from pipecat.services.openai import BaseOpenAILLMService
 from pipecat.utils.time import time_now_iso8601

-from loguru import logger
-
 # See .env.example for Azure configuration needed
 try:
-    from openai import AsyncAzureOpenAI
-    from azure.cognitiveservices.speech import (
-        SpeechConfig,
-        SpeechRecognizer,
-        SpeechSynthesizer,
-        ResultReason,
-        CancellationReason,
-    )
-    from azure.cognitiveservices.speech.audio import AudioStreamFormat, PushAudioInputStream
+    from azure.cognitiveservices.speech import (CancellationReason,
+                                                ResultReason, SpeechConfig,
+                                                SpeechRecognizer,
+                                                SpeechSynthesizer)
+    from azure.cognitiveservices.speech.audio import (AudioStreamFormat,
+                                                      PushAudioInputStream)
    from azure.cognitiveservices.speech.dialog import AudioConfig
+    from openai import AsyncAzureOpenAI
 except ModuleNotFoundError as e:
    logger.error(f"Exception: {e}")
    logger.error(
@@ -70,6 +59,17 @@ class AzureLLMService(BaseOpenAILLMService):


 class AzureTTSService(TTSService):
+    class InputParams(BaseModel):
+        """Optional SSML settings read by ``_construct_ssml`` for each utterance."""
+        emphasis: Optional[str] = None  # level of the <emphasis> wrapper
+        language_code: Optional[str] = "en-US"  # xml:lang on <speak>
+        pitch: Optional[str] = None  # <prosody pitch=...>
+        rate: Optional[str] = "1.05"  # <prosody rate=...>
+        role: Optional[str] = None  # <mstts:express-as role=...>; only used with style
+        style: Optional[str] = None  # <mstts:express-as style=...>
+        style_degree: Optional[str] = None  # styledegree; only used with style
+        volume: Optional[str] = None  # <prosody volume=...>
+
    def __init__(
        self,
        *,
@@ -77,6 +77,7 @@ class AzureTTSService(TTSService):
        region: str,
        voice="en-US-SaraNeural",
        sample_rate: int = 16000,
+        params: InputParams = InputParams(),
        **kwargs,
    ):
        super().__init__(sample_rate=sample_rate, **kwargs)
@@ -86,10 +87,55 @@ class AzureTTSService(TTSService):

        self._voice = voice
        self._sample_rate = sample_rate
+        self._params = params

    def can_generate_metrics(self) -> bool:
        return True

+    def _construct_ssml(self, text: str) -> str:
+        """Return the SSML document that speaks ``text`` with the configured voice.
+
+        ``text`` and all attribute values are XML-escaped so characters such as
+        ``&``, ``<`` or ``'`` cannot break the markup. Optional elements are
+        emitted only when the matching ``InputParams`` fields are set.
+        """
+        from xml.sax.saxutils import escape
+
+        def attr(value: object) -> str:
+            # Attributes are single-quoted below, so quotes must be escaped too.
+            return escape(str(value), {"'": "&apos;", '"': "&quot;"})
+
+        params = self._params
+        opening = [
+            f"<speak version='1.0' xml:lang='{attr(params.language_code)}' "
+            "xmlns='http://www.w3.org/2001/10/synthesis' "
+            "xmlns:mstts='http://www.w3.org/2001/mstts'>",
+            f"<voice name='{attr(self._voice)}'>",
+            "<mstts:silence type='Sentenceboundary' value='20ms' />",
+        ]
+        closing = ["</voice></speak>"]
+
+        if params.style:
+            express = f"<mstts:express-as style='{attr(params.style)}'"
+            if params.style_degree:
+                express += f" styledegree='{attr(params.style_degree)}'"
+            if params.role:
+                express += f" role='{attr(params.role)}'"
+            opening.append(express + ">")
+            closing.insert(0, "</mstts:express-as>")
+
+        prosody = {"rate": params.rate, "pitch": params.pitch, "volume": params.volume}
+        prosody_attrs = " ".join(f"{k}='{attr(v)}'" for k, v in prosody.items() if v)
+        if prosody_attrs:  # an attribute-less <prosody> adds nothing; skip it
+            opening.append(f"<prosody {prosody_attrs}>")
+            closing.insert(0, "</prosody>")
+
+        if params.emphasis:
+            opening.append(f"<emphasis level='{attr(params.emphasis)}'>")
+            closing.insert(0, "</emphasis>")
+
+        return "".join(opening) + escape(text) + "".join(closing)
+
    async def set_voice(self, voice: str):
        logger.debug(f"Switching TTS voice to: [{voice}]")
        self._voice = voice
@@ -99,16 +145,7 @@ class AzureTTSService(TTSService):

        await self.start_ttfb_metrics()

-        ssml = (
-            "<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis' "
-            "xmlns:mstts='http://www.w3.org/2001/mstts'>"
-            f"<voice name='{self._voice}'>"
-            "<mstts:silence type='Sentenceboundary' value='20ms' />"
-            "<mstts:express-as style='lyrical' styledegree='2' role='SeniorFemale'>"
-            "<prosody rate='1.05'>"
-            f"{text}"
-            "</prosody></mstts:express-as></voice></speak> "
-        )
+        ssml = self._construct_ssml(text)

        result = await asyncio.to_thread(self._speech_synthesizer.speak_ssml, (ssml))