Compare commits
3 Commits
kompfner-p
...
cleanup
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
646db8b9bd | ||
|
|
42c142aff0 | ||
|
|
6da78dbf9c |
@@ -5,7 +5,6 @@ import json
|
||||
from openai import AsyncAzureOpenAI
|
||||
|
||||
import os
|
||||
import requests
|
||||
|
||||
from collections.abc import AsyncGenerator
|
||||
|
||||
@@ -16,7 +15,10 @@ from PIL import Image
|
||||
from azure.cognitiveservices.speech import SpeechSynthesizer, SpeechConfig, ResultReason, CancellationReason
|
||||
|
||||
class AzureTTSService(TTSService):
|
||||
def __init__(self, speech_key=None, speech_region=None):
|
||||
|
||||
def __init__(
|
||||
self, speech_key=None, speech_region=None, voice_name="en-US-SaraNeural"
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
speech_key = speech_key or os.getenv("AZURE_SPEECH_SERVICE_KEY")
|
||||
@@ -25,11 +27,13 @@ class AzureTTSService(TTSService):
|
||||
self.speech_config = SpeechConfig(subscription=speech_key, region=speech_region)
|
||||
self.speech_synthesizer = SpeechSynthesizer(speech_config=self.speech_config, audio_config=None)
|
||||
|
||||
self.voice_name = voice_name
|
||||
|
||||
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
|
||||
self.logger.info("Running azure tts")
|
||||
ssml = "<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis' " \
|
||||
ssml = f"<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis' " \
|
||||
"xmlns:mstts='http://www.w3.org/2001/mstts'>" \
|
||||
"<voice name='en-US-SaraNeural'>" \
|
||||
f"<voice name={self.voice_name}>" \
|
||||
"<mstts:silence type='Sentenceboundary' value='20ms' />" \
|
||||
"<mstts:express-as style='lyrical' styledegree='2' role='SeniorFemale'>" \
|
||||
"<prosody rate='1.05'>" \
|
||||
@@ -92,81 +96,58 @@ class AzureLLMService(LLMService):
|
||||
|
||||
class AzureImageGenServiceREST(ImageGenService):
|
||||
|
||||
def __init__(self, image_size:str, api_key=None, azure_endpoint=None, api_version=None, model=None):
|
||||
def __init__(
|
||||
self,
|
||||
image_size: str,
|
||||
api_key: str | None = None,
|
||||
azure_endpoint: str | None = None,
|
||||
api_version: str | None = None,
|
||||
model: str | None = None,
|
||||
aiohttp_session: aiohttp.ClientSession | None=None,
|
||||
timeout_seconds=120,
|
||||
):
|
||||
super().__init__(image_size=image_size)
|
||||
self.api_key = api_key or os.getenv("AZURE_DALLE_KEY")
|
||||
self.azure_endpoint = azure_endpoint or os.getenv("AZURE_DALLE_ENDPOINT")
|
||||
self.api_version = api_version or "2023-06-01-preview"
|
||||
self.model = model or os.getenv("AZURE_DALLE_DEPLOYMENT_ID")
|
||||
self.aiohttp_session: aiohttp.ClientSession = (
|
||||
aiohttp_session or aiohttp.ClientSession()
|
||||
)
|
||||
self.timeout_seconds = timeout_seconds
|
||||
|
||||
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
|
||||
# TODO hoist the session to app-level
|
||||
async with aiohttp.ClientSession() as session:
|
||||
url = f"{self.azure_endpoint}openai/images/generations:submit?api-version={self.api_version}"
|
||||
headers= { "api-key": self.api_key, "Content-Type": "application/json" }
|
||||
body = {
|
||||
# Enter your prompt text here
|
||||
"prompt": sentence,
|
||||
"size": self.image_size,
|
||||
"n": 1,
|
||||
}
|
||||
async with session.post(url, headers=headers, json=body) as submission:
|
||||
operation_location = submission.headers['operation-location']
|
||||
url = f"{self.azure_endpoint}openai/images/generations:submit?api-version={self.api_version}"
|
||||
headers= { "api-key": self.api_key, "Content-Type": "application/json" }
|
||||
body = {
|
||||
"prompt": sentence,
|
||||
"size": self.image_size,
|
||||
"n": 1,
|
||||
}
|
||||
async with self.aiohttp_session.post(
|
||||
url, headers=headers, json=body
|
||||
) as submission:
|
||||
operation_location = submission.headers['operation-location']
|
||||
|
||||
status = ""
|
||||
attempts_left = 120
|
||||
json_response = None
|
||||
while status != "succeeded":
|
||||
attempts_left -= 1
|
||||
if attempts_left == 0:
|
||||
raise Exception("Image generation timed out")
|
||||
status = ""
|
||||
attempts_left = self.timeout_seconds
|
||||
json_response = None
|
||||
while status != "succeeded":
|
||||
attempts_left -= 1
|
||||
if attempts_left == 0:
|
||||
raise Exception("Image generation timed out")
|
||||
|
||||
await asyncio.sleep(1)
|
||||
response = await session.get(operation_location, headers=headers)
|
||||
json_response = await response.json()
|
||||
status = json_response["status"]
|
||||
await asyncio.sleep(1)
|
||||
response = await self.aiohttp_session.get(operation_location, headers=headers)
|
||||
json_response = await response.json()
|
||||
status = json_response["status"]
|
||||
|
||||
image_url = json_response["result"]["data"][0]["url"] if json_response else None
|
||||
if not image_url:
|
||||
raise Exception("Image generation failed")
|
||||
image_url = json_response["result"]["data"][0]["url"] if json_response else None
|
||||
if not image_url:
|
||||
raise Exception("Image generation failed")
|
||||
|
||||
# Load the image from the url
|
||||
async with session.get(image_url) as response:
|
||||
image_stream = io.BytesIO(await response.content.read())
|
||||
image = Image.open(image_stream)
|
||||
return (image_url, image.tobytes())
|
||||
|
||||
|
||||
class AzureImageGenService(ImageGenService):
|
||||
|
||||
def __init__(self, api_key=None, azure_endpoint=None, api_version=None, model=None):
|
||||
super().__init__()
|
||||
|
||||
api_key = api_key or os.getenv("AZURE_DALLE_KEY")
|
||||
azure_endpoint = azure_endpoint or os.getenv("AZURE_DALLE_ENDPOINT")
|
||||
api_version = api_version or "2023-06-01-preview"
|
||||
self.model = model or os.getenv("AZURE_DALLE_DEPLOYMENT_ID")
|
||||
|
||||
self.client = AzureOpenAI(
|
||||
api_key=api_key,
|
||||
azure_endpoint=azure_endpoint,
|
||||
api_version=api_version,
|
||||
)
|
||||
|
||||
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
|
||||
self.logger.info("Generating azure image", sentence)
|
||||
|
||||
image = self.client.images.generate(
|
||||
model=self.model,
|
||||
prompt=sentence,
|
||||
n=1,
|
||||
size=self.image_size,
|
||||
)
|
||||
|
||||
url = image["data"][0]["url"]
|
||||
response = requests.get(url)
|
||||
|
||||
dalle_stream = io.BytesIO(response.content)
|
||||
dalle_im = Image.open(dalle_stream.tobytes())
|
||||
|
||||
return (url, dalle_im)
|
||||
# Load the image from the url
|
||||
async with self.aiohttp_session.get(image_url) as response:
|
||||
image_stream = io.BytesIO(await response.content.read())
|
||||
image = Image.open(image_stream)
|
||||
return (image_url, image.tobytes())
|
||||
|
||||
@@ -9,28 +9,30 @@ from dailyai.services.ai_services import TTSService
|
||||
|
||||
|
||||
class ElevenLabsTTSService(TTSService):
|
||||
def __init__(self, api_key=None, voice_id=None):
|
||||
def __init__(self, api_key=None, voice_id=None, aiohttp_session:aiohttp.ClientSession=None):
|
||||
super().__init__()
|
||||
|
||||
self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
|
||||
self.voice_id = voice_id or os.getenv("ELEVENLABS_VOICE_ID")
|
||||
self.aiohttp_session = aiohttp_session or aiohttp.ClientSession()
|
||||
|
||||
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self.voice_id}/stream"
|
||||
payload = {"text": sentence, "model_id": "eleven_turbo_v2"}
|
||||
querystring = {"output_format": "pcm_16000", "optimize_streaming_latency": 2}
|
||||
headers = {
|
||||
"xi-api-key": self.api_key,
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
async with session.post(url, json=payload, headers=headers, params=querystring) as r:
|
||||
if r.status != 200:
|
||||
self.logger.error(
|
||||
f"audio fetch status code: {r.status}, error: {r.text}"
|
||||
)
|
||||
return
|
||||
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self.voice_id}/stream"
|
||||
payload = {"text": sentence, "model_id": "eleven_turbo_v2"}
|
||||
querystring = {"output_format": "pcm_16000", "optimize_streaming_latency": 2}
|
||||
headers = {
|
||||
"xi-api-key": self.api_key,
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
async with self.aiohttp_session.post(
|
||||
url, json=payload, headers=headers, params=querystring
|
||||
) as r:
|
||||
if r.status != 200:
|
||||
self.logger.error(
|
||||
f"audio fetch status code: {r.status}, error: {r.text}"
|
||||
)
|
||||
return
|
||||
|
||||
async for chunk in r.content:
|
||||
if chunk:
|
||||
yield chunk
|
||||
async for chunk in r.content:
|
||||
if chunk:
|
||||
yield chunk
|
||||
|
||||
@@ -34,9 +34,9 @@ class FalImageGenService(ImageGenService):
|
||||
raise Exception("Image generation failed")
|
||||
|
||||
return image_url
|
||||
print(f"fetching image url...")
|
||||
print("fetching image url...")
|
||||
image_url = await asyncio.to_thread(get_image_url, sentence, self.image_size)
|
||||
print(f"got image url, downloading image...")
|
||||
print("got image url, downloading image...")
|
||||
# Load the image from the url
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(image_url) as response:
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
import requests
|
||||
import aiohttp
|
||||
import asyncio
|
||||
from PIL import Image
|
||||
import io
|
||||
from openai import AsyncOpenAI
|
||||
@@ -9,7 +7,7 @@ import os
|
||||
import json
|
||||
from collections.abc import AsyncGenerator
|
||||
|
||||
from dailyai.services.ai_services import AIService, TTSService, LLMService, ImageGenService
|
||||
from dailyai.services.ai_services import LLMService, ImageGenService
|
||||
|
||||
|
||||
class OpenAILLMService(LLMService):
|
||||
@@ -50,11 +48,19 @@ class OpenAILLMService(LLMService):
|
||||
return None
|
||||
|
||||
class OpenAIImageGenService(ImageGenService):
|
||||
def __init__(self, image_size:str, api_key=None, model=None):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
image_size: str,
|
||||
api_key=None,
|
||||
model=None,
|
||||
aiohttp_session: aiohttp.ClientSession | None = None,
|
||||
):
|
||||
super().__init__(image_size=image_size)
|
||||
api_key = api_key or os.getenv("OPEN_AI_KEY")
|
||||
self.model = model or os.getenv("OPEN_AI_IMAGE_MODEL") or "dall-e-3"
|
||||
self.client = AsyncOpenAI(api_key=api_key)
|
||||
self.aiohttp_session=aiohttp_session or aiohttp.ClientSession()
|
||||
|
||||
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
|
||||
self.logger.info("Generating OpenAI image", sentence)
|
||||
@@ -70,10 +76,7 @@ class OpenAIImageGenService(ImageGenService):
|
||||
raise Exception("No image provided in response", image)
|
||||
|
||||
# Load the image from the url
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(image_url) as response:
|
||||
image_stream = io.BytesIO(await response.content.read())
|
||||
image = Image.open(image_stream)
|
||||
return (image_url, image.tobytes())
|
||||
|
||||
return (image_url, dalle_im.tobytes())
|
||||
async with self.aiohttp_session.get(image_url) as response:
|
||||
image_stream = io.BytesIO(await response.content.read())
|
||||
image = Image.open(image_stream)
|
||||
return (image_url, image.tobytes())
|
||||
|
||||
@@ -4,6 +4,7 @@ import asyncio
|
||||
from dailyai.queue_frame import TextQueueFrame
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.open_ai_services import OpenAIImageGenService
|
||||
from dailyai.services.azure_ai_services import AzureImageGenServiceREST
|
||||
|
||||
local_joined = False
|
||||
participant_joined = False
|
||||
@@ -21,7 +22,7 @@ async def main(room_url):
|
||||
transport.camera_width = 1024
|
||||
transport.camera_height = 1024
|
||||
|
||||
imagegen = OpenAIImageGenService(image_size="1024x1024")
|
||||
imagegen = AzureImageGenServiceREST(image_size="1024x1024")
|
||||
image_task = asyncio.create_task(
|
||||
imagegen.run_to_queue(transport.send_queue, [TextQueueFrame("a cat in the style of picasso")])
|
||||
)
|
||||
|
||||
@@ -2,7 +2,7 @@ import argparse
|
||||
import asyncio
|
||||
|
||||
from dailyai.queue_frame import AudioQueueFrame, ImageQueueFrame
|
||||
from dailyai.services.azure_ai_services import AzureLLMService
|
||||
from dailyai.services.azure_ai_services import AzureImageGenServiceREST, AzureLLMService
|
||||
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
|
||||
from dailyai.services.daily_transport_service import DailyTransportService
|
||||
from dailyai.services.fal_ai_services import FalImageGenService
|
||||
@@ -22,9 +22,9 @@ async def main(room_url):
|
||||
transport.camera_height = 1024
|
||||
|
||||
llm = AzureLLMService()
|
||||
dalle = FalImageGenService(image_size="1024x1024")
|
||||
#dalle = FalImageGenService(image_size="1024x1024")
|
||||
tts = ElevenLabsTTSService(voice_id="ErXwobaYiN019PkySvjV")
|
||||
# dalle = OpenAIImageGenService(image_size="1024x1024")
|
||||
dalle = AzureImageGenServiceREST(image_size="1024x1024")
|
||||
|
||||
# Get a complete audio chunk from the given text. Splitting this into its own
|
||||
# coroutine lets us ensure proper ordering of the audio chunks on the send queue.
|
||||
|
||||
Reference in New Issue
Block a user