Compare commits

...

3 Commits

Author SHA1 Message Date
Moishe Lettvin
646db8b9bd cleanup continues 2024-01-26 07:57:41 -05:00
Moishe Lettvin
42c142aff0 ... 2024-01-25 14:55:51 -05:00
Moishe Lettvin
6da78dbf9c getting started on cleanup 2024-01-25 13:50:10 -05:00
6 changed files with 93 additions and 106 deletions

View File

@@ -5,7 +5,6 @@ import json
from openai import AsyncAzureOpenAI from openai import AsyncAzureOpenAI
import os import os
import requests
from collections.abc import AsyncGenerator from collections.abc import AsyncGenerator
@@ -16,7 +15,10 @@ from PIL import Image
from azure.cognitiveservices.speech import SpeechSynthesizer, SpeechConfig, ResultReason, CancellationReason from azure.cognitiveservices.speech import SpeechSynthesizer, SpeechConfig, ResultReason, CancellationReason
class AzureTTSService(TTSService): class AzureTTSService(TTSService):
def __init__(self, speech_key=None, speech_region=None):
def __init__(
self, speech_key=None, speech_region=None, voice_name="en-US-SaraNeural"
):
super().__init__() super().__init__()
speech_key = speech_key or os.getenv("AZURE_SPEECH_SERVICE_KEY") speech_key = speech_key or os.getenv("AZURE_SPEECH_SERVICE_KEY")
@@ -25,11 +27,13 @@ class AzureTTSService(TTSService):
self.speech_config = SpeechConfig(subscription=speech_key, region=speech_region) self.speech_config = SpeechConfig(subscription=speech_key, region=speech_region)
self.speech_synthesizer = SpeechSynthesizer(speech_config=self.speech_config, audio_config=None) self.speech_synthesizer = SpeechSynthesizer(speech_config=self.speech_config, audio_config=None)
self.voice_name = voice_name
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]: async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
self.logger.info("Running azure tts") self.logger.info("Running azure tts")
ssml = "<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis' " \ ssml = f"<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis' " \
"xmlns:mstts='http://www.w3.org/2001/mstts'>" \ "xmlns:mstts='http://www.w3.org/2001/mstts'>" \
"<voice name='en-US-SaraNeural'>" \ f"<voice name={self.voice_name}>" \
"<mstts:silence type='Sentenceboundary' value='20ms' />" \ "<mstts:silence type='Sentenceboundary' value='20ms' />" \
"<mstts:express-as style='lyrical' styledegree='2' role='SeniorFemale'>" \ "<mstts:express-as style='lyrical' styledegree='2' role='SeniorFemale'>" \
"<prosody rate='1.05'>" \ "<prosody rate='1.05'>" \
@@ -92,81 +96,58 @@ class AzureLLMService(LLMService):
class AzureImageGenServiceREST(ImageGenService):
    """Image generation through the Azure OpenAI REST image endpoint.

    A generation request is submitted, the operation URL returned in the
    ``operation-location`` response header is polled once per second until
    it reports ``succeeded``, and the resulting image is then downloaded.
    """

    # Operation states from which polling can never reach "succeeded".
    # NOTE(review): taken from the Azure OpenAI image-generation operation
    # status values -- confirm against the API version in use.
    _TERMINAL_FAILURE_STATES = frozenset({"failed", "canceled", "deleted"})

    def __init__(
        self,
        image_size: str,
        api_key: str | None = None,
        azure_endpoint: str | None = None,
        api_version: str | None = None,
        model: str | None = None,
        aiohttp_session: aiohttp.ClientSession | None = None,
        timeout_seconds=120,
    ):
        """Configure the service.

        Args:
            image_size: Size string forwarded to the base class and sent as
                the ``size`` field of the generation request.
            api_key: API key; falls back to the ``AZURE_DALLE_KEY``
                environment variable.
            azure_endpoint: Endpoint prefix; falls back to
                ``AZURE_DALLE_ENDPOINT``. The request URL is built by plain
                concatenation, so the value must end with ``/``.
            api_version: REST API version; defaults to
                ``"2023-06-01-preview"``.
            model: Deployment id; falls back to
                ``AZURE_DALLE_DEPLOYMENT_ID``.
            aiohttp_session: Session used for every HTTP call. When omitted
                a new ``aiohttp.ClientSession`` is created here; nothing in
                this class closes it.
            timeout_seconds: Maximum number of one-second polling attempts
                before giving up.
        """
        super().__init__(image_size=image_size)

        self.api_key = api_key or os.getenv("AZURE_DALLE_KEY")
        self.azure_endpoint = azure_endpoint or os.getenv("AZURE_DALLE_ENDPOINT")
        self.api_version = api_version or "2023-06-01-preview"
        self.model = model or os.getenv("AZURE_DALLE_DEPLOYMENT_ID")
        self.aiohttp_session: aiohttp.ClientSession = (
            aiohttp_session or aiohttp.ClientSession()
        )
        self.timeout_seconds = timeout_seconds

    async def run_image_gen(self, sentence) -> tuple[str, bytes]:
        """Generate one image for ``sentence``.

        Returns:
            ``(image_url, data)`` where ``data`` is the result of PIL's
            ``Image.tobytes()`` on the downloaded image.

        Raises:
            Exception: if the submission returns no ``operation-location``
                header, the operation ends in a failure state, polling
                exceeds ``timeout_seconds`` attempts, or the final response
                carries no image URL.
        """
        url = f"{self.azure_endpoint}openai/images/generations:submit?api-version={self.api_version}"
        headers = {"api-key": self.api_key, "Content-Type": "application/json"}
        body = {
            "prompt": sentence,
            "size": self.image_size,
            "n": 1,
        }

        async with self.aiohttp_session.post(
            url, headers=headers, json=body
        ) as submission:
            # A rejected submission carries no operation URL; report the HTTP
            # status instead of failing with an opaque KeyError.
            operation_location = submission.headers.get("operation-location")
            if not operation_location:
                raise Exception(
                    f"Image generation submission failed with HTTP status {submission.status}"
                )

        status = ""
        json_response = None
        attempts_left = self.timeout_seconds
        while status != "succeeded":
            # "<=" (rather than "== 0") so a zero or negative budget times
            # out immediately instead of polling forever.
            if attempts_left <= 0:
                raise Exception("Image generation timed out")
            attempts_left -= 1

            await asyncio.sleep(1)
            # The context manager releases the connection back to the pool
            # after every poll.
            async with self.aiohttp_session.get(
                operation_location, headers=headers
            ) as response:
                json_response = await response.json()
            status = json_response["status"]

            # Stop right away on a terminal failure rather than polling
            # until the timeout.
            if status in self._TERMINAL_FAILURE_STATES:
                raise Exception(
                    f"Image generation failed (operation status: {status})"
                )

        image_url = json_response["result"]["data"][0]["url"] if json_response else None
        if not image_url:
            raise Exception("Image generation failed")

        # Load the image from the url
        async with self.aiohttp_session.get(image_url) as response:
            image_stream = io.BytesIO(await response.content.read())
            image = Image.open(image_stream)
            return (image_url, image.tobytes())
class AzureImageGenService(ImageGenService):
def __init__(self, api_key=None, azure_endpoint=None, api_version=None, model=None):
super().__init__()
api_key = api_key or os.getenv("AZURE_DALLE_KEY")
azure_endpoint = azure_endpoint or os.getenv("AZURE_DALLE_ENDPOINT")
api_version = api_version or "2023-06-01-preview"
self.model = model or os.getenv("AZURE_DALLE_DEPLOYMENT_ID")
self.client = AzureOpenAI(
api_key=api_key,
azure_endpoint=azure_endpoint,
api_version=api_version,
)
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
self.logger.info("Generating azure image", sentence)
image = self.client.images.generate(
model=self.model,
prompt=sentence,
n=1,
size=self.image_size,
)
url = image["data"][0]["url"]
response = requests.get(url)
dalle_stream = io.BytesIO(response.content)
dalle_im = Image.open(dalle_stream.tobytes())
return (url, dalle_im)

View File

@@ -9,28 +9,30 @@ from dailyai.services.ai_services import TTSService
class ElevenLabsTTSService(TTSService):
    """Text-to-speech service that streams audio from the ElevenLabs HTTP API."""

    def __init__(
        self,
        api_key=None,
        voice_id=None,
        aiohttp_session: aiohttp.ClientSession = None,
    ):
        """Configure the service.

        Args:
            api_key: ElevenLabs API key; falls back to the
                ``ELEVENLABS_API_KEY`` environment variable.
            voice_id: Voice to synthesize with; falls back to
                ``ELEVENLABS_VOICE_ID``.
            aiohttp_session: Session used for the HTTP request. When omitted
                a new ``aiohttp.ClientSession`` is created here; nothing in
                this class closes it.
        """
        super().__init__()

        self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
        self.voice_id = voice_id or os.getenv("ELEVENLABS_VOICE_ID")
        self.aiohttp_session = aiohttp_session or aiohttp.ClientSession()

    async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
        """Stream synthesized audio for ``sentence``.

        Posts the text to the ElevenLabs streaming endpoint, requesting the
        ``pcm_16000`` output format, and yields each non-empty chunk of the
        response body as it arrives. On a non-200 response the error is
        logged and the generator ends without yielding anything.
        """
        url = f"https://api.elevenlabs.io/v1/text-to-speech/{self.voice_id}/stream"
        payload = {"text": sentence, "model_id": "eleven_turbo_v2"}
        querystring = {"output_format": "pcm_16000", "optimize_streaming_latency": 2}
        headers = {
            "xi-api-key": self.api_key,
            "Content-Type": "application/json",
        }

        async with self.aiohttp_session.post(
            url, json=payload, headers=headers, params=querystring
        ) as r:
            if r.status != 200:
                # ClientResponse.text is a coroutine method; it has to be
                # awaited, otherwise the log shows a bound-method repr
                # instead of the error body.
                error_text = await r.text()
                self.logger.error(
                    f"audio fetch status code: {r.status}, error: {error_text}"
                )
                return

            async for chunk in r.content:
                if chunk:
                    yield chunk

View File

@@ -34,9 +34,9 @@ class FalImageGenService(ImageGenService):
raise Exception("Image generation failed") raise Exception("Image generation failed")
return image_url return image_url
print(f"fetching image url...") print("fetching image url...")
image_url = await asyncio.to_thread(get_image_url, sentence, self.image_size) image_url = await asyncio.to_thread(get_image_url, sentence, self.image_size)
print(f"got image url, downloading image...") print("got image url, downloading image...")
# Load the image from the url # Load the image from the url
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession() as session:
async with session.get(image_url) as response: async with session.get(image_url) as response:

View File

@@ -1,6 +1,4 @@
import requests
import aiohttp import aiohttp
import asyncio
from PIL import Image from PIL import Image
import io import io
from openai import AsyncOpenAI from openai import AsyncOpenAI
@@ -9,7 +7,7 @@ import os
import json import json
from collections.abc import AsyncGenerator from collections.abc import AsyncGenerator
from dailyai.services.ai_services import AIService, TTSService, LLMService, ImageGenService from dailyai.services.ai_services import LLMService, ImageGenService
class OpenAILLMService(LLMService): class OpenAILLMService(LLMService):
@@ -50,11 +48,19 @@ class OpenAILLMService(LLMService):
return None return None
class OpenAIImageGenService(ImageGenService): class OpenAIImageGenService(ImageGenService):
def __init__(self, image_size:str, api_key=None, model=None):
def __init__(
self,
image_size: str,
api_key=None,
model=None,
aiohttp_session: aiohttp.ClientSession | None = None,
):
super().__init__(image_size=image_size) super().__init__(image_size=image_size)
api_key = api_key or os.getenv("OPEN_AI_KEY") api_key = api_key or os.getenv("OPEN_AI_KEY")
self.model = model or os.getenv("OPEN_AI_IMAGE_MODEL") or "dall-e-3" self.model = model or os.getenv("OPEN_AI_IMAGE_MODEL") or "dall-e-3"
self.client = AsyncOpenAI(api_key=api_key) self.client = AsyncOpenAI(api_key=api_key)
self.aiohttp_session=aiohttp_session or aiohttp.ClientSession()
async def run_image_gen(self, sentence) -> tuple[str, bytes]: async def run_image_gen(self, sentence) -> tuple[str, bytes]:
self.logger.info("Generating OpenAI image", sentence) self.logger.info("Generating OpenAI image", sentence)
@@ -70,10 +76,7 @@ class OpenAIImageGenService(ImageGenService):
raise Exception("No image provided in response", image) raise Exception("No image provided in response", image)
# Load the image from the url # Load the image from the url
async with aiohttp.ClientSession() as session: async with self.aiohttp_session.get(image_url) as response:
async with session.get(image_url) as response: image_stream = io.BytesIO(await response.content.read())
image_stream = io.BytesIO(await response.content.read()) image = Image.open(image_stream)
image = Image.open(image_stream) return (image_url, image.tobytes())
return (image_url, image.tobytes())
return (image_url, dalle_im.tobytes())

View File

@@ -4,6 +4,7 @@ import asyncio
from dailyai.queue_frame import TextQueueFrame from dailyai.queue_frame import TextQueueFrame
from dailyai.services.daily_transport_service import DailyTransportService from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.open_ai_services import OpenAIImageGenService from dailyai.services.open_ai_services import OpenAIImageGenService
from dailyai.services.azure_ai_services import AzureImageGenServiceREST
local_joined = False local_joined = False
participant_joined = False participant_joined = False
@@ -21,7 +22,7 @@ async def main(room_url):
transport.camera_width = 1024 transport.camera_width = 1024
transport.camera_height = 1024 transport.camera_height = 1024
imagegen = OpenAIImageGenService(image_size="1024x1024") imagegen = AzureImageGenServiceREST(image_size="1024x1024")
image_task = asyncio.create_task( image_task = asyncio.create_task(
imagegen.run_to_queue(transport.send_queue, [TextQueueFrame("a cat in the style of picasso")]) imagegen.run_to_queue(transport.send_queue, [TextQueueFrame("a cat in the style of picasso")])
) )

View File

@@ -2,7 +2,7 @@ import argparse
import asyncio import asyncio
from dailyai.queue_frame import AudioQueueFrame, ImageQueueFrame from dailyai.queue_frame import AudioQueueFrame, ImageQueueFrame
from dailyai.services.azure_ai_services import AzureLLMService from dailyai.services.azure_ai_services import AzureImageGenServiceREST, AzureLLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.daily_transport_service import DailyTransportService from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.fal_ai_services import FalImageGenService from dailyai.services.fal_ai_services import FalImageGenService
@@ -22,9 +22,9 @@ async def main(room_url):
transport.camera_height = 1024 transport.camera_height = 1024
llm = AzureLLMService() llm = AzureLLMService()
dalle = FalImageGenService(image_size="1024x1024") #dalle = FalImageGenService(image_size="1024x1024")
tts = ElevenLabsTTSService(voice_id="ErXwobaYiN019PkySvjV") tts = ElevenLabsTTSService(voice_id="ErXwobaYiN019PkySvjV")
# dalle = OpenAIImageGenService(image_size="1024x1024") dalle = AzureImageGenServiceREST(image_size="1024x1024")
# Get a complete audio chunk from the given text. Splitting this into its own # Get a complete audio chunk from the given text. Splitting this into its own
# coroutine lets us ensure proper ordering of the audio chunks on the send queue. # coroutine lets us ensure proper ordering of the audio chunks on the send queue.