Compare commits

...

3 Commits

Author SHA1 Message Date
Moishe Lettvin
646db8b9bd cleanup continues 2024-01-26 07:57:41 -05:00
Moishe Lettvin
42c142aff0 ... 2024-01-25 14:55:51 -05:00
Moishe Lettvin
6da78dbf9c getting started on cleanup 2024-01-25 13:50:10 -05:00
6 changed files with 93 additions and 106 deletions

View File

@@ -5,7 +5,6 @@ import json
from openai import AsyncAzureOpenAI
import os
import requests
from collections.abc import AsyncGenerator
@@ -16,7 +15,10 @@ from PIL import Image
from azure.cognitiveservices.speech import SpeechSynthesizer, SpeechConfig, ResultReason, CancellationReason
class AzureTTSService(TTSService):
def __init__(self, speech_key=None, speech_region=None):
def __init__(
self, speech_key=None, speech_region=None, voice_name="en-US-SaraNeural"
):
super().__init__()
speech_key = speech_key or os.getenv("AZURE_SPEECH_SERVICE_KEY")
@@ -25,11 +27,13 @@ class AzureTTSService(TTSService):
self.speech_config = SpeechConfig(subscription=speech_key, region=speech_region)
self.speech_synthesizer = SpeechSynthesizer(speech_config=self.speech_config, audio_config=None)
self.voice_name = voice_name
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
self.logger.info("Running azure tts")
ssml = "<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis' " \
ssml = f"<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis' " \
"xmlns:mstts='http://www.w3.org/2001/mstts'>" \
"<voice name='en-US-SaraNeural'>" \
f"<voice name={self.voice_name}>" \
"<mstts:silence type='Sentenceboundary' value='20ms' />" \
"<mstts:express-as style='lyrical' styledegree='2' role='SeniorFemale'>" \
"<prosody rate='1.05'>" \
@@ -92,81 +96,58 @@ class AzureLLMService(LLMService):
class AzureImageGenServiceREST(ImageGenService):
def __init__(self, image_size:str, api_key=None, azure_endpoint=None, api_version=None, model=None):
def __init__(
self,
image_size: str,
api_key: str | None = None,
azure_endpoint: str | None = None,
api_version: str | None = None,
model: str | None = None,
aiohttp_session: aiohttp.ClientSession | None=None,
timeout_seconds=120,
):
super().__init__(image_size=image_size)
self.api_key = api_key or os.getenv("AZURE_DALLE_KEY")
self.azure_endpoint = azure_endpoint or os.getenv("AZURE_DALLE_ENDPOINT")
self.api_version = api_version or "2023-06-01-preview"
self.model = model or os.getenv("AZURE_DALLE_DEPLOYMENT_ID")
self.aiohttp_session: aiohttp.ClientSession = (
aiohttp_session or aiohttp.ClientSession()
)
self.timeout_seconds = timeout_seconds
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
# TODO hoist the session to app-level
async with aiohttp.ClientSession() as session:
url = f"{self.azure_endpoint}openai/images/generations:submit?api-version={self.api_version}"
headers= { "api-key": self.api_key, "Content-Type": "application/json" }
body = {
# Enter your prompt text here
"prompt": sentence,
"size": self.image_size,
"n": 1,
}
async with session.post(url, headers=headers, json=body) as submission:
operation_location = submission.headers['operation-location']
url = f"{self.azure_endpoint}openai/images/generations:submit?api-version={self.api_version}"
headers= { "api-key": self.api_key, "Content-Type": "application/json" }
body = {
"prompt": sentence,
"size": self.image_size,
"n": 1,
}
async with self.aiohttp_session.post(
url, headers=headers, json=body
) as submission:
operation_location = submission.headers['operation-location']
status = ""
attempts_left = 120
json_response = None
while status != "succeeded":
attempts_left -= 1
if attempts_left == 0:
raise Exception("Image generation timed out")
status = ""
attempts_left = self.timeout_seconds
json_response = None
while status != "succeeded":
attempts_left -= 1
if attempts_left == 0:
raise Exception("Image generation timed out")
await asyncio.sleep(1)
response = await session.get(operation_location, headers=headers)
json_response = await response.json()
status = json_response["status"]
await asyncio.sleep(1)
response = await self.aiohttp_session.get(operation_location, headers=headers)
json_response = await response.json()
status = json_response["status"]
image_url = json_response["result"]["data"][0]["url"] if json_response else None
if not image_url:
raise Exception("Image generation failed")
image_url = json_response["result"]["data"][0]["url"] if json_response else None
if not image_url:
raise Exception("Image generation failed")
# Load the image from the url
async with session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
return (image_url, image.tobytes())
class AzureImageGenService(ImageGenService):
def __init__(self, api_key=None, azure_endpoint=None, api_version=None, model=None):
super().__init__()
api_key = api_key or os.getenv("AZURE_DALLE_KEY")
azure_endpoint = azure_endpoint or os.getenv("AZURE_DALLE_ENDPOINT")
api_version = api_version or "2023-06-01-preview"
self.model = model or os.getenv("AZURE_DALLE_DEPLOYMENT_ID")
self.client = AzureOpenAI(
api_key=api_key,
azure_endpoint=azure_endpoint,
api_version=api_version,
)
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
self.logger.info("Generating azure image", sentence)
image = self.client.images.generate(
model=self.model,
prompt=sentence,
n=1,
size=self.image_size,
)
url = image["data"][0]["url"]
response = requests.get(url)
dalle_stream = io.BytesIO(response.content)
dalle_im = Image.open(dalle_stream.tobytes())
return (url, dalle_im)
# Load the image from the url
async with self.aiohttp_session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
return (image_url, image.tobytes())

View File

@@ -9,28 +9,30 @@ from dailyai.services.ai_services import TTSService
class ElevenLabsTTSService(TTSService):
def __init__(self, api_key=None, voice_id=None):
def __init__(self, api_key=None, voice_id=None, aiohttp_session:aiohttp.ClientSession=None):
super().__init__()
self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
self.voice_id = voice_id or os.getenv("ELEVENLABS_VOICE_ID")
self.aiohttp_session = aiohttp_session or aiohttp.ClientSession()
async def run_tts(self, sentence) -> AsyncGenerator[bytes, None]:
async with aiohttp.ClientSession() as session:
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self.voice_id}/stream"
payload = {"text": sentence, "model_id": "eleven_turbo_v2"}
querystring = {"output_format": "pcm_16000", "optimize_streaming_latency": 2}
headers = {
"xi-api-key": self.api_key,
"Content-Type": "application/json",
}
async with session.post(url, json=payload, headers=headers, params=querystring) as r:
if r.status != 200:
self.logger.error(
f"audio fetch status code: {r.status}, error: {r.text}"
)
return
url = f"https://api.elevenlabs.io/v1/text-to-speech/{self.voice_id}/stream"
payload = {"text": sentence, "model_id": "eleven_turbo_v2"}
querystring = {"output_format": "pcm_16000", "optimize_streaming_latency": 2}
headers = {
"xi-api-key": self.api_key,
"Content-Type": "application/json",
}
async with self.aiohttp_session.post(
url, json=payload, headers=headers, params=querystring
) as r:
if r.status != 200:
self.logger.error(
f"audio fetch status code: {r.status}, error: {r.text}"
)
return
async for chunk in r.content:
if chunk:
yield chunk
async for chunk in r.content:
if chunk:
yield chunk

View File

@@ -34,9 +34,9 @@ class FalImageGenService(ImageGenService):
raise Exception("Image generation failed")
return image_url
print(f"fetching image url...")
print("fetching image url...")
image_url = await asyncio.to_thread(get_image_url, sentence, self.image_size)
print(f"got image url, downloading image...")
print("got image url, downloading image...")
# Load the image from the url
async with aiohttp.ClientSession() as session:
async with session.get(image_url) as response:

View File

@@ -1,6 +1,4 @@
import requests
import aiohttp
import asyncio
from PIL import Image
import io
from openai import AsyncOpenAI
@@ -9,7 +7,7 @@ import os
import json
from collections.abc import AsyncGenerator
from dailyai.services.ai_services import AIService, TTSService, LLMService, ImageGenService
from dailyai.services.ai_services import LLMService, ImageGenService
class OpenAILLMService(LLMService):
@@ -50,11 +48,19 @@ class OpenAILLMService(LLMService):
return None
class OpenAIImageGenService(ImageGenService):
def __init__(self, image_size:str, api_key=None, model=None):
def __init__(
self,
image_size: str,
api_key=None,
model=None,
aiohttp_session: aiohttp.ClientSession | None = None,
):
super().__init__(image_size=image_size)
api_key = api_key or os.getenv("OPEN_AI_KEY")
self.model = model or os.getenv("OPEN_AI_IMAGE_MODEL") or "dall-e-3"
self.client = AsyncOpenAI(api_key=api_key)
self.aiohttp_session=aiohttp_session or aiohttp.ClientSession()
async def run_image_gen(self, sentence) -> tuple[str, bytes]:
self.logger.info("Generating OpenAI image", sentence)
@@ -70,10 +76,7 @@ class OpenAIImageGenService(ImageGenService):
raise Exception("No image provided in response", image)
# Load the image from the url
async with aiohttp.ClientSession() as session:
async with session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
return (image_url, image.tobytes())
return (image_url, dalle_im.tobytes())
async with self.aiohttp_session.get(image_url) as response:
image_stream = io.BytesIO(await response.content.read())
image = Image.open(image_stream)
return (image_url, image.tobytes())

View File

@@ -4,6 +4,7 @@ import asyncio
from dailyai.queue_frame import TextQueueFrame
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.open_ai_services import OpenAIImageGenService
from dailyai.services.azure_ai_services import AzureImageGenServiceREST
local_joined = False
participant_joined = False
@@ -21,7 +22,7 @@ async def main(room_url):
transport.camera_width = 1024
transport.camera_height = 1024
imagegen = OpenAIImageGenService(image_size="1024x1024")
imagegen = AzureImageGenServiceREST(image_size="1024x1024")
image_task = asyncio.create_task(
imagegen.run_to_queue(transport.send_queue, [TextQueueFrame("a cat in the style of picasso")])
)

View File

@@ -2,7 +2,7 @@ import argparse
import asyncio
from dailyai.queue_frame import AudioQueueFrame, ImageQueueFrame
from dailyai.services.azure_ai_services import AzureLLMService
from dailyai.services.azure_ai_services import AzureImageGenServiceREST, AzureLLMService
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.daily_transport_service import DailyTransportService
from dailyai.services.fal_ai_services import FalImageGenService
@@ -22,9 +22,9 @@ async def main(room_url):
transport.camera_height = 1024
llm = AzureLLMService()
dalle = FalImageGenService(image_size="1024x1024")
#dalle = FalImageGenService(image_size="1024x1024")
tts = ElevenLabsTTSService(voice_id="ErXwobaYiN019PkySvjV")
# dalle = OpenAIImageGenService(image_size="1024x1024")
dalle = AzureImageGenServiceREST(image_size="1024x1024")
# Get a complete audio chunk from the given text. Splitting this into its own
# coroutine lets us ensure proper ordering of the audio chunks on the send queue.