feat: Enhance text-to-mic CLI and GUI with model selection and Edge-TTS support
This commit introduces significant improvements to the text-to-mic functionality, including:
- Added command-line argument parsing for better user interaction.
- Added support for Edge-TTS, allowing audio generation without an API key.
- Dynamic model and voice selection based on user input and environment variables.
- Improved error handling and user feedback for audio device selection.
- Updated default voice settings based on the selected TTS model.
- Removed unused compiled Python files from the repository.

Changes:
- text-to-mic-cli.py: Implemented argument parsing and client initialization logic.
- utils/text_to_mic.py: Integrated Edge-TTS audio generation and updated voice selection logic.
- Cleaned up unnecessary compiled files in the utils/__pycache__ directory.

This update enhances usability and flexibility for users leveraging different TTS models.
This commit is contained in:
@@ -5,19 +5,58 @@ import wave
|
||||
import threading
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
import argparse
|
||||
|
||||
# Load environment variables from .env file
|
||||
load_dotenv()
|
||||
|
||||
# Set up your OpenAI API key from the environment variable
|
||||
api_key = os.getenv('OPENAI_API_KEY')
|
||||
# Default API config (will be overridden by CLI args if provided)
|
||||
api_key = os.getenv('OPENAI_API_KEY', '')
|
||||
api_base_url = os.getenv('OPENAI_API_BASE_URL', '').strip()
|
||||
|
||||
# Create client with custom base URL if provided
|
||||
if api_base_url:
|
||||
client = OpenAI(api_key=api_key, base_url=api_base_url)
|
||||
else:
|
||||
client = OpenAI(api_key=api_key)
|
||||
# Client will be created after args are parsed
|
||||
client = None
|
||||
|
||||
def get_client(key=None, base_url=None):
    """Create an OpenAI client from explicit arguments or the module defaults.

    Args:
        key: API key. When empty or whitespace-only, the module-level
            ``api_key`` is used instead.
        base_url: API base URL. When empty or whitespace-only, the
            module-level ``api_base_url`` is used instead.

    Returns:
        A new ``OpenAI`` client. A fresh instance is built on every call;
        the module-level ``client`` variable is not read or written here.

    Raises:
        ValueError: If no non-blank API key is available from either source.
    """
    # Strip both values: keys copied into a .env file or a shell often carry
    # stray whitespace, and the base URL was already being stripped.
    effective_key = (key or "").strip() or (api_key or "").strip()
    effective_base = (base_url or "").strip() or (api_base_url or "").strip()

    if not effective_key:
        raise ValueError("API key is required. Set OPENAI_API_KEY env var or use --api-key")

    if effective_base:
        return OpenAI(api_key=effective_key, base_url=effective_base)
    return OpenAI(api_key=effective_key)
|
||||
|
||||
# Model name aliases (keys are lower-case; lookups are case-insensitive)
MODEL_ALIASES = {
    'cosyvoice2': 'FunAudioLLM/CosyVoice2-0.5B',
    'cosyvoice': 'FunAudioLLM/CosyVoice2-0.5B',
    'tts-1': 'tts-1',
    'tts-1-hd': 'tts-1-hd',
    'gpt-4o-mini-tts': 'gpt-4o-mini-tts',
}


def resolve_model(model_input):
    """Resolve a model alias to its full model name.

    Args:
        model_input: Alias or full model name, or ``None``.

    Returns:
        The full model name for a known alias (matched case-insensitively),
        the input with surrounding whitespace removed when it is not an
        alias, or ``None`` when the input is empty or whitespace-only.
    """
    if not model_input:
        return None

    # Surrounding whitespace would make the alias lookup miss and would
    # otherwise be passed through as part of the model name.
    candidate = model_input.strip()
    if not candidate:
        return None
    return MODEL_ALIASES.get(candidate.lower(), candidate)
|
||||
|
||||
# Check if using SiliconFlow
def is_siliconflow(base_url=None):
    """Report whether an API base URL points at SiliconFlow.

    Args:
        base_url: URL to inspect. When ``None``, the module-level
            ``api_base_url`` is inspected instead, so existing no-argument
            callers keep working. Callers that hold an overriding URL
            should pass it here.

    Returns:
        bool: ``True`` when the inspected URL contains ``siliconflow``
        (case-insensitive), ``False`` otherwise, including when it is empty.
    """
    effective_base = api_base_url if base_url is None else base_url
    return bool(effective_base) and 'siliconflow' in effective_base.lower()


# Format voice for CosyVoice2
def format_cosyvoice2_voice(voice, model, base_url=None):
    """Format a voice name as ``model:voice`` for CosyVoice2 on SiliconFlow.

    Args:
        voice: Short voice name (e.g. ``alex``) or an already-qualified
            ``model:voice`` string.
        model: Full model name, or ``None``.
        base_url: Optional API base URL forwarded to ``is_siliconflow``.

    Returns:
        ``f"{model}:{voice}"`` when ``model`` contains ``CosyVoice2`` and the
        API is SiliconFlow; otherwise ``voice`` unchanged. A voice that
        already starts with ``f"{model}:"`` is returned as-is so the prefix
        is never applied twice.
    """
    if not (model and 'CosyVoice2' in model and is_siliconflow(base_url)):
        return voice

    prefix = f"{model}:"
    if voice and voice.startswith(prefix):
        return voice
    return f"{prefix}{voice}"
|
||||
|
||||
def list_audio_devices():
|
||||
p = pyaudio.PyAudio()
|
||||
@@ -88,14 +127,21 @@ def play_audio_multiplexed(file_paths, device_indices):
|
||||
|
||||
p.terminate()
|
||||
|
||||
def stream_audio_to_virtual_mic(text, voice="fable", model=None, device_index=None, device_index_2=None):
|
||||
def stream_audio_to_virtual_mic(text, voice="fable", model=None, device_index=None, device_index_2=None, api_key=None, api_base=None):
|
||||
# Get model from environment variable or use default
|
||||
if model is None:
|
||||
model = os.getenv('OPENAI_TTS_MODEL', 'tts-1')
|
||||
|
||||
response = client.audio.speech.create(
|
||||
# Format voice for CosyVoice2 if using SiliconFlow
|
||||
voice_to_use = format_cosyvoice2_voice(voice, model)
|
||||
print(f"Using model: {model}, voice: {voice_to_use}")
|
||||
|
||||
# Get client with potential CLI overrides
|
||||
effective_client = get_client(api_key, api_base)
|
||||
|
||||
response = effective_client.audio.speech.create(
|
||||
model=model,
|
||||
voice=voice,
|
||||
voice=voice_to_use,
|
||||
input=text,
|
||||
response_format='wav'
|
||||
)
|
||||
@@ -118,44 +164,100 @@ def stream_audio_to_virtual_mic(text, voice="fable", model=None, device_index=No
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Text-to-Mic CLI: Convert text to speech and play to virtual microphone',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog='''
|
||||
Examples:
|
||||
%(prog)s "Hello world"
|
||||
%(prog)s "Hello world" --voice anna --model cosyvoice2
|
||||
%(prog)s "Hello world" --voice alex --model FunAudioLLM/CosyVoice2-0.5B --device 8
|
||||
%(prog)s --list-devices
|
||||
%(prog)s --list-voices
|
||||
|
||||
Environment variables (optional, CLI args take precedence):
|
||||
OPENAI_API_KEY - Your API key (required for API TTS)
|
||||
OPENAI_API_BASE_URL - Custom API base URL (e.g., https://api.siliconflow.cn/v1)
|
||||
OPENAI_TTS_MODEL - Default TTS model (default: tts-1)
|
||||
|
||||
arglen = len(sys.argv)
|
||||
Model aliases:
|
||||
cosyvoice2, cosyvoice -> FunAudioLLM/CosyVoice2-0.5B
|
||||
tts-1 -> tts-1
|
||||
tts-1-hd -> tts-1-hd
|
||||
gpt-4o-mini-tts -> gpt-4o-mini-tts
|
||||
|
||||
if arglen < 2:
|
||||
print("Usage: python script.py 'text to convert'")
|
||||
print("Environment variables:")
|
||||
print(" OPENAI_API_KEY - Your API key (required)")
|
||||
print(" OPENAI_API_BASE_URL - Custom API base URL (optional)")
|
||||
print(" OPENAI_TTS_MODEL - TTS model to use (default: tts-1)")
|
||||
print("")
|
||||
print("Example models:")
|
||||
print(" - tts-1 (OpenAI standard)")
|
||||
print(" - tts-1-hd (OpenAI high quality)")
|
||||
print(" - gpt-4o-mini-tts (OpenAI)")
|
||||
print(" - FunAudioLLM/CosyVoice2-0.5B (SiliconFlow)")
|
||||
print("")
|
||||
print("For SiliconFlow voices with CosyVoice2:")
|
||||
print(" The voice will be auto-formatted as: FunAudioLLM/CosyVoice2-0.5B:alex")
|
||||
sys.exit(1)
|
||||
SiliconFlow CosyVoice2 voices:
|
||||
alex, anna, bella, benjamin, charles, claire, david, diana
|
||||
|
||||
print(f"arg count {arglen}")
|
||||
OpenAI voices:
|
||||
alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer
|
||||
'''
|
||||
)
|
||||
|
||||
# Get TTS model from environment
|
||||
tts_model = os.getenv('OPENAI_TTS_MODEL', 'tts-1')
|
||||
print(f"Using TTS model: {tts_model}")
|
||||
parser.add_argument('text', nargs='?', help='Text to convert to speech')
|
||||
parser.add_argument('--voice', '-v', default='fable',
|
||||
help='Voice to use (default: fable). For CosyVoice2: alex, anna, bella, etc.')
|
||||
parser.add_argument('--model', '-m', default=None,
|
||||
help='TTS model to use. Use alias (cosyvoice2) or full name (FunAudioLLM/CosyVoice2-0.5B)')
|
||||
parser.add_argument('--device', '-d', type=int, default=None,
|
||||
help='Audio device index to play to (use --list-devices to find)')
|
||||
parser.add_argument('--device2', type=int, default=None,
|
||||
help='Second audio device index for multiplexed playback')
|
||||
parser.add_argument('--list-devices', action='store_true',
|
||||
help='List available audio output devices and exit')
|
||||
parser.add_argument('--list-voices', action='store_true',
|
||||
help='List available voices for the configured model and exit')
|
||||
parser.add_argument('--list-models', action='store_true',
|
||||
help='List available model aliases and exit')
|
||||
parser.add_argument('--api-key', default=None,
|
||||
help='OpenAI API key (or set OPENAI_API_KEY env var)')
|
||||
parser.add_argument('--api-base', default=None,
|
||||
help='API base URL (or set OPENAI_API_BASE_URL env var)')
|
||||
|
||||
if arglen == 4:
|
||||
device_index = int(sys.argv[2])
|
||||
device_index_2 = int(sys.argv[3])
|
||||
elif arglen == 3:
|
||||
device_index = int(sys.argv[2])
|
||||
device_index_2 = None
|
||||
else:
|
||||
args = parser.parse_args()
|
||||
|
||||
# Handle list commands
|
||||
if args.list_devices:
|
||||
list_audio_devices()
|
||||
exit(0)
|
||||
|
||||
if args.list_models:
|
||||
print("Available model aliases:")
|
||||
for alias, full_name in MODEL_ALIASES.items():
|
||||
print(f" {alias:20} -> {full_name}")
|
||||
exit(0)
|
||||
|
||||
if args.list_voices:
|
||||
# Detect if using SiliconFlow based on API base URL
|
||||
if is_siliconflow():
|
||||
print("SiliconFlow CosyVoice2 voices:")
|
||||
voices = ['alex', 'anna', 'bella', 'benjamin', 'charles', 'claire', 'david', 'diana']
|
||||
else:
|
||||
print("OpenAI voices:")
|
||||
voices = ['alloy', 'ash', 'ballad', 'coral', 'echo', 'fable', 'onyx', 'nova', 'sage', 'shimmer']
|
||||
for voice in voices:
|
||||
print(f" {voice}")
|
||||
exit(0)
|
||||
|
||||
# Validate text argument
|
||||
if not args.text:
|
||||
parser.print_help()
|
||||
exit(1)
|
||||
|
||||
# Resolve model (alias -> full name)
|
||||
model = resolve_model(args.model) if args.model else os.getenv('OPENAI_TTS_MODEL', 'tts-1')
|
||||
print(f"Text: {args.text}")
|
||||
print(f"Voice: {args.voice}")
|
||||
print(f"Model: {model}")
|
||||
|
||||
# Get device index
|
||||
device_index = args.device
|
||||
device_index_2 = args.device2
|
||||
|
||||
if device_index is None:
|
||||
list_audio_devices()
|
||||
device_index = int(input("Enter the device index: "))
|
||||
device_index_2 = None
|
||||
|
||||
|
||||
stream_audio_to_virtual_mic(sys.argv[1], voice="fable", model=tts_model, device_index=device_index,device_index_2=device_index_2)
|
||||
stream_audio_to_virtual_mic(args.text, voice=args.voice, model=model,
|
||||
device_index=device_index, device_index_2=device_index_2,
|
||||
api_key=args.api_key, api_base=args.api_base)
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -11,6 +11,8 @@ import time
|
||||
import requests
|
||||
import pyttsx3
|
||||
import tempfile
|
||||
import asyncio
|
||||
import edge_tts
|
||||
|
||||
from pystray import Icon as icon, MenuItem as item, Menu as menu
|
||||
from PIL import Image, ImageDraw, ImageTk
|
||||
@@ -368,11 +370,11 @@ class TextToMic(tk.Tk):
|
||||
for display_name in model_options:
|
||||
self.tts_menu['menu'].add_command(label=display_name, command=tk._setit(self.tts_model_var, display_name, self.on_tts_model_change))
|
||||
|
||||
# Set default based on API base URL
|
||||
if self.api_base_url and "siliconflow" in self.api_base_url.lower():
|
||||
# Set default to CosyVoice if SiliconFlow is configured, otherwise edge-tts
|
||||
if hasattr(self, 'api_base_url') and self.api_base_url and "siliconflow" in self.api_base_url.lower():
|
||||
default_model = "FunAudioLLM/CosyVoice2-0.5B"
|
||||
else:
|
||||
default_model = "gpt-4o-mini-tts"
|
||||
default_model = "edge-tts"
|
||||
|
||||
# Find and set the display name for the default model
|
||||
for i, model_id in enumerate(model_ids):
|
||||
@@ -506,11 +508,11 @@ class TextToMic(tk.Tk):
|
||||
settings = self.load_settings()
|
||||
saved_tts_model = settings.get("tts_model", "")
|
||||
if not saved_tts_model:
|
||||
# Default based on API base URL
|
||||
if self.api_base_url and "siliconflow" in self.api_base_url.lower():
|
||||
# Default to CosyVoice if SiliconFlow is configured, otherwise edge-tts (free, no API key required)
|
||||
if hasattr(self, 'api_base_url') and self.api_base_url and "siliconflow" in self.api_base_url.lower():
|
||||
saved_tts_model = "FunAudioLLM/CosyVoice2-0.5B"
|
||||
else:
|
||||
saved_tts_model = "gpt-4o-mini-tts"
|
||||
saved_tts_model = "edge-tts"
|
||||
|
||||
# Find the display name for the saved model
|
||||
default_tts_model_display = tts_model_options[0]
|
||||
@@ -533,8 +535,11 @@ class TextToMic(tk.Tk):
|
||||
# Initialize voice selection
|
||||
self.available_voices = self.get_available_voices()
|
||||
|
||||
# Determine default voice based on whether API key is available
|
||||
default_voice = "fable" if self.has_api_key else self.available_voices[0] if self.available_voices else "[System] Default"
|
||||
# Default voice: anna for CosyVoice, zh-CN-XiaoxiaoNeural for edge-tts
|
||||
if saved_tts_model and "CosyVoice" in saved_tts_model:
|
||||
default_voice = "anna"
|
||||
else:
|
||||
default_voice = "zh-CN-XiaoxiaoNeural"
|
||||
|
||||
self.voice_var = tk.StringVar(value=default_voice)
|
||||
|
||||
@@ -550,6 +555,10 @@ class TextToMic(tk.Tk):
|
||||
voice_menu.bind('<FocusOut>', lambda e: self.on_voice_exit(voice_menu))
|
||||
self.voice_menu = voice_menu # Store reference for later updates
|
||||
|
||||
# IMPORTANT: Update voices based on the selected TTS model after initialization
|
||||
# This ensures edge-tts voices are loaded when edge-tts is the default model
|
||||
self.update_available_voices()
|
||||
|
||||
# Add hint label for custom voices
|
||||
voice_hint = ttk.Label(voice_frame,
|
||||
text="💡 Click to edit or type custom voice ID",
|
||||
@@ -914,6 +923,15 @@ class TextToMic(tk.Tk):
|
||||
return Path(filename) # Default to current directory for non-macOS systems
|
||||
|
||||
|
||||
def generate_edge_tts_audio(self, text, voice, output_file):
    """Synthesize ``text`` with Edge-TTS and save the audio to ``output_file``.

    Blocking wrapper: the asynchronous Edge-TTS request is driven to
    completion with ``asyncio.run`` before this method returns.
    """
    async def _synthesize_to_file():
        tts_request = edge_tts.Communicate(text, voice)
        await tts_request.save(output_file)

    # asyncio.run starts a fresh event loop for this single call.
    pending_synthesis = _synthesize_to_file()
    asyncio.run(pending_synthesis)
|
||||
|
||||
def submit_text(self, play_text = None):
|
||||
print(f"submit text self recording: {self.recording}")
|
||||
if self.recording:
|
||||
@@ -977,24 +995,76 @@ class TextToMic(tk.Tk):
|
||||
messagebox.showerror("TTS Error", f"Failed to generate or play system voice: {str(e)}")
|
||||
|
||||
else:
|
||||
# Use OpenAI TTS
|
||||
# Check if using Edge-TTS (doesn't require API key)
|
||||
selected_tts_model_display = self.tts_model_var.get() if hasattr(self, 'tts_model_var') else ""
|
||||
selected_tts_model = "gpt-4o-mini-tts" # Default
|
||||
|
||||
# Find the model ID from display name
|
||||
available_models = self.get_available_tts_models()
|
||||
for model_id, display_name in available_models:
|
||||
if display_name == selected_tts_model_display:
|
||||
selected_tts_model = model_id
|
||||
break
|
||||
|
||||
# Edge-TTS handling
|
||||
if "edge-tts" in selected_tts_model:
|
||||
# Convert device names to indices
|
||||
primary_index = self.available_devices.get(self.device_index.get(), None)
|
||||
secondary_index = self.available_devices.get(self.device_index_2.get(), None) if self.device_index_2.get() != "None" else None
|
||||
|
||||
if primary_index is None:
|
||||
messagebox.showerror("Error", "Primary device not selected or unavailable.")
|
||||
return
|
||||
|
||||
try:
|
||||
print(f"[DEBUG] Using Edge-TTS with voice: {selected_voice}")
|
||||
|
||||
# Generate speech using Edge-TTS (outputs MP3)
|
||||
temp_mp3_filename = "temp_edge_tts_output.mp3"
|
||||
temp_wav_filename = "temp_edge_tts_output.wav"
|
||||
self.generate_edge_tts_audio(text, selected_voice, temp_mp3_filename)
|
||||
|
||||
# Convert MP3 to WAV for playback (play_audio_multiplexed requires WAV format)
|
||||
print(f"[DEBUG] Converting MP3 to WAV...")
|
||||
audio = AudioSegment.from_mp3(temp_mp3_filename)
|
||||
audio.export(temp_wav_filename, format="wav")
|
||||
|
||||
# Store as last audio file for replay
|
||||
self.last_audio_file = temp_wav_filename
|
||||
|
||||
# Play the generated audio
|
||||
if primary_index and secondary_index != "None" and secondary_index is not None:
|
||||
self.play_audio_multiplexed([temp_wav_filename, temp_wav_filename],
|
||||
[primary_index, secondary_index])
|
||||
else:
|
||||
self.play_audio_multiplexed([temp_wav_filename],
|
||||
[primary_index])
|
||||
|
||||
except Exception as e:
|
||||
print(f"[ERROR] Edge-TTS error: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
messagebox.showerror("Edge-TTS Error", f"Failed to generate audio: {str(e)}")
|
||||
return
|
||||
|
||||
# Use OpenAI TTS (or compatible APIs like SiliconFlow)
|
||||
if not self.has_api_key:
|
||||
messagebox.showerror("API Key Required",
|
||||
messagebox.showerror("API Key Required",
|
||||
"An OpenAI API Key is required for speech to text or to use OpenAI voices.\n\n"
|
||||
"Please add your API key in Settings.\n\n"
|
||||
"Note: You can still use text to speech with the system voices only.")
|
||||
"Note: You can still use text to speech with the system voices or Edge-TTS.")
|
||||
return
|
||||
|
||||
|
||||
# Check if a tone preset is selected and add it to the text
|
||||
selected_tone_name = self.tone_var.get()
|
||||
|
||||
|
||||
# Get the actual tone instructions from the tone_presets dictionary
|
||||
tone_instructions = None
|
||||
if selected_tone_name != "None" and selected_tone_name in self.tone_presets:
|
||||
tone_instructions = self.tone_presets[selected_tone_name]
|
||||
else:
|
||||
tone_instructions = "" # Empty string if "None" or not found
|
||||
|
||||
|
||||
# Convert device names to indices
|
||||
primary_index = self.available_devices.get(self.device_index.get(), None)
|
||||
secondary_index = self.available_devices.get(self.device_index_2.get(), None) if self.device_index_2.get() != "None" else None
|
||||
@@ -1002,19 +1072,8 @@ class TextToMic(tk.Tk):
|
||||
if primary_index is None:
|
||||
messagebox.showerror("Error", "Primary device not selected or unavailable.")
|
||||
return
|
||||
|
||||
|
||||
try:
|
||||
# Get the selected TTS model
|
||||
selected_tts_model_display = self.tts_model_var.get()
|
||||
selected_tts_model = "gpt-4o-mini-tts" # Default
|
||||
|
||||
# Find the model ID from display name
|
||||
available_models = self.get_available_tts_models()
|
||||
for model_id, display_name in available_models:
|
||||
if display_name == selected_tts_model_display:
|
||||
selected_tts_model = model_id
|
||||
break
|
||||
|
||||
print(f"[DEBUG] Selected TTS model display: {selected_tts_model_display}")
|
||||
print(f"[DEBUG] Using TTS model ID: {selected_tts_model}")
|
||||
print(f"[DEBUG] Selected voice: {selected_voice}")
|
||||
@@ -1758,19 +1817,22 @@ class TextToMic(tk.Tk):
|
||||
|
||||
def get_available_tts_models(self):
|
||||
"""Get list of available TTS models based on the current API base URL."""
|
||||
# Edge-TTS is always available as a free option
|
||||
base_models = [("edge-tts", "Edge-TTS (Free, No API Key)")]
|
||||
|
||||
# Check if using SiliconFlow
|
||||
is_siliconflow = self.api_base_url and "siliconflow" in self.api_base_url.lower()
|
||||
|
||||
if is_siliconflow:
|
||||
# SiliconFlow TTS models
|
||||
return [
|
||||
return base_models + [
|
||||
("FunAudioLLM/CosyVoice2-0.5B", "CosyVoice2-0.5B (Multi-language, Emotional)"),
|
||||
("tts-1", "TTS-1 (OpenAI Compatible)"),
|
||||
("tts-1-hd", "TTS-1 HD (OpenAI Compatible)")
|
||||
]
|
||||
else:
|
||||
# OpenAI TTS models
|
||||
return [
|
||||
return base_models + [
|
||||
("gpt-4o-mini-tts", "GPT-4o Mini TTS (Recommended)"),
|
||||
("tts-1", "TTS-1 (Standard)"),
|
||||
("tts-1-hd", "TTS-1 HD (High Quality)")
|
||||
@@ -1783,6 +1845,94 @@ class TextToMic(tk.Tk):
|
||||
'claire', 'david', 'diana'
|
||||
]
|
||||
|
||||
def get_edge_tts_voices(self):
    """Return Edge-TTS voice short names (IDs), using a local cache when possible.

    Lookup order:
      1. ``edge_tts_voices_cache.json`` in the current working directory,
         when it is younger than seven days and holds a non-empty list of
         strings.
      2. A live ``edge_tts.list_voices()`` query. The result is sorted with
         ``en-*`` voices first, then the rest alphabetically, and written
         back to the cache.
      3. A built-in fallback list when the query fails or yields no voices.

    Returns:
        list[str]: Voice short names. A new list is produced on every call.
    """
    # Function-scope import: the original block imported json locally, so a
    # file-level import is not assumed.
    import json

    cache_file = "edge_tts_voices_cache.json"
    cache_max_age_days = 7  # Cache expires after 7 days
    cache_max_age_seconds = cache_max_age_days * 24 * 60 * 60

    def load_cached_voices():
        """Return the cached voice list, or None if missing, stale or malformed."""
        try:
            if not os.path.exists(cache_file):
                return None
            cache_age = time.time() - os.path.getmtime(cache_file)
            if cache_age >= cache_max_age_seconds:
                return None
            with open(cache_file, 'r', encoding='utf-8') as f:
                cached = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both JSON decode and text decode failures.
            print(f"[DEBUG] Failed to load cached voices: {e}")
            return None

        # A hand-edited or truncated cache may parse as valid JSON of the
        # wrong shape; callers expect a list of voice-name strings.
        if not isinstance(cached, list) or not all(isinstance(v, str) for v in cached):
            print("[DEBUG] Ignoring malformed Edge-TTS voice cache")
            return None

        print(f"[DEBUG] Using cached Edge-TTS voices ({len(cached)} voices, {int(cache_age / 86400)} days old)")
        return cached

    def save_cached_voices(voices):
        """Best-effort write of the voice list to the cache file."""
        try:
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(voices, f, ensure_ascii=False, indent=2)
            print(f"[DEBUG] Cached {len(voices)} Edge-TTS voices")
        except (OSError, TypeError, ValueError) as e:
            print(f"[DEBUG] Failed to cache voices: {e}")

    async def _fetch_voices():
        """Fetch voice IDs from the network, preferring 'ShortName' over 'Name'."""
        voices = await edge_tts.list_voices()
        names = []
        for v in voices:
            # Look up 'Name' only when 'ShortName' is absent, so an entry
            # without 'Name' cannot abort the whole fetch with a KeyError.
            name = v.get('ShortName') or v.get('Name')
            if name:
                names.append(name)
        return names

    # Comprehensive fallback list with common voices
    fallback_voices = [
        # Chinese voices
        'zh-CN-XiaoxiaoNeural', 'zh-CN-XiaoyiNeural', 'zh-CN-YunyangNeural',
        'zh-TW-YunJheNeural', 'zh-TW-HsiaoChenNeural', 'zh-TW-HsiaoyuNeural',
        # English US
        'en-US-AriaNeural', 'en-US-GuyNeural', 'en-US-JennyNeural',
        'en-US-AnaNeural', 'en-US-ChristopherNeural', 'en-US-EricNeural',
        # English UK
        'en-GB-SoniaNeural', 'en-GB-ThomasNeural', 'en-GB-EmmaNeural',
        'en-GB-LibbyNeural', 'en-GB-RyanNeural',
        # Other English
        'en-AU-NatashaNeural', 'en-AU-WilliamNeural',
        'en-CA-ClaraNeural', 'en-CA-LiamNeural',
        'en-IN-NeerjaNeural', 'en-IN-PrabhatNeural',
        # Japanese
        'ja-JP-NanamiNeural', 'ja-JP-KeitaNeural',
        # Korean
        'ko-KR-SunHiNeural', 'ko-KR-InJoonNeural',
        # Spanish
        'es-ES-ElviraNeural', 'es-MX-DaliaNeural',
        # French
        'fr-FR-DeniseNeural', 'fr-CA-SylvieNeural',
        # German
        'de-DE-KatjaNeural', 'de-DE-ConradNeural',
    ]

    # Try to load from cache
    cached_voices = load_cached_voices()
    if cached_voices:
        return cached_voices

    # Try to fetch from network
    try:
        all_voices = asyncio.run(_fetch_voices())
    except Exception as e:
        # Network boundary: any failure here should degrade to the fallback
        # list rather than break voice selection.
        print(f"[ERROR] Failed to fetch Edge-TTS voices from network: {e}")
        print("[DEBUG] Using comprehensive fallback voice list")
        return fallback_voices

    if not all_voices:
        # An empty answer would leave the voice menu empty; use the fallback.
        print("[ERROR] Failed to fetch Edge-TTS voices from network: no voices returned")
        print("[DEBUG] Using comprehensive fallback voice list")
        return fallback_voices

    # Sort voices: English voices first, then alphabetically
    english_voices = sorted(v for v in all_voices if v.startswith('en-'))
    other_voices = sorted(v for v in all_voices if not v.startswith('en-'))
    result = english_voices + other_voices

    # Save to cache for next time
    save_cached_voices(result)
    return result
|
||||
|
||||
def update_available_voices(self):
|
||||
"""Update available voices based on selected TTS model."""
|
||||
tts_model_display = self.tts_model_var.get() if hasattr(self, 'tts_model_var') else ""
|
||||
@@ -1795,8 +1945,18 @@ class TextToMic(tk.Tk):
|
||||
tts_model_id = model_id
|
||||
break
|
||||
|
||||
# If using Edge-TTS, use Edge-TTS voices
|
||||
if tts_model_id and "edge-tts" in tts_model_id:
|
||||
voices = self.get_edge_tts_voices()
|
||||
print(f"[DEBUG] Using Edge-TTS voices: {voices}")
|
||||
# Also add system voices
|
||||
if hasattr(self, 'system_voices') and self.system_voices:
|
||||
for voice in self.system_voices:
|
||||
voices.append(f"[System] {voice.name}")
|
||||
if not voices:
|
||||
voices.append("[System] Default")
|
||||
# If using CosyVoice2-0.5B with SiliconFlow, use SiliconFlow voices
|
||||
if tts_model_id and "CosyVoice2" in tts_model_id and self.api_base_url and "siliconflow" in self.api_base_url.lower():
|
||||
elif tts_model_id and "CosyVoice2" in tts_model_id and self.api_base_url and "siliconflow" in self.api_base_url.lower():
|
||||
voices = self.get_siliconflow_voices()
|
||||
print(f"[DEBUG] Using SiliconFlow CosyVoice voices: {voices}")
|
||||
# Also add system voices
|
||||
@@ -1818,8 +1978,15 @@ class TextToMic(tk.Tk):
|
||||
|
||||
# Set default if current voice not in list (unless it's a custom voice)
|
||||
if current_voice not in voices and not (current_voice and not current_voice.startswith("[System]")):
|
||||
self.voice_var.set(voices[0] if voices else "")
|
||||
print(f"[DEBUG] Voice changed to: {voices[0] if voices else ''}")
|
||||
# Use zh-CN-XiaoxiaoNeural as default for edge-tts, use anna for SiliconFlow/CosyVoice
|
||||
if tts_model_id and "edge-tts" in tts_model_id:
|
||||
default_voice = "zh-CN-XiaoxiaoNeural" if "zh-CN-XiaoxiaoNeural" in voices else (voices[0] if voices else "")
|
||||
elif tts_model_id and "CosyVoice" in tts_model_id:
|
||||
default_voice = "anna" if "anna" in voices else (voices[0] if voices else "")
|
||||
else:
|
||||
default_voice = voices[0] if voices else ""
|
||||
self.voice_var.set(default_voice)
|
||||
print(f"[DEBUG] Voice changed to: {default_voice}")
|
||||
else:
|
||||
print(f"[DEBUG] Voice kept as: {current_voice}")
|
||||
|
||||
@@ -1828,8 +1995,13 @@ class TextToMic(tk.Tk):
|
||||
selected_voice = self.voice_var.get()
|
||||
is_system_voice = selected_voice.startswith("[System]")
|
||||
|
||||
# Update tone menu state based on voice type
|
||||
if is_system_voice:
|
||||
# Check if using Edge-TTS model
|
||||
selected_tts_model_display = self.tts_model_var.get() if hasattr(self, 'tts_model_var') else ""
|
||||
is_edge_tts = "edge-tts" in selected_tts_model_display.lower()
|
||||
|
||||
# Update tone menu state based on voice type and TTS model
|
||||
# Disable tone for system voices and Edge-TTS (neither support custom tone instructions)
|
||||
if is_system_voice or is_edge_tts:
|
||||
self.tone_menu.state(['disabled'])
|
||||
self.tone_var.set("None")
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user