feat: Enhance text-to-mic CLI and GUI with model selection and Edge-TTS support

This commit introduces significant improvements to the text-to-mic functionality, including:

- Added command-line argument parsing for better user interaction.
- Support for Edge-TTS, allowing audio generation without an API key.
- Dynamic model and voice selection based on user input and environment variables.
- Improved error handling and user feedback for audio device selection.
- Updated default voice settings based on selected TTS model.
- Removed unused compiled Python files from the repository.

Changes:
- text-to-mic-cli.py: Implemented argument parsing and client initialization logic.
- utils/text_to_mic.py: Integrated Edge-TTS audio generation and updated voice selection logic.
- Cleaned up unnecessary compiled files in the utils/__pycache__ directory.

This update enhances usability and flexibility for users leveraging different TTS models.
This commit is contained in:
Xin Wang
2026-06-18 13:55:03 +08:00
parent 92d20e59e9
commit 3d6ef1833e
8 changed files with 349 additions and 75 deletions

View File

@@ -5,19 +5,58 @@ import wave
import threading
from dotenv import load_dotenv
import os
import argparse
# Load environment variables from .env file
load_dotenv()
# Set up your OpenAI API key from the environment variable
api_key = os.getenv('OPENAI_API_KEY')
# Default API config (will be overridden by CLI args if provided)
api_key = os.getenv('OPENAI_API_KEY', '')
api_base_url = os.getenv('OPENAI_API_BASE_URL', '').strip()
# Create client with custom base URL if provided
if api_base_url:
client = OpenAI(api_key=api_key, base_url=api_base_url)
else:
client = OpenAI(api_key=api_key)
# Client will be created after args are parsed
client = None
def get_client(key=None, base_url=None):
    """Build a new OpenAI client from explicit overrides or module defaults.

    A fresh client is constructed on every call; nothing is cached and the
    module-level ``client`` variable is not modified.

    Args:
        key: API key to use. Falls back to the module-level ``api_key``
            when omitted or empty.
        base_url: API base URL to use. Falls back to the module-level
            ``api_base_url`` when omitted or empty. Surrounding whitespace
            is stripped.

    Returns:
        An ``OpenAI`` client. ``base_url`` is only passed to the constructor
        when a non-empty URL is in effect.

    Raises:
        ValueError: If neither ``key`` nor the module default supplies a
            non-empty API key.
    """
    effective_key = key or api_key
    effective_base = (base_url or api_base_url).strip()

    if not effective_key:
        raise ValueError("API key is required. Set OPENAI_API_KEY env var or use --api-key")

    # Only forward base_url when one is configured so the library's own
    # default endpoint applies otherwise.
    if effective_base:
        return OpenAI(api_key=effective_key, base_url=effective_base)
    return OpenAI(api_key=effective_key)
# Short, case-insensitive names accepted on the command line, mapped to the
# full model identifier that is sent to the API. Identity entries are kept so
# that the alias listing shows every supported name.
MODEL_ALIASES = {
    'cosyvoice2': 'FunAudioLLM/CosyVoice2-0.5B',
    'cosyvoice': 'FunAudioLLM/CosyVoice2-0.5B',
    'tts-1': 'tts-1',
    'tts-1-hd': 'tts-1-hd',
    'gpt-4o-mini-tts': 'gpt-4o-mini-tts',
}


def resolve_model(model_input):
    """Translate a model alias into its full model name.

    The lookup is case-insensitive. A name that is not a known alias is
    returned unchanged (original casing preserved); an empty or missing
    name yields ``None``.
    """
    if model_input:
        alias_key = model_input.lower()
        if alias_key in MODEL_ALIASES:
            return MODEL_ALIASES[alias_key]
        return model_input
    return None
# Check if using SiliconFlow
def is_siliconflow(base_url=None):
    """Report whether the effective API base URL points at SiliconFlow.

    Args:
        base_url: Optional base URL override (for example the value of a
            command-line option). When omitted or empty, the module-level
            ``api_base_url`` is inspected instead.

    Returns:
        bool: ``True`` when the effective URL contains ``siliconflow``
        (case-insensitive), ``False`` otherwise, including when no URL is
        configured at all.
    """
    effective_base = base_url or api_base_url
    # bool(...) keeps the result a strict boolean instead of leaking '' or
    # None through the short-circuit.
    return bool(effective_base) and 'siliconflow' in effective_base.lower()


# Format voice for CosyVoice2
def format_cosyvoice2_voice(voice, model, base_url=None):
    """Return the voice identifier in the form the selected model expects.

    For a CosyVoice2 model served through SiliconFlow the voice is prefixed
    with the model name as ``model:voice``; in every other case the voice is
    returned unchanged.

    Args:
        voice: Plain voice name, e.g. ``"anna"``.
        model: Full model name, or ``None``.
        base_url: Optional base URL override used for the SiliconFlow check;
            when omitted the module-level ``api_base_url`` decides.

    Returns:
        str: ``f"{model}:{voice}"`` or ``voice``.
    """
    if model and 'CosyVoice2' in model and is_siliconflow(base_url):
        return f"{model}:{voice}"
    return voice
def list_audio_devices():
p = pyaudio.PyAudio()
@@ -88,14 +127,21 @@ def play_audio_multiplexed(file_paths, device_indices):
p.terminate()
def stream_audio_to_virtual_mic(text, voice="fable", model=None, device_index=None, device_index_2=None):
def stream_audio_to_virtual_mic(text, voice="fable", model=None, device_index=None, device_index_2=None, api_key=None, api_base=None):
# Get model from environment variable or use default
if model is None:
model = os.getenv('OPENAI_TTS_MODEL', 'tts-1')
response = client.audio.speech.create(
# Format voice for CosyVoice2 if using SiliconFlow
voice_to_use = format_cosyvoice2_voice(voice, model)
print(f"Using model: {model}, voice: {voice_to_use}")
# Get client with potential CLI overrides
effective_client = get_client(api_key, api_base)
response = effective_client.audio.speech.create(
model=model,
voice=voice,
voice=voice_to_use,
input=text,
response_format='wav'
)
@@ -118,44 +164,100 @@ def stream_audio_to_virtual_mic(text, voice="fable", model=None, device_index=No
if __name__ == "__main__":
import sys
parser = argparse.ArgumentParser(
description='Text-to-Mic CLI: Convert text to speech and play to virtual microphone',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog='''
Examples:
%(prog)s "Hello world"
%(prog)s "Hello world" --voice anna --model cosyvoice2
%(prog)s "Hello world" --voice alex --model FunAudioLLM/CosyVoice2-0.5B --device 8
%(prog)s --list-devices
%(prog)s --list-voices
Environment variables (optional, CLI args take precedence):
OPENAI_API_KEY - Your API key (required for API TTS)
OPENAI_API_BASE_URL - Custom API base URL (e.g., https://api.siliconflow.cn/v1)
OPENAI_TTS_MODEL - Default TTS model (default: tts-1)
arglen = len(sys.argv)
Model aliases:
cosyvoice2, cosyvoice -> FunAudioLLM/CosyVoice2-0.5B
tts-1 -> tts-1
tts-1-hd -> tts-1-hd
gpt-4o-mini-tts -> gpt-4o-mini-tts
if arglen < 2:
print("Usage: python script.py 'text to convert'")
print("Environment variables:")
print(" OPENAI_API_KEY - Your API key (required)")
print(" OPENAI_API_BASE_URL - Custom API base URL (optional)")
print(" OPENAI_TTS_MODEL - TTS model to use (default: tts-1)")
print("")
print("Example models:")
print(" - tts-1 (OpenAI standard)")
print(" - tts-1-hd (OpenAI high quality)")
print(" - gpt-4o-mini-tts (OpenAI)")
print(" - FunAudioLLM/CosyVoice2-0.5B (SiliconFlow)")
print("")
print("For SiliconFlow voices with CosyVoice2:")
print(" The voice will be auto-formatted as: FunAudioLLM/CosyVoice2-0.5B:alex")
sys.exit(1)
SiliconFlow CosyVoice2 voices:
alex, anna, bella, benjamin, charles, claire, david, diana
print(f"arg count {arglen}")
OpenAI voices:
alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer
'''
)
# Get TTS model from environment
tts_model = os.getenv('OPENAI_TTS_MODEL', 'tts-1')
print(f"Using TTS model: {tts_model}")
parser.add_argument('text', nargs='?', help='Text to convert to speech')
parser.add_argument('--voice', '-v', default='fable',
help='Voice to use (default: fable). For CosyVoice2: alex, anna, bella, etc.')
parser.add_argument('--model', '-m', default=None,
help='TTS model to use. Use alias (cosyvoice2) or full name (FunAudioLLM/CosyVoice2-0.5B)')
parser.add_argument('--device', '-d', type=int, default=None,
help='Audio device index to play to (use --list-devices to find)')
parser.add_argument('--device2', type=int, default=None,
help='Second audio device index for multiplexed playback')
parser.add_argument('--list-devices', action='store_true',
help='List available audio output devices and exit')
parser.add_argument('--list-voices', action='store_true',
help='List available voices for the configured model and exit')
parser.add_argument('--list-models', action='store_true',
help='List available model aliases and exit')
parser.add_argument('--api-key', default=None,
help='OpenAI API key (or set OPENAI_API_KEY env var)')
parser.add_argument('--api-base', default=None,
help='API base URL (or set OPENAI_API_BASE_URL env var)')
if arglen == 4:
device_index = int(sys.argv[2])
device_index_2 = int(sys.argv[3])
elif arglen == 3:
device_index = int(sys.argv[2])
device_index_2 = None
else:
args = parser.parse_args()
# Handle list commands
if args.list_devices:
list_audio_devices()
exit(0)
if args.list_models:
print("Available model aliases:")
for alias, full_name in MODEL_ALIASES.items():
print(f" {alias:20} -> {full_name}")
exit(0)
if args.list_voices:
# Detect if using SiliconFlow based on API base URL
if is_siliconflow():
print("SiliconFlow CosyVoice2 voices:")
voices = ['alex', 'anna', 'bella', 'benjamin', 'charles', 'claire', 'david', 'diana']
else:
print("OpenAI voices:")
voices = ['alloy', 'ash', 'ballad', 'coral', 'echo', 'fable', 'onyx', 'nova', 'sage', 'shimmer']
for voice in voices:
print(f" {voice}")
exit(0)
# Validate text argument
if not args.text:
parser.print_help()
exit(1)
# Resolve model (alias -> full name)
model = resolve_model(args.model) if args.model else os.getenv('OPENAI_TTS_MODEL', 'tts-1')
print(f"Text: {args.text}")
print(f"Voice: {args.voice}")
print(f"Model: {model}")
# Get device index
device_index = args.device
device_index_2 = args.device2
if device_index is None:
list_audio_devices()
device_index = int(input("Enter the device index: "))
device_index_2 = None
stream_audio_to_virtual_mic(sys.argv[1], voice="fable", model=tts_model, device_index=device_index,device_index_2=device_index_2)
stream_audio_to_virtual_mic(args.text, voice=args.voice, model=model,
device_index=device_index, device_index_2=device_index_2,
api_key=args.api_key, api_base=args.api_base)

View File

@@ -11,6 +11,8 @@ import time
import requests
import pyttsx3
import tempfile
import asyncio
import edge_tts
from pystray import Icon as icon, MenuItem as item, Menu as menu
from PIL import Image, ImageDraw, ImageTk
@@ -368,11 +370,11 @@ class TextToMic(tk.Tk):
for display_name in model_options:
self.tts_menu['menu'].add_command(label=display_name, command=tk._setit(self.tts_model_var, display_name, self.on_tts_model_change))
# Set default based on API base URL
if self.api_base_url and "siliconflow" in self.api_base_url.lower():
# Set default to CosyVoice if SiliconFlow is configured, otherwise edge-tts
if hasattr(self, 'api_base_url') and self.api_base_url and "siliconflow" in self.api_base_url.lower():
default_model = "FunAudioLLM/CosyVoice2-0.5B"
else:
default_model = "gpt-4o-mini-tts"
default_model = "edge-tts"
# Find and set the display name for the default model
for i, model_id in enumerate(model_ids):
@@ -506,11 +508,11 @@ class TextToMic(tk.Tk):
settings = self.load_settings()
saved_tts_model = settings.get("tts_model", "")
if not saved_tts_model:
# Default based on API base URL
if self.api_base_url and "siliconflow" in self.api_base_url.lower():
# Default to CosyVoice if SiliconFlow is configured, otherwise edge-tts (free, no API key required)
if hasattr(self, 'api_base_url') and self.api_base_url and "siliconflow" in self.api_base_url.lower():
saved_tts_model = "FunAudioLLM/CosyVoice2-0.5B"
else:
saved_tts_model = "gpt-4o-mini-tts"
saved_tts_model = "edge-tts"
# Find the display name for the saved model
default_tts_model_display = tts_model_options[0]
@@ -533,8 +535,11 @@ class TextToMic(tk.Tk):
# Initialize voice selection
self.available_voices = self.get_available_voices()
# Determine default voice based on whether API key is available
default_voice = "fable" if self.has_api_key else self.available_voices[0] if self.available_voices else "[System] Default"
# Default voice: anna for CosyVoice, zh-CN-XiaoxiaoNeural for edge-tts
if saved_tts_model and "CosyVoice" in saved_tts_model:
default_voice = "anna"
else:
default_voice = "zh-CN-XiaoxiaoNeural"
self.voice_var = tk.StringVar(value=default_voice)
@@ -550,6 +555,10 @@ class TextToMic(tk.Tk):
voice_menu.bind('<FocusOut>', lambda e: self.on_voice_exit(voice_menu))
self.voice_menu = voice_menu # Store reference for later updates
# IMPORTANT: Update voices based on the selected TTS model after initialization
# This ensures edge-tts voices are loaded when edge-tts is the default model
self.update_available_voices()
# Add hint label for custom voices
voice_hint = ttk.Label(voice_frame,
text="💡 Click to edit or type custom voice ID",
@@ -914,6 +923,15 @@ class TextToMic(tk.Tk):
return Path(filename) # Default to current directory for non-macOS systems
def generate_edge_tts_audio(self, text, voice, output_file):
    """Synthesize ``text`` with Edge-TTS and save the audio to ``output_file``.

    Blocking wrapper around the asynchronous Edge-TTS API: the call returns
    once the file has been written, or raises whatever the library raises.

    Args:
        text: Text to speak.
        voice: Edge-TTS voice identifier, e.g. ``"zh-CN-XiaoxiaoNeural"``.
        output_file: Destination path for the generated audio.
    """
    async def _synthesize_to_file():
        tts_request = edge_tts.Communicate(text, voice)
        await tts_request.save(output_file)

    # asyncio.run() drives the coroutine on a fresh event loop and closes it
    # afterwards. NOTE(review): it raises RuntimeError if an event loop is
    # already running in the calling thread - confirm no caller does that.
    asyncio.run(_synthesize_to_file())
def submit_text(self, play_text = None):
print(f"submit text self recording: {self.recording}")
if self.recording:
@@ -977,24 +995,76 @@ class TextToMic(tk.Tk):
messagebox.showerror("TTS Error", f"Failed to generate or play system voice: {str(e)}")
else:
# Use OpenAI TTS
# Check if using Edge-TTS (doesn't require API key)
selected_tts_model_display = self.tts_model_var.get() if hasattr(self, 'tts_model_var') else ""
selected_tts_model = "gpt-4o-mini-tts" # Default
# Find the model ID from display name
available_models = self.get_available_tts_models()
for model_id, display_name in available_models:
if display_name == selected_tts_model_display:
selected_tts_model = model_id
break
# Edge-TTS handling
if "edge-tts" in selected_tts_model:
# Convert device names to indices
primary_index = self.available_devices.get(self.device_index.get(), None)
secondary_index = self.available_devices.get(self.device_index_2.get(), None) if self.device_index_2.get() != "None" else None
if primary_index is None:
messagebox.showerror("Error", "Primary device not selected or unavailable.")
return
try:
print(f"[DEBUG] Using Edge-TTS with voice: {selected_voice}")
# Generate speech using Edge-TTS (outputs MP3)
temp_mp3_filename = "temp_edge_tts_output.mp3"
temp_wav_filename = "temp_edge_tts_output.wav"
self.generate_edge_tts_audio(text, selected_voice, temp_mp3_filename)
# Convert MP3 to WAV for playback (play_audio_multiplexed requires WAV format)
print(f"[DEBUG] Converting MP3 to WAV...")
audio = AudioSegment.from_mp3(temp_mp3_filename)
audio.export(temp_wav_filename, format="wav")
# Store as last audio file for replay
self.last_audio_file = temp_wav_filename
# Play the generated audio
if primary_index and secondary_index != "None" and secondary_index is not None:
self.play_audio_multiplexed([temp_wav_filename, temp_wav_filename],
[primary_index, secondary_index])
else:
self.play_audio_multiplexed([temp_wav_filename],
[primary_index])
except Exception as e:
print(f"[ERROR] Edge-TTS error: {e}")
import traceback
traceback.print_exc()
messagebox.showerror("Edge-TTS Error", f"Failed to generate audio: {str(e)}")
return
# Use OpenAI TTS (or compatible APIs like SiliconFlow)
if not self.has_api_key:
messagebox.showerror("API Key Required",
messagebox.showerror("API Key Required",
"An OpenAI API Key is required for speech to text or to use OpenAI voices.\n\n"
"Please add your API key in Settings.\n\n"
"Note: You can still use text to speech with the system voices only.")
"Note: You can still use text to speech with the system voices or Edge-TTS.")
return
# Check if a tone preset is selected and add it to the text
selected_tone_name = self.tone_var.get()
# Get the actual tone instructions from the tone_presets dictionary
tone_instructions = None
if selected_tone_name != "None" and selected_tone_name in self.tone_presets:
tone_instructions = self.tone_presets[selected_tone_name]
else:
tone_instructions = "" # Empty string if "None" or not found
# Convert device names to indices
primary_index = self.available_devices.get(self.device_index.get(), None)
secondary_index = self.available_devices.get(self.device_index_2.get(), None) if self.device_index_2.get() != "None" else None
@@ -1002,19 +1072,8 @@ class TextToMic(tk.Tk):
if primary_index is None:
messagebox.showerror("Error", "Primary device not selected or unavailable.")
return
try:
# Get the selected TTS model
selected_tts_model_display = self.tts_model_var.get()
selected_tts_model = "gpt-4o-mini-tts" # Default
# Find the model ID from display name
available_models = self.get_available_tts_models()
for model_id, display_name in available_models:
if display_name == selected_tts_model_display:
selected_tts_model = model_id
break
print(f"[DEBUG] Selected TTS model display: {selected_tts_model_display}")
print(f"[DEBUG] Using TTS model ID: {selected_tts_model}")
print(f"[DEBUG] Selected voice: {selected_voice}")
@@ -1758,19 +1817,22 @@ class TextToMic(tk.Tk):
def get_available_tts_models(self):
"""Get list of available TTS models based on the current API base URL."""
# Edge-TTS is always available as a free option
base_models = [("edge-tts", "Edge-TTS (Free, No API Key)")]
# Check if using SiliconFlow
is_siliconflow = self.api_base_url and "siliconflow" in self.api_base_url.lower()
if is_siliconflow:
# SiliconFlow TTS models
return [
return base_models + [
("FunAudioLLM/CosyVoice2-0.5B", "CosyVoice2-0.5B (Multi-language, Emotional)"),
("tts-1", "TTS-1 (OpenAI Compatible)"),
("tts-1-hd", "TTS-1 HD (OpenAI Compatible)")
]
else:
# OpenAI TTS models
return [
return base_models + [
("gpt-4o-mini-tts", "GPT-4o Mini TTS (Recommended)"),
("tts-1", "TTS-1 (Standard)"),
("tts-1-hd", "TTS-1 HD (High Quality)")
@@ -1783,6 +1845,94 @@ class TextToMic(tk.Tk):
'claire', 'david', 'diana'
]
def get_edge_tts_voices(self):
    """Return Edge-TTS voice short names, preferring a local cache.

    Sources are tried in this order:

    1. ``edge_tts_voices_cache.json`` in the current working directory, when
       it is younger than seven days and contains a non-empty JSON list of
       strings. The cached order is returned as stored.
    2. A live query through ``edge_tts.list_voices()``. The result is ordered
       with ``en-`` voices first, each group alphabetical, and written back
       to the cache file.
    3. A built-in fallback list (returned in its literal order) when the live
       query fails or yields no voices.

    Returns:
        list[str]: Voice identifiers such as ``"en-US-AriaNeural"``. A new
        list object is returned on every call.
    """
    # Local import, as in the original block; the file-level imports visible
    # here do not include json.
    import json

    cache_file = "edge_tts_voices_cache.json"
    cache_max_age_days = 7  # Cache expires after 7 days
    cache_max_age_seconds = cache_max_age_days * 24 * 60 * 60

    def load_cached_voices():
        """Return the cached voice list, or None if absent, stale or invalid."""
        try:
            if not os.path.exists(cache_file):
                return None
            cache_age = time.time() - os.path.getmtime(cache_file)
            if cache_age >= cache_max_age_seconds:
                return None
            with open(cache_file, 'r', encoding='utf-8') as f:
                cached = json.load(f)
        except (OSError, ValueError) as e:
            # Best effort: an unreadable or corrupt cache just means a refetch.
            print(f"[DEBUG] Failed to load cached voices: {e}")
            return None

        # Callers expect a list of voice-name strings; anything else in the
        # file (hand-edited, truncated, written by another tool) is ignored.
        if not isinstance(cached, list) or not all(isinstance(v, str) for v in cached):
            print("[DEBUG] Ignoring malformed Edge-TTS voice cache")
            return None

        print(f"[DEBUG] Using cached Edge-TTS voices ({len(cached)} voices, {int(cache_age / 86400)} days old)")
        return cached

    def save_cached_voices(voices):
        """Write the voice list to the cache file; failures are only logged."""
        try:
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(voices, f, ensure_ascii=False, indent=2)
            print(f"[DEBUG] Cached {len(voices)} Edge-TTS voices")
        except (OSError, TypeError, ValueError) as e:
            print(f"[DEBUG] Failed to cache voices: {e}")

    async def _fetch_voices():
        """Fetch voice identifiers from the Edge-TTS service."""
        voices = await edge_tts.list_voices()
        # Prefer 'ShortName' (the voice ID) and only fall back to 'Name'.
        # Both lookups use .get() so an entry lacking 'Name' cannot raise.
        names = (v.get('ShortName') or v.get('Name') for v in voices)
        return [name for name in names if name]

    # Comprehensive fallback list with common voices
    fallback_voices = [
        # Chinese voices
        'zh-CN-XiaoxiaoNeural', 'zh-CN-XiaoyiNeural', 'zh-CN-YunyangNeural',
        'zh-TW-YunJheNeural', 'zh-TW-HsiaoChenNeural', 'zh-TW-HsiaoyuNeural',
        # English US
        'en-US-AriaNeural', 'en-US-GuyNeural', 'en-US-JennyNeural',
        'en-US-AnaNeural', 'en-US-ChristopherNeural', 'en-US-EricNeural',
        # English UK
        'en-GB-SoniaNeural', 'en-GB-ThomasNeural', 'en-GB-EmmaNeural',
        'en-GB-LibbyNeural', 'en-GB-RyanNeural',
        # Other English
        'en-AU-NatashaNeural', 'en-AU-WilliamNeural',
        'en-CA-ClaraNeural', 'en-CA-LiamNeural',
        'en-IN-NeerjaNeural', 'en-IN-PrabhatNeural',
        # Japanese
        'ja-JP-NanamiNeural', 'ja-JP-KeitaNeural',
        # Korean
        'ko-KR-SunHiNeural', 'ko-KR-InJoonNeural',
        # Spanish
        'es-ES-ElviraNeural', 'es-MX-DaliaNeural',
        # French
        'fr-FR-DeniseNeural', 'fr-CA-SylvieNeural',
        # German
        'de-DE-KatjaNeural', 'de-DE-ConradNeural',
    ]

    # Try to load from cache
    cached_voices = load_cached_voices()
    if cached_voices:
        return cached_voices

    # Try to fetch from network. The broad except is deliberate: this is the
    # boundary to a third-party network client and any failure must degrade
    # to the fallback list rather than break voice selection.
    all_voices = None
    try:
        all_voices = asyncio.run(_fetch_voices())
    except Exception as e:
        print(f"[ERROR] Failed to fetch Edge-TTS voices from network: {e}")

    # An empty result is treated like a failure so it is never cached.
    if not all_voices:
        print("[DEBUG] Using comprehensive fallback voice list")
        return fallback_voices

    # Sort voices: English voices first, then alphabetically within each group
    result = sorted(all_voices, key=lambda v: (not v.startswith('en-'), v))

    # Save to cache for next time
    save_cached_voices(result)
    return result
def update_available_voices(self):
"""Update available voices based on selected TTS model."""
tts_model_display = self.tts_model_var.get() if hasattr(self, 'tts_model_var') else ""
@@ -1795,8 +1945,18 @@ class TextToMic(tk.Tk):
tts_model_id = model_id
break
# If using Edge-TTS, use Edge-TTS voices
if tts_model_id and "edge-tts" in tts_model_id:
voices = self.get_edge_tts_voices()
print(f"[DEBUG] Using Edge-TTS voices: {voices}")
# Also add system voices
if hasattr(self, 'system_voices') and self.system_voices:
for voice in self.system_voices:
voices.append(f"[System] {voice.name}")
if not voices:
voices.append("[System] Default")
# If using CosyVoice2-0.5B with SiliconFlow, use SiliconFlow voices
if tts_model_id and "CosyVoice2" in tts_model_id and self.api_base_url and "siliconflow" in self.api_base_url.lower():
elif tts_model_id and "CosyVoice2" in tts_model_id and self.api_base_url and "siliconflow" in self.api_base_url.lower():
voices = self.get_siliconflow_voices()
print(f"[DEBUG] Using SiliconFlow CosyVoice voices: {voices}")
# Also add system voices
@@ -1818,8 +1978,15 @@ class TextToMic(tk.Tk):
# Set default if current voice not in list (unless it's a custom voice)
if current_voice not in voices and not (current_voice and not current_voice.startswith("[System]")):
self.voice_var.set(voices[0] if voices else "")
print(f"[DEBUG] Voice changed to: {voices[0] if voices else ''}")
# Use zh-CN-XiaoxiaoNeural as default for edge-tts, use anna for SiliconFlow/CosyVoice
if tts_model_id and "edge-tts" in tts_model_id:
default_voice = "zh-CN-XiaoxiaoNeural" if "zh-CN-XiaoxiaoNeural" in voices else (voices[0] if voices else "")
elif tts_model_id and "CosyVoice" in tts_model_id:
default_voice = "anna" if "anna" in voices else (voices[0] if voices else "")
else:
default_voice = voices[0] if voices else ""
self.voice_var.set(default_voice)
print(f"[DEBUG] Voice changed to: {default_voice}")
else:
print(f"[DEBUG] Voice kept as: {current_voice}")
@@ -1828,8 +1995,13 @@ class TextToMic(tk.Tk):
selected_voice = self.voice_var.get()
is_system_voice = selected_voice.startswith("[System]")
# Update tone menu state based on voice type
if is_system_voice:
# Check if using Edge-TTS model
selected_tts_model_display = self.tts_model_var.get() if hasattr(self, 'tts_model_var') else ""
is_edge_tts = "edge-tts" in selected_tts_model_display.lower()
# Update tone menu state based on voice type and TTS model
# Disable tone for system voices and Edge-TTS (neither support custom tone instructions)
if is_system_voice or is_edge_tts:
self.tone_menu.state(['disabled'])
self.tone_var.set("None")
else: