feat: Enhance text-to-mic CLI and GUI with model selection and Edge-TTS support

This commit introduces significant improvements to the text-to-mic functionality, including:

- Added command-line argument parsing for better user interaction.
- Support for Edge-TTS, allowing audio generation without an API key.
- Dynamic model and voice selection based on user input and environment variables.
- Improved error handling and user feedback for audio device selection.
- Updated default voice settings based on selected TTS model.
- Removed unused compiled Python files from the repository.

Changes:

- text-to-mic-cli.py: Implemented argument parsing and client initialization logic.
- utils/text_to_mic.py: Integrated Edge-TTS audio generation and updated voice selection logic.
- Cleaned up unnecessary compiled files in the utils/__pycache__ directory.

This update enhances usability and flexibility for users leveraging different TTS models.
2026-06-18 13:55:03 +08:00
parent 92d20e59e9
commit 3d6ef1833e
8 changed files with 349 additions and 75 deletions
--- a/text-to-mic-cli.py
+++ b/text-to-mic-cli.py
@@ -5,19 +5,58 @@ import wave
 import threading
 from dotenv import load_dotenv
 import os
+import argparse

 # Load environment variables from .env file
 load_dotenv()

-# Set up your OpenAI API key from the environment variable
-api_key = os.getenv('OPENAI_API_KEY')
+# Default API config (will be overridden by CLI args if provided)
+api_key = os.getenv('OPENAI_API_KEY', '')
 api_base_url = os.getenv('OPENAI_API_BASE_URL', '').strip()

-# Create client with custom base URL if provided
-if api_base_url:
-    client = OpenAI(api_key=api_key, base_url=api_base_url)
-else:
-    client = OpenAI(api_key=api_key)
+# Client will be created after args are parsed
+client = None
+
+def get_client(key=None, base_url=None):
+    """Get or create the OpenAI client."""
+    global client
+    effective_key = key or api_key
+    effective_base = (base_url or api_base_url).strip()
+
+    if not effective_key:
+        raise ValueError("API key is required. Set OPENAI_API_KEY env var or use --api-key")
+
+    if effective_base:
+        return OpenAI(api_key=effective_key, base_url=effective_base)
+    return OpenAI(api_key=effective_key)
+
+# Model name aliases
+MODEL_ALIASES = {
+    'cosyvoice2': 'FunAudioLLM/CosyVoice2-0.5B',
+    'cosyvoice': 'FunAudioLLM/CosyVoice2-0.5B',
+    'tts-1': 'tts-1',
+    'tts-1-hd': 'tts-1-hd',
+    'gpt-4o-mini-tts': 'gpt-4o-mini-tts',
+}
+
+# Map alias to full model name
+def resolve_model(model_input):
+    """Resolve model alias to full model name."""
+    if not model_input:
+        return None
+    return MODEL_ALIASES.get(model_input.lower(), model_input)
+
+# Check if using SiliconFlow
+def is_siliconflow():
+    """Check if the API is configured for SiliconFlow."""
+    return api_base_url and 'siliconflow' in api_base_url.lower()
+
+# Format voice for CosyVoice2
+def format_cosyvoice2_voice(voice, model):
+    """Format voice for CosyVoice2: model:voice"""
+    if model and 'CosyVoice2' in model and is_siliconflow():
+        return f"{model}:{voice}"
+    return voice

 def list_audio_devices():
    p = pyaudio.PyAudio()
@@ -88,14 +127,21 @@ def play_audio_multiplexed(file_paths, device_indices):
    
    p.terminate()
    
-def stream_audio_to_virtual_mic(text, voice="fable", model=None, device_index=None, device_index_2=None):
+def stream_audio_to_virtual_mic(text, voice="fable", model=None, device_index=None, device_index_2=None, api_key=None, api_base=None):
    # Get model from environment variable or use default
    if model is None:
        model = os.getenv('OPENAI_TTS_MODEL', 'tts-1')

-    response = client.audio.speech.create(
+    # Format voice for CosyVoice2 if using SiliconFlow
+    voice_to_use = format_cosyvoice2_voice(voice, model)
+    print(f"Using model: {model}, voice: {voice_to_use}")
+
+    # Get client with potential CLI overrides
+    effective_client = get_client(api_key, api_base)
+
+    response = effective_client.audio.speech.create(
        model=model,
-        voice=voice,
+        voice=voice_to_use,
        input=text,
        response_format='wav'
    )
@@ -118,44 +164,100 @@ def stream_audio_to_virtual_mic(text, voice="fable", model=None, device_index=No
 

 if __name__ == "__main__":
-    import sys
+    parser = argparse.ArgumentParser(
+        description='Text-to-Mic CLI: Convert text to speech and play to virtual microphone',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog='''
+Examples:
+  %(prog)s "Hello world"
+  %(prog)s "Hello world" --voice anna --model cosyvoice2
+  %(prog)s "Hello world" --voice alex --model FunAudioLLM/CosyVoice2-0.5B --device 8
+  %(prog)s --list-devices
+  %(prog)s --list-voices

+Environment variables (optional, CLI args take precedence):
+  OPENAI_API_KEY        - Your API key (required for API TTS)
+  OPENAI_API_BASE_URL  - Custom API base URL (e.g., https://api.siliconflow.cn/v1)
+  OPENAI_TTS_MODEL      - Default TTS model (default: tts-1)

-    arglen = len(sys.argv)
+Model aliases:
+  cosyvoice2, cosyvoice  -> FunAudioLLM/CosyVoice2-0.5B
+  tts-1                  -> tts-1
+  tts-1-hd               -> tts-1-hd
+  gpt-4o-mini-tts        -> gpt-4o-mini-tts

-    if arglen < 2:
-        print("Usage: python script.py 'text to convert'")
-        print("Environment variables:")
-        print("  OPENAI_API_KEY - Your API key (required)")
-        print("  OPENAI_API_BASE_URL - Custom API base URL (optional)")
-        print("  OPENAI_TTS_MODEL - TTS model to use (default: tts-1)")
-        print("")
-        print("Example models:")
-        print("  - tts-1 (OpenAI standard)")
-        print("  - tts-1-hd (OpenAI high quality)")
-        print("  - gpt-4o-mini-tts (OpenAI)")
-        print("  - FunAudioLLM/CosyVoice2-0.5B (SiliconFlow)")
-        print("")
-        print("For SiliconFlow voices with CosyVoice2:")
-        print("  The voice will be auto-formatted as: FunAudioLLM/CosyVoice2-0.5B:alex")
-        sys.exit(1)
+SiliconFlow CosyVoice2 voices:
+  alex, anna, bella, benjamin, charles, claire, david, diana

-    print(f"arg count {arglen}")
+OpenAI voices:
+  alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer
+'''
+    )

-    # Get TTS model from environment
-    tts_model = os.getenv('OPENAI_TTS_MODEL', 'tts-1')
-    print(f"Using TTS model: {tts_model}")
+    parser.add_argument('text', nargs='?', help='Text to convert to speech')
+    parser.add_argument('--voice', '-v', default='fable',
+                        help='Voice to use (default: fable). For CosyVoice2: alex, anna, bella, etc.')
+    parser.add_argument('--model', '-m', default=None,
+                        help='TTS model to use. Use alias (cosyvoice2) or full name (FunAudioLLM/CosyVoice2-0.5B)')
+    parser.add_argument('--device', '-d', type=int, default=None,
+                        help='Audio device index to play to (use --list-devices to find)')
+    parser.add_argument('--device2', type=int, default=None,
+                        help='Second audio device index for multiplexed playback')
+    parser.add_argument('--list-devices', action='store_true',
+                        help='List available audio output devices and exit')
+    parser.add_argument('--list-voices', action='store_true',
+                        help='List available voices for the configured model and exit')
+    parser.add_argument('--list-models', action='store_true',
+                        help='List available model aliases and exit')
+    parser.add_argument('--api-key', default=None,
+                        help='OpenAI API key (or set OPENAI_API_KEY env var)')
+    parser.add_argument('--api-base', default=None,
+                        help='API base URL (or set OPENAI_API_BASE_URL env var)')

-    if arglen == 4:
-        device_index = int(sys.argv[2])
-        device_index_2 = int(sys.argv[3])
-    elif arglen == 3:
-        device_index = int(sys.argv[2])
-        device_index_2 = None
-    else:
+    args = parser.parse_args()
+
+    # Handle list commands
+    if args.list_devices:
+        list_audio_devices()
+        exit(0)
+
+    if args.list_models:
+        print("Available model aliases:")
+        for alias, full_name in MODEL_ALIASES.items():
+            print(f"  {alias:20} -> {full_name}")
+        exit(0)
+
+    if args.list_voices:
+        # Detect if using SiliconFlow based on API base URL
+        if is_siliconflow():
+            print("SiliconFlow CosyVoice2 voices:")
+            voices = ['alex', 'anna', 'bella', 'benjamin', 'charles', 'claire', 'david', 'diana']
+        else:
+            print("OpenAI voices:")
+            voices = ['alloy', 'ash', 'ballad', 'coral', 'echo', 'fable', 'onyx', 'nova', 'sage', 'shimmer']
+        for voice in voices:
+            print(f"  {voice}")
+        exit(0)
+
+    # Validate text argument
+    if not args.text:
+        parser.print_help()
+        exit(1)
+
+    # Resolve model (alias -> full name)
+    model = resolve_model(args.model) if args.model else os.getenv('OPENAI_TTS_MODEL', 'tts-1')
+    print(f"Text: {args.text}")
+    print(f"Voice: {args.voice}")
+    print(f"Model: {model}")
+
+    # Get device index
+    device_index = args.device
+    device_index_2 = args.device2
+
+    if device_index is None:
        list_audio_devices()
        device_index = int(input("Enter the device index: "))
-        device_index_2 = None

-
-    stream_audio_to_virtual_mic(sys.argv[1], voice="fable", model=tts_model, device_index=device_index,device_index_2=device_index_2)
+    stream_audio_to_virtual_mic(args.text, voice=args.voice, model=model,
+                                 device_index=device_index, device_index_2=device_index_2,
+                                 api_key=args.api_key, api_base=args.api_base)
--- a/utils/pycache/api_key_manager.cpython-312.pyc
+++ b/utils/pycache/api_key_manager.cpython-312.pyc
--- a/utils/pycache/hotkey_manager.cpython-312.pyc
+++ b/utils/pycache/hotkey_manager.cpython-312.pyc
--- a/utils/pycache/presets_manager.cpython-312.pyc
+++ b/utils/pycache/presets_manager.cpython-312.pyc
--- a/utils/pycache/resource_utils.cpython-312.pyc
+++ b/utils/pycache/resource_utils.cpython-312.pyc
--- a/utils/pycache/text_to_mic.cpython-312.pyc
+++ b/utils/pycache/text_to_mic.cpython-312.pyc
--- a/utils/pycache/tone_presets_manager.cpython-312.pyc
+++ b/utils/pycache/tone_presets_manager.cpython-312.pyc
--- a/utils/text_to_mic.py
+++ b/utils/text_to_mic.py
@@ -11,6 +11,8 @@ import time
 import requests
 import pyttsx3
 import tempfile
+import asyncio
+import edge_tts

 from pystray import Icon as icon, MenuItem as item, Menu as menu
 from PIL import Image, ImageDraw, ImageTk
@@ -368,11 +370,11 @@ class TextToMic(tk.Tk):
            for display_name in model_options:
                self.tts_menu['menu'].add_command(label=display_name, command=tk._setit(self.tts_model_var, display_name, self.on_tts_model_change))

-            # Set default based on API base URL
-            if self.api_base_url and "siliconflow" in self.api_base_url.lower():
+            # Set default to CosyVoice if SiliconFlow is configured, otherwise edge-tts
+            if hasattr(self, 'api_base_url') and self.api_base_url and "siliconflow" in self.api_base_url.lower():
                default_model = "FunAudioLLM/CosyVoice2-0.5B"
            else:
-                default_model = "gpt-4o-mini-tts"
+                default_model = "edge-tts"

            # Find and set the display name for the default model
            for i, model_id in enumerate(model_ids):
@@ -506,11 +508,11 @@ class TextToMic(tk.Tk):
        settings = self.load_settings()
        saved_tts_model = settings.get("tts_model", "")
        if not saved_tts_model:
-            # Default based on API base URL
-            if self.api_base_url and "siliconflow" in self.api_base_url.lower():
+            # Default to CosyVoice if SiliconFlow is configured, otherwise edge-tts (free, no API key required)
+            if hasattr(self, 'api_base_url') and self.api_base_url and "siliconflow" in self.api_base_url.lower():
                saved_tts_model = "FunAudioLLM/CosyVoice2-0.5B"
            else:
-                saved_tts_model = "gpt-4o-mini-tts"
+                saved_tts_model = "edge-tts"

        # Find the display name for the saved model
        default_tts_model_display = tts_model_options[0]
@@ -533,8 +535,11 @@ class TextToMic(tk.Tk):
        # Initialize voice selection
        self.available_voices = self.get_available_voices()

-        # Determine default voice based on whether API key is available
-        default_voice = "fable" if self.has_api_key else self.available_voices[0] if self.available_voices else "[System] Default"
+        # Default voice: anna for CosyVoice, zh-CN-XiaoxiaoNeural for edge-tts
+        if saved_tts_model and "CosyVoice" in saved_tts_model:
+            default_voice = "anna"
+        else:
+            default_voice = "zh-CN-XiaoxiaoNeural"

        self.voice_var = tk.StringVar(value=default_voice)

@@ -550,6 +555,10 @@ class TextToMic(tk.Tk):
        voice_menu.bind('<FocusOut>', lambda e: self.on_voice_exit(voice_menu))
        self.voice_menu = voice_menu  # Store reference for later updates

+        # IMPORTANT: Update voices based on the selected TTS model after initialization
+        # This ensures edge-tts voices are loaded when edge-tts is the default model
+        self.update_available_voices()
+
        # Add hint label for custom voices
        voice_hint = ttk.Label(voice_frame,
                              text="💡 Click to edit or type custom voice ID",
@@ -914,6 +923,15 @@ class TextToMic(tk.Tk):
            return Path(filename)  # Default to current directory for non-macOS systems


+    def generate_edge_tts_audio(self, text, voice, output_file):
+        """Generate audio using Edge-TTS (synchronous wrapper for async function)."""
+        async def _generate():
+            communicate = edge_tts.Communicate(text, voice)
+            await communicate.save(output_file)
+
+        # Run in a new event loop to avoid conflicts with any existing loops
+        asyncio.run(_generate())
+
    def submit_text(self, play_text = None):
        print(f"submit text self recording: {self.recording}")
        if self.recording:
@@ -977,24 +995,76 @@ class TextToMic(tk.Tk):
                messagebox.showerror("TTS Error", f"Failed to generate or play system voice: {str(e)}")
                
        else:
-            # Use OpenAI TTS
+            # Check if using Edge-TTS (doesn't require API key)
+            selected_tts_model_display = self.tts_model_var.get() if hasattr(self, 'tts_model_var') else ""
+            selected_tts_model = "gpt-4o-mini-tts"  # Default
+
+            # Find the model ID from display name
+            available_models = self.get_available_tts_models()
+            for model_id, display_name in available_models:
+                if display_name == selected_tts_model_display:
+                    selected_tts_model = model_id
+                    break
+
+            # Edge-TTS handling
+            if "edge-tts" in selected_tts_model:
+                # Convert device names to indices
+                primary_index = self.available_devices.get(self.device_index.get(), None)
+                secondary_index = self.available_devices.get(self.device_index_2.get(), None) if self.device_index_2.get() != "None" else None
+
+                if primary_index is None:
+                    messagebox.showerror("Error", "Primary device not selected or unavailable.")
+                    return
+
+                try:
+                    print(f"[DEBUG] Using Edge-TTS with voice: {selected_voice}")
+
+                    # Generate speech using Edge-TTS (outputs MP3)
+                    temp_mp3_filename = "temp_edge_tts_output.mp3"
+                    temp_wav_filename = "temp_edge_tts_output.wav"
+                    self.generate_edge_tts_audio(text, selected_voice, temp_mp3_filename)
+
+                    # Convert MP3 to WAV for playback (play_audio_multiplexed requires WAV format)
+                    print(f"[DEBUG] Converting MP3 to WAV...")
+                    audio = AudioSegment.from_mp3(temp_mp3_filename)
+                    audio.export(temp_wav_filename, format="wav")
+
+                    # Store as last audio file for replay
+                    self.last_audio_file = temp_wav_filename
+
+                    # Play the generated audio
+                    if primary_index and secondary_index != "None" and secondary_index is not None:
+                        self.play_audio_multiplexed([temp_wav_filename, temp_wav_filename],
+                                                    [primary_index, secondary_index])
+                    else:
+                        self.play_audio_multiplexed([temp_wav_filename],
+                                                    [primary_index])
+
+                except Exception as e:
+                    print(f"[ERROR] Edge-TTS error: {e}")
+                    import traceback
+                    traceback.print_exc()
+                    messagebox.showerror("Edge-TTS Error", f"Failed to generate audio: {str(e)}")
+                return
+
+            # Use OpenAI TTS (or compatible APIs like SiliconFlow)
            if not self.has_api_key:
-                messagebox.showerror("API Key Required", 
+                messagebox.showerror("API Key Required",
                                   "An OpenAI API Key is required for speech to text or to use OpenAI voices.\n\n"
                                   "Please add your API key in Settings.\n\n"
-                                   "Note: You can still use text to speech with the system voices only.")
+                                   "Note: You can still use text to speech with the system voices or Edge-TTS.")
                return
-                
+
            # Check if a tone preset is selected and add it to the text
            selected_tone_name = self.tone_var.get()
-            
+
            # Get the actual tone instructions from the tone_presets dictionary
            tone_instructions = None
            if selected_tone_name != "None" and selected_tone_name in self.tone_presets:
                tone_instructions = self.tone_presets[selected_tone_name]
            else:
                tone_instructions = ""  # Empty string if "None" or not found
-            
+
            # Convert device names to indices
            primary_index = self.available_devices.get(self.device_index.get(), None)
            secondary_index = self.available_devices.get(self.device_index_2.get(), None) if self.device_index_2.get() != "None" else None
@@ -1002,19 +1072,8 @@ class TextToMic(tk.Tk):
            if primary_index is None:
                messagebox.showerror("Error", "Primary device not selected or unavailable.")
                return
-            
+
            try:
-                # Get the selected TTS model
-                selected_tts_model_display = self.tts_model_var.get()
-                selected_tts_model = "gpt-4o-mini-tts"  # Default
-
-                # Find the model ID from display name
-                available_models = self.get_available_tts_models()
-                for model_id, display_name in available_models:
-                    if display_name == selected_tts_model_display:
-                        selected_tts_model = model_id
-                        break
-
                print(f"[DEBUG] Selected TTS model display: {selected_tts_model_display}")
                print(f"[DEBUG] Using TTS model ID: {selected_tts_model}")
                print(f"[DEBUG] Selected voice: {selected_voice}")
@@ -1758,19 +1817,22 @@ class TextToMic(tk.Tk):

    def get_available_tts_models(self):
        """Get list of available TTS models based on the current API base URL."""
+        # Edge-TTS is always available as a free option
+        base_models = [("edge-tts", "Edge-TTS (Free, No API Key)")]
+
        # Check if using SiliconFlow
        is_siliconflow = self.api_base_url and "siliconflow" in self.api_base_url.lower()

        if is_siliconflow:
            # SiliconFlow TTS models
-            return [
+            return base_models + [
                ("FunAudioLLM/CosyVoice2-0.5B", "CosyVoice2-0.5B (Multi-language, Emotional)"),
                ("tts-1", "TTS-1 (OpenAI Compatible)"),
                ("tts-1-hd", "TTS-1 HD (OpenAI Compatible)")
            ]
        else:
            # OpenAI TTS models
-            return [
+            return base_models + [
                ("gpt-4o-mini-tts", "GPT-4o Mini TTS (Recommended)"),
                ("tts-1", "TTS-1 (Standard)"),
                ("tts-1-hd", "TTS-1 HD (High Quality)")
@@ -1783,6 +1845,94 @@ class TextToMic(tk.Tk):
            'claire', 'david', 'diana'
        ]

+    def get_edge_tts_voices(self):
+        """Get available Edge-TTS voices, using cache when possible.
+
+        Returns a list of all available Edge-TTS voice short names (IDs), sorted with
+        English voices first for better usability.
+        """
+        cache_file = "edge_tts_voices_cache.json"
+        cache_max_age_days = 7  # Cache expires after 7 days
+
+        # Try to load from cache first
+        def load_cached_voices():
+            try:
+                if os.path.exists(cache_file):
+                    # Check cache age
+                    cache_age = time.time() - os.path.getmtime(cache_file)
+                    if cache_age < (cache_max_age_days * 24 * 60 * 60):
+                        with open(cache_file, 'r', encoding='utf-8') as f:
+                            import json
+                            cached = json.load(f)
+                            print(f"[DEBUG] Using cached Edge-TTS voices ({len(cached)} voices, {int(cache_age / 86400)} days old)")
+                            return cached
+            except Exception as e:
+                print(f"[DEBUG] Failed to load cached voices: {e}")
+            return None
+
+        # Save voices to cache
+        def save_cached_voices(voices):
+            try:
+                import json
+                with open(cache_file, 'w', encoding='utf-8') as f:
+                    json.dump(voices, f, ensure_ascii=False, indent=2)
+                print(f"[DEBUG] Cached {len(voices)} Edge-TTS voices")
+            except Exception as e:
+                print(f"[DEBUG] Failed to cache voices: {e}")
+
+        # Async function to fetch voices from network
+        async def _fetch_voices():
+            voices = await edge_tts.list_voices()
+            # Use 'ShortName' instead of 'Name' to get the voice ID
+            return [v.get('ShortName', v['Name']) for v in voices]
+
+        # Comprehensive fallback list with common voices
+        fallback_voices = [
+            # Chinese voices
+            'zh-CN-XiaoxiaoNeural', 'zh-CN-XiaoyiNeural', 'zh-CN-YunyangNeural',
+            'zh-TW-YunJheNeural', 'zh-TW-HsiaoChenNeural', 'zh-TW-HsiaoyuNeural',
+            # English US
+            'en-US-AriaNeural', 'en-US-GuyNeural', 'en-US-JennyNeural',
+            'en-US-AnaNeural', 'en-US-ChristopherNeural', 'en-US-EricNeural',
+            # English UK
+            'en-GB-SoniaNeural', 'en-GB-ThomasNeural', 'en-GB-EmmaNeural',
+            'en-GB-LibbyNeural', 'en-GB-RyanNeural',
+            # Other English
+            'en-AU-NatashaNeural', 'en-AU-WilliamNeural',
+            'en-CA-ClaraNeural', 'en-CA-LiamNeural',
+            'en-IN-NeerjaNeural', 'en-IN-PrabhatNeural',
+            # Japanese
+            'ja-JP-NanamiNeural', 'ja-JP-KeitaNeural',
+            # Korean
+            'ko-KR-SunHiNeural', 'ko-KR-InJoonNeural',
+            # Spanish
+            'es-ES-ElviraNeural', 'es-MX-DaliaNeural',
+            # French
+            'fr-FR-DeniseNeural', 'fr-CA-SylvieNeural',
+            # German
+            'de-DE-KatjaNeural', 'de-DE-ConradNeural',
+        ]
+
+        # Try to load from cache
+        cached_voices = load_cached_voices()
+        if cached_voices:
+            return cached_voices
+
+        # Try to fetch from network
+        try:
+            all_voices = asyncio.run(_fetch_voices())
+            # Sort voices: English voices first, then alphabetically
+            english_voices = sorted([v for v in all_voices if v.startswith('en-')])
+            other_voices = sorted([v for v in all_voices if not v.startswith('en-')])
+            result = english_voices + other_voices
+            # Save to cache for next time
+            save_cached_voices(result)
+            return result
+        except Exception as e:
+            print(f"[ERROR] Failed to fetch Edge-TTS voices from network: {e}")
+            print(f"[DEBUG] Using comprehensive fallback voice list")
+            return fallback_voices
+
    def update_available_voices(self):
        """Update available voices based on selected TTS model."""
        tts_model_display = self.tts_model_var.get() if hasattr(self, 'tts_model_var') else ""
@@ -1795,8 +1945,18 @@ class TextToMic(tk.Tk):
                tts_model_id = model_id
                break

+        # If using Edge-TTS, use Edge-TTS voices
+        if tts_model_id and "edge-tts" in tts_model_id:
+            voices = self.get_edge_tts_voices()
+            print(f"[DEBUG] Using Edge-TTS voices: {voices}")
+            # Also add system voices
+            if hasattr(self, 'system_voices') and self.system_voices:
+                for voice in self.system_voices:
+                    voices.append(f"[System] {voice.name}")
+            if not voices:
+                voices.append("[System] Default")
        # If using CosyVoice2-0.5B with SiliconFlow, use SiliconFlow voices
-        if tts_model_id and "CosyVoice2" in tts_model_id and self.api_base_url and "siliconflow" in self.api_base_url.lower():
+        elif tts_model_id and "CosyVoice2" in tts_model_id and self.api_base_url and "siliconflow" in self.api_base_url.lower():
            voices = self.get_siliconflow_voices()
            print(f"[DEBUG] Using SiliconFlow CosyVoice voices: {voices}")
            # Also add system voices
@@ -1818,8 +1978,15 @@ class TextToMic(tk.Tk):

            # Set default if current voice not in list (unless it's a custom voice)
            if current_voice not in voices and not (current_voice and not current_voice.startswith("[System]")):
-                self.voice_var.set(voices[0] if voices else "")
-                print(f"[DEBUG] Voice changed to: {voices[0] if voices else ''}")
+                # Use zh-CN-XiaoxiaoNeural as default for edge-tts, use anna for SiliconFlow/CosyVoice
+                if tts_model_id and "edge-tts" in tts_model_id:
+                    default_voice = "zh-CN-XiaoxiaoNeural" if "zh-CN-XiaoxiaoNeural" in voices else (voices[0] if voices else "")
+                elif tts_model_id and "CosyVoice" in tts_model_id:
+                    default_voice = "anna" if "anna" in voices else (voices[0] if voices else "")
+                else:
+                    default_voice = voices[0] if voices else ""
+                self.voice_var.set(default_voice)
+                print(f"[DEBUG] Voice changed to: {default_voice}")
            else:
                print(f"[DEBUG] Voice kept as: {current_voice}")

@@ -1828,8 +1995,13 @@ class TextToMic(tk.Tk):
        selected_voice = self.voice_var.get()
        is_system_voice = selected_voice.startswith("[System]")

-        # Update tone menu state based on voice type
-        if is_system_voice:
+        # Check if using Edge-TTS model
+        selected_tts_model_display = self.tts_model_var.get() if hasattr(self, 'tts_model_var') else ""
+        is_edge_tts = "edge-tts" in selected_tts_model_display.lower()
+
+        # Update tone menu state based on voice type and TTS model
+        # Disable tone for system voices and Edge-TTS (neither support custom tone instructions)
+        if is_system_voice or is_edge_tts:
            self.tone_menu.state(['disabled'])
            self.tone_var.set("None")
        else: