Compare commits
4 Commits
20358adafb
...
6fc3a30d9c
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6fc3a30d9c | ||
|
|
51dd7f8002 | ||
|
|
3d6ef1833e | ||
|
|
92d20e59e9 |
42
.gitignore
vendored
42
.gitignore
vendored
@@ -1,22 +1,34 @@
|
||||
# Secrets and user config
|
||||
.env
|
||||
.env_backup
|
||||
*.mp3
|
||||
build/
|
||||
dist/
|
||||
config/
|
||||
settings.json
|
||||
|
||||
resampled_last_output.wav
|
||||
temp_speech_output.wav
|
||||
esampled_temp_speech_output.wav
|
||||
last_output.wav
|
||||
output.wav
|
||||
**output.wav
|
||||
# Runtime caches
|
||||
edge_tts_voices_cache.json
|
||||
|
||||
.vs
|
||||
vs/
|
||||
.zip
|
||||
# TTS / audio outputs (project root only; assets/ is tracked)
|
||||
/*.wav
|
||||
/*.mp3
|
||||
|
||||
config/
|
||||
# Python
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
*$py.class
|
||||
.venv/
|
||||
venv/
|
||||
env/
|
||||
.pytest_cache/
|
||||
.mypy_cache/
|
||||
*.egg-info/
|
||||
|
||||
**/__pycache__/
|
||||
**__pycache__**
|
||||
# Build artifacts
|
||||
build/
|
||||
dist/
|
||||
*.zip
|
||||
|
||||
# IDE / editor
|
||||
.vs/
|
||||
.claude/
|
||||
|
||||
35
Readme.md
35
Readme.md
@@ -65,12 +65,29 @@ https://platform.openai.com/docs/quickstart/account-setup
|
||||
|
||||
6. You can change the API key at any time under the 'Settings' menu.
|
||||
|
||||
7. (Optional) You can also configure a custom API Base URL under 'Settings > API Base URL' to use compatible API endpoints other than OpenAI. For example, to use SiliconFlow's API, set the base URL to `https://api.siliconflow.cn/v1` (Note: use just the base URL, NOT the full endpoint path). Leave empty to use OpenAI's default endpoint.
|
||||
|
||||
8. (Optional) You can select different TTS models from the "TTS Model" dropdown. When using SiliconFlow, the CosyVoice2-0.5B model will be available with 8 built-in voices (alex, anna, bella, benjamin, charles, claire, david, diana). The voice options will update automatically based on the selected model.
|
||||
|
||||
9. (Optional) The Voice dropdown supports both selecting from the list and typing custom voice IDs. Click on the voice field to type a custom voice ID (e.g., for SiliconFlow custom voices like `speech:your-voice-name:xxxx`). This is useful if you've uploaded custom voice samples to SiliconFlow.
|
||||
|
||||
This tool was brought to you by Scorchsoft - We build custom apps to your requirements. Please contact us if you have a requirement for a custom app project.
|
||||
|
||||
## Advanced Tips
|
||||
|
||||
|
||||
### 1. ChatGPT AI Manipulation
|
||||
### 1. Custom Voices with SiliconFlow
|
||||
|
||||
When using SiliconFlow's API, you can upload your own voice samples and use them by entering the custom voice ID in the Voice dropdown. To upload a custom voice:
|
||||
|
||||
1. Upload your voice sample to SiliconFlow (see their documentation)
|
||||
2. You'll receive a voice ID like: `speech:your-voice-name:cm04pf7az00061413w7kz5qxs:mjtkgbyuunvtybnsvbxd`
|
||||
3. Click on the Voice dropdown and type/paste this custom voice ID
|
||||
4. The app will use this custom voice for TTS
|
||||
|
||||
For more information on uploading custom voices, see: [SiliconFlow Text-to-Speech Documentation](https://docs.siliconflow.cn/en/userguide/capabilities/text-to-speech)
|
||||
|
||||
### 2. ChatGPT AI Manipulation
|
||||
|
||||
If you go to "Settings > ChatGPT Manipulation" then you can turn this on and pick which model to use.
|
||||
|
||||
@@ -104,6 +121,22 @@ run the executable or "python text-to-mic.py"
|
||||
https://vb-audio.com/Cable/
|
||||
|
||||
## 2) Ensure the OpenAI API key is specified in the .env file
|
||||
You can also optionally set `OPENAI_API_BASE_URL` in the .env file to use a compatible API endpoint other than OpenAI. For example, to use SiliconFlow's API:
|
||||
```
|
||||
OPENAI_API_KEY=your_api_key_here
|
||||
OPENAI_API_BASE_URL=https://api.siliconflow.cn/v1
|
||||
OPENAI_TTS_MODEL=FunAudioLLM/CosyVoice2-0.5B
|
||||
```
|
||||
**Important:** Use just the base URL (e.g., `https://api.siliconflow.cn/v1`), NOT the full endpoint path (don't add `/audio/speech`).
|
||||
|
||||
Leave `OPENAI_API_BASE_URL` empty to use OpenAI's default endpoint.
|
||||
|
||||
Available TTS models:
|
||||
- `tts-1` (OpenAI standard, default)
|
||||
- `tts-1-hd` (OpenAI high quality)
|
||||
- `gpt-4o-mini-tts` (OpenAI)
|
||||
- `FunAudioLLM/CosyVoice2-0.5B` (SiliconFlow - multi-language, emotional TTS)
|
||||
|
||||
This sets up a virtual microphone that text-to-speech audio can be sent to. Then, when you join a meeting (for example, a Google Meet call), you can select this virtual cable as your microphone so that other participants hear the audio being sent on the channel.
|
||||
|
||||
## 3) Run the script:
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -5,12 +5,58 @@ import wave
|
||||
import threading
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
import argparse
|
||||
|
||||
# Load environment variables from .env file
|
||||
load_dotenv()
|
||||
|
||||
# Set up your OpenAI API key from the environment variable
|
||||
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
|
||||
# Default API config (will be overridden by CLI args if provided)
|
||||
api_key = os.getenv('OPENAI_API_KEY', '')
|
||||
api_base_url = os.getenv('OPENAI_API_BASE_URL', '').strip()
|
||||
|
||||
# Client will be created after args are parsed
|
||||
client = None
|
||||
|
||||
def get_client(key=None, base_url=None):
    """Create an OpenAI client from explicit arguments or module defaults.

    A new client is built on every call; nothing is cached.

    Args:
        key: API key to use. Falls back to the module-level ``api_key``
            when falsy.
        base_url: API base URL to use. Falls back to the module-level
            ``api_base_url`` when falsy. An empty result means the
            client's default endpoint is used.

    Returns:
        An ``OpenAI`` client instance.

    Raises:
        ValueError: If neither ``key`` nor the module-level ``api_key``
            provides a non-empty API key.
    """
    effective_key = key or api_key
    # ``or ''`` guards against both sources being None before stripping.
    effective_base = (base_url or api_base_url or '').strip()

    if not effective_key:
        raise ValueError("API key is required. Set OPENAI_API_KEY env var or use --api-key")

    if effective_base:
        return OpenAI(api_key=effective_key, base_url=effective_base)
    return OpenAI(api_key=effective_key)
|
||||
|
||||
# Model name aliases
|
||||
MODEL_ALIASES = {
    'cosyvoice2': 'FunAudioLLM/CosyVoice2-0.5B',
    'cosyvoice': 'FunAudioLLM/CosyVoice2-0.5B',
    'tts-1': 'tts-1',
    'tts-1-hd': 'tts-1-hd',
    'gpt-4o-mini-tts': 'gpt-4o-mini-tts',
}


# Map alias to full model name
def resolve_model(model_input):
    """Resolve a model alias to its full model name.

    The lookup is case-insensitive and ignores surrounding whitespace.
    Names that are not in ``MODEL_ALIASES`` are returned unchanged (minus
    surrounding whitespace) so full model IDs can be passed straight
    through.

    Args:
        model_input: Alias or full model name; may be ``None`` or empty.

    Returns:
        The full model name, or ``None`` when ``model_input`` is ``None``,
        empty, or only whitespace.
    """
    if not model_input:
        return None
    name = model_input.strip()
    if not name:
        return None
    return MODEL_ALIASES.get(name.lower(), name)
|
||||
|
||||
# Check if using SiliconFlow
|
||||
def is_siliconflow(base_url=None):
    """Report whether an API base URL points at SiliconFlow.

    Args:
        base_url: URL to inspect. When ``None`` (the default), the
            module-level ``api_base_url`` is inspected instead.

    Returns:
        ``True`` if the URL is non-empty and contains ``"siliconflow"``
        (case-insensitive); ``False`` otherwise. Always a real ``bool``.
    """
    url = api_base_url if base_url is None else base_url
    return bool(url) and 'siliconflow' in url.lower()
|
||||
|
||||
# Format voice for CosyVoice2
|
||||
def format_cosyvoice2_voice(voice, model):
    """Format a voice name for a CosyVoice2 request as ``model:voice``.

    The prefix is only added when the model name contains ``CosyVoice2``
    and ``is_siliconflow()`` reports a SiliconFlow endpoint. Voices that
    already contain a colon (for example ``speech:name:id`` custom voice
    IDs, or an already-prefixed ``model:voice``) are returned unchanged so
    they are not prefixed a second time.

    Args:
        voice: Voice name or fully-qualified voice ID.
        model: Full model name; may be ``None``.

    Returns:
        The voice string to send to the API.
    """
    if not model or 'CosyVoice2' not in model:
        return voice
    # Already-qualified IDs must be sent as-is.
    if not voice or ':' in voice:
        return voice
    if is_siliconflow():
        return f"{model}:{voice}"
    return voice
|
||||
|
||||
def list_audio_devices():
|
||||
p = pyaudio.PyAudio()
|
||||
@@ -81,10 +127,21 @@ def play_audio_multiplexed(file_paths, device_indices):
|
||||
|
||||
p.terminate()
|
||||
|
||||
def stream_audio_to_virtual_mic(text, voice="fable", device_index=None, device_index_2=None):
|
||||
response = client.audio.speech.create(
|
||||
model="tts-1",
|
||||
voice=voice,
|
||||
def stream_audio_to_virtual_mic(text, voice="fable", model=None, device_index=None, device_index_2=None, api_key=None, api_base=None):
|
||||
# Get model from environment variable or use default
|
||||
if model is None:
|
||||
model = os.getenv('OPENAI_TTS_MODEL', 'tts-1')
|
||||
|
||||
# Format voice for CosyVoice2 if using SiliconFlow
|
||||
voice_to_use = format_cosyvoice2_voice(voice, model)
|
||||
print(f"Using model: {model}, voice: {voice_to_use}")
|
||||
|
||||
# Get client with potential CLI overrides
|
||||
effective_client = get_client(api_key, api_base)
|
||||
|
||||
response = effective_client.audio.speech.create(
|
||||
model=model,
|
||||
voice=voice_to_use,
|
||||
input=text,
|
||||
response_format='wav'
|
||||
)
|
||||
@@ -107,27 +164,100 @@ def stream_audio_to_virtual_mic(text, voice="fable", device_index=None, device_i
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse arguments, handle the informational
    # --list-* flags, then synthesize the text and play it to the chosen
    # audio device(s).
    import sys

    parser = argparse.ArgumentParser(
        description='Text-to-Mic CLI: Convert text to speech and play to virtual microphone',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
  %(prog)s "Hello world"
  %(prog)s "Hello world" --voice anna --model cosyvoice2
  %(prog)s "Hello world" --voice alex --model FunAudioLLM/CosyVoice2-0.5B --device 8
  %(prog)s --list-devices
  %(prog)s --list-voices

Environment variables (optional, CLI args take precedence):
  OPENAI_API_KEY - Your API key (required for API TTS)
  OPENAI_API_BASE_URL - Custom API base URL (e.g., https://api.siliconflow.cn/v1)
  OPENAI_TTS_MODEL - Default TTS model (default: tts-1)

Model aliases:
  cosyvoice2, cosyvoice -> FunAudioLLM/CosyVoice2-0.5B
  tts-1 -> tts-1
  tts-1-hd -> tts-1-hd
  gpt-4o-mini-tts -> gpt-4o-mini-tts

SiliconFlow CosyVoice2 voices:
  alex, anna, bella, benjamin, charles, claire, david, diana

OpenAI voices:
  alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer
'''
    )

    parser.add_argument('text', nargs='?', help='Text to convert to speech')
    parser.add_argument('--voice', '-v', default='fable',
                        help='Voice to use (default: fable). For CosyVoice2: alex, anna, bella, etc.')
    parser.add_argument('--model', '-m', default=None,
                        help='TTS model to use. Use alias (cosyvoice2) or full name (FunAudioLLM/CosyVoice2-0.5B)')
    parser.add_argument('--device', '-d', type=int, default=None,
                        help='Audio device index to play to (use --list-devices to find)')
    parser.add_argument('--device2', type=int, default=None,
                        help='Second audio device index for multiplexed playback')
    parser.add_argument('--list-devices', action='store_true',
                        help='List available audio output devices and exit')
    parser.add_argument('--list-voices', action='store_true',
                        help='List available voices for the configured model and exit')
    parser.add_argument('--list-models', action='store_true',
                        help='List available model aliases and exit')
    parser.add_argument('--api-key', default=None,
                        help='OpenAI API key (or set OPENAI_API_KEY env var)')
    parser.add_argument('--api-base', default=None,
                        help='API base URL (or set OPENAI_API_BASE_URL env var)')

    args = parser.parse_args()

    # CLI values take precedence over the environment. Rebinding the
    # module-level defaults here makes helpers that read them (such as
    # is_siliconflow) see the same configuration as the API client.
    if args.api_key:
        api_key = args.api_key
    if args.api_base:
        api_base_url = args.api_base.strip()

    # Handle list commands. sys.exit is used because the bare exit()
    # helper is injected by the site module and is not always available.
    if args.list_devices:
        list_audio_devices()
        sys.exit(0)

    if args.list_models:
        print("Available model aliases:")
        for alias, full_name in MODEL_ALIASES.items():
            print(f"  {alias:20} -> {full_name}")
        sys.exit(0)

    if args.list_voices:
        # Detect if using SiliconFlow based on API base URL
        if is_siliconflow():
            print("SiliconFlow CosyVoice2 voices:")
            voices = ['alex', 'anna', 'bella', 'benjamin', 'charles', 'claire', 'david', 'diana']
        else:
            print("OpenAI voices:")
            voices = ['alloy', 'ash', 'ballad', 'coral', 'echo', 'fable', 'onyx', 'nova', 'sage', 'shimmer']
        for voice in voices:
            print(f"  {voice}")
        sys.exit(0)

    # Validate text argument
    if not args.text:
        parser.print_help()
        sys.exit(1)

    # Resolve model (alias -> full name). Aliases are honoured for the
    # environment default as well; an empty result falls back to tts-1.
    model = resolve_model(args.model or os.getenv('OPENAI_TTS_MODEL', 'tts-1')) or 'tts-1'
    print(f"Text: {args.text}")
    print(f"Voice: {args.voice}")
    print(f"Model: {model}")

    # Get device indices. Only the primary device is prompted for; a
    # --device2 value supplied on the command line is kept.
    device_index = args.device
    device_index_2 = args.device2

    if device_index is None:
        list_audio_devices()
        device_index = int(input("Enter the device index: "))

    stream_audio_to_virtual_mic(args.text, voice=args.voice, model=model,
                                device_index=device_index, device_index_2=device_index_2,
                                api_key=args.api_key, api_base=args.api_base)
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -37,7 +37,9 @@ class SettingsManager:
|
||||
"play_last_audio": ["ctrl", "shift", "8"],
|
||||
"cancel_operation": ["ctrl", "shift", "1"]
|
||||
},
|
||||
"max_tokens": 750
|
||||
"max_tokens": 750,
|
||||
"api_base_url": "",
|
||||
"tts_model": ""
|
||||
}
|
||||
|
||||
@classmethod
|
||||
|
||||
@@ -11,6 +11,8 @@ import time
|
||||
import requests
|
||||
import pyttsx3
|
||||
import tempfile
|
||||
import asyncio
|
||||
import edge_tts
|
||||
|
||||
from pystray import Icon as icon, MenuItem as item, Menu as menu
|
||||
from PIL import Image, ImageDraw, ImageTk
|
||||
@@ -121,9 +123,20 @@ class TextToMic(tk.Tk):
|
||||
# Get API key using APIKeyManager
|
||||
self.api_key = APIKeyManager.get_api_key(self)
|
||||
self.has_api_key = bool(self.api_key)
|
||||
|
||||
|
||||
# Initialize settings before loading them
|
||||
self.api_base_url = ""
|
||||
|
||||
if self.has_api_key:
|
||||
self.client = OpenAI(api_key=self.api_key)
|
||||
# Load settings to get custom base URL
|
||||
settings = self.load_settings()
|
||||
self.api_base_url = settings.get("api_base_url", "").strip()
|
||||
|
||||
# Create OpenAI client with custom base URL if provided
|
||||
if self.api_base_url:
|
||||
self.client = OpenAI(api_key=self.api_key, base_url=self.api_base_url)
|
||||
else:
|
||||
self.client = OpenAI(api_key=self.api_key)
|
||||
|
||||
# Initializing device index variables before they are used
|
||||
self.device_index = tk.StringVar(self)
|
||||
@@ -247,8 +260,9 @@ class TextToMic(tk.Tk):
|
||||
settings_menu = Menu(self.menubar, tearoff=0)
|
||||
self.menubar.add_cascade(label="Settings", menu=settings_menu)
|
||||
settings_menu.add_command(label="API Key", command=self.change_api_key)
|
||||
settings_menu.add_command(label="API Base URL", command=self.change_api_base_url)
|
||||
settings_menu.add_command(label="AI Copyediting", command=self.show_ai_editor_settings)
|
||||
settings_menu.add_command(label="Keyboard Shortcuts", command=self.show_hotkey_settings)
|
||||
settings_menu.add_command(label="Keyboard Shortcuts", command=self.show_hotkey_settings)
|
||||
settings_menu.add_command(label="Manage Tones", command=self.show_tone_presets_manager)
|
||||
settings_menu.add_separator()
|
||||
|
||||
@@ -287,7 +301,89 @@ class TextToMic(tk.Tk):
|
||||
new_key = APIKeyManager.change_api_key(self)
|
||||
if new_key:
|
||||
self.api_key = new_key
|
||||
self.client = OpenAI(api_key=self.api_key)
|
||||
# Recreate client with base URL if set
|
||||
if self.api_base_url:
|
||||
self.client = OpenAI(api_key=self.api_key, base_url=self.api_base_url)
|
||||
else:
|
||||
self.client = OpenAI(api_key=self.api_key)
|
||||
|
||||
def change_api_base_url(self):
    """Prompt for a new API base URL, persist it, and rebuild the client.

    Shows the current base URL, asks for a replacement, and does nothing
    if the dialog is cancelled. If the entered URL contains
    ``/audio/speech`` the user is offered a corrected URL with that path
    removed. The result is saved through ``SettingsManager``, stored on
    ``self.api_base_url``, the OpenAI client is recreated when an API key
    is set, the TTS model options are refreshed, and a confirmation
    dialog is shown. An empty URL resets to the OpenAI default endpoint.
    """
    from tkinter import simpledialog

    # Show current URL in the prompt
    current_url = self.api_base_url if self.api_base_url else "OpenAI Default"
    prompt = f"Current API Base URL: {current_url}\n\nEnter custom API Base URL (leave empty to use OpenAI default):\n\nNote: For SiliconFlow, use: https://api.siliconflow.cn/v1"

    new_url = simpledialog.askstring("API Base URL", prompt, parent=self)
    if new_url is None:  # User cancelled
        return

    new_url = new_url.strip()

    # Warn if user included /audio/speech in the URL
    if new_url and "/audio/speech" in new_url:
        wants_correction = messagebox.askyesno(
            "Incorrect Base URL",
            f"The base URL should not include '/audio/speech'.\n\n"
            f"You entered: {new_url}\n\n"
            f"Did you mean: {new_url.replace('/audio/speech', '')}\n\n"
            f"Click Yes to correct it, or No to use as-is.",
            parent=self)
        if wants_correction:
            new_url = new_url.replace('/audio/speech', '')

    # Update settings
    SettingsManager.update_settings({"api_base_url": new_url})

    # Update instance variable
    self.api_base_url = new_url

    # Recreate client with new base URL
    if self.api_key:
        if self.api_base_url:
            self.client = OpenAI(api_key=self.api_key, base_url=self.api_base_url)
        else:
            self.client = OpenAI(api_key=self.api_key)

    # Update TTS model options based on new base URL
    self.update_tts_model_options()

    # Show confirmation
    if new_url:
        messagebox.showinfo("API Base URL Updated", f"API Base URL has been set to:\n{new_url}\n\nTTS model options have been updated.")
    else:
        messagebox.showinfo("API Base URL Reset", "API Base URL has been reset to OpenAI default.\n\nTTS model options have been updated.")
|
||||
|
||||
def update_tts_model_options(self):
    """Rebuild the TTS model dropdown for the current API base URL.

    Does nothing until both ``self.tts_menu`` and ``self.tts_model_var``
    exist. Otherwise it reloads the model list from
    ``get_available_tts_models()``, stores the model IDs on
    ``self.tts_model_ids``, repopulates the dropdown, selects the default
    model for the configured provider, and calls
    ``on_tts_model_change()`` so the voice list follows the new model.
    """
    if not (hasattr(self, 'tts_menu') and hasattr(self, 'tts_model_var')):
        return

    available_models = self.get_available_tts_models()
    model_ids = [model_id for model_id, _ in available_models]
    model_options = [display_name for _, display_name in available_models]

    # Store the new model IDs
    self.tts_model_ids = model_ids

    # Update the dropdown menu
    dropdown = self.tts_menu['menu']
    dropdown.delete(0, 'end')
    for display_name in model_options:
        dropdown.add_command(label=display_name, command=tk._setit(self.tts_model_var, display_name, self.on_tts_model_change))

    # Set default to CosyVoice if SiliconFlow is configured, otherwise edge-tts
    base_url = getattr(self, 'api_base_url', '')
    if base_url and "siliconflow" in base_url.lower():
        default_model = "FunAudioLLM/CosyVoice2-0.5B"
    else:
        default_model = "edge-tts"

    # Find and set the display name for the default model
    for model_id, display_name in zip(model_ids, model_options):
        if model_id == default_model:
            self.tts_model_var.set(display_name)
            break

    # Trigger model change to update voices
    self.on_tts_model_change()
|
||||
|
||||
def get_audio_file_path(self, filename):
|
||||
if platform.system() == 'Darwin': # Check if the OS is macOS
|
||||
@@ -403,41 +499,94 @@ class TextToMic(tk.Tk):
|
||||
# Set fixed width for all labels
|
||||
label_width = 35 # Adjust this value as needed for your UI
|
||||
|
||||
# Initialize TTS model selection
|
||||
available_tts_models = self.get_available_tts_models()
|
||||
tts_model_options = [model[1] for model in available_tts_models] # Use display names
|
||||
tts_model_ids = [model[0] for model in available_tts_models] # Store model IDs
|
||||
|
||||
# Get saved TTS model or use default
|
||||
settings = self.load_settings()
|
||||
saved_tts_model = settings.get("tts_model", "")
|
||||
if not saved_tts_model:
|
||||
# Default to CosyVoice if SiliconFlow is configured, otherwise edge-tts (free, no API key required)
|
||||
if hasattr(self, 'api_base_url') and self.api_base_url and "siliconflow" in self.api_base_url.lower():
|
||||
saved_tts_model = "FunAudioLLM/CosyVoice2-0.5B"
|
||||
else:
|
||||
saved_tts_model = "edge-tts"
|
||||
|
||||
# Find the display name for the saved model
|
||||
default_tts_model_display = tts_model_options[0]
|
||||
for i, model_id in enumerate(tts_model_ids):
|
||||
if model_id == saved_tts_model:
|
||||
default_tts_model_display = tts_model_options[i]
|
||||
break
|
||||
|
||||
self.tts_model_var = tk.StringVar(value=default_tts_model_display)
|
||||
self.tts_model_ids = tts_model_ids # Store for later lookup
|
||||
|
||||
# TTS Model selection dropdown
|
||||
tts_label = ttk.Label(voice_frame, text="TTS Model:", width=label_width)
|
||||
tts_label.grid(column=0, row=0, sticky=tk.W, pady=(0, 5))
|
||||
tts_menu = ttk.OptionMenu(voice_frame, self.tts_model_var, self.tts_model_var.get(), *tts_model_options, command=self.on_tts_model_change)
|
||||
tts_menu.grid(column=1, row=0, sticky="ew", pady=(0, 5))
|
||||
tts_menu.config(width=dropdown_width, style='Compact.TMenubutton')
|
||||
self.tts_menu = tts_menu # Store reference for later updates
|
||||
|
||||
# Initialize voice selection
|
||||
self.available_voices = self.get_available_voices()
|
||||
|
||||
# Determine default voice based on whether API key is available
|
||||
default_voice = "fable" if self.has_api_key else self.available_voices[0] if self.available_voices else "[System] Default"
|
||||
|
||||
|
||||
# Default voice: anna for CosyVoice, zh-CN-XiaoxiaoNeural for edge-tts
|
||||
if saved_tts_model and "CosyVoice" in saved_tts_model:
|
||||
default_voice = "anna"
|
||||
else:
|
||||
default_voice = "zh-CN-XiaoxiaoNeural"
|
||||
|
||||
self.voice_var = tk.StringVar(value=default_voice)
|
||||
|
||||
|
||||
voice_label = ttk.Label(voice_frame, text="Voice:", width=label_width)
|
||||
voice_label.grid(column=0, row=1, sticky=tk.W, pady=(0, 5))
|
||||
voice_menu = ttk.OptionMenu(voice_frame, self.voice_var, self.voice_var.get(), *self.available_voices, command=self.on_voice_change)
|
||||
|
||||
# Use Combobox instead of OptionMenu to allow both selection and typing
|
||||
voice_menu = ttk.Combobox(voice_frame, textvariable=self.voice_var, values=self.available_voices, state="readonly", width=30)
|
||||
voice_menu.grid(column=1, row=1, sticky="ew", pady=(0, 5))
|
||||
voice_menu.config(width=dropdown_width, style='Compact.TMenubutton')
|
||||
voice_menu.bind('<<ComboboxSelected>>', lambda e: self.on_voice_change())
|
||||
# Allow typing by switching to normal state on focus, readonly on unfocus
|
||||
voice_menu.bind('<FocusIn>', lambda e: voice_menu.config(state="normal"))
|
||||
voice_menu.bind('<FocusOut>', lambda e: self.on_voice_exit(voice_menu))
|
||||
self.voice_menu = voice_menu # Store reference for later updates
|
||||
|
||||
# IMPORTANT: Update voices based on the selected TTS model after initialization
|
||||
# This ensures edge-tts voices are loaded when edge-tts is the default model
|
||||
self.update_available_voices()
|
||||
|
||||
# Add hint label for custom voices
|
||||
voice_hint = ttk.Label(voice_frame,
|
||||
text="💡 Click to edit or type custom voice ID",
|
||||
font=("Arial", 7, "italic"),
|
||||
foreground="gray")
|
||||
voice_hint.grid(column=1, row=2, sticky="w", pady=(0, 5))
|
||||
|
||||
# Tone selection with warning for basic version
|
||||
self.tone_var = tk.StringVar(value=self.current_tone_name)
|
||||
tone_options = ["None"] + list(self.tone_presets.keys())
|
||||
tone_label = ttk.Label(voice_frame, text="Tone Preset:", width=label_width)
|
||||
tone_label.grid(column=0, row=2, sticky=tk.W, pady=(0, 5))
|
||||
tone_label.grid(column=0, row=3, sticky=tk.W, pady=(0, 5))
|
||||
self.tone_menu = ttk.OptionMenu(voice_frame, self.tone_var, self.tone_var.get(), *tone_options, command=self.on_tone_change)
|
||||
self.tone_menu.grid(column=1, row=2, sticky="ew", pady=(0, 5))
|
||||
self.tone_menu.grid(column=1, row=3, sticky="ew", pady=(0, 5))
|
||||
self.tone_menu.config(width=dropdown_width, style='Compact.TMenubutton')
|
||||
|
||||
|
||||
# Check if we should disable tone menu based on voice type
|
||||
if self.voice_var.get().startswith("[System]"):
|
||||
self.tone_menu.state(['disabled'])
|
||||
self.tone_var.set("None")
|
||||
|
||||
|
||||
# Add warning label for basic version
|
||||
if not self.has_api_key:
|
||||
warning_label = ttk.Label(voice_frame,
|
||||
text="⚠️ Basic Version - Add API Key in Settings for full features",
|
||||
warning_label = ttk.Label(voice_frame,
|
||||
text="⚠️ Basic Version - Add API Key in Settings for full features",
|
||||
foreground="orange",
|
||||
font=("Arial", 8, "italic"))
|
||||
warning_label.grid(column=0, row=3, columnspan=2, sticky=tk.W, pady=(5, 0))
|
||||
warning_label.grid(column=0, row=4, columnspan=2, sticky=tk.W, pady=(5, 0))
|
||||
|
||||
# Separator between Voice Settings and Device Settings
|
||||
separator = ttk.Separator(main_frame, orient='horizontal')
|
||||
@@ -774,6 +923,15 @@ class TextToMic(tk.Tk):
|
||||
return Path(filename) # Default to current directory for non-macOS systems
|
||||
|
||||
|
||||
def generate_edge_tts_audio(self, text, voice, output_file):
    """Generate audio using Edge-TTS (synchronous wrapper for async function).

    Args:
        text: Text to synthesize.
        voice: Edge-TTS voice passed to ``edge_tts.Communicate``.
        output_file: Path the synthesized audio is saved to.
    """
    async def _generate():
        communicate = edge_tts.Communicate(text, voice)
        await communicate.save(output_file)

    # asyncio.run creates and closes its own event loop for this call.
    asyncio.run(_generate())
|
||||
|
||||
def submit_text(self, play_text = None):
|
||||
print(f"submit text self recording: {self.recording}")
|
||||
if self.recording:
|
||||
@@ -837,24 +995,76 @@ class TextToMic(tk.Tk):
|
||||
messagebox.showerror("TTS Error", f"Failed to generate or play system voice: {str(e)}")
|
||||
|
||||
else:
|
||||
# Use OpenAI TTS
|
||||
# Check if using Edge-TTS (doesn't require API key)
|
||||
selected_tts_model_display = self.tts_model_var.get() if hasattr(self, 'tts_model_var') else ""
|
||||
selected_tts_model = "gpt-4o-mini-tts" # Default
|
||||
|
||||
# Find the model ID from display name
|
||||
available_models = self.get_available_tts_models()
|
||||
for model_id, display_name in available_models:
|
||||
if display_name == selected_tts_model_display:
|
||||
selected_tts_model = model_id
|
||||
break
|
||||
|
||||
# Edge-TTS handling
|
||||
if "edge-tts" in selected_tts_model:
|
||||
# Convert device names to indices
|
||||
primary_index = self.available_devices.get(self.device_index.get(), None)
|
||||
secondary_index = self.available_devices.get(self.device_index_2.get(), None) if self.device_index_2.get() != "None" else None
|
||||
|
||||
if primary_index is None:
|
||||
messagebox.showerror("Error", "Primary device not selected or unavailable.")
|
||||
return
|
||||
|
||||
try:
|
||||
print(f"[DEBUG] Using Edge-TTS with voice: {selected_voice}")
|
||||
|
||||
# Generate speech using Edge-TTS (outputs MP3)
|
||||
temp_mp3_filename = "temp_edge_tts_output.mp3"
|
||||
temp_wav_filename = "temp_edge_tts_output.wav"
|
||||
self.generate_edge_tts_audio(text, selected_voice, temp_mp3_filename)
|
||||
|
||||
# Convert MP3 to WAV for playback (play_audio_multiplexed requires WAV format)
|
||||
print(f"[DEBUG] Converting MP3 to WAV...")
|
||||
audio = AudioSegment.from_mp3(temp_mp3_filename)
|
||||
audio.export(temp_wav_filename, format="wav")
|
||||
|
||||
# Store as last audio file for replay
|
||||
self.last_audio_file = temp_wav_filename
|
||||
|
||||
# Play the generated audio
|
||||
if primary_index and secondary_index != "None" and secondary_index is not None:
|
||||
self.play_audio_multiplexed([temp_wav_filename, temp_wav_filename],
|
||||
[primary_index, secondary_index])
|
||||
else:
|
||||
self.play_audio_multiplexed([temp_wav_filename],
|
||||
[primary_index])
|
||||
|
||||
except Exception as e:
|
||||
print(f"[ERROR] Edge-TTS error: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
messagebox.showerror("Edge-TTS Error", f"Failed to generate audio: {str(e)}")
|
||||
return
|
||||
|
||||
# Use OpenAI TTS (or compatible APIs like SiliconFlow)
|
||||
if not self.has_api_key:
|
||||
messagebox.showerror("API Key Required",
|
||||
messagebox.showerror("API Key Required",
|
||||
"An OpenAI API Key is required for speech to text or to use OpenAI voices.\n\n"
|
||||
"Please add your API key in Settings.\n\n"
|
||||
"Note: You can still use text to speech with the system voices only.")
|
||||
"Note: You can still use text to speech with the system voices or Edge-TTS.")
|
||||
return
|
||||
|
||||
|
||||
# Check if a tone preset is selected and add it to the text
|
||||
selected_tone_name = self.tone_var.get()
|
||||
|
||||
|
||||
# Get the actual tone instructions from the tone_presets dictionary
|
||||
tone_instructions = None
|
||||
if selected_tone_name != "None" and selected_tone_name in self.tone_presets:
|
||||
tone_instructions = self.tone_presets[selected_tone_name]
|
||||
else:
|
||||
tone_instructions = "" # Empty string if "None" or not found
|
||||
|
||||
|
||||
# Convert device names to indices
|
||||
primary_index = self.available_devices.get(self.device_index.get(), None)
|
||||
secondary_index = self.available_devices.get(self.device_index_2.get(), None) if self.device_index_2.get() != "None" else None
|
||||
@@ -862,11 +1072,25 @@ class TextToMic(tk.Tk):
|
||||
if primary_index is None:
|
||||
messagebox.showerror("Error", "Primary device not selected or unavailable.")
|
||||
return
|
||||
|
||||
|
||||
try:
|
||||
print(f"[DEBUG] Selected TTS model display: {selected_tts_model_display}")
|
||||
print(f"[DEBUG] Using TTS model ID: {selected_tts_model}")
|
||||
print(f"[DEBUG] Selected voice: {selected_voice}")
|
||||
|
||||
# For SiliconFlow CosyVoice2-0.5B model, format voice as model:voice
|
||||
# Example: FunAudioLLM/CosyVoice2-0.5B:alex
|
||||
voice_to_use = selected_voice
|
||||
if "CosyVoice2" in selected_tts_model and self.api_base_url and "siliconflow" in self.api_base_url.lower():
|
||||
voice_to_use = f"{selected_tts_model}:{selected_voice}"
|
||||
print(f"[DEBUG] Formatted voice for CosyVoice2: {voice_to_use}")
|
||||
|
||||
print(f"[DEBUG] API call - Model: {selected_tts_model}, Voice: {voice_to_use}")
|
||||
print(f"[DEBUG] API Base URL: {self.api_base_url if self.api_base_url else 'OpenAI Default'}")
|
||||
|
||||
response = self.client.audio.speech.create(
|
||||
model="gpt-4o-mini-tts",
|
||||
voice=selected_voice,
|
||||
model=selected_tts_model,
|
||||
voice=voice_to_use,
|
||||
input=text,
|
||||
instructions=tone_instructions,
|
||||
response_format='wav'
|
||||
@@ -1573,13 +1797,13 @@ class TextToMic(tk.Tk):
|
||||
if self.has_api_key:
|
||||
# Add OpenAI voices
|
||||
voices.extend(['alloy', 'ash', 'ballad', 'coral', 'echo', 'fable', 'onyx', 'nova', 'sage', 'shimmer'])
|
||||
|
||||
|
||||
# Add system voices with [System] prefix
|
||||
try:
|
||||
if hasattr(self, 'system_voices') and self.system_voices:
|
||||
for voice in self.system_voices:
|
||||
voices.append(f"[System] {voice.name}")
|
||||
|
||||
|
||||
# If no system voices were found, add a default system voice
|
||||
if not voices:
|
||||
voices.append("[System] Default")
|
||||
@@ -1588,21 +1812,241 @@ class TextToMic(tk.Tk):
|
||||
# Ensure we have at least one voice option
|
||||
if not voices:
|
||||
voices.append("[System] Default")
|
||||
|
||||
|
||||
return voices
|
||||
|
||||
def get_available_tts_models(self):
    """Get list of available TTS models based on the current API base URL.

    Edge-TTS is always listed first. The remaining entries depend on
    whether ``self.api_base_url`` contains ``"siliconflow"``
    (case-insensitive). A missing or empty ``api_base_url`` is treated as
    the OpenAI default.

    Returns:
        A list of ``(model_id, display_name)`` tuples.
    """
    # Edge-TTS is always available as a free option
    models = [("edge-tts", "Edge-TTS (Free, No API Key)")]

    base_url = getattr(self, 'api_base_url', '') or ''
    if "siliconflow" in base_url.lower():
        # SiliconFlow TTS models
        models += [
            ("FunAudioLLM/CosyVoice2-0.5B", "CosyVoice2-0.5B (Multi-language, Emotional)"),
            ("tts-1", "TTS-1 (OpenAI Compatible)"),
            ("tts-1-hd", "TTS-1 HD (OpenAI Compatible)"),
        ]
    else:
        # OpenAI TTS models
        models += [
            ("gpt-4o-mini-tts", "GPT-4o Mini TTS (Recommended)"),
            ("tts-1", "TTS-1 (Standard)"),
            ("tts-1-hd", "TTS-1 HD (High Quality)"),
        ]
    return models
|
||||
|
||||
def get_siliconflow_voices(self):
    """Return the voice names offered for SiliconFlow's CosyVoice2-0.5B model.

    A new list is created on every call, so callers may extend the result
    without affecting later calls.
    """
    builtin_voices = (
        'alex', 'anna', 'bella', 'benjamin',
        'charles', 'claire', 'david', 'diana',
    )
    return list(builtin_voices)
|
||||
|
||||
def get_edge_tts_voices(self, *, cache_file="edge_tts_voices_cache.json", cache_max_age_days=7):
    """Return Edge-TTS voice short names (IDs), using a local cache when possible.

    Lookup order:
      1. The JSON cache file, if it exists, is younger than
         ``cache_max_age_days`` and holds a non-empty list of strings.
      2. A live query via ``edge_tts.list_voices()``. A non-empty result is
         sorted (``en-*`` voices first, each group alphabetical) and written
         back to the cache.
      3. A built-in fallback list, returned in its fixed order, when the
         query fails or yields no voices.

    Args:
        cache_file: Path of the JSON cache file. The default is a relative
            path, so it resolves against the current working directory.
        cache_max_age_days: Age in days after which the cache is ignored.

    Returns:
        A list of voice short-name strings. A fresh list object is returned
        on each call.
    """
    import json  # kept function-local, as in the original implementation

    max_age_seconds = cache_max_age_days * 24 * 60 * 60

    def load_cached_voices():
        """Return the cached voice list, or None if missing, stale or malformed."""
        try:
            if not os.path.exists(cache_file):
                return None
            cache_age = time.time() - os.path.getmtime(cache_file)
            if cache_age >= max_age_seconds:
                return None
            with open(cache_file, 'r', encoding='utf-8') as f:
                cached = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both JSON syntax errors and bad UTF-8.
            print(f"[DEBUG] Failed to load cached voices: {e}")
            return None

        # Callers append to the result, so anything other than a list of
        # strings (e.g. a hand-edited or truncated file) must be rejected.
        if not isinstance(cached, list) or not all(isinstance(v, str) for v in cached):
            print("[DEBUG] Failed to load cached voices: unexpected cache format")
            return None

        print(f"[DEBUG] Using cached Edge-TTS voices ({len(cached)} voices, {int(cache_age / 86400)} days old)")
        return cached

    def save_cached_voices(voices):
        """Best-effort write of the voice list to the cache file."""
        try:
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(voices, f, ensure_ascii=False, indent=2)
            print(f"[DEBUG] Cached {len(voices)} Edge-TTS voices")
        except (OSError, TypeError, ValueError) as e:
            print(f"[DEBUG] Failed to cache voices: {e}")

    async def _fetch_voices():
        """Query Edge-TTS and return the voice IDs it reports."""
        voices = await edge_tts.list_voices()
        names = []
        for v in voices:
            # Prefer 'ShortName' (the voice ID); fall back to 'Name' lazily so
            # a record lacking 'Name' does not abort the whole fetch.
            name = v.get('ShortName') or v.get('Name')
            if isinstance(name, str) and name:
                names.append(name)
        return names

    # Comprehensive fallback list with common voices
    fallback_voices = [
        # Chinese voices
        'zh-CN-XiaoxiaoNeural', 'zh-CN-XiaoyiNeural', 'zh-CN-YunyangNeural',
        'zh-TW-YunJheNeural', 'zh-TW-HsiaoChenNeural', 'zh-TW-HsiaoyuNeural',
        # English US
        'en-US-AriaNeural', 'en-US-GuyNeural', 'en-US-JennyNeural',
        'en-US-AnaNeural', 'en-US-ChristopherNeural', 'en-US-EricNeural',
        # English UK
        'en-GB-SoniaNeural', 'en-GB-ThomasNeural', 'en-GB-EmmaNeural',
        'en-GB-LibbyNeural', 'en-GB-RyanNeural',
        # Other English
        'en-AU-NatashaNeural', 'en-AU-WilliamNeural',
        'en-CA-ClaraNeural', 'en-CA-LiamNeural',
        'en-IN-NeerjaNeural', 'en-IN-PrabhatNeural',
        # Japanese
        'ja-JP-NanamiNeural', 'ja-JP-KeitaNeural',
        # Korean
        'ko-KR-SunHiNeural', 'ko-KR-InJoonNeural',
        # Spanish
        'es-ES-ElviraNeural', 'es-MX-DaliaNeural',
        # French
        'fr-FR-DeniseNeural', 'fr-CA-SylvieNeural',
        # German
        'de-DE-KatjaNeural', 'de-DE-ConradNeural',
    ]

    # 1. Cache (an empty cached list is treated as a miss)
    cached_voices = load_cached_voices()
    if cached_voices:
        return cached_voices

    # 2. Network. Broad catch is deliberate: this is a best-effort boundary
    #    and any failure (network, library, running event loop) should fall
    #    back to the static list instead of breaking the UI.
    try:
        all_voices = asyncio.run(_fetch_voices())
    except Exception as e:
        print(f"[ERROR] Failed to fetch Edge-TTS voices from network: {e}")
        print(f"[DEBUG] Using comprehensive fallback voice list")
        return fallback_voices

    if not all_voices:
        # Do not cache or return an empty result; the dropdown needs entries.
        print("[ERROR] Failed to fetch Edge-TTS voices from network: no voices returned")
        print(f"[DEBUG] Using comprehensive fallback voice list")
        return fallback_voices

    # Sort voices: English voices first, then alphabetically
    english_voices = sorted(v for v in all_voices if v.startswith('en-'))
    other_voices = sorted(v for v in all_voices if not v.startswith('en-'))
    result = english_voices + other_voices

    # 3. Save to cache for next time
    save_cached_voices(result)
    return result
|
||||
|
||||
def update_available_voices(self):
    """Refresh the voice dropdown to match the currently selected TTS model.

    The model's display name (from ``self.tts_model_var``) is mapped back to
    its model ID via ``get_available_tts_models()``. Edge-TTS and SiliconFlow
    CosyVoice2 get their own voice lists, extended with "[System] ..." entries
    for ``self.system_voices``; every other case uses
    ``get_available_voices()``. When ``self.voice_menu`` exists, its values
    are replaced, and the current selection is reset to a model-specific
    default only if it is empty or a "[System]" voice missing from the list.
    Any other non-listed value is treated as a custom voice and kept.
    """
    shown_model = self.tts_model_var.get() if hasattr(self, 'tts_model_var') else ""

    # Resolve the display name back to the model ID (first match wins).
    tts_model_id = next(
        (candidate for candidate, label in self.get_available_tts_models() if label == shown_model),
        None,
    )

    uses_edge = bool(tts_model_id) and "edge-tts" in tts_model_id
    uses_cosyvoice = (
        not uses_edge
        and bool(tts_model_id)
        and "CosyVoice2" in tts_model_id
        and bool(self.api_base_url)
        and "siliconflow" in self.api_base_url.lower()
    )

    if uses_edge or uses_cosyvoice:
        if uses_edge:
            voices = self.get_edge_tts_voices()
            print(f"[DEBUG] Using Edge-TTS voices: {voices}")
        else:
            voices = self.get_siliconflow_voices()
            print(f"[DEBUG] Using SiliconFlow CosyVoice voices: {voices}")

        # Both providers also expose the local system voices.
        if hasattr(self, 'system_voices') and self.system_voices:
            voices.extend(f"[System] {voice.name}" for voice in self.system_voices)
        if not voices:
            voices.append("[System] Default")
    else:
        voices = self.get_available_voices()
        print("[DEBUG] Using standard voices")

    # Nothing more to do until the voice dropdown (a Combobox) exists.
    if not hasattr(self, 'voice_menu'):
        return

    current_voice = self.voice_var.get()
    self.voice_menu['values'] = voices

    # A non-empty value without the "[System]" prefix counts as a custom voice.
    is_custom_voice = bool(current_voice) and not current_voice.startswith("[System]")
    if current_voice in voices or is_custom_voice:
        print(f"[DEBUG] Voice kept as: {current_voice}")
        return

    # Preferred default per model family; otherwise the first entry.
    if tts_model_id and "edge-tts" in tts_model_id:
        preferred = "zh-CN-XiaoxiaoNeural"
    elif tts_model_id and "CosyVoice" in tts_model_id:
        preferred = "anna"
    else:
        preferred = None

    if preferred is not None and preferred in voices:
        default_voice = preferred
    elif voices:
        default_voice = voices[0]
    else:
        default_voice = ""

    self.voice_var.set(default_voice)
    print(f"[DEBUG] Voice changed to: {default_voice}")
|
||||
|
||||
def on_voice_change(self, *args):
    """Enable or disable the tone selector after the voice selection changes.

    The tone menu is disabled, and ``self.tone_var`` reset to "None", when
    the selected voice is a "[System]" voice or when the selected TTS model's
    display name contains "edge-tts" (case-insensitive). Otherwise the tone
    menu is enabled.

    Args:
        *args: Ignored; accepted so the method can be used as a variable-trace
            or event callback.
    """
    selected_voice = self.voice_var.get()
    is_system_voice = selected_voice.startswith("[System]")

    # Check if using Edge-TTS model
    selected_tts_model_display = self.tts_model_var.get() if hasattr(self, 'tts_model_var') else ""
    is_edge_tts = "edge-tts" in selected_tts_model_display.lower()

    # A single combined check replaces the stale `if is_system_voice:` guard
    # left over from the previous version; that guard would have skipped the
    # Edge-TTS case and the re-enable branch for ordinary voices.
    # Disable tone for system voices and Edge-TTS (neither support custom tone instructions)
    if is_system_voice or is_edge_tts:
        self.tone_menu.state(['disabled'])
        self.tone_var.set("None")
    else:
        self.tone_menu.state(['!disabled'])
|
||||
|
||||
def on_voice_exit(self, combobox):
    """Validate the typed voice when the voice combobox loses focus.

    Surrounding whitespace is removed from the entered value. If nothing
    remains, the first entry of ``self.voice_menu['values']`` is selected
    when one is available. The combobox is then switched back to read-only
    and ``on_voice_change()`` is called once to refresh the tone menu.

    Args:
        combobox: The widget to return to the "readonly" state; it must
            provide ``config(state=...)``.
    """
    raw_voice = self.voice_var.get()
    entered_voice = raw_voice.strip()

    # If empty, fall back to the first available voice
    if not entered_voice and hasattr(self, 'voice_menu'):
        values = self.voice_menu['values']
        if values:
            entered_voice = values[0]

    # Store the cleaned value so a padded name such as " alloy " is not used
    # verbatim as a voice ID later on.
    if entered_voice != raw_voice:
        self.voice_var.set(entered_voice)

    # Switch back to readonly state
    combobox.config(state="readonly")

    # One refresh is enough; it reads the final value of voice_var.
    self.on_voice_change()
|
||||
|
||||
def on_tts_model_change(self, *args):
    """React to a new selection in the TTS model dropdown.

    The selected display name is mapped to its model ID using
    ``get_available_tts_models()``. A resolved ID is stored under the
    "tts_model" settings key via ``SettingsManager.update_settings``; an
    unresolved name only logs a warning. In both cases the voice list is
    refreshed afterwards with ``update_available_voices()``.

    Args:
        *args: Ignored; accepted so the method can serve as a trace/event
            callback.
    """
    selected_model_display = self.tts_model_var.get()

    # First model whose display name matches the selection, else None.
    model_id = next(
        (
            candidate
            for candidate, label in self.get_available_tts_models()
            if label == selected_model_display
        ),
        None,
    )

    if not model_id:
        print(f"[DEBUG] Warning: Could not find model ID for display name: {selected_model_display}")
    else:
        # Persist the choice
        SettingsManager.update_settings({"tts_model": model_id})
        print(f"[DEBUG] TTS model changed to: {model_id}")  # Debug logging

    # The voice list depends on the selected model
    self.update_available_voices()
|
||||
|
||||
def update_window_size(self):
|
||||
"""Update window size based on current banner and presets state."""
|
||||
# Calculate a width that preserves the current width if it's larger than default
|
||||
|
||||
Reference in New Issue
Block a user