From 9e87a029e61c9d5beebad6b4e4f0ddebb883eee7 Mon Sep 17 00:00:00 2001 From: yuzhiheng Date: Thu, 29 Jan 2026 17:46:13 +0800 Subject: [PATCH 01/12] =?UTF-8?q?feat:=20=E6=96=B0=E5=A2=9E=E9=9F=B3?= =?UTF-8?q?=E8=89=B2=E8=AE=BE=E8=AE=A1=E5=8A=9F=E8=83=BD=EF=BC=8C=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E9=9F=B3=E8=89=B2=E5=88=9B=E5=BB=BA=E3=80=81=E7=AE=A1?= =?UTF-8?q?=E7=90=86=E4=B8=8E=E8=AF=AD=E9=9F=B3=E5=90=88=E6=88=90=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/.env.example | 6 +- backend/services/tts_service.py | 73 ++-- backend/services/voice_clone_service.py | 36 +- backend/services/voice_design_service.py | 53 ++- frontend/.env.example | 9 + frontend/package-lock.json | 9 +- frontend/src/router/index.js | 6 + frontend/src/views/Home.vue | 25 +- frontend/src/views/OfficialVoice.vue | 470 +++++++++++++++++++++++ frontend/src/views/VoiceDesign.vue | 51 ++- 10 files changed, 624 insertions(+), 114 deletions(-) create mode 100644 frontend/.env.example create mode 100644 frontend/src/views/OfficialVoice.vue diff --git a/backend/.env.example b/backend/.env.example index f1a4e8e..388b903 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -1,7 +1,9 @@ -# DashScope API Key (必填) +# Qwen3-TTS 环境,可选 aliyun: 阿里云,local: 本地 +VITE_QWEN3_TTS_ENV="aliyun" + +# DashScope API Key (环境为 aliyun 时必填) DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxx # 配置说明: # 1. 请将此文件复制为 .env 并填写真实的 API Key # 2. 
更多配置选项可在代码中通过前端界面设置 - diff --git a/backend/services/tts_service.py b/backend/services/tts_service.py index de84112..be3f56e 100644 --- a/backend/services/tts_service.py +++ b/backend/services/tts_service.py @@ -1,11 +1,9 @@ import os -import json import base64 -import asyncio import threading import queue from dotenv import load_dotenv - + load_dotenv() @@ -20,31 +18,33 @@ class TTSService: def __init__(self): self.active_connections = {} self.active_tts = {} - + async def connect(self, websocket, message): voice_type = message.get("voice_type", "design") voice_name = message.get("voice_name") websocket_url = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime" - + if voice_type == "design" and voice_name: model = "qwen3-tts-vd-realtime-2025-12-16" elif voice_type == "clone" and voice_name: model = "qwen3-tts-vc-realtime-2025-11-27" + elif voice_type == "official": + model = "qwen3-tts-flash-realtime-2025-11-27" else: model = "qwen3-tts-flash" - + self.active_connections[websocket] = { "model": model, "voice_name": voice_name, "websocket_url": websocket_url, "event_queue": queue.Queue() } - + await websocket.send_json({ "type": "connected", "message": "WebSocket连接成功" }) - + async def synthesize(self, websocket, message): if websocket not in self.active_connections: await websocket.send_json({ @@ -52,17 +52,17 @@ async def synthesize(self, websocket, message): "message": "请先连接" }) return - + config = self.active_connections[websocket] text = message.get("text") - + if not text: await websocket.send_json({ "type": "error", "message": "请输入文本" }) return - + api_key = os.getenv("DASHSCOPE_API_KEY") if not api_key: await websocket.send_json({ @@ -70,23 +70,26 @@ async def synthesize(self, websocket, message): "message": "未配置API Key" }) return - + dashscope.api_key = api_key - + try: event_queue = config["event_queue"] - + class WebSocketCallback(QwenTtsRealtimeCallback): def __init__(self, ws, queue): self.ws = ws self.queue = queue - + def on_open(self): pass - + def 
on_close(self, close_status_code, close_msg): - pass - + self.queue.put({ + "type": "error", + "message": f"连接异常关闭 ({close_status_code}): {close_msg}" + }) + def on_event(self, response): try: event_type = response.get('type', '') @@ -100,21 +103,29 @@ def on_event(self, response): self.queue.put({"type": "done"}) elif event_type == 'session.finished': self.queue.put({"type": "finished"}) + elif event_type == 'error': + self.queue.put({ + "type": "error", + "message": response.get('error').get('message') + }) except Exception as e: - print(f"回调事件处理异常: {e}") - + self.queue.put({"type": "error", "message": str(e)}) + + def on_error(self, message): + self.queue.put({"type": "error", "message": message}) + def run_tts(): try: callback = WebSocketCallback(websocket, event_queue) - + qwen_tts_realtime = QwenTtsRealtime( model=config["model"], callback=callback, url=config["websocket_url"] ) - + qwen_tts_realtime.connect() - + if config["voice_name"]: qwen_tts_realtime.update_session( voice=config["voice_name"], @@ -126,37 +137,37 @@ def run_tts(): response_format=AudioFormat.PCM_24000HZ_MONO_16BIT, mode='server_commit' ) - + qwen_tts_realtime.append_text(text) qwen_tts_realtime.finish() except Exception as e: event_queue.put({"type": "error", "message": str(e)}) - + thread = threading.Thread(target=run_tts) thread.start() - + await websocket.send_json({ "type": "started" }) - + while True: try: event = event_queue.get(timeout=60) await websocket.send_json(event) - + if event.get("type") in ["finished", "error"]: break except queue.Empty: break - + thread.join(timeout=5) - + except Exception as e: await websocket.send_json({ "type": "error", "message": str(e) }) - + async def close(self, websocket): if websocket in self.active_connections: del self.active_connections[websocket] diff --git a/backend/services/voice_clone_service.py b/backend/services/voice_clone_service.py index 659c657..2959c88 100644 --- a/backend/services/voice_clone_service.py +++ 
b/backend/services/voice_clone_service.py @@ -13,17 +13,17 @@ def __init__(self): self.api_key = os.getenv("DASHSCOPE_API_KEY") self.storage = VoiceStorage("data/cloned_voices.json") self.customization_url = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization" - + self.target_model = "qwen3-tts-vc-realtime-2025-11-27" - + def clone_voice(self, audio_file, preferred_name=None, display_name=None): if not self.api_key: raise ValueError("未找到API Key,请先配置") - + file_path = Path(audio_file) if not file_path.exists(): raise FileNotFoundError(f"音频文件不存在: {audio_file}") - + with open(file_path, "rb") as f: header = f.read(44) if len(header) < 44: @@ -37,16 +37,16 @@ def clone_voice(self, audio_file, preferred_name=None, display_name=None): duration = data_size / (sample_rate * channels * (bits_per_sample // 8)) if duration < 1.0: raise ValueError("音频过短,请上传至少1秒的音频") - + audio_data = file_path.read_bytes() base64_str = base64.b64encode(audio_data).decode() data_uri = f"data:audio/wav;base64,{base64_str}" - + headers = { "Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json" } - + payload = { "model": "qwen-voice-enrollment", "input": { @@ -56,7 +56,7 @@ def clone_voice(self, audio_file, preferred_name=None, display_name=None): "audio": {"data": data_uri, "format": "wav"} } } - + try: response = requests.post( self.customization_url, @@ -66,31 +66,31 @@ def clone_voice(self, audio_file, preferred_name=None, display_name=None): ) print(f"克隆接口状态码: {response.status_code}") print(f"克隆接口返回: {response.text[:500]}") - + if response.status_code != 200: try: error_data = response.json() error_code = error_data.get("code", "") error_msg = error_data.get("message", "") - + if "Audio.PreprocessError" in error_code or "No segments meet" in error_msg: raise ValueError("音频有效时长不足,请确保录音时长超过5秒且声音清晰(去除静音后需满足时长要求)") - + raise ValueError(f"克隆失败: {error_msg}") except json.JSONDecodeError: response.raise_for_status() - + result = response.json() - + 
voice_name = result["output"]["voice"] - + self.storage.add_voice( voice_name=voice_name, description="录音克隆", display_name=display_name or preferred_name or voice_name, audio_file=audio_file ) - + return { "voice_name": voice_name, "description": "录音克隆", @@ -98,17 +98,17 @@ def clone_voice(self, audio_file, preferred_name=None, display_name=None): "audio_file": audio_file, "created_at": result.get("created_at", "") } - + except ValueError as e: raise e except requests.exceptions.RequestException as e: raise Exception(f"网络请求失败: {e}") except Exception as e: raise Exception(f"发生错误: {e}") - + def list_voices(self): voices = self.storage.list_voices() return voices - + def delete_voice(self, voice_name): return self.storage.delete_voice(voice_name) diff --git a/backend/services/voice_design_service.py b/backend/services/voice_design_service.py index e20e7ee..48adec8 100644 --- a/backend/services/voice_design_service.py +++ b/backend/services/voice_design_service.py @@ -1,5 +1,4 @@ import os -import json import base64 import requests from dotenv import load_dotenv @@ -12,18 +11,18 @@ def __init__(self): self.api_key = os.getenv("DASHSCOPE_API_KEY") self.storage = VoiceStorage("data/voices.json") self.voice_design_url = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization" - + self.target_model = "qwen3-tts-vd-realtime-2025-12-16" - + def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的声音。", preferred_name=None, display_name=None): if not self.api_key: raise ValueError("未找到API Key,请先配置") - + headers = { "Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json" } - + data = { "model": "qwen-voice-design", "input": { @@ -37,24 +36,24 @@ def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的 "response_format": "wav" } } - + if preview_text: data["input"]["preview_text"] = preview_text - + if preferred_name: data["input"]["preferred_name"] = preferred_name - + try: response = requests.post(self.voice_design_url, 
headers=headers, json=data, timeout=60) - + response.raise_for_status() result = response.json() - + if result.get("output"): voice_name = result["output"].get("voice") base64_audio = result["output"]["preview_audio"]["data"] audio_bytes = base64.b64decode(base64_audio) - + # 增加音频增益(放大音量) try: import struct @@ -62,11 +61,11 @@ def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的 if len(audio_bytes) > 44: header = audio_bytes[:44] pcm_data = audio_bytes[44:] - + # 16-bit PCM, Little Endian count = len(pcm_data) // 2 samples = struct.unpack(f"<{count}h", pcm_data) - + gain = 5.0 # 与前端保持一致的增益 new_samples = [] for s in samples: @@ -74,26 +73,26 @@ def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的 if v > 32767: v = 32767 if v < -32768: v = -32768 new_samples.append(v) - + new_pcm_data = struct.pack(f"<{count}h", *new_samples) audio_bytes = header + new_pcm_data print("预览音频增益处理成功") except Exception as e: print(f"预览音频增益处理失败: {e}") - + preview_filename = f"{voice_name}_preview.wav" preview_file = f"previews/{preview_filename}" os.makedirs("previews", exist_ok=True) with open(preview_file, "wb") as f: f.write(audio_bytes) - + self.storage.add_voice( voice_name=voice_name, description=voice_prompt, display_name=display_name or preferred_name or voice_name, preview_file=preview_filename ) - + return { "voice_name": voice_name, "description": voice_prompt, @@ -103,7 +102,7 @@ def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的 } else: raise Exception(f"创建失败: {result}") - + except requests.exceptions.RequestException as e: print(f"网络请求错误: {e}") print(f"响应内容: {response.text if 'response' in locals() else 'N/A'}") @@ -113,28 +112,28 @@ def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的 import traceback traceback.print_exc() raise Exception(f"请求失败: {e}") - + def list_voices(self): voices = self.storage.list_voices() return voices - + def delete_voice(self, voice_name): return self.storage.delete_voice(voice_name) - + def 
optimize_prompt(self, prompt): if not self.api_key: raise ValueError("未找到API Key,请先配置") - + from dashscope import Generation import dashscope - + dashscope.base_http_api_url = 'https://dashscope.aliyuncs.com/api/v1' - + messages = [ {"role": "system", "content": "你是一个专业的音色设计助理,负责将用户简洁的音色描述优化为详细、专业的音色设计提示词。优化后的提示词应该包含模仿对象,说清楚年轻范围,性别特征(比如22岁女性,32岁男主播等),音色特质(如甜美、低沉、磁性等)、情感倾向、语音特点(如语速、语调等)等方面的详细描述,以便生成更符合的AI音色。输出要求:仅输出音色描述文本,无需包含其他解释内容"}, {"role": "user", "content": prompt}, ] - + try: response = Generation.call( api_key=self.api_key, @@ -143,12 +142,12 @@ def optimize_prompt(self, prompt): result_format="message", enable_thinking=False, ) - + if response.status_code == 200: return response.output.choices[0].message.content else: raise Exception(f"优化失败: HTTP {response.status_code}, {response.message}") - + except Exception as e: print(f"优化提示词错误: {e}") raise Exception(f"优化提示词失败: {e}") diff --git a/frontend/.env.example b/frontend/.env.example new file mode 100644 index 0000000..388b903 --- /dev/null +++ b/frontend/.env.example @@ -0,0 +1,9 @@ +# Qwen3-TTS 环境,可选 aliyun: 阿里云,local: 本地 +VITE_QWEN3_TTS_ENV="aliyun" + +# DashScope API Key (环境为 aliyun 时必填) +DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxx + +# 配置说明: +# 1. 请将此文件复制为 .env 并填写真实的 API Key +# 2. 
更多配置选项可在代码中通过前端界面设置 diff --git a/frontend/package-lock.json b/frontend/package-lock.json index e55d47a..f778ce7 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -842,6 +842,7 @@ "resolved": "https://registry.npmjs.org/@types/lodash-es/-/lodash-es-4.17.12.tgz", "integrity": "sha512-0NgftHUcV4v34VhXm8QBSftKVXtbkBG3ViCjs6+eJ5a6y6Mi/jiFGPc1sC7QK+9BFhWrURE3EOggmWaSxL9OzQ==", "license": "MIT", + "peer": true, "dependencies": { "@types/lodash": "*" } @@ -1370,13 +1371,15 @@ "version": "4.17.21", "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==", - "license": "MIT" + "license": "MIT", + "peer": true }, "node_modules/lodash-es": { "version": "4.17.22", "resolved": "https://registry.npmjs.org/lodash-es/-/lodash-es-4.17.22.tgz", "integrity": "sha512-XEawp1t0gxSi9x01glktRZ5HDy0HXqrM0x5pXQM98EaI0NxO6jVM7omDOxsuEo5UIASAnm2bRp1Jt/e0a2XU8Q==", - "license": "MIT" + "license": "MIT", + "peer": true }, "node_modules/lodash-unified": { "version": "1.0.3", @@ -1577,6 +1580,7 @@ "integrity": "sha512-o5a9xKjbtuhY6Bi5S3+HvbRERmouabWbyUcpXXUA1u+GNUKoROi9byOJ8M0nHbHYHkYICiMlqxkg1KkYmm25Sw==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "esbuild": "^0.21.3", "postcss": "^8.4.43", @@ -1636,6 +1640,7 @@ "resolved": "https://registry.npmjs.org/vue/-/vue-3.5.26.tgz", "integrity": "sha512-SJ/NTccVyAoNUJmkM9KUqPcYlY+u8OVL1X5EW9RIs3ch5H2uERxyyIUI4MRxVCSOiEcupX9xNGde1tL9ZKpimA==", "license": "MIT", + "peer": true, "dependencies": { "@vue/compiler-dom": "3.5.26", "@vue/compiler-sfc": "3.5.26", diff --git a/frontend/src/router/index.js b/frontend/src/router/index.js index dee32c9..70cfa70 100644 --- a/frontend/src/router/index.js +++ b/frontend/src/router/index.js @@ -2,6 +2,7 @@ import { createRouter, createWebHistory } from 'vue-router' import Home from '@/views/Home.vue' import VoiceDesign from '@/views/VoiceDesign.vue' 
import VoiceClone from '@/views/VoiceClone.vue' +import OfficialVoice from '@/views/OfficialVoice.vue' const routes = [ { @@ -18,6 +19,11 @@ const routes = [ path: '/voice-clone', name: 'VoiceClone', component: VoiceClone + }, + { + path: '/official-voice', + name: 'OfficialVoice', + component: OfficialVoice } ] diff --git a/frontend/src/views/Home.vue b/frontend/src/views/Home.vue index bed29b6..073718c 100644 --- a/frontend/src/views/Home.vue +++ b/frontend/src/views/Home.vue @@ -4,7 +4,7 @@

元视界AI妙妙屋

魔法语音

- +
🎨
@@ -12,26 +12,31 @@

通过文字描述创造个性化音色

- +
🎤

音色克隆

录制声音并克隆为专属音色

+ +
+
🎙️
+

官方音色

+

使用官方预置的专业音色

+
+
- - + diff --git a/frontend/src/views/VoiceDesign.vue b/frontend/src/views/VoiceDesign.vue index c037bf8..6fa6879 100644 --- a/frontend/src/views/VoiceDesign.vue +++ b/frontend/src/views/VoiceDesign.vue @@ -8,9 +8,9 @@

元视界AI妙妙屋—魔法语音

音色创造
- + - +

创建新音色

@@ -33,21 +33,21 @@
- + - + - +
- +

已创建的音色

@@ -108,7 +108,7 @@
- +

语音合成

@@ -126,7 +126,7 @@ /> - + - + - +
- - + @@ -266,14 +265,14 @@ const optimizePrompt = async () => { ElMessage.warning('请先输入音色描述') return } - + optimizing.value = true - + try { const response = await api.post('/voice-design/optimize-prompt', { prompt: form.value.voice_prompt }) - + form.value.voice_prompt = response.optimized_prompt ElMessage.success('提示词优化成功') } catch (error) { @@ -288,7 +287,7 @@ const createVoice = async () => { ElMessage.warning('请输入音色描述') return } - + try { const payload = { voice_prompt: form.value.voice_prompt, @@ -347,26 +346,26 @@ const synthesize = async () => { ElMessage.warning('请选择音色') return } - + if (!ttsText.value) { ElMessage.warning('请输入文本') return } - + synthesizing.value = true audioUrl.value = '' - + try { const wsUrl = `${location.protocol === 'https:' ? 'wss' : 'ws'}://${location.host}/ws/tts/streaming` const ws = new WebSocket(wsUrl) - + ws.onopen = () => { ws.send(JSON.stringify({ action: 'connect', voice_type: 'design', voice_name: selectedVoice.value })) - + setTimeout(() => { ws.send(JSON.stringify({ action: 'synthesize', @@ -374,12 +373,12 @@ const synthesize = async () => { })) }, 500) } - + let audioChunks = [] - + ws.onmessage = (event) => { const data = JSON.parse(event.data) - + if (data.type === 'audio') { audioChunks.push(data.data) } else if (data.type === 'finished') { @@ -392,12 +391,12 @@ const synthesize = async () => { ws.close() } } - + ws.onerror = () => { ElMessage.error('WebSocket连接失败') synthesizing.value = false } - + } catch (error) { ElMessage.error('语音合成失败: ' + error.message) synthesizing.value = false From 39385f9b843d2404d55ad0c2c084a7ebc00d4425 Mon Sep 17 00:00:00 2001 From: yuzhiheng Date: Fri, 30 Jan 2026 14:44:58 +0800 Subject: [PATCH 02/12] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=AE=9E?= =?UTF-8?q?=E6=97=B6=E8=AF=AD=E9=9F=B3=E5=90=88=E6=88=90=20(TTS)=20?= =?UTF-8?q?=E5=8A=9F=E8=83=BD=EF=BC=8C=E6=94=AF=E6=8C=81=E9=98=BF=E9=87=8C?= =?UTF-8?q?=E4=BA=91=E5=92=8C=E6=9C=AC=E5=9C=B0=E5=8D=83=E9=97=AE3?= 
=?UTF-8?q?=E6=A8=A1=E5=9E=8B=EF=BC=8C=E5=B9=B6=E5=8C=85=E5=90=AB=E9=9F=B3?= =?UTF-8?q?=E8=89=B2=E5=88=9B=E9=80=A0=E3=80=81=E5=85=8B=E9=9A=86=E5=8F=8A?= =?UTF-8?q?=E5=AE=98=E6=96=B9=E9=9F=B3=E8=89=B2=E8=A7=86=E5=9B=BE=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/.env.example | 5 +- backend/api/tts.py | 30 ++++- backend/main.py | 15 ++- backend/services/tts_aliyun.py | 171 +++++++++++++++++++++++++++ backend/services/tts_local.py | 135 +++++++++++++++++++++ backend/services/tts_service.py | 168 +------------------------- frontend/.env.example | 5 +- frontend/src/views/OfficialVoice.vue | 34 ++++-- frontend/src/views/VoiceClone.vue | 67 ++++++----- frontend/src/views/VoiceDesign.vue | 3 - 10 files changed, 408 insertions(+), 225 deletions(-) create mode 100644 backend/services/tts_aliyun.py create mode 100644 backend/services/tts_local.py diff --git a/backend/.env.example b/backend/.env.example index 388b903..3fe962a 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -4,6 +4,5 @@ VITE_QWEN3_TTS_ENV="aliyun" # DashScope API Key (环境为 aliyun 时必填) DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxx -# 配置说明: -# 1. 请将此文件复制为 .env 并填写真实的 API Key -# 2. 
更多配置选项可在代码中通过前端界面设置 +# Huggingface 镜像站 +# HF_ENDPOINT=https://hf-mirror.com diff --git a/backend/api/tts.py b/backend/api/tts.py index fae5405..bffbd95 100644 --- a/backend/api/tts.py +++ b/backend/api/tts.py @@ -1,21 +1,39 @@ from fastapi import APIRouter, WebSocket, WebSocketDisconnect -from services.tts_service import TTSService +import os import json +if os.getenv("VITE_QWEN3_TTS_ENV") == "aliyun": + from services.tts_aliyun import TTSServiceAliyun as TTSService +else: + from services.tts_local import TTSServiceLocal as TTSService + + router = APIRouter() -tts_service = TTSService() +tts_service = None + +def init_tts_service(): + global tts_service + if tts_service is None: + tts_service = TTSService() + return tts_service @router.websocket("/tts/streaming") async def websocket_tts(websocket: WebSocket): await websocket.accept() - + + if tts_service is None: + # Fallback if lifespan didn't run for some reason, + # though lifespan is the preferred way + init_tts_service() + + try: while True: data = await websocket.receive_text() message = json.loads(data) - + action = message.get("action") - + if action == "connect": await tts_service.connect(websocket, message) elif action == "synthesize": @@ -23,7 +41,7 @@ async def websocket_tts(websocket: WebSocket): elif action == "close": await tts_service.close(websocket) break - + except WebSocketDisconnect: print("WebSocket disconnected") except Exception as e: diff --git a/backend/main.py b/backend/main.py index f2a083b..d62035d 100644 --- a/backend/main.py +++ b/backend/main.py @@ -4,13 +4,24 @@ import uvicorn import os from pathlib import Path - +from dotenv import load_dotenv from api import voice_design, voice_clone, tts, utils +from contextlib import asynccontextmanager + + +load_dotenv() + +@asynccontextmanager +async def lifespan(app: FastAPI): + # 在应用启动时初始化 TTS 服务(仅在 worker 进程中运行) + tts.init_tts_service() + yield app = FastAPI( title="元视界AI妙妙屋—声音魔法 API", description="基于千问3 TTS 的音色创造和音色克隆服务", - version="1.0.0" 
+ version="1.0.0", + lifespan=lifespan ) app.add_middleware( diff --git a/backend/services/tts_aliyun.py b/backend/services/tts_aliyun.py new file mode 100644 index 0000000..292bdb4 --- /dev/null +++ b/backend/services/tts_aliyun.py @@ -0,0 +1,171 @@ +import os +import base64 +import threading +import queue + +try: + import dashscope + from dashscope.audio.qwen_tts_realtime import QwenTtsRealtime, QwenTtsRealtimeCallback, AudioFormat + STREAMING_AVAILABLE = True +except ImportError: + STREAMING_AVAILABLE = False + +from .tts_service import TTSServiceBase + + +class TTSServiceAliyun(TTSServiceBase): + def __init__(self): + super().__init__() + + async def connect(self, websocket, message): + voice_type = message.get("voice_type", "official") + voice_name = message.get("voice_name") + websocket_url = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime" + + if voice_type == "design": + model = "qwen3-tts-vd-realtime-2025-12-16" + elif voice_type == "clone": + model = "qwen3-tts-vc-realtime-2025-11-27" + elif voice_type == "official": + model = "qwen3-tts-flash-realtime-2025-11-27" + else: + model = "qwen3-tts-flash" + + self.active_connections[websocket] = { + "model": model, + "voice_name": voice_name, + "websocket_url": websocket_url, + "event_queue": queue.Queue() + } + + await websocket.send_json({ + "type": "connected", + "message": "WebSocket连接成功" + }) + + async def synthesize(self, websocket, message): + if websocket not in self.active_connections: + await websocket.send_json({ + "type": "error", + "message": "请先连接" + }) + return + + config = self.active_connections[websocket] + text = message.get("text") + + if not text: + await websocket.send_json({ + "type": "error", + "message": "请输入文本" + }) + return + + api_key = os.getenv("DASHSCOPE_API_KEY") + if not api_key: + await websocket.send_json({ + "type": "error", + "message": "未配置API Key" + }) + return + + dashscope.api_key = api_key + + try: + event_queue = config["event_queue"] + + class 
WebSocketCallback(QwenTtsRealtimeCallback): + def __init__(self, ws, queue): + self.ws = ws + self.queue = queue + + def on_open(self): + pass + + def on_close(self, close_status_code, close_msg): + self.queue.put({ + "type": "error", + "message": f"连接异常关闭 ({close_status_code}): {close_msg}" + }) + + def on_event(self, response): + try: + event_type = response.get('type', '') + if event_type == 'response.audio.delta': + audio_data = base64.b64decode(response['delta']) + self.queue.put({ + "type": "audio", + "data": base64.b64encode(audio_data).decode() + }) + elif event_type == 'response.done': + self.queue.put({"type": "done"}) + elif event_type == 'session.finished': + self.queue.put({"type": "finished"}) + elif event_type == 'error': + self.queue.put({ + "type": "error", + "message": response.get('error').get('message') + }) + except Exception as e: + self.queue.put({"type": "error", "message": str(e)}) + + def on_error(self, message): + self.queue.put({"type": "error", "message": message}) + + def run_tts(): + try: + callback = WebSocketCallback(websocket, event_queue) + + qwen_tts_realtime = QwenTtsRealtime( + model=config["model"], + callback=callback, + url=config["websocket_url"] + ) + + qwen_tts_realtime.connect() + + if config["voice_name"]: + qwen_tts_realtime.update_session( + voice=config["voice_name"], + response_format=AudioFormat.PCM_24000HZ_MONO_16BIT, + mode='server_commit' + ) + else: + qwen_tts_realtime.update_session( + response_format=AudioFormat.PCM_24000HZ_MONO_16BIT, + mode='server_commit' + ) + + qwen_tts_realtime.append_text(text) + qwen_tts_realtime.finish() + except Exception as e: + event_queue.put({"type": "error", "message": str(e)}) + + thread = threading.Thread(target=run_tts) + thread.start() + + await websocket.send_json({ + "type": "started" + }) + + while True: + try: + event = event_queue.get(timeout=60) + await websocket.send_json(event) + + if event.get("type") in ["finished", "error"]: + break + except queue.Empty: + break 
+ + thread.join(timeout=5) + + except Exception as e: + await websocket.send_json({ + "type": "error", + "message": str(e) + }) + + async def close(self, websocket): + if websocket in self.active_connections: + del self.active_connections[websocket] diff --git a/backend/services/tts_local.py b/backend/services/tts_local.py new file mode 100644 index 0000000..57a6b43 --- /dev/null +++ b/backend/services/tts_local.py @@ -0,0 +1,135 @@ +import os +import torch +import threading +import base64 +import numpy as np +from qwen_tts import Qwen3TTSModel +from .tts_service import TTSServiceBase + + +LOCAL_DIR = os.path.dirname(os.path.abspath(__file__)) + +class TTSServiceLocal(TTSServiceBase): + _base_model = None + _custom_model = None + _model_lock = threading.Lock() + + def __init__(self): + super().__init__() + self._ensure_models_loaded() + + def _ensure_models_loaded(self): + from qwen_tts import Qwen3TTSModel + + with TTSServiceLocal._model_lock: + if TTSServiceLocal._base_model is None: + print("正在预加载本地 TTS 模型(单例模式),请稍候...") + + TTSServiceLocal._base_model = Qwen3TTSModel.from_pretrained( + "Qwen/Qwen3-TTS-12Hz-1.7B-Base", + device_map="cuda:0", + dtype=torch.bfloat16, + ) + TTSServiceLocal._custom_model = Qwen3TTSModel.from_pretrained( + "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice", + device_map="cuda:0", + dtype=torch.bfloat16, + ) + print("本地 TTS 模型预加载完成。") + + @property + def base_model(self): + return TTSServiceLocal._base_model + + @property + def custom_model(self): + return TTSServiceLocal._custom_model + + async def connect(self, websocket, message): + print(message) + voice_type = message.get("voice_type", "official") + voice_name = message.get("voice_name") + + # 将连接信息存储在 active_connections 中,而不是全局 self.model, + # 因为可能有多个并发连接 + self.active_connections[websocket] = { + "voice_type": voice_type, + "voice_name": voice_name + } + + await websocket.send_json({ + "type": "connected", + "message": "本地TTS模型连接成功" + }) + + async def synthesize(self, websocket, message): 
+ print(message) + if websocket not in self.active_connections: + await websocket.send_json({"type": "error", "message": "请先连接"}) + return + + conn_info = self.active_connections[websocket] + voice_type = conn_info["voice_type"] + voice_name = conn_info["voice_name"] + + # 同步生成 + try: + if voice_type == "design" or voice_type == "clone": + ref_audio = os.path.join(LOCAL_DIR, "../previews", voice_name + "_preview.wav") + print(ref_audio) + wavs, sr = self.base_model.generate_voice_clone( + text=message.get("text"), + language=message.get("language", "auto"), + ref_audio=ref_audio, + x_vector_only_mode=True, + ) + elif voice_type == "official": + # 这是一个简化的示例,实际生成参数请根据 qwen_tts 的 API 调整 + wavs, sr = self.custom_model.generate_custom_voice( + text=message.get("text"), + language=message.get("language", "auto"), + speaker=voice_name, + instruct=message.get("instruct", ""), + ) + + # 处理音频数据 + audio_data = wavs + if isinstance(audio_data, torch.Tensor): + audio_data = audio_data.cpu().float().numpy() + elif isinstance(audio_data, list): + audio_data = np.array(audio_data) + + # 确保已经是 numpy 数组 + if not isinstance(audio_data, np.ndarray): + audio_data = np.array(audio_data) + + # 确保是 1D 数组 + if audio_data.ndim > 1: + audio_data = audio_data.flatten() + + # 转换为 Int16 PCM + if audio_data.dtype.kind == 'f': + # 裁剪并归一化到 Int16 范围 + audio_data = np.clip(audio_data, -1.0, 1.0) + audio_data = (audio_data * 32767).astype(np.int16) + + # 转换为 Base64 + pcm_data = audio_data.tobytes() + b64_data = base64.b64encode(pcm_data).decode('utf-8') + + # 发送音频数据 + await websocket.send_json({ + "type": "audio", + "data": b64_data + }) + + # 发送完成信号 + await websocket.send_json({ + "type": "finished", + "message": "合成完成" + }) + except Exception as e: + await websocket.send_json({"type": "error", "message": str(e)}) + + async def close(self, websocket): + pass diff --git a/backend/services/tts_service.py b/backend/services/tts_service.py index be3f56e..06877da 100644 --- 
a/backend/services/tts_service.py +++ b/backend/services/tts_service.py @@ -1,173 +1,13 @@ -import os -import base64 -import threading -import queue -from dotenv import load_dotenv - - -load_dotenv() - -try: - import dashscope - from dashscope.audio.qwen_tts_realtime import QwenTtsRealtime, QwenTtsRealtimeCallback, AudioFormat - STREAMING_AVAILABLE = True -except ImportError: - STREAMING_AVAILABLE = False - -class TTSService: +class TTSServiceBase: def __init__(self): self.active_connections = {} self.active_tts = {} async def connect(self, websocket, message): - voice_type = message.get("voice_type", "design") - voice_name = message.get("voice_name") - websocket_url = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime" - - if voice_type == "design" and voice_name: - model = "qwen3-tts-vd-realtime-2025-12-16" - elif voice_type == "clone" and voice_name: - model = "qwen3-tts-vc-realtime-2025-11-27" - elif voice_type == "official": - model = "qwen3-tts-flash-realtime-2025-11-27" - else: - model = "qwen3-tts-flash" - - self.active_connections[websocket] = { - "model": model, - "voice_name": voice_name, - "websocket_url": websocket_url, - "event_queue": queue.Queue() - } - - await websocket.send_json({ - "type": "connected", - "message": "WebSocket连接成功" - }) + raise NotImplementedError async def synthesize(self, websocket, message): - if websocket not in self.active_connections: - await websocket.send_json({ - "type": "error", - "message": "请先连接" - }) - return - - config = self.active_connections[websocket] - text = message.get("text") - - if not text: - await websocket.send_json({ - "type": "error", - "message": "请输入文本" - }) - return - - api_key = os.getenv("DASHSCOPE_API_KEY") - if not api_key: - await websocket.send_json({ - "type": "error", - "message": "未配置API Key" - }) - return - - dashscope.api_key = api_key - - try: - event_queue = config["event_queue"] - - class WebSocketCallback(QwenTtsRealtimeCallback): - def __init__(self, ws, queue): - self.ws = ws - 
self.queue = queue - - def on_open(self): - pass - - def on_close(self, close_status_code, close_msg): - self.queue.put({ - "type": "error", - "message": f"连接异常关闭 ({close_status_code}): {close_msg}" - }) - - def on_event(self, response): - try: - event_type = response.get('type', '') - if event_type == 'response.audio.delta': - audio_data = base64.b64decode(response['delta']) - self.queue.put({ - "type": "audio", - "data": base64.b64encode(audio_data).decode() - }) - elif event_type == 'response.done': - self.queue.put({"type": "done"}) - elif event_type == 'session.finished': - self.queue.put({"type": "finished"}) - elif event_type == 'error': - self.queue.put({ - "type": "error", - "message": response.get('error').get('message') - }) - except Exception as e: - self.queue.put({"type": "error", "message": str(e)}) - - def on_error(self, message): - self.queue.put({"type": "error", "message": message}) - - def run_tts(): - try: - callback = WebSocketCallback(websocket, event_queue) - - qwen_tts_realtime = QwenTtsRealtime( - model=config["model"], - callback=callback, - url=config["websocket_url"] - ) - - qwen_tts_realtime.connect() - - if config["voice_name"]: - qwen_tts_realtime.update_session( - voice=config["voice_name"], - response_format=AudioFormat.PCM_24000HZ_MONO_16BIT, - mode='server_commit' - ) - else: - qwen_tts_realtime.update_session( - response_format=AudioFormat.PCM_24000HZ_MONO_16BIT, - mode='server_commit' - ) - - qwen_tts_realtime.append_text(text) - qwen_tts_realtime.finish() - except Exception as e: - event_queue.put({"type": "error", "message": str(e)}) - - thread = threading.Thread(target=run_tts) - thread.start() - - await websocket.send_json({ - "type": "started" - }) - - while True: - try: - event = event_queue.get(timeout=60) - await websocket.send_json(event) - - if event.get("type") in ["finished", "error"]: - break - except queue.Empty: - break - - thread.join(timeout=5) - - except Exception as e: - await websocket.send_json({ - "type": 
"error", - "message": str(e) - }) + raise NotImplementedError async def close(self, websocket): - if websocket in self.active_connections: - del self.active_connections[websocket] + raise NotImplementedError diff --git a/frontend/.env.example b/frontend/.env.example index 388b903..3fe962a 100644 --- a/frontend/.env.example +++ b/frontend/.env.example @@ -4,6 +4,5 @@ VITE_QWEN3_TTS_ENV="aliyun" # DashScope API Key (环境为 aliyun 时必填) DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxx -# 配置说明: -# 1. 请将此文件复制为 .env 并填写真实的 API Key -# 2. 更多配置选项可在代码中通过前端界面设置 +# Huggingface 镜像站 +# HF_ENDPOINT=https://hf-mirror.com diff --git a/frontend/src/views/OfficialVoice.vue b/frontend/src/views/OfficialVoice.vue index 9e7aa1e..68bde6b 100644 --- a/frontend/src/views/OfficialVoice.vue +++ b/frontend/src/views/OfficialVoice.vue @@ -65,12 +65,24 @@

语音合成

- - - {{ selectedVoiceInfo?.icon }} {{ selectedVoiceInfo?.displayName }} - - 未选择 - + + + + + {{ selectedVoiceInfo?.icon }} {{ selectedVoiceInfo?.displayName }} + + 未选择 + + + + + + + + { setTimeout(() => { ws.send(JSON.stringify({ action: 'synthesize', - text: ttsText.value + text: ttsText.value, + instruct: ttsInstruct.value })) }, 500) } diff --git a/frontend/src/views/VoiceClone.vue b/frontend/src/views/VoiceClone.vue index 3ce05d1..0128451 100644 --- a/frontend/src/views/VoiceClone.vue +++ b/frontend/src/views/VoiceClone.vue @@ -8,9 +8,9 @@

元视界AI妙妙屋—魔法语音

音色克隆
- + - +
- + - + - + - +

音频预览

- +

已克隆的音色

@@ -127,7 +127,7 @@
- +

语音合成

@@ -145,7 +145,7 @@ /> - + - + - +
- - + @@ -395,13 +394,13 @@ const startRecording = async () => { ElMessage.warning('当前为上传模式,请切换到录音克隆') return } - + // 检查是否为安全上下文(录音功能在非安全上下文如 HTTP + IP 地址下不可用) if (window.isSecureContext === false) { ElMessage.error('录音功能受浏览器安全策略限制,请使用 http://localhost:3001 或 https 协议访问') return } - + // 检查浏览器是否支持mediaDevices API if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) { // 处理不支持的情况 @@ -420,11 +419,11 @@ const startRecording = async () => { } return } - + // 现代浏览器支持方式 const stream = await navigator.mediaDevices.getUserMedia({ audio: true }) handleMediaStream(stream) - + } catch (error) { ElMessage.error('无法访问麦克风: ' + error.message) } @@ -434,21 +433,21 @@ const startRecording = async () => { const handleMediaStream = (stream) => { mediaRecorder = new MediaRecorder(stream) audioChunks = [] - + mediaRecorder.ondataavailable = (event) => { audioChunks.push(event.data) } - + mediaRecorder.onstop = () => { recordedBlob.value = new Blob(audioChunks, { type: 'audio/wav' }) recordedUrl.value = URL.createObjectURL(recordedBlob.value) stream.getTracks().forEach(track => track.stop()) } - + mediaRecorder.start() isRecording.value = true remainingTime.value = recordDuration.value - + const timer = setInterval(() => { remainingTime.value-- if (remainingTime.value <= 0) { @@ -470,19 +469,19 @@ const cloneVoice = async () => { ElMessage.warning('请先选择或录制音频') return } - + const seconds = await measureBlobDuration(recordedBlob.value) if (seconds < 1) { ElMessage.error('音频过短,请上传至少1秒的音频') return } - + const formData = new FormData() const wavBlob = await convertToWav(recordedBlob.value) formData.append('audio_file', wavBlob, 'recorded.wav') formData.append('preferred_name', await toSlug(form.value.display_name || form.value.preferred_name)) formData.append('display_name', form.value.display_name || form.value.preferred_name || '') - + try { await cloneVoiceApi(formData) ElMessage.success('声音克隆成功') @@ -517,26 +516,26 @@ const synthesize = async () => { 
ElMessage.warning('请选择音色') return } - + if (!ttsText.value) { ElMessage.warning('请输入文本') return } - + synthesizing.value = true audioUrl.value = '' - + try { const wsUrl = `${location.protocol === 'https:' ? 'wss' : 'ws'}://${location.host}/ws/tts/streaming` const ws = new WebSocket(wsUrl) - + ws.onopen = () => { ws.send(JSON.stringify({ action: 'connect', voice_type: 'clone', voice_name: selectedVoice.value })) - + setTimeout(() => { ws.send(JSON.stringify({ action: 'synthesize', @@ -544,12 +543,12 @@ const synthesize = async () => { })) }, 500) } - + let audioChunks = [] - + ws.onmessage = (event) => { const data = JSON.parse(event.data) - + if (data.type === 'audio') { audioChunks.push(data.data) } else if (data.type === 'finished') { @@ -562,12 +561,12 @@ const synthesize = async () => { ws.close() } } - + ws.onerror = () => { ElMessage.error('WebSocket连接失败') synthesizing.value = false } - + } catch (error) { ElMessage.error('语音合成失败: ' + error.message) synthesizing.value = false diff --git a/frontend/src/views/VoiceDesign.vue b/frontend/src/views/VoiceDesign.vue index 6fa6879..462e825 100644 --- a/frontend/src/views/VoiceDesign.vue +++ b/frontend/src/views/VoiceDesign.vue @@ -180,7 +180,6 @@ const selectedVoice = ref('') const ttsText = ref('') const audioUrl = ref('') const synthesizing = ref(false) -const settingsVisible = ref(false) const audioRefs = ref({}) const optimizing = ref(false) @@ -243,8 +242,6 @@ const goBack = () => { router.push('/') } -const showSettings = () => {} - const toSlug = async (s) => { const isAscii = /^[a-zA-Z0-9\-\s]+$/.test(s || '') if (isAscii) { From 8ad932c483721355400a8c42f7e444cdd67e4eeb Mon Sep 17 00:00:00 2001 From: yuzhiheng Date: Sat, 31 Jan 2026 09:49:40 +0800 Subject: [PATCH 03/12] feat: Add local voice cloning and design services with `ref_text` support and environment-based service selection. 
--- README.md | 9 +++ backend/api/voice_clone.py | 19 ++++-- backend/api/voice_design.py | 9 ++- backend/main.py | 12 ++-- ...voice_clone_service.py => clone_aliyun.py} | 24 +++---- backend/services/clone_local.py | 54 +++++++++++++++ backend/services/clone_service.py | 16 +++++ ...ice_design_service.py => design_aliyun.py} | 20 ++---- backend/services/design_local.py | 68 +++++++++++++++++++ backend/services/design_service.py | 19 ++++++ backend/services/tts_local.py | 19 ++++-- backend/utils/storage.py | 26 +++---- frontend/package-lock.json | 15 ++-- frontend/package.json | 4 +- frontend/src/views/VoiceClone.vue | 34 ++++++++-- frontend/src/views/VoiceDesign.vue | 18 ++++- 16 files changed, 289 insertions(+), 77 deletions(-) rename backend/services/{voice_clone_service.py => clone_aliyun.py} (88%) create mode 100644 backend/services/clone_local.py create mode 100644 backend/services/clone_service.py rename backend/services/{voice_design_service.py => design_aliyun.py} (93%) create mode 100644 backend/services/design_local.py create mode 100644 backend/services/design_service.py diff --git a/README.md b/README.md index 5f4c986..f9a7083 100644 --- a/README.md +++ b/README.md @@ -241,6 +241,15 @@ MIT License ## 更新日志 +### v1.2.0 (2026-01-31) + +- **新增功能**: + - 新增“官方音色”模块,预置多种高质量官方音色。 + - 支持本地模型运行,可在无网络环境下使用基础音色功能。 +- **架构优化**: + - 重构后端服务,支持阿里云 API 与本地模型双引擎切换。 + - 优化项目版本管理,同步版本号至 v1.2.0。 + ### v1.1.0 (2025-12-28) - **功能优化**: diff --git a/backend/api/voice_clone.py b/backend/api/voice_clone.py index 48727c9..43f46d6 100644 --- a/backend/api/voice_clone.py +++ b/backend/api/voice_clone.py @@ -1,7 +1,13 @@ +import os from fastapi import APIRouter, HTTPException, UploadFile, File, Form from pydantic import BaseModel from typing import List, Optional -from services.voice_clone_service import VoiceCloneService + +if os.getenv("VITE_QWEN3_TTS_ENV") == "aliyun": + from services.clone_aliyun import CloneServiceAliyun as VoiceCloneService +else: + from services.clone_local 
import CloneServiceLocal as VoiceCloneService + router = APIRouter() voice_clone_service = VoiceCloneService() @@ -16,30 +22,33 @@ class VoiceResponse(BaseModel): description: str display_name: str audio_file: str + ref_text: str created_at: str @router.post("/clone", response_model=dict) async def clone_voice( audio_file: UploadFile = File(...), preferred_name: Optional[str] = Form(None), - display_name: Optional[str] = Form(None) + display_name: Optional[str] = Form(None), + ref_text: Optional[str] = Form(None) ): try: from pathlib import Path BASE_DIR = Path(__file__).resolve().parent.parent upload_dir = BASE_DIR / "uploads" upload_dir.mkdir(exist_ok=True) - + file_path = upload_dir / audio_file.filename with open(file_path, "wb") as f: content = await audio_file.read() f.write(content) - + try: result = voice_clone_service.clone_voice( audio_file=str(file_path), preferred_name=preferred_name, - display_name=display_name + display_name=display_name, + ref_text=ref_text ) except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) diff --git a/backend/api/voice_design.py b/backend/api/voice_design.py index 6af92bd..bc873bb 100644 --- a/backend/api/voice_design.py +++ b/backend/api/voice_design.py @@ -1,7 +1,13 @@ +import os from fastapi import APIRouter, HTTPException from pydantic import BaseModel from typing import List, Optional -from services.voice_design_service import VoiceDesignService + +if os.getenv("VITE_QWEN3_TTS_ENV") == "aliyun": + from services.design_aliyun import DesignServiceAliyun as VoiceDesignService +else: + from services.design_local import DesignServiceLocal as VoiceDesignService + router = APIRouter() voice_design_service = VoiceDesignService() @@ -17,6 +23,7 @@ class VoiceResponse(BaseModel): description: str display_name: str preview_file: str + ref_text: str created_at: str @router.post("/create", response_model=dict) diff --git a/backend/main.py b/backend/main.py index d62035d..68fe5ad 100644 --- a/backend/main.py +++ 
b/backend/main.py @@ -5,12 +5,14 @@ import os from pathlib import Path from dotenv import load_dotenv -from api import voice_design, voice_clone, tts, utils -from contextlib import asynccontextmanager - +# 加载环境变量必须在导入 api 模块之前,以便 api 模块内部能正确读取配置 load_dotenv() +from api import voice_clone, voice_design, tts, utils +from contextlib import asynccontextmanager + + @asynccontextmanager async def lifespan(app: FastAPI): # 在应用启动时初始化 TTS 服务(仅在 worker 进程中运行) @@ -20,7 +22,7 @@ async def lifespan(app: FastAPI): app = FastAPI( title="元视界AI妙妙屋—声音魔法 API", description="基于千问3 TTS 的音色创造和音色克隆服务", - version="1.0.0", + version="1.2.0", lifespan=lifespan ) @@ -56,7 +58,7 @@ async def test_audio(filename: str): @app.get("/") async def root(): - return {"message": "元视界AI妙妙屋—声音魔法 API", "version": "1.0.0"} + return {"message": "元视界AI妙妙屋—声音魔法 API", "version": "1.2.0"} @app.get("/health") async def health(): diff --git a/backend/services/voice_clone_service.py b/backend/services/clone_aliyun.py similarity index 88% rename from backend/services/voice_clone_service.py rename to backend/services/clone_aliyun.py index 2959c88..79fc959 100644 --- a/backend/services/voice_clone_service.py +++ b/backend/services/clone_aliyun.py @@ -2,21 +2,20 @@ import json import base64 import requests -from dotenv import load_dotenv -from utils.storage import VoiceStorage from pathlib import Path -load_dotenv() +from services.clone_service import CloneServiceBase -class VoiceCloneService: + +class CloneServiceAliyun(CloneServiceBase): def __init__(self): + super().__init__() + self.api_key = os.getenv("DASHSCOPE_API_KEY") - self.storage = VoiceStorage("data/cloned_voices.json") self.customization_url = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization" - self.target_model = "qwen3-tts-vc-realtime-2025-11-27" - def clone_voice(self, audio_file, preferred_name=None, display_name=None): + def clone_voice(self, audio_file, ref_text=None, preferred_name=None, display_name=None): if not 
self.api_key: raise ValueError("未找到API Key,请先配置") @@ -88,7 +87,8 @@ def clone_voice(self, audio_file, preferred_name=None, display_name=None): voice_name=voice_name, description="录音克隆", display_name=display_name or preferred_name or voice_name, - audio_file=audio_file + audio_file=audio_file, + ref_text=ref_text, ) return { @@ -96,6 +96,7 @@ def clone_voice(self, audio_file, preferred_name=None, display_name=None): "description": "录音克隆", "display_name": display_name or preferred_name or voice_name, "audio_file": audio_file, + "ref_text": ref_text, "created_at": result.get("created_at", "") } @@ -105,10 +106,3 @@ def clone_voice(self, audio_file, preferred_name=None, display_name=None): raise Exception(f"网络请求失败: {e}") except Exception as e: raise Exception(f"发生错误: {e}") - - def list_voices(self): - voices = self.storage.list_voices() - return voices - - def delete_voice(self, voice_name): - return self.storage.delete_voice(voice_name) diff --git a/backend/services/clone_local.py b/backend/services/clone_local.py new file mode 100644 index 0000000..5570278 --- /dev/null +++ b/backend/services/clone_local.py @@ -0,0 +1,54 @@ +import os +from pathlib import Path +import time + +from .clone_service import CloneServiceBase + + +class CloneServiceLocal(CloneServiceBase): + def __init__(self): + super().__init__() + + def clone_voice(self, audio_file, ref_text=None, preferred_name=None, display_name=None): + file_path = Path(audio_file) + if not file_path.exists(): + raise FileNotFoundError(f"音频文件不存在: {audio_file}") + + with open(file_path, "rb") as f: + header = f.read(44) + if len(header) < 44: + raise ValueError("音频文件格式不正确") + sample_rate = int.from_bytes(header[24:28], "little") + channels = int.from_bytes(header[22:24], "little") + bits_per_sample = int.from_bytes(header[34:36], "little") + data_size = int.from_bytes(header[40:44], "little") + if sample_rate == 0 or channels == 0 or bits_per_sample == 0: + raise ValueError("音频文件元数据异常") + duration = data_size / 
(sample_rate * channels * (bits_per_sample // 8)) + if duration < 1.0: + raise ValueError("音频过短,请上传至少1秒的音频") + + audio_bytes = file_path.read_bytes() + save_filename = f"{preferred_name}_cloned.wav" + save_filepath = f"uploads/{save_filename}" + os.makedirs("uploads", exist_ok=True) + with open(save_filepath, "wb") as f: + f.write(audio_bytes) + + print(ref_text) + self.storage.add_voice( + voice_name=preferred_name, + description="录音克隆", + display_name=display_name or preferred_name, + audio_file=save_filename, + ref_text=ref_text, + ) + + return { + "voice_name": preferred_name, + "description": "录音克隆", + "display_name": display_name or preferred_name, + "audio_file": save_filename, + "ref_text": ref_text, + "created_at": time.time() + } diff --git a/backend/services/clone_service.py b/backend/services/clone_service.py new file mode 100644 index 0000000..b655552 --- /dev/null +++ b/backend/services/clone_service.py @@ -0,0 +1,16 @@ +from utils.storage import VoiceStorage + + +class CloneServiceBase: + def __init__(self): + self.storage = VoiceStorage("data/cloned_voices.json") + + def clone_voice(self, audio_file, ref_text=None, preferred_name=None, display_name=None): + raise NotImplementedError + + def list_voices(self): + voices = self.storage.list_voices() + return voices + + def delete_voice(self, voice_name): + return self.storage.delete_voice(voice_name) diff --git a/backend/services/voice_design_service.py b/backend/services/design_aliyun.py similarity index 93% rename from backend/services/voice_design_service.py rename to backend/services/design_aliyun.py index 48adec8..bfb8fcf 100644 --- a/backend/services/voice_design_service.py +++ b/backend/services/design_aliyun.py @@ -1,17 +1,15 @@ import os import base64 import requests -from dotenv import load_dotenv -from utils.storage import VoiceStorage +from .design_service import DesignServiceBase -load_dotenv() -class VoiceDesignService: +class DesignServiceAliyun(DesignServiceBase): def __init__(self): + 
super().__init__() + self.api_key = os.getenv("DASHSCOPE_API_KEY") - self.storage = VoiceStorage("data/voices.json") self.voice_design_url = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization" - self.target_model = "qwen3-tts-vd-realtime-2025-12-16" def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的声音。", preferred_name=None, display_name=None): @@ -90,7 +88,8 @@ def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的 voice_name=voice_name, description=voice_prompt, display_name=display_name or preferred_name or voice_name, - preview_file=preview_filename + preview_file=preview_filename, + ref_text=preview_text, ) return { @@ -113,13 +112,6 @@ def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的 traceback.print_exc() raise Exception(f"请求失败: {e}") - def list_voices(self): - voices = self.storage.list_voices() - return voices - - def delete_voice(self, voice_name): - return self.storage.delete_voice(voice_name) - def optimize_prompt(self, prompt): if not self.api_key: raise ValueError("未找到API Key,请先配置") diff --git a/backend/services/design_local.py b/backend/services/design_local.py new file mode 100644 index 0000000..b8f9cb9 --- /dev/null +++ b/backend/services/design_local.py @@ -0,0 +1,68 @@ +import os +import time +import torch +import soundfile as sf +import threading +from .design_service import DesignServiceBase + + +class DesignServiceLocal(DesignServiceBase): + _design_model = None + _model_lock = threading.Lock() + + def __init__(self): + super().__init__() + self._ensure_models_loaded() + + def _ensure_models_loaded(self): + from qwen_tts import Qwen3TTSModel + + with DesignServiceLocal._model_lock: + if DesignServiceLocal._design_model is None: + print("正在预加载本地 DESIGN 模型(单例模式),请稍候...") + + DesignServiceLocal._design_model = Qwen3TTSModel.from_pretrained( + "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign", + device_map="cuda:0", + dtype=torch.bfloat16, + ) + print("本地 DESIGN 模型预加载完成。") + + @property + def 
design_model(self): + return DesignServiceLocal._design_model + + def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的声音。", preferred_name=None, display_name=None): + wavs, sr = self.design_model.generate_voice_design( + text=preview_text, + language="auto", + instruct=voice_prompt, + ) + + preview_filename = f"{preferred_name}_preview.wav" + preview_file = f"previews/{preview_filename}" + os.makedirs("previews", exist_ok=True) + sf.write(preview_file, wavs[0], sr) + + self.storage.add_voice( + voice_name=preferred_name, + description=voice_prompt, + display_name=display_name or preferred_name, + preview_file=preview_filename, + ref_text=preview_text, + ) + + return { + "voice_name": preferred_name, + "description": voice_prompt, + "display_name": display_name or preferred_name, + "preview_file": preview_filename, + "created_at": time.time() + } + + def list_voices(self): + voices = self.storage.list_voices() + return voices + + def delete_voice(self, voice_name): + return self.storage.delete_voice(voice_name) diff --git a/backend/services/design_service.py b/backend/services/design_service.py new file mode 100644 index 0000000..1f23a64 --- /dev/null +++ b/backend/services/design_service.py @@ -0,0 +1,19 @@ +from utils.storage import VoiceStorage + + +class DesignServiceBase: + def __init__(self): + self.storage = VoiceStorage("data/voices.json") + + def create_custom_voice(self, voice_prompt, preview_text, preferred_name=None, display_name=None): + raise NotImplementedError + + def list_voices(self): + voices = self.storage.list_voices() + return voices + + def delete_voice(self, voice_name): + return self.storage.delete_voice(voice_name) + + def optimize_prompt(self, prompt): + raise NotImplementedError diff --git a/backend/services/tts_local.py b/backend/services/tts_local.py index 57a6b43..af3a8a2 100644 --- a/backend/services/tts_local.py +++ b/backend/services/tts_local.py @@ -46,7 +46,6 @@ def custom_model(self): return 
TTSServiceLocal._custom_model async def connect(self, websocket, message): - print(message) voice_type = message.get("voice_type", "official") voice_name = message.get("voice_name") @@ -63,7 +62,6 @@ async def connect(self, websocket, message): }) async def synthesize(self, websocket, message): - print(message) if websocket not in self.active_connections: await websocket.send_json({"type": "error", "message": "请先连接"}) return @@ -74,14 +72,25 @@ async def synthesize(self, websocket, message): # 同步生成 try: - if voice_type == "design" or voice_type == "clone": + if voice_type == "design": ref_audio = os.path.join(LOCAL_DIR, "../previews", voice_name + "_preview.wav") - print(ref_audio) + x_vector_only_mode = message.get("ref_text", "") == "" wavs, sr = self.base_model.generate_voice_clone( text=message.get("text"), language=message.get("language", "auto"), ref_audio=ref_audio, - x_vector_only_mode=True, + ref_text=message.get("ref_text", ""), + x_vector_only_mode=x_vector_only_mode, + ) + elif voice_type == "clone": + ref_audio = os.path.join(LOCAL_DIR, "../uploads", voice_name + "_cloned.wav") + x_vector_only_mode = message.get("ref_text", "") == "" + wavs, sr = self.base_model.generate_voice_clone( + text=message.get("text"), + language=message.get("language", "auto"), + ref_audio=ref_audio, + ref_text=message.get("ref_text", ""), + x_vector_only_mode=x_vector_only_mode, ) elif voice_type == "official": # 这是一个简化的示例,实际生成参数请根据 qwen_tts 的 API 调整 diff --git a/backend/utils/storage.py b/backend/utils/storage.py index e8cd27d..7e4cb73 100644 --- a/backend/utils/storage.py +++ b/backend/utils/storage.py @@ -6,7 +6,7 @@ class VoiceStorage: def __init__(self, storage_file): self.storage_file = storage_file self.voices = self._load_voices() - + def _load_voices(self): if os.path.exists(self.storage_file): try: @@ -16,7 +16,7 @@ def _load_voices(self): print(f"加载音色文件失败: {e}") return {} return {} - + def _save_voices(self): try: os.makedirs(os.path.dirname(self.storage_file), 
exist_ok=True) @@ -24,18 +24,19 @@ def _save_voices(self): json.dump(self.voices, f, ensure_ascii=False, indent=2) except Exception as e: print(f"保存音色文件失败: {e}") - - def add_voice(self, voice_name, description, display_name=None, preview_file=None, audio_file=None): + + def add_voice(self, voice_name, description, display_name=None, preview_file=None, ref_text=None, audio_file=None): import time self.voices[voice_name] = { "description": description, "display_name": display_name or "", "preview_file": preview_file or "", + "ref_text": ref_text or "", "audio_file": audio_file or "", "created_at": time.strftime("%Y-%m-%d %H:%M:%S") } self._save_voices() - + def list_voices(self): voices_list = [] for voice_name, info in self.voices.items(): @@ -44,11 +45,12 @@ def list_voices(self): "description": info.get('description', ''), "display_name": info.get('display_name', ''), "preview_file": info.get('preview_file', ''), + "ref_text": info.get('ref_text', ''), "audio_file": info.get('audio_file', ''), "created_at": info.get('created_at', '') }) return voices_list - + def delete_voice(self, voice_name): if voice_name in self.voices: del self.voices[voice_name] @@ -60,7 +62,7 @@ class SettingsStorage: def __init__(self): self.settings_file = "data/settings.json" self.settings = self._load_settings() - + def _load_settings(self): if os.path.exists(self.settings_file): try: @@ -70,7 +72,7 @@ def _load_settings(self): print(f"加载设置文件失败: {e}") return {} return {} - + def _save_settings(self): try: os.makedirs(os.path.dirname(self.settings_file), exist_ok=True) @@ -78,17 +80,17 @@ def _save_settings(self): json.dump(self.settings, f, ensure_ascii=False, indent=2) except Exception as e: print(f"保存设置文件失败: {e}") - + def save_api_key(self, api_key): self.settings["api_key"] = api_key self._save_settings() - + def get_api_key(self): return self.settings.get("api_key", "") - + def save_region(self, region): self.settings["region"] = region self._save_settings() - + def 
get_region(self): return self.settings.get("region", "beijing") diff --git a/frontend/package-lock.json b/frontend/package-lock.json index f778ce7..8fe8760 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -1,12 +1,12 @@ { "name": "voice-magic-frontend", - "version": "1.0.0", + "version": "1.2.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "voice-magic-frontend", - "version": "1.0.0", + "version": "1.2.0", "dependencies": { "axios": "^1.6.0", "element-plus": "^2.4.4", @@ -842,7 +842,6 @@ "resolved": "https://registry.npmjs.org/@types/lodash-es/-/lodash-es-4.17.12.tgz", "integrity": "sha512-0NgftHUcV4v34VhXm8QBSftKVXtbkBG3ViCjs6+eJ5a6y6Mi/jiFGPc1sC7QK+9BFhWrURE3EOggmWaSxL9OzQ==", "license": "MIT", - "peer": true, "dependencies": { "@types/lodash": "*" } @@ -1371,15 +1370,13 @@ "version": "4.17.21", "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==", - "license": "MIT", - "peer": true + "license": "MIT" }, "node_modules/lodash-es": { "version": "4.17.22", "resolved": "https://registry.npmjs.org/lodash-es/-/lodash-es-4.17.22.tgz", "integrity": "sha512-XEawp1t0gxSi9x01glktRZ5HDy0HXqrM0x5pXQM98EaI0NxO6jVM7omDOxsuEo5UIASAnm2bRp1Jt/e0a2XU8Q==", - "license": "MIT", - "peer": true + "license": "MIT" }, "node_modules/lodash-unified": { "version": "1.0.3", @@ -1580,7 +1577,6 @@ "integrity": "sha512-o5a9xKjbtuhY6Bi5S3+HvbRERmouabWbyUcpXXUA1u+GNUKoROi9byOJ8M0nHbHYHkYICiMlqxkg1KkYmm25Sw==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "esbuild": "^0.21.3", "postcss": "^8.4.43", @@ -1640,7 +1636,6 @@ "resolved": "https://registry.npmjs.org/vue/-/vue-3.5.26.tgz", "integrity": "sha512-SJ/NTccVyAoNUJmkM9KUqPcYlY+u8OVL1X5EW9RIs3ch5H2uERxyyIUI4MRxVCSOiEcupX9xNGde1tL9ZKpimA==", "license": "MIT", - "peer": true, "dependencies": { "@vue/compiler-dom": "3.5.26", "@vue/compiler-sfc": 
"3.5.26", @@ -1699,4 +1694,4 @@ } } } -} +} \ No newline at end of file diff --git a/frontend/package.json b/frontend/package.json index 6bdf307..6a238ad 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "voice-magic-frontend", - "version": "1.0.0", + "version": "1.2.0", "type": "module", "scripts": { "dev": "vite", @@ -18,4 +18,4 @@ "@vitejs/plugin-vue": "^4.4.0", "vite": "^5.0.0" } -} +} \ No newline at end of file diff --git a/frontend/src/views/VoiceClone.vue b/frontend/src/views/VoiceClone.vue index 0128451..3ecb8df 100644 --- a/frontend/src/views/VoiceClone.vue +++ b/frontend/src/views/VoiceClone.vue @@ -40,6 +40,15 @@ + + + + { router.push('/') } -const showSettings = () => { - settingsVisible.value = true -} - const toSlug = async (s) => { const isAscii = /^[a-zA-Z0-9\-\s]+$/.test(s || '') if (isAscii) { @@ -481,12 +491,14 @@ const cloneVoice = async () => { formData.append('audio_file', wavBlob, 'recorded.wav') formData.append('preferred_name', await toSlug(form.value.display_name || form.value.preferred_name)) formData.append('display_name', form.value.display_name || form.value.preferred_name || '') + formData.append('ref_text', form.value.ref_text || '') try { await cloneVoiceApi(formData) ElMessage.success('声音克隆成功') form.value.preferred_name = '' form.value.display_name = '' + form.value.ref_text = '' recordedBlob.value = null recordedUrl.value = '' } catch (error) { @@ -497,6 +509,14 @@ const cloneVoice = async () => { const selectVoice = (voice) => { selectedVoice.value = voice.voice_name + ref_text.value = voice.ref_text +} + +const handleVoiceChange = (voiceName) => { + const voice = cloneVoices.value.find(v => v.voice_name === voiceName) + if (voice) { + ref_text.value = voice.ref_text + } } const deleteVoice = async (voiceName) => { @@ -505,6 +525,7 @@ const deleteVoice = async (voiceName) => { ElMessage.success('音色删除成功') if (selectedVoice.value === voiceName) { selectedVoice.value = '' + ref_text.value = '' 
} } catch (error) { ElMessage.error('音色删除失败: ' + error.message) @@ -539,7 +560,8 @@ const synthesize = async () => { setTimeout(() => { ws.send(JSON.stringify({ action: 'synthesize', - text: ttsText.value + text: ttsText.value, + ref_text: ref_text.value })) }, 500) } diff --git a/frontend/src/views/VoiceDesign.vue b/frontend/src/views/VoiceDesign.vue index 462e825..a4fd5a5 100644 --- a/frontend/src/views/VoiceDesign.vue +++ b/frontend/src/views/VoiceDesign.vue @@ -22,7 +22,7 @@ :rows="3" placeholder="例如:温柔的女声,音色甜美,语速适中" /> -
+
{ const selectVoice = (voice) => { selectedVoice.value = voice.voice_name + refText.value = voice.ref_text +} + +const handleVoiceChange = (voiceName) => { + const voice = designVoices.value.find(v => v.voice_name === voiceName) + if (voice) { + refText.value = voice.ref_text + } } const deleteVoice = async (voiceName) => { @@ -312,6 +324,7 @@ const deleteVoice = async (voiceName) => { ElMessage.success('音色删除成功') if (selectedVoice.value === voiceName) { selectedVoice.value = '' + refText.value = '' } } catch (error) { ElMessage.error('音色删除失败: ' + error.message) @@ -366,7 +379,8 @@ const synthesize = async () => { setTimeout(() => { ws.send(JSON.stringify({ action: 'synthesize', - text: ttsText.value + text: ttsText.value, + ref_text: refText.value })) }, 500) } From 9487400006c16b1f650776a1025106d0ce04ea4a Mon Sep 17 00:00:00 2001 From: zero Date: Sat, 31 Jan 2026 11:08:44 +0800 Subject: =?UTF-8?q?feat:=20=E5=9C=A8=20VoiceClone=20?= =?UTF-8?q?=E9=A1=B5=E9=9D=A2=E4=B8=AD=E6=A0=B9=E6=8D=AE=E5=85=8B=E9=9A=86?= =?UTF-8?q?=E6=A8=A1=E5=BC=8F=E6=9D=A1=E4=BB=B6=E6=98=BE=E7=A4=BA=E5=BD=95?= =?UTF-8?q?=E9=9F=B3=E6=97=B6=E9=95=BF=E8=BE=93=E5=85=A5=E6=A1=86=EF=BC=8C?= =?UTF-8?q?=E5=B9=B6=E7=A7=BB=E9=99=A4=E6=9C=AA=E4=BD=BF=E7=94=A8=E7=9A=84?= =?UTF-8?q?=20`settingsVisible`=20=E5=8F=98=E9=87=8F=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frontend/.env.example | 2 +- frontend/package-lock.json | 2 +- frontend/src/views/VoiceClone.vue | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/frontend/.env.example b/frontend/.env.example index 3fe962a..89523a9 100644 --- a/frontend/.env.example +++ b/frontend/.env.example @@ -2,7 +2,7 @@ VITE_QWEN3_TTS_ENV="aliyun" # DashScope API Key (环境为 aliyun 时必填) -DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxx +DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxx # Huggingface 镜像站 # HF_ENDPOINT=https://hf-mirror.com diff --git a/frontend/package-lock.json 
b/frontend/package-lock.json index 8fe8760..7c2b184 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -1694,4 +1694,4 @@ } } } -} \ No newline at end of file +} diff --git a/frontend/src/views/VoiceClone.vue b/frontend/src/views/VoiceClone.vue index 3ecb8df..bd0adf6 100644 --- a/frontend/src/views/VoiceClone.vue +++ b/frontend/src/views/VoiceClone.vue @@ -30,7 +30,7 @@ 上传音频克隆 - + voiceStore.cloneVoices) const loading = computed(() => voiceStore.loading) From 5f8a940874cc4136ff22c935fa22d755d2eb46b5 Mon Sep 17 00:00:00 2001 From: zero Date: Sat, 31 Jan 2026 13:42:21 +0800 Subject: [PATCH 05/12] =?UTF-8?q?feat:=20=E7=BB=9F=E4=B8=80=E7=8E=AF?= =?UTF-8?q?=E5=A2=83=E9=85=8D=E7=BD=AE=E5=8F=98=E9=87=8F=E5=B9=B6=E5=BC=95?= =?UTF-8?q?=E5=85=A5=20Docker=20=E5=AE=B9=E5=99=A8=E5=8C=96=E9=83=A8?= =?UTF-8?q?=E7=BD=B2=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env.example | 5 +++ Dockerfile.aliyun | 56 +++++++++++++++++++++++++ Dockerfile.local | 81 +++++++++++++++++++++++++++++++++++++ backend/.env.example | 2 +- backend/api/tts.py | 2 +- backend/api/voice_clone.py | 2 +- backend/api/voice_design.py | 2 +- backend/main.py | 25 ++++++++++-- docker-compose.yml | 19 +++++++++ frontend/.env.example | 6 --- 10 files changed, 187 insertions(+), 13 deletions(-) create mode 100644 .env.example create mode 100644 Dockerfile.aliyun create mode 100644 Dockerfile.local create mode 100644 docker-compose.yml diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..a9847aa --- /dev/null +++ b/.env.example @@ -0,0 +1,5 @@ +# DashScope API Key (环境为 aliyun 时必填) +DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxx + +# Huggingface 镜像站 +# HF_ENDPOINT=https://hf-mirror.com diff --git a/Dockerfile.aliyun b/Dockerfile.aliyun new file mode 100644 index 0000000..2fd9200 --- /dev/null +++ b/Dockerfile.aliyun @@ -0,0 +1,56 @@ +# ========================================== +# 第一阶段:构建前端 +# 
========================================== +FROM node:20-slim AS frontend-builder + +WORKDIR /app/frontend + +# 复制依赖文件并安装 (利用 Docker 缓存) +COPY frontend/package*.json ./ +RUN npm install + +# 复制源码并构建 +COPY frontend/ ./ +RUN npm run build + +# ========================================== +# 第二阶段:最终运行环境 +# ========================================== +FROM python:3.11-slim + +# 设置工作目录 +WORKDIR /app + +# 安装必要的系统依赖 +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# 从官方 uv 镜像中安装 uv +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uv/bin/ +ENV PATH="/uv/bin:${PATH}" + +# 复制后端依赖文件并安装 (使用 uv) +COPY backend/requirements.txt ./ +RUN uv pip install --no-cache -r requirements.txt --system + +# 复制后端代码 +COPY backend/ ./ + +# 确保 previews 目录存在 (虽然 main.py 也会检查,但在这里创建更规范) +RUN mkdir -p previews + +# 从第一阶段复制构建好的前端静态文件到后端 static 目录下 +# 注意:我们在 backend/main.py 中配置了从 static 目录服务静态文件 +COPY --from=frontend-builder /app/frontend/dist /app/static + +# 设置环境变量默认值 (可以在运行时通过 -e 覆盖) +ENV QWEN3_TTS_ENV=aliyun +ENV PORT=8000 +# 如果使用阿里云 DashScope,需要在运行容器时传入 DASHSCOPE_API_KEY + +# 暴露端口 +EXPOSE 8000 + +# 启动命令 +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/Dockerfile.local b/Dockerfile.local new file mode 100644 index 0000000..88c875c --- /dev/null +++ b/Dockerfile.local @@ -0,0 +1,81 @@ +# ========================================== +# 第一阶段:构建前端 +# ========================================== +FROM node:20-slim AS frontend-builder +WORKDIR /app/frontend +COPY frontend/package*.json ./ +RUN npm install +COPY frontend/ ./ +RUN npm run build + +# ========================================== +# 第二阶段:最终运行环境 (支持 GPU) +# ========================================== +# 使用 NVIDIA CUDA 运行时作为基础镜像 +FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04 + +# 设置工作目录 +WORKDIR /app + +# 设置环境变量,避免交互式安装提示 +ENV DEBIAN_FRONTEND=noninteractive + +# 安装 Python 3.11 以及必要的系统依赖 +RUN apt-get update && apt-get install -y --no-install-recommends \ + 
python3.11 \ + python3-pip \ + python3.11-dev \ + curl \ + git \ + ffmpeg \ + libsndfile1 \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +# 将 python3.11 设置为默认 python +RUN ln -sf /usr/bin/python3.11 /usr/bin/python \ + && ln -sf /usr/bin/pip3 /usr/bin/pip + +# 安装 uv +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uv/bin/ +ENV PATH="/uv/bin:${PATH}" + +# 复制后端依赖文件 +COPY backend/requirements.txt ./ + +# 1. 安装基础依赖 +RUN uv pip install --no-cache -r requirements.txt --system + +# 2. 安装本地模型运行所需的深度学习依赖 (针对 CUDA 12.1) +RUN uv pip install --no-cache torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 --system + +# 3. 安装 Qwen3-TTS 核心库 +# 注意:如果 pypi 版本未发布,可能需要从 git 安装,这里先尝试 pypi +RUN uv pip install --no-cache qwen-tts --system + +# 4. 可选:安装 FlashAttention 2 以优化性能 (构建时间较长,如不需要可注释掉) +# RUN uv pip install --no-cache flash-attn --no-build-isolation --system + +# 复制后端代码 +COPY backend/ ./ + +# 确保必要的目录存在 +RUN mkdir -p previews data uploads samples + +# 从第一阶段复制构建好的前端静态文件 +COPY --from=frontend-builder /app/frontend/dist /app/static + +# 设置本地运行相关的环境变量 +ENV QWEN3_TTS_ENV=local +ENV PORT=8000 +# 允许下载模型时的超时设置 +ENV HF_HUB_ENABLE_HF_TRANSFER=0 + +# 设置 Hugging Face 缓存目录,方便外部挂载持久化 +ENV HF_HOME=/app/models_cache + +# 暴露端口 +EXPOSE 8000 + +# 启动命令 +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/backend/.env.example b/backend/.env.example index 3fe962a..b8ac443 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -1,5 +1,5 @@ # Qwen3-TTS 环境,可选 aliyun: 阿里云,local: 本地 -VITE_QWEN3_TTS_ENV="aliyun" +QWEN3_TTS_ENV="aliyun" # DashScope API Key (环境为 aliyun 时必填) DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxx diff --git a/backend/api/tts.py b/backend/api/tts.py index bffbd95..7c2f231 100644 --- a/backend/api/tts.py +++ b/backend/api/tts.py @@ -2,7 +2,7 @@ import os import json -if os.getenv("VITE_QWEN3_TTS_ENV") == "aliyun": +if os.getenv("QWEN3_TTS_ENV") == "aliyun": from services.tts_aliyun import TTSServiceAliyun as TTSService 
else: from services.tts_local import TTSServiceLocal as TTSService diff --git a/backend/api/voice_clone.py b/backend/api/voice_clone.py index 43f46d6..f3503bc 100644 --- a/backend/api/voice_clone.py +++ b/backend/api/voice_clone.py @@ -3,7 +3,7 @@ from pydantic import BaseModel from typing import List, Optional -if os.getenv("VITE_QWEN3_TTS_ENV") == "aliyun": +if os.getenv("QWEN3_TTS_ENV") == "aliyun": from services.clone_aliyun import CloneServiceAliyun as VoiceCloneService else: from services.clone_local import CloneServiceLocal as VoiceCloneService diff --git a/backend/api/voice_design.py b/backend/api/voice_design.py index bc873bb..8746d17 100644 --- a/backend/api/voice_design.py +++ b/backend/api/voice_design.py @@ -3,7 +3,7 @@ from pydantic import BaseModel from typing import List, Optional -if os.getenv("VITE_QWEN3_TTS_ENV") == "aliyun": +if os.getenv("QWEN3_TTS_ENV") == "aliyun": from services.design_aliyun import DesignServiceAliyun as VoiceDesignService else: from services.design_local import DesignServiceLocal as VoiceDesignService diff --git a/backend/main.py b/backend/main.py index 68fe5ad..dd1a0cd 100644 --- a/backend/main.py +++ b/backend/main.py @@ -1,6 +1,7 @@ from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from fastapi.staticfiles import StaticFiles +from fastapi.responses import FileResponse import uvicorn import os from pathlib import Path @@ -56,9 +57,27 @@ async def test_audio(filename: str): else: return {"exists": False, "path": str(file_path)} -@app.get("/") -async def root(): - return {"message": "元视界AI妙妙屋—声音魔法 API", "version": "1.2.0"} +# 挂载前端静态文件 +STATIC_DIR = BASE_DIR / "static" +if STATIC_DIR.exists(): + @app.get("/{full_path:path}") + async def serve_spa(full_path: str): + # 排除 API、WS 和预览路径,让它们由各自的路由处理器处理或返回 404 + if any(full_path.startswith(prefix) for prefix in ["api/", "ws/", "previews/"]): + from fastapi.responses import JSONResponse + return JSONResponse(status_code=404, content={"detail": "Not 
Found"}) + + # 检查是否请求的是具体的静态文件 + file_path = STATIC_DIR / full_path + if full_path != "" and file_path.exists() and file_path.is_file(): + return FileResponse(file_path) + + # 默认返回 index.html 支持 Vue Router History 模式 + return FileResponse(STATIC_DIR / "index.html") +else: + @app.get("/") + async def root(): + return {"message": "元视界AI妙妙屋—声音魔法 API", "version": "1.2.0"} @app.get("/health") async def health(): diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..f9860c7 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,19 @@ +services: + voice-magic: + build: + context: . + dockerfile: Dockerfile.aliyun + image: voice-magic:aliyun + container_name: voice-magic + ports: + - "8000:8000" + env_file: + - .env + volumes: + - voice_magic_data:/app/data + - voice_magic_previews:/app/previews + restart: always + +volumes: + voice_magic_data: + voice_magic_previews: diff --git a/frontend/.env.example b/frontend/.env.example index 89523a9..598087d 100644 --- a/frontend/.env.example +++ b/frontend/.env.example @@ -1,8 +1,2 @@ # Qwen3-TTS 环境,可选 aliyun: 阿里云,local: 本地 VITE_QWEN3_TTS_ENV="aliyun" - -# DashScope API Key (环境为 aliyun 时必填) -DASHSCOPE_API_KEY=sk-30eb93fbb7354fe489e1d06f0623e2af - -# Huggingface 镜像站 -# HF_ENDPOINT=https://hf-mirror.com From 1ba6df61a38741dcdfee498982d94d054380e49c Mon Sep 17 00:00:00 2001 From: yuzhiheng Date: Sat, 31 Jan 2026 16:06:15 +0800 Subject: [PATCH 06/12] =?UTF-8?q?build:=20=E6=B7=BB=E5=8A=A0=20.dockerigno?= =?UTF-8?q?re=20=E6=96=87=E4=BB=B6=EF=BC=8C=E4=BB=A5=E5=9C=A8=20Docker=20?= =?UTF-8?q?=E6=9E=84=E5=BB=BA=E8=BF=87=E7=A8=8B=E4=B8=AD=E6=8E=92=E9=99=A4?= =?UTF-8?q?=20Python=20=E8=99=9A=E6=8B=9F=E7=8E=AF=E5=A2=83=E5=92=8C?= =?UTF-8?q?=E7=BC=93=E5=AD=98=E6=96=87=E4=BB=B6=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .dockerignore | 4 ++++ Dockerfile.local | 31 +++++++++++++++++++++++-------- 2 files changed, 27 insertions(+), 8 deletions(-) create mode 
100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..d80b58a --- /dev/null +++ b/.dockerignore @@ -0,0 +1,4 @@ +backend/.venv +.venv +__pycache__ +*.pyc diff --git a/Dockerfile.local b/Dockerfile.local index 88c875c..90eca84 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -11,8 +11,9 @@ RUN npm run build # ========================================== # 第二阶段:最终运行环境 (支持 GPU) # ========================================== -# 使用 NVIDIA CUDA 运行时作为基础镜像 -FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04 +# 使用 NVIDIA CUDA 开发版作为基础镜像 (支持编译 flash-attn) +# RTX 5090 需要 CUDA 12.8+ 才能完整支持其 Blackwell 架构 (SM 10.0) +FROM nvidia/cuda:12.8.0-devel-ubuntu22.04 # 设置工作目录 WORKDIR /app @@ -28,6 +29,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ curl \ git \ ffmpeg \ + sox \ + libsox-fmt-all \ libsndfile1 \ build-essential \ && rm -rf /var/lib/apt/lists/* @@ -46,14 +49,16 @@ COPY backend/requirements.txt ./ # 1. 安装基础依赖 RUN uv pip install --no-cache -r requirements.txt --system -# 2. 安装本地模型运行所需的深度学习依赖 (针对 CUDA 12.1) -RUN uv pip install --no-cache torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 --system +# 2. 安装本地模型运行所需的深度学习依赖 (针对 CUDA 12.8, 支持 RTX 5090) +RUN uv pip install --no-cache torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 --system # 3. 安装 Qwen3-TTS 核心库 # 注意:如果 pypi 版本未发布,可能需要从 git 安装,这里先尝试 pypi RUN uv pip install --no-cache qwen-tts --system # 4. 
可选:安装 FlashAttention 2 以优化性能 (构建时间较长,如不需要可注释掉) +# 8.0 (A100), 8.6 (RTX 30), 8.9 (RTX 40), 9.0 (H100), 10.0 (RTX 50) +# ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0;10.0" # RUN uv pip install --no-cache flash-attn --no-build-isolation --system # 复制后端代码 @@ -68,12 +73,22 @@ COPY --from=frontend-builder /app/frontend/dist /app/static # 设置本地运行相关的环境变量 ENV QWEN3_TTS_ENV=local ENV PORT=8000 -# 允许下载模型时的超时设置 -ENV HF_HUB_ENABLE_HF_TRANSFER=0 - -# 设置 Hugging Face 缓存目录,方便外部挂载持久化 +# 设置 Hugging Face 缓存目录 ENV HF_HOME=/app/models_cache +# 安装 huggingface_hub 和 hf_transfer 用于加速在构建阶段下载模型 +RUN uv pip install --no-cache huggingface_hub[hf_transfer] --system +ENV HF_HUB_ENABLE_HF_TRANSFER=1 + +# 在构建期间下载模型,避免运行时下载 +RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-Base +RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice +RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign + +# 切换到离线模式,禁止程序在运行时尝试连接 Hugging Face 服务器 +ENV HF_HUB_OFFLINE=1 +ENV TRANSFORMERS_OFFLINE=1 + # 暴露端口 EXPOSE 8000 From 443459e0ca9e478b05ecc857764f7b6e2a296d45 Mon Sep 17 00:00:00 2001 From: yuzhiheng Date: Sat, 31 Jan 2026 17:24:18 +0800 Subject: [PATCH 07/12] =?UTF-8?q?refactor:=20=E6=9B=B4=E6=96=B0Docker=20Co?= =?UTF-8?q?mpose=E4=BB=A5=E4=BD=BF=E7=94=A8=E5=B8=A6=E6=9C=89=E4=B8=8A?= =?UTF-8?q?=E4=BC=A0=E9=87=8F=E5=92=8C=E5=A2=9E=E5=BC=BA=E7=9A=84=E9=A2=84?= =?UTF-8?q?=E6=9E=84=E5=BB=BA=E6=98=A0=E5=83=8F=E3=80=82Dockerignore=20'?= =?UTF-8?q?=E6=9D=A5=E6=8E=92=E9=99=A4=E6=95=B0=E6=8D=AE=E5=92=8C=E5=AA=92?= =?UTF-8?q?=E4=BD=93=E6=96=87=E4=BB=B6=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .dockerignore | 7 ++ Dockerfile.local | 6 +- README.md | 173 ++++++++++++++++++++++++--------------------- docker-compose.yml | 7 +- 4 files changed, 105 insertions(+), 88 deletions(-) diff --git a/.dockerignore b/.dockerignore index d80b58a..78c9159 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,3 +2,10 @@ backend/.venv .venv __pycache__ 
*.pyc + +# Data +backend/data/ +backend/uploads/ +backend/previews/ +**/*.wav +**/*.mp3 diff --git a/Dockerfile.local b/Dockerfile.local index 90eca84..855cc7c 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -65,7 +65,7 @@ RUN uv pip install --no-cache qwen-tts --system COPY backend/ ./ # 确保必要的目录存在 -RUN mkdir -p previews data uploads samples +RUN mkdir -p previews data uploads # 从第一阶段复制构建好的前端静态文件 COPY --from=frontend-builder /app/frontend/dist /app/static @@ -85,10 +85,6 @@ RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-Base RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign -# 切换到离线模式,禁止程序在运行时尝试连接 Hugging Face 服务器 -ENV HF_HUB_OFFLINE=1 -ENV TRANSFORMERS_OFFLINE=1 - # 暴露端口 EXPOSE 8000 diff --git a/README.md b/README.md index f9a7083..d85f58a 100644 --- a/README.md +++ b/README.md @@ -10,118 +10,94 @@ - **智能音色创造**:通过自然语言描述生成个性化音色 - **高质量音色克隆**:录制 10 秒语音即可克隆专属音色 +- **多引擎支持**:支持 **阿里云 DashScope API** 与 **本地 Qwen3-TTS 模型** 双引擎切换 +- **双模式运行**: + - **云端模式**:低资源占用,快速部署,需网络连接。 + - **本地模式**:隐私安全,无网络依赖,性能更强(推荐使用 GPU)。 - **实时音频预览**:立即试听生成的音色效果 -- **流式语音合成**:通过 WebSocket 实现低延迟的语音合成 -- **自动音量增益**:内置音频增益处理,确保生成的语音清晰响亮 -- **响应式设计**:适配各种屏幕尺寸的现代化界面 +- **流式语音合成**:通过 WebSocket 实现极低延迟的语音合成 +- **现代化响应式界面**:美观、易用的 Web 后台管理系统 -## 技术栈 +## 运行模式 -### 后端 +项目支持两种运行模式,通过环境变量 `QWEN3_TTS_ENV` 进行切换: -- **FastAPI 0.100+** - 高性能 Python Web 框架 -- **WebSocket** - 实时通信协议 -- **DashScope SDK** - 千问 TTS API 客户端 -- **Python 3.8+** - 编程语言 +1. **aliyun (默认)**:使用阿里云 DashScope 服务。需要配置 `DASHSCOPE_API_KEY`。适用于大多数用户,无需昂贵的 GPU 资源。 +2. **local**:在本地运行 Qwen3-TTS 模型。需要 NVIDIA GPU(建议 RTX 30 系列及以上)和 CUDA 环境。适用于追求极致响应速度和隐私的用户。 -### 前端 +## 快速开始 -- **Vue 3** - 渐进式 JavaScript 框架 -- **Vite** - 下一代前端构建工具 -- **Pinia** - Vue 3 状态管理 -- **Element Plus** - 基于 Vue 3 的 UI 组件库 -- **WebSocket API** - 浏览器实时通信接口 +### 1. 
环境准备 -## 项目结构 +- **如果是本地模式**:需要安装 NVIDIA Driver, CUDA 12.1+, 和 NVIDIA Container Toolkit (用于 Docker)。 +- **如果是云端模式**:只需基础 Docker 环境或 Python 环境。 +- **通用**:获取 [阿里云 API Key](https://help.aliyun.com/zh/model-studio/get-api-key)(仅云端模式需要)。 -``` -Voice_Magic/ -├── backend/ # 后端项目 -│ ├── main.py # FastAPI 主入口(包含静态文件服务配置) -│ ├── requirements.txt # Python 依赖列表 -│ ├── .env.example # 环境变量示例文件 -│ ├── .env # 环境变量(实际使用时配置) -│ ├── previews/ # 音频预览文件存储目录 -│ ├── api/ # API 路由模块 -│ │ ├── voice_design.py # 音色创造 API 端点 -│ │ ├── voice_clone.py # 音色克隆 API 端点 -│ │ ├── settings.py # 设置 API 端点 -│ │ └── tts.py # TTS WebSocket API 端点 -│ ├── services/ # 业务逻辑层 -│ │ ├── voice_design_service.py # 音色创造业务逻辑 -│ │ ├── voice_clone_service.py # 音色克隆业务逻辑 -│ │ └── tts_service.py # TTS 流式服务逻辑 -│ ├── models/ # 数据模型定义 -│ │ └── schemas.py # Pydantic 模型定义 -│ ├── utils/ # 工具函数 -│ │ └── storage.py # 文件和数据存储工具 -│ └── data/ # 数据存储目录 -│ ├── voices.json # 创造的音色数据 -│ ├── cloned_voices.json # 克隆的音色数据 -│ └── settings.json # 应用设置 -├── frontend/ # 前端项目 -│ ├── src/ -│ │ ├── main.js # Vue 应用入口 -│ │ ├── App.vue # 根组件 -│ │ ├── router/ # Vue Router 配置 -│ │ ├── views/ # 页面组件 -│ │ │ ├── Home.vue # 首页 -│ │ │ ├── VoiceDesign.vue # 音色创造页面 -│ │ │ └── VoiceClone.vue # 音色克隆页面 -│ │ ├── components/ # 可复用组件 -│ │ │ └── SettingsModal.vue # 设置弹窗组件 -│ │ ├── api/ # API 调用封装 -│ │ └── stores/ # Pinia 状态管理 -│ ├── public/ # 静态资源 -│ ├── package.json # npm 依赖 -│ └── vite.config.js # Vite 配置 -└── README.md # 项目说明文档 +### 2. Docker 部署 (推荐) + +这是最简单的运行方式,所有依赖已打包。 + +#### 2.1 云端模式 (Aliyun API) + +```bash +# 1. 复制 .env.example 并更名为 .env,填入 API Key +cp .env.example .env + +# 2. 启动容器 +docker compose up -d ``` -## 快速开始 +*注意:默认镜像标签为 `aliyun`。如果需要手动指定,修改 `docker-compose.yml` 中的 image。* -### 1. 环境准备 +#### 2.2 本地模式 (GPU 加速) -#### 环境准备 +```bash +# 1. 修改 .env 配置文件 +# QWEN3_TTS_ENV=local -- Python 3.8+ (推荐使用 Python 3.10) -- Node.js 16+ (推荐使用 Node.js 18+) -- npm 或 yarn 包管理工具 +# 2. 修改 docker-compose.yml 使用 local 镜像标签 +# image: yuzhiheng/voice-magic:local -### 2. 
本地开发模式 +# 3. 启动并开启 GPU 支持 +docker compose up -d +``` + +### 3. 本地开发模式 + +如果您想在本地直接运行源码: -#### 2.1 后端设置 +#### 3.1 后端设置 ```bash # 进入后端目录 cd backend -# 创建并激活虚拟环境(可选但推荐) -python -m venv venv -# Windows: venv\Scripts\activate -# Linux/Mac: source venv/bin/activate - # 安装依赖 pip install -r requirements.txt +# 如果使用本地模型,还需安装 torch 和 qwen-tts +# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 +# pip install qwen-tts + # 配置环境变量 -# 复制示例文件并修改 cp .env.example .env -# 编辑 .env 文件,填入你的千问 API Key +# 编辑 .env 文件,设置 QWEN3_TTS_ENV 和 DASHSCOPE_API_KEY ``` 环境变量说明: -````env -# 千问 API Key(必填) +```env +# 运行环境: aliyun 或 local +QWEN3_TTS_ENV=aliyun + +# 阿里云 API Key(aliyun 模式下必填) DASHSCOPE_API_KEY=your_api_key_here +``` ```bash # 启动后端服务 python main.py -```` - -后端服务将在 http://localhost:8000 启动 +``` #### 3.2 前端设置 @@ -136,7 +112,46 @@ npm install npm run dev ``` -前端服务将在 http://localhost:3000 启动 +### 4. Docker 镜像构建 + +如果您希望从源码自行构建镜像,可以使用以下命令: + +#### 4.1 构建云端版 (aliyun) +```bash +docker build -t voice-magic:aliyun -f Dockerfile.aliyun . +``` + +#### 4.2 构建本地版 (local) +```bash +docker build -t voice-magic:local -f Dockerfile.local . +``` + +## 技术栈 + +### 后端 +- **FastAPI** / **Uvicorn** - Web 框架与服务器 +- **Qwen3-TTS** - 千问语音模型核心 +- **DashScope SDK** - 阿里云模型服务接入 +- **WebSocket** - 实现流式音频传输 + +### 前端 +- **Vue 3** / **Vite** - 现代前端框架与构建工具 +- **Element Plus** - UI 组件库 +- **Pinia** - 状态管理 + +## 项目结构 + +```text +Voice_Magic/ +├── backend/ # Python 后端代码 +│ ├── api/ # 接口定义 (TTS, 克隆, 创作) +│ ├── services/ # 业务逻辑 (阿里云/本地双引擎实现) +│ └── main.py # 启动入口 +├── frontend/ # Vue 前端代码 +├── Dockerfile.aliyun # 云端模式 Docker 配置文件 +├── Dockerfile.local # 本地模式 Docker 配置文件 +└── docker-compose.yml # Docker 编排配置 +``` ## 功能说明 diff --git a/docker-compose.yml b/docker-compose.yml index f9860c7..70c3ade 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,9 +1,6 @@ services: voice-magic: - build: - context: . 
- dockerfile: Dockerfile.aliyun - image: voice-magic:aliyun + image: yuzhiheng/voice-magic:aliyun # Tag 可选: aliyun, local, local-flash-attn container_name: voice-magic ports: - "8000:8000" @@ -12,8 +9,10 @@ services: volumes: - voice_magic_data:/app/data - voice_magic_previews:/app/previews + - voice_magic_uploads:/app/uploads restart: always volumes: voice_magic_data: voice_magic_previews: + voice_magic_uploads: From dd27bbc6bbcbeedb86e3e3c1c80fd3dcc6f6f899 Mon Sep 17 00:00:00 2001 From: yuzhiheng Date: Sat, 31 Jan 2026 17:43:48 +0800 Subject: [PATCH 08/12] =?UTF-8?q?feat:=20=E4=B8=BA=E5=85=8B=E9=9A=86?= =?UTF-8?q?=E7=9A=84=E5=A3=B0=E9=9F=B3=E6=B7=BB=E5=8A=A0=E9=9F=B3=E9=A2=91?= =?UTF-8?q?=E9=A2=84=E8=A7=88=E5=92=8C=E6=92=AD=E6=94=BE=E5=8A=9F=E8=83=BD?= =?UTF-8?q?=EF=BC=8C=E5=8C=85=E6=8B=ACUI=E6=9B=B4=E6=96=B0=E5=92=8C?= =?UTF-8?q?=E4=BB=A3=E7=90=86=E9=85=8D=E7=BD=AE=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile.local | 23 ++++---- backend/main.py | 8 ++- frontend/src/views/VoiceClone.vue | 90 +++++++++++++++++++++++++++---- frontend/vite.config.js | 4 ++ 4 files changed, 102 insertions(+), 23 deletions(-) diff --git a/Dockerfile.local b/Dockerfile.local index 855cc7c..b45cd6f 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -43,6 +43,18 @@ RUN ln -sf /usr/bin/python3.11 /usr/bin/python \ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uv/bin/ ENV PATH="/uv/bin:${PATH}" +# 设置 Hugging Face 缓存目录 +ENV HF_HOME=/app/models_cache + +# 安装 huggingface_hub 和 hf_transfer 用于加速在构建阶段下载模型 +RUN uv pip install --no-cache huggingface_hub[hf_transfer] --system +ENV HF_HUB_ENABLE_HF_TRANSFER=1 + +# 在构建期间下载模型,避免运行时下载 +RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-Base +RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice +RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign + # 复制后端依赖文件 COPY backend/requirements.txt ./ @@ -73,17 +85,6 @@ COPY --from=frontend-builder 
/app/frontend/dist /app/static # 设置本地运行相关的环境变量 ENV QWEN3_TTS_ENV=local ENV PORT=8000 -# 设置 Hugging Face 缓存目录 -ENV HF_HOME=/app/models_cache - -# 安装 huggingface_hub 和 hf_transfer 用于加速在构建阶段下载模型 -RUN uv pip install --no-cache huggingface_hub[hf_transfer] --system -ENV HF_HUB_ENABLE_HF_TRANSFER=1 - -# 在构建期间下载模型,避免运行时下载 -RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-Base -RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice -RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign # 暴露端口 EXPOSE 8000 diff --git a/backend/main.py b/backend/main.py index dd1a0cd..bef1e2f 100644 --- a/backend/main.py +++ b/backend/main.py @@ -37,12 +37,16 @@ async def lifespan(app: FastAPI): BASE_DIR = Path(__file__).resolve().parent PREVIEWS_DIR = BASE_DIR / "previews" +UPLOADS_DIR = BASE_DIR / "uploads" -# 确保previews文件夹存在,不存在则创建 +# 确保文件夹存在 if not PREVIEWS_DIR.exists(): PREVIEWS_DIR.mkdir(parents=True, exist_ok=True) +if not UPLOADS_DIR.exists(): + UPLOADS_DIR.mkdir(parents=True, exist_ok=True) app.mount("/previews", StaticFiles(directory=str(PREVIEWS_DIR)), name="previews") +app.mount("/uploads", StaticFiles(directory=str(UPLOADS_DIR)), name="uploads") app.include_router(voice_design.router, prefix="/api/voice-design", tags=["音色创造"]) app.include_router(voice_clone.router, prefix="/api/voice-clone", tags=["音色克隆"]) @@ -63,7 +67,7 @@ async def test_audio(filename: str): @app.get("/{full_path:path}") async def serve_spa(full_path: str): # 排除 API、WS 和预览路径,让它们由各自的路由处理器处理或返回 404 - if any(full_path.startswith(prefix) for prefix in ["api/", "ws/", "previews/"]): + if any(full_path.startswith(prefix) for prefix in ["api/", "ws/", "previews/", "uploads/"]): from fastapi.responses import JSONResponse return JSONResponse(status_code=404, content={"detail": "Not Found"}) diff --git a/frontend/src/views/VoiceClone.vue b/frontend/src/views/VoiceClone.vue index bd0adf6..b007f93 100644 --- a/frontend/src/views/VoiceClone.vue +++ b/frontend/src/views/VoiceClone.vue @@ -119,20 
+119,39 @@ :key="voice.voice_name" class="voice-card" :class="{ active: selectedVoice === voice.voice_name }" + :data-voice="voice.voice_name" @click="selectVoice(voice)" >

{{ voice.display_name || voice.voice_name }}

- - - +
+ + + + + + +
+

{{ voice.ref_text || '' }}

创建时间: {{ voice.created_at }}

+
@@ -191,7 +210,7 @@ import { ref, onMounted, computed } from 'vue' import { useRouter } from 'vue-router' import { ElMessage } from 'element-plus' -import { ArrowLeft, Delete, Microphone, VideoPause } from '@element-plus/icons-vue' +import { ArrowLeft, Delete, Microphone, VideoPause, VideoPlay } from '@element-plus/icons-vue' import { useVoiceStore } from '@/stores/voice' import api from '@/api' @@ -221,6 +240,7 @@ const ref_text = ref('') const ttsText = ref('') const audioUrl = ref('') const synthesizing = ref(false) +const voiceAudioRefs = ref({}) const cloneVoices = computed(() => voiceStore.cloneVoices) const loading = computed(() => voiceStore.loading) @@ -511,6 +531,21 @@ const selectVoice = (voice) => { ref_text.value = voice.ref_text } +const getAudioUrl = (audioFile) => { + return `/uploads/${audioFile}` +} + +const playVoiceAudio = (voice) => { + const audio = voiceAudioRefs.value[voice.voice_name] + if (audio) { + audio.play().catch(error => { + ElMessage.error('播放失败: ' + error.message) + }) + } else { + ElMessage.error('找不到音频文件') + } +} + const handleVoiceChange = (voiceName) => { const voice = cloneVoices.value.find(v => v.voice_name === voiceName) if (voice) { @@ -662,10 +697,35 @@ h2 { color: #999; } +.voices-section { + max-height: 500px; + overflow-y: auto; + padding-right: 10px; +} + .voices-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(250px, 1fr)); gap: 15px; + padding-bottom: 10px; +} + +.voices-section::-webkit-scrollbar { + width: 8px; +} + +.voices-section::-webkit-scrollbar-track { + background: rgba(255, 154, 158, 0.2); + border-radius: 4px; +} + +.voices-section::-webkit-scrollbar-thumb { + background: rgba(255, 100, 100, 0.5); + border-radius: 4px; +} + +.voices-section::-webkit-scrollbar-thumb:hover { + background: rgba(255, 100, 100, 0.7); } .voice-card { @@ -699,7 +759,17 @@ h2 { color: #333; } -.voice-desc { +.voice-actions { + display: flex; + gap: 5px; +} + +.preview-audio { + width: 100%; + margin-top: 
10px; +} + +.voice-ref-text { font-size: 14px; color: #666; margin-bottom: 5px; diff --git a/frontend/vite.config.js b/frontend/vite.config.js index 71dcadb..22494a4 100644 --- a/frontend/vite.config.js +++ b/frontend/vite.config.js @@ -25,6 +25,10 @@ export default defineConfig({ target: "http://localhost:8000", changeOrigin: true, }, + "/uploads": { + target: "http://localhost:8000", + changeOrigin: true, + }, }, }, }); From 9e096d021354a9b4932c3114483a0cb4401133d6 Mon Sep 17 00:00:00 2001 From: yuzhiheng Date: Sun, 1 Feb 2026 08:39:05 +0800 Subject: [PATCH 09/12] =?UTF-8?q?build:=20=E6=9B=B4=E6=96=B0=E6=9C=AC?= =?UTF-8?q?=E5=9C=B0Dockerfile=E4=BB=A5=E5=8F=8D=E6=98=A0=E5=BC=80?= =?UTF-8?q?=E5=8F=91=E7=8E=AF=E5=A2=83=E8=AE=BE=E7=BD=AE=E7=9A=84=E5=8F=98?= =?UTF-8?q?=E5=8C=96=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile.local | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Dockerfile.local b/Dockerfile.local index b45cd6f..f337059 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -43,18 +43,6 @@ RUN ln -sf /usr/bin/python3.11 /usr/bin/python \ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uv/bin/ ENV PATH="/uv/bin:${PATH}" -# 设置 Hugging Face 缓存目录 -ENV HF_HOME=/app/models_cache - -# 安装 huggingface_hub 和 hf_transfer 用于加速在构建阶段下载模型 -RUN uv pip install --no-cache huggingface_hub[hf_transfer] --system -ENV HF_HUB_ENABLE_HF_TRANSFER=1 - -# 在构建期间下载模型,避免运行时下载 -RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-Base -RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice -RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign - # 复制后端依赖文件 COPY backend/requirements.txt ./ @@ -82,6 +70,18 @@ RUN mkdir -p previews data uploads # 从第一阶段复制构建好的前端静态文件 COPY --from=frontend-builder /app/frontend/dist /app/static +# 设置 Hugging Face 缓存目录 +ENV HF_HOME=/app/models_cache + +# 安装 huggingface_hub 和 hf_transfer 用于加速在构建阶段下载模型 +RUN uv pip install 
--no-cache huggingface_hub[hf_transfer] --system +ENV HF_HUB_ENABLE_HF_TRANSFER=1 + +# 在构建期间下载模型,避免运行时下载 +RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-Base +RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice +RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign + # 设置本地运行相关的环境变量 ENV QWEN3_TTS_ENV=local ENV PORT=8000 From 9942447d3a656123f70efbe3684bcf10c1339c02 Mon Sep 17 00:00:00 2001 From: zero Date: Sun, 1 Feb 2026 17:11:55 +0800 Subject: [PATCH 10/12] =?UTF-8?q?chore:=20=E6=9B=B4=E6=96=B0=20Aliyun=20Do?= =?UTF-8?q?ckerfile=20=E9=85=8D=E7=BD=AE=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile.aliyun | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile.aliyun b/Dockerfile.aliyun index 2fd9200..0c8cdf5 100644 --- a/Dockerfile.aliyun +++ b/Dockerfile.aliyun @@ -11,6 +11,7 @@ RUN npm install # 复制源码并构建 COPY frontend/ ./ +RUN cp .env.example .env RUN npm run build # ========================================== From 40f604ccf0678985d96fe95111d1c16e3eceb249 Mon Sep 17 00:00:00 2001 From: yuzhiheng Date: Mon, 2 Feb 2026 10:29:20 +0800 Subject: [PATCH 11/12] =?UTF-8?q?feat:=20=E6=9B=B4=E6=94=B9=20pytorch=20?= =?UTF-8?q?=E7=89=88=E6=9C=AC=E4=B8=BA=202.8.0=EF=BC=8C=E6=94=AF=E6=8C=81?= =?UTF-8?q?=20flash-attn?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile.local | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile.local b/Dockerfile.local index f337059..876db2e 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -50,7 +50,9 @@ COPY backend/requirements.txt ./ RUN uv pip install --no-cache -r requirements.txt --system # 2. 
安装本地模型运行所需的深度学习依赖 (针对 CUDA 12.8, 支持 RTX 5090) -RUN uv pip install --no-cache torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 --system +RUN uv pip install --no-cache \ + torch==2.8.0+cu128 torchvision==0.23.0+cu128 torchaudio==2.8.0+cu128 \ + --index-url https://download.pytorch.org/whl/cu128 --system # 3. 安装 Qwen3-TTS 核心库 # 注意:如果 pypi 版本未发布,可能需要从 git 安装,这里先尝试 pypi From c7a95721b38eae631a1eb5acae1c96bf3b6f030f Mon Sep 17 00:00:00 2001 From: zero Date: Thu, 5 Feb 2026 09:12:57 +0800 Subject: [PATCH 12/12] =?UTF-8?q?refactor:=20=E5=9C=A8=20Dockerfile.local?= =?UTF-8?q?=20=E4=B8=AD=E6=B7=BB=E5=8A=A0=20uv=20=E8=B6=85=E6=97=B6?= =?UTF-8?q?=E7=8E=AF=E5=A2=83=E5=8F=98=E9=87=8F=EF=BC=8C=E9=98=B2=E6=AD=A2?= =?UTF-8?q?=20uv=20=E5=AE=89=E8=A3=85=E8=B6=85=E6=97=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile.local | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile.local b/Dockerfile.local index 876db2e..acc4a33 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -42,6 +42,7 @@ RUN ln -sf /usr/bin/python3.11 /usr/bin/python \ # 安装 uv COPY --from=ghcr.io/astral-sh/uv:latest /uv /uv/bin/ ENV PATH="/uv/bin:${PATH}" +ENV UV_HTTP_TIMEOUT=600 # 复制后端依赖文件 COPY backend/requirements.txt ./