From 9e87a029e61c9d5beebad6b4e4f0ddebb883eee7 Mon Sep 17 00:00:00 2001 From: yuzhiheng Date: Thu, 29 Jan 2026 17:46:13 +0800 Subject: [PATCH 01/12] =?UTF-8?q?feat:=20=E6=96=B0=E5=A2=9E=E9=9F=B3?= =?UTF-8?q?=E8=89=B2=E8=AE=BE=E8=AE=A1=E5=8A=9F=E8=83=BD=EF=BC=8C=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E9=9F=B3=E8=89=B2=E5=88=9B=E5=BB=BA=E3=80=81=E7=AE=A1?= =?UTF-8?q?=E7=90=86=E4=B8=8E=E8=AF=AD=E9=9F=B3=E5=90=88=E6=88=90=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/.env.example | 6 +- backend/services/tts_service.py | 73 ++-- backend/services/voice_clone_service.py | 36 +- backend/services/voice_design_service.py | 53 ++- frontend/.env.example | 9 + frontend/package-lock.json | 9 +- frontend/src/router/index.js | 6 + frontend/src/views/Home.vue | 25 +- frontend/src/views/OfficialVoice.vue | 470 +++++++++++++++++++++++ frontend/src/views/VoiceDesign.vue | 51 ++- 10 files changed, 624 insertions(+), 114 deletions(-) create mode 100644 frontend/.env.example create mode 100644 frontend/src/views/OfficialVoice.vue diff --git a/backend/.env.example b/backend/.env.example index f1a4e8e..388b903 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -1,7 +1,9 @@ -# DashScope API Key (必填) +# Qwen3-TTS 环境,可选 aliyun: 阿里云,local: 本地 +VITE_QWEN3_TTS_ENV="aliyun" + +# DashScope API Key (环境为 aliyun 时必填) DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxx # 配置说明: # 1. 请将此文件复制为 .env 并填写真实的 API Key # 2. 
更多配置选项可在代码中通过前端界面设置 - diff --git a/backend/services/tts_service.py b/backend/services/tts_service.py index de84112..be3f56e 100644 --- a/backend/services/tts_service.py +++ b/backend/services/tts_service.py @@ -1,11 +1,9 @@ import os -import json import base64 -import asyncio import threading import queue from dotenv import load_dotenv - + load_dotenv() @@ -20,31 +18,33 @@ class TTSService: def __init__(self): self.active_connections = {} self.active_tts = {} - + async def connect(self, websocket, message): voice_type = message.get("voice_type", "design") voice_name = message.get("voice_name") websocket_url = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime" - + if voice_type == "design" and voice_name: model = "qwen3-tts-vd-realtime-2025-12-16" elif voice_type == "clone" and voice_name: model = "qwen3-tts-vc-realtime-2025-11-27" + elif voice_type == "official": + model = "qwen3-tts-flash-realtime-2025-11-27" else: model = "qwen3-tts-flash" - + self.active_connections[websocket] = { "model": model, "voice_name": voice_name, "websocket_url": websocket_url, "event_queue": queue.Queue() } - + await websocket.send_json({ "type": "connected", "message": "WebSocket连接成功" }) - + async def synthesize(self, websocket, message): if websocket not in self.active_connections: await websocket.send_json({ @@ -52,17 +52,17 @@ async def synthesize(self, websocket, message): "message": "请先连接" }) return - + config = self.active_connections[websocket] text = message.get("text") - + if not text: await websocket.send_json({ "type": "error", "message": "请输入文本" }) return - + api_key = os.getenv("DASHSCOPE_API_KEY") if not api_key: await websocket.send_json({ @@ -70,23 +70,26 @@ async def synthesize(self, websocket, message): "message": "未配置API Key" }) return - + dashscope.api_key = api_key - + try: event_queue = config["event_queue"] - + class WebSocketCallback(QwenTtsRealtimeCallback): def __init__(self, ws, queue): self.ws = ws self.queue = queue - + def on_open(self): pass - + def 
on_close(self, close_status_code, close_msg): - pass - + self.queue.put({ + "type": "error", + "message": f"连接异常关闭 ({close_status_code}): {close_msg}" + }) + def on_event(self, response): try: event_type = response.get('type', '') @@ -100,21 +103,29 @@ def on_event(self, response): self.queue.put({"type": "done"}) elif event_type == 'session.finished': self.queue.put({"type": "finished"}) + elif event_type == 'error': + self.queue.put({ + "type": "error", + "message": response.get('error').get('message') + }) except Exception as e: - print(f"回调事件处理异常: {e}") - + self.queue.put({"type": "error", "message": str(e)}) + + def on_error(self, message): + self.queue.put({"type": "error", "message": message}) + def run_tts(): try: callback = WebSocketCallback(websocket, event_queue) - + qwen_tts_realtime = QwenTtsRealtime( model=config["model"], callback=callback, url=config["websocket_url"] ) - + qwen_tts_realtime.connect() - + if config["voice_name"]: qwen_tts_realtime.update_session( voice=config["voice_name"], @@ -126,37 +137,37 @@ def run_tts(): response_format=AudioFormat.PCM_24000HZ_MONO_16BIT, mode='server_commit' ) - + qwen_tts_realtime.append_text(text) qwen_tts_realtime.finish() except Exception as e: event_queue.put({"type": "error", "message": str(e)}) - + thread = threading.Thread(target=run_tts) thread.start() - + await websocket.send_json({ "type": "started" }) - + while True: try: event = event_queue.get(timeout=60) await websocket.send_json(event) - + if event.get("type") in ["finished", "error"]: break except queue.Empty: break - + thread.join(timeout=5) - + except Exception as e: await websocket.send_json({ "type": "error", "message": str(e) }) - + async def close(self, websocket): if websocket in self.active_connections: del self.active_connections[websocket] diff --git a/backend/services/voice_clone_service.py b/backend/services/voice_clone_service.py index 659c657..2959c88 100644 --- a/backend/services/voice_clone_service.py +++ 
b/backend/services/voice_clone_service.py @@ -13,17 +13,17 @@ def __init__(self): self.api_key = os.getenv("DASHSCOPE_API_KEY") self.storage = VoiceStorage("data/cloned_voices.json") self.customization_url = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization" - + self.target_model = "qwen3-tts-vc-realtime-2025-11-27" - + def clone_voice(self, audio_file, preferred_name=None, display_name=None): if not self.api_key: raise ValueError("未找到API Key,请先配置") - + file_path = Path(audio_file) if not file_path.exists(): raise FileNotFoundError(f"音频文件不存在: {audio_file}") - + with open(file_path, "rb") as f: header = f.read(44) if len(header) < 44: @@ -37,16 +37,16 @@ def clone_voice(self, audio_file, preferred_name=None, display_name=None): duration = data_size / (sample_rate * channels * (bits_per_sample // 8)) if duration < 1.0: raise ValueError("音频过短,请上传至少1秒的音频") - + audio_data = file_path.read_bytes() base64_str = base64.b64encode(audio_data).decode() data_uri = f"data:audio/wav;base64,{base64_str}" - + headers = { "Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json" } - + payload = { "model": "qwen-voice-enrollment", "input": { @@ -56,7 +56,7 @@ def clone_voice(self, audio_file, preferred_name=None, display_name=None): "audio": {"data": data_uri, "format": "wav"} } } - + try: response = requests.post( self.customization_url, @@ -66,31 +66,31 @@ def clone_voice(self, audio_file, preferred_name=None, display_name=None): ) print(f"克隆接口状态码: {response.status_code}") print(f"克隆接口返回: {response.text[:500]}") - + if response.status_code != 200: try: error_data = response.json() error_code = error_data.get("code", "") error_msg = error_data.get("message", "") - + if "Audio.PreprocessError" in error_code or "No segments meet" in error_msg: raise ValueError("音频有效时长不足,请确保录音时长超过5秒且声音清晰(去除静音后需满足时长要求)") - + raise ValueError(f"克隆失败: {error_msg}") except json.JSONDecodeError: response.raise_for_status() - + result = response.json() - + 
voice_name = result["output"]["voice"] - + self.storage.add_voice( voice_name=voice_name, description="录音克隆", display_name=display_name or preferred_name or voice_name, audio_file=audio_file ) - + return { "voice_name": voice_name, "description": "录音克隆", @@ -98,17 +98,17 @@ def clone_voice(self, audio_file, preferred_name=None, display_name=None): "audio_file": audio_file, "created_at": result.get("created_at", "") } - + except ValueError as e: raise e except requests.exceptions.RequestException as e: raise Exception(f"网络请求失败: {e}") except Exception as e: raise Exception(f"发生错误: {e}") - + def list_voices(self): voices = self.storage.list_voices() return voices - + def delete_voice(self, voice_name): return self.storage.delete_voice(voice_name) diff --git a/backend/services/voice_design_service.py b/backend/services/voice_design_service.py index e20e7ee..48adec8 100644 --- a/backend/services/voice_design_service.py +++ b/backend/services/voice_design_service.py @@ -1,5 +1,4 @@ import os -import json import base64 import requests from dotenv import load_dotenv @@ -12,18 +11,18 @@ def __init__(self): self.api_key = os.getenv("DASHSCOPE_API_KEY") self.storage = VoiceStorage("data/voices.json") self.voice_design_url = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization" - + self.target_model = "qwen3-tts-vd-realtime-2025-12-16" - + def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的声音。", preferred_name=None, display_name=None): if not self.api_key: raise ValueError("未找到API Key,请先配置") - + headers = { "Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json" } - + data = { "model": "qwen-voice-design", "input": { @@ -37,24 +36,24 @@ def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的 "response_format": "wav" } } - + if preview_text: data["input"]["preview_text"] = preview_text - + if preferred_name: data["input"]["preferred_name"] = preferred_name - + try: response = requests.post(self.voice_design_url, 
headers=headers, json=data, timeout=60) - + response.raise_for_status() result = response.json() - + if result.get("output"): voice_name = result["output"].get("voice") base64_audio = result["output"]["preview_audio"]["data"] audio_bytes = base64.b64decode(base64_audio) - + # 增加音频增益(放大音量) try: import struct @@ -62,11 +61,11 @@ def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的 if len(audio_bytes) > 44: header = audio_bytes[:44] pcm_data = audio_bytes[44:] - + # 16-bit PCM, Little Endian count = len(pcm_data) // 2 samples = struct.unpack(f"<{count}h", pcm_data) - + gain = 5.0 # 与前端保持一致的增益 new_samples = [] for s in samples: @@ -74,26 +73,26 @@ def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的 if v > 32767: v = 32767 if v < -32768: v = -32768 new_samples.append(v) - + new_pcm_data = struct.pack(f"<{count}h", *new_samples) audio_bytes = header + new_pcm_data print("预览音频增益处理成功") except Exception as e: print(f"预览音频增益处理失败: {e}") - + preview_filename = f"{voice_name}_preview.wav" preview_file = f"previews/{preview_filename}" os.makedirs("previews", exist_ok=True) with open(preview_file, "wb") as f: f.write(audio_bytes) - + self.storage.add_voice( voice_name=voice_name, description=voice_prompt, display_name=display_name or preferred_name or voice_name, preview_file=preview_filename ) - + return { "voice_name": voice_name, "description": voice_prompt, @@ -103,7 +102,7 @@ def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的 } else: raise Exception(f"创建失败: {result}") - + except requests.exceptions.RequestException as e: print(f"网络请求错误: {e}") print(f"响应内容: {response.text if 'response' in locals() else 'N/A'}") @@ -113,28 +112,28 @@ def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的 import traceback traceback.print_exc() raise Exception(f"请求失败: {e}") - + def list_voices(self): voices = self.storage.list_voices() return voices - + def delete_voice(self, voice_name): return self.storage.delete_voice(voice_name) - + def 
optimize_prompt(self, prompt): if not self.api_key: raise ValueError("未找到API Key,请先配置") - + from dashscope import Generation import dashscope - + dashscope.base_http_api_url = 'https://dashscope.aliyuncs.com/api/v1' - + messages = [ {"role": "system", "content": "你是一个专业的音色设计助理,负责将用户简洁的音色描述优化为详细、专业的音色设计提示词。优化后的提示词应该包含模仿对象,说清楚年轻范围,性别特征(比如22岁女性,32岁男主播等),音色特质(如甜美、低沉、磁性等)、情感倾向、语音特点(如语速、语调等)等方面的详细描述,以便生成更符合的AI音色。输出要求:仅输出音色描述文本,无需包含其他解释内容"}, {"role": "user", "content": prompt}, ] - + try: response = Generation.call( api_key=self.api_key, @@ -143,12 +142,12 @@ def optimize_prompt(self, prompt): result_format="message", enable_thinking=False, ) - + if response.status_code == 200: return response.output.choices[0].message.content else: raise Exception(f"优化失败: HTTP {response.status_code}, {response.message}") - + except Exception as e: print(f"优化提示词错误: {e}") raise Exception(f"优化提示词失败: {e}") diff --git a/frontend/.env.example b/frontend/.env.example new file mode 100644 index 0000000..388b903 --- /dev/null +++ b/frontend/.env.example @@ -0,0 +1,9 @@ +# Qwen3-TTS 环境,可选 aliyun: 阿里云,local: 本地 +VITE_QWEN3_TTS_ENV="aliyun" + +# DashScope API Key (环境为 aliyun 时必填) +DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxx + +# 配置说明: +# 1. 请将此文件复制为 .env 并填写真实的 API Key +# 2. 
更多配置选项可在代码中通过前端界面设置 diff --git a/frontend/package-lock.json b/frontend/package-lock.json index e55d47a..f778ce7 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -842,6 +842,7 @@ "resolved": "https://registry.npmjs.org/@types/lodash-es/-/lodash-es-4.17.12.tgz", "integrity": "sha512-0NgftHUcV4v34VhXm8QBSftKVXtbkBG3ViCjs6+eJ5a6y6Mi/jiFGPc1sC7QK+9BFhWrURE3EOggmWaSxL9OzQ==", "license": "MIT", + "peer": true, "dependencies": { "@types/lodash": "*" } @@ -1370,13 +1371,15 @@ "version": "4.17.21", "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==", - "license": "MIT" + "license": "MIT", + "peer": true }, "node_modules/lodash-es": { "version": "4.17.22", "resolved": "https://registry.npmjs.org/lodash-es/-/lodash-es-4.17.22.tgz", "integrity": "sha512-XEawp1t0gxSi9x01glktRZ5HDy0HXqrM0x5pXQM98EaI0NxO6jVM7omDOxsuEo5UIASAnm2bRp1Jt/e0a2XU8Q==", - "license": "MIT" + "license": "MIT", + "peer": true }, "node_modules/lodash-unified": { "version": "1.0.3", @@ -1577,6 +1580,7 @@ "integrity": "sha512-o5a9xKjbtuhY6Bi5S3+HvbRERmouabWbyUcpXXUA1u+GNUKoROi9byOJ8M0nHbHYHkYICiMlqxkg1KkYmm25Sw==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "esbuild": "^0.21.3", "postcss": "^8.4.43", @@ -1636,6 +1640,7 @@ "resolved": "https://registry.npmjs.org/vue/-/vue-3.5.26.tgz", "integrity": "sha512-SJ/NTccVyAoNUJmkM9KUqPcYlY+u8OVL1X5EW9RIs3ch5H2uERxyyIUI4MRxVCSOiEcupX9xNGde1tL9ZKpimA==", "license": "MIT", + "peer": true, "dependencies": { "@vue/compiler-dom": "3.5.26", "@vue/compiler-sfc": "3.5.26", diff --git a/frontend/src/router/index.js b/frontend/src/router/index.js index dee32c9..70cfa70 100644 --- a/frontend/src/router/index.js +++ b/frontend/src/router/index.js @@ -2,6 +2,7 @@ import { createRouter, createWebHistory } from 'vue-router' import Home from '@/views/Home.vue' import VoiceDesign from '@/views/VoiceDesign.vue' 
import VoiceClone from '@/views/VoiceClone.vue' +import OfficialVoice from '@/views/OfficialVoice.vue' const routes = [ { @@ -18,6 +19,11 @@ const routes = [ path: '/voice-clone', name: 'VoiceClone', component: VoiceClone + }, + { + path: '/official-voice', + name: 'OfficialVoice', + component: OfficialVoice } ] diff --git a/frontend/src/views/Home.vue b/frontend/src/views/Home.vue index bed29b6..073718c 100644 --- a/frontend/src/views/Home.vue +++ b/frontend/src/views/Home.vue @@ -4,7 +4,7 @@

元视界AI妙妙屋

魔法语音

- +
🎨
@@ -12,26 +12,31 @@

通过文字描述创造个性化音色

- +
🎤

音色克隆

录制声音并克隆为专属音色

+ +
+
🎙️
+

官方音色

+

使用官方预置的专业音色

+
+
- - + diff --git a/frontend/src/views/VoiceDesign.vue b/frontend/src/views/VoiceDesign.vue index c037bf8..6fa6879 100644 --- a/frontend/src/views/VoiceDesign.vue +++ b/frontend/src/views/VoiceDesign.vue @@ -8,9 +8,9 @@

元视界AI妙妙屋—魔法语音

音色创造
- + - +

创建新音色

@@ -33,21 +33,21 @@
- + - + - +
- +

已创建的音色

@@ -108,7 +108,7 @@
- +

语音合成

@@ -126,7 +126,7 @@ /> - + - + - +
- - + @@ -266,14 +265,14 @@ const optimizePrompt = async () => { ElMessage.warning('请先输入音色描述') return } - + optimizing.value = true - + try { const response = await api.post('/voice-design/optimize-prompt', { prompt: form.value.voice_prompt }) - + form.value.voice_prompt = response.optimized_prompt ElMessage.success('提示词优化成功') } catch (error) { @@ -288,7 +287,7 @@ const createVoice = async () => { ElMessage.warning('请输入音色描述') return } - + try { const payload = { voice_prompt: form.value.voice_prompt, @@ -347,26 +346,26 @@ const synthesize = async () => { ElMessage.warning('请选择音色') return } - + if (!ttsText.value) { ElMessage.warning('请输入文本') return } - + synthesizing.value = true audioUrl.value = '' - + try { const wsUrl = `${location.protocol === 'https:' ? 'wss' : 'ws'}://${location.host}/ws/tts/streaming` const ws = new WebSocket(wsUrl) - + ws.onopen = () => { ws.send(JSON.stringify({ action: 'connect', voice_type: 'design', voice_name: selectedVoice.value })) - + setTimeout(() => { ws.send(JSON.stringify({ action: 'synthesize', @@ -374,12 +373,12 @@ const synthesize = async () => { })) }, 500) } - + let audioChunks = [] - + ws.onmessage = (event) => { const data = JSON.parse(event.data) - + if (data.type === 'audio') { audioChunks.push(data.data) } else if (data.type === 'finished') { @@ -392,12 +391,12 @@ const synthesize = async () => { ws.close() } } - + ws.onerror = () => { ElMessage.error('WebSocket连接失败') synthesizing.value = false } - + } catch (error) { ElMessage.error('语音合成失败: ' + error.message) synthesizing.value = false From 39385f9b843d2404d55ad0c2c084a7ebc00d4425 Mon Sep 17 00:00:00 2001 From: yuzhiheng Date: Fri, 30 Jan 2026 14:44:58 +0800 Subject: [PATCH 02/12] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=AE=9E?= =?UTF-8?q?=E6=97=B6=E8=AF=AD=E9=9F=B3=E5=90=88=E6=88=90=20(TTS)=20?= =?UTF-8?q?=E5=8A=9F=E8=83=BD=EF=BC=8C=E6=94=AF=E6=8C=81=E9=98=BF=E9=87=8C?= =?UTF-8?q?=E4=BA=91=E5=92=8C=E6=9C=AC=E5=9C=B0=E5=8D=83=E9=97=AE3?= 
=?UTF-8?q?=E6=A8=A1=E5=9E=8B=EF=BC=8C=E5=B9=B6=E5=8C=85=E5=90=AB=E9=9F=B3?= =?UTF-8?q?=E8=89=B2=E5=88=9B=E9=80=A0=E3=80=81=E5=85=8B=E9=9A=86=E5=8F=8A?= =?UTF-8?q?=E5=AE=98=E6=96=B9=E9=9F=B3=E8=89=B2=E8=A7=86=E5=9B=BE=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/.env.example | 5 +- backend/api/tts.py | 30 ++++- backend/main.py | 15 ++- backend/services/tts_aliyun.py | 171 +++++++++++++++++++++++++++ backend/services/tts_local.py | 135 +++++++++++++++++++++ backend/services/tts_service.py | 168 +------------------------- frontend/.env.example | 5 +- frontend/src/views/OfficialVoice.vue | 34 ++++-- frontend/src/views/VoiceClone.vue | 67 ++++++----- frontend/src/views/VoiceDesign.vue | 3 - 10 files changed, 408 insertions(+), 225 deletions(-) create mode 100644 backend/services/tts_aliyun.py create mode 100644 backend/services/tts_local.py diff --git a/backend/.env.example b/backend/.env.example index 388b903..3fe962a 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -4,6 +4,5 @@ VITE_QWEN3_TTS_ENV="aliyun" # DashScope API Key (环境为 aliyun 时必填) DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxx -# 配置说明: -# 1. 请将此文件复制为 .env 并填写真实的 API Key -# 2. 
更多配置选项可在代码中通过前端界面设置 +# Huggingface 镜像站 +# HF_ENDPOINT=https://hf-mirror.com diff --git a/backend/api/tts.py b/backend/api/tts.py index fae5405..bffbd95 100644 --- a/backend/api/tts.py +++ b/backend/api/tts.py @@ -1,21 +1,39 @@ from fastapi import APIRouter, WebSocket, WebSocketDisconnect -from services.tts_service import TTSService +import os import json +if os.getenv("VITE_QWEN3_TTS_ENV") == "aliyun": + from services.tts_aliyun import TTSServiceAliyun as TTSService +else: + from services.tts_local import TTSServiceLocal as TTSService + + router = APIRouter() -tts_service = TTSService() +tts_service = None + +def init_tts_service(): + global tts_service + if tts_service is None: + tts_service = TTSService() + return tts_service @router.websocket("/tts/streaming") async def websocket_tts(websocket: WebSocket): await websocket.accept() - + + if tts_service is None: + # Fallback if lifespan didn't run for some reason, + # though lifespan is the preferred way + init_tts_service() + + try: while True: data = await websocket.receive_text() message = json.loads(data) - + action = message.get("action") - + if action == "connect": await tts_service.connect(websocket, message) elif action == "synthesize": @@ -23,7 +41,7 @@ async def websocket_tts(websocket: WebSocket): elif action == "close": await tts_service.close(websocket) break - + except WebSocketDisconnect: print("WebSocket disconnected") except Exception as e: diff --git a/backend/main.py b/backend/main.py index f2a083b..d62035d 100644 --- a/backend/main.py +++ b/backend/main.py @@ -4,13 +4,24 @@ import uvicorn import os from pathlib import Path - +from dotenv import load_dotenv from api import voice_design, voice_clone, tts, utils +from contextlib import asynccontextmanager + + +load_dotenv() + +@asynccontextmanager +async def lifespan(app: FastAPI): + # 在应用启动时初始化 TTS 服务(仅在 worker 进程中运行) + tts.init_tts_service() + yield app = FastAPI( title="元视界AI妙妙屋—声音魔法 API", description="基于千问3 TTS 的音色创造和音色克隆服务", - version="1.0.0" 
+ version="1.0.0", + lifespan=lifespan ) app.add_middleware( diff --git a/backend/services/tts_aliyun.py b/backend/services/tts_aliyun.py new file mode 100644 index 0000000..292bdb4 --- /dev/null +++ b/backend/services/tts_aliyun.py @@ -0,0 +1,171 @@ +import os +import base64 +import threading +import queue + +try: + import dashscope + from dashscope.audio.qwen_tts_realtime import QwenTtsRealtime, QwenTtsRealtimeCallback, AudioFormat + STREAMING_AVAILABLE = True +except ImportError: + STREAMING_AVAILABLE = False + +from .tts_service import TTSServiceBase + + +class TTSServiceAliyun(TTSServiceBase): + def __init__(self): + super().__init__() + + async def connect(self, websocket, message): + voice_type = message.get("voice_type", "official") + voice_name = message.get("voice_name") + websocket_url = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime" + + if voice_type == "design": + model = "qwen3-tts-vd-realtime-2025-12-16" + elif voice_type == "clone": + model = "qwen3-tts-vc-realtime-2025-11-27" + elif voice_type == "official": + model = "qwen3-tts-flash-realtime-2025-11-27" + else: + model = "qwen3-tts-flash" + + self.active_connections[websocket] = { + "model": model, + "voice_name": voice_name, + "websocket_url": websocket_url, + "event_queue": queue.Queue() + } + + await websocket.send_json({ + "type": "connected", + "message": "WebSocket连接成功" + }) + + async def synthesize(self, websocket, message): + if websocket not in self.active_connections: + await websocket.send_json({ + "type": "error", + "message": "请先连接" + }) + return + + config = self.active_connections[websocket] + text = message.get("text") + + if not text: + await websocket.send_json({ + "type": "error", + "message": "请输入文本" + }) + return + + api_key = os.getenv("DASHSCOPE_API_KEY") + if not api_key: + await websocket.send_json({ + "type": "error", + "message": "未配置API Key" + }) + return + + dashscope.api_key = api_key + + try: + event_queue = config["event_queue"] + + class 
WebSocketCallback(QwenTtsRealtimeCallback): + def __init__(self, ws, queue): + self.ws = ws + self.queue = queue + + def on_open(self): + pass + + def on_close(self, close_status_code, close_msg): + self.queue.put({ + "type": "error", + "message": f"连接异常关闭 ({close_status_code}): {close_msg}" + }) + + def on_event(self, response): + try: + event_type = response.get('type', '') + if event_type == 'response.audio.delta': + audio_data = base64.b64decode(response['delta']) + self.queue.put({ + "type": "audio", + "data": base64.b64encode(audio_data).decode() + }) + elif event_type == 'response.done': + self.queue.put({"type": "done"}) + elif event_type == 'session.finished': + self.queue.put({"type": "finished"}) + elif event_type == 'error': + self.queue.put({ + "type": "error", + "message": response.get('error').get('message') + }) + except Exception as e: + self.queue.put({"type": "error", "message": str(e)}) + + def on_error(self, message): + self.queue.put({"type": "error", "message": message}) + + def run_tts(): + try: + callback = WebSocketCallback(websocket, event_queue) + + qwen_tts_realtime = QwenTtsRealtime( + model=config["model"], + callback=callback, + url=config["websocket_url"] + ) + + qwen_tts_realtime.connect() + + if config["voice_name"]: + qwen_tts_realtime.update_session( + voice=config["voice_name"], + response_format=AudioFormat.PCM_24000HZ_MONO_16BIT, + mode='server_commit' + ) + else: + qwen_tts_realtime.update_session( + response_format=AudioFormat.PCM_24000HZ_MONO_16BIT, + mode='server_commit' + ) + + qwen_tts_realtime.append_text(text) + qwen_tts_realtime.finish() + except Exception as e: + event_queue.put({"type": "error", "message": str(e)}) + + thread = threading.Thread(target=run_tts) + thread.start() + + await websocket.send_json({ + "type": "started" + }) + + while True: + try: + event = event_queue.get(timeout=60) + await websocket.send_json(event) + + if event.get("type") in ["finished", "error"]: + break + except queue.Empty: + break 
+ + thread.join(timeout=5) + + except Exception as e: + await websocket.send_json({ + "type": "error", + "message": str(e) + }) + + async def close(self, websocket): + if websocket in self.active_connections: + del self.active_connections[websocket] diff --git a/backend/services/tts_local.py b/backend/services/tts_local.py new file mode 100644 index 0000000..57a6b43 --- /dev/null +++ b/backend/services/tts_local.py @@ -0,0 +1,135 @@ +import os +import torch +import threading +import base64 +import numpy as np +from qwen_tts import Qwen3TTSModel +from .tts_service import TTSServiceBase + + +LOCAL_DIR = os.path.dirname(os.path.abspath(__file__)) + +class TTSServiceLocal(TTSServiceBase): + _base_model = None + _custom_model = None + _model_lock = threading.Lock() + + def __init__(self): + super().__init__() + self._ensure_models_loaded() + + def _ensure_models_loaded(self): + from qwen_tts import Qwen3TTSModel + + with TTSServiceLocal._model_lock: + if TTSServiceLocal._base_model is None: + print("正在预加载本地 TTS 模型(单例模式),请稍候...") + + TTSServiceLocal._base_model = Qwen3TTSModel.from_pretrained( + "Qwen/Qwen3-TTS-12Hz-1.7B-Base", + device_map="cuda:0", + dtype=torch.bfloat16, + ) + TTSServiceLocal._custom_model = Qwen3TTSModel.from_pretrained( + "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice", + device_map="cuda:0", + dtype=torch.bfloat16, + ) + print("本地 TTS 模型预加载完成。") + + @property + def base_model(self): + return TTSServiceLocal._base_model + + @property + def custom_model(self): + return TTSServiceLocal._custom_model + + async def connect(self, websocket, message): + print(message) + voice_type = message.get("voice_type", "official") + voice_name = message.get("voice_name") + + # 将连接信息存储在 active_connections 中,而不是全局 self.model, + # 因为可能有多个并发连接 + self.active_connections[websocket] = { + "voice_type": voice_type, + "voice_name": voice_name + } + + await websocket.send_json({ + "type": "connected", + "message": "本地TTS模型连接成功" + }) + + async def synthesize(self, websocket, message): 
+ print(message) + if websocket not in self.active_connections: + await websocket.send_json({"type": "error", "message": "请先连接"}) + return + + conn_info = self.active_connections[websocket] + voice_type = conn_info["voice_type"] + voice_name = conn_info["voice_name"] + + # 同步生成 + try: + if voice_type == "design" or voice_type == "clone": + ref_audio = os.path.join(LOCAL_DIR, "../previews", voice_name + "_preview.wav") + print(ref_audio) + wavs, sr = self.base_model.generate_voice_clone( + text=message.get("text"), + language=message.get("language", "auto"), + ref_audio=ref_audio, + x_vector_only_mode=True, + ) + elif voice_type == "official": + # 这是一个简化的示例,实际生成参数请根据 qwen_tts 的 API 调整 + wavs, sr = self.custom_model.generate_custom_voice( + text=message.get("text"), + language=message.get("language", "auto"), + speaker=voice_name, + instruct=message.get("instruct", ""), + ) + + # 处理音频数据 + audio_data = wavs + if isinstance(audio_data, torch.Tensor): + audio_data = audio_data.cpu().float().numpy() + elif isinstance(audio_data, list): + audio_data = np.array(audio_data) + + # 确保已经是 numpy 数组 + if not isinstance(audio_data, np.ndarray): + audio_data = np.array(audio_data) + + # 确保是 1D 数组 + if audio_data.ndim > 1: + audio_data = audio_data.flatten() + + # 转换为 Int16 PCM + if audio_data.dtype.kind == 'f': + # 裁剪并归一化到 Int16 范围 + audio_data = np.clip(audio_data, -1.0, 1.0) + audio_data = (audio_data * 32767).astype(np.int16) + + # 转换为 Base64 + pcm_data = audio_data.tobytes() + b64_data = base64.b64encode(pcm_data).decode('utf-8') + + # 发送音频数据 + await websocket.send_json({ + "type": "audio", + "data": b64_data + }) + + # 发送完成信号 + await websocket.send_json({ + "type": "finished", + "message": "合成完成" + }) + except Exception as e: + await websocket.send_json({"type": "error", "message": str(e)}) + + async def close(self, websocket): + pass diff --git a/backend/services/tts_service.py b/backend/services/tts_service.py index be3f56e..06877da 100644 --- 
a/backend/services/tts_service.py +++ b/backend/services/tts_service.py @@ -1,173 +1,13 @@ -import os -import base64 -import threading -import queue -from dotenv import load_dotenv - - -load_dotenv() - -try: - import dashscope - from dashscope.audio.qwen_tts_realtime import QwenTtsRealtime, QwenTtsRealtimeCallback, AudioFormat - STREAMING_AVAILABLE = True -except ImportError: - STREAMING_AVAILABLE = False - -class TTSService: +class TTSServiceBase: def __init__(self): self.active_connections = {} self.active_tts = {} async def connect(self, websocket, message): - voice_type = message.get("voice_type", "design") - voice_name = message.get("voice_name") - websocket_url = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime" - - if voice_type == "design" and voice_name: - model = "qwen3-tts-vd-realtime-2025-12-16" - elif voice_type == "clone" and voice_name: - model = "qwen3-tts-vc-realtime-2025-11-27" - elif voice_type == "official": - model = "qwen3-tts-flash-realtime-2025-11-27" - else: - model = "qwen3-tts-flash" - - self.active_connections[websocket] = { - "model": model, - "voice_name": voice_name, - "websocket_url": websocket_url, - "event_queue": queue.Queue() - } - - await websocket.send_json({ - "type": "connected", - "message": "WebSocket连接成功" - }) + raise NotImplementedError async def synthesize(self, websocket, message): - if websocket not in self.active_connections: - await websocket.send_json({ - "type": "error", - "message": "请先连接" - }) - return - - config = self.active_connections[websocket] - text = message.get("text") - - if not text: - await websocket.send_json({ - "type": "error", - "message": "请输入文本" - }) - return - - api_key = os.getenv("DASHSCOPE_API_KEY") - if not api_key: - await websocket.send_json({ - "type": "error", - "message": "未配置API Key" - }) - return - - dashscope.api_key = api_key - - try: - event_queue = config["event_queue"] - - class WebSocketCallback(QwenTtsRealtimeCallback): - def __init__(self, ws, queue): - self.ws = ws - 
self.queue = queue - - def on_open(self): - pass - - def on_close(self, close_status_code, close_msg): - self.queue.put({ - "type": "error", - "message": f"连接异常关闭 ({close_status_code}): {close_msg}" - }) - - def on_event(self, response): - try: - event_type = response.get('type', '') - if event_type == 'response.audio.delta': - audio_data = base64.b64decode(response['delta']) - self.queue.put({ - "type": "audio", - "data": base64.b64encode(audio_data).decode() - }) - elif event_type == 'response.done': - self.queue.put({"type": "done"}) - elif event_type == 'session.finished': - self.queue.put({"type": "finished"}) - elif event_type == 'error': - self.queue.put({ - "type": "error", - "message": response.get('error').get('message') - }) - except Exception as e: - self.queue.put({"type": "error", "message": str(e)}) - - def on_error(self, message): - self.queue.put({"type": "error", "message": message}) - - def run_tts(): - try: - callback = WebSocketCallback(websocket, event_queue) - - qwen_tts_realtime = QwenTtsRealtime( - model=config["model"], - callback=callback, - url=config["websocket_url"] - ) - - qwen_tts_realtime.connect() - - if config["voice_name"]: - qwen_tts_realtime.update_session( - voice=config["voice_name"], - response_format=AudioFormat.PCM_24000HZ_MONO_16BIT, - mode='server_commit' - ) - else: - qwen_tts_realtime.update_session( - response_format=AudioFormat.PCM_24000HZ_MONO_16BIT, - mode='server_commit' - ) - - qwen_tts_realtime.append_text(text) - qwen_tts_realtime.finish() - except Exception as e: - event_queue.put({"type": "error", "message": str(e)}) - - thread = threading.Thread(target=run_tts) - thread.start() - - await websocket.send_json({ - "type": "started" - }) - - while True: - try: - event = event_queue.get(timeout=60) - await websocket.send_json(event) - - if event.get("type") in ["finished", "error"]: - break - except queue.Empty: - break - - thread.join(timeout=5) - - except Exception as e: - await websocket.send_json({ - "type": 
"error", - "message": str(e) - }) + raise NotImplementedError async def close(self, websocket): - if websocket in self.active_connections: - del self.active_connections[websocket] + raise NotImplementedError diff --git a/frontend/.env.example b/frontend/.env.example index 388b903..3fe962a 100644 --- a/frontend/.env.example +++ b/frontend/.env.example @@ -4,6 +4,5 @@ VITE_QWEN3_TTS_ENV="aliyun" # DashScope API Key (环境为 aliyun 时必填) DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxx -# 配置说明: -# 1. 请将此文件复制为 .env 并填写真实的 API Key -# 2. 更多配置选项可在代码中通过前端界面设置 +# Huggingface 镜像站 +# HF_ENDPOINT=https://hf-mirror.com diff --git a/frontend/src/views/OfficialVoice.vue b/frontend/src/views/OfficialVoice.vue index 9e7aa1e..68bde6b 100644 --- a/frontend/src/views/OfficialVoice.vue +++ b/frontend/src/views/OfficialVoice.vue @@ -65,12 +65,24 @@

语音合成

- - - {{ selectedVoiceInfo?.icon }} {{ selectedVoiceInfo?.displayName }} - - 未选择 - + + + + + {{ selectedVoiceInfo?.icon }} {{ selectedVoiceInfo?.displayName }} + + 未选择 + + + + + + + + { setTimeout(() => { ws.send(JSON.stringify({ action: 'synthesize', - text: ttsText.value + text: ttsText.value, + instruct: ttsInstruct.value })) }, 500) } diff --git a/frontend/src/views/VoiceClone.vue b/frontend/src/views/VoiceClone.vue index 3ce05d1..0128451 100644 --- a/frontend/src/views/VoiceClone.vue +++ b/frontend/src/views/VoiceClone.vue @@ -8,9 +8,9 @@

元视界AI妙妙屋—魔法语音

音色克隆
- + - +
- + - + - + - +

音频预览

- +

已克隆的音色

@@ -127,7 +127,7 @@
- +

语音合成

@@ -145,7 +145,7 @@ /> - + - + - +
- - + @@ -395,13 +394,13 @@ const startRecording = async () => { ElMessage.warning('当前为上传模式,请切换到录音克隆') return } - + // 检查是否为安全上下文(录音功能在非安全上下文如 HTTP + IP 地址下不可用) if (window.isSecureContext === false) { ElMessage.error('录音功能受浏览器安全策略限制,请使用 http://localhost:3001 或 https 协议访问') return } - + // 检查浏览器是否支持mediaDevices API if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) { // 处理不支持的情况 @@ -420,11 +419,11 @@ const startRecording = async () => { } return } - + // 现代浏览器支持方式 const stream = await navigator.mediaDevices.getUserMedia({ audio: true }) handleMediaStream(stream) - + } catch (error) { ElMessage.error('无法访问麦克风: ' + error.message) } @@ -434,21 +433,21 @@ const startRecording = async () => { const handleMediaStream = (stream) => { mediaRecorder = new MediaRecorder(stream) audioChunks = [] - + mediaRecorder.ondataavailable = (event) => { audioChunks.push(event.data) } - + mediaRecorder.onstop = () => { recordedBlob.value = new Blob(audioChunks, { type: 'audio/wav' }) recordedUrl.value = URL.createObjectURL(recordedBlob.value) stream.getTracks().forEach(track => track.stop()) } - + mediaRecorder.start() isRecording.value = true remainingTime.value = recordDuration.value - + const timer = setInterval(() => { remainingTime.value-- if (remainingTime.value <= 0) { @@ -470,19 +469,19 @@ const cloneVoice = async () => { ElMessage.warning('请先选择或录制音频') return } - + const seconds = await measureBlobDuration(recordedBlob.value) if (seconds < 1) { ElMessage.error('音频过短,请上传至少1秒的音频') return } - + const formData = new FormData() const wavBlob = await convertToWav(recordedBlob.value) formData.append('audio_file', wavBlob, 'recorded.wav') formData.append('preferred_name', await toSlug(form.value.display_name || form.value.preferred_name)) formData.append('display_name', form.value.display_name || form.value.preferred_name || '') - + try { await cloneVoiceApi(formData) ElMessage.success('声音克隆成功') @@ -517,26 +516,26 @@ const synthesize = async () => { 
ElMessage.warning('请选择音色') return } - + if (!ttsText.value) { ElMessage.warning('请输入文本') return } - + synthesizing.value = true audioUrl.value = '' - + try { const wsUrl = `${location.protocol === 'https:' ? 'wss' : 'ws'}://${location.host}/ws/tts/streaming` const ws = new WebSocket(wsUrl) - + ws.onopen = () => { ws.send(JSON.stringify({ action: 'connect', voice_type: 'clone', voice_name: selectedVoice.value })) - + setTimeout(() => { ws.send(JSON.stringify({ action: 'synthesize', @@ -544,12 +543,12 @@ const synthesize = async () => { })) }, 500) } - + let audioChunks = [] - + ws.onmessage = (event) => { const data = JSON.parse(event.data) - + if (data.type === 'audio') { audioChunks.push(data.data) } else if (data.type === 'finished') { @@ -562,12 +561,12 @@ const synthesize = async () => { ws.close() } } - + ws.onerror = () => { ElMessage.error('WebSocket连接失败') synthesizing.value = false } - + } catch (error) { ElMessage.error('语音合成失败: ' + error.message) synthesizing.value = false diff --git a/frontend/src/views/VoiceDesign.vue b/frontend/src/views/VoiceDesign.vue index 6fa6879..462e825 100644 --- a/frontend/src/views/VoiceDesign.vue +++ b/frontend/src/views/VoiceDesign.vue @@ -180,7 +180,6 @@ const selectedVoice = ref('') const ttsText = ref('') const audioUrl = ref('') const synthesizing = ref(false) -const settingsVisible = ref(false) const audioRefs = ref({}) const optimizing = ref(false) @@ -243,8 +242,6 @@ const goBack = () => { router.push('/') } -const showSettings = () => {} - const toSlug = async (s) => { const isAscii = /^[a-zA-Z0-9\-\s]+$/.test(s || '') if (isAscii) { From 8ad932c483721355400a8c42f7e444cdd67e4eeb Mon Sep 17 00:00:00 2001 From: yuzhiheng Date: Sat, 31 Jan 2026 09:49:40 +0800 Subject: [PATCH 03/12] feat: Add local voice cloning and design services with `ref_text` support and environment-based service selection. 
--- README.md | 9 +++ backend/api/voice_clone.py | 19 ++++-- backend/api/voice_design.py | 9 ++- backend/main.py | 12 ++-- ...voice_clone_service.py => clone_aliyun.py} | 24 +++---- backend/services/clone_local.py | 54 +++++++++++++++ backend/services/clone_service.py | 16 +++++ ...ice_design_service.py => design_aliyun.py} | 20 ++---- backend/services/design_local.py | 68 +++++++++++++++++++ backend/services/design_service.py | 19 ++++++ backend/services/tts_local.py | 19 ++++-- backend/utils/storage.py | 26 +++---- frontend/package-lock.json | 15 ++-- frontend/package.json | 4 +- frontend/src/views/VoiceClone.vue | 34 ++++++++-- frontend/src/views/VoiceDesign.vue | 18 ++++- 16 files changed, 289 insertions(+), 77 deletions(-) rename backend/services/{voice_clone_service.py => clone_aliyun.py} (88%) create mode 100644 backend/services/clone_local.py create mode 100644 backend/services/clone_service.py rename backend/services/{voice_design_service.py => design_aliyun.py} (93%) create mode 100644 backend/services/design_local.py create mode 100644 backend/services/design_service.py diff --git a/README.md b/README.md index 5f4c986..f9a7083 100644 --- a/README.md +++ b/README.md @@ -241,6 +241,15 @@ MIT License ## 更新日志 +### v1.2.0 (2026-01-31) + +- **新增功能**: + - 新增“官方音色”模块,预置多种高质量官方音色。 + - 支持本地模型运行,可在无网络环境下使用基础音色功能。 +- **架构优化**: + - 重构后端服务,支持阿里云 API 与本地模型双引擎切换。 + - 优化项目版本管理,同步版本号至 v1.2.0。 + ### v1.1.0 (2025-12-28) - **功能优化**: diff --git a/backend/api/voice_clone.py b/backend/api/voice_clone.py index 48727c9..43f46d6 100644 --- a/backend/api/voice_clone.py +++ b/backend/api/voice_clone.py @@ -1,7 +1,13 @@ +import os from fastapi import APIRouter, HTTPException, UploadFile, File, Form from pydantic import BaseModel from typing import List, Optional -from services.voice_clone_service import VoiceCloneService + +if os.getenv("VITE_QWEN3_TTS_ENV") == "aliyun": + from services.clone_aliyun import CloneServiceAliyun as VoiceCloneService +else: + from services.clone_local 
import CloneServiceLocal as VoiceCloneService + router = APIRouter() voice_clone_service = VoiceCloneService() @@ -16,30 +22,33 @@ class VoiceResponse(BaseModel): description: str display_name: str audio_file: str + ref_text: str created_at: str @router.post("/clone", response_model=dict) async def clone_voice( audio_file: UploadFile = File(...), preferred_name: Optional[str] = Form(None), - display_name: Optional[str] = Form(None) + display_name: Optional[str] = Form(None), + ref_text: Optional[str] = Form(None) ): try: from pathlib import Path BASE_DIR = Path(__file__).resolve().parent.parent upload_dir = BASE_DIR / "uploads" upload_dir.mkdir(exist_ok=True) - + file_path = upload_dir / audio_file.filename with open(file_path, "wb") as f: content = await audio_file.read() f.write(content) - + try: result = voice_clone_service.clone_voice( audio_file=str(file_path), preferred_name=preferred_name, - display_name=display_name + display_name=display_name, + ref_text=ref_text ) except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) diff --git a/backend/api/voice_design.py b/backend/api/voice_design.py index 6af92bd..bc873bb 100644 --- a/backend/api/voice_design.py +++ b/backend/api/voice_design.py @@ -1,7 +1,13 @@ +import os from fastapi import APIRouter, HTTPException from pydantic import BaseModel from typing import List, Optional -from services.voice_design_service import VoiceDesignService + +if os.getenv("VITE_QWEN3_TTS_ENV") == "aliyun": + from services.design_aliyun import DesignServiceAliyun as VoiceDesignService +else: + from services.design_local import DesignServiceLocal as VoiceDesignService + router = APIRouter() voice_design_service = VoiceDesignService() @@ -17,6 +23,7 @@ class VoiceResponse(BaseModel): description: str display_name: str preview_file: str + ref_text: str created_at: str @router.post("/create", response_model=dict) diff --git a/backend/main.py b/backend/main.py index d62035d..68fe5ad 100644 --- a/backend/main.py +++ 
b/backend/main.py @@ -5,12 +5,14 @@ import os from pathlib import Path from dotenv import load_dotenv -from api import voice_design, voice_clone, tts, utils -from contextlib import asynccontextmanager - +# 加载环境变量必须在导入 api 模块之前,以便 api 模块内部能正确读取配置 load_dotenv() +from api import voice_clone, voice_design, tts, utils +from contextlib import asynccontextmanager + + @asynccontextmanager async def lifespan(app: FastAPI): # 在应用启动时初始化 TTS 服务(仅在 worker 进程中运行) @@ -20,7 +22,7 @@ async def lifespan(app: FastAPI): app = FastAPI( title="元视界AI妙妙屋—声音魔法 API", description="基于千问3 TTS 的音色创造和音色克隆服务", - version="1.0.0", + version="1.2.0", lifespan=lifespan ) @@ -56,7 +58,7 @@ async def test_audio(filename: str): @app.get("/") async def root(): - return {"message": "元视界AI妙妙屋—声音魔法 API", "version": "1.0.0"} + return {"message": "元视界AI妙妙屋—声音魔法 API", "version": "1.2.0"} @app.get("/health") async def health(): diff --git a/backend/services/voice_clone_service.py b/backend/services/clone_aliyun.py similarity index 88% rename from backend/services/voice_clone_service.py rename to backend/services/clone_aliyun.py index 2959c88..79fc959 100644 --- a/backend/services/voice_clone_service.py +++ b/backend/services/clone_aliyun.py @@ -2,21 +2,20 @@ import json import base64 import requests -from dotenv import load_dotenv -from utils.storage import VoiceStorage from pathlib import Path -load_dotenv() +from services.clone_service import CloneServiceBase -class VoiceCloneService: + +class CloneServiceAliyun(CloneServiceBase): def __init__(self): + super().__init__() + self.api_key = os.getenv("DASHSCOPE_API_KEY") - self.storage = VoiceStorage("data/cloned_voices.json") self.customization_url = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization" - self.target_model = "qwen3-tts-vc-realtime-2025-11-27" - def clone_voice(self, audio_file, preferred_name=None, display_name=None): + def clone_voice(self, audio_file, ref_text=None, preferred_name=None, display_name=None): if not 
self.api_key: raise ValueError("未找到API Key,请先配置") @@ -88,7 +87,8 @@ def clone_voice(self, audio_file, preferred_name=None, display_name=None): voice_name=voice_name, description="录音克隆", display_name=display_name or preferred_name or voice_name, - audio_file=audio_file + audio_file=audio_file, + ref_text=ref_text, ) return { @@ -96,6 +96,7 @@ def clone_voice(self, audio_file, preferred_name=None, display_name=None): "description": "录音克隆", "display_name": display_name or preferred_name or voice_name, "audio_file": audio_file, + "ref_text": ref_text, "created_at": result.get("created_at", "") } @@ -105,10 +106,3 @@ def clone_voice(self, audio_file, preferred_name=None, display_name=None): raise Exception(f"网络请求失败: {e}") except Exception as e: raise Exception(f"发生错误: {e}") - - def list_voices(self): - voices = self.storage.list_voices() - return voices - - def delete_voice(self, voice_name): - return self.storage.delete_voice(voice_name) diff --git a/backend/services/clone_local.py b/backend/services/clone_local.py new file mode 100644 index 0000000..5570278 --- /dev/null +++ b/backend/services/clone_local.py @@ -0,0 +1,54 @@ +import os +from pathlib import Path +import time + +from .clone_service import CloneServiceBase + + +class CloneServiceLocal(CloneServiceBase): + def __init__(self): + super().__init__() + + def clone_voice(self, audio_file, ref_text=None, preferred_name=None, display_name=None): + file_path = Path(audio_file) + if not file_path.exists(): + raise FileNotFoundError(f"音频文件不存在: {audio_file}") + + with open(file_path, "rb") as f: + header = f.read(44) + if len(header) < 44: + raise ValueError("音频文件格式不正确") + sample_rate = int.from_bytes(header[24:28], "little") + channels = int.from_bytes(header[22:24], "little") + bits_per_sample = int.from_bytes(header[34:36], "little") + data_size = int.from_bytes(header[40:44], "little") + if sample_rate == 0 or channels == 0 or bits_per_sample == 0: + raise ValueError("音频文件元数据异常") + duration = data_size / 
(sample_rate * channels * (bits_per_sample // 8)) + if duration < 1.0: + raise ValueError("音频过短,请上传至少1秒的音频") + + audio_bytes = file_path.read_bytes() + save_filename = f"{preferred_name}_cloned.wav" + save_filepath = f"uploads/{save_filename}" + os.makedirs("uploads", exist_ok=True) + with open(save_filepath, "wb") as f: + f.write(audio_bytes) + + print(ref_text) + self.storage.add_voice( + voice_name=preferred_name, + description="录音克隆", + display_name=display_name or preferred_name, + audio_file=save_filename, + ref_text=ref_text, + ) + + return { + "voice_name": preferred_name, + "description": "录音克隆", + "display_name": display_name or preferred_name, + "audio_file": save_filename, + "ref_text": ref_text, + "created_at": time.time() + } diff --git a/backend/services/clone_service.py b/backend/services/clone_service.py new file mode 100644 index 0000000..b655552 --- /dev/null +++ b/backend/services/clone_service.py @@ -0,0 +1,16 @@ +from utils.storage import VoiceStorage + + +class CloneServiceBase: + def __init__(self): + self.storage = VoiceStorage("data/cloned_voices.json") + + def clone_voice(self, audio_file, ref_text=None, preferred_name=None, display_name=None): + raise NotImplementedError + + def list_voices(self): + voices = self.storage.list_voices() + return voices + + def delete_voice(self, voice_name): + return self.storage.delete_voice(voice_name) diff --git a/backend/services/voice_design_service.py b/backend/services/design_aliyun.py similarity index 93% rename from backend/services/voice_design_service.py rename to backend/services/design_aliyun.py index 48adec8..bfb8fcf 100644 --- a/backend/services/voice_design_service.py +++ b/backend/services/design_aliyun.py @@ -1,17 +1,15 @@ import os import base64 import requests -from dotenv import load_dotenv -from utils.storage import VoiceStorage +from .design_service import DesignServiceBase -load_dotenv() -class VoiceDesignService: +class DesignServiceAliyun(DesignServiceBase): def __init__(self): + 
super().__init__() + self.api_key = os.getenv("DASHSCOPE_API_KEY") - self.storage = VoiceStorage("data/voices.json") self.voice_design_url = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization" - self.target_model = "qwen3-tts-vd-realtime-2025-12-16" def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的声音。", preferred_name=None, display_name=None): @@ -90,7 +88,8 @@ def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的 voice_name=voice_name, description=voice_prompt, display_name=display_name or preferred_name or voice_name, - preview_file=preview_filename + preview_file=preview_filename, + ref_text=preview_text, ) return { @@ -113,13 +112,6 @@ def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的 traceback.print_exc() raise Exception(f"请求失败: {e}") - def list_voices(self): - voices = self.storage.list_voices() - return voices - - def delete_voice(self, voice_name): - return self.storage.delete_voice(voice_name) - def optimize_prompt(self, prompt): if not self.api_key: raise ValueError("未找到API Key,请先配置") diff --git a/backend/services/design_local.py b/backend/services/design_local.py new file mode 100644 index 0000000..b8f9cb9 --- /dev/null +++ b/backend/services/design_local.py @@ -0,0 +1,68 @@ +import os +import time +import torch +import soundfile as sf +import threading +from .design_service import DesignServiceBase + + +class DesignServiceLocal(DesignServiceBase): + _design_model = None + _model_lock = threading.Lock() + + def __init__(self): + super().__init__() + self._ensure_models_loaded() + + def _ensure_models_loaded(self): + from qwen_tts import Qwen3TTSModel + + with DesignServiceLocal._model_lock: + if DesignServiceLocal._design_model is None: + print("正在预加载本地 DESIGN 模型(单例模式),请稍候...") + + DesignServiceLocal._design_model = Qwen3TTSModel.from_pretrained( + "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign", + device_map="cuda:0", + dtype=torch.bfloat16, + ) + print("本地 DESIGN 模型预加载完成。") + + @property + def 
design_model(self): + return DesignServiceLocal._design_model + + def create_custom_voice(self, voice_prompt, preview_text="你好,这是我的声音。", preferred_name=None, display_name=None): + wavs, sr = self.design_model.generate_voice_design( + text=preview_text, + language="auto", + instruct=voice_prompt, + ) + + preview_filename = f"{preferred_name}_preview.wav" + preview_file = f"previews/{preview_filename}" + os.makedirs("previews", exist_ok=True) + sf.write(preview_file, wavs[0], sr) + + self.storage.add_voice( + voice_name=preferred_name, + description=voice_prompt, + display_name=display_name or preferred_name, + preview_file=preview_filename, + ref_text=preview_text, + ) + + return { + "voice_name": preferred_name, + "description": voice_prompt, + "display_name": display_name or preferred_name, + "preview_file": preview_filename, + "created_at": time.time() + } + + def list_voices(self): + voices = self.storage.list_voices() + return voices + + def delete_voice(self, voice_name): + return self.storage.delete_voice(voice_name) diff --git a/backend/services/design_service.py b/backend/services/design_service.py new file mode 100644 index 0000000..1f23a64 --- /dev/null +++ b/backend/services/design_service.py @@ -0,0 +1,19 @@ +from utils.storage import VoiceStorage + + +class DesignServiceBase: + def __init__(self): + self.storage = VoiceStorage("data/voices.json") + + def create_custom_voice(self, voice_prompt, preview_text, preferred_name=None, display_name=None): + raise NotImplementedError + + def list_voices(self): + voices = self.storage.list_voices() + return voices + + def delete_voice(self, voice_name): + return self.storage.delete_voice(voice_name) + + def optimize_prompt(self, prompt): + raise NotImplementedError diff --git a/backend/services/tts_local.py b/backend/services/tts_local.py index 57a6b43..af3a8a2 100644 --- a/backend/services/tts_local.py +++ b/backend/services/tts_local.py @@ -46,7 +46,6 @@ def custom_model(self): return 
TTSServiceLocal._custom_model async def connect(self, websocket, message): - print(message) voice_type = message.get("voice_type", "official") voice_name = message.get("voice_name") @@ -63,7 +62,6 @@ async def connect(self, websocket, message): }) async def synthesize(self, websocket, message): - print(message) if websocket not in self.active_connections: await websocket.send_json({"type": "error", "message": "请先连接"}) return @@ -74,14 +72,25 @@ async def synthesize(self, websocket, message): # 同步生成 try: - if voice_type == "design" or voice_type == "clone": + if voice_type == "design": ref_audio = os.path.join(LOCAL_DIR, "../previews", voice_name + "_preview.wav") - print(ref_audio) + x_vector_only_mode = message.get("ref_text", "") == "" wavs, sr = self.base_model.generate_voice_clone( text=message.get("text"), language=message.get("language", "auto"), ref_audio=ref_audio, - x_vector_only_mode=True, + ref_text=message.get("ref_text", ""), + x_vector_only_mode=x_vector_only_mode, + ) + elif voice_type == "clone": + ref_audio = os.path.join(LOCAL_DIR, "../uploads", voice_name + "_cloned.wav") + x_vector_only_mode = message.get("ref_text", "") == "" + wavs, sr = self.base_model.generate_voice_clone( + text=message.get("text"), + language=message.get("language", "auto"), + ref_audio=ref_audio, + ref_text=message.get("ref_text", ""), + x_vector_only_mode=x_vector_only_mode, ) elif voice_type == "official": # 这是一个简化的示例,实际生成参数请根据 qwen_tts 的 API 调整 diff --git a/backend/utils/storage.py b/backend/utils/storage.py index e8cd27d..7e4cb73 100644 --- a/backend/utils/storage.py +++ b/backend/utils/storage.py @@ -6,7 +6,7 @@ class VoiceStorage: def __init__(self, storage_file): self.storage_file = storage_file self.voices = self._load_voices() - + def _load_voices(self): if os.path.exists(self.storage_file): try: @@ -16,7 +16,7 @@ def _load_voices(self): print(f"加载音色文件失败: {e}") return {} return {} - + def _save_voices(self): try: os.makedirs(os.path.dirname(self.storage_file), 
exist_ok=True) @@ -24,18 +24,19 @@ def _save_voices(self): json.dump(self.voices, f, ensure_ascii=False, indent=2) except Exception as e: print(f"保存音色文件失败: {e}") - - def add_voice(self, voice_name, description, display_name=None, preview_file=None, audio_file=None): + + def add_voice(self, voice_name, description, display_name=None, preview_file=None, ref_text=None, audio_file=None): import time self.voices[voice_name] = { "description": description, "display_name": display_name or "", "preview_file": preview_file or "", + "ref_text": ref_text or "", "audio_file": audio_file or "", "created_at": time.strftime("%Y-%m-%d %H:%M:%S") } self._save_voices() - + def list_voices(self): voices_list = [] for voice_name, info in self.voices.items(): @@ -44,11 +45,12 @@ def list_voices(self): "description": info.get('description', ''), "display_name": info.get('display_name', ''), "preview_file": info.get('preview_file', ''), + "ref_text": info.get('ref_text', ''), "audio_file": info.get('audio_file', ''), "created_at": info.get('created_at', '') }) return voices_list - + def delete_voice(self, voice_name): if voice_name in self.voices: del self.voices[voice_name] @@ -60,7 +62,7 @@ class SettingsStorage: def __init__(self): self.settings_file = "data/settings.json" self.settings = self._load_settings() - + def _load_settings(self): if os.path.exists(self.settings_file): try: @@ -70,7 +72,7 @@ def _load_settings(self): print(f"加载设置文件失败: {e}") return {} return {} - + def _save_settings(self): try: os.makedirs(os.path.dirname(self.settings_file), exist_ok=True) @@ -78,17 +80,17 @@ def _save_settings(self): json.dump(self.settings, f, ensure_ascii=False, indent=2) except Exception as e: print(f"保存设置文件失败: {e}") - + def save_api_key(self, api_key): self.settings["api_key"] = api_key self._save_settings() - + def get_api_key(self): return self.settings.get("api_key", "") - + def save_region(self, region): self.settings["region"] = region self._save_settings() - + def 
get_region(self): return self.settings.get("region", "beijing") diff --git a/frontend/package-lock.json b/frontend/package-lock.json index f778ce7..8fe8760 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -1,12 +1,12 @@ { "name": "voice-magic-frontend", - "version": "1.0.0", + "version": "1.2.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "voice-magic-frontend", - "version": "1.0.0", + "version": "1.2.0", "dependencies": { "axios": "^1.6.0", "element-plus": "^2.4.4", @@ -842,7 +842,6 @@ "resolved": "https://registry.npmjs.org/@types/lodash-es/-/lodash-es-4.17.12.tgz", "integrity": "sha512-0NgftHUcV4v34VhXm8QBSftKVXtbkBG3ViCjs6+eJ5a6y6Mi/jiFGPc1sC7QK+9BFhWrURE3EOggmWaSxL9OzQ==", "license": "MIT", - "peer": true, "dependencies": { "@types/lodash": "*" } @@ -1371,15 +1370,13 @@ "version": "4.17.21", "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==", - "license": "MIT", - "peer": true + "license": "MIT" }, "node_modules/lodash-es": { "version": "4.17.22", "resolved": "https://registry.npmjs.org/lodash-es/-/lodash-es-4.17.22.tgz", "integrity": "sha512-XEawp1t0gxSi9x01glktRZ5HDy0HXqrM0x5pXQM98EaI0NxO6jVM7omDOxsuEo5UIASAnm2bRp1Jt/e0a2XU8Q==", - "license": "MIT", - "peer": true + "license": "MIT" }, "node_modules/lodash-unified": { "version": "1.0.3", @@ -1580,7 +1577,6 @@ "integrity": "sha512-o5a9xKjbtuhY6Bi5S3+HvbRERmouabWbyUcpXXUA1u+GNUKoROi9byOJ8M0nHbHYHkYICiMlqxkg1KkYmm25Sw==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "esbuild": "^0.21.3", "postcss": "^8.4.43", @@ -1640,7 +1636,6 @@ "resolved": "https://registry.npmjs.org/vue/-/vue-3.5.26.tgz", "integrity": "sha512-SJ/NTccVyAoNUJmkM9KUqPcYlY+u8OVL1X5EW9RIs3ch5H2uERxyyIUI4MRxVCSOiEcupX9xNGde1tL9ZKpimA==", "license": "MIT", - "peer": true, "dependencies": { "@vue/compiler-dom": "3.5.26", "@vue/compiler-sfc": 
"3.5.26", @@ -1699,4 +1694,4 @@ } } } -} +} \ No newline at end of file diff --git a/frontend/package.json b/frontend/package.json index 6bdf307..6a238ad 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "voice-magic-frontend", - "version": "1.0.0", + "version": "1.2.0", "type": "module", "scripts": { "dev": "vite", @@ -18,4 +18,4 @@ "@vitejs/plugin-vue": "^4.4.0", "vite": "^5.0.0" } -} +} \ No newline at end of file diff --git a/frontend/src/views/VoiceClone.vue b/frontend/src/views/VoiceClone.vue index 0128451..3ecb8df 100644 --- a/frontend/src/views/VoiceClone.vue +++ b/frontend/src/views/VoiceClone.vue @@ -40,6 +40,15 @@ + + + + { router.push('/') } -const showSettings = () => { - settingsVisible.value = true -} - const toSlug = async (s) => { const isAscii = /^[a-zA-Z0-9\-\s]+$/.test(s || '') if (isAscii) { @@ -481,12 +491,14 @@ const cloneVoice = async () => { formData.append('audio_file', wavBlob, 'recorded.wav') formData.append('preferred_name', await toSlug(form.value.display_name || form.value.preferred_name)) formData.append('display_name', form.value.display_name || form.value.preferred_name || '') + formData.append('ref_text', form.value.ref_text || '') try { await cloneVoiceApi(formData) ElMessage.success('声音克隆成功') form.value.preferred_name = '' form.value.display_name = '' + form.value.ref_text = '' recordedBlob.value = null recordedUrl.value = '' } catch (error) { @@ -497,6 +509,14 @@ const cloneVoice = async () => { const selectVoice = (voice) => { selectedVoice.value = voice.voice_name + ref_text.value = voice.ref_text +} + +const handleVoiceChange = (voiceName) => { + const voice = cloneVoices.value.find(v => v.voice_name === voiceName) + if (voice) { + ref_text.value = voice.ref_text + } } const deleteVoice = async (voiceName) => { @@ -505,6 +525,7 @@ const deleteVoice = async (voiceName) => { ElMessage.success('音色删除成功') if (selectedVoice.value === voiceName) { selectedVoice.value = '' + ref_text.value = '' 
} } catch (error) { ElMessage.error('音色删除失败: ' + error.message) @@ -539,7 +560,8 @@ const synthesize = async () => { setTimeout(() => { ws.send(JSON.stringify({ action: 'synthesize', - text: ttsText.value + text: ttsText.value, + ref_text: ref_text.value })) }, 500) } diff --git a/frontend/src/views/VoiceDesign.vue b/frontend/src/views/VoiceDesign.vue index 462e825..a4fd5a5 100644 --- a/frontend/src/views/VoiceDesign.vue +++ b/frontend/src/views/VoiceDesign.vue @@ -22,7 +22,7 @@ :rows="3" placeholder="例如:温柔的女声,音色甜美,语速适中" /> -
+
{ const selectVoice = (voice) => { selectedVoice.value = voice.voice_name + refText.value = voice.ref_text +} + +const handleVoiceChange = (voiceName) => { + const voice = designVoices.value.find(v => v.voice_name === voiceName) + if (voice) { + refText.value = voice.ref_text + } } const deleteVoice = async (voiceName) => { @@ -312,6 +324,7 @@ const deleteVoice = async (voiceName) => { ElMessage.success('音色删除成功') if (selectedVoice.value === voiceName) { selectedVoice.value = '' + refText.value = '' } } catch (error) { ElMessage.error('音色删除失败: ' + error.message) @@ -366,7 +379,8 @@ const synthesize = async () => { setTimeout(() => { ws.send(JSON.stringify({ action: 'synthesize', - text: ttsText.value + text: ttsText.value, + ref_text: refText.value })) }, 500) } From 9487400006c16b1f650776a1025106d0ce04ea4a Mon Sep 17 00:00:00 2001 From: zero Date: Sat, 31 Jan 2026 11:08:44 +0800 Subject: =?UTF-8?q?feat:=20=E5=9C=A8=20VoiceClone=20?= =?UTF-8?q?=E9=A1=B5=E9=9D=A2=E4=B8=AD=E6=A0=B9=E6=8D=AE=E5=85=8B=E9=9A=86?= =?UTF-8?q?=E6=A8=A1=E5=BC=8F=E6=9D=A1=E4=BB=B6=E6=98=BE=E7=A4=BA=E5=BD=95?= =?UTF-8?q?=E9=9F=B3=E6=97=B6=E9=95=BF=E8=BE=93=E5=85=A5=E6=A1=86=EF=BC=8C?= =?UTF-8?q?=E5=B9=B6=E7=A7=BB=E9=99=A4=E6=9C=AA=E4=BD=BF=E7=94=A8=E7=9A=84?= =?UTF-8?q?=20`settingsVisible`=20=E5=8F=98=E9=87=8F=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frontend/.env.example | 2 +- frontend/package-lock.json | 2 +- frontend/src/views/VoiceClone.vue | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/frontend/.env.example b/frontend/.env.example index 3fe962a..89523a9 100644 --- a/frontend/.env.example +++ b/frontend/.env.example @@ -2,7 +2,7 @@ VITE_QWEN3_TTS_ENV="aliyun" # DashScope API Key (环境为 aliyun 时必填) -DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxx +DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxx # Huggingface 镜像站 # HF_ENDPOINT=https://hf-mirror.com diff --git a/frontend/package-lock.json 
b/frontend/package-lock.json index 8fe8760..7c2b184 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -1694,4 +1694,4 @@ } } } -} \ No newline at end of file +} diff --git a/frontend/src/views/VoiceClone.vue b/frontend/src/views/VoiceClone.vue index 3ecb8df..bd0adf6 100644 --- a/frontend/src/views/VoiceClone.vue +++ b/frontend/src/views/VoiceClone.vue @@ -30,7 +30,7 @@ 上传音频克隆 - + voiceStore.cloneVoices) const loading = computed(() => voiceStore.loading) From 5f8a940874cc4136ff22c935fa22d755d2eb46b5 Mon Sep 17 00:00:00 2001 From: zero Date: Sat, 31 Jan 2026 13:42:21 +0800 Subject: [PATCH 05/12] =?UTF-8?q?feat:=20=E7=BB=9F=E4=B8=80=E7=8E=AF?= =?UTF-8?q?=E5=A2=83=E9=85=8D=E7=BD=AE=E5=8F=98=E9=87=8F=E5=B9=B6=E5=BC=95?= =?UTF-8?q?=E5=85=A5=20Docker=20=E5=AE=B9=E5=99=A8=E5=8C=96=E9=83=A8?= =?UTF-8?q?=E7=BD=B2=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env.example | 5 +++ Dockerfile.aliyun | 56 +++++++++++++++++++++++++ Dockerfile.local | 81 +++++++++++++++++++++++++++++++++++++ backend/.env.example | 2 +- backend/api/tts.py | 2 +- backend/api/voice_clone.py | 2 +- backend/api/voice_design.py | 2 +- backend/main.py | 25 ++++++++++-- docker-compose.yml | 19 +++++++++ frontend/.env.example | 6 --- 10 files changed, 187 insertions(+), 13 deletions(-) create mode 100644 .env.example create mode 100644 Dockerfile.aliyun create mode 100644 Dockerfile.local create mode 100644 docker-compose.yml diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..a9847aa --- /dev/null +++ b/.env.example @@ -0,0 +1,5 @@ +# DashScope API Key (环境为 aliyun 时必填) +DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxx + +# Huggingface 镜像站 +# HF_ENDPOINT=https://hf-mirror.com diff --git a/Dockerfile.aliyun b/Dockerfile.aliyun new file mode 100644 index 0000000..2fd9200 --- /dev/null +++ b/Dockerfile.aliyun @@ -0,0 +1,56 @@ +# ========================================== +# 第一阶段:构建前端 +# 
========================================== +FROM node:20-slim AS frontend-builder + +WORKDIR /app/frontend + +# 复制依赖文件并安装 (利用 Docker 缓存) +COPY frontend/package*.json ./ +RUN npm install + +# 复制源码并构建 +COPY frontend/ ./ +RUN npm run build + +# ========================================== +# 第二阶段:最终运行环境 +# ========================================== +FROM python:3.11-slim + +# 设置工作目录 +WORKDIR /app + +# 安装必要的系统依赖 +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# 从官方 uv 镜像中安装 uv +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uv/bin/ +ENV PATH="/uv/bin:${PATH}" + +# 复制后端依赖文件并安装 (使用 uv) +COPY backend/requirements.txt ./ +RUN uv pip install --no-cache -r requirements.txt --system + +# 复制后端代码 +COPY backend/ ./ + +# 确保 previews 目录存在 (虽然 main.py 也会检查,但在这里创建更规范) +RUN mkdir -p previews + +# 从第一阶段复制构建好的前端静态文件到后端 static 目录下 +# 注意:我们在 backend/main.py 中配置了从 static 目录服务静态文件 +COPY --from=frontend-builder /app/frontend/dist /app/static + +# 设置环境变量默认值 (可以在运行时通过 -e 覆盖) +ENV QWEN3_TTS_ENV=aliyun +ENV PORT=8000 +# 如果使用阿里云 DashScope,需要在运行容器时传入 DASHSCOPE_API_KEY + +# 暴露端口 +EXPOSE 8000 + +# 启动命令 +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/Dockerfile.local b/Dockerfile.local new file mode 100644 index 0000000..88c875c --- /dev/null +++ b/Dockerfile.local @@ -0,0 +1,81 @@ +# ========================================== +# 第一阶段:构建前端 +# ========================================== +FROM node:20-slim AS frontend-builder +WORKDIR /app/frontend +COPY frontend/package*.json ./ +RUN npm install +COPY frontend/ ./ +RUN npm run build + +# ========================================== +# 第二阶段:最终运行环境 (支持 GPU) +# ========================================== +# 使用 NVIDIA CUDA 运行时作为基础镜像 +FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04 + +# 设置工作目录 +WORKDIR /app + +# 设置环境变量,避免交互式安装提示 +ENV DEBIAN_FRONTEND=noninteractive + +# 安装 Python 3.11 以及必要的系统依赖 +RUN apt-get update && apt-get install -y --no-install-recommends \ + 
python3.11 \ + python3-pip \ + python3.11-dev \ + curl \ + git \ + ffmpeg \ + libsndfile1 \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +# 将 python3.11 设置为默认 python +RUN ln -sf /usr/bin/python3.11 /usr/bin/python \ + && ln -sf /usr/bin/pip3 /usr/bin/pip + +# 安装 uv +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uv/bin/ +ENV PATH="/uv/bin:${PATH}" + +# 复制后端依赖文件 +COPY backend/requirements.txt ./ + +# 1. 安装基础依赖 +RUN uv pip install --no-cache -r requirements.txt --system + +# 2. 安装本地模型运行所需的深度学习依赖 (针对 CUDA 12.1) +RUN uv pip install --no-cache torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 --system + +# 3. 安装 Qwen3-TTS 核心库 +# 注意:如果 pypi 版本未发布,可能需要从 git 安装,这里先尝试 pypi +RUN uv pip install --no-cache qwen-tts --system + +# 4. 可选:安装 FlashAttention 2 以优化性能 (构建时间较长,如不需要可注释掉) +# RUN uv pip install --no-cache flash-attn --no-build-isolation --system + +# 复制后端代码 +COPY backend/ ./ + +# 确保必要的目录存在 +RUN mkdir -p previews data uploads samples + +# 从第一阶段复制构建好的前端静态文件 +COPY --from=frontend-builder /app/frontend/dist /app/static + +# 设置本地运行相关的环境变量 +ENV QWEN3_TTS_ENV=local +ENV PORT=8000 +# 允许下载模型时的超时设置 +ENV HF_HUB_ENABLE_HF_TRANSFER=0 + +# 设置 Hugging Face 缓存目录,方便外部挂载持久化 +ENV HF_HOME=/app/models_cache + +# 暴露端口 +EXPOSE 8000 + +# 启动命令 +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/backend/.env.example b/backend/.env.example index 3fe962a..b8ac443 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -1,5 +1,5 @@ # Qwen3-TTS 环境,可选 aliyun: 阿里云,local: 本地 -VITE_QWEN3_TTS_ENV="aliyun" +QWEN3_TTS_ENV="aliyun" # DashScope API Key (环境为 aliyun 时必填) DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxx diff --git a/backend/api/tts.py b/backend/api/tts.py index bffbd95..7c2f231 100644 --- a/backend/api/tts.py +++ b/backend/api/tts.py @@ -2,7 +2,7 @@ import os import json -if os.getenv("VITE_QWEN3_TTS_ENV") == "aliyun": +if os.getenv("QWEN3_TTS_ENV") == "aliyun": from services.tts_aliyun import TTSServiceAliyun as TTSService 
else: from services.tts_local import TTSServiceLocal as TTSService diff --git a/backend/api/voice_clone.py b/backend/api/voice_clone.py index 43f46d6..f3503bc 100644 --- a/backend/api/voice_clone.py +++ b/backend/api/voice_clone.py @@ -3,7 +3,7 @@ from pydantic import BaseModel from typing import List, Optional -if os.getenv("VITE_QWEN3_TTS_ENV") == "aliyun": +if os.getenv("QWEN3_TTS_ENV") == "aliyun": from services.clone_aliyun import CloneServiceAliyun as VoiceCloneService else: from services.clone_local import CloneServiceLocal as VoiceCloneService diff --git a/backend/api/voice_design.py b/backend/api/voice_design.py index bc873bb..8746d17 100644 --- a/backend/api/voice_design.py +++ b/backend/api/voice_design.py @@ -3,7 +3,7 @@ from pydantic import BaseModel from typing import List, Optional -if os.getenv("VITE_QWEN3_TTS_ENV") == "aliyun": +if os.getenv("QWEN3_TTS_ENV") == "aliyun": from services.design_aliyun import DesignServiceAliyun as VoiceDesignService else: from services.design_local import DesignServiceLocal as VoiceDesignService diff --git a/backend/main.py b/backend/main.py index 68fe5ad..dd1a0cd 100644 --- a/backend/main.py +++ b/backend/main.py @@ -1,6 +1,7 @@ from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from fastapi.staticfiles import StaticFiles +from fastapi.responses import FileResponse import uvicorn import os from pathlib import Path @@ -56,9 +57,27 @@ async def test_audio(filename: str): else: return {"exists": False, "path": str(file_path)} -@app.get("/") -async def root(): - return {"message": "元视界AI妙妙屋—声音魔法 API", "version": "1.2.0"} +# 挂载前端静态文件 +STATIC_DIR = BASE_DIR / "static" +if STATIC_DIR.exists(): + @app.get("/{full_path:path}") + async def serve_spa(full_path: str): + # 排除 API、WS 和预览路径,让它们由各自的路由处理器处理或返回 404 + if any(full_path.startswith(prefix) for prefix in ["api/", "ws/", "previews/"]): + from fastapi.responses import JSONResponse + return JSONResponse(status_code=404, content={"detail": "Not 
Found"}) + + # 检查是否请求的是具体的静态文件 + file_path = STATIC_DIR / full_path + if full_path != "" and file_path.exists() and file_path.is_file(): + return FileResponse(file_path) + + # 默认返回 index.html 支持 Vue Router History 模式 + return FileResponse(STATIC_DIR / "index.html") +else: + @app.get("/") + async def root(): + return {"message": "元视界AI妙妙屋—声音魔法 API", "version": "1.2.0"} @app.get("/health") async def health(): diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..f9860c7 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,19 @@ +services: + voice-magic: + build: + context: . + dockerfile: Dockerfile.aliyun + image: voice-magic:aliyun + container_name: voice-magic + ports: + - "8000:8000" + env_file: + - .env + volumes: + - voice_magic_data:/app/data + - voice_magic_previews:/app/previews + restart: always + +volumes: + voice_magic_data: + voice_magic_previews: diff --git a/frontend/.env.example b/frontend/.env.example index 89523a9..598087d 100644 --- a/frontend/.env.example +++ b/frontend/.env.example @@ -1,8 +1,2 @@ # Qwen3-TTS 环境,可选 aliyun: 阿里云,local: 本地 VITE_QWEN3_TTS_ENV="aliyun" - -# DashScope API Key (环境为 aliyun 时必填) -DASHSCOPE_API_KEY=sk-30eb93fbb7354fe489e1d06f0623e2af - -# Huggingface 镜像站 -# HF_ENDPOINT=https://hf-mirror.com From 1ba6df61a38741dcdfee498982d94d054380e49c Mon Sep 17 00:00:00 2001 From: yuzhiheng Date: Sat, 31 Jan 2026 16:06:15 +0800 Subject: [PATCH 06/12] =?UTF-8?q?build:=20=E6=B7=BB=E5=8A=A0=20.dockerigno?= =?UTF-8?q?re=20=E6=96=87=E4=BB=B6=EF=BC=8C=E4=BB=A5=E5=9C=A8=20Docker=20?= =?UTF-8?q?=E6=9E=84=E5=BB=BA=E8=BF=87=E7=A8=8B=E4=B8=AD=E6=8E=92=E9=99=A4?= =?UTF-8?q?=20Python=20=E8=99=9A=E6=8B=9F=E7=8E=AF=E5=A2=83=E5=92=8C?= =?UTF-8?q?=E7=BC=93=E5=AD=98=E6=96=87=E4=BB=B6=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .dockerignore | 4 ++++ Dockerfile.local | 31 +++++++++++++++++++++++-------- 2 files changed, 27 insertions(+), 8 deletions(-) create mode 
100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..d80b58a --- /dev/null +++ b/.dockerignore @@ -0,0 +1,4 @@ +backend/.venv +.venv +__pycache__ +*.pyc diff --git a/Dockerfile.local b/Dockerfile.local index 88c875c..90eca84 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -11,8 +11,9 @@ RUN npm run build # ========================================== # 第二阶段:最终运行环境 (支持 GPU) # ========================================== -# 使用 NVIDIA CUDA 运行时作为基础镜像 -FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04 +# 使用 NVIDIA CUDA 开发版作为基础镜像 (支持编译 flash-attn) +# RTX 5090 需要 CUDA 12.8+ 才能完整支持其 Blackwell 架构 (SM 10.0) +FROM nvidia/cuda:12.8.0-devel-ubuntu22.04 # 设置工作目录 WORKDIR /app @@ -28,6 +29,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ curl \ git \ ffmpeg \ + sox \ + libsox-fmt-all \ libsndfile1 \ build-essential \ && rm -rf /var/lib/apt/lists/* @@ -46,14 +49,16 @@ COPY backend/requirements.txt ./ # 1. 安装基础依赖 RUN uv pip install --no-cache -r requirements.txt --system -# 2. 安装本地模型运行所需的深度学习依赖 (针对 CUDA 12.1) -RUN uv pip install --no-cache torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 --system +# 2. 安装本地模型运行所需的深度学习依赖 (针对 CUDA 12.8, 支持 RTX 5090) +RUN uv pip install --no-cache torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 --system # 3. 安装 Qwen3-TTS 核心库 # 注意:如果 pypi 版本未发布,可能需要从 git 安装,这里先尝试 pypi RUN uv pip install --no-cache qwen-tts --system # 4. 
可选:安装 FlashAttention 2 以优化性能 (构建时间较长,如不需要可注释掉) +# 8.0 (A100), 8.6 (RTX 30), 8.9 (RTX 40), 9.0 (H100), 10.0 (RTX 50) +# ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0;10.0" # RUN uv pip install --no-cache flash-attn --no-build-isolation --system # 复制后端代码 @@ -68,12 +73,22 @@ COPY --from=frontend-builder /app/frontend/dist /app/static # 设置本地运行相关的环境变量 ENV QWEN3_TTS_ENV=local ENV PORT=8000 -# 允许下载模型时的超时设置 -ENV HF_HUB_ENABLE_HF_TRANSFER=0 - -# 设置 Hugging Face 缓存目录,方便外部挂载持久化 +# 设置 Hugging Face 缓存目录 ENV HF_HOME=/app/models_cache +# 安装 huggingface_hub 和 hf_transfer 用于加速在构建阶段下载模型 +RUN uv pip install --no-cache huggingface_hub[hf_transfer] --system +ENV HF_HUB_ENABLE_HF_TRANSFER=1 + +# 在构建期间下载模型,避免运行时下载 +RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-Base +RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice +RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign + +# 切换到离线模式,禁止程序在运行时尝试连接 Hugging Face 服务器 +ENV HF_HUB_OFFLINE=1 +ENV TRANSFORMERS_OFFLINE=1 + # 暴露端口 EXPOSE 8000 From 443459e0ca9e478b05ecc857764f7b6e2a296d45 Mon Sep 17 00:00:00 2001 From: yuzhiheng Date: Sat, 31 Jan 2026 17:24:18 +0800 Subject: [PATCH 07/12] =?UTF-8?q?refactor:=20=E6=9B=B4=E6=96=B0Docker=20Co?= =?UTF-8?q?mpose=E4=BB=A5=E4=BD=BF=E7=94=A8=E5=B8=A6=E6=9C=89=E4=B8=8A?= =?UTF-8?q?=E4=BC=A0=E9=87=8F=E5=92=8C=E5=A2=9E=E5=BC=BA=E7=9A=84=E9=A2=84?= =?UTF-8?q?=E6=9E=84=E5=BB=BA=E6=98=A0=E5=83=8F=E3=80=82Dockerignore=20'?= =?UTF-8?q?=E6=9D=A5=E6=8E=92=E9=99=A4=E6=95=B0=E6=8D=AE=E5=92=8C=E5=AA=92?= =?UTF-8?q?=E4=BD=93=E6=96=87=E4=BB=B6=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .dockerignore | 7 ++ Dockerfile.local | 6 +- README.md | 173 ++++++++++++++++++++++++--------------------- docker-compose.yml | 7 +- 4 files changed, 105 insertions(+), 88 deletions(-) diff --git a/.dockerignore b/.dockerignore index d80b58a..78c9159 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,3 +2,10 @@ backend/.venv .venv __pycache__ 
*.pyc + +# Data +backend/data/ +backend/uploads/ +backend/previews/ +**/*.wav +**/*.mp3 diff --git a/Dockerfile.local b/Dockerfile.local index 90eca84..855cc7c 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -65,7 +65,7 @@ RUN uv pip install --no-cache qwen-tts --system COPY backend/ ./ # 确保必要的目录存在 -RUN mkdir -p previews data uploads samples +RUN mkdir -p previews data uploads # 从第一阶段复制构建好的前端静态文件 COPY --from=frontend-builder /app/frontend/dist /app/static @@ -85,10 +85,6 @@ RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-Base RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign -# 切换到离线模式,禁止程序在运行时尝试连接 Hugging Face 服务器 -ENV HF_HUB_OFFLINE=1 -ENV TRANSFORMERS_OFFLINE=1 - # 暴露端口 EXPOSE 8000 diff --git a/README.md b/README.md index f9a7083..d85f58a 100644 --- a/README.md +++ b/README.md @@ -10,118 +10,94 @@ - **智能音色创造**:通过自然语言描述生成个性化音色 - **高质量音色克隆**:录制 10 秒语音即可克隆专属音色 +- **多引擎支持**:支持 **阿里云 DashScope API** 与 **本地 Qwen3-TTS 模型** 双引擎切换 +- **双模式运行**: + - **云端模式**:低资源占用,快速部署,需网络连接。 + - **本地模式**:隐私安全,无网络依赖,性能更强(推荐使用 GPU)。 - **实时音频预览**:立即试听生成的音色效果 -- **流式语音合成**:通过 WebSocket 实现低延迟的语音合成 -- **自动音量增益**:内置音频增益处理,确保生成的语音清晰响亮 -- **响应式设计**:适配各种屏幕尺寸的现代化界面 +- **流式语音合成**:通过 WebSocket 实现极低延迟的语音合成 +- **现代化响应式界面**:美观、易用的 Web 后台管理系统 -## 技术栈 +## 运行模式 -### 后端 +项目支持两种运行模式,通过环境变量 `QWEN3_TTS_ENV` 进行切换: -- **FastAPI 0.100+** - 高性能 Python Web 框架 -- **WebSocket** - 实时通信协议 -- **DashScope SDK** - 千问 TTS API 客户端 -- **Python 3.8+** - 编程语言 +1. **aliyun (默认)**:使用阿里云 DashScope 服务。需要配置 `DASHSCOPE_API_KEY`。适用于大多数用户,无需昂贵的 GPU 资源。 +2. **local**:在本地运行 Qwen3-TTS 模型。需要 NVIDIA GPU(建议 RTX 30 系列及以上)和 CUDA 环境。适用于追求极致响应速度和隐私的用户。 -### 前端 +## 快速开始 -- **Vue 3** - 渐进式 JavaScript 框架 -- **Vite** - 下一代前端构建工具 -- **Pinia** - Vue 3 状态管理 -- **Element Plus** - 基于 Vue 3 的 UI 组件库 -- **WebSocket API** - 浏览器实时通信接口 +### 1. 
环境准备 -## 项目结构 +- **如果是本地模式**:需要安装 NVIDIA Driver, CUDA 12.1+, 和 NVIDIA Container Toolkit (用于 Docker)。 +- **如果是云端模式**:只需基础 Docker 环境或 Python 环境。 +- **通用**:获取 [阿里云 API Key](https://help.aliyun.com/zh/model-studio/get-api-key)(仅云端模式需要)。 -``` -Voice_Magic/ -├── backend/ # 后端项目 -│ ├── main.py # FastAPI 主入口(包含静态文件服务配置) -│ ├── requirements.txt # Python 依赖列表 -│ ├── .env.example # 环境变量示例文件 -│ ├── .env # 环境变量(实际使用时配置) -│ ├── previews/ # 音频预览文件存储目录 -│ ├── api/ # API 路由模块 -│ │ ├── voice_design.py # 音色创造 API 端点 -│ │ ├── voice_clone.py # 音色克隆 API 端点 -│ │ ├── settings.py # 设置 API 端点 -│ │ └── tts.py # TTS WebSocket API 端点 -│ ├── services/ # 业务逻辑层 -│ │ ├── voice_design_service.py # 音色创造业务逻辑 -│ │ ├── voice_clone_service.py # 音色克隆业务逻辑 -│ │ └── tts_service.py # TTS 流式服务逻辑 -│ ├── models/ # 数据模型定义 -│ │ └── schemas.py # Pydantic 模型定义 -│ ├── utils/ # 工具函数 -│ │ └── storage.py # 文件和数据存储工具 -│ └── data/ # 数据存储目录 -│ ├── voices.json # 创造的音色数据 -│ ├── cloned_voices.json # 克隆的音色数据 -│ └── settings.json # 应用设置 -├── frontend/ # 前端项目 -│ ├── src/ -│ │ ├── main.js # Vue 应用入口 -│ │ ├── App.vue # 根组件 -│ │ ├── router/ # Vue Router 配置 -│ │ ├── views/ # 页面组件 -│ │ │ ├── Home.vue # 首页 -│ │ │ ├── VoiceDesign.vue # 音色创造页面 -│ │ │ └── VoiceClone.vue # 音色克隆页面 -│ │ ├── components/ # 可复用组件 -│ │ │ └── SettingsModal.vue # 设置弹窗组件 -│ │ ├── api/ # API 调用封装 -│ │ └── stores/ # Pinia 状态管理 -│ ├── public/ # 静态资源 -│ ├── package.json # npm 依赖 -│ └── vite.config.js # Vite 配置 -└── README.md # 项目说明文档 +### 2. Docker 部署 (推荐) + +这是最简单的运行方式,所有依赖已打包。 + +#### 2.1 云端模式 (Aliyun API) + +```bash +# 1. 复制 .env.example 并更名为 .env,填入 API Key +cp .env.example .env + +# 2. 启动容器 +docker compose up -d ``` -## 快速开始 +*注意:默认镜像标签为 `aliyun`。如果需要手动指定,修改 `docker-compose.yml` 中的 image。* -### 1. 环境准备 +#### 2.2 本地模式 (GPU 加速) -#### 环境准备 +```bash +# 1. 修改 .env 配置文件 +# QWEN3_TTS_ENV=local -- Python 3.8+ (推荐使用 Python 3.10) -- Node.js 16+ (推荐使用 Node.js 18+) -- npm 或 yarn 包管理工具 +# 2. 修改 docker-compose.yml 使用 local 镜像标签 +# image: yuzhiheng/voice-magic:local -### 2. 
本地开发模式 +# 3. 启动并开启 GPU 支持 +docker compose up -d +``` + +### 3. 本地开发模式 + +如果您想在本地直接运行源码: -#### 2.1 后端设置 +#### 3.1 后端设置 ```bash # 进入后端目录 cd backend -# 创建并激活虚拟环境(可选但推荐) -python -m venv venv -# Windows: venv\Scripts\activate -# Linux/Mac: source venv/bin/activate - # 安装依赖 pip install -r requirements.txt +# 如果使用本地模型,还需安装 torch 和 qwen-tts +# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 +# pip install qwen-tts + # 配置环境变量 -# 复制示例文件并修改 cp .env.example .env -# 编辑 .env 文件,填入你的千问 API Key +# 编辑 .env 文件,设置 QWEN3_TTS_ENV 和 DASHSCOPE_API_KEY ``` 环境变量说明: -````env -# 千问 API Key(必填) +```env +# 运行环境: aliyun 或 local +QWEN3_TTS_ENV=aliyun + +# 阿里云 API Key(aliyun 模式下必填) DASHSCOPE_API_KEY=your_api_key_here +``` ```bash # 启动后端服务 python main.py -```` - -后端服务将在 http://localhost:8000 启动 +``` #### 3.2 前端设置 @@ -136,7 +112,46 @@ npm install npm run dev ``` -前端服务将在 http://localhost:3000 启动 +### 4. Docker 镜像构建 + +如果您希望从源码自行构建镜像,可以使用以下命令: + +#### 4.1 构建云端版 (aliyun) +```bash +docker build -t voice-magic:aliyun -f Dockerfile.aliyun . +``` + +#### 4.2 构建本地版 (local) +```bash +docker build -t voice-magic:local -f Dockerfile.local . +``` + +## 技术栈 + +### 后端 +- **FastAPI** / **Uvicorn** - Web 框架与服务器 +- **Qwen3-TTS** - 千问语音模型核心 +- **DashScope SDK** - 阿里云模型服务接入 +- **WebSocket** - 实现流式音频传输 + +### 前端 +- **Vue 3** / **Vite** - 现代前端框架与构建工具 +- **Element Plus** - UI 组件库 +- **Pinia** - 状态管理 + +## 项目结构 + +```text +Voice_Magic/ +├── backend/ # Python 后端代码 +│ ├── api/ # 接口定义 (TTS, 克隆, 创作) +│ ├── services/ # 业务逻辑 (阿里云/本地双引擎实现) +│ └── main.py # 启动入口 +├── frontend/ # Vue 前端代码 +├── Dockerfile.aliyun # 云端模式 Docker 配置文件 +├── Dockerfile.local # 本地模式 Docker 配置文件 +└── docker-compose.yml # Docker 编排配置 +``` ## 功能说明 diff --git a/docker-compose.yml b/docker-compose.yml index f9860c7..70c3ade 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,9 +1,6 @@ services: voice-magic: - build: - context: . 
- dockerfile: Dockerfile.aliyun - image: voice-magic:aliyun + image: yuzhiheng/voice-magic:aliyun # Tag 可选: aliyun, local, local-flash-attn container_name: voice-magic ports: - "8000:8000" @@ -12,8 +9,10 @@ services: volumes: - voice_magic_data:/app/data - voice_magic_previews:/app/previews + - voice_magic_uploads:/app/uploads restart: always volumes: voice_magic_data: voice_magic_previews: + voice_magic_uploads: From dd27bbc6bbcbeedb86e3e3c1c80fd3dcc6f6f899 Mon Sep 17 00:00:00 2001 From: yuzhiheng Date: Sat, 31 Jan 2026 17:43:48 +0800 Subject: [PATCH 08/12] =?UTF-8?q?feat:=20=E4=B8=BA=E5=85=8B=E9=9A=86?= =?UTF-8?q?=E7=9A=84=E5=A3=B0=E9=9F=B3=E6=B7=BB=E5=8A=A0=E9=9F=B3=E9=A2=91?= =?UTF-8?q?=E9=A2=84=E8=A7=88=E5=92=8C=E6=92=AD=E6=94=BE=E5=8A=9F=E8=83=BD?= =?UTF-8?q?=EF=BC=8C=E5=8C=85=E6=8B=ACUI=E6=9B=B4=E6=96=B0=E5=92=8C?= =?UTF-8?q?=E4=BB=A3=E7=90=86=E9=85=8D=E7=BD=AE=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile.local | 23 ++++---- backend/main.py | 8 ++- frontend/src/views/VoiceClone.vue | 90 +++++++++++++++++++++++++++---- frontend/vite.config.js | 4 ++ 4 files changed, 102 insertions(+), 23 deletions(-) diff --git a/Dockerfile.local b/Dockerfile.local index 855cc7c..b45cd6f 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -43,6 +43,18 @@ RUN ln -sf /usr/bin/python3.11 /usr/bin/python \ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uv/bin/ ENV PATH="/uv/bin:${PATH}" +# 设置 Hugging Face 缓存目录 +ENV HF_HOME=/app/models_cache + +# 安装 huggingface_hub 和 hf_transfer 用于加速在构建阶段下载模型 +RUN uv pip install --no-cache huggingface_hub[hf_transfer] --system +ENV HF_HUB_ENABLE_HF_TRANSFER=1 + +# 在构建期间下载模型,避免运行时下载 +RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-Base +RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice +RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign + # 复制后端依赖文件 COPY backend/requirements.txt ./ @@ -73,17 +85,6 @@ COPY --from=frontend-builder 
/app/frontend/dist /app/static # 设置本地运行相关的环境变量 ENV QWEN3_TTS_ENV=local ENV PORT=8000 -# 设置 Hugging Face 缓存目录 -ENV HF_HOME=/app/models_cache - -# 安装 huggingface_hub 和 hf_transfer 用于加速在构建阶段下载模型 -RUN uv pip install --no-cache huggingface_hub[hf_transfer] --system -ENV HF_HUB_ENABLE_HF_TRANSFER=1 - -# 在构建期间下载模型,避免运行时下载 -RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-Base -RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice -RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign # 暴露端口 EXPOSE 8000 diff --git a/backend/main.py b/backend/main.py index dd1a0cd..bef1e2f 100644 --- a/backend/main.py +++ b/backend/main.py @@ -37,12 +37,16 @@ async def lifespan(app: FastAPI): BASE_DIR = Path(__file__).resolve().parent PREVIEWS_DIR = BASE_DIR / "previews" +UPLOADS_DIR = BASE_DIR / "uploads" -# 确保previews文件夹存在,不存在则创建 +# 确保文件夹存在 if not PREVIEWS_DIR.exists(): PREVIEWS_DIR.mkdir(parents=True, exist_ok=True) +if not UPLOADS_DIR.exists(): + UPLOADS_DIR.mkdir(parents=True, exist_ok=True) app.mount("/previews", StaticFiles(directory=str(PREVIEWS_DIR)), name="previews") +app.mount("/uploads", StaticFiles(directory=str(UPLOADS_DIR)), name="uploads") app.include_router(voice_design.router, prefix="/api/voice-design", tags=["音色创造"]) app.include_router(voice_clone.router, prefix="/api/voice-clone", tags=["音色克隆"]) @@ -63,7 +67,7 @@ async def test_audio(filename: str): @app.get("/{full_path:path}") async def serve_spa(full_path: str): # 排除 API、WS 和预览路径,让它们由各自的路由处理器处理或返回 404 - if any(full_path.startswith(prefix) for prefix in ["api/", "ws/", "previews/"]): + if any(full_path.startswith(prefix) for prefix in ["api/", "ws/", "previews/", "uploads/"]): from fastapi.responses import JSONResponse return JSONResponse(status_code=404, content={"detail": "Not Found"}) diff --git a/frontend/src/views/VoiceClone.vue b/frontend/src/views/VoiceClone.vue index bd0adf6..b007f93 100644 --- a/frontend/src/views/VoiceClone.vue +++ b/frontend/src/views/VoiceClone.vue @@ -119,20 
+119,39 @@ :key="voice.voice_name" class="voice-card" :class="{ active: selectedVoice === voice.voice_name }" + :data-voice="voice.voice_name" @click="selectVoice(voice)" >

{{ voice.display_name || voice.voice_name }}

- - - +
+ + + + + + +
+

{{ voice.ref_text || '' }}

创建时间: {{ voice.created_at }}

+
@@ -191,7 +210,7 @@ import { ref, onMounted, computed } from 'vue' import { useRouter } from 'vue-router' import { ElMessage } from 'element-plus' -import { ArrowLeft, Delete, Microphone, VideoPause } from '@element-plus/icons-vue' +import { ArrowLeft, Delete, Microphone, VideoPause, VideoPlay } from '@element-plus/icons-vue' import { useVoiceStore } from '@/stores/voice' import api from '@/api' @@ -221,6 +240,7 @@ const ref_text = ref('') const ttsText = ref('') const audioUrl = ref('') const synthesizing = ref(false) +const voiceAudioRefs = ref({}) const cloneVoices = computed(() => voiceStore.cloneVoices) const loading = computed(() => voiceStore.loading) @@ -511,6 +531,21 @@ const selectVoice = (voice) => { ref_text.value = voice.ref_text } +const getAudioUrl = (audioFile) => { + return `/uploads/${audioFile}` +} + +const playVoiceAudio = (voice) => { + const audio = voiceAudioRefs.value[voice.voice_name] + if (audio) { + audio.play().catch(error => { + ElMessage.error('播放失败: ' + error.message) + }) + } else { + ElMessage.error('找不到音频文件') + } +} + const handleVoiceChange = (voiceName) => { const voice = cloneVoices.value.find(v => v.voice_name === voiceName) if (voice) { @@ -662,10 +697,35 @@ h2 { color: #999; } +.voices-section { + max-height: 500px; + overflow-y: auto; + padding-right: 10px; +} + .voices-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(250px, 1fr)); gap: 15px; + padding-bottom: 10px; +} + +.voices-section::-webkit-scrollbar { + width: 8px; +} + +.voices-section::-webkit-scrollbar-track { + background: rgba(255, 154, 158, 0.2); + border-radius: 4px; +} + +.voices-section::-webkit-scrollbar-thumb { + background: rgba(255, 100, 100, 0.5); + border-radius: 4px; +} + +.voices-section::-webkit-scrollbar-thumb:hover { + background: rgba(255, 100, 100, 0.7); } .voice-card { @@ -699,7 +759,17 @@ h2 { color: #333; } -.voice-desc { +.voice-actions { + display: flex; + gap: 5px; +} + +.preview-audio { + width: 100%; + margin-top: 
10px; +} + +.voice-ref-text { font-size: 14px; color: #666; margin-bottom: 5px; diff --git a/frontend/vite.config.js b/frontend/vite.config.js index 71dcadb..22494a4 100644 --- a/frontend/vite.config.js +++ b/frontend/vite.config.js @@ -25,6 +25,10 @@ export default defineConfig({ target: "http://localhost:8000", changeOrigin: true, }, + "/uploads": { + target: "http://localhost:8000", + changeOrigin: true, + }, }, }, }); From 9e096d021354a9b4932c3114483a0cb4401133d6 Mon Sep 17 00:00:00 2001 From: yuzhiheng Date: Sun, 1 Feb 2026 08:39:05 +0800 Subject: [PATCH 09/12] =?UTF-8?q?build:=20=E6=9B=B4=E6=96=B0=E6=9C=AC?= =?UTF-8?q?=E5=9C=B0Dockerfile=E4=BB=A5=E5=8F=8D=E6=98=A0=E5=BC=80?= =?UTF-8?q?=E5=8F=91=E7=8E=AF=E5=A2=83=E8=AE=BE=E7=BD=AE=E7=9A=84=E5=8F=98?= =?UTF-8?q?=E5=8C=96=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile.local | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Dockerfile.local b/Dockerfile.local index b45cd6f..f337059 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -43,18 +43,6 @@ RUN ln -sf /usr/bin/python3.11 /usr/bin/python \ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uv/bin/ ENV PATH="/uv/bin:${PATH}" -# 设置 Hugging Face 缓存目录 -ENV HF_HOME=/app/models_cache - -# 安装 huggingface_hub 和 hf_transfer 用于加速在构建阶段下载模型 -RUN uv pip install --no-cache huggingface_hub[hf_transfer] --system -ENV HF_HUB_ENABLE_HF_TRANSFER=1 - -# 在构建期间下载模型,避免运行时下载 -RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-Base -RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice -RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign - # 复制后端依赖文件 COPY backend/requirements.txt ./ @@ -82,6 +70,18 @@ RUN mkdir -p previews data uploads # 从第一阶段复制构建好的前端静态文件 COPY --from=frontend-builder /app/frontend/dist /app/static +# 设置 Hugging Face 缓存目录 +ENV HF_HOME=/app/models_cache + +# 安装 huggingface_hub 和 hf_transfer 用于加速在构建阶段下载模型 +RUN uv pip install 
--no-cache huggingface_hub[hf_transfer] --system +ENV HF_HUB_ENABLE_HF_TRANSFER=1 + +# 在构建期间下载模型,避免运行时下载 +RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-Base +RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice +RUN huggingface-cli download Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign + # 设置本地运行相关的环境变量 ENV QWEN3_TTS_ENV=local ENV PORT=8000 From 9942447d3a656123f70efbe3684bcf10c1339c02 Mon Sep 17 00:00:00 2001 From: zero Date: Sun, 1 Feb 2026 17:11:55 +0800 Subject: [PATCH 10/12] =?UTF-8?q?chore:=20=E6=9B=B4=E6=96=B0=20Aliyun=20Do?= =?UTF-8?q?ckerfile=20=E9=85=8D=E7=BD=AE=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile.aliyun | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile.aliyun b/Dockerfile.aliyun index 2fd9200..0c8cdf5 100644 --- a/Dockerfile.aliyun +++ b/Dockerfile.aliyun @@ -11,6 +11,7 @@ RUN npm install # 复制源码并构建 COPY frontend/ ./ +RUN cp .env.example .env RUN npm run build # ========================================== From 40f604ccf0678985d96fe95111d1c16e3eceb249 Mon Sep 17 00:00:00 2001 From: yuzhiheng Date: Mon, 2 Feb 2026 10:29:20 +0800 Subject: [PATCH 11/12] =?UTF-8?q?feat:=20=E6=9B=B4=E6=94=B9=20pytorch=20?= =?UTF-8?q?=E7=89=88=E6=9C=AC=E4=B8=BA=202.8.0=EF=BC=8C=E6=94=AF=E6=8C=81?= =?UTF-8?q?=20flash-attn?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile.local | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile.local b/Dockerfile.local index f337059..876db2e 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -50,7 +50,9 @@ COPY backend/requirements.txt ./ RUN uv pip install --no-cache -r requirements.txt --system # 2. 
安装本地模型运行所需的深度学习依赖 (针对 CUDA 12.8, 支持 RTX 5090) -RUN uv pip install --no-cache torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 --system +RUN uv pip install --no-cache \ + torch==2.8.0+cu128 torchvision==0.23.0+cu128 torchaudio==2.8.0+cu128 \ + --index-url https://download.pytorch.org/whl/cu128 --system # 3. 安装 Qwen3-TTS 核心库 # 注意:如果 pypi 版本未发布,可能需要从 git 安装,这里先尝试 pypi From c7a95721b38eae631a1eb5acae1c96bf3b6f030f Mon Sep 17 00:00:00 2001 From: zero Date: Thu, 5 Feb 2026 09:12:57 +0800 Subject: [PATCH 12/12] =?UTF-8?q?refactor:=20=E5=9C=A8=20Dockerfile.local?= =?UTF-8?q?=20=E4=B8=AD=E6=B7=BB=E5=8A=A0=20uv=20=E8=B6=85=E6=97=B6?= =?UTF-8?q?=E7=8E=AF=E5=A2=83=E5=8F=98=E9=87=8F=EF=BC=8C=E9=98=B2=E6=AD=A2?= =?UTF-8?q?=20uv=20=E5=AE=89=E8=A3=85=E8=B6=85=E6=97=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile.local | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile.local b/Dockerfile.local index 876db2e..acc4a33 100644 --- a/Dockerfile.local +++ b/Dockerfile.local @@ -42,6 +42,7 @@ RUN ln -sf /usr/bin/python3.11 /usr/bin/python \ # 安装 uv COPY --from=ghcr.io/astral-sh/uv:latest /uv /uv/bin/ ENV PATH="/uv/bin:${PATH}" +ENV UV_HTTP_TIMEOUT=600 # 复制后端依赖文件 COPY backend/requirements.txt ./