@@ -225,6 +225,7 @@ def _select_best_uvr_deecho_output(self, reference_path: str, candidate_files: L
225225 """Pick the UVR DeEcho branch best suited for VC input."""
226226 best_path = None
227227 best_score = None
228+ best_metrics = None
228229
229230 for candidate_path in candidate_files :
230231 scored = self ._score_uvr_deecho_candidate (reference_path , candidate_path )
@@ -241,7 +242,10 @@ def _select_best_uvr_deecho_output(self, reference_path: str, candidate_files: L
241242 if best_score is None or score > best_score :
242243 best_score = score
243244 best_path = candidate_path
245+ best_metrics = metrics
244246
247+ # 保存最佳候选的质量指标,供 blend 决策使用
248+ self ._uvr_deecho_metrics = best_metrics
245249 return best_path
246250
247251 def _init_separator (
@@ -902,9 +906,12 @@ def _should_apply_source_constraint(
902906 """Decide whether to run source-guided post constraint."""
903907 normalized_mode = str (source_constraint_mode or "auto" ).strip ().lower ()
904908 if normalized_mode == "on" :
909+ if self ._last_vc_preprocess_mode == "direct" :
910+ log .detail ("源约束跳过: direct 模式下源未去回音,强制约束会放大回音伪影" )
911+ return False
905912 return vc_preprocessed
906913 if normalized_mode == "auto" :
907- return vc_preprocessed and self ._last_vc_preprocess_mode in {"uvr_deecho" , "legacy" }
914+ return vc_preprocessed and self ._last_vc_preprocess_mode in {"uvr_deecho" , "legacy" , "advanced_dereverb" }
908915 return False
909916
910917 def _refine_source_constrained_output (
@@ -1022,14 +1029,17 @@ def _blend_direct_with_deecho(
10221029 echo_ratio = echo_ratio [:n_blend ]
10231030
10241031 # --- Blending weight ---
1025- # Base: original low-activity weight (for silent gaps)
1026- base_weight = 0.65 * np .square (1.0 - activity [:n_blend ])
1027- # Echo boost: even during active singing, apply DeEcho proportional
1028- # to detected echo. Max additional contribution capped at 0.55.
1029- echo_boost = 0.55 * echo_ratio * activity [:n_blend ]
1032+ # 全局回音水平驱动系数自适应
1033+ global_echo = float (np .mean (echo_ratio ))
1034+ # 沉默段基权: 轻回音0.65, 重回音0.85
1035+ base_coef = 0.65 + 0.20 * global_echo
1036+ base_weight = base_coef * np .square (1.0 - activity [:n_blend ])
1037+ # 活跃唱段 echo_boost: 轻回音0.55, 重回音0.90
1038+ echo_boost_coef = 0.55 + 0.35 * global_echo
1039+ echo_boost = echo_boost_coef * echo_ratio * activity [:n_blend ]
10301040 deecho_weight = base_weight + echo_boost
10311041 deecho_weight = np .convolve (deecho_weight , smooth_kernel , mode = "same" )
1032- deecho_weight = np .clip (deecho_weight , 0.0 , 0.80 )
1042+ deecho_weight = np .clip (deecho_weight , 0.0 , 0.95 )
10331043 deecho_weight = CoverPipeline ._frame_curve_to_sample_gain (
10341044 deecho_weight ,
10351045 aligned_len ,
@@ -1066,6 +1076,7 @@ def _prepare_vocals_for_vc(
10661076
10671077 # 保存原始混响用于后处理
10681078 self ._original_reverb_path = None
1079+ self ._uvr_deecho_metrics = None
10691080
10701081 if preprocess_mode == "advanced_dereverb" :
10711082 # 使用高级去混响:分离干声和混响
@@ -1096,17 +1107,20 @@ def _prepare_vocals_for_vc(
10961107 log .detail ("VC preprocess: legacy dereverb chain -> mono select" )
10971108 else :
10981109 preprocess_input = vocals_path
1110+ mono_resolved = False
1111+
10991112 if preprocess_mode in {"auto" , "uvr_deecho" }:
11001113 preprocess_input = self ._apply_uvr_deecho_for_vc (vocals_path , session_dir ) or vocals_path
11011114
11021115 if preprocess_input == vocals_path :
1103- # 如果UVR DeEcho不可用,在auto模式下使用advanced dereverb
1104- if preprocess_mode == " auto" :
1116+ if preprocess_mode in { "auto" , "uvr_deecho" }:
1117+ # auto / uvr_deecho 模式在 UVR 模型缺失时都回退到 advanced_dereverb
11051118 audio , sr = librosa .load (vocals_path , sr = None , mono = False )
11061119 audio = self ._ensure_2d (audio ).astype (np .float32 )
11071120 mono = self ._select_mono_for_vc (audio , sr )
11081121
1109- log .detail ("VC preprocess: UVR DeEcho not available, using advanced dereverb" )
1122+ fallback_name = "auto" if preprocess_mode == "auto" else "uvr_deecho"
1123+ log .detail (f"VC preprocess ({ fallback_name } ): UVR DeEcho not available, using advanced dereverb" )
11101124 dry_signal , reverb_tail = advanced_dereverb (mono , sr )
11111125
11121126 # 保存混响用于后处理
@@ -1116,38 +1130,55 @@ def _prepare_vocals_for_vc(
11161130
11171131 mono = dry_signal
11181132 self ._last_vc_preprocess_mode = "advanced_dereverb"
1133+ mono_resolved = True
11191134 log .detail (f"Dry/Wet separation: dry RMS={ np .sqrt (np .mean (dry_signal ** 2 )):.4f} , reverb RMS={ np .sqrt (np .mean (reverb_tail ** 2 )):.4f} " )
11201135 else :
1136+ # direct 模式
11211137 self ._last_vc_preprocess_mode = "direct"
1122- if preprocess_mode == "uvr_deecho" :
1123- log .warning ("Official DeEcho model not found, falling back to direct lead input" )
11241138 log .detail ("VC preprocess: direct lead -> mono select" )
1125- audio , sr = librosa .load (preprocess_input , sr = None , mono = False )
1126- audio = self ._ensure_2d (audio ).astype (np .float32 )
1127- mono = self ._select_mono_for_vc (audio , sr )
11281139 else :
11291140 self ._last_vc_preprocess_mode = "uvr_deecho"
11301141 log .detail ("VC preprocess: UVR learned DeEcho/DeReverb -> mono select" )
11311142
1132- if preprocess_input == vocals_path :
1133- audio , sr = librosa .load (preprocess_input , sr = None , mono = False )
1134- audio = self ._ensure_2d (audio ).astype (np .float32 )
1135- mono = self ._select_mono_for_vc (audio , sr )
1136- else :
1137- direct_audio , sr = librosa .load (vocals_path , sr = None , mono = False )
1138- deecho_audio , deecho_sr = librosa .load (preprocess_input , sr = None , mono = False )
1139- direct_audio = self ._ensure_2d (direct_audio ).astype (np .float32 )
1140- deecho_audio = self ._ensure_2d (deecho_audio ).astype (np .float32 )
1141- direct_mono = self ._select_mono_for_vc (direct_audio , sr )
1142- deecho_mono = self ._select_mono_for_vc (deecho_audio , deecho_sr )
1143- if deecho_sr != sr :
1144- deecho_mono = librosa .resample (
1145- deecho_mono ,
1146- orig_sr = deecho_sr ,
1147- target_sr = sr ,
1148- ).astype (np .float32 )
1149- mono = self ._blend_direct_with_deecho (direct_mono , deecho_mono , sr )
1150- log .detail ("VC preprocess: blended direct lead with UVR DeEcho" )
1143+ # 最终 mono 确定(仅在 mono 未被上面解决时执行)
1144+ if not mono_resolved :
1145+ if preprocess_input == vocals_path :
1146+ audio , sr = librosa .load (preprocess_input , sr = None , mono = False )
1147+ audio = self ._ensure_2d (audio ).astype (np .float32 )
1148+ mono = self ._select_mono_for_vc (audio , sr )
1149+ else :
1150+ direct_audio , sr = librosa .load (vocals_path , sr = None , mono = False )
1151+ deecho_audio , deecho_sr = librosa .load (preprocess_input , sr = None , mono = False )
1152+ direct_audio = self ._ensure_2d (direct_audio ).astype (np .float32 )
1153+ deecho_audio = self ._ensure_2d (deecho_audio ).astype (np .float32 )
1154+ direct_mono = self ._select_mono_for_vc (direct_audio , sr )
1155+ deecho_mono = self ._select_mono_for_vc (deecho_audio , deecho_sr )
1156+ if deecho_sr != sr :
1157+ deecho_mono = librosa .resample (
1158+ deecho_mono ,
1159+ orig_sr = deecho_sr ,
1160+ target_sr = sr ,
1161+ ).astype (np .float32 )
1162+
1163+ # DeEcho 质量检测:用 UVR 候选打分指标判断是否跳过 blend
1164+ uvr_metrics = getattr (self , '_uvr_deecho_metrics' , None )
1165+ skip_blend = False
1166+ if uvr_metrics :
1167+ sep_db = uvr_metrics .get ('separation_db' , 0.0 )
1168+ corr = uvr_metrics .get ('corr' , 0.0 )
1169+ log .detail (
1170+ f"DeEcho quality: sep={ sep_db :.2f} dB, corr={ corr :.3f} "
1171+ )
1172+ # sep > 30dB 且 corr > 0.9 说明 DeEcho 质量好
1173+ if sep_db > 30.0 and corr > 0.9 :
1174+ skip_blend = True
1175+
1176+ if skip_blend :
1177+ mono = deecho_mono
1178+ log .detail ("VC preprocess: UVR DeEcho quality sufficient, using deecho directly (skip blend)" )
1179+ else :
1180+ mono = CoverPipeline ._blend_direct_with_deecho (direct_mono , deecho_mono , sr )
1181+ log .detail ("VC preprocess: blended direct lead with UVR DeEcho (enhanced)" )
11511182
11521183 mono = soft_clip (mono , threshold = 0.9 , ceiling = 0.99 )
11531184
0 commit comments