diff --git a/.gitignore b/.gitignore index f792265..d80845d 100644 --- a/.gitignore +++ b/.gitignore @@ -34,9 +34,10 @@ venv.bak/ *.idea .DS_Store -*.h5 + __pycache__/ *.pyc audio/ -emotion_diary/ +#emotion_diary/ +emotion_png/ pyvenv.cfg \ No newline at end of file diff --git a/app/ML/ModelService.py b/app/ML/ModelService.py new file mode 100644 index 0000000..a89245c --- /dev/null +++ b/app/ML/ModelService.py @@ -0,0 +1,97 @@ +# import numpy as np +# from dotenv import load_dotenv +# from fastapi import Request, UploadFile, File, APIRouter +# from typing import List +# from tensorflow.keras.models import load_model +# from sentence_transformers import SentenceTransformer +# import io +# import requests +# +# from app.ML.audio_extractor_utils import get_features +# from app.ML.loss import boundary_enhanced_focal_loss +# from app.ML.plot_utils import save_plot, get_s3_png_url +# from app.ML.speech_to_text import speech_to_text +# +# import os +# +# from app.service.gpt import EmotionReportGPT +# from app.utils.convertFileExtension import convert_to_wav +# +# router = APIRouter( +# prefix="/api/fastapi", +# ) +# load_dotenv() +# OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +# +# +# +# +# @router.post("/predict") +# async def predict(request: Request, files: List[UploadFile] = File(...)): +# # token = request.headers.get("Authorization").split(" ")[1] +# print(files) +# # 1) 임시 파일 저장 or 메모리 내 처리 +# wav_data_list = [] +# for file in files: +# raw = await file.read() +# ext = file.filename.split('.')[-1] # 'm4a', 'mp3' 등 +# wav_bytes = convert_to_wav(raw, ext) # BytesIO 변환 +# wav_data_list.append(wav_bytes) +# +# # 2) 오디오 특징 추출 +# all_feats = [] +# for wav_bytes in wav_data_list: +# # get_features 함수가 경로 입력이면, 아래처럼 메모리 파일 처리 필요 +# # 임시파일로 저장 후 경로 전달 or get_features 수정 필요 +# +# temp_path = f"temp_{file.filename}" +# with open(temp_path, "wb") as f: +# f.write(wav_bytes) +# feats = get_features(temp_path) +# os.remove(temp_path) +# all_feats.append(feats) +# +# all_feats = np.stack(all_feats, axis=0) +# pooled_feats = all_feats.mean(axis=0) +# audio_input = pooled_feats[np.newaxis, :, np.newaxis] +# +# # 3) STT & 텍스트 임베딩 +# texts = [] +# for wav_bytes in wav_data_list: +# temp_path = f"temp_stt.wav" +# with open(temp_path, "wb") as f: +# f.write(wav_bytes) +# text = speech_to_text(temp_path) +# os.remove(temp_path) +# texts.append(text) +# +# full_text = " . 
".join(texts) +# text_vec = embedding_model.encode([full_text])[0] +# text_input = text_vec[np.newaxis, :] +# +# # 4) 예측 +# prediction = model.predict([audio_input, text_input]) +# pred_percent = (prediction[0] * 100).tolist() +# +# # 5) JSON 응답 +# result = {label: round(p, 2) for label, p in zip(emotion_labels, pred_percent)} +# top_idx = np.argmax(pred_percent) +# result['predicted_emotion'] = emotion_labels[top_idx] +# +# local_path = save_plot(pred_percent) +# s3_path = get_s3_png_url(local_path) +# reporter = EmotionReportGPT(full_text, pred_percent) +# report_text = reporter.get_report_text() +# +# print(s3_path) +# +# # send_emotion_report_to_spring(s3_path, report_text) +# +# data = { +# "imageUrl": s3_path, +# "report_text": report_text +# } +# return data +# +# +# diff --git a/app/ML/audio_extractor_utils.py b/app/ML/audio_extractor_utils.py new file mode 100644 index 0000000..dcace4c --- /dev/null +++ b/app/ML/audio_extractor_utils.py @@ -0,0 +1,69 @@ +import librosa +import librosa.display +import numpy as np + + +def noise(data): + noise_amp = 0.035 * np.random.uniform() * np.amax(data) + data = data + noise_amp * np.random.normal(size=data.shape[0]) + return data + + +def stretch(data, rate=0.8): + return librosa.effects.time_stretch(y=data, rate=rate) + + +def shift(data): + shift_range = int(np.random.uniform(low=-5, high=5) * 1000) + return np.roll(data, shift_range) + + +def pitch(data, sampling_rate, pitch_factor=0.7): + return librosa.effects.pitch_shift(y=data, sr=sampling_rate, n_steps=pitch_factor) + + +def extract_features(data, sample_rate): + # ZCR + result = np.array([]) + zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0) + result = np.hstack((result, zcr)) # stacking horizontally + + # Chroma_stft + stft = np.abs(librosa.stft(data)) + chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0) + result = np.hstack((result, chroma_stft)) # stacking horizontally + + # MFCC + mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0) + result = np.hstack((result, mfcc)) # stacking horizontally + + # Root Mean Square Value + rms = np.mean(librosa.feature.rms(y=data).T, axis=0) + result = np.hstack((result, rms)) # stacking horizontally + + # MelSpectogram + mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0) + result = np.hstack((result, mel)) # stacking horizontally + + return result + + +def get_features(path): + data, sample_rate = librosa.load(path, duration=2.5, offset=0.0) + + # without augmentation + res1 = extract_features(data, sample_rate) + result = np.array(res1) + + # data with noise + noise_data = noise(data) + res2 = extract_features(noise_data, sample_rate) + result = np.concatenate((result, res2), axis=0) + + # data with stretching and pitching + new_data = stretch(data) + data_stretch_pitch = pitch(new_data, sample_rate) + res3 = extract_features(data_stretch_pitch, sample_rate) + result = np.concatenate((result, res3), axis=0) + + return result diff --git a/app/ML/ko-sbert_multimodal_0501_3_resnet_augment_h.h5 b/app/ML/ko-sbert_multimodal_0501_3_resnet_augment_h.h5 new file mode 100644 index 0000000..b4f22d5 Binary files /dev/null and b/app/ML/ko-sbert_multimodal_0501_3_resnet_augment_h.h5 differ diff --git a/app/ML/loss.py b/app/ML/loss.py new file mode 100644 index 0000000..c41199b --- /dev/null +++ b/app/ML/loss.py @@ -0,0 +1,29 @@ +import tensorflow as tf + + +# 1. 
Boundary-Enhanced Focal Loss 구현 (소수 클래스 식별 강화) +def boundary_enhanced_focal_loss(y_true, y_pred, gamma=2.0, margin=0.3): + y_pred = tf.clip_by_value(y_pred, 1e-7, 1.0 - 1e-7) + + # 하드 샘플 마이닝 (낮은 확률로 예측된 샘플 식별) + correct_prob = tf.reduce_sum(y_true * y_pred, axis=-1) + hard_mask = tf.cast(tf.less(correct_prob, margin), tf.float32) + + # 클래스별 가중치 계산 (소수 클래스에 더 높은 가중치) + effective_counts = tf.reduce_sum(y_true, axis=0) + alpha = 1.0 / (effective_counts + 1e-7) + alpha = alpha / tf.reduce_sum(alpha) + + # 소수 클래스 추가 가중치 부여 (surprise, neutral) + class_boost = tf.constant([1.0, 0.5, 1.0, 1.0, 1.0, 2.5, 5.0], dtype=tf.float32) + alpha = alpha * class_boost + + # Focal Loss 계산 + cross_entropy = -y_true * tf.math.log(y_pred) + focal_weight = tf.pow(1.0 - y_pred, gamma) + + # 하드 샘플에 추가 가중치 부여 + sample_weight = 1.0 + hard_mask * 2.0 + loss = sample_weight[:, tf.newaxis] * alpha * focal_weight * cross_entropy + + return tf.reduce_sum(loss) diff --git a/app/ML/plot_utils.py b/app/ML/plot_utils.py new file mode 100644 index 0000000..b7744d0 --- /dev/null +++ b/app/ML/plot_utils.py @@ -0,0 +1,40 @@ +# 그래프 그리기 +import os +from datetime import datetime + +from matplotlib import pyplot as plt +from app.service.s3Service import upload_to_s3_png + +colors = ['#e74c3c', '#3498db', '#f1c40f', '#e67e22', '#9b59b6', '#1abc9c', '#95a5a6'] +emotion_labels = ['angry', 'sadness', 'happiness', 'fear', 'disgust', 'surprise', 'neutral'] + + +def save_plot(predictions_percent): + plt.figure(figsize=(10, 6)) + bars = plt.barh(emotion_labels, predictions_percent, color=colors, alpha=0.85) + + plt.title('Emotion Probability Distribution', fontsize=20, weight='bold', pad=15) + plt.xlabel('Probability (%)', fontsize=14) + plt.xlim(0, max(predictions_percent) + 10) + plt.grid(axis='x', linestyle='--', alpha=0.6) + + for bar, percent in zip(bars, predictions_percent): + width = bar.get_width() + plt.text(width + 0.8, bar.get_y() + bar.get_height() / 2, f'{percent:.1f}%', va='center', fontsize=13, + weight='bold', color='#333') + + plt.yticks(fontsize=14, weight='bold') + plt.tight_layout() + + date_str = datetime.now().strftime("%Y%m%d") + filename = f"{date_str}" + local_path = os.getcwd() + f"/app/emotion_png/{filename}_emotion_distribution.png" + # 이미지 파일로 저장 + plt.savefig(local_path, dpi=300, bbox_inches='tight') + plt.show() + + return local_path + + +def get_s3_png_url(local_path): + return upload_to_s3_png(local_path) diff --git a/app/ML/predict_colab.py b/app/ML/predict_colab.py new file mode 100644 index 0000000..a95d54a --- /dev/null +++ b/app/ML/predict_colab.py @@ -0,0 +1,95 @@ +import numpy as np +import matplotlib.pyplot as plt +from tensorflow.keras.models import load_model +from sentence_transformers import SentenceTransformer +import glob +import os + +from app.ML.audio_extractor_utils import get_features +from app.ML.loss import boundary_enhanced_focal_loss +from app.ML.speech_to_text import speech_to_text +import os +os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' + + +BASE_DIR_resp = "/home/team4/Desktop/capstone/AI/app/emotion_diary" +BASE_DIR_win = "C:/Users/YJG/Desktop/2025_1_capstone_2/AI/app/emotion_diary" +emotion_labels = ['angry', 'sadness', 'happiness', 'fear', 'disgust', 'surprise', 'neutral'] +model_path_resp = "/home/team4/Desktop/capstone/AI/app/ML/ko-sbert_multimodal_0501_3_resnet_augment_h.h5" +model_path_win = "C:/Users/YJG/Desktop/2025_1_capstone_2/AI/app/ML/ko-sbert_multimodal_0501_3_resnet_augment_h.h5" + + +def predict(): + BASE_DIR = BASE_DIR_win + model_path = model_path_win + # (가정) 미리 정의된 
함수/변수 + # get_features(path): (486,) 벡터 반환 + # speech_to_text(path): STT → 문자열 반환 + # boundary_enhanced_focal_loss: 커스텀 손실 + # emotion_labels: ['angry','sadness','happiness','fear','disgust','surprise','neutral'] + # model_path, sample_path: 경로 문자열 + + # 1) WAV 파일 리스트 + # sample_wav_list = [ + # sample_path + "/jg_sadness_1.wav", + # sample_path + "/jg_sadness_2.wav", + # sample_path + "/jg_sadness_3.wav", + # sample_path + "/jg_sadness_4.wav", + # sample_path + "/jg_sadness_5.wav" + # ] + sample_wav_list = glob.glob(os.path.join(BASE_DIR, "**", "*.wav"), recursive=True) + + # 2) 오디오 특징 평균 풀링 + all_feats = np.stack([get_features(p) for p in sample_wav_list], axis=0) # (5,486) + pooled_feats = all_feats.mean(axis=0) # (486,) + + # 3) 모델 입력 형태 맞추기 + audio_input = pooled_feats[np.newaxis, :, np.newaxis] # (1,486,1) + + # 4) 전체 텍스트 STT → 하나의 문장으로 결합 + texts = [speech_to_text(p) for p in sample_wav_list] + full_text = " . ".join(texts) + + # 5) 텍스트 임베딩 + embedding_model = SentenceTransformer('jhgan/ko-sbert-multitask') + text_vec = embedding_model.encode([full_text])[0] # (768,) + text_input = text_vec[np.newaxis, :] # (1,768) + + # 6) 모델 로드 및 예측 + model = load_model(model_path, custom_objects={ + 'boundary_enhanced_focal_loss': boundary_enhanced_focal_loss + }) + prediction = model.predict([audio_input, text_input]) # (1,7) + pred_percent = prediction[0] * 100 # (7,) + + # 7) 콘솔에 출력 + for lbl, p in zip(emotion_labels, pred_percent): + print(f"{lbl}: {p:.2f}%") + top_idx = np.argmax(pred_percent) + print(f"\n최종 예측 감정: {emotion_labels[top_idx]}") + + # 8) 가로 막대그래프 시각화 + colors = ['#e74c3c', '#3498db', '#f1c40f', '#e67e22', '#9b59b6', '#1abc9c', '#95a5a6'] + + plt.figure(figsize=(10, 6)) + bars = plt.barh(emotion_labels, pred_percent, color=colors, alpha=0.85) + + plt.title('Emotion Probability Distribution', fontsize=18, weight='bold', pad=15) + plt.xlabel('Probability (%)', fontsize=14) + plt.xlim(0, pred_percent.max() + 10) + plt.grid(axis='x', linestyle='--', alpha=0.6) + + for bar, p in zip(bars, pred_percent): + plt.text(p + 1, bar.get_y() + bar.get_height() / 2, + f'{p:.1f}%', va='center', fontsize=12, weight='bold', color='#333') + + plt.yticks(fontsize=13, weight='bold') + plt.tight_layout() + + # 이미지 파일로 저장 + plt.savefig('emotion_distribution.png', dpi=300, bbox_inches='tight') + plt.show() + + +if __name__ == "__main__": + predict() diff --git a/app/ML/speech_to_text.py b/app/ML/speech_to_text.py new file mode 100644 index 0000000..8cd19c5 --- /dev/null +++ b/app/ML/speech_to_text.py @@ -0,0 +1,26 @@ +import speech_recognition as sr + +# sample_wav_path = sample_path + "/sh_sadness_2.wav" + + +# STT 변환 함수 +def speech_to_text(audio_path): + recognizer = sr.Recognizer() + + # 음성 파일 로드 + with sr.AudioFile(audio_path) as source: + audio_data = recognizer.record(source) # 음성 데이터 읽기 + + try: + # 구글 STT API 사용 (무료) + text = recognizer.recognize_google(audio_data, language="ko-KR") + return text + except sr.UnknownValueError: + return "음성을 인식할 수 없습니다." 
+ except sr.RequestError: + return "STT 요청 실패" + +# +# # MP3에서 변환한 WAV 파일 입력 +# sample_text = speech_to_text(sample_wav_path) +# print("변환된 텍스트:", sample_text) diff --git a/app/controller/RecordController.py b/app/controller/RecordController.py index 25ca700..fef983c 100644 --- a/app/controller/RecordController.py +++ b/app/controller/RecordController.py @@ -3,16 +3,25 @@ import subprocess from typing import List +import numpy as np import requests from boto3 import client from fastapi import APIRouter, Request, UploadFile, File, Form +from sentence_transformers import SentenceTransformer +from app.ML.audio_extractor_utils import get_features +from app.ML.loss import boundary_enhanced_focal_loss +from app.ML.plot_utils import save_plot, get_s3_png_url +from app.ML.speech_to_text import speech_to_text from app.dto.ScheduleSpeakRequestDto import ScheduleSpeakRequestDto from app.dto.ScheduleTTSRequestDto import ScheduleTTSRequestDto -from app.service.elevenLabs import add_voice, text_to_speech_file_save_AWS, text_to_speech_file -from app.service.gpt import ChatgptAPI -from app.service.s3Service import upload_to_s3, download_from_s3, save_local_file +from app.service.elevenLabs import text_to_speech_file_save_AWS, text_to_speech_file +from app.service.gpt import ChatgptAPI, EmotionReportGPT +from app.service.s3Service import download_from_s3, save_local_file from app.utils import play_file +from tensorflow.keras.models import load_model + +from app.utils.convertFileExtension import convert_to_wav router = APIRouter( prefix="/api/fastapi", @@ -31,6 +40,17 @@ region_name="ap-northeast-2", ) +os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' + +# app = FastAPI() + +BASE_DIR_win = os.getcwd() + "/app/emotion_diary" +model_path_win = os.getcwd() + "/app/ML/ko-sbert_multimodal_0501_3_resnet_augment_h.h5" +emotion_labels = ['angry', 'sadness', 'happiness', 'fear', 'disgust', 'surprise', 'neutral'] + +embedding_model = SentenceTransformer('jhgan/ko-sbert-multitask') +model = load_model(model_path_win, custom_objects={'boundary_enhanced_focal_loss': boundary_enhanced_focal_loss}) + async def save_local_files(files: List[UploadFile]) -> list: """업로드된 파일을 로컬에 저장하고 파일 경로를 반환합니다.""" @@ -50,22 +70,23 @@ async def save_local_files(files: List[UploadFile]) -> list: @router.post("/voices") async def getVoice(request: Request, file: UploadFile = File(...)): token = request.headers.get("Authorization").split(" ")[1] - local_file_path = await save_local_file(file) - voice_id = add_voice(name="박석진", local_file_paths=[local_file_path]) - voice_url = upload_to_s3(local_file_path) - os.remove(local_file_path) + # local_file_path = await save_local_file(file) + # voice_id = add_voice(name=name, local_file_paths=[local_file_path]) + # voice_url = s3Service.upload_to_s3(local_file_path) + # os.remove(local_file_path) + + send_user_voice_file_to_spring(token=token, voice_url=yjg_voice_id) - send_user_voice_file_to_spring(token=token, voice_url=voice_id) #yjg_voice_id -#만약 voice_id와 요구하는 분야가 오면 맞춰서 return +# 만약 voice_id와 요구하는 분야가 오면 맞춰서 return @router.post("/schedules") async def schedule_tts(request: Request, schedules: ScheduleTTSRequestDto): # token = request.headers.get("Authorization").split(" ")[1] voice_id = yjg_voice_id - prompt = ChatgptAPI(schedules.schedule_text, "엄마") + prompt = ChatgptAPI(schedules.schedule_text, schedules.alias) - schedule_dict: {"저녁": "엄마~ 저녁 잘 챙겨 먹었어?", "운동": "오늘 운동했어? 건강 챙겨~!"} + # schedule_dict: {"저녁": "엄마~ 저녁 잘 챙겨 먹었어?", "운동": "오늘 운동했어? 
건강 챙겨~!"} schedule_dict = prompt.get_schedule_json() # TTS 처리 (MP3 파일 생성 후 s3 저장) @@ -80,45 +101,73 @@ async def schedule_tts(request: Request, schedules: ScheduleTTSRequestDto): return response -# @router.post("/basic-tts") -# async def speak_schedule_tts(request: Request, basicTTSRequestDto: BasicTTSRequestDto): -# # token = request.headers.get("Authorization").split(" ")[1] -# local_file_path = download_from_s3(basicTTSRequestDto.schedule_voice_Url) -# print(f"Downloaded file path: {local_file_path}") -# -# # 블루투스 헤드셋 또는 기본 스피커로 출력 -# os.system("pactl list sinks | grep 'bluez_sink'") # 블루투스 출력 장치 확인 -# os.system("pactl set-default-sink `pactl list sinks short | grep bluez_sink | awk '{print $2}'`") # 기본 출력 변경 -# -# # 로컬 파일을 직접 재생 -# subprocess.run(["mpg321", local_file_path]) -# -# return {"message": "TTS completed and played on Bluetooth headset or speaker"} - - -# @router.post("/extra-tts") -# async def speak_schedule_tts(request: Rlocalhostquest, extraTTSRequestDto: ExtraTTSRequestDto): -# # token = request.headers.get("Authorization").split(" ")[1] -# schedule_text = extraTTSRequestDto.schedule_text -# -# #진짜 실제로 쓸 코드 -# local_file_path = text_to_speech_file(schedule_text, yjg_voice_id) -# -# # 테스트하면서 AWS에 올려놓으려고 남긴 코드 -# url = text_to_speech_file_save_AWS(schedule_text, yjg_voice_id) -# local_file_path = download_from_s3(url) -# -# # local_file_path = os.getcwd()+"/test_audio/test8.mp3" # test -# # 블루투스 헤드셋 또는 기본 스피커로 출력 -# os.system("pactl list sinks | grep 'bluez_sink'") # 블루투스 출력 장치 확인 -# os.system("pactl set-default-sink `pactl list sinks short | grep bluez_sink | awk '{print $2}'`") # 기본 출력 변경 -# -# # 로컬 파일을 직접 재생 -# subprocess.run(["/usr/bin/mpg321", local_file_path]) -# # subprocess.run(["ffplay", "-nodisp", "-autoexit", local_file_path], -# # stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # 윈도우용 -# return {"message": "TTS completed and played on Bluetooth headset or speaker"} -# +@router.post("/predict") +async def predict(request: Request, files: List[UploadFile] = File(...)): + # token = request.headers.get("Authorization").split(" ")[1] + print(files) + # 1) 임시 파일 저장 or 메모리 내 처리 + wav_data_list = [] + for file in files: + raw = await file.read() + ext = file.filename.split('.')[-1] # 'm4a', 'mp3' 등 + wav_bytes = convert_to_wav(raw, ext) # BytesIO 변환 + wav_data_list.append(wav_bytes) + + # 2) 오디오 특징 추출 + all_feats = [] + for wav_bytes in wav_data_list: + # get_features 함수가 경로 입력이면, 아래처럼 메모리 파일 처리 필요 + # 임시파일로 저장 후 경로 전달 or get_features 수정 필요 + + temp_path = f"temp_{file.filename}" + with open(temp_path, "wb") as f: + f.write(wav_bytes) + feats = get_features(temp_path) + os.remove(temp_path) + all_feats.append(feats) + + all_feats = np.stack(all_feats, axis=0) + pooled_feats = all_feats.mean(axis=0) + audio_input = pooled_feats[np.newaxis, :, np.newaxis] + + # 3) STT & 텍스트 임베딩 + texts = [] + for wav_bytes in wav_data_list: + temp_path = f"temp_stt.wav" + with open(temp_path, "wb") as f: + f.write(wav_bytes) + text = speech_to_text(temp_path) + os.remove(temp_path) + texts.append(text) + + full_text = " . 
".join(texts) + text_vec = embedding_model.encode([full_text])[0] + text_input = text_vec[np.newaxis, :] + + # 4) 예측 + prediction = model.predict([audio_input, text_input]) + pred_percent = (prediction[0] * 100).tolist() + + # 5) JSON 응답 + result = {label: round(p, 2) for label, p in zip(emotion_labels, pred_percent)} + top_idx = np.argmax(pred_percent) + result['predicted_emotion'] = emotion_labels[top_idx] + + local_path = save_plot(pred_percent) + s3_path = get_s3_png_url(local_path) + reporter = EmotionReportGPT(full_text, pred_percent) + report_text = reporter.get_report_text() + + print(s3_path) + + # send_emotion_report_to_spring(s3_path, report_text) + + data = { + "imageUrl": s3_path, + "report_text": report_text + } + return data + def send_user_voice_file_to_spring(token: str, voice_url: str): headers = { @@ -146,107 +195,17 @@ def send_user_voice_id_to_spring(token: str, voice_id: str): # requests.post("https://peachmentor.com/api/spring/records/voices", headers=headers, json=data) -def send_user_speech_file_to_spring(token: str, before_audio_link: str, answerId: int): - headers = { - "Authorization": f"Bearer {token}" - } - data = { - "beforeAudioLink": before_audio_link, - "answerId": answerId - } - requests.post("http://springboot:8080/api/spring/records/speeches", headers=headers, json=data) - # requests.post("https://peachmentor.com/api/spring/records/speeches", headers=headers, json=data) - - -def receive_self_feedback(token: str) -> str: +def send_emotion_report_to_spring(image_url: str, analysis_text): headers = { - "Authorization": f"Bearer {token}" - } - response = requests.get("http://springboot:8080/api/spring/self-feedbacks/latest-feedbacks", headers=headers) - # response = requests.get("https://peachmentor.com/api/spring/self-feedbacks/latest-feedbacks", headers=headers) - - feedback_data = response.json().get('result', {}) - self_feedback = feedback_data.get('feedback') - - if self_feedback is None: - return "없음" - return self_feedback - - -def send_statistics_to_spring(token: str, gantourCount: int, silentTime: float, answerId: int): - headers = { - "Authorization": f"Bearer {token}" + # "Authorization": f"Bearer {token}", + "Content-Type": "application/json" } data = { - "gantourCount": gantourCount, - "silentTime": silentTime, - "answerId": answerId + "imageUrl": image_url, + "report_text": analysis_text } - requests.post("http://springboot:8080/api/spring/statistics", headers=headers, json=data) - # requests.post("https://peachmentor.com/api/spring/statistics", headers=headers, json=data) - -# # 질문 답변에 대한 insight 제공 api -# @router.post("/insights") -# async def getRecord(request: Request, answerId: int = Form(...), question: str = Form(...), -# file: UploadFile = File(...)): -# token = request.headers.get("Authorization").split(" ")[1] -# -# local_file_path = await s3Service.save_local_file(file) -# before_audio_link = s3Service.upload_to_s3(local_file_path) -# -# send_user_speech_file_to_spring(token=token, before_audio_link=before_audio_link, answerId=answerId) -# -# insightGpt = InsightAssistant(question) -# insight = insightGpt.get_insight() -# -# os.remove(local_file_path) -# return {"insight": insight} - - -# 피드백 후 데이터 전송 api -# @router.post("/feedbacks") -# async def getFeedback(request: Request, feedbackRequestDto: FeedbackRequestDto): -# token = request.headers.get("Authorization").split(" ")[1] # todo: 토큰 에러처리 좀 (밑에도) -# -# filtered_past_audio_links = [link for link in feedbackRequestDto.pastAudioLinks if -# link != 
feedbackRequestDto.beforeAudioLink] -# links = [feedbackRequestDto.beforeAudioLink, feedbackRequestDto.voiceUrl] + filtered_past_audio_links -# file_paths = download_from_s3_links(links) -# -# voice_id = add_voice(name=feedbackRequestDto.name, local_file_paths=file_paths) -# -# transcribe_token = speechToTextWithApi.get_token() -# t_id = speechToTextWithApi.get_transcribe_id(transcribe_token, beforeAudioLink=feedbackRequestDto.beforeAudioLink) -# -# time.sleep(0.5) # 첫 요청시 바로 하면 404 뜰수도 있다고 함 -# first_script, silence_time = speechToTextWithApi.start_stt(transcribe_token, t_id) -# -# before_script_gpt = FeedbackAssistantUseBeforeScript(first_script) -# before_script = before_script_gpt.get_feedback() -# -# filler_count = speechToTextWithApi.get_filler_count(before_script[0]) -# -# feedbackGpt = FeedbackAssistant(first_script, filler_count, silence_time) -# feedback = feedbackGpt.get_feedback() -# -# after_audio_link = text_to_speech_file(text=feedback[0], voice_id=voice_id) -# -# send_statistics_to_spring(token=token, gantourCount=filler_count, silentTime=silence_time, -# answerId=feedbackRequestDto.answerId) -# -# for file_path in file_paths: -# os.remove(file_path) -# -# return {"beforeScript": before_script[0], -# "afterScript": feedback[0], -# "afterAudioLink": after_audio_link, -# "feedbackText": "\n".join(feedback[1:])} - - -# @router.post("/analyses") -# def getUserSpeechHabit(request: Request, analysisRequestDto: AnalysisRequestDto): -# token = request.headers.get("Authorization").split(" ")[1] -# analysis_gpt = AnalysisAssistant(questions=analysisRequestDto.questions, beforeScripts=analysisRequestDto.beforeScripts) -# analysis = analysis_gpt.get_analysis() -# -# return {"analysisText": analysis} # 데이터를 JSON 객체로 감쌈 \ No newline at end of file + requests.post( + "http://springboot:8080/api/spring/report", + headers=headers, + json=data + ) diff --git a/app/emotion_diary/jg_sadness_1.m4a b/app/emotion_diary/jg_sadness_1.m4a new file mode 100644 index 0000000..7a0015c Binary files /dev/null and b/app/emotion_diary/jg_sadness_1.m4a differ diff --git a/app/emotion_diary/jg_sadness_2.m4a b/app/emotion_diary/jg_sadness_2.m4a new file mode 100644 index 0000000..9375565 Binary files /dev/null and b/app/emotion_diary/jg_sadness_2.m4a differ diff --git a/app/emotion_diary/jg_sadness_3.m4a b/app/emotion_diary/jg_sadness_3.m4a new file mode 100644 index 0000000..ee1a08a Binary files /dev/null and b/app/emotion_diary/jg_sadness_3.m4a differ diff --git a/app/emotion_diary/jg_sadness_4.m4a b/app/emotion_diary/jg_sadness_4.m4a new file mode 100644 index 0000000..26b0d4d Binary files /dev/null and b/app/emotion_diary/jg_sadness_4.m4a differ diff --git a/app/emotion_diary/jg_sadness_5.m4a b/app/emotion_diary/jg_sadness_5.m4a new file mode 100644 index 0000000..235860c Binary files /dev/null and b/app/emotion_diary/jg_sadness_5.m4a differ diff --git a/app/service/gpt.py b/app/service/gpt.py index 6d027c1..eb06023 100644 --- a/app/service/gpt.py +++ b/app/service/gpt.py @@ -51,6 +51,7 @@ def get_schedule_json(self): return schedule_dict + class GenerateQuestionGPT: def __init__(self, text, alias): self.text = text @@ -92,4 +93,47 @@ def get_schedule_json(self): content = response.choices[0].message.content schedule_dict = parsing_json.extract_json_from_content(content) - return schedule_dict \ No newline at end of file + return schedule_dict + + +class EmotionReportGPT: + def __init__(self, text, percent_list): + self.text = text + self.percent_list = percent_list + + def create_report_prompt(self): + 
system_message = f""" + 너는 지금부터 감정을 분석 하는 심리 상담사야. + + 네 역할은 텍스트와 수치를 보고, 해당 발화의 인물이 하루 동안 어떤 감정 상태를 가졌는지 체크해주는 거야. + 텍스트는 다음과 같아: {str(self.text)} + 수치는 다음과 같아 : {self.percent_list} + + 너의 목표는 두 가지야: + 1. 텍스트와 수치를 보고 발화의 인물의 하루 감정을 종합적으로 분석해줘. + 1-a) 분석을 할 때는 텍스트나 문맥에서 근거를 들어서 논리적으로 서술해줘. + 1-b) 분석 말투는 보호자에게 피보호자의 상태를 설명하는 존댓말 말투로 해줘. + 1-c) '발화자'를 지칭하는 말은 '피보호자'로 해줘 + 2. 분석 문장은 4-5 줄이어야 해. + + 결과는 꼭 큰따옴표(")만 사용해서 str로 반환해줘. 만약 여러 문장이라면 "\n"를 문장 끝에 넣어줘. + + """ + + messages = [ + {"role": "system", "content": system_message} + ] + return messages + + def get_report_text(self): + prompt = self.create_report_prompt() + response = client.chat.completions.create( + model="gpt-4-turbo", + messages=prompt, + temperature=0.5, + max_tokens=2048 + ) + + content = response.choices[0].message.content + + return content diff --git a/app/service/interaction.py b/app/service/interaction.py index 4518264..fef5577 100644 --- a/app/service/interaction.py +++ b/app/service/interaction.py @@ -2,43 +2,30 @@ import subprocess from datetime import datetime -import pyaudio -import numpy as np from faster_whisper import WhisperModel from openai import OpenAI -from elevenLabs import text_to_speech_file from elevenlabs import ElevenLabs from dotenv import load_dotenv -# 아래 두 함수는 record_respberry.py 에 구현된 그대로 사용합니다. -# emotion_record(index) → "{prefix}{index}.wav" 파일을 만들어 리턴 -# is_silent(data) → 음성 청크가 침묵인지 여부 판단 -from record_respberry import emotion_record, is_silent +from app.service.elevenLabs import text_to_speech_file +# 녹음 함수 (arecord 사용) - 수정된 record_respberry.py 참고 +from record_respberry import emotion_record # ==== 공통 설정 ==== load_dotenv() OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") -ELEVENLABS_KEY = os.getenv("ELEVENLABS_KEY") +ELEVENLABS_KEY = os.getenv("ELEVENLABS_KEY") if not OPENAI_API_KEY or not ELEVENLABS_KEY: raise RuntimeError(".env 에 OPENAI_API_KEY/ELEVENLABS_KEY 를 설정하세요") # OpenAI / ElevenLabs 클라이언트 -gpt_client = OpenAI(api_key=OPENAI_API_KEY) -tts_client = ElevenLabs(api_key=ELEVENLABS_KEY) +gpt_client = OpenAI(api_key=OPENAI_API_KEY) +tts_client = ElevenLabs(api_key=ELEVENLABS_KEY) # Whisper 모델 (tiny, CPU, int8) whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8") -# 녹음 파라미터 (ALSA default=USBMIC 으로 잡힌 상태) -FORMAT = pyaudio.paInt16 -CHANNELS = 1 -RATE = 44100 -CHUNK = RATE * 3 # 3초 단위 버퍼 - -# 오늘 날짜 기반 녹음 파일 저장 경로 prefix -today_str = datetime.now().strftime("%Y%m%d") -WAVE_OUTPUT_PREFIX = f"/home/team4/Desktop/capstone/AI/app/emotion_diary/{today_str}_" def interaction(alias: str): """ @@ -104,8 +91,8 @@ def interaction(alias: str): print("=== interaction 종료 ===") + if __name__ == "__main__": # 스크립트를 직접 실행할 때만 동작 # alias를 원하는 이름으로 바꿔주세요 interaction("홍길동") - diff --git a/app/service/main.py b/app/service/main.py index b186d8f..18296c5 100644 --- a/app/service/main.py +++ b/app/service/main.py @@ -7,6 +7,10 @@ from contextlib import asynccontextmanager from app.controller.RecordController import router + +# from app.controller.RecordController import router + + # from app.service.subscribe import subscribe_schedule # @asynccontextmanager @@ -86,9 +90,3 @@ def custom_openapi(): app.include_router(router) -# before_script, statistics_filler_json, statistics_silence_json = startSTT( -# "https://peachmentor-bucket.s3.ap-northeast-2.amazonaws.com/record/%E1%84%82%E1%85%A9%E1%86%A8%E1%84%8B%E1%85%B3%E1%86%B7.m4a") -# self_feedback = "그래도 주어진 시간동안 말을 이어나가긴 했는데 말을 자연스럽게 연결하지 못한 것 같아" -# feedbackAss = FeedbackAssistant(before_script, statistics_filler_json, statistics_silence_json) -# 
feedback = feedbackAss.get_feedback() -# print(feedback) diff --git a/app/service/predict_resp.py b/app/service/predict_resp.py new file mode 100644 index 0000000..5d93ac6 --- /dev/null +++ b/app/service/predict_resp.py @@ -0,0 +1,35 @@ +import requests +import glob +import os +import mimetypes + + +def predict(): + ip = "192.168.1.243" + # FastAPI 라우터 경로에 맞춘 URL + url = f"http://{ip}:8000/api/fastapi/predict" + + # 전송할 오디오 파일 경로 (wav, m4a, mp3 등 모두 포함) + BASE_DIR = "/home/team4/Desktop/capstone/AI/app/emotion_diary" + audio_paths = glob.glob(os.path.join(BASE_DIR, "**", "*.*"), recursive=True) + + files = [] + for path in audio_paths: + filename = os.path.basename(path) + # 확장자에 맞는 MIME 타입 추출 (fallback: application/octet-stream) + content_type = mimetypes.guess_type(path)[0] or "application/octet-stream" + files.append( + ("files", (filename, open(path, "rb"), content_type)) + ) + + response = requests.post(url, files=files) + if response.status_code == 200: + print("감정 예측 결과:") + for label, score in response.json().items(): + print(f"{label}: {score}") + else: + print(f"Error: {response.status_code} - {response.text}") + + +if __name__ == "__main__": + predict() diff --git a/app/service/record_respberry.py b/app/service/record_respberry.py index 3bb9206..3e0460f 100644 --- a/app/service/record_respberry.py +++ b/app/service/record_respberry.py @@ -1,32 +1,34 @@ import os import wave from datetime import datetime - import numpy as np -import pyaudio +import sounddevice as sd +from scipy.io.wavfile import write # === 녹음 설정 === -FORMAT = pyaudio.paInt16 CHANNELS = 1 RATE = 44100 -CHUNK = 4096 # 약 0.093초 분량 (4096/44100) -SILENCE_LIMIT = 5 # 5초 연속 침묵이면 녹음 종료 +CHUNK_DURATION = 0.1 # 초 단위, 약 100ms +CHUNK = int(RATE * CHUNK_DURATION) +SILENCE_LIMIT = 5 # 5초 연속 침묵이면 녹음 종료 +THRESHOLD = 1000.0 # 침묵 판별 기준 (RMS) + BASE_DIR = "/home/team4/Desktop/capstone/AI/app/emotion_diary" + # 날짜 기반 하위 디렉터리(매일 한 번만 생성) def _ensure_dir(): os.makedirs(BASE_DIR, exist_ok=True) -def is_silent(data: bytes, threshold: float = 1000.0) -> bool: + +def is_silent(data: np.ndarray, threshold: float = THRESHOLD) -> bool: """ - 한 프레임(CHUNK) 크기의 raw PCM data를 받아 - RMS 기준으로 침묵 여부를 판단. 
+ float32 numpy 배열을 받아 RMS 기준으로 침묵 여부를 판단 """ - audio_data = np.frombuffer(data, dtype=np.int16) - rms = np.sqrt(np.mean(audio_data.astype(np.float32) ** 2)) - # print(f"RMS={rms:.1f}") # 필요 시 디버그용 + rms = np.sqrt(np.mean(data ** 2)) return rms < threshold + def emotion_record(index: int) -> str: """ index: 녹음 파일 구분을 위한 정수 인덱스 @@ -37,49 +39,34 @@ def emotion_record(index: int) -> str: filename = f"{date_str}_{index}.wav" filepath = os.path.join(BASE_DIR, filename) - pa = pyaudio.PyAudio() - # input_device_index 를 지정하지 않으면 ALSA default (=USBMIC) 사용 - stream = pa.open( - format=FORMAT, - channels=CHANNELS, - rate=RATE, - input=True, - frames_per_buffer=CHUNK - ) - print(f"[녹음 시작] {filename}") + frames = [] silent_secs = 0.0 try: - while True: - data = stream.read(CHUNK, exception_on_overflow=False) - frames.append(data) + with sd.InputStream(samplerate=RATE, channels=CHANNELS, dtype='float32') as stream: + while True: + data, _ = stream.read(CHUNK) + audio_chunk = data[:, 0] # mono + frames.append(audio_chunk.copy()) - if is_silent(data): - silent_secs += CHUNK / RATE - else: - silent_secs = 0.0 + if is_silent(audio_chunk): + silent_secs += CHUNK_DURATION + else: + silent_secs = 0.0 - if silent_secs >= SILENCE_LIMIT: - print(f"[침묵 {SILENCE_LIMIT}초 감지 → 녹음 종료]") - break + if silent_secs >= SILENCE_LIMIT: + print(f"[침묵 {SILENCE_LIMIT}초 감지 → 녹음 종료]") + break except Exception as e: print("녹음 중 예외:", e) - finally: - stream.stop_stream() - stream.close() - pa.terminate() - - # WAV 파일로 저장 - wf = wave.open(filepath, 'wb') - wf.setnchannels(CHANNELS) - wf.setsampwidth(pa.get_sample_size(FORMAT)) - wf.setframerate(RATE) - wf.writeframes(b''.join(frames)) - wf.close() + # float32 → int16 변환 후 저장 + all_audio = np.concatenate(frames) + int_audio = np.int16(np.clip(all_audio * 32767, -32768, 32767)) + + write(filepath, RATE, int_audio) print(f"[저장 완료] {filepath}\n") return filepath - diff --git a/app/service/s3Service.py b/app/service/s3Service.py index b992d24..aa2ca35 100644 --- a/app/service/s3Service.py +++ b/app/service/s3Service.py @@ -1,6 +1,7 @@ import os import time import uuid +from datetime import datetime from typing import List import requests @@ -62,6 +63,34 @@ def upload_to_s3(local_file_path: str) -> str: print(f"Another error => {e}") +def upload_to_s3_png(local_file_path: str) -> str: + """로컬 파일을 S3에 업로드하고 S3 URL을 반환합니다.""" + try: + if not os.path.isfile(local_file_path): + print(f"Local file does not exist: {local_file_path}") + return None + + date_str = datetime.now().strftime("%Y%m%d") + filename = f"{date_str}" + + timestamp = int(time.time()) + unique_id = str(uuid.uuid4()) + s3_file_name = f"image/{filename}_{timestamp}_{unique_id}.png" + + # S3에 파일 업로드 + with open(local_file_path, "rb") as data: + s3_client.upload_fileobj(data, bucket_name, s3_file_name) + + # S3 URL 생성 + aws_file_url = f"{url_base}/{s3_file_name}" + return aws_file_url + + except ClientError as e: + print(f'Credential error => {e}') + except Exception as e: + print(f"Another error => {e}") + + # AWS S3에서 녹음 파일 다운로드 def download_from_s3(file_s3_url: str) -> str: """S3에서 파일을 다운로드하고 로컬에 저장합니다.""" diff --git a/app/utils/convertFileExtension.py b/app/utils/convertFileExtension.py index e27dd38..d58eee0 100644 --- a/app/utils/convertFileExtension.py +++ b/app/utils/convertFileExtension.py @@ -1,7 +1,10 @@ +import io import os +import tempfile from datetime import datetime from pydub import AudioSegment +from pydub.exceptions import CouldntDecodeError def merge_all_wavs_to_mp3(audio_dir="audio", 
silence_duration_ms=500): @@ -45,4 +48,51 @@ def convert_to_mp3(file_path): return output_path -0 +def convert_to_wav(raw_bytes: bytes, ext: str) -> bytes: + ext = ext.lower() + # 이미 WAV라면 바로 반환 + if ext == "wav": + return raw_bytes + + # 임시 입력 파일 생성 + with tempfile.NamedTemporaryFile(delete=False, suffix=f".{ext}") as fin: + fin.write(raw_bytes) + fin.flush() + fin_path = fin.name + + try: + # 1) format 인자 없이 자동 감지 시도 + audio = AudioSegment.from_file(fin_path) + except CouldntDecodeError: + try: + # 2) 자동 감지도 실패하면, 프로브 크기 늘려서 재시도 + audio = AudioSegment.from_file( + fin_path, + parameters=["-probesize", "50M", "-analyzeduration", "100M"] + ) + except CouldntDecodeError as e: + os.unlink(fin_path) + raise RuntimeError(f"FFmpeg 디코딩 실패({ext}): {e}") from e + + # WAV(PCM) 사양으로 맞춰주기 + audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2) + + # 메모리로 WAV 내보내기 + out = io.BytesIO() + audio.export(out, format="wav") + wav_bytes = out.getvalue() + + os.unlink(fin_path) + return wav_bytes + + # 3) 원하는 파라메터로 변환 (16kHz, mono, 16-bit) + audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2) + + # 4) 메모리로 WAV 내보내기 + out = io.BytesIO() + audio.export(out, format="wav") + wav_bytes = out.getvalue() + + # 5) 임시 입력 파일 삭제 + os.unlink(fin_path) + return wav_bytes diff --git a/main.py b/main.py index 8f05ef4..2e48eca 100644 --- a/main.py +++ b/main.py @@ -62,6 +62,7 @@ # # 종료 함수는 비활성화 (라이브러리 오류 방지) # # dhtDevice.exit() +import uvicorn # ─────────────────────────────── # 메인 함수 @@ -69,8 +70,8 @@ if __name__ == "__main__": uvicorn.run( app="app.service.main:app", - host="localhost", - # host="0.0.0.0", + # host="localhost", + host="0.0.0.0", port=8000, ) # detect_motion() # PIR 센서 테스트 시 주석 해제 diff --git a/requirements.txt b/requirements.txt index c61a0e6..376bdd9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,65 +1,164 @@ +absl-py==2.3.0 annotated-types==0.7.0 anyio==4.9.0 -av==14.3.0 +asttokens==3.0.0 +astunparse==1.6.3 +async-timeout==5.0.1 +attrs==25.3.0 +audioread==3.0.1 +av==14.4.0 +backcall==0.2.0 +beautifulsoup4==4.13.4 +bleach==6.2.0 boto3==1.37.16 botocore==1.37.16 +cachetools==5.5.2 certifi==2025.1.31 cffi==1.17.1 charset-normalizer==3.4.1 click==8.1.8 colorama==0.4.6 coloredlogs==15.0.1 +contourpy==1.3.0 ctranslate2==4.6.0 +cycler==0.12.1 +decorator==5.2.1 +defusedxml==0.7.1 distro==1.9.0 -dotenv==0.9.9 +docopt==0.6.2 elevenlabs==1.54.0 exceptiongroup==1.2.2 +executing==2.2.0 fastapi==0.115.11 faster-whisper==1.1.1 +fastjsonschema==2.21.1 filelock==3.18.0 +flatbuffers==25.2.10 +fonttools==4.58.1 fsspec==2025.3.2 +gast==0.4.0 +google-auth==2.40.2 +google-auth-oauthlib==0.4.6 +google-pasta==0.2.0 +grpcio==1.71.0 h11==0.14.0 +h5py==3.13.0 httpcore==1.0.7 httpx==0.28.1 huggingface-hub==0.31.2 humanfriendly==10.0 idna==3.10 +importlib_metadata==8.7.0 +importlib_resources==6.5.2 +ipython==8.12.3 +jedi==0.19.2 Jinja2==3.1.6 -jiter==0.9.0 +jiter==0.10.0 jmespath==1.0.1 +joblib==1.5.1 +jsonschema==4.24.0 +jsonschema-specifications==2025.4.1 +jupyter_client==8.6.3 +jupyter_core==5.8.1 +jupyterlab_pygments==0.3.0 +kiwisolver==1.4.7 +lazy_loader==0.4 +libclang==18.1.1 +librosa==0.11.0 +llvmlite==0.43.0 +Markdown==3.8 +markdown-it-py==3.0.0 MarkupSafe==3.0.2 +matplotlib==3.9.4 +matplotlib-inline==0.1.7 +mdurl==0.1.2 +mistune==3.1.3 +ml_dtypes==0.5.1 mpmath==1.3.0 +msgpack==1.1.0 +nbclient==0.10.2 +nbconvert==7.16.6 +nbformat==5.10.4 networkx==3.2.1 -numpy==2.0.2 +numba==0.60.0 +numpy==1.26.4 +oauthlib==3.2.2 onnxruntime==1.19.2 openai==1.68.2 +opt_einsum==3.4.0 
+optree==0.16.0 packaging==25.0 +pandocfilters==1.5.1 +parso==0.8.4 +pickleshare==0.7.5 +pillow==11.2.1 +pipreqs==0.5.0 +platformdirs==4.3.8 playsound==1.3.0 -protobuf==6.31.0 -# PyAudio==0.2.14 +pooch==1.8.2 +prompt_toolkit==3.0.51 +protobuf==3.19.6 +pure_eval==0.2.3 +pyasn1==0.6.1 +pyasn1_modules==0.4.2 pycparser==2.22 pydantic==2.10.6 pydantic_core==2.27.2 pydub==0.25.1 +Pygments==2.19.1 +pyparsing==3.2.3 +pyreadline3==3.5.4 python-dateutil==2.9.0.post0 python-dotenv==1.0.1 python-multipart==0.0.20 PyYAML==6.0.2 +pyzmq==26.4.0 +redis==6.2.0 +referencing==0.36.2 +regex==2024.11.6 requests==2.32.3 -# RPi.GPIO==0.7.1 +requests-oauthlib==2.0.0 +rich==14.0.0 +rpds-py==0.25.1 +rsa==4.9.1 s3transfer==0.11.4 +safetensors==0.5.3 +scikit-learn==1.6.1 scipy==1.13.1 +sentence-transformers==4.1.0 six==1.17.0 sniffio==1.3.1 sounddevice==0.5.1 soundfile==0.13.1 +soupsieve==2.7 +soxr==0.5.0.post1 +SpeechRecognition==3.14.3 +stack-data==0.6.3 starlette==0.46.1 sympy==1.14.0 +tensorboard==2.11.0 +tensorboard-data-server==0.6.1 +tensorboard-plugin-wit==1.8.1 +tensorflow-estimator==2.11.0 +tensorflow-io-gcs-filesystem==0.31.0 +termcolor==3.1.0 +threadpoolctl==3.6.0 +tinycss2==1.4.0 tokenizers==0.21.1 torch==2.7.0 torchaudio==2.7.0 +torchvision==0.22.0 +tornado==6.5.1 tqdm==4.67.1 +traitlets==5.14.3 +transformers==4.52.4 typing_extensions==4.12.2 urllib3==1.26.20 uvicorn==0.34.0 -websockets==15.0.1 \ No newline at end of file +wcwidth==0.2.13 +webencodings==0.5.1 +websockets==15.0.1 +Werkzeug==3.1.3 +wrapt==1.17.2 +yarg==0.1.9 +zipp==3.22.0 diff --git a/requirements_server.txt b/requirements_server.txt new file mode 100644 index 0000000..45b9215 --- /dev/null +++ b/requirements_server.txt @@ -0,0 +1,170 @@ +absl-py==2.3.0 +annotated-types==0.7.0 +anyio==4.9.0 +asttokens==3.0.0 +astunparse==1.6.3 +async-timeout==5.0.1 +attrs==25.3.0 +audioread==3.0.1 +av==14.4.0 +backcall==0.2.0 +beautifulsoup4==4.13.4 +bleach==6.2.0 +boto3==1.37.16 +botocore==1.37.16 +cachetools==5.5.2 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +click==8.1.8 +colorama==0.4.6 +coloredlogs==15.0.1 +contourpy==1.3.0 +ctranslate2==4.6.0 +cycler==0.12.1 +decorator==5.2.1 +defusedxml==0.7.1 +distro==1.9.0 +docopt==0.6.2 +elevenlabs==1.54.0 +exceptiongroup==1.2.2 +executing==2.2.0 +fastapi==0.115.11 +faster-whisper==1.1.1 +fastjsonschema==2.21.1 +filelock==3.18.0 +flatbuffers==25.2.10 +fonttools==4.58.1 +fsspec==2025.3.2 +gast==0.4.0 +google-auth==2.40.2 +google-auth-oauthlib==0.4.6 +google-pasta==0.2.0 +grpcio==1.71.0 +h11==0.14.0 +h5py==3.13.0 +httpcore==1.0.7 +httpx==0.28.1 +huggingface-hub==0.31.2 +humanfriendly==10.0 +idna==3.10 +importlib_metadata==8.7.0 +importlib_resources==6.5.2 +ipython==8.12.3 +jedi==0.19.2 +Jinja2==3.1.6 +jiter==0.10.0 +jmespath==1.0.1 +joblib==1.5.1 +jsonschema==4.24.0 +jsonschema-specifications==2025.4.1 +jupyter_client==8.6.3 +jupyter_core==5.8.1 +jupyterlab_pygments==0.3.0 +keras==3.10.0 +kiwisolver==1.4.7 +lazy_loader==0.4 +libclang==18.1.1 +librosa==0.11.0 +llvmlite==0.43.0 +Markdown==3.8 +markdown-it-py==3.0.0 +MarkupSafe==3.0.2 +matplotlib==3.9.4 +matplotlib-inline==0.1.7 +mdurl==0.1.2 +mistune==3.1.3 +ml_dtypes==0.5.1 +mpmath==1.3.0 +msgpack==1.1.0 +namex==0.1.0 +nbclient==0.10.2 +nbconvert==7.16.6 +nbformat==5.10.4 +networkx==3.2.1 +numba==0.60.0 +numpy==1.26.4 +oauthlib==3.2.2 +onnxruntime==1.19.2 +openai==1.68.2 +opt_einsum==3.4.0 +optree==0.16.0 +packaging==25.0 +pandocfilters==1.5.1 +parso==0.8.4 +pickleshare==0.7.5 +pillow==11.2.1 +pipreqs==0.5.0 +platformdirs==4.3.8 +playsound==1.3.0 
+pooch==1.8.2 +prompt_toolkit==3.0.51 +protobuf==5.29.5 +pure_eval==0.2.3 +pyasn1==0.6.1 +pyasn1_modules==0.4.2 +pycparser==2.22 +pydantic==2.10.6 +pydantic_core==2.27.2 +pydub==0.25.1 +Pygments==2.19.1 +pyparsing==3.2.3 +pyreadline3==3.5.4 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-multipart==0.0.20 +pywin32==310 +PyYAML==6.0.2 +pyzmq==26.4.0 +redis==6.2.0 +referencing==0.36.2 +regex==2024.11.6 +requests==2.32.3 +requests-oauthlib==2.0.0 +rich==14.0.0 +rpds-py==0.25.1 +rsa==4.9.1 +s3transfer==0.11.4 +safetensors==0.5.3 +scikit-learn==1.6.1 +scipy==1.13.1 +sentence-transformers==4.1.0 +six==1.17.0 +sniffio==1.3.1 +sounddevice==0.5.1 +soundfile==0.13.1 +soupsieve==2.7 +soxr==0.5.0.post1 +SpeechRecognition==3.14.3 +stack-data==0.6.3 +starlette==0.46.1 +sympy==1.14.0 +tensorboard==2.19.0 +tensorboard-data-server==0.7.2 +tensorboard-plugin-wit==1.8.1 +tensorflow==2.19.0 +tensorflow-estimator==2.11.0 +tensorflow-io-gcs-filesystem==0.31.0 +tensorflow_intel==2.18.0 +termcolor==3.1.0 +tf_keras==2.19.0 +threadpoolctl==3.6.0 +tinycss2==1.4.0 +tokenizers==0.21.1 +torch==2.7.0 +torchaudio==2.7.0 +torchvision==0.22.0 +tornado==6.5.1 +tqdm==4.67.1 +traitlets==5.14.3 +transformers==4.52.4 +typing_extensions==4.12.2 +urllib3==1.26.20 +uvicorn==0.34.0 +wcwidth==0.2.13 +webencodings==0.5.1 +websockets==15.0.1 +Werkzeug==3.1.3 +wrapt==1.17.2 +yarg==0.1.9 +zipp==3.22.0
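
The /predict handler added in app/controller/RecordController.py notes in a comment that get_features expects a file path, so every in-memory WAV produced by convert_to_wav is written to a temporary file and deleted again. If that round trip ever matters, a bytes-based variant is possible because librosa.load also accepts file-like objects. This is a minimal sketch and not part of the diff: get_features_from_bytes is a hypothetical name, and it only reuses the helpers the diff adds in app/ML/audio_extractor_utils.py.

import io

import librosa
import numpy as np

from app.ML.audio_extractor_utils import extract_features, noise, pitch, stretch


def get_features_from_bytes(wav_bytes: bytes) -> np.ndarray:
    # Decode the in-memory WAV directly; librosa.load accepts file-like objects.
    data, sample_rate = librosa.load(io.BytesIO(wav_bytes), duration=2.5, offset=0.0)

    # Same three passes as get_features: raw, noise-augmented, stretched + pitched.
    feats = [extract_features(data, sample_rate)]
    feats.append(extract_features(noise(data), sample_rate))
    feats.append(extract_features(pitch(stretch(data), sample_rate), sample_rate))
    return np.concatenate(feats, axis=0)  # (486,), matching get_features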
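
app/ML/plot_utils.py (and the copy in predict_colab.py) calls plt.show() and never closes the figure; inside a FastAPI worker there is normally no display, and unclosed pyplot figures accumulate across requests. Below is an illustrative headless variant of the same bar chart, not the diff's implementation: save_plot_headless is a hypothetical name, and the os.makedirs call is an added safeguard the diff itself does not perform.

import os
from datetime import datetime

import matplotlib
matplotlib.use("Agg")  # non-interactive backend: no display needed in a server process
from matplotlib import pyplot as plt

colors = ['#e74c3c', '#3498db', '#f1c40f', '#e67e22', '#9b59b6', '#1abc9c', '#95a5a6']
emotion_labels = ['angry', 'sadness', 'happiness', 'fear', 'disgust', 'surprise', 'neutral']


def save_plot_headless(predictions_percent) -> str:
    fig, ax = plt.subplots(figsize=(10, 6))
    bars = ax.barh(emotion_labels, predictions_percent, color=colors, alpha=0.85)

    ax.set_title('Emotion Probability Distribution', fontsize=20, weight='bold', pad=15)
    ax.set_xlabel('Probability (%)', fontsize=14)
    ax.set_xlim(0, max(predictions_percent) + 10)
    ax.grid(axis='x', linestyle='--', alpha=0.6)

    for bar, percent in zip(bars, predictions_percent):
        ax.text(bar.get_width() + 0.8, bar.get_y() + bar.get_height() / 2,
                f'{percent:.1f}%', va='center', fontsize=13, weight='bold', color='#333')

    out_dir = os.path.join(os.getcwd(), "app", "emotion_png")
    os.makedirs(out_dir, exist_ok=True)
    local_path = os.path.join(out_dir, f"{datetime.now():%Y%m%d}_emotion_distribution.png")
    fig.savefig(local_path, dpi=300, bbox_inches='tight')
    plt.close(fig)  # release the figure instead of plt.show()
    return local_path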