diff --git a/.gitignore b/.gitignore index f792265..d80845d 100644 --- a/.gitignore +++ b/.gitignore @@ -34,9 +34,10 @@ venv.bak/ *.idea .DS_Store -*.h5 + __pycache__/ *.pyc audio/ -emotion_diary/ +#emotion_diary/ +emotion_png/ pyvenv.cfg \ No newline at end of file diff --git a/app/ML/ModelService.py b/app/ML/ModelService.py new file mode 100644 index 0000000..a89245c --- /dev/null +++ b/app/ML/ModelService.py @@ -0,0 +1,97 @@ +# import numpy as np +# from dotenv import load_dotenv +# from fastapi import Request, UploadFile, File, APIRouter +# from typing import List +# from tensorflow.keras.models import load_model +# from sentence_transformers import SentenceTransformer +# import io +# import requests +# +# from app.ML.audio_extractor_utils import get_features +# from app.ML.loss import boundary_enhanced_focal_loss +# from app.ML.plot_utils import save_plot, get_s3_png_url +# from app.ML.speech_to_text import speech_to_text +# +# import os +# +# from app.service.gpt import EmotionReportGPT +# from app.utils.convertFileExtension import convert_to_wav +# +# router = APIRouter( +# prefix="/api/fastapi", +# ) +# load_dotenv() +# OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +# +# +# +# +# @router.post("/predict") +# async def predict(request: Request, files: List[UploadFile] = File(...)): +# # token = request.headers.get("Authorization").split(" ")[1] +# print(files) +# # 1) 임시 파일 저장 or 메모리 내 처리 +# wav_data_list = [] +# for file in files: +# raw = await file.read() +# ext = file.filename.split('.')[-1] # 'm4a', 'mp3' 등 +# wav_bytes = convert_to_wav(raw, ext) # BytesIO 변환 +# wav_data_list.append(wav_bytes) +# +# # 2) 오디오 특징 추출 +# all_feats = [] +# for wav_bytes in wav_data_list: +# # get_features 함수가 경로 입력이면, 아래처럼 메모리 파일 처리 필요 +# # 임시파일로 저장 후 경로 전달 or get_features 수정 필요 +# +# temp_path = f"temp_{file.filename}" +# with open(temp_path, "wb") as f: +# f.write(wav_bytes) +# feats = get_features(temp_path) +# os.remove(temp_path) +# all_feats.append(feats) +# +# all_feats = np.stack(all_feats, axis=0) +# pooled_feats = all_feats.mean(axis=0) +# audio_input = pooled_feats[np.newaxis, :, np.newaxis] +# +# # 3) STT & 텍스트 임베딩 +# texts = [] +# for wav_bytes in wav_data_list: +# temp_path = f"temp_stt.wav" +# with open(temp_path, "wb") as f: +# f.write(wav_bytes) +# text = speech_to_text(temp_path) +# os.remove(temp_path) +# texts.append(text) +# +# full_text = " . 
".join(texts) +# text_vec = embedding_model.encode([full_text])[0] +# text_input = text_vec[np.newaxis, :] +# +# # 4) 예측 +# prediction = model.predict([audio_input, text_input]) +# pred_percent = (prediction[0] * 100).tolist() +# +# # 5) JSON 응답 +# result = {label: round(p, 2) for label, p in zip(emotion_labels, pred_percent)} +# top_idx = np.argmax(pred_percent) +# result['predicted_emotion'] = emotion_labels[top_idx] +# +# local_path = save_plot(pred_percent) +# s3_path = get_s3_png_url(local_path) +# reporter = EmotionReportGPT(full_text, pred_percent) +# report_text = reporter.get_report_text() +# +# print(s3_path) +# +# # send_emotion_report_to_spring(s3_path, report_text) +# +# data = { +# "imageUrl": s3_path, +# "report_text": report_text +# } +# return data +# +# +# diff --git a/app/ML/audio_extractor_utils.py b/app/ML/audio_extractor_utils.py new file mode 100644 index 0000000..dcace4c --- /dev/null +++ b/app/ML/audio_extractor_utils.py @@ -0,0 +1,69 @@ +import librosa +import librosa.display +import numpy as np + + +def noise(data): + noise_amp = 0.035 * np.random.uniform() * np.amax(data) + data = data + noise_amp * np.random.normal(size=data.shape[0]) + return data + + +def stretch(data, rate=0.8): + return librosa.effects.time_stretch(y=data, rate=rate) + + +def shift(data): + shift_range = int(np.random.uniform(low=-5, high=5) * 1000) + return np.roll(data, shift_range) + + +def pitch(data, sampling_rate, pitch_factor=0.7): + return librosa.effects.pitch_shift(y=data, sr=sampling_rate, n_steps=pitch_factor) + + +def extract_features(data, sample_rate): + # ZCR + result = np.array([]) + zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0) + result = np.hstack((result, zcr)) # stacking horizontally + + # Chroma_stft + stft = np.abs(librosa.stft(data)) + chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0) + result = np.hstack((result, chroma_stft)) # stacking horizontally + + # MFCC + mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0) + result = np.hstack((result, mfcc)) # stacking horizontally + + # Root Mean Square Value + rms = np.mean(librosa.feature.rms(y=data).T, axis=0) + result = np.hstack((result, rms)) # stacking horizontally + + # MelSpectogram + mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0) + result = np.hstack((result, mel)) # stacking horizontally + + return result + + +def get_features(path): + data, sample_rate = librosa.load(path, duration=2.5, offset=0.0) + + # without augmentation + res1 = extract_features(data, sample_rate) + result = np.array(res1) + + # data with noise + noise_data = noise(data) + res2 = extract_features(noise_data, sample_rate) + result = np.concatenate((result, res2), axis=0) + + # data with stretching and pitching + new_data = stretch(data) + data_stretch_pitch = pitch(new_data, sample_rate) + res3 = extract_features(data_stretch_pitch, sample_rate) + result = np.concatenate((result, res3), axis=0) + + return result diff --git a/app/ML/ko-sbert_multimodal_0501_3_resnet_augment_h.h5 b/app/ML/ko-sbert_multimodal_0501_3_resnet_augment_h.h5 new file mode 100644 index 0000000..b4f22d5 Binary files /dev/null and b/app/ML/ko-sbert_multimodal_0501_3_resnet_augment_h.h5 differ diff --git a/app/ML/loss.py b/app/ML/loss.py new file mode 100644 index 0000000..c41199b --- /dev/null +++ b/app/ML/loss.py @@ -0,0 +1,29 @@ +import tensorflow as tf + + +# 1. 
Boundary-Enhanced Focal Loss 구현 (소수 클래스 식별 강화) +def boundary_enhanced_focal_loss(y_true, y_pred, gamma=2.0, margin=0.3): + y_pred = tf.clip_by_value(y_pred, 1e-7, 1.0 - 1e-7) + + # 하드 샘플 마이닝 (낮은 확률로 예측된 샘플 식별) + correct_prob = tf.reduce_sum(y_true * y_pred, axis=-1) + hard_mask = tf.cast(tf.less(correct_prob, margin), tf.float32) + + # 클래스별 가중치 계산 (소수 클래스에 더 높은 가중치) + effective_counts = tf.reduce_sum(y_true, axis=0) + alpha = 1.0 / (effective_counts + 1e-7) + alpha = alpha / tf.reduce_sum(alpha) + + # 소수 클래스 추가 가중치 부여 (surprise, neutral) + class_boost = tf.constant([1.0, 0.5, 1.0, 1.0, 1.0, 2.5, 5.0], dtype=tf.float32) + alpha = alpha * class_boost + + # Focal Loss 계산 + cross_entropy = -y_true * tf.math.log(y_pred) + focal_weight = tf.pow(1.0 - y_pred, gamma) + + # 하드 샘플에 추가 가중치 부여 + sample_weight = 1.0 + hard_mask * 2.0 + loss = sample_weight[:, tf.newaxis] * alpha * focal_weight * cross_entropy + + return tf.reduce_sum(loss) diff --git a/app/ML/plot_utils.py b/app/ML/plot_utils.py new file mode 100644 index 0000000..b7744d0 --- /dev/null +++ b/app/ML/plot_utils.py @@ -0,0 +1,40 @@ +# 그래프 그리기 +import os +from datetime import datetime + +from matplotlib import pyplot as plt +from app.service.s3Service import upload_to_s3_png + +colors = ['#e74c3c', '#3498db', '#f1c40f', '#e67e22', '#9b59b6', '#1abc9c', '#95a5a6'] +emotion_labels = ['angry', 'sadness', 'happiness', 'fear', 'disgust', 'surprise', 'neutral'] + + +def save_plot(predictions_percent): + plt.figure(figsize=(10, 6)) + bars = plt.barh(emotion_labels, predictions_percent, color=colors, alpha=0.85) + + plt.title('Emotion Probability Distribution', fontsize=20, weight='bold', pad=15) + plt.xlabel('Probability (%)', fontsize=14) + plt.xlim(0, max(predictions_percent) + 10) + plt.grid(axis='x', linestyle='--', alpha=0.6) + + for bar, percent in zip(bars, predictions_percent): + width = bar.get_width() + plt.text(width + 0.8, bar.get_y() + bar.get_height() / 2, f'{percent:.1f}%', va='center', fontsize=13, + weight='bold', color='#333') + + plt.yticks(fontsize=14, weight='bold') + plt.tight_layout() + + date_str = datetime.now().strftime("%Y%m%d") + filename = f"{date_str}" + local_path = os.getcwd() + f"/app/emotion_png/{filename}_emotion_distribution.png" + # 이미지 파일로 저장 + plt.savefig(local_path, dpi=300, bbox_inches='tight') + plt.show() + + return local_path + + +def get_s3_png_url(local_path): + return upload_to_s3_png(local_path) diff --git a/app/ML/predict_colab.py b/app/ML/predict_colab.py new file mode 100644 index 0000000..a95d54a --- /dev/null +++ b/app/ML/predict_colab.py @@ -0,0 +1,95 @@ +import numpy as np +import matplotlib.pyplot as plt +from tensorflow.keras.models import load_model +from sentence_transformers import SentenceTransformer +import glob +import os + +from app.ML.audio_extractor_utils import get_features +from app.ML.loss import boundary_enhanced_focal_loss +from app.ML.speech_to_text import speech_to_text +import os +os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' + + +BASE_DIR_resp = "/home/team4/Desktop/capstone/AI/app/emotion_diary" +BASE_DIR_win = "C:/Users/YJG/Desktop/2025_1_capstone_2/AI/app/emotion_diary" +emotion_labels = ['angry', 'sadness', 'happiness', 'fear', 'disgust', 'surprise', 'neutral'] +model_path_resp = "/home/team4/Desktop/capstone/AI/app/ML/ko-sbert_multimodal_0501_3_resnet_augment_h.h5" +model_path_win = "C:/Users/YJG/Desktop/2025_1_capstone_2/AI/app/ML/ko-sbert_multimodal_0501_3_resnet_augment_h.h5" + + +def predict(): + BASE_DIR = BASE_DIR_win + model_path = model_path_win + # (가정) 미리 정의된 
함수/변수 + # get_features(path): (486,) 벡터 반환 + # speech_to_text(path): STT → 문자열 반환 + # boundary_enhanced_focal_loss: 커스텀 손실 + # emotion_labels: ['angry','sadness','happiness','fear','disgust','surprise','neutral'] + # model_path, sample_path: 경로 문자열 + + # 1) WAV 파일 리스트 + # sample_wav_list = [ + # sample_path + "/jg_sadness_1.wav", + # sample_path + "/jg_sadness_2.wav", + # sample_path + "/jg_sadness_3.wav", + # sample_path + "/jg_sadness_4.wav", + # sample_path + "/jg_sadness_5.wav" + # ] + sample_wav_list = glob.glob(os.path.join(BASE_DIR, "**", "*.wav"), recursive=True) + + # 2) 오디오 특징 평균 풀링 + all_feats = np.stack([get_features(p) for p in sample_wav_list], axis=0) # (5,486) + pooled_feats = all_feats.mean(axis=0) # (486,) + + # 3) 모델 입력 형태 맞추기 + audio_input = pooled_feats[np.newaxis, :, np.newaxis] # (1,486,1) + + # 4) 전체 텍스트 STT → 하나의 문장으로 결합 + texts = [speech_to_text(p) for p in sample_wav_list] + full_text = " . ".join(texts) + + # 5) 텍스트 임베딩 + embedding_model = SentenceTransformer('jhgan/ko-sbert-multitask') + text_vec = embedding_model.encode([full_text])[0] # (768,) + text_input = text_vec[np.newaxis, :] # (1,768) + + # 6) 모델 로드 및 예측 + model = load_model(model_path, custom_objects={ + 'boundary_enhanced_focal_loss': boundary_enhanced_focal_loss + }) + prediction = model.predict([audio_input, text_input]) # (1,7) + pred_percent = prediction[0] * 100 # (7,) + + # 7) 콘솔에 출력 + for lbl, p in zip(emotion_labels, pred_percent): + print(f"{lbl}: {p:.2f}%") + top_idx = np.argmax(pred_percent) + print(f"\n최종 예측 감정: {emotion_labels[top_idx]}") + + # 8) 가로 막대그래프 시각화 + colors = ['#e74c3c', '#3498db', '#f1c40f', '#e67e22', '#9b59b6', '#1abc9c', '#95a5a6'] + + plt.figure(figsize=(10, 6)) + bars = plt.barh(emotion_labels, pred_percent, color=colors, alpha=0.85) + + plt.title('Emotion Probability Distribution', fontsize=18, weight='bold', pad=15) + plt.xlabel('Probability (%)', fontsize=14) + plt.xlim(0, pred_percent.max() + 10) + plt.grid(axis='x', linestyle='--', alpha=0.6) + + for bar, p in zip(bars, pred_percent): + plt.text(p + 1, bar.get_y() + bar.get_height() / 2, + f'{p:.1f}%', va='center', fontsize=12, weight='bold', color='#333') + + plt.yticks(fontsize=13, weight='bold') + plt.tight_layout() + + # 이미지 파일로 저장 + plt.savefig('emotion_distribution.png', dpi=300, bbox_inches='tight') + plt.show() + + +if __name__ == "__main__": + predict() diff --git a/app/ML/speech_to_text.py b/app/ML/speech_to_text.py new file mode 100644 index 0000000..8cd19c5 --- /dev/null +++ b/app/ML/speech_to_text.py @@ -0,0 +1,26 @@ +import speech_recognition as sr + +# sample_wav_path = sample_path + "/sh_sadness_2.wav" + + +# STT 변환 함수 +def speech_to_text(audio_path): + recognizer = sr.Recognizer() + + # 음성 파일 로드 + with sr.AudioFile(audio_path) as source: + audio_data = recognizer.record(source) # 음성 데이터 읽기 + + try: + # 구글 STT API 사용 (무료) + text = recognizer.recognize_google(audio_data, language="ko-KR") + return text + except sr.UnknownValueError: + return "음성을 인식할 수 없습니다." 
+ except sr.RequestError: + return "STT 요청 실패" + +# +# # MP3에서 변환한 WAV 파일 입력 +# sample_text = speech_to_text(sample_wav_path) +# print("변환된 텍스트:", sample_text) diff --git a/app/controller/RecordController.py b/app/controller/RecordController.py index 25ca700..fef983c 100644 --- a/app/controller/RecordController.py +++ b/app/controller/RecordController.py @@ -3,16 +3,25 @@ import subprocess from typing import List +import numpy as np import requests from boto3 import client from fastapi import APIRouter, Request, UploadFile, File, Form +from sentence_transformers import SentenceTransformer +from app.ML.audio_extractor_utils import get_features +from app.ML.loss import boundary_enhanced_focal_loss +from app.ML.plot_utils import save_plot, get_s3_png_url +from app.ML.speech_to_text import speech_to_text from app.dto.ScheduleSpeakRequestDto import ScheduleSpeakRequestDto from app.dto.ScheduleTTSRequestDto import ScheduleTTSRequestDto -from app.service.elevenLabs import add_voice, text_to_speech_file_save_AWS, text_to_speech_file -from app.service.gpt import ChatgptAPI -from app.service.s3Service import upload_to_s3, download_from_s3, save_local_file +from app.service.elevenLabs import text_to_speech_file_save_AWS, text_to_speech_file +from app.service.gpt import ChatgptAPI, EmotionReportGPT +from app.service.s3Service import download_from_s3, save_local_file from app.utils import play_file +from tensorflow.keras.models import load_model + +from app.utils.convertFileExtension import convert_to_wav router = APIRouter( prefix="/api/fastapi", @@ -31,6 +40,17 @@ region_name="ap-northeast-2", ) +os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' + +# app = FastAPI() + +BASE_DIR_win = os.getcwd() + "/app/emotion_diary" +model_path_win = os.getcwd() + "/app/ML/ko-sbert_multimodal_0501_3_resnet_augment_h.h5" +emotion_labels = ['angry', 'sadness', 'happiness', 'fear', 'disgust', 'surprise', 'neutral'] + +embedding_model = SentenceTransformer('jhgan/ko-sbert-multitask') +model = load_model(model_path_win, custom_objects={'boundary_enhanced_focal_loss': boundary_enhanced_focal_loss}) + async def save_local_files(files: List[UploadFile]) -> list: """업로드된 파일을 로컬에 저장하고 파일 경로를 반환합니다.""" @@ -50,22 +70,23 @@ async def save_local_files(files: List[UploadFile]) -> list: @router.post("/voices") async def getVoice(request: Request, file: UploadFile = File(...)): token = request.headers.get("Authorization").split(" ")[1] - local_file_path = await save_local_file(file) - voice_id = add_voice(name="박석진", local_file_paths=[local_file_path]) - voice_url = upload_to_s3(local_file_path) - os.remove(local_file_path) + # local_file_path = await save_local_file(file) + # voice_id = add_voice(name=name, local_file_paths=[local_file_path]) + # voice_url = s3Service.upload_to_s3(local_file_path) + # os.remove(local_file_path) + + send_user_voice_file_to_spring(token=token, voice_url=yjg_voice_id) - send_user_voice_file_to_spring(token=token, voice_url=voice_id) #yjg_voice_id -#만약 voice_id와 요구하는 분야가 오면 맞춰서 return +# 만약 voice_id와 요구하는 분야가 오면 맞춰서 return @router.post("/schedules") async def schedule_tts(request: Request, schedules: ScheduleTTSRequestDto): # token = request.headers.get("Authorization").split(" ")[1] voice_id = yjg_voice_id - prompt = ChatgptAPI(schedules.schedule_text, "엄마") + prompt = ChatgptAPI(schedules.schedule_text, schedules.alias) - schedule_dict: {"저녁": "엄마~ 저녁 잘 챙겨 먹었어?", "운동": "오늘 운동했어? 건강 챙겨~!"} + # schedule_dict: {"저녁": "엄마~ 저녁 잘 챙겨 먹었어?", "운동": "오늘 운동했어? 
건강 챙겨~!"} schedule_dict = prompt.get_schedule_json() # TTS 처리 (MP3 파일 생성 후 s3 저장) @@ -80,45 +101,73 @@ async def schedule_tts(request: Request, schedules: ScheduleTTSRequestDto): return response -# @router.post("/basic-tts") -# async def speak_schedule_tts(request: Request, basicTTSRequestDto: BasicTTSRequestDto): -# # token = request.headers.get("Authorization").split(" ")[1] -# local_file_path = download_from_s3(basicTTSRequestDto.schedule_voice_Url) -# print(f"Downloaded file path: {local_file_path}") -# -# # 블루투스 헤드셋 또는 기본 스피커로 출력 -# os.system("pactl list sinks | grep 'bluez_sink'") # 블루투스 출력 장치 확인 -# os.system("pactl set-default-sink `pactl list sinks short | grep bluez_sink | awk '{print $2}'`") # 기본 출력 변경 -# -# # 로컬 파일을 직접 재생 -# subprocess.run(["mpg321", local_file_path]) -# -# return {"message": "TTS completed and played on Bluetooth headset or speaker"} - - -# @router.post("/extra-tts") -# async def speak_schedule_tts(request: Rlocalhostquest, extraTTSRequestDto: ExtraTTSRequestDto): -# # token = request.headers.get("Authorization").split(" ")[1] -# schedule_text = extraTTSRequestDto.schedule_text -# -# #진짜 실제로 쓸 코드 -# local_file_path = text_to_speech_file(schedule_text, yjg_voice_id) -# -# # 테스트하면서 AWS에 올려놓으려고 남긴 코드 -# url = text_to_speech_file_save_AWS(schedule_text, yjg_voice_id) -# local_file_path = download_from_s3(url) -# -# # local_file_path = os.getcwd()+"/test_audio/test8.mp3" # test -# # 블루투스 헤드셋 또는 기본 스피커로 출력 -# os.system("pactl list sinks | grep 'bluez_sink'") # 블루투스 출력 장치 확인 -# os.system("pactl set-default-sink `pactl list sinks short | grep bluez_sink | awk '{print $2}'`") # 기본 출력 변경 -# -# # 로컬 파일을 직접 재생 -# subprocess.run(["/usr/bin/mpg321", local_file_path]) -# # subprocess.run(["ffplay", "-nodisp", "-autoexit", local_file_path], -# # stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # 윈도우용 -# return {"message": "TTS completed and played on Bluetooth headset or speaker"} -# +@router.post("/predict") +async def predict(request: Request, files: List[UploadFile] = File(...)): + # token = request.headers.get("Authorization").split(" ")[1] + print(files) + # 1) 임시 파일 저장 or 메모리 내 처리 + wav_data_list = [] + for file in files: + raw = await file.read() + ext = file.filename.split('.')[-1] # 'm4a', 'mp3' 등 + wav_bytes = convert_to_wav(raw, ext) # BytesIO 변환 + wav_data_list.append(wav_bytes) + + # 2) 오디오 특징 추출 + all_feats = [] + for wav_bytes in wav_data_list: + # get_features 함수가 경로 입력이면, 아래처럼 메모리 파일 처리 필요 + # 임시파일로 저장 후 경로 전달 or get_features 수정 필요 + + temp_path = f"temp_{file.filename}" + with open(temp_path, "wb") as f: + f.write(wav_bytes) + feats = get_features(temp_path) + os.remove(temp_path) + all_feats.append(feats) + + all_feats = np.stack(all_feats, axis=0) + pooled_feats = all_feats.mean(axis=0) + audio_input = pooled_feats[np.newaxis, :, np.newaxis] + + # 3) STT & 텍스트 임베딩 + texts = [] + for wav_bytes in wav_data_list: + temp_path = f"temp_stt.wav" + with open(temp_path, "wb") as f: + f.write(wav_bytes) + text = speech_to_text(temp_path) + os.remove(temp_path) + texts.append(text) + + full_text = " . 
".join(texts) + text_vec = embedding_model.encode([full_text])[0] + text_input = text_vec[np.newaxis, :] + + # 4) 예측 + prediction = model.predict([audio_input, text_input]) + pred_percent = (prediction[0] * 100).tolist() + + # 5) JSON 응답 + result = {label: round(p, 2) for label, p in zip(emotion_labels, pred_percent)} + top_idx = np.argmax(pred_percent) + result['predicted_emotion'] = emotion_labels[top_idx] + + local_path = save_plot(pred_percent) + s3_path = get_s3_png_url(local_path) + reporter = EmotionReportGPT(full_text, pred_percent) + report_text = reporter.get_report_text() + + print(s3_path) + + # send_emotion_report_to_spring(s3_path, report_text) + + data = { + "imageUrl": s3_path, + "report_text": report_text + } + return data + def send_user_voice_file_to_spring(token: str, voice_url: str): headers = { @@ -146,107 +195,17 @@ def send_user_voice_id_to_spring(token: str, voice_id: str): # requests.post("https://peachmentor.com/api/spring/records/voices", headers=headers, json=data) -def send_user_speech_file_to_spring(token: str, before_audio_link: str, answerId: int): - headers = { - "Authorization": f"Bearer {token}" - } - data = { - "beforeAudioLink": before_audio_link, - "answerId": answerId - } - requests.post("http://springboot:8080/api/spring/records/speeches", headers=headers, json=data) - # requests.post("https://peachmentor.com/api/spring/records/speeches", headers=headers, json=data) - - -def receive_self_feedback(token: str) -> str: +def send_emotion_report_to_spring(image_url: str, analysis_text): headers = { - "Authorization": f"Bearer {token}" - } - response = requests.get("http://springboot:8080/api/spring/self-feedbacks/latest-feedbacks", headers=headers) - # response = requests.get("https://peachmentor.com/api/spring/self-feedbacks/latest-feedbacks", headers=headers) - - feedback_data = response.json().get('result', {}) - self_feedback = feedback_data.get('feedback') - - if self_feedback is None: - return "없음" - return self_feedback - - -def send_statistics_to_spring(token: str, gantourCount: int, silentTime: float, answerId: int): - headers = { - "Authorization": f"Bearer {token}" + # "Authorization": f"Bearer {token}", + "Content-Type": "application/json" } data = { - "gantourCount": gantourCount, - "silentTime": silentTime, - "answerId": answerId + "imageUrl": image_url, + "report_text": analysis_text } - requests.post("http://springboot:8080/api/spring/statistics", headers=headers, json=data) - # requests.post("https://peachmentor.com/api/spring/statistics", headers=headers, json=data) - -# # 질문 답변에 대한 insight 제공 api -# @router.post("/insights") -# async def getRecord(request: Request, answerId: int = Form(...), question: str = Form(...), -# file: UploadFile = File(...)): -# token = request.headers.get("Authorization").split(" ")[1] -# -# local_file_path = await s3Service.save_local_file(file) -# before_audio_link = s3Service.upload_to_s3(local_file_path) -# -# send_user_speech_file_to_spring(token=token, before_audio_link=before_audio_link, answerId=answerId) -# -# insightGpt = InsightAssistant(question) -# insight = insightGpt.get_insight() -# -# os.remove(local_file_path) -# return {"insight": insight} - - -# 피드백 후 데이터 전송 api -# @router.post("/feedbacks") -# async def getFeedback(request: Request, feedbackRequestDto: FeedbackRequestDto): -# token = request.headers.get("Authorization").split(" ")[1] # todo: 토큰 에러처리 좀 (밑에도) -# -# filtered_past_audio_links = [link for link in feedbackRequestDto.pastAudioLinks if -# link != 
feedbackRequestDto.beforeAudioLink] -# links = [feedbackRequestDto.beforeAudioLink, feedbackRequestDto.voiceUrl] + filtered_past_audio_links -# file_paths = download_from_s3_links(links) -# -# voice_id = add_voice(name=feedbackRequestDto.name, local_file_paths=file_paths) -# -# transcribe_token = speechToTextWithApi.get_token() -# t_id = speechToTextWithApi.get_transcribe_id(transcribe_token, beforeAudioLink=feedbackRequestDto.beforeAudioLink) -# -# time.sleep(0.5) # 첫 요청시 바로 하면 404 뜰수도 있다고 함 -# first_script, silence_time = speechToTextWithApi.start_stt(transcribe_token, t_id) -# -# before_script_gpt = FeedbackAssistantUseBeforeScript(first_script) -# before_script = before_script_gpt.get_feedback() -# -# filler_count = speechToTextWithApi.get_filler_count(before_script[0]) -# -# feedbackGpt = FeedbackAssistant(first_script, filler_count, silence_time) -# feedback = feedbackGpt.get_feedback() -# -# after_audio_link = text_to_speech_file(text=feedback[0], voice_id=voice_id) -# -# send_statistics_to_spring(token=token, gantourCount=filler_count, silentTime=silence_time, -# answerId=feedbackRequestDto.answerId) -# -# for file_path in file_paths: -# os.remove(file_path) -# -# return {"beforeScript": before_script[0], -# "afterScript": feedback[0], -# "afterAudioLink": after_audio_link, -# "feedbackText": "\n".join(feedback[1:])} - - -# @router.post("/analyses") -# def getUserSpeechHabit(request: Request, analysisRequestDto: AnalysisRequestDto): -# token = request.headers.get("Authorization").split(" ")[1] -# analysis_gpt = AnalysisAssistant(questions=analysisRequestDto.questions, beforeScripts=analysisRequestDto.beforeScripts) -# analysis = analysis_gpt.get_analysis() -# -# return {"analysisText": analysis} # 데이터를 JSON 객체로 감쌈 \ No newline at end of file + requests.post( + "http://springboot:8080/api/spring/report", + headers=headers, + json=data + ) diff --git a/app/emotion_diary/jg_sadness_1.m4a b/app/emotion_diary/jg_sadness_1.m4a new file mode 100644 index 0000000..7a0015c Binary files /dev/null and b/app/emotion_diary/jg_sadness_1.m4a differ diff --git a/app/emotion_diary/jg_sadness_2.m4a b/app/emotion_diary/jg_sadness_2.m4a new file mode 100644 index 0000000..9375565 Binary files /dev/null and b/app/emotion_diary/jg_sadness_2.m4a differ diff --git a/app/emotion_diary/jg_sadness_3.m4a b/app/emotion_diary/jg_sadness_3.m4a new file mode 100644 index 0000000..ee1a08a Binary files /dev/null and b/app/emotion_diary/jg_sadness_3.m4a differ diff --git a/app/emotion_diary/jg_sadness_4.m4a b/app/emotion_diary/jg_sadness_4.m4a new file mode 100644 index 0000000..26b0d4d Binary files /dev/null and b/app/emotion_diary/jg_sadness_4.m4a differ diff --git a/app/emotion_diary/jg_sadness_5.m4a b/app/emotion_diary/jg_sadness_5.m4a new file mode 100644 index 0000000..235860c Binary files /dev/null and b/app/emotion_diary/jg_sadness_5.m4a differ diff --git a/app/service/gpt.py b/app/service/gpt.py index 6d027c1..eb06023 100644 --- a/app/service/gpt.py +++ b/app/service/gpt.py @@ -51,6 +51,7 @@ def get_schedule_json(self): return schedule_dict + class GenerateQuestionGPT: def __init__(self, text, alias): self.text = text @@ -92,4 +93,47 @@ def get_schedule_json(self): content = response.choices[0].message.content schedule_dict = parsing_json.extract_json_from_content(content) - return schedule_dict \ No newline at end of file + return schedule_dict + + +class EmotionReportGPT: + def __init__(self, text, percent_list): + self.text = text + self.percent_list = percent_list + + def create_report_prompt(self): + 
system_message = f""" + 너는 지금부터 감정을 분석 하는 심리 상담사야. + + 네 역할은 텍스트와 수치를 보고, 해당 발화의 인물이 하루 동안 어떤 감정 상태를 가졌는지 체크해주는 거야. + 텍스트는 다음과 같아: {str(self.text)} + 수치는 다음과 같아 : {self.percent_list} + + 너의 목표는 두 가지야: + 1. 텍스트와 수치를 보고 발화의 인물의 하루 감정을 종합적으로 분석해줘. + 1-a) 분석을 할 때는 텍스트나 문맥에서 근거를 들어서 논리적으로 서술해줘. + 1-b) 분석 말투는 보호자에게 피보호자의 상태를 설명하는 존댓말 말투로 해줘. + 1-c) '발화자'를 지칭하는 말은 '피보호자'로 해줘 + 2. 분석 문장은 4-5 줄이어야 해. + + 결과는 꼭 큰따옴표(")만 사용해서 str로 반환해줘. 만약 여러 문장이라면 "\n"를 문장 끝에 넣어줘. + + """ + + messages = [ + {"role": "system", "content": system_message} + ] + return messages + + def get_report_text(self): + prompt = self.create_report_prompt() + response = client.chat.completions.create( + model="gpt-4-turbo", + messages=prompt, + temperature=0.5, + max_tokens=2048 + ) + + content = response.choices[0].message.content + + return content diff --git a/app/service/interaction.py b/app/service/interaction.py index 4518264..fef5577 100644 --- a/app/service/interaction.py +++ b/app/service/interaction.py @@ -2,43 +2,30 @@ import subprocess from datetime import datetime -import pyaudio -import numpy as np from faster_whisper import WhisperModel from openai import OpenAI -from elevenLabs import text_to_speech_file from elevenlabs import ElevenLabs from dotenv import load_dotenv -# 아래 두 함수는 record_respberry.py 에 구현된 그대로 사용합니다. -# emotion_record(index) → "{prefix}{index}.wav" 파일을 만들어 리턴 -# is_silent(data) → 음성 청크가 침묵인지 여부 판단 -from record_respberry import emotion_record, is_silent +from app.service.elevenLabs import text_to_speech_file +# 녹음 함수 (arecord 사용) - 수정된 record_respberry.py 참고 +from record_respberry import emotion_record # ==== 공통 설정 ==== load_dotenv() OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") -ELEVENLABS_KEY = os.getenv("ELEVENLABS_KEY") +ELEVENLABS_KEY = os.getenv("ELEVENLABS_KEY") if not OPENAI_API_KEY or not ELEVENLABS_KEY: raise RuntimeError(".env 에 OPENAI_API_KEY/ELEVENLABS_KEY 를 설정하세요") # OpenAI / ElevenLabs 클라이언트 -gpt_client = OpenAI(api_key=OPENAI_API_KEY) -tts_client = ElevenLabs(api_key=ELEVENLABS_KEY) +gpt_client = OpenAI(api_key=OPENAI_API_KEY) +tts_client = ElevenLabs(api_key=ELEVENLABS_KEY) # Whisper 모델 (tiny, CPU, int8) whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8") -# 녹음 파라미터 (ALSA default=USBMIC 으로 잡힌 상태) -FORMAT = pyaudio.paInt16 -CHANNELS = 1 -RATE = 44100 -CHUNK = RATE * 3 # 3초 단위 버퍼 - -# 오늘 날짜 기반 녹음 파일 저장 경로 prefix -today_str = datetime.now().strftime("%Y%m%d") -WAVE_OUTPUT_PREFIX = f"/home/team4/Desktop/capstone/AI/app/emotion_diary/{today_str}_" def interaction(alias: str): """ @@ -104,8 +91,8 @@ def interaction(alias: str): print("=== interaction 종료 ===") + if __name__ == "__main__": # 스크립트를 직접 실행할 때만 동작 # alias를 원하는 이름으로 바꿔주세요 interaction("홍길동") - diff --git a/app/service/main.py b/app/service/main.py index b186d8f..18296c5 100644 --- a/app/service/main.py +++ b/app/service/main.py @@ -7,6 +7,10 @@ from contextlib import asynccontextmanager from app.controller.RecordController import router + +# from app.controller.RecordController import router + + # from app.service.subscribe import subscribe_schedule # @asynccontextmanager @@ -86,9 +90,3 @@ def custom_openapi(): app.include_router(router) -# before_script, statistics_filler_json, statistics_silence_json = startSTT( -# "https://peachmentor-bucket.s3.ap-northeast-2.amazonaws.com/record/%E1%84%82%E1%85%A9%E1%86%A8%E1%84%8B%E1%85%B3%E1%86%B7.m4a") -# self_feedback = "그래도 주어진 시간동안 말을 이어나가긴 했는데 말을 자연스럽게 연결하지 못한 것 같아" -# feedbackAss = FeedbackAssistant(before_script, statistics_filler_json, statistics_silence_json) -# 
feedback = feedbackAss.get_feedback() -# print(feedback) diff --git a/app/service/predict_resp.py b/app/service/predict_resp.py new file mode 100644 index 0000000..5d93ac6 --- /dev/null +++ b/app/service/predict_resp.py @@ -0,0 +1,35 @@ +import requests +import glob +import os +import mimetypes + + +def predict(): + ip = "192.168.1.243" + # FastAPI 라우터 경로에 맞춘 URL + url = f"http://{ip}:8000/api/fastapi/predict" + + # 전송할 오디오 파일 경로 (wav, m4a, mp3 등 모두 포함) + BASE_DIR = "/home/team4/Desktop/capstone/AI/app/emotion_diary" + audio_paths = glob.glob(os.path.join(BASE_DIR, "**", "*.*"), recursive=True) + + files = [] + for path in audio_paths: + filename = os.path.basename(path) + # 확장자에 맞는 MIME 타입 추출 (fallback: application/octet-stream) + content_type = mimetypes.guess_type(path)[0] or "application/octet-stream" + files.append( + ("files", (filename, open(path, "rb"), content_type)) + ) + + response = requests.post(url, files=files) + if response.status_code == 200: + print("감정 예측 결과:") + for label, score in response.json().items(): + print(f"{label}: {score}") + else: + print(f"Error: {response.status_code} - {response.text}") + + +if __name__ == "__main__": + predict() diff --git a/app/service/record_respberry.py b/app/service/record_respberry.py index 3bb9206..3e0460f 100644 --- a/app/service/record_respberry.py +++ b/app/service/record_respberry.py @@ -1,32 +1,34 @@ import os import wave from datetime import datetime - import numpy as np -import pyaudio +import sounddevice as sd +from scipy.io.wavfile import write # === 녹음 설정 === -FORMAT = pyaudio.paInt16 CHANNELS = 1 RATE = 44100 -CHUNK = 4096 # 약 0.093초 분량 (4096/44100) -SILENCE_LIMIT = 5 # 5초 연속 침묵이면 녹음 종료 +CHUNK_DURATION = 0.1 # 초 단위, 약 100ms +CHUNK = int(RATE * CHUNK_DURATION) +SILENCE_LIMIT = 5 # 5초 연속 침묵이면 녹음 종료 +THRESHOLD = 1000.0 # 침묵 판별 기준 (RMS) + BASE_DIR = "/home/team4/Desktop/capstone/AI/app/emotion_diary" + # 날짜 기반 하위 디렉터리(매일 한 번만 생성) def _ensure_dir(): os.makedirs(BASE_DIR, exist_ok=True) -def is_silent(data: bytes, threshold: float = 1000.0) -> bool: + +def is_silent(data: np.ndarray, threshold: float = THRESHOLD) -> bool: """ - 한 프레임(CHUNK) 크기의 raw PCM data를 받아 - RMS 기준으로 침묵 여부를 판단. 
+ float32 numpy 배열을 받아 RMS 기준으로 침묵 여부를 판단 """ - audio_data = np.frombuffer(data, dtype=np.int16) - rms = np.sqrt(np.mean(audio_data.astype(np.float32) ** 2)) - # print(f"RMS={rms:.1f}") # 필요 시 디버그용 + rms = np.sqrt(np.mean(data ** 2)) return rms < threshold + def emotion_record(index: int) -> str: """ index: 녹음 파일 구분을 위한 정수 인덱스 @@ -37,49 +39,34 @@ def emotion_record(index: int) -> str: filename = f"{date_str}_{index}.wav" filepath = os.path.join(BASE_DIR, filename) - pa = pyaudio.PyAudio() - # input_device_index 를 지정하지 않으면 ALSA default (=USBMIC) 사용 - stream = pa.open( - format=FORMAT, - channels=CHANNELS, - rate=RATE, - input=True, - frames_per_buffer=CHUNK - ) - print(f"[녹음 시작] {filename}") + frames = [] silent_secs = 0.0 try: - while True: - data = stream.read(CHUNK, exception_on_overflow=False) - frames.append(data) + with sd.InputStream(samplerate=RATE, channels=CHANNELS, dtype='float32') as stream: + while True: + data, _ = stream.read(CHUNK) + audio_chunk = data[:, 0] # mono + frames.append(audio_chunk.copy()) - if is_silent(data): - silent_secs += CHUNK / RATE - else: - silent_secs = 0.0 + if is_silent(audio_chunk): + silent_secs += CHUNK_DURATION + else: + silent_secs = 0.0 - if silent_secs >= SILENCE_LIMIT: - print(f"[침묵 {SILENCE_LIMIT}초 감지 → 녹음 종료]") - break + if silent_secs >= SILENCE_LIMIT: + print(f"[침묵 {SILENCE_LIMIT}초 감지 → 녹음 종료]") + break except Exception as e: print("녹음 중 예외:", e) - finally: - stream.stop_stream() - stream.close() - pa.terminate() - - # WAV 파일로 저장 - wf = wave.open(filepath, 'wb') - wf.setnchannels(CHANNELS) - wf.setsampwidth(pa.get_sample_size(FORMAT)) - wf.setframerate(RATE) - wf.writeframes(b''.join(frames)) - wf.close() + # float32 → int16 변환 후 저장 + all_audio = np.concatenate(frames) + int_audio = np.int16(np.clip(all_audio * 32767, -32768, 32767)) + + write(filepath, RATE, int_audio) print(f"[저장 완료] {filepath}\n") return filepath - diff --git a/app/service/s3Service.py b/app/service/s3Service.py index b992d24..aa2ca35 100644 --- a/app/service/s3Service.py +++ b/app/service/s3Service.py @@ -1,6 +1,7 @@ import os import time import uuid +from datetime import datetime from typing import List import requests @@ -62,6 +63,34 @@ def upload_to_s3(local_file_path: str) -> str: print(f"Another error => {e}") +def upload_to_s3_png(local_file_path: str) -> str: + """로컬 파일을 S3에 업로드하고 S3 URL을 반환합니다.""" + try: + if not os.path.isfile(local_file_path): + print(f"Local file does not exist: {local_file_path}") + return None + + date_str = datetime.now().strftime("%Y%m%d") + filename = f"{date_str}" + + timestamp = int(time.time()) + unique_id = str(uuid.uuid4()) + s3_file_name = f"image/{filename}_{timestamp}_{unique_id}.png" + + # S3에 파일 업로드 + with open(local_file_path, "rb") as data: + s3_client.upload_fileobj(data, bucket_name, s3_file_name) + + # S3 URL 생성 + aws_file_url = f"{url_base}/{s3_file_name}" + return aws_file_url + + except ClientError as e: + print(f'Credential error => {e}') + except Exception as e: + print(f"Another error => {e}") + + # AWS S3에서 녹음 파일 다운로드 def download_from_s3(file_s3_url: str) -> str: """S3에서 파일을 다운로드하고 로컬에 저장합니다.""" diff --git a/app/utils/convertFileExtension.py b/app/utils/convertFileExtension.py index e27dd38..d58eee0 100644 --- a/app/utils/convertFileExtension.py +++ b/app/utils/convertFileExtension.py @@ -1,7 +1,10 @@ +import io import os +import tempfile from datetime import datetime from pydub import AudioSegment +from pydub.exceptions import CouldntDecodeError def merge_all_wavs_to_mp3(audio_dir="audio", 
silence_duration_ms=500): @@ -45,4 +48,51 @@ def convert_to_mp3(file_path): return output_path -0 +def convert_to_wav(raw_bytes: bytes, ext: str) -> bytes: + ext = ext.lower() + # 이미 WAV라면 바로 반환 + if ext == "wav": + return raw_bytes + + # 임시 입력 파일 생성 + with tempfile.NamedTemporaryFile(delete=False, suffix=f".{ext}") as fin: + fin.write(raw_bytes) + fin.flush() + fin_path = fin.name + + try: + # 1) format 인자 없이 자동 감지 시도 + audio = AudioSegment.from_file(fin_path) + except CouldntDecodeError: + try: + # 2) 자동 감지도 실패하면, 프로브 크기 늘려서 재시도 + audio = AudioSegment.from_file( + fin_path, + parameters=["-probesize", "50M", "-analyzeduration", "100M"] + ) + except CouldntDecodeError as e: + os.unlink(fin_path) + raise RuntimeError(f"FFmpeg 디코딩 실패({ext}): {e}") from e + + # WAV(PCM) 사양으로 맞춰주기 + audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2) + + # 메모리로 WAV 내보내기 + out = io.BytesIO() + audio.export(out, format="wav") + wav_bytes = out.getvalue() + + os.unlink(fin_path) + return wav_bytes + + # 3) 원하는 파라메터로 변환 (16kHz, mono, 16-bit) + audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2) + + # 4) 메모리로 WAV 내보내기 + out = io.BytesIO() + audio.export(out, format="wav") + wav_bytes = out.getvalue() + + # 5) 임시 입력 파일 삭제 + os.unlink(fin_path) + return wav_bytes diff --git a/main.py b/main.py index 8f05ef4..2e48eca 100644 --- a/main.py +++ b/main.py @@ -62,6 +62,7 @@ # # 종료 함수는 비활성화 (라이브러리 오류 방지) # # dhtDevice.exit() +import uvicorn # ─────────────────────────────── # 메인 함수 @@ -69,8 +70,8 @@ if __name__ == "__main__": uvicorn.run( app="app.service.main:app", - host="localhost", - # host="0.0.0.0", + # host="localhost", + host="0.0.0.0", port=8000, ) # detect_motion() # PIR 센서 테스트 시 주석 해제 diff --git a/requirements.txt b/requirements.txt index c61a0e6..376bdd9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,65 +1,164 @@ +absl-py==2.3.0 annotated-types==0.7.0 anyio==4.9.0 -av==14.3.0 +asttokens==3.0.0 +astunparse==1.6.3 +async-timeout==5.0.1 +attrs==25.3.0 +audioread==3.0.1 +av==14.4.0 +backcall==0.2.0 +beautifulsoup4==4.13.4 +bleach==6.2.0 boto3==1.37.16 botocore==1.37.16 +cachetools==5.5.2 certifi==2025.1.31 cffi==1.17.1 charset-normalizer==3.4.1 click==8.1.8 colorama==0.4.6 coloredlogs==15.0.1 +contourpy==1.3.0 ctranslate2==4.6.0 +cycler==0.12.1 +decorator==5.2.1 +defusedxml==0.7.1 distro==1.9.0 -dotenv==0.9.9 +docopt==0.6.2 elevenlabs==1.54.0 exceptiongroup==1.2.2 +executing==2.2.0 fastapi==0.115.11 faster-whisper==1.1.1 +fastjsonschema==2.21.1 filelock==3.18.0 +flatbuffers==25.2.10 +fonttools==4.58.1 fsspec==2025.3.2 +gast==0.4.0 +google-auth==2.40.2 +google-auth-oauthlib==0.4.6 +google-pasta==0.2.0 +grpcio==1.71.0 h11==0.14.0 +h5py==3.13.0 httpcore==1.0.7 httpx==0.28.1 huggingface-hub==0.31.2 humanfriendly==10.0 idna==3.10 +importlib_metadata==8.7.0 +importlib_resources==6.5.2 +ipython==8.12.3 +jedi==0.19.2 Jinja2==3.1.6 -jiter==0.9.0 +jiter==0.10.0 jmespath==1.0.1 +joblib==1.5.1 +jsonschema==4.24.0 +jsonschema-specifications==2025.4.1 +jupyter_client==8.6.3 +jupyter_core==5.8.1 +jupyterlab_pygments==0.3.0 +kiwisolver==1.4.7 +lazy_loader==0.4 +libclang==18.1.1 +librosa==0.11.0 +llvmlite==0.43.0 +Markdown==3.8 +markdown-it-py==3.0.0 MarkupSafe==3.0.2 +matplotlib==3.9.4 +matplotlib-inline==0.1.7 +mdurl==0.1.2 +mistune==3.1.3 +ml_dtypes==0.5.1 mpmath==1.3.0 +msgpack==1.1.0 +nbclient==0.10.2 +nbconvert==7.16.6 +nbformat==5.10.4 networkx==3.2.1 -numpy==2.0.2 +numba==0.60.0 +numpy==1.26.4 +oauthlib==3.2.2 onnxruntime==1.19.2 openai==1.68.2 +opt_einsum==3.4.0 
+optree==0.16.0 packaging==25.0 +pandocfilters==1.5.1 +parso==0.8.4 +pickleshare==0.7.5 +pillow==11.2.1 +pipreqs==0.5.0 +platformdirs==4.3.8 playsound==1.3.0 -protobuf==6.31.0 -# PyAudio==0.2.14 +pooch==1.8.2 +prompt_toolkit==3.0.51 +protobuf==3.19.6 +pure_eval==0.2.3 +pyasn1==0.6.1 +pyasn1_modules==0.4.2 pycparser==2.22 pydantic==2.10.6 pydantic_core==2.27.2 pydub==0.25.1 +Pygments==2.19.1 +pyparsing==3.2.3 +pyreadline3==3.5.4 python-dateutil==2.9.0.post0 python-dotenv==1.0.1 python-multipart==0.0.20 PyYAML==6.0.2 +pyzmq==26.4.0 +redis==6.2.0 +referencing==0.36.2 +regex==2024.11.6 requests==2.32.3 -# RPi.GPIO==0.7.1 +requests-oauthlib==2.0.0 +rich==14.0.0 +rpds-py==0.25.1 +rsa==4.9.1 s3transfer==0.11.4 +safetensors==0.5.3 +scikit-learn==1.6.1 scipy==1.13.1 +sentence-transformers==4.1.0 six==1.17.0 sniffio==1.3.1 sounddevice==0.5.1 soundfile==0.13.1 +soupsieve==2.7 +soxr==0.5.0.post1 +SpeechRecognition==3.14.3 +stack-data==0.6.3 starlette==0.46.1 sympy==1.14.0 +tensorboard==2.11.0 +tensorboard-data-server==0.6.1 +tensorboard-plugin-wit==1.8.1 +tensorflow-estimator==2.11.0 +tensorflow-io-gcs-filesystem==0.31.0 +termcolor==3.1.0 +threadpoolctl==3.6.0 +tinycss2==1.4.0 tokenizers==0.21.1 torch==2.7.0 torchaudio==2.7.0 +torchvision==0.22.0 +tornado==6.5.1 tqdm==4.67.1 +traitlets==5.14.3 +transformers==4.52.4 typing_extensions==4.12.2 urllib3==1.26.20 uvicorn==0.34.0 -websockets==15.0.1 \ No newline at end of file +wcwidth==0.2.13 +webencodings==0.5.1 +websockets==15.0.1 +Werkzeug==3.1.3 +wrapt==1.17.2 +yarg==0.1.9 +zipp==3.22.0 diff --git a/requirements_server.txt b/requirements_server.txt new file mode 100644 index 0000000..45b9215 --- /dev/null +++ b/requirements_server.txt @@ -0,0 +1,170 @@ +absl-py==2.3.0 +annotated-types==0.7.0 +anyio==4.9.0 +asttokens==3.0.0 +astunparse==1.6.3 +async-timeout==5.0.1 +attrs==25.3.0 +audioread==3.0.1 +av==14.4.0 +backcall==0.2.0 +beautifulsoup4==4.13.4 +bleach==6.2.0 +boto3==1.37.16 +botocore==1.37.16 +cachetools==5.5.2 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +click==8.1.8 +colorama==0.4.6 +coloredlogs==15.0.1 +contourpy==1.3.0 +ctranslate2==4.6.0 +cycler==0.12.1 +decorator==5.2.1 +defusedxml==0.7.1 +distro==1.9.0 +docopt==0.6.2 +elevenlabs==1.54.0 +exceptiongroup==1.2.2 +executing==2.2.0 +fastapi==0.115.11 +faster-whisper==1.1.1 +fastjsonschema==2.21.1 +filelock==3.18.0 +flatbuffers==25.2.10 +fonttools==4.58.1 +fsspec==2025.3.2 +gast==0.4.0 +google-auth==2.40.2 +google-auth-oauthlib==0.4.6 +google-pasta==0.2.0 +grpcio==1.71.0 +h11==0.14.0 +h5py==3.13.0 +httpcore==1.0.7 +httpx==0.28.1 +huggingface-hub==0.31.2 +humanfriendly==10.0 +idna==3.10 +importlib_metadata==8.7.0 +importlib_resources==6.5.2 +ipython==8.12.3 +jedi==0.19.2 +Jinja2==3.1.6 +jiter==0.10.0 +jmespath==1.0.1 +joblib==1.5.1 +jsonschema==4.24.0 +jsonschema-specifications==2025.4.1 +jupyter_client==8.6.3 +jupyter_core==5.8.1 +jupyterlab_pygments==0.3.0 +keras==3.10.0 +kiwisolver==1.4.7 +lazy_loader==0.4 +libclang==18.1.1 +librosa==0.11.0 +llvmlite==0.43.0 +Markdown==3.8 +markdown-it-py==3.0.0 +MarkupSafe==3.0.2 +matplotlib==3.9.4 +matplotlib-inline==0.1.7 +mdurl==0.1.2 +mistune==3.1.3 +ml_dtypes==0.5.1 +mpmath==1.3.0 +msgpack==1.1.0 +namex==0.1.0 +nbclient==0.10.2 +nbconvert==7.16.6 +nbformat==5.10.4 +networkx==3.2.1 +numba==0.60.0 +numpy==1.26.4 +oauthlib==3.2.2 +onnxruntime==1.19.2 +openai==1.68.2 +opt_einsum==3.4.0 +optree==0.16.0 +packaging==25.0 +pandocfilters==1.5.1 +parso==0.8.4 +pickleshare==0.7.5 +pillow==11.2.1 +pipreqs==0.5.0 +platformdirs==4.3.8 +playsound==1.3.0 
+pooch==1.8.2 +prompt_toolkit==3.0.51 +protobuf==5.29.5 +pure_eval==0.2.3 +pyasn1==0.6.1 +pyasn1_modules==0.4.2 +pycparser==2.22 +pydantic==2.10.6 +pydantic_core==2.27.2 +pydub==0.25.1 +Pygments==2.19.1 +pyparsing==3.2.3 +pyreadline3==3.5.4 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-multipart==0.0.20 +pywin32==310 +PyYAML==6.0.2 +pyzmq==26.4.0 +redis==6.2.0 +referencing==0.36.2 +regex==2024.11.6 +requests==2.32.3 +requests-oauthlib==2.0.0 +rich==14.0.0 +rpds-py==0.25.1 +rsa==4.9.1 +s3transfer==0.11.4 +safetensors==0.5.3 +scikit-learn==1.6.1 +scipy==1.13.1 +sentence-transformers==4.1.0 +six==1.17.0 +sniffio==1.3.1 +sounddevice==0.5.1 +soundfile==0.13.1 +soupsieve==2.7 +soxr==0.5.0.post1 +SpeechRecognition==3.14.3 +stack-data==0.6.3 +starlette==0.46.1 +sympy==1.14.0 +tensorboard==2.19.0 +tensorboard-data-server==0.7.2 +tensorboard-plugin-wit==1.8.1 +tensorflow==2.19.0 +tensorflow-estimator==2.11.0 +tensorflow-io-gcs-filesystem==0.31.0 +tensorflow_intel==2.18.0 +termcolor==3.1.0 +tf_keras==2.19.0 +threadpoolctl==3.6.0 +tinycss2==1.4.0 +tokenizers==0.21.1 +torch==2.7.0 +torchaudio==2.7.0 +torchvision==0.22.0 +tornado==6.5.1 +tqdm==4.67.1 +traitlets==5.14.3 +transformers==4.52.4 +typing_extensions==4.12.2 +urllib3==1.26.20 +uvicorn==0.34.0 +wcwidth==0.2.13 +webencodings==0.5.1 +websockets==15.0.1 +Werkzeug==3.1.3 +wrapt==1.17.2 +yarg==0.1.9 +zipp==3.22.0
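
The /predict handler added in app/controller/RecordController.py notes in a comment that get_features expects a file path, so every in-memory WAV produced by convert_to_wav is written to a temporary file and deleted again. If that round trip ever matters, a bytes-based variant is possible because librosa.load also accepts file-like objects. This is a minimal sketch and not part of the diff: get_features_from_bytes is a hypothetical name, and it only reuses the helpers the diff adds in app/ML/audio_extractor_utils.py.

import io

import librosa
import numpy as np

from app.ML.audio_extractor_utils import extract_features, noise, pitch, stretch


def get_features_from_bytes(wav_bytes: bytes) -> np.ndarray:
    # Decode the in-memory WAV directly; librosa.load accepts file-like objects.
    data, sample_rate = librosa.load(io.BytesIO(wav_bytes), duration=2.5, offset=0.0)

    # Same three passes as get_features: raw, noise-augmented, stretched + pitched.
    feats = [extract_features(data, sample_rate)]
    feats.append(extract_features(noise(data), sample_rate))
    feats.append(extract_features(pitch(stretch(data), sample_rate), sample_rate))
    return np.concatenate(feats, axis=0)  # (486,), matching get_features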
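
app/ML/plot_utils.py (and the copy in predict_colab.py) calls plt.show() and never closes the figure; inside a FastAPI worker there is normally no display, and unclosed pyplot figures accumulate across requests. Below is an illustrative headless variant of the same bar chart, not the diff's implementation: save_plot_headless is a hypothetical name, and the os.makedirs call is an added safeguard the diff itself does not perform.

import os
from datetime import datetime

import matplotlib
matplotlib.use("Agg")  # non-interactive backend: no display needed in a server process
from matplotlib import pyplot as plt

colors = ['#e74c3c', '#3498db', '#f1c40f', '#e67e22', '#9b59b6', '#1abc9c', '#95a5a6']
emotion_labels = ['angry', 'sadness', 'happiness', 'fear', 'disgust', 'surprise', 'neutral']


def save_plot_headless(predictions_percent) -> str:
    fig, ax = plt.subplots(figsize=(10, 6))
    bars = ax.barh(emotion_labels, predictions_percent, color=colors, alpha=0.85)

    ax.set_title('Emotion Probability Distribution', fontsize=20, weight='bold', pad=15)
    ax.set_xlabel('Probability (%)', fontsize=14)
    ax.set_xlim(0, max(predictions_percent) + 10)
    ax.grid(axis='x', linestyle='--', alpha=0.6)

    for bar, percent in zip(bars, predictions_percent):
        ax.text(bar.get_width() + 0.8, bar.get_y() + bar.get_height() / 2,
                f'{percent:.1f}%', va='center', fontsize=13, weight='bold', color='#333')

    out_dir = os.path.join(os.getcwd(), "app", "emotion_png")
    os.makedirs(out_dir, exist_ok=True)
    local_path = os.path.join(out_dir, f"{datetime.now():%Y%m%d}_emotion_distribution.png")
    fig.savefig(local_path, dpi=300, bbox_inches='tight')
    plt.close(fig)  # release the figure instead of plt.show()
    return local_path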