-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathinference.py
More file actions
110 lines (75 loc) · 2.74 KB
/
inference.py
File metadata and controls
110 lines (75 loc) · 2.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
import torch
from PIL import Image
from model import AudioClassifier
from torchvision import transforms
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
def audio_to_img(path):
    """Render the mel spectrogram of an audio file to a 50x50 image on disk.

    Writes the full-size spectrogram to inference_tmp/tmp.png and the
    resized copy to inference_tmp/output/tmp.png.

    Parameters:
        path: filesystem path of an audio file readable by librosa.
    """
    # Ensure the temp directories exist so savefig/save don't fail on a
    # fresh checkout (the original crashed if the folders were missing).
    os.makedirs("inference_tmp/output", exist_ok=True)
    # Load the audio file (librosa's default sample rate).
    audio, sr = librosa.load(path)
    # Compute the mel spectrogram and convert it to dB scale.
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
    spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(spectrogram_db, sr=sr)
    plt.savefig("inference_tmp/tmp.png", bbox_inches='tight', pad_inches=0)
    # Close the figure right away so repeated calls don't accumulate figures.
    plt.close()
    # Resize the rendered spectrogram; the context manager releases the
    # file handle the original left open.
    with Image.open("inference_tmp/tmp.png") as image:
        resized_image = image.resize((50, 50))
    resized_image.save("inference_tmp/output/tmp.png")
def convert(path):
    """Convert an audio file into an RGB image tensor via its spectrogram.

    Renders the spectrogram image with audio_to_img, then loads the resized
    output and returns it as a torch tensor (C, H, W) in [0, 1].
    """
    audio_to_img(path)
    img = Image.open("inference_tmp/output/tmp.png").convert("RGB")
    # transforms.ToTensor scales pixel values to [0, 1] and moves channels first.
    to_tensor = transforms.ToTensor()
    return to_tensor(img)
def inference(model_path, audio_path, key_map, step):
    """Classify each audio chunk in a directory, keeping confident predictions.

    Parameters:
        model_path: path to a saved state_dict for AudioClassifier.
        model is built with len(key_map) output classes.
        audio_path: directory of audio chunk files, processed in sorted order.
        key_map: dict mapping class name -> class index (training mapping).
        step: seconds per chunk; chunk i is stored under key str(i * step).

    Returns:
        dict mapping each chunk's start offset (as a string) to the predicted
        class name, only where softmax confidence >= 0.7.
    """
    print(f"using model from {model_path}")
    confidence_threshold = 0.7
    # Invert name->index so predicted indices map back to class names.
    reverse_map = {v: k for k, v in key_map.items()}
    model = AudioClassifier(len(key_map))
    # map_location lets a checkpoint saved on GPU load on a CPU-only host.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.eval()
    files = os.listdir(audio_path)
    files.sort()
    res = {}
    # Inference only: disable autograd to save memory and time.
    with torch.no_grad():
        for i, file in enumerate(files):
            file_path = os.path.join(audio_path, file)
            if not os.path.isfile(file_path):
                continue  # skip subdirectories etc.
            # Add batch dimension to the (C, H, W) image tensor.
            input_tensor = convert(file_path).unsqueeze(0)
            output = model(input_tensor)
            probs = torch.nn.functional.softmax(output, dim=1)
            predicted_idx = torch.argmax(output, 1).item()
            probability = probs[0][predicted_idx].item()
            print(reverse_map[predicted_idx], probability)
            if probability >= confidence_threshold:
                res[str(i * step)] = reverse_map[predicted_idx]
    print(res)
    return res
def create_dict():
    """Build the class-name -> index mapping from index.txt (one tag per line).

    Returns:
        dict mapping each tag name to its zero-based line number.

    Note: splitlines() — unlike split("\\n") — does not yield a spurious
    empty tag when the file ends with a trailing newline, while keeping
    every other line (and hence every index) identical.
    """
    with open("index.txt", "r") as file:
        tags = file.read().splitlines()
    return {name: i for i, name in enumerate(tags)}
def main():
    """Script entry point: run inference over the asset chunks with defaults."""
    print("running from main...")
    key_map = create_dict()
    inference(
        "models/acc-76-s.pth",   # checkpoint to load
        "assets/chunks",         # directory of audio chunks
        key_map,
        3,                       # seconds per chunk
    )


if __name__ == "__main__":
    main()