-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathinference.py
More file actions
110 lines (75 loc) · 2.74 KB
/
inference.py
File metadata and controls
110 lines (75 loc) · 2.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
import torch
from PIL import Image
from model import AudioClassifier
from torchvision import transforms
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
def audio_to_img(path):
    """Render the mel spectrogram of an audio file to a 50x50 image on disk.

    Writes the full-size spectrogram to inference_tmp/tmp.png and the
    resized copy to inference_tmp/output/tmp.png.

    Parameters:
        path: filesystem path of an audio file readable by librosa.
    """
    # Ensure the temp directories exist so savefig/save don't fail on a
    # fresh checkout (the original crashed if the folders were missing).
    os.makedirs("inference_tmp/output", exist_ok=True)
    # Load the audio file (librosa's default sample rate).
    audio, sr = librosa.load(path)
    # Compute the mel spectrogram and convert it to dB scale.
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
    spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(spectrogram_db, sr=sr)
    plt.savefig("inference_tmp/tmp.png", bbox_inches='tight', pad_inches=0)
    # Close the figure right away so repeated calls don't accumulate figures.
    plt.close()
    # Resize the rendered spectrogram; the context manager releases the
    # file handle the original left open.
    with Image.open("inference_tmp/tmp.png") as image:
        resized_image = image.resize((50, 50))
    resized_image.save("inference_tmp/output/tmp.png")
def convert(path):
    """Convert an audio file into an RGB image tensor via its spectrogram.

    Renders the spectrogram image with audio_to_img, then loads the resized
    output and returns it as a torch tensor (C, H, W) in [0, 1].
    """
    audio_to_img(path)
    img = Image.open("inference_tmp/output/tmp.png").convert("RGB")
    # transforms.ToTensor scales pixel values to [0, 1] and moves channels first.
    to_tensor = transforms.ToTensor()
    return to_tensor(img)
def inference(model_path, audio_path, key_map, step):
    """Classify each audio chunk in a directory, keeping confident predictions.

    Parameters:
        model_path: path to a saved state_dict for AudioClassifier.
        model is built with len(key_map) output classes.
        audio_path: directory of audio chunk files, processed in sorted order.
        key_map: dict mapping class name -> class index (training mapping).
        step: seconds per chunk; chunk i is stored under key str(i * step).

    Returns:
        dict mapping each chunk's start offset (as a string) to the predicted
        class name, only where softmax confidence >= 0.7.
    """
    print(f"using model from {model_path}")
    confidence_threshold = 0.7
    # Invert name->index so predicted indices map back to class names.
    reverse_map = {v: k for k, v in key_map.items()}
    model = AudioClassifier(len(key_map))
    # map_location lets a checkpoint saved on GPU load on a CPU-only host.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.eval()
    files = os.listdir(audio_path)
    files.sort()
    res = {}
    # Inference only: disable autograd to save memory and time.
    with torch.no_grad():
        for i, file in enumerate(files):
            file_path = os.path.join(audio_path, file)
            if not os.path.isfile(file_path):
                continue  # skip subdirectories etc.
            # Add batch dimension to the (C, H, W) image tensor.
            input_tensor = convert(file_path).unsqueeze(0)
            output = model(input_tensor)
            probs = torch.nn.functional.softmax(output, dim=1)
            predicted_idx = torch.argmax(output, 1).item()
            probability = probs[0][predicted_idx].item()
            print(reverse_map[predicted_idx], probability)
            if probability >= confidence_threshold:
                res[str(i * step)] = reverse_map[predicted_idx]
    print(res)
    return res
def create_dict():
    """Build the class-name -> index mapping from index.txt (one tag per line).

    Returns:
        dict mapping each tag name to its zero-based line number.

    Note: splitlines() — unlike split("\\n") — does not yield a spurious
    empty tag when the file ends with a trailing newline, while keeping
    every other line (and hence every index) identical.
    """
    with open("index.txt", "r") as file:
        tags = file.read().splitlines()
    return {name: i for i, name in enumerate(tags)}
def main():
    """Script entry point: run inference over the asset chunks with defaults."""
    print("running from main...")
    key_map = create_dict()
    inference(
        "models/acc-76-s.pth",   # checkpoint to load
        "assets/chunks",         # directory of audio chunks
        key_map,
        3,                       # seconds per chunk
    )


if __name__ == "__main__":
    main()