diff --git a/.gitignore b/.gitignore
index a9a3190..26c1375 100644
--- a/.gitignore
+++ b/.gitignore
@@ -185,6 +185,8 @@
 pyrightconfig.json
 pyvenv.cfg
 pip-selfcheck.json
+
+
 ### VisualStudioCode ###
 .vscode/*
 !.vscode/settings.json
@@ -224,5 +226,12 @@
 buowset.ipynb
 settings.json
 # Block all configs besides the example config
-whoot_model_training/configs
-!whoot_model_training/configs/config.yml
\ No newline at end of file
+script/model_training/configs
+!script/model_training/configs/config.yml
+
+whoot_model_training
+
+# Explicitly allow root scripts
+!/scripts
+
+data
\ No newline at end of file
diff --git a/assess_birdnet/aggregate_audio_analysis.py b/assess_birdnet/aggregate_audio_analysis.py
deleted file mode 100644
index e515ab7..0000000
--- a/assess_birdnet/aggregate_audio_analysis.py
+++ /dev/null
@@ -1,90 +0,0 @@
-"""Aggregate Birdnet Output Files.
-
-Script for combining the batch output files that Birdnet (acoustic bird
-species classification model) produces into one master csv, including the
-name of the file that each detection came from. Needed when the .wav file
-had to be split into smaller segments for processing.
-
-Example:
-    $ python aggregate_audio_analysis.py /path/to/input/dir
-
-"""
-
-import os
-import csv
-import argparse
-
-
-def main(root_dir):
-    """Aggregate birdnet results.
-
-    Main function that creates a CSV and populates it with the detection
-    information from Birdnet text file outputs.
-
-    Args:
-        root_dir (str): The path to directory containing Birdnet results
-
-    """
-    # Scanning the root directory containing sub-directories
-    for root, _, _ in os.walk(root_dir):
-        # Get a list of all .txt files in each current directory
-        txt_files = [f for f in os.listdir(root) if f.endswith('.txt')]
-
-        if txt_files:
-            # Sort .txt files by file name to keep segments in time order
-            sorted_txt_files = sorted(txt_files)
-            subdirectory_name = os.path.basename(root)
-
-            # Create the CSV file for the current folder
-            output_csv_file = os.path.join(root,
-                                           f"{subdirectory_name}_master.csv")
-            csv_headers = [
-                "Selection",
-                "View",
-                "Channel",
-                "Begin Time (s)",
-                "End Time (s)",
-                "Low Freq (Hz)",
-                "High Freq (Hz)", "Species Code",
-                "Common Name",
-                "Confidence",
-                "File Name",
-                "Extra 1",
-                "Extra 2"
-            ]
-
-            with open(
-                output_csv_file, mode='w', newline='', encoding='utf8'
-            ) as csvfile:
-                csv_writer = csv.writer(csvfile)
-                csv_writer.writerow(csv_headers)
-
-                for txt_file in sorted_txt_files:
-                    with open(
-                        os.path.join(root, txt_file), 'r', encoding='utf8'
-                    ) as txtfile:
-                        lines = txtfile.readlines()[1:]
-                        for line in lines:
-                            data = line.strip().split('\t')
-                            file_name = os.path.splitext(txt_file)[0]
-                            data.append(file_name)
-                            csv_writer.writerow(data)
-
-            print(f"File '{output_csv_file}' created successfully.")
-
-
-if __name__ == '__main__':
-    # Create an argument parser
-    parser = argparse.ArgumentParser(
-        description='Input Directory Path'
-    )
-
-    # Create and parse arguments
-    parser.add_argument('root_dir',
-                        type=str,
-                        help='Directory path to files')
-    # Parse the command-line arguments
-    args = parser.parse_args()
-
-    # Call the main function
-    main(args.root_dir)
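For reference, each master CSV written by the loop above carries the selection-table columns plus the appended File Name column; a quick pandas check (the path is a hypothetical output) confirms the layout:

import pandas as pd

master = pd.read_csv("site_01/site_01_master.csv")  # hypothetical output path
print(master[["Begin Time (s)", "End Time (s)", "Confidence", "File Name"]].head())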
diff --git a/assess_birdnet/assess_performance.py b/assess_birdnet/assess_performance.py
deleted file mode 100644
index 3b2ef2e..0000000
--- a/assess_birdnet/assess_performance.py
+++ /dev/null
@@ -1,62 +0,0 @@
-"""Assess Birdnet Metrics.
-
-This script compares human and birdnet labels and outputs the
-confusion matrix, accuracy, precision, recall, and F1 score
-assuming the human labels are 100% accurate.
-
-Example:
-
-    $ python assess_performance.py /path/to/human_labeled.csv \
-        /path/to/birdnet_labeled.csv
-
-"""
-
-import argparse
-import pandas as pd
-from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
-from sklearn.metrics import recall_score, f1_score
-
-
-def main(human_labeled, birdnet_labeled):
-    """Evaluate Birdnet Metrics.
-
-    Main script that prints metrics comparing human/birdnet labeled
-    acoustic data assuming the human labels are ground truth.
-
-    Args:
-        human_labeled (str): The path to the human labeled csv.
-        birdnet_labeled (str): The path to the adjusted birdnet output.
-
-    """
-    scored_data = pd.read_csv(human_labeled)
-    ml_output = pd.read_csv(birdnet_labeled)
-
-    y_true = scored_data['Label'].map({'yes': 1, 'no': 0}).values
-    y_pred = ml_output['Label'].map({'yes': 1, 'no': 0}).values
-
-    confusion_m = confusion_matrix(y_true, y_pred)
-    accuracy = accuracy_score(y_true, y_pred)
-    precision = precision_score(y_true, y_pred)
-    recall = recall_score(y_true, y_pred)
-    f1_result = f1_score(y_true, y_pred)
-
-    print("Confusion Matrix:")
-    print(confusion_m)
-    print(f"Accuracy: {accuracy:.4f}")
-    print(f"Precision: {precision:.4f}")
-    print(f"Recall: {recall:.4f}")
-    print(f"F1 Score: {f1_result:.4f}")
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description='Input Directory Path'
-    )
-    parser.add_argument('human_labeled',
-                        type=str,
-                        help='Path to human labeled adjusted output.')
-    parser.add_argument('birdnet_labeled',
-                        type=str,
-                        help='Path to birdnet labeled adjusted output.')
-    args = parser.parse_args()
-    main(args.human_labeled, args.birdnet_labeled)
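The metrics above assume the two input CSVs are row-aligned: the normalize scripts below emit one row per 3-second chunk over the same total duration (10800 s, i.e. 3600 rows). A minimal pre-check, with placeholder paths:

import pandas as pd

human = pd.read_csv("human_labeled.csv")      # placeholder paths
birdnet = pd.read_csv("birdnet_labeled.csv")
# Both files should cover the same 3-second chunk grid.
assert len(human) == len(birdnet)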
diff --git a/assess_birdnet/normalize_birdnet_output.py b/assess_birdnet/normalize_birdnet_output.py
deleted file mode 100644
index 9253222..0000000
--- a/assess_birdnet/normalize_birdnet_output.py
+++ /dev/null
@@ -1,95 +0,0 @@
-"""Standardize Birdnet Output.
-
-This script takes a master csv containing the aggregated birdnet output
-text files and creates an expanded csv containing a "no" label
-for each chunk of time that birdnet did not detect a vocalization. The
-3-second chunks where birdnet made a detection will be marked with a "yes".
-
-Example:
-
-    $ python normalize_birdnet_output.py \
-        /path/to/aggregated_birdnet_output.csv /path/to/birdnet_labeled.csv
-
-"""
-
-import argparse
-import pandas as pd
-import numpy as np
-
-
-def main(aggr_birdnet, birdnet_labeled):
-    """Organize and expand birdnet labels.
-
-    Main function to take birdnet labels and create a
-    dataframe that has time chunks for the whole audio file
-    duration and labels the detection periods with "yes".
-
-    Args:
-        aggr_birdnet (str): Path to aggregated Birdnet analysis file.
-        birdnet_labeled (str): Path to desired output csv.
-
-    """
-    ml_output = pd.read_csv(aggr_birdnet)
-
-    filtered_data = ml_output[ml_output['Common Name'] == 'burowl']
-
-    # Optional step: only needed if the Birdnet analysis was aggregated
-    # from split files rather than one continuous recording.
-    filtered_data = filtered_data.apply(adjust_time, axis=1)
-
-    # Total duration of the sound file(s) that Birdnet analyzed.
-    total_duration = 10800  # seconds
-    all_intervals = pd.DataFrame({
-        'Begin Time (s)': np.arange(0, total_duration, 3),
-        'End Time (s)': np.arange(3, total_duration + 3, 3),
-    })
-    all_intervals['Label'] = 'no'
-
-    for _, row in filtered_data.iterrows():
-        start = row['Begin Time (s)']
-        end = row['End Time (s)']
-        mask = (
-            all_intervals['Begin Time (s)'] >= start
-        ) & (all_intervals['End Time (s)'] <= end)
-        all_intervals.loc[mask, 'Label'] = 'yes'
-
-    all_intervals.to_csv(birdnet_labeled, index=False)
-
-
-def adjust_time(row):
-    """Create continuous timestamps.
-
-    Function to standardize the timestamps in the aggregated
-    birdnet input file. The script assumes the analysis was
-    aggregated from wav files split out of the same larger
-    recording. This function ensures that if you split your
-    sound file into smaller pieces for birdnet analysis and
-    aggregate the output, the time chunks will represent the
-    entire sound file in order, not each split file separately.
-
-    Args:
-        row (pandas.Series): The current row in the dataframe.
-
-    Returns:
-        pandas.Series: The time adjusted row in the dataframe.
-
-    """
-    chunk_number = int(row['File Name'].split('output_')[1].split('.')[0])
-    offset = chunk_number * 60
-    row['Begin Time (s)'] += offset
-    row['End Time (s)'] += offset
-    return row
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description='Input Directory Path'
-    )
-    parser.add_argument('aggr_birdnet',
-                        type=str,
-                        help='File path to aggregated birdnet raw output')
-    parser.add_argument('birdnet_labeled',
-                        type=str,
-                        help='Result csv with adjusted birdnet results')
-    args = parser.parse_args()
-    main(args.aggr_birdnet, args.birdnet_labeled)
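As a concrete check of the adjust_time logic above (which assumes the recording was split into 60-second files named output_N.wav): a detection at 1.5-4.5 s inside output_3.wav maps back to 181.5-184.5 s of the original recording:

chunk_number = int("output_3.wav".split('output_')[1].split('.')[0])  # 3
offset = chunk_number * 60
print(1.5 + offset, 4.5 + offset)  # 181.5 184.5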
diff --git a/assess_birdnet/normalize_scored_output.py b/assess_birdnet/normalize_scored_output.py
deleted file mode 100644
index ffeb2df..0000000
--- a/assess_birdnet/normalize_scored_output.py
+++ /dev/null
@@ -1,89 +0,0 @@
-"""Standardize human labeled data.
-
-This script takes the human labeled data and will output a
-dataframe containing time chunks for your specific sound file
-of interest and labels for each chunk depending on what
-detections were marked by the human labelers.
-
-Example:
-
-    $ python normalize_scored_output.py \\
-        /path/to/human_labeled.csv /path/to/output_dataframe.csv
-
-"""
-
-import argparse
-import pandas as pd
-
-
-def main(labels, adjusted_labels):
-    """Standardize human labeled data.
-
-    Main function to create a dataframe with the whole
-    duration of the audio file of interest represented in
-    time chunks labeled 'no', or 'yes' if the human labels
-    marked a vocalization in that specific time chunk.
-
-    Args:
-        labels (str): The path to the human labeled data.
-        adjusted_labels (str): The resulting csv dataframe with
-            labels for each 3 second chunk based on the human labels.
-
-    """
-    scored_data = pd.read_csv(labels)
-
-    # Insert the wav file of interest here.
-    file_of_interest = '20170421_180000.wav'
-    filtered_data = scored_data[scored_data['IN FILE'] == file_of_interest]
-
-    # Time length of the audio file of interest.
-    audio_file_duration = 10800  # seconds
-
-    total_chunks = audio_file_duration // 3
-    chunks_data = {
-        'Chunk Start': [i*3 for i in range(total_chunks)],
-        'Chunk End': [(i+1)*3 for i in range(total_chunks)],
-        'Label': ['no'] * total_chunks
-    }
-    chunks_df = pd.DataFrame(chunks_data)
-    filtered_data.apply(lambda row: mark_intervals(row, chunks_df),
-                        axis=1)
-
-    chunks_df.to_csv(adjusted_labels, index=False)
-    print(f"File {adjusted_labels} created successfully.")
-
-
-def mark_intervals(row, chunks_df):
-    """Label positive detections.
-
-    Function to relabel rows in the chunks dataframe
-    to 'yes' if the human labels marked a vocalization
-    at that point.
-
-    Args:
-        row (pandas.Series): The current row in the human labeled
-            data that matches the audio file of interest.
-        chunks_df (pandas.DataFrame): The new unlabeled dataframe.
-
-    """
-    start_time = float(row['OFFSET'])
-    end_time = start_time + float(row['DURATION'])
-    start_chunk = int(start_time // 3)
-    end_chunk = int(end_time // 3)
-
-    if row['TOP1MATCH'] != 'null':
-        chunks_df.loc[start_chunk:end_chunk, 'Label'] = 'yes'
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description='Input Directory Path'
-    )
-    parser.add_argument('labels',
-                        type=str,
-                        help='File path to human labeled raw output')
-    parser.add_argument('adjusted_labels',
-                        type=str,
-                        help='Result csv with adjusted human labels')
-    args = parser.parse_args()
-    main(args.labels, args.adjusted_labels)
diff --git a/whoot_model_training/configs/config.yml b/configs/model_training/config.yml
similarity index 100%
rename from whoot_model_training/configs/config.yml
rename to configs/model_training/config.yml
diff --git a/make_model/buowset/README.md b/make_model/buowset/README.md
deleted file mode 100644
index b723b20..0000000
--- a/make_model/buowset/README.md
+++ /dev/null
@@ -1,49 +0,0 @@
-These scripts load, format, and parse the buowset dataset
-so that models can easily be trained from differing
-embedding sources.
-
-Using Birdnet Embeddings:
-Birdnet needs each sound clip to be a minimum of 3s long,
-so many of the segments need to be padded in order to reach
-this minimum.
-
-1) Running zero_pad_detections.py on the entire dataset will
-copy it to a new folder containing all the same segments as
-buowset, but padded with silence that is either appended to the
-end of the sample or split randomly between the beginning and end.
-You can decide how long the minimum length of each segment should
-be, so working with a different embedding source that requires
-a different minimum length than Birdnet is still possible; simply
-change the length parameter.
-
-2) With your newly padded dataset, run birdnet.embeddings on the
-entire folder.
-
-3) Then pass the directory containing your birdnet embeddings,
-your metadata.csv for buowset, and a path for your merged data as
-a .pkl, and run embed_to_df_birdnet.py.
-
-4) Pass this .pkl result to make_svm.py, along with an optional path
-to a model.pkl file if you'd like to save the model that is
-produced.
-
-You're done!
-
-Optionally, combine steps 3 and 4 into one step by running
-make_birdnet_svm.py and passing the birdnet embeddings, metadata,
-and optional model save file.
-If you are reusing the same result from step 3 and merely
-changing model parameters, go through steps 3 and 4 separately so
-you only need to repeat step 4.
-
-Defaults: make_svm.py and make_birdnet_svm.py rely on global parameters
-defined in make_svm.py.
-TRAINING_FOLDS: The folds to include in your training set.
-Default is 0-3.
-TESTING_FOLDS: The fold(s) to include in your testing set.
-Default is 4.
-CLASS_0: This SVM pipeline creates binary classifiers. Set this list
-to the buowset classes (of the 6) that should map to 0; all unlisted
-classes will be 1. The default maps the 'no_buow' class (int 5) to 0,
-which is also the only balanced class distribution for making a binary
-SVM with buowset as is.
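A quick illustration of the default CLASS_0 mapping, mirroring get_binary_classes in make_svm.py below: with CLASS_0 = [5], the 'no_buow' class becomes 0 and every other class becomes 1:

import pandas as pd

labels = pd.Series([0, 1, 2, 3, 4, 5])
print((~labels.isin([5])).astype(int).tolist())  # [1, 1, 1, 1, 1, 0]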
- """ - metadata = pd.read_csv(meta, index_col=0) - embed_dict = obtain_birdnet_embeddings(embeds) - df_merged = merge_dfs(metadata, embed_dict) - - df_merged.to_pickle(path) - - print(f"Created dataframe file: {path}") - - -if __name__ == "__main__": - PARSER = argparse.ArgumentParser( - description='Input Directory Path' - ) - PARSER.add_argument('-meta', type=str, - help='Path to metadata with fold and label info.') - PARSER.add_argument('-embeds', type=str, - help='Path to directory containing embedding info.') - PARSER.add_argument('-path', type=str, - help='Path to output dataframe as .pkl.') - ARGS = PARSER.parse_args() - main(ARGS.meta, ARGS.embeds, ARGS.path) diff --git a/make_model/buowset/make_birdnet_svm.py b/make_model/buowset/make_birdnet_svm.py deleted file mode 100644 index afeb472..0000000 --- a/make_model/buowset/make_birdnet_svm.py +++ /dev/null @@ -1,48 +0,0 @@ -"""Create embedding dataframe as well as SVM - -Create the embedding and metadata combined dataframe for Birdnet -style embeddings output and create a binary SVM. - -Usage: - python3 make_birdnet_svm.py -meta /path/to/metadata.csv - -embed_path /path/to/birdnet/embeddings/ - [OPTIONAL] -model /path/to/saved/model.pkl -""" -import argparse -import pandas as pd - -from embed_to_df_birdnet import obtain_birdnet_embeddings, merge_dfs -from make_svm import get_binary_classes, make_svm, save_out_model - - -def main(meta, embed_path, model): - """Create svm from raw bridnet embeddings and metadata. - - Args: - embed_path (str): Path to birdnet embeddings files. - meta (str): Path to metadata containing fold and labels. - model (str): Path to desired save location of result model.pkl. - """ - metadata = pd.read_csv(meta, index_col=0) - embed_dict = obtain_birdnet_embeddings(embed_path) - df_merged = merge_dfs(metadata, embed_dict) - dataset = get_binary_classes(df_merged) - if model is None: - make_svm(dataset) - else: - svm = make_svm(dataset) - save_out_model(svm, model) - - -if __name__ == "__main__": - PARSER = argparse.ArgumentParser( - description='Input Directory Path' - ) - PARSER.add_argument('-meta', type=str, - help='Path to metadata with fold and label info.') - PARSER.add_argument('-embed_path', type=str, - help='Path to directory containing embedding info.') - PARSER.add_argument('-model', type=str, default=None, - help='Path to output dataframe as .pkl.') - ARGS = PARSER.parse_args() - main(ARGS.meta, ARGS.embed_path, ARGS.model) diff --git a/make_model/buowset/make_svm.py b/make_model/buowset/make_svm.py deleted file mode 100644 index ed3024a..0000000 --- a/make_model/buowset/make_svm.py +++ /dev/null @@ -1,142 +0,0 @@ -"""Make SVM with buowset data - -This script can be run to create an SVM from buowset data. - -If you already have an embeddings dataframe merged with the -fold and label metadata: - -Usage: python3 make_svm.py -embed_df /path/to/premade/df.pkl - -If you would like to save our your resulting model file, add - -model_file /path/to/save/model.pkl - -""" -import argparse -import pickle -import pandas as pd -from sklearn.svm import SVC -from sklearn.metrics import classification_report - - -# folds to use for training -TRAINING_FOLDS = [0, 1, 2, 3] -# fold to use for testing -TESTING_FOLDS = [4] -# no buow is 5th class, to be marked as 0 for a binary svm, nums not listed -# will be marked as 1 -CLASS_0 = [5] - - -def get_binary_classes(merged_df): - """Convert class labels to binary labels. - - Args: - merged_df (pd.DataFrame): Dataframe with embeddings, labels, folds. 
diff --git a/make_model/buowset/make_svm.py b/make_model/buowset/make_svm.py
deleted file mode 100644
index ed3024a..0000000
--- a/make_model/buowset/make_svm.py
+++ /dev/null
@@ -1,142 +0,0 @@
-"""Make SVM with buowset data.
-
-This script can be run to create an SVM from buowset data.
-
-If you already have an embeddings dataframe merged with the
-fold and label metadata:
-
-Usage: python3 make_svm.py -embed_df /path/to/premade/df.pkl
-
-If you would like to save out your resulting model file, add
-    -model_file /path/to/save/model.pkl
-
-"""
-import argparse
-import pickle
-import pandas as pd
-from sklearn.svm import SVC
-from sklearn.metrics import classification_report
-
-
-# folds to use for training
-TRAINING_FOLDS = [0, 1, 2, 3]
-# fold to use for testing
-TESTING_FOLDS = [4]
-# no_buow is the 5th class, to be marked as 0 for a binary svm; nums not
-# listed will be marked as 1
-CLASS_0 = [5]
-
-
-def get_binary_classes(merged_df):
-    """Convert class labels to binary labels.
-
-    Args:
-        merged_df (pd.DataFrame): Dataframe with embeddings, labels, folds.
-
-    Returns:
-        pd.DataFrame: Same input with a new binary label column added.
-    """
-    merged_df['binary_label'] = (~merged_df['label'].isin(CLASS_0)).astype(int)
-
-    return merged_df
-
-
-def make_x_and_y(embed_df):
-    """Create train and test split based on existing folds.
-
-    By default, this will create an 80% train 20% test split with
-    the 5th fold data as the test data, with the buow segments
-    as 1 and the no_buow segments as 0.
-
-    Args:
-        embed_df (pd.DataFrame): Filename, embeddings as floats,
-            and the label and fold for that file.
-
-    Returns:
-        np.ndarray: Training embeddings.
-        np.ndarray: Training labels.
-        np.ndarray: Testing embeddings.
-        np.ndarray: Testing labels.
-    """
-    train_df = embed_df[embed_df['fold'].isin(TRAINING_FOLDS)]
-    test_df = embed_df[embed_df['fold'].isin(TESTING_FOLDS)]
-
-    embedding_cols = embed_df.select_dtypes(include='float64').columns.tolist()
-
-    x_train = train_df[embedding_cols].values
-    y_train = train_df['binary_label'].values
-    x_test = test_df[embedding_cols].values
-    y_test = test_df['binary_label'].values
-
-    return x_train, y_train, x_test, y_test
-
-
-def make_svm(embeddings_df):
-    """Obtain embeddings, train test split, and create an SVM.
-
-    Args:
-        embeddings_df (pd.DataFrame): Dataframe with embeddings,
-            binary labels, and folds.
-
-    Returns:
-        sklearn.svm.SVC: Support vector machine model.
-    """
-    x_train, y_train, x_test, y_test = make_x_and_y(embeddings_df)
-    print("beginning model training")
-    svm = SVC(class_weight='balanced', probability=True)
-    svm.fit(x_train, y_train)
-
-    y_pred_default = svm.predict(x_test)
-
-    print("Classification report with default threshold:")
-    print(classification_report(y_test, y_pred_default))
-
-    return svm
-
-
-def save_out_model(svm, model_file):
-    """Saves model as a pkl.
-
-    If you'd like to save the model to use, you can optionally
-    provide a -model_file arg string. If you just want to see
-    the model metrics or are testing code, you may not want to
-    save the model each time.
-
-    Args:
-        svm (sklearn.svm.SVC): The support vector machine.
-        model_file (str): Path to where the model will be saved as .pkl.
-    """
-    with open(model_file, 'wb') as file:
-        pickle.dump(svm, file)
-
-    print(f"Saved model path: {model_file}")
-
-
-def main(embed_df, model_file):
-    """Main script to run.
-
-    Args:
-        embed_df (str): Merged dataframe with filename, embeddings, label,
-            and fold number.
-        model_file (str): Path to desired model output file, must be a .pkl.
-    """
-    dataset = pd.read_pickle(embed_df)
-
-    dataset = get_binary_classes(dataset)
-
-    if model_file is None:
-        make_svm(dataset)
-    else:
-        svm = make_svm(dataset)
-        save_out_model(svm, model_file)
-
-
-if __name__ == "__main__":
-    PARSER = argparse.ArgumentParser(
-        description='Input Directory Path'
-    )
-    PARSER.add_argument('-embed_df', type=str,
-                        help='Path to your premade embeddings dataframe.')
-    PARSER.add_argument('-model_file', type=str, default=None,
-                        help='File name and location of saved model.pkl.')
-    ARGS = PARSER.parse_args()
-    main(ARGS.embed_df, ARGS.model_file)
diff --git a/make_model/buowset/permutation_test.py b/make_model/buowset/permutation_test.py
deleted file mode 100644
index 6c1cc14..0000000
--- a/make_model/buowset/permutation_test.py
+++ /dev/null
@@ -1,66 +0,0 @@
-"""Permutation testing for Birdnet embedding buowset.
-
-Usage:
-    python3 permutation_test.py -meta /path/to/fold/metadata.csv
-        -embeds /path/to/dir/with/birdnet/embeddings/
-"""
-import argparse
-import pandas as pd
-from sklearn.svm import SVC
-from sklearn.metrics import accuracy_score
-import numpy as np
-from tqdm import tqdm
-from embed_to_df_birdnet import obtain_birdnet_embeddings, merge_dfs
-from make_svm import get_binary_classes, make_x_and_y
-
-
-def permutation_test(meta, embeds):
-    """Conduct permutation test.
-
-    Conducts a 100-iteration permutation test for whatever
-    data split is currently defined in make_x_and_y.
-
-    Args:
-        meta (str): Path to metadata with fold info.
-        embeds (str): Path to birdnet embeddings.
-    """
-    data = pd.read_csv(meta, index_col=0)
-    embed_dict = obtain_birdnet_embeddings(embeds)
-    df_merged = merge_dfs(data, embed_dict)
-    dataset = get_binary_classes(df_merged)
-    permuted_accuracies = []
-    permutation_iters = 100
-    x_train, y_train, x_test, y_test = make_x_and_y(dataset)
-    for _ in tqdm(range(permutation_iters), desc='Test Progress'):
-        np.random.shuffle(y_train)
-        np.random.shuffle(y_test)
-        svm = SVC(class_weight='balanced', probability=True)
-        svm.fit(x_train, y_train)
-        y_pred_default = svm.predict(x_test)
-        permuted_accuracies.append(accuracy_score(y_test,
-                                                  y_pred_default)*100)
-    print(f"Average permuted accuracy is: {np.mean(permuted_accuracies)}")
-
-
-def main(meta, embeds):
-    """Execute main script.
-
-    Runs the permutation test function.
-
-    Args:
-        meta (str): Path to metadata with fold info.
-        embeds (str): Path to birdnet embeddings.
-    """
-    permutation_test(meta, embeds)
-
-
-if __name__ == "__main__":
-    PARSER = argparse.ArgumentParser(
-        description='Input Directory Path'
-    )
-    PARSER.add_argument('-meta', type=str,
-                        help='Path to fold metadata')
-    PARSER.add_argument('-embeds', type=str,
-                        help='Path to directory with embeddings files')
-    ARGS = PARSER.parse_args()
-    main(ARGS.meta, ARGS.embeds)
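The script reports only the mean permuted accuracy; to turn this into a significance check, one common addition (sketched here with hypothetical values) is an empirical p-value comparing the permuted accuracies against the accuracy of the real, unshuffled split:

import numpy as np

# Hypothetical values: accuracies (%) from the permutation loop above,
# plus the accuracy of an SVM fit on the real (unshuffled) labels.
permuted_accuracies = np.array([52.1, 49.8, 50.5, 48.9, 51.2])
observed_accuracy = 85.0
p_value = (np.sum(permuted_accuracies >= observed_accuracy) + 1) / \
    (len(permuted_accuracies) + 1)
print(f"Empirical p-value: {p_value:.3f}")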
diff --git a/make_model/buowset/zero_pad_detections.py b/make_model/buowset/zero_pad_detections.py
deleted file mode 100644
index e362edf..0000000
--- a/make_model/buowset/zero_pad_detections.py
+++ /dev/null
@@ -1,95 +0,0 @@
-"""Zero padding human labeled detections.
-
-In order to obtain birdnet embeddings, each sample
-must be at least 3s long. The human labeled detections are
-often shorter than 3s. This script adds silence so that
-samples shorter than 3s reach 3s. Birdnet needs the files
-to exist at real paths in order to later obtain the embeddings,
-so we must save out these new files; we also duplicate the
-originals, so we are essentially creating a copy of the dataset
-but with no segments shorter than 3s.
-
-Usage: python3 zero_pad_detections.py -path /path/to/dir/wavs/
-    -output /path/to/new/dataset/ -length 3000 -randomize
-
-Omitting -length and -randomize will default to 3000ms and NON
-random padding (padding added to the end of the sample).
-"""
-import argparse
-import os
-import random
-from pydub import AudioSegment
-
-
-def pad_segments(path, output, length, randomize):
-    """Pad segments with silence to reach desired duration.
-
-    For segments shorter than the minimum duration, we add silence to
-    the end, or randomly at the beginning and end, to reach the desired
-    length.
-
-    Args:
-        path (str): Path to all of the audio segments.
-        output (str): Path to desired output for all segments
-            now lengthened.
-        length (int): Desired minimum duration of padded segments in ms.
-            Default 3000ms.
-        randomize (bool): Flag for if location of padded silence is
-            randomized within the length of the segment.
-    """
-    for file in os.listdir(path):
-        filepath = os.path.join(path, file)
-        audio = AudioSegment.from_wav(filepath)
-        if len(audio) < length:
-            if randomize:
-                max_begin_silence = length - len(audio)
-                begin_silence = random.uniform(0.0, max_begin_silence)
-                end_silence = length - (len(audio) + begin_silence)
-                begin_silence = AudioSegment.silent(duration=begin_silence)
-                end_silence = AudioSegment.silent(duration=end_silence)
-                padded = begin_silence + audio + end_silence
-            else:
-                # +1 ms guards against rounding just under the minimum
-                silence = AudioSegment.silent(duration=length-len(audio)+1)
-                padded = audio + silence  # Adding silence after the audio
-            full_path = os.path.join(output, file)
-            padded.export(full_path, format='wav')
-        else:
-            full_path = os.path.join(output, file)
-            audio.export(full_path, format='wav')
-
-
-def main(path, output, length, randomize):
-    """Execute main function.
-
-    Runs pad_segments.
-
-    Args:
-        path (str): Path to all of the audio segments.
-        output (str): Path to desired output for all segments
-            now lengthened to desired duration.
-        length (int): Minimum duration of the resulting audio
-            segments, in milliseconds.
-        randomize (bool): Flag for if the location of the padded
-            silence is randomized within the length
-            of the segment.
-    """
-    pad_segments(path, output, length, randomize)
-
-
-if __name__ == "__main__":
-    PARSER = argparse.ArgumentParser(
-        description='Input Directory Path'
-    )
-    PARSER.add_argument('-path', type=str,
-                        help='Path to dataset audio clips')
-    PARSER.add_argument('-output', type=str,
-                        help='Path to desired output directory for all clips.')
-    PARSER.add_argument('-length', type=int, default=3000,
-                        help='Minimum length(ms) of the clips, default 3000.')
-    PARSER.add_argument('-randomize', action='store_true',
-                        help='Randomize location of the audio amidst silence.')
-    ARGS = PARSER.parse_args()
-    main(ARGS.path, ARGS.output, ARGS.length, ARGS.randomize)
-
-# TODO: There are some buow vocalizations longer than 3s.
-# Currently, make_svm just truncates the birdnet embeddings longer than
-# one 3-second feature detection, but we should handle that here.
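A minimal sketch of the non-random padding branch above, using a silent pydub clip as a stand-in for a 1.8 s detection, to show the length invariant the script relies on:

from pydub import AudioSegment

clip = AudioSegment.silent(duration=1800)  # stand-in for a 1.8 s detection
padded = clip + AudioSegment.silent(duration=3000 - len(clip) + 1)
assert len(padded) >= 3000  # pydub lengths are in milliseconds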
diff --git a/make_model/label_embeddings.py b/make_model/label_embeddings.py
deleted file mode 100644
index 92f07f9..0000000
--- a/make_model/label_embeddings.py
+++ /dev/null
@@ -1,104 +0,0 @@
-"""Merge embeddings with labeled data.
-
-In order to make use of the birdnet embeddings for each sound
-file to train an svm (or any model), we want to ensure each
-labeled chunk contains the embedding information as well.
-You will need to have run all your audio through embeddings.py
-to obtain the embeddings for each of the sound files. This script
-then takes both the embeddings and the labeled output csvs from
-running parse_2017_data.py to create one csv that contains the human
-ground truth label as well as columns for each of the 1024 features
-per 3 second chunk.
-
-Example:
-
-    $ python label_embeddings.py /path/to/output/ \
-        /path/to/birdnet_embeddings/ /path/to/desired/outputs/
-
-"""
-
-import argparse
-import os
-import pandas as pd
-
-
-def main(human_labels, embeddings, output):
-    """Merge dataframes.
-
-    Args:
-        human_labels (str): The path to the human labeled csvs.
-        embeddings (str): The path to the Birdnet embeddings files.
-        output (str): The path to desired output directory.
-
-    """
-    for filename in os.listdir(human_labels):
-        file_path = os.path.join(human_labels, filename)
-        label_df = pd.read_csv(file_path)
-        # removesuffix removes the literal suffix; str.strip would
-        # strip any of its characters from both ends of the name
-        stripped_filename = filename.removesuffix("_chunks.csv")
-
-        for birdnet in os.listdir(embeddings):
-            stripped_birdnet = birdnet.removesuffix(
-                ".birdnet.embeddings.txt")
-
-            if stripped_birdnet == stripped_filename:
-                birdnet_path = os.path.join(embeddings, birdnet)
-                dfb = pd.read_csv(birdnet_path,
-                                  delimiter="[,\t]",
-                                  engine='python',
-                                  header=None)
-                dfb_stripped = dfb.drop(dfb.columns[:2], axis=1)
-                dfb_stripped.columns = [
-                    f"feature_{i}" for i in range(
-                        1, len(dfb_stripped.columns) + 1)]
-                df_stripped = compare_dfs(label_df, dfb_stripped)
-                combined_df = pd.concat([df_stripped, dfb_stripped], axis=1)
-                output_filename = stripped_filename + "_labeled_embeddings.csv"
-                output_path = os.path.join(output, output_filename)
-                combined_df.to_csv(output_path, index=False)
-                print(f"Labeled embeddings created for: {output_path}")
-
-            else:
-                continue
-
-
-def compare_dfs(label_df, dfb):
-    """Ensure same number of rows.
-
-    The human labeled data may contain one more row than the
-    Birdnet embeddings because Birdnet will ignore the end of a file
-    if there is not enough time for a full 3-second final chunk.
-    This will remove that last human labeled line as there will
-    be no embedding for it. It will throw an error if it's off
-    by more than one row because that should never be the case.
-
-    Args:
-        label_df (pandas.DataFrame): The human labeled dataframe.
-        dfb (pandas.DataFrame): The embeddings dataframe.
-
-    Returns:
-        pandas.DataFrame: The labeled dataframe with the
-            correct number of rows.
-
-    """
-    if abs(len(label_df) - len(dfb)) > 1:
-        raise ValueError("Dfs have a difference greater than 1 row")
-
-    if len(label_df) > len(dfb):
-        df_stripped = label_df.iloc[:-1]
-    else:
-        df_stripped = label_df
-
-    return df_stripped
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description='Input and output paths'
-    )
-    parser.add_argument('labels', type=str,
-                        help='Directory path to human labeled csvs.')
-    parser.add_argument('embeddings', type=str,
-                        help='Directory path to birdnet embeddings.')
-    parser.add_argument('output', type=str,
-                        help='Directory path to desired output csvs.')
-    args = parser.parse_args()
-    main(args.labels, args.embeddings, args.output)
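The suffix handling above originally used str.strip, which removes any of the given characters from both ends rather than a literal suffix; removesuffix (Python 3.9+) does what was intended. A hypothetical filename shows the difference:

name = "nest_chunks.csv"  # hypothetical labeled-output filename
print(name.strip("_chunks.csv"))         # 'est'  (strips matching characters)
print(name.removesuffix("_chunks.csv"))  # 'nest' (removes the literal suffix)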
diff --git a/make_model/make_svm.py b/make_model/make_svm.py
deleted file mode 100644
index ef1db5b..0000000
--- a/make_model/make_svm.py
+++ /dev/null
@@ -1,75 +0,0 @@
-"""Create Support Vector Machine Model.
-
-This script takes the labeled embeddings files, randomly divides them
-into a train and test set, trains a 2-class SVM, outputs the metrics,
-and saves the model to a file to be used later.
-
-Example:
-
-    $ python make_svm.py /path/to/labeled_embeddings.csv \
-        /path/to/desired/model/output.sav
-
-"""
-
-import argparse
-import os
-import pickle
-import pandas as pd
-from sklearn.model_selection import train_test_split
-from sklearn.svm import SVC
-from sklearn.metrics import classification_report
-
-
-def main(labeled_embeddings, saved_model):
-    """Create and save SVM.
-
-    Create the SVM, output the metrics, and save the model.
-
-    Args:
-        labeled_embeddings (str): The path to folder with labeled embeddings.
-        saved_model (str): The path where the model will be saved.
-
-    """
-    all_x = []
-    all_y = []
-    for embeddings_file in os.listdir(labeled_embeddings):
-        embeddings_path = os.path.join(labeled_embeddings, embeddings_file)
-        le_df = pd.read_csv(embeddings_path)
-        embed = le_df.drop(['Chunk Start', 'Chunk End', 'Label'], axis=1)
-        label = le_df['Label']
-        all_x.append(embed)
-        all_y.append(label)
-    combined_x = pd.concat(all_x, ignore_index=True)
-    combined_y = pd.concat(all_y, ignore_index=True)
-
-    print(f"Detection types in entire set: \n{combined_y.value_counts()}")
-
-    train_x, test_x, train_y, test_y = train_test_split(combined_x,
-                                                        combined_y,
-                                                        test_size=0.2,
-                                                        random_state=42)
-
-    svm = SVC(class_weight='balanced', probability=True)
-    svm.fit(train_x, train_y)
-
-    y_pred_default = svm.predict(test_x)
-
-    with open(saved_model, 'wb') as file:
-        pickle.dump(svm, file)
-
-    print("Classification report with default threshold:")
-    print(classification_report(test_y, y_pred_default))
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description='Input CSV and model output'
-    )
-    parser.add_argument('labeled_embeddings',
-                        type=str,
-                        help='Directory path to labels with embeddings.')
-    parser.add_argument('saved_model',
-                        type=str,
-                        help='Path to the saved model output.')
-    args = parser.parse_args()
-    main(args.labeled_embeddings, args.saved_model)
diff --git a/make_model/parse_2017_data.py b/make_model/parse_2017_data.py
deleted file mode 100644
index 9a27da7..0000000
--- a/make_model/parse_2017_data.py
+++ /dev/null
@@ -1,90 +0,0 @@
-"""Standardize the human labels.
-
-For each sound file of interest, a corresponding csv is created that
-contains labels for each 3 second chunk in the file, where 'yes' is
-marked on a time chunk if the human labels contain a vocalization in
-that time frame.
-
-Example:
-
-    $ python parse_2017_data.py /path/to/human_labels.csv \
-        /path/to/directory/of/wavs/ /path/to/directory/output/
-
-"""
-
-import argparse
-import os
-import pandas as pd
-import librosa
-
-
-def main(labels, wavs, output):
-    """Create human labeled dataframes.
-
-    Main script to create csvs of human labeled data for each
-    wav file of interest.
-
-    Args:
-        labels (str): The path to human labeled csv.
-        wavs (str): The path to all audio files.
-        output (str): The path to the directory where each csv will be
-            written (one per wav).
-
-    """
-    os.makedirs(output, exist_ok=True)
-
-    scored_data = pd.read_csv(labels)
-
-    for audio_file in os.listdir(wavs):
-        if audio_file.endswith('.wav'):
-            audio_path = os.path.join(wavs, audio_file)
-
-            try:
-                time_series, sample_rate = librosa.load(audio_path, sr=None)
-                audio_duration = librosa.get_duration(y=time_series,
-                                                      sr=sample_rate)
-            except Exception as err:
-                print(f"Error processing {audio_file}: {err}")
-                continue
-
-            total_chunks = int(audio_duration // 3) + 1
-            chunks_data = {
-                'Chunk Start': [i * 3 for i in range(total_chunks)],
-                'Chunk End': [(i + 1) * 3 for i in range(total_chunks)],
-                'Label': ['no'] * total_chunks
-            }
-            chunks_df = pd.DataFrame(chunks_data)
-
-            filtered_data = scored_data[scored_data['IN FILE'] == audio_file]
-            for _, row in filtered_data.iterrows():
-                if row['TOP1MATCH'] != 'null':
-                    start_time = float(row['OFFSET'])
-                    end_time = start_time + float(row['DURATION'])
-
-                    for i in range(len(chunks_df)):
-                        chunk_start = chunks_df.loc[i, 'Chunk Start']
-                        chunk_end = chunks_df.loc[i, 'Chunk End']
-                        if (start_time < chunk_end and end_time > chunk_start):
-                            chunks_df.loc[i, 'Label'] = 'yes'
-
-            output_file = os.path.join(
-                output, f'{os.path.splitext(audio_file)[0]}_chunks.csv'
-            )
-            chunks_df.to_csv(output_file, index=False)
-            print(f"Processed {audio_file} -> {output_file}")
-
-    print("Processing complete!")
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description='Input Directory Path'
-    )
-    parser.add_argument('labels', type=str,
-                        help='Path to human labeled csv')
-    parser.add_argument('wavs', type=str,
-                        help='Path to all wav files that have been labeled')
-    parser.add_argument('output', type=str,
-                        help='Path to desired directory for output csvs')
-    args = parser.parse_args()
-    main(args.labels, args.wavs, args.output)
diff --git a/pyproject.toml b/pyproject.toml
index 8b1b906..cdb7bf8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,6 +14,10 @@ dependencies = [
     "tqdm>=4.67.1",
 ]
 
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
 [project.optional-dependencies]
 cpu = [
     "torch>=2.7.0",
@@ -40,7 +44,13 @@ notebooks = [
 cu128 = "https://download.pytorch.org/whl/cu128"
 
 [tool.setuptools]
-packages = ["make_model", "assess_birdnet", "whoot_model_training"]
+packages = ["whoot"]
 
 [tool.uv.sources]
 pyha-analyzer = { git = "https://github.com/UCSD-E4E/pyha-analyzer-2.0.git", branch = "support_whoot" }
+whoot = { workspace = true }
+
+[dependency-groups]
+dev = [
+    "whoot",
+]
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..4de7737
--- /dev/null
+++ b/test.py
@@ -0,0 +1,3 @@
+from whoot import whoot_model_training
+
+print(whoot_model_training)
\ No newline at end of file
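One thing worth double-checking in the [tool.setuptools] change above: an explicit packages = ["whoot"] lists only the top-level package, so subpackages such as whoot.whoot_model_training are not picked up automatically. A quick check of what would be discovered (run from the repo root; illustrative only):

from setuptools import find_packages

# With packages = ["whoot"], only the first entry printed here is installed.
print(find_packages(include=["whoot", "whoot.*"]))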
diff --git a/whoot/__init__.py b/whoot/__init__.py
index c0ae2e0..a247e5d 100644
--- a/whoot/__init__.py
+++ b/whoot/__init__.py
@@ -1 +1,3 @@
+from . import whoot_model_training
 __version__ = "0.0.2.dev0"
+__all__ = ["whoot_model_training"]
\ No newline at end of file
diff --git a/whoot_model_training/whoot_model_training/__init__.py b/whoot/whoot_model_training/__init__.py
similarity index 100%
rename from whoot_model_training/whoot_model_training/__init__.py
rename to whoot/whoot_model_training/__init__.py
diff --git a/whoot_model_training/whoot_model_training/data_extractor/__init__.py b/whoot/whoot_model_training/data_extractor/__init__.py
similarity index 100%
rename from whoot_model_training/whoot_model_training/data_extractor/__init__.py
rename to whoot/whoot_model_training/data_extractor/__init__.py
diff --git a/whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py b/whoot/whoot_model_training/data_extractor/buowset_extractor.py
similarity index 100%
rename from whoot_model_training/whoot_model_training/data_extractor/buowset_extractor.py
rename to whoot/whoot_model_training/data_extractor/buowset_extractor.py
diff --git a/whoot_model_training/whoot_model_training/dataset.py b/whoot/whoot_model_training/dataset.py
similarity index 100%
rename from whoot_model_training/whoot_model_training/dataset.py
rename to whoot/whoot_model_training/dataset.py
diff --git a/whoot_model_training/whoot_model_training/logger.py b/whoot/whoot_model_training/logger.py
similarity index 100%
rename from whoot_model_training/whoot_model_training/logger.py
rename to whoot/whoot_model_training/logger.py
diff --git a/whoot_model_training/whoot_model_training/metrics.py b/whoot/whoot_model_training/metrics.py
similarity index 100%
rename from whoot_model_training/whoot_model_training/metrics.py
rename to whoot/whoot_model_training/metrics.py
diff --git a/whoot_model_training/whoot_model_training/models/__init__.py b/whoot/whoot_model_training/models/__init__.py
similarity index 100%
rename from whoot_model_training/whoot_model_training/models/__init__.py
rename to whoot/whoot_model_training/models/__init__.py
diff --git a/whoot_model_training/whoot_model_training/models/model.py b/whoot/whoot_model_training/models/model.py
similarity index 100%
rename from whoot_model_training/whoot_model_training/models/model.py
rename to whoot/whoot_model_training/models/model.py
diff --git a/whoot_model_training/whoot_model_training/models/timm_model.py b/whoot/whoot_model_training/models/timm_model.py
similarity index 100%
rename from whoot_model_training/whoot_model_training/models/timm_model.py
rename to whoot/whoot_model_training/models/timm_model.py
diff --git a/whoot_model_training/whoot_model_training/preprocessors/__init__.py b/whoot/whoot_model_training/preprocessors/__init__.py
similarity index 100%
rename from whoot_model_training/whoot_model_training/preprocessors/__init__.py
rename to whoot/whoot_model_training/preprocessors/__init__.py
diff --git a/whoot_model_training/whoot_model_training/preprocessors/base_preprocessor.py b/whoot/whoot_model_training/preprocessors/base_preprocessor.py
similarity index 100%
rename from whoot_model_training/whoot_model_training/preprocessors/base_preprocessor.py
rename to whoot/whoot_model_training/preprocessors/base_preprocessor.py
diff --git a/whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py b/whoot/whoot_model_training/preprocessors/spectrogram_preprocessors.py
similarity index 100%
rename from whoot_model_training/whoot_model_training/preprocessors/spectrogram_preprocessors.py
rename to whoot/whoot_model_training/preprocessors/spectrogram_preprocessors.py
diff --git a/whoot_model_training/whoot_model_training/trainer.py b/whoot/whoot_model_training/trainer.py
similarity index 100%
rename from whoot_model_training/whoot_model_training/trainer.py
rename to whoot/whoot_model_training/trainer.py
diff --git a/whoot_model_training/README.md b/whoot_model_training/README.md
deleted file mode 100644
index 639d457..0000000
--- a/whoot_model_training/README.md
+++ /dev/null
@@ -1,57 +0,0 @@
-Toolkit for training Machine Learning Classification Models over audio datasets.
-
-Key inspiration is https://github.com/UCSD-E4E/pyha-analyzer-2.0/tree/main. This repo differs in that it uses a traditional training pipeline rather than the Hugging Face Trainer. The Hugging Face Trainer abstracts away the training code, which should be explicit for this toolkit.
-
-
-# Install
-
-To set up the environment for model training:
-
-1) Run steps 1 - 3 of the installation instructions in `whoot/README.md`
-2) For step 4, specifically run `pip install -e ".[model-training,cpu]"` for cpu training, or `pip install -e ".[model-training,cu128]"` for training on Nvidia GPUs
-
-Note that you should check what is supported by CUDA on your machine. See developers if you need a different CUDA version.
-
-# Running
-
-0) Add your Comet-ML API key to your local environment. See https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/
-1) Create a copy of the config found in `configs/config.yml` and fill it out for your dataset. See the [config](#config) section.
-2) Edit train.py to set up training for your dataset. If you are using a new dataset for which an extractor does not exist, contact the code authors.
-3) Run `python train.py path/to/your/config/file.yml`
-
-# Config
-
-## Default Config Properties
-The properties of `config.yml` are as follows:
-### Data paths
-`metadata_csv`: the path to the metadata file for your dataset.
-`data_path`: Path to the highest level parent folder containing audio. Audio can be in a different path than the metadata!
-`hf_cache_path`: cache for Hugging Face. This path is created automatically as you run the script; it is where the new dataset files will go.
-
-### Required Variables
-`COMET_PROJECT_NAME`: "whoot", this is the comet-ml project the training run will log to.
-`CUDA_VISIBLE_DEVICES`: "0" or "0,1", this controls how many GPUs the training uses.
-`SUBPROJECT_NAME`: Some description to help filter what this training is used for; can be the task being done (multi_label_classification) or something else (fun_training_test).
-`DATASET_NAME`: Name of the dataset being trained on; will be embedded on comet_ml to make searching easier.
-
-## Project Specific config information
-### Buowset
-The filenames in metadata_csv are the audio files found in `data_path`.
-
-`SUBPROJECT_NAME` is either "binary" or "multilabelClass"
-`DATASET_NAME` is buowset0
-
-# Repo Philosophy
-
-The most challenging issue with machine learning is the dataset. This training repo intends to make it easy to modularize parts of the training pipeline, and integrate them together, ideally regardless of the dataset.
-
-The pipeline works in 5 parts:
-- Extractors: Extractors take in raw data and reformat it into `AudioDatasets`, apache-arrow data structures implemented via HuggingFace with common columns between any dataset. Every label is one_hot_encoded and treated as multilabel regardless of the problem. Audio filepaths are cast into [Audio columns](https://huggingface.co/docs/datasets/v3.6.0/en/package_reference/main_classes#datasets.Audio). Extractors are *unique for each dataset* but *uniform in the AudioDataset*.
-
-- Preprocessors: Online preprocessors take rows in `AudioDatasets` and output `ModelInputs`, formatted data specific to a given model. Preprocessors read AudioDatasets and translate them so the Model can read them.
-
-- Models: Models have defined `ModelInput` and `ModelOutput` formats. All ModelInputs and ModelOutputs have common data that they are required to have, such that the `PyhaTrainer` can understand how to feed information to the Model and how to read information from the model. All models implement their own loss functions and return a loss given labels.
-
-- Augmentations: TODO
-
-- PyhaTrainer: With few exceptions unrelated to bioacoustic classification, all PyTorch training code is the same. The HuggingFace Trainer and the extension PyhaTrainer handle most training scripts you will ever write. Why not use it and focus on model design, dataset preprocessing, and cleaning? As long as the trainer knows how to feed data into a model (`AudioDatasets` and `Preprocessors`) and how to read it (`ModelOutputs`), then it will have no issues.
\ No newline at end of file
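For reference, a config.yml for buowset following the properties above might look like the following (all values are placeholders); parse_config in train.py reads it with yaml.safe_load:

import yaml

example_config = yaml.safe_load("""
metadata_csv: /data/buowset/metadata.csv
data_path: /data/buowset/audio
hf_cache_path: /data/hf_cache
COMET_PROJECT_NAME: whoot
COMET_WORKSPACE: your-workspace
CUDA_VISIBLE_DEVICES: "0"
SUBPROJECT_NAME: binary
DATASET_NAME: buowset0
""")
print(example_config["DATASET_NAME"])  # buowset0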
diff --git a/whoot_model_training/train.py b/whoot_model_training/train.py
deleted file mode 100644
index aa31161..0000000
--- a/whoot_model_training/train.py
+++ /dev/null
@@ -1,168 +0,0 @@
-"""Trains a Multiclass Model with PyTorch and HuggingFace.
-
-This script can be used to run experiments with different
-models and datasets to create any model for bioacoustic classification.
-
-This script is intended to be heavily modified with each experiment
-(say one wants to use a different dataset: copy this and change the
-extractor!)
-
-Usage:
-    $ python train.py /path/to/config.yml
-
-config.yml should contain frequently changed hyperparameters
-"""
-import os
-import argparse
-import yaml
-
-from whoot_model_training.trainer import WhootTrainer, WhootTrainingArguments
-from whoot_model_training.data_extractor import buowset_extractor
-from whoot_model_training.models import TimmModel, TimmInputs, TimmModelConfig
-from whoot_model_training import CometMLLoggerSupplement
-
-from whoot_model_training.preprocessors import (
-    MelModelInputPreprocessor
-)
-
-# Uncomment for use with data augmentation
-# from pyha_analyzer.preprocessors import MixItUp, ComposeAudioLabel
-# from audiomentations import (
-#     Compose, AddColorNoise,
-#     AddBackgroundNoise, PolarityInversion, Gain
-# )
-
-
-def parse_config(config_path: str) -> dict:
-    """Wrapper to parse config.
-
-    Args:
-        config_path (str): path to config file for training!
-
-    Returns:
-        (dict): parsed hyperparameters
-    """
-    config = {}
-    with open(config_path, "r", encoding="UTF-8") as f:
-        config = yaml.safe_load(f)
-    return config
-
-
-def train(config):
-    """Highest level logic for training.
-
-    Does the following:
-    - Formats the dataset into an AudioDataset
-    - Prepares preprocessing for each audio clip
-    - Builds the model
-    - Configures and runs the trainer
-    - Runs evaluation
-
-    Args:
-        config (dict): the config used for training. Defined in yaml file
-    """
-
-    # Extract the dataset
-    ds = buowset_extractor(
-        metadata_csv=config["metadata_csv"],
-        parent_path=config["data_path"],
-        output_path=config["hf_cache_path"],
-    )
-
-    # Create the model
-    run_name = "efficientnet_b1_testing_confusion_matrix_no_data_aug"
-    model_config = TimmModelConfig(
-        timm_model="efficientnet_b1",
-        num_classes=ds.get_num_classes())
-    model = TimmModel(model_config)
-
-    # Preprocessors
-
-    # Uncomment if doing work with data augmentation
-    # # Augmentations
-    # wav_augs = ComposeAudioLabel([
-    #     # AddBackgroundNoise(  # We don't have background noise yet...
-    #     #     sounds_path="data_birdset/background_noise",
-    #     #     min_snr_db=10,
-    #     #     max_snr_db=30,
-    #     #     noise_transform=PolarityInversion(),
-    #     #     p=0.8
-    #     # ),
-    #     Gain(
-    #         min_gain_db=-12,
-    #         max_gain_db=12,
-    #         p=0.8
-    #     ),
-    #     MixItUp(
-    #         dataset_ref=ds["train"],
-    #         min_snr_db=10,
-    #         max_snr_db=30,
-    #         noise_transform=PolarityInversion(),
-    #         p=0.8
-    #     )
-    # ])
-
-    # Online preprocessors prepare data for training
-    train_preprocessor = MelModelInputPreprocessor(
-        TimmInputs, duration=3
-    )
-
-    preprocessor = MelModelInputPreprocessor(
-        TimmInputs, duration=3
-    )
-
-    ds["train"].set_transform(train_preprocessor)
-    ds["valid"].set_transform(preprocessor)
-    ds["test"].set_transform(preprocessor)
-
-    # Run training
-    training_args = WhootTrainingArguments(
-        run_name=run_name,
-        subproject_name=config["SUBPROJECT_NAME"],
-        dataset_name=config["DATASET_NAME"],
-    )
-
-    # COMMON OPTIONAL ARGS
-    training_args.num_train_epochs = 2
-    training_args.eval_steps = 100
-    training_args.per_device_train_batch_size = 32
-    training_args.per_device_eval_batch_size = 32
-    training_args.dataloader_num_workers = 36
-    training_args.run_name = run_name
-
-    trainer = WhootTrainer(
-        model=model,
-        dataset=ds,
-        training_args=training_args,
-        logger=CometMLLoggerSupplement(
-            augmentations=None,
-            name=training_args.run_name
-        ),
-    )
-
-    trainer.train()
-    model.save_pretrained("model_checkpoints/test")
-
-
-def init_env(config: dict):
-    """Sets up local environment for COMET-ML training logging.
-
-    Args:
-        config (dict): at a minimum this has the project name
-            and CUDA devices that are allowed to be used.
-    """
-    print(config)
-    os.environ["COMET_PROJECT_NAME"] = config["COMET_PROJECT_NAME"]
-    os.environ["CUDA_VISIBLE_DEVICES"] = config["CUDA_VISIBLE_DEVICES"]
-    check_for_comet = config.get("COMET_WORKSPACE") is not None
-    assert check_for_comet, "Make sure to add a COMET_WORKSPACE to config"
-    os.environ["COMET_WORKSPACE"] = config["COMET_WORKSPACE"]
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Input config path")
-    parser.add_argument("config", type=str, help="Path to config.yml")
-    args = parser.parse_args()
-    _config = parse_config(args.config)
-
-    init_env(_config)
-    train(_config)
diff --git a/whoot_model_training/train_binary.py b/whoot_model_training/train_binary.py
deleted file mode 100644
index a86c867..0000000
--- a/whoot_model_training/train_binary.py
+++ /dev/null
@@ -1,162 +0,0 @@
-"""Trains a Binary Classification Model with PyTorch and HuggingFace.
-
-This script can be used to run experiments with different
-models and datasets to create any model for bioacoustic classification.
-
-This script is intended to be heavily modified with each experiment
-(say one wants to use a different dataset: copy this and change the
-extractor!)
-
-Usage:
-    $ python train_binary.py /path/to/config.yml
-
-config.yml should contain frequently changed hyperparameters
-"""
-import os
-import argparse
-import yaml
-
-from whoot_model_training.trainer import WhootTrainer, WhootTrainingArguments
-from whoot_model_training.data_extractor import buowset_binary_extractor
-from whoot_model_training.models import TimmModel, TimmInputs
-from whoot_model_training import CometMLLoggerSupplement
-
-from whoot_model_training.preprocessors import (
-    MelModelInputPreprocessor
-)
-
-# Uncomment for use with data augmentation
-# from pyha_analyzer.preprocessors import MixItUp, ComposeAudioLabel
-# from audiomentations import (
-#     Compose, AddColorNoise,
-#     AddBackgroundNoise, PolarityInversion, Gain
-# )
-
-
-def parse_config(config_path: str) -> dict:
-    """Wrapper to parse config.
-
-    Args:
-        config_path (str): path to config file for training!
-
-    Returns:
-        (dict): parsed hyperparameters
-    """
-    config = {}
-    with open(config_path, "r", encoding="UTF-8") as f:
-        config = yaml.safe_load(f)
-    return config
-
-
-def train(config):
-    """Highest level logic for training.
-
-    Does the following:
-    - Formats the dataset into an AudioDataset
-    - Prepares preprocessing for each audio clip
-    - Builds the model
-    - Configures and runs the trainer
-    - Runs evaluation
-
-    Args:
-        config (dict): the config used for training. Defined in yaml file
-    """
-
-    # Extract the dataset
-    ds = buowset_binary_extractor(
-        metadata_csv=config["metadata_csv"],
-        parent_path=config["data_path"],
-        output_path=config["hf_cache_path"],
-    )
-
-    # Create the model
-    run_name = "efficientnet_b1_testing_confusion_matrix_no_data_aug"
-    model = TimmModel(timm_model="efficientnet_b1",
-                      num_classes=ds.get_num_classes())
-
-    # Preprocessors
-
-    # Uncomment if doing work with data augmentation
-    # # Augmentations
-    # wav_augs = ComposeAudioLabel([
-    #     # AddBackgroundNoise(  # We don't have background noise yet...
-    #     #     sounds_path="data_birdset/background_noise",
-    #     #     min_snr_db=10,
-    #     #     max_snr_db=30,
-    #     #     noise_transform=PolarityInversion(),
-    #     #     p=0.8
-    #     # ),
-    #     Gain(
-    #         min_gain_db=-12,
-    #         max_gain_db=12,
-    #         p=0.8
-    #     ),
-    #     MixItUp(
-    #         dataset_ref=ds["train"],
-    #         min_snr_db=10,
-    #         max_snr_db=30,
-    #         noise_transform=PolarityInversion(),
-    #         p=0.8
-    #     )
-    # ])
-
-    # Online preprocessors prepare data for training
-    train_preprocessor = MelModelInputPreprocessor(
-        TimmInputs, duration=3
-    )
-
-    preprocessor = MelModelInputPreprocessor(
-        TimmInputs, duration=3
-    )
-
-    ds["train"].set_transform(train_preprocessor)
-    ds["valid"].set_transform(preprocessor)
-    ds["test"].set_transform(preprocessor)
-
-    # Run training
-    training_args = WhootTrainingArguments(
-        run_name=run_name,
-        subproject_name=config["SUBPROJECT_NAME"],
-        dataset_name=config["DATASET_NAME"],
-    )
-
-    # COMMON OPTIONAL ARGS
-    training_args.num_train_epochs = 2
-    training_args.eval_steps = 20
-    training_args.per_device_train_batch_size = 32
-    training_args.per_device_eval_batch_size = 32
-    training_args.dataloader_num_workers = 36
-    training_args.run_name = run_name
-
-    trainer = WhootTrainer(
-        model=model,
-        dataset=ds,
-        training_args=training_args,
-        logger=CometMLLoggerSupplement(
-            augmentations=None,
-            name=training_args.run_name
-        ),
-    )
-
-    trainer.train()
-
-
-def init_env(config: dict):
-    """Sets up local environment for COMET-ML training logging.
-
-    Args:
-        config (dict): at a minimum this has the project name
-            and CUDA devices that are allowed to be used.
- """ - print(config) - os.environ["COMET_PROJECT_NAME"] = config["COMET_PROJECT_NAME"] - os.environ["CUDA_VISIBLE_DEVICES"] = config["CUDA_VISIBLE_DEVICES"] - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Input config path") - parser.add_argument("config", type=str, help="Path to config.yml") - args = parser.parse_args() - _config = parse_config(args.config) - - init_env(_config) - train(_config)