conservationtechlab · kgarwoodsdzwa · Jun 30, 2025 · Jun 30, 2025 · Jul 1, 2025 · Jul 3, 2025
diff --git a/assess_birdnet/README.md b/assess_birdnet/README.md
@@ -0,0 +1,36 @@
+Tools to assess BirdNET performance on Buowset
+
+Segments must be at least 3s long in order to be assessed by BirdNET.
+buowsetv1.0 needs padding, but buowsetv1.1 segments are already 3s without
+artificial padding. Once the segments are 3s long, run BirdNET analyze over
+the entire dataset with the desired confidence threshold and with burrowing
+owl as the only class in the species list.
+
+To run Birdnet over your dataset, follow the instructions in this repo:
+https://birdnet-team.github.io/BirdNET-Analyzer/usage/cli.html#birdnet-analyzer-analyze
+
+We created our own class list with only our species of interest and ran
+birdnet_analyzer.analyze over the entire dataset, beginning with default
+confidence and sensitivity values. You can adjust these values and rerun 
+to obtain a comparison of performance across different confidence thresholds
+and sensitivity. BirdNET will give you a result text file for each audio file
+in your dataset; we had these text files saved to the same directory as the
+audio.
+
+Running aggregate_birdnet_buowset.py with the path to the BirdNET results and
+a .pkl file to send the result to will create a dataframe with the name of the
+wav file and a 0 for no buow and a 1 for yes buow detected by BirdNET. 
+
+Then running buowset_assess_birdnet.py with the aforementioned .pkl, the
+metadata file for buowset, and some optional parameters, you can compare the
+performance of BirdNET against the ground truth labels of buowset. By adding
+the optional arguments, you go from comparing BirdNET as a burrowing owl/
+no burrowing owl detector to assessing the BirdNET performance on a class by
+class basis. If you select to assess for the 'Coocoo' class for example, 
+it will aggregate all coocoo instances based on ground truth, and obtain
+an equal amount of randomly selected no_buow samples, and generate a confusion
+matrix comparing if BirdNET marked the instances of that class as burrowing owl. 
+
+Because BirdNET is a binary classifier for burrowing owl in this data, a class
+by class comparison only tells us if it disproportionately misses certain calls
+more than others when looking for burrowing owls in general, i.e., it gives us
+a peek into the likely call distribution of BirdNET's training data.
diff --git a/assess_birdnet/aggregate_birdnet_buowset.py b/assess_birdnet/aggregate_birdnet_buowset.py
@@ -0,0 +1,75 @@
+"""Create a dataframe out of birdnet results.
+
+When you run BirdNET analyze on wav files, it outputs one
+result txt file per wav. We aggregate all of the results
+into a single dataframe and save it out so we can
+reference it later when we analyze the birdnet results
+for buowset.
+
+Usage:
+    python3 aggregate_birdnet_buowset.py /path/to/birdnet/
+    analyzer/folder/ /path/to/output.pkl
+"""
+import argparse
+import glob
+import os
+import ntpath
+import pandas as pd
+
+
+def parse_birdnet_analysis(birdnet):
+    """Create one dataframe from the individual birdnet result files.
+
+    Every ``*BirdNET.selection.table.txt`` file directly inside
+    ``birdnet`` is read as a tab-separated table. A file is labeled 1
+    when any of its rows has the species code ``burowl`` (compared
+    case-insensitively) and 0 otherwise.
+
+    Args:
+        birdnet (str): Path to the folder holding the birdnet results.
+
+    Returns:
+        pandas.DataFrame: One row per result file, indexed by the wav
+            filename (index name ``segment``) with a single column
+            ``bn_label`` holding 0 or 1. Rows are ordered by result
+            file path.
+    """
+    suffix = "BirdNET.selection.table.txt"
+    bn_dict = {}
+    burowl_count = 0
+    # Only pick up selection tables: the results may share a folder with
+    # other .txt files, which have no 'Species Code' column. Sorting
+    # makes the row order independent of the filesystem.
+    result_files = sorted(glob.glob(os.path.join(birdnet, "*" + suffix)))
+    for txt_file in result_files:
+        filename = ntpath.basename(txt_file)
+        # "<name>.BirdNET.selection.table.txt" -> "<name>.wav"
+        filename = filename[:-len(suffix)] + "wav"
+        with open(txt_file, 'r') as file:
+            header = file.readline().strip().split('\t')
+            data = pd.read_csv(file, header=None, names=header, delimiter='\t')
+        # astype(str) keeps the comparison valid when the column holds
+        # missing values and therefore is not a string column.
+        species = data['Species Code'].astype(str).str.lower()
+        if species.eq('burowl').any():
+            bn_dict[filename] = 1
+            burowl_count += 1
+            print(f"New burowl count is {burowl_count}")
+        else:
+            bn_dict[filename] = 0
+    print("finished dict")
+    birdnet_df = pd.DataFrame.from_dict(bn_dict,
+                                        orient='index',
+                                        columns=['bn_label'])
+    birdnet_df.index.name = 'segment'
+    return birdnet_df
+
+
+def main(birdnet, output):
+    """Aggregate the birdnet results, pickle them, and print them.
+
+    Args:
+        birdnet (str): Path to the folder of birdnet results files.
+        output (str): Path of the .pkl file to write the dataframe to.
+    """
+    aggregated = parse_birdnet_analysis(birdnet)
+    # Persist first, then echo the table so the user can eyeball it.
+    aggregated.to_pickle(output)
+    print(aggregated)
+
+
+if __name__ == '__main__':
+    # Two positional arguments: the folder of BirdNET result files and
+    # the path the aggregated pickle is written to.
+    PARSER = argparse.ArgumentParser(description='Input CSV and model output')
+    PARSER.add_argument(
+        'birdnet_analysis',
+        type=str,
+        help='Path to Birdnet analysis folder.',
+    )
+    PARSER.add_argument(
+        'output',
+        type=str,
+        help='Path to desired output for result.',
+    )
+    ARGS = PARSER.parse_args()
+    main(birdnet=ARGS.birdnet_analysis, output=ARGS.output)
diff --git a/assess_birdnet/buowset_assess_birdnet.py b/assess_birdnet/buowset_assess_birdnet.py
@@ -0,0 +1,188 @@
+"""Assess Birdnet performance on Buowset.
+
+This allows for assessing how well Birdnet performs on
+buowset for burrowing owl/no burrowing owl, and for
+the individual call types within our labeled data.
+
+Usage:
+    python3 buowset_assess_birdnet.py /path/to/birdnet_results.pkl
+    /path/to/buowset/metadata.csv [-single_class -assess_class N]
+"""
+import argparse
+import random
+import pandas as pd
+from sklearn.metrics import confusion_matrix, accuracy_score
+from sklearn.metrics import recall_score, f1_score, precision_score
+from comet_ml import Experiment
+
+
+def organize_birdnet_output(birdnet_results):
+    """Load the aggregated birdnet results from a pickle file.
+
+    Args:
+        birdnet_results (string): Path to the .pkl aggregated results.
+
+    Returns:
+        pd.DataFrame: Whatever object the pickle holds, expected to be
+            the aggregated birdnet dataframe.
+    """
+    return pd.read_pickle(birdnet_results)
+
+
+def merge_metadata(metadata, birdnet_df):
+    """Join the ground truth metadata with the birdnet results.
+
+    The metadata csv is read with its first column as the index and
+    merged with ``birdnet_df`` on ``segment``. The
+    ``segment_duration_s`` and ``fold`` columns are removed from the
+    result.
+
+    Args:
+        metadata (string): Path to metadata file.
+        birdnet_df (pd.DataFrame): The aggregated birdnet results.
+
+    Returns:
+        pd.DataFrame: The merged rows without the dropped columns.
+    """
+    ground_truth = pd.read_csv(metadata, index_col=0)
+    combined = ground_truth.merge(birdnet_df, on='segment')
+    return combined.drop(columns=['segment_duration_s', 'fold'])
+
+
+def map_binary_labels(merged_data):
+    """Build the true and predicted arrays for buow/no buow.
+
+    Ground truth labels 0 through 4 are mapped to 1 and label 5 is
+    mapped to 0. The birdnet labels are used as they are.
+
+    Args:
+        merged_data (pd.DataFrame): The birdnet label and ground truth
+            merged on segment name; needs the columns ``label`` and
+            ``bn_label``.
+
+    Returns:
+        tuple: ``(y_true, y_pred)`` as arrays with one entry per row.
+    """
+    to_binary = {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 0}
+    truth = merged_data['label'].map(to_binary)
+    predicted = merged_data['bn_label']
+
+    return truth.values, predicted.values
+
+
+def map_class_labels(merged_data, assess_class):
+    """Create binary class vs no buow assessment for 1 class.
+
+    All rows labeled ``assess_class`` are kept and paired with an equal
+    number of randomly selected rows labeled 5 (no_buow), drawn without
+    replacement. When there are fewer no_buow rows than class rows,
+    every no_buow row is used.
+
+    Args:
+        merged_data (pd.DataFrame): The birdnet label and ground truth
+            merged on segment name; needs the columns ``label`` and
+            ``bn_label``.
+        assess_class (int): The number associated with the specific
+            vocalization type to be assessed.
+
+    Returns:
+        tuple: ``(y_true, y_pred)`` as arrays, no_buow rows first.
+            ``y_true`` is 1 for the assessed class and 0 for no_buow.
+
+    Raises:
+        ValueError: If ``assess_class`` is the no_buow label or no row
+            carries the ``assess_class`` label.
+    """
+    no_buow_label = 5
+    if assess_class == no_buow_label:
+        raise ValueError(
+            "assess_class must be a vocalization class, not the "
+            f"no_buow label {no_buow_label}."
+        )
+    class_only = merged_data[merged_data['label'] == assess_class]
+    if class_only.empty:
+        raise ValueError(
+            f"No segments with label {assess_class!r} to assess."
+        )
+    no_buow_only = merged_data[merged_data['label'] == no_buow_label]
+    # Sample positions over the full no_buow range so every row can be
+    # chosen, and cap the draw at the number of no_buow rows available.
+    sample_size = min(len(class_only), len(no_buow_only))
+    index_no_buow = random.sample(range(len(no_buow_only)), sample_size)
+    no_buow_subset = no_buow_only.iloc[index_no_buow]
+    merged = pd.concat([no_buow_subset, class_only], ignore_index=True)
+    y_true = merged['label'].map({assess_class: 1, no_buow_label: 0}).values
+    y_pred = merged['bn_label'].values
+
+    return y_true, y_pred
+
+
+def assess_birdnet(y_true, y_pred, experiment=None):
+    """Assess Birdnet against ground truth labels.
+
+    Computes the confusion matrix, accuracy, precision, recall and F1
+    score, prints them, and logs them to ``experiment`` when one is
+    given.
+
+    Args:
+        y_true (array-like): Ground truth labels, 0 for no detection
+            and 1 for detection.
+        y_pred (array-like): Birdnet labels for the same rows, 0 or 1.
+        experiment: Optional object providing ``log_metric`` and
+            ``log_confusion_matrix`` (the comet experiment). Nothing is
+            logged when it is None.
+    """
+    # Pin both classes so the matrix is always 2x2 and lines up with
+    # the two display labels below, even if one class never occurs.
+    confusion_m = confusion_matrix(y_true, y_pred, labels=[0, 1])
+    accuracy = accuracy_score(y_true, y_pred)
+    precision = precision_score(y_true, y_pred)
+    recall = recall_score(y_true, y_pred)
+    f1_result = f1_score(y_true, y_pred)
+
+    print("Confusion Matrix:")
+    print(confusion_m)
+    print(f"Accuracy: {accuracy:.4f}")
+    print(f"Precision: {precision:.4f}")
+    print(f"Recall: {recall:.4f}")
+    print(f"F1 Score: {f1_result:.4f}")
+
+    if experiment:
+        experiment.log_metric("accuracy", accuracy)
+        experiment.log_metric("precision", precision)
+        experiment.log_metric("recall", recall)
+        experiment.log_metric("f1_score", f1_result)
+        experiment.log_confusion_matrix(matrix=confusion_m.tolist(),
+                                        labels=["No Detection", "Detection"])
+
+
+def create_comet_exp():
+    """Prompt for the comet settings and create the experiment.
+
+    The project name, workspace and experiment name are read from
+    standard input, in that order.
+
+    Returns:
+        Experiment: The new experiment, named and tagged.
+    """
+    project_name = input("Enter the comet project name: ")
+    workspace = input("Enter the comet workspace: ")
+    run_name = input("Enter the name of this experiment: ")
+
+    comet_experiment = Experiment(project_name=project_name,
+                                  workspace=workspace)
+    comet_experiment.set_name(run_name)
+    comet_experiment.add_tags(["burrowl", "birdnet", "binary-classification"])
+
+    return comet_experiment
+
+
+def main(birdnet_results, metadata, single_class, assess_class):
+    """Run the birdnet assessment and log it to comet.
+
+    Args:
+        birdnet_results (string): Path to the .pkl of the aggregated
+            birdnet results.
+        metadata (string): Path to the metadata.csv
+        single_class (bool): False for the buow/no buow assessment,
+            anything else for assessing an individual class.
+        assess_class (int): The class number to be assessed when an
+            individual class is requested.
+    """
+    print("Starting")
+    comet_experiment = create_comet_exp()
+
+    print("Aggregating BirdNET results.")
+    bn_results = organize_birdnet_output(birdnet_results)
+    print(f"Aggregated {len(bn_results)} BirdNET results.")
+
+    print("Matching ground truth labels to BirdNET results.")
+    labeled = merge_metadata(metadata, bn_results)
+
+    print("Comparing BirdNET labels to ground truth.")
+    if single_class is not False:
+        print(f"Assessing performance of Birdnet on: {assess_class}")
+        label_pair = map_class_labels(labeled, assess_class)
+    else:
+        print("Doing binary buow/no_buow assessment")
+        label_pair = map_binary_labels(labeled)
+    y_true, y_pred = label_pair
+    assess_birdnet(y_true, y_pred, experiment=comet_experiment)
+
+
+if __name__ == '__main__':
+    # Positional: the aggregated results .pkl and the buowset metadata
+    # csv. Optional: -single_class together with -assess_class to
+    # assess one vocalization class instead of buow/no buow.
+    PARSER = argparse.ArgumentParser(
+        description='Input Directory Path'
+        )
+    PARSER.add_argument('birdnet_results',
+                        type=str,
+                        help='Path to Birdnet results for padded buowset.')
+    PARSER.add_argument('metadata',
+                        type=str,
+                        help='Path to buowset metadata file.')
+    PARSER.add_argument('-single_class', action='store_true',
+                        help='Call for individual class assessment.')
+    PARSER.add_argument('-assess_class', default=None, type=int,
+                        help='Which class would you like to assess?')
+    ARGS = PARSER.parse_args()
+    # -assess_class defaults to None, which matches no label; stop with
+    # a usage error rather than starting a run that has no class to
+    # assess.
+    if ARGS.single_class and ARGS.assess_class is None:
+        PARSER.error('-single_class requires -assess_class.')
+    main(ARGS.birdnet_results,
+         ARGS.metadata,
+         ARGS.single_class,
+         ARGS.assess_class)
diff --git a/assess_birdnet/tools/README.md b/assess_birdnet/tools/README.md
@@ -0,0 +1,9 @@
+Older tools to assess birdnet against the human labeled burrowing owl data from 2017-2018.
+
+These tools assess birdnet by splitting all audio into 3s chunks, regardless of where
+a labeled detection occurred, and adding the human labels onto the 3s chunks after this
+chunking occurs, if the detection window has ANY overlap with a 3s segment. They then
+compare this on an individual wav file basis to the birdnet results for the same data.
+
+We have since moved on to assessing BirdNET on Buowset, the dataset created out of our
+human labeled burrowing owl data. 
diff --git a/assess_birdnet/aggregate_audio_analysis.py → ...birdnet/tools/aggregate_split_bn_audio.py b/assess_birdnet/aggregate_audio_analysis.py → ...birdnet/tools/aggregate_split_bn_audio.py
diff --git a/assess_birdnet/assess_performance.py → assess_birdnet/tools/assess_performance.py b/assess_birdnet/assess_performance.py → assess_birdnet/tools/assess_performance.py
diff --git a/assess_birdnet/normalize_birdnet_output.py → ...birdnet/tools/normalize_birdnet_output.py b/assess_birdnet/normalize_birdnet_output.py → ...birdnet/tools/normalize_birdnet_output.py
diff --git a/assess_birdnet/normalize_scored_output.py → ..._birdnet/tools/normalize_scored_output.py b/assess_birdnet/normalize_scored_output.py → ..._birdnet/tools/normalize_scored_output.py
@@ -71,8 +71,18 @@ def mark_intervals(row, chunks_df):
     start_chunk = int(start_time // 3)
     end_chunk = int(end_time // 3)
 
-    if row['TOP1MATCH'] != 'null':
-        chunks_df.loc[start_chunk:end_chunk, 'Label'] = 'yes'
+    row_lower['MANUAL ID*'] = row['MANUAL ID*'].str.lower()
+
+    if row_lower['MANUAL ID*'] == 'cluck':
+        chunks_df.loc[start_chunk:end_chunk, 'Label'] = '0'
+    elif row_lower['MANUAL ID*'] == 'coocoo':
+        chunks_df.loc[start_chunk:end_chunk, 'Label'] = '1'
+    elif row_lower['MANUAL ID*'] == 'twitter':
+        chunks_df.loc[start_chunk:end_chunk, 'Label'] = '2'
+    elif row_lower['MANUAL ID*'] == 'alarm':
+        chunks_df.loc[start_chunk:end_chunk, 'Label'] = '3'
+    elif row_lower['MANUAL ID*'] == 'chick begging':
+        chunks_df.loc[start_chunk:end_chunk, 'Label'] = '4'
 
 
 if __name__ == '__main__':