Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -229,12 +229,13 @@ settings.json
# Block all configs besides the example config
whoot_model_training/configs
!whoot_model_training/configs/config.yml

# Block demos
*.csv
*.ipynb
*.json
demos/
*.ipynb

# Block predictions
predictions/*
*.pkl
*.arrow
*.arrow
224 changes: 224 additions & 0 deletions data_downloader/downloader_demo.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "3842a3a9",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"import requests\n",
"from xc import XenoCantoDownloader\n",
"from dotenv import load_dotenv\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"# Load environment variables from the .env file\n",
"load_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa77bd28",
"metadata": {},
"outputs": [],
"source": [
"xcd = XenoCantoDownloader(api_key=os.environ[\"XC_API_KEY\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3ee1304b",
"metadata": {},
"outputs": [],
"source": [
"query = xcd.build_query()\n",
"res = xcd.get_page(query)\n",
"res"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9fed0106",
"metadata": {},
"outputs": [],
"source": [
"data = xcd(query=\"box:32.485,-117.582,33.482,-115.228\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a43fcef6",
"metadata": {},
"outputs": [],
"source": [
"\n",
"with open(\"xc_meta.json\", mode=\"w\") as f:\n",
" json.dump(data, f, indent=4)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0942a027",
"metadata": {},
"outputs": [],
"source": [
"req = requests.get(data[0][\"recordings\"][0][\"file\"])\n",
"req"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d62a7a6",
"metadata": {},
"outputs": [],
"source": [
"data[0][\"recordings\"][0][\"file\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dcc63419",
"metadata": {},
"outputs": [],
"source": [
"req.content"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "30af4509",
"metadata": {},
"outputs": [],
"source": [
"import shutil\n",
"import os\n",
"from pathlib import Path\n",
"from multiprocessing.pool import ThreadPool\n",
"\n",
"# https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests\n",
"def download_file(url, local_filename, dry_run=False):\n",
" if os.path.exists(local_filename):\n",
" return local_filename\n",
"\n",
" try:\n",
" with requests.get(url, stream=True) as r:\n",
" with open(local_filename, 'wb') as f:\n",
" if not dry_run:\n",
" shutil.copyfileobj(r.raw, f)\n",
" else:\n",
" print(local_filename)\n",
"\n",
" return local_filename\n",
" except IOError as e:\n",
" print(e, flush=True)\n",
" return None\n",
"\n",
"def download_files(xcd, data, parent_folder=\"data/xeno-canto\", workers = 4):\n",
" def prep_download(args):\n",
" url = args[0]\n",
" file_path = args[1]\n",
" return download_file(url, file_path)\n",
"\n",
" os.makedirs(parent_folder, exist_ok=True)\n",
"\n",
" if \"recordings\" in data[0]:\n",
" data = xcd.concat_recording_data(data) \n",
" download_data = [\n",
" (recording[\"file\"], Path(parent_folder) / Path(recording[\"file-name\"]))\n",
" for recording in data\n",
" ]\n",
" pool = ThreadPool(workers)\n",
" results = pool.imap_unordered(prep_download, download_data) \n",
" pool.close()\n",
" return results\n",
"\n",
"download_files(xcd, data)"
]
},
{
"cell_type": "markdown",
"id": "ea02004c",
"metadata": {},
"source": [
"# Study"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5bf99a36",
"metadata": {},
"outputs": [],
"source": [
"recordings = xcd.concat_recording_data(data)\n",
"df = pd.DataFrame(recordings)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4e26ec8",
"metadata": {},
"outputs": [],
"source": [
"sns.histplot(df[\"en\"].value_counts())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9d7dc37",
"metadata": {},
"outputs": [],
"source": [
"plt.ylabel(\"Number of Species\")\n",
"plt.xlabel(\"Number of Indivuals Per Species\")\n",
"plt.title(\"Do We Have a Few-shot Learning Problem for XC in Southern California?\")\n",
"df[\"en\"].value_counts().hist()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c04ef6f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "whoot",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
123 changes: 123 additions & 0 deletions data_downloader/xc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
"""Xeno-Canto Data Metadata Downloader and Search Module."""
import os
import urllib.parse
import json
import requests


class XenoCantoDownloader():
    """Handler for the Xeno-Canto API (v3 recordings endpoint).

    Note: Requires an API key, either passed explicitly or taken from the
    "XC_API_KEY" environment variable.
    """

    def __init__(self, api_key=None):
        """Creates the Xeno-Canto Downloader.

        Args:
            api_key (str): API key for Xeno-Canto API.
                If None, looks for env var "XC_API_KEY".

        Raises:
            ValueError: If no key is given and "XC_API_KEY" is unset.
        """
        self.endpoint_url = "https://xeno-canto.org/api/3/recordings"
        # Use .get() so a missing env var falls through to the explicit
        # error below instead of raising a bare KeyError (the original
        # os.environ[...] lookup made the follow-up check unreachable).
        self.api_key = os.environ.get("XC_API_KEY") if api_key is None else api_key
        if self.api_key is None:
            # An explicit raise instead of `assert`: asserts are stripped
            # when Python runs with -O, silently disabling validation.
            raise ValueError("API KEY MISSING: Put API key in Environment Var!")

    def __call__(self,
                 query=None,
                 loc=None,
                 ):
        r"""Download all pages of XC metadata matching a query.

        Initially, this was intended to be used to build queries,
        so more args were planned (hence loc). In practice, it was easier
        to build queries by hand ¯\_(ツ)_/¯

        You can pull the query you want from the url on the website if you
        are manually searching for things there. It's the same syntax.

        Also is useful for debugging issues there.

        Args:
            query (str/None): Search query string, see XC Search Tags.
                If None, one is built from `loc` via build_query().
            loc (str/None): Location string for the generated query.

        Returns:
            list: One decoded response dict per result page.
        """
        if query is None:
            query = self.build_query(loc=loc)

        # The first page reports the total page count.
        page_datas = [self.get_page(query, page=1)]

        # Get rest of data! Default numPages to 1 so a failed first request
        # (get_page returns {}) doesn't raise a KeyError here.
        for page in range(2, page_datas[0].get("numPages", 1) + 1):
            page_datas.append(self.get_page(query, page=page))

        return page_datas

    def concat_recording_data(self, page_datas):
        """Concatenate the "recordings" lists from multiple pages.

        Args:
            page_datas (list): list of per-page response dicts.

        Returns:
            list: All recording dicts, in page order.
        """
        recordings = []
        for page_data in page_datas:
            recordings.extend(page_data["recordings"])
        return recordings

    def build_query(
        self,
        loc="San Diego, California, United States of America",
    ):
        """Builds a query string for Xeno-Canto API.

        See https://xeno-canto.org/help/search

        Args:
            loc (str/None): Location string for search query; None skips
                the loc tag entirely.

        Returns:
            str: "+"-joined search tags (empty string if no tags).
        """
        tags = []
        if loc is not None:
            tags.append(f"loc:\"{loc}\"")
        return "+".join(tags)

    def get_page(self, query, page=1):
        """Get a page of results from Xeno-Canto API.

        Args:
            query (str): Search query string see XC Search Tags
            page (int): 1-based page number to retrieve.

        Returns:
            dict: Decoded JSON response, or {} on a non-200 status
                (callers treat an empty dict as "no data").
        """
        res = requests.get(
            self.endpoint_url + "?" + urllib.parse.urlencode({
                "query": query,
                "key": self.api_key,
                "page": page,
            }),
            timeout=100,
        )
        if res.status_code == 200:
            # requests decodes JSON for us; no need for json.loads(res.text).
            return res.json()

        return {}


if __name__ == "__main__":
    # Smoke test: build the default query, fetch every page of metadata,
    # and dump the raw per-page dicts to stdout.
    downloader = XenoCantoDownloader()
    print(downloader())
Loading