Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -229,12 +229,13 @@ settings.json
# Block all configs besides the example config
whoot_model_training/configs
!whoot_model_training/configs/config.yml

# Block demos
*.csv
*.ipynb
*.json
demos/
*.ipynb

# Block predictions
predictions/*
*.pkl
*.arrow
*.arrow
224 changes: 224 additions & 0 deletions data_downloader/downloader_demo.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "3842a3a9",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"import requests\n",
"from xc import XenoCantoDownloader\n",
"from dotenv import load_dotenv\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"# Load environment variables from the .env file\n",
"load_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa77bd28",
"metadata": {},
"outputs": [],
"source": [
"xcd = XenoCantoDownloader(api_key=os.environ[\"XC_API_KEY\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3ee1304b",
"metadata": {},
"outputs": [],
"source": [
"query = xcd.build_query()\n",
"res = xcd.get_page(query)\n",
"res"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9fed0106",
"metadata": {},
"outputs": [],
"source": [
"data = xcd(query=\"box:32.485,-117.582,33.482,-115.228\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a43fcef6",
"metadata": {},
"outputs": [],
"source": [
"\n",
"with open(\"xc_meta.json\", mode=\"w\") as f:\n",
" json.dump(data, f, indent=4)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0942a027",
"metadata": {},
"outputs": [],
"source": [
"req = requests.get(data[0][\"recordings\"][0][\"file\"])\n",
"req"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d62a7a6",
"metadata": {},
"outputs": [],
"source": [
"data[0][\"recordings\"][0][\"file\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dcc63419",
"metadata": {},
"outputs": [],
"source": [
"req.content"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "30af4509",
"metadata": {},
"outputs": [],
"source": [
"import shutil\n",
"import os\n",
"from pathlib import Path\n",
"from multiprocessing.pool import ThreadPool\n",
"\n",
"# https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests\n",
"def download_file(url, local_filename, dry_run=False):\n",
" if os.path.exists(local_filename):\n",
" return local_filename\n",
"\n",
" try:\n",
" with requests.get(url, stream=True) as r:\n",
" with open(local_filename, 'wb') as f:\n",
" if not dry_run:\n",
" shutil.copyfileobj(r.raw, f)\n",
" else:\n",
" print(local_filename)\n",
"\n",
" return local_filename\n",
" except IOError as e:\n",
" print(e, flush=True)\n",
" return None\n",
"\n",
"def download_files(xcd, data, parent_folder=\"data/xeno-canto\", workers = 4):\n",
" def prep_download(args):\n",
" url = args[0]\n",
" file_path = args[1]\n",
" return download_file(url, file_path)\n",
"\n",
" os.makedirs(parent_folder, exist_ok=True)\n",
"\n",
" if \"recordings\" in data[0]:\n",
" data = xcd.concat_recording_data(data) \n",
" download_data = [\n",
" (recording[\"file\"], Path(parent_folder) / Path(recording[\"file-name\"]))\n",
" for recording in data\n",
" ]\n",
" pool = ThreadPool(workers)\n",
" results = pool.imap_unordered(prep_download, download_data) \n",
" pool.close()\n",
" return results\n",
"\n",
"download_files(xcd, data)"
]
},
{
"cell_type": "markdown",
"id": "ea02004c",
"metadata": {},
"source": [
"# Study"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5bf99a36",
"metadata": {},
"outputs": [],
"source": [
"recordings = xcd.concat_recording_data(data)\n",
"df = pd.DataFrame(recordings)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4e26ec8",
"metadata": {},
"outputs": [],
"source": [
"sns.histplot(df[\"en\"].value_counts())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9d7dc37",
"metadata": {},
"outputs": [],
"source": [
"plt.ylabel(\"Number of Species\")\n",
"plt.xlabel(\"Number of Indivuals Per Species\")\n",
"plt.title(\"Do We Have a Few-shot Learning Problem for XC in Southern California?\")\n",
"df[\"en\"].value_counts().hist()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c04ef6f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "whoot",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
123 changes: 123 additions & 0 deletions data_downloader/xc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
"""Xeno-Canto Data Metadata Downloader and Search Module."""
import os
import urllib.parse
import json
import requests


class XenoCantoDownloader():
    """Handler for the Xeno-Canto API (v3 recordings endpoint).

    Note: Requires an API key, either passed explicitly or taken from the
    "XC_API_KEY" environment variable.
    """

    def __init__(self, api_key=None):
        """Creates the Xeno-Canto Downloader.

        Args:
            api_key (str): API key for Xeno-Canto API.
                If None, looks for env var "XC_API_KEY".

        Raises:
            ValueError: If no key is given and "XC_API_KEY" is unset.
        """
        self.endpoint_url = "https://xeno-canto.org/api/3/recordings"
        # Use .get() so a missing env var falls through to the explicit
        # error below instead of raising a bare KeyError (the original
        # os.environ[...] lookup made the follow-up check unreachable).
        self.api_key = os.environ.get("XC_API_KEY") if api_key is None else api_key
        if self.api_key is None:
            # An explicit raise instead of `assert`: asserts are stripped
            # when Python runs with -O, silently disabling validation.
            raise ValueError("API KEY MISSING: Put API key in Environment Var!")

    def __call__(self,
                 query=None,
                 loc=None,
                 ):
        r"""Download all pages of XC metadata matching a query.

        Initially, this was intended to be used to build queries,
        so more args were planned (hence loc). In practice, it was easier
        to build queries by hand ¯\_(ツ)_/¯

        You can pull the query you want from the url on the website if you
        are manually searching for things there. It's the same syntax.

        Also is useful for debugging issues there.

        Args:
            query (str/None): Search query string, see XC Search Tags.
                If None, one is built from `loc` via build_query().
            loc (str/None): Location string for the generated query.

        Returns:
            list: One decoded response dict per result page.
        """
        if query is None:
            query = self.build_query(loc=loc)

        # The first page reports the total page count.
        page_datas = [self.get_page(query, page=1)]

        # Get rest of data! Default numPages to 1 so a failed first request
        # (get_page returns {}) doesn't raise a KeyError here.
        for page in range(2, page_datas[0].get("numPages", 1) + 1):
            page_datas.append(self.get_page(query, page=page))

        return page_datas

    def concat_recording_data(self, page_datas):
        """Concatenate the "recordings" lists from multiple pages.

        Args:
            page_datas (list): list of per-page response dicts.

        Returns:
            list: All recording dicts, in page order.
        """
        recordings = []
        for page_data in page_datas:
            recordings.extend(page_data["recordings"])
        return recordings

    def build_query(
        self,
        loc="San Diego, California, United States of America",
    ):
        """Builds a query string for Xeno-Canto API.

        See https://xeno-canto.org/help/search

        Args:
            loc (str/None): Location string for search query; None skips
                the loc tag entirely.

        Returns:
            str: "+"-joined search tags (empty string if no tags).
        """
        tags = []
        if loc is not None:
            tags.append(f"loc:\"{loc}\"")
        return "+".join(tags)

    def get_page(self, query, page=1):
        """Get a page of results from Xeno-Canto API.

        Args:
            query (str): Search query string see XC Search Tags
            page (int): 1-based page number to retrieve.

        Returns:
            dict: Decoded JSON response, or {} on a non-200 status
                (callers treat an empty dict as "no data").
        """
        res = requests.get(
            self.endpoint_url + "?" + urllib.parse.urlencode({
                "query": query,
                "key": self.api_key,
                "page": page,
            }),
            timeout=100,
        )
        if res.status_code == 200:
            # requests decodes JSON for us; no need for json.loads(res.text).
            return res.json()

        return {}


if __name__ == "__main__":
    # Smoke test: build the default query, fetch every page of metadata,
    # and dump the raw per-page dicts to stdout.
    downloader = XenoCantoDownloader()
    print(downloader())
Loading