Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ to your desired specification.
[cucumber_disease_classification](https://github.com/Project-AgML/AgML/blob/main/docs/datasets/cucumber_disease_classification.md) | Image Classification | 7689 |
[iNatAg](https://github.com/Project-AgML/AgML/blob/main/docs/datasets/iNatAg.md) | Image Classification | 4720903 |
[iNatAg-mini](https://github.com/Project-AgML/AgML/blob/main/docs/datasets/iNatAg-mini.md) | Image Classification | 560844 |

[soybean_insect_classification](https://github.com/Project-AgML/AgML/blob/main/docs/datasets/soybean_insect_classification.md) | Image Classification | 6410 |

## iNatAg and iNatAg-mini

Expand All @@ -218,6 +218,7 @@ loader = agml.data.AgMLDataLoader.from_parent("iNatAg", filters={"family_name":
loader = agml.data.AgMLDataLoader.from_parent("iNatAg", filters={"common_name": "..."})
```


## Usage Information

### Using Public Agricultural Data
Expand Down Expand Up @@ -248,5 +249,4 @@ a bug or feature that you would like to see implemented, please don't hesitate t
See the [contributing guidelines](/CONTRIBUTING.md) for more information.

## Funding
This project is partly funded by the [National AI Institute for Food Systems](https://aifs.ucdavis.edu).

This project is partly funded by the [National AI Institute for Food Systems](https://aifs.ucdavis.edu).
40 changes: 40 additions & 0 deletions agml/_assets/public_datasources.json
Original file line number Diff line number Diff line change
Expand Up @@ -1057,6 +1057,11 @@
"continent": "worldwide",
"country": "worldwide"
},
"sensor_modality": "rgb",
"real_synthetic": "real",
"platform": "handheld/ground",
"input_data_format": "jpg",
"annotation_format": "image",
"n_images": "42",
"docs_url": "https://data.nal.usda.gov/dataset/data-multi-species-fruit-flower-detection-using-refined-semantic-segmentation-network",
"external_image_sources": [],
Expand Down Expand Up @@ -2462,5 +2467,40 @@
},
"parent_dataset": "",
"extra_metadata": {}
},
"soybean_insect_classification": {
"ml_task": "image_classification",
"ag_task": "pest_classification",
"location": {
"continent": "south_america",
"country": "brazil"
},
"sensor_modality": "rgb",
"real_synthetic": "real",
"platform": "uav",
"input_data_format": "jpg",
"annotation_format": "directory_names",
"n_images": "6410",
"docs_url": "https://data.mendeley.com/datasets/bycbh73438/1",
"classes": {
"0": "Caterpillar",
"1": "Diabrotica_speciosa",
"2": "Healthy"
},
"external_image_sources": [],
"parent_dataset": "",
"extra_metadata": {},
"stats": {
"mean": [
0.4129256308078766,
0.515846848487854,
0.3347107470035553
],
"std": [
0.22007715702056885,
0.22993944585323334,
0.20302338898181915
]
}
}
}
Binary file modified agml/_assets/shape_info.pickle
Binary file not shown.
4 changes: 4 additions & 0 deletions agml/_assets/source_citations.json
Original file line number Diff line number Diff line change
Expand Up @@ -262,5 +262,9 @@
"cucumber_disease_classification": {
"license": "CC BY 4.0",
"citation": "Sultana, Nusrat; Shorif, Sumaita Binte ; Akter, Morium ; Uddin, Mohammad Shorif (2022), “Cucumber Disease Recognition Dataset”, Mendeley Data, V1, doi: 10.17632/y6d3z6f8z9.1"
},
"soybean_insect_classification": {
"license": "CC BY 4.0",
"citation": "Mignoni, Maria Eloisa (2021), “Images of Soybean Leaves”, Mendeley Data, V1, doi: 10.17632/bycbh73438.1"
}
}
3 changes: 3 additions & 0 deletions agml/_internal/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -1585,6 +1585,9 @@ def sunflower_disease_classification(self, dataset_name):
def cucumber_disease_classification(self, dataset_name):
    """No-op preprocessing hook for `cucumber_disease_classification`.

    The body performs no work and returns `None`; `dataset_name` is unused.
    """
    pass

def soybean_insect_classification(self, dataset_name):
    """No-op preprocessing hook for `soybean_insect_classification`.

    The body performs no work and returns `None`; `dataset_name` is unused.
    """
    pass

if __name__ == '__main__':
# Initialize program arguments.
ap = argparse.ArgumentParser()
Expand Down
28 changes: 11 additions & 17 deletions agml/data/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
from typing import Iterable

import yaml
from rich.console import Console
from rich.table import Table


import agml.utils.logging as logging
from agml.framework import AgMLSerializable
Expand Down Expand Up @@ -280,12 +283,12 @@ def summary(self):
as `print(loader.info.summary())`, just `loader.info.summary()`.
"""

def _bold(msg):
return "\033[1m" + msg + "\033[0m"
console = Console()

def _bold_yaml(msg): # noqa
return "<|>" + msg + "<|>"
table = Table(title="Dataset Summary")

table.add_column("Attribute", justify="right", style="bold cyan")
table.add_column("Value", justify="left", style="bold white")
_SWITCH_NAMES = {
"ml_task": "Machine Learning Task",
"ag_task": "Agricultural Task",
Expand All @@ -294,7 +297,6 @@ def _bold_yaml(msg): # noqa
"docs_url": "Documentation",
}

formatted_metadata = {}
for key, value in self._metadata.items():
name = key.replace("_", " ").title()
if key in _SWITCH_NAMES.keys():
Expand All @@ -303,18 +305,10 @@ def _bold_yaml(msg): # noqa
value = {int(k): v for k, v in value.items()}
if name == "Number of Images":
value = int(value)
formatted_metadata[_bold_yaml(name)] = value

stream = io.StringIO()
yaml.dump(formatted_metadata, stream, sort_keys=False)
content = stream.getvalue()
content = re.sub("<\\|>(.*?)<\\|>", _bold(r"\1"), content)
header = "=" * 20 + " DATASET SUMMARY " + "=" * 20
print(header)
print(_bold("Name") + f": {self._name}")
print(content, end="")
print("=" * 57)
sys.stdout.flush()

table.add_row(name, f"{value}")

console.print(table)

def citation_summary(self):
"""Prints out a summary of the citation information of the dataset.
Expand Down
27 changes: 25 additions & 2 deletions agml/data/public.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import functools

from rich.console import Console
import numpy as np

from agml.backend.config import data_save_path
Expand Down Expand Up @@ -122,8 +123,30 @@ def _matches(self, meta, filters):
return False
return True

def print_result(self):
    """Return the filtered dataset names as a bracketed, comma-separated string.

    Example: names ``["a", "b"]`` produce ``"[a, b]"``; no names produce ``"[]"``.
    """
    return f"[{', '.join(self._current_filtered_source)}]"
def __repr__(self):
    """Return the filtered datasets rendered as a `rich` table.

    `__repr__` is required to return a string, so the output is rendered
    into a capture buffer rather than printed directly to the terminal.

    Returns
    -------
    A string holding the rendered table (dataset name, ML task, location and
    number of images for every filtered source), or a short notice when no
    dataset matches the current filters.
    """
    # Local import: `Table` is not among the module-level imports visible
    # for this file (only `Console` is), so bring it into scope here.
    from rich.table import Table

    if not self._current_filtered_source:
        renderable = "[bold yellow]No datasets found matching the criteria.[/]"
    else:
        renderable = Table(title="Filtered Datasets")
        renderable.add_column("Dataset Name", style="cyan")
        renderable.add_column("ML Task")
        renderable.add_column("Location")
        renderable.add_column("# Images")

        for source_name in self._current_filtered_source:
            meta = self._sources[source_name]
            if "location" in meta:
                location = f"{meta['location']['continent']}, {meta['location']['country']}"
            else:
                location = "N/A"
            renderable.add_row(
                source_name,
                meta.get("ml_task", "N/A"),  # Handle missing 'ml_task'
                location,
                str(meta.get("n_images", "N/A")),  # "N/A" when the image count is missing
            )

    # Render into a string instead of printing: returning `None` from
    # `__repr__` makes `repr(obj)` raise a TypeError.
    console = Console()
    with console.capture() as capture:
        console.print(renderable)
    return capture.get().rstrip("\n")

def result(self):
"""Returns the filtered datasets as DatasetMetadata objects.
Expand Down
135 changes: 89 additions & 46 deletions agml/utils/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@
import shutil
import sys

from rich.console import Console
from rich.panel import Panel
from rich.text import Text


@functools.lru_cache(maxsize=None)
def load_public_sources() -> dict:
Expand Down Expand Up @@ -98,67 +102,106 @@ def maybe_you_meant(name, msg, source=None) -> str:
def copyright_print(name, location=None):
    """Prints out license/copyright info after a dataset download.

    Looks up the license and citation of `name` in the citation sources and
    its documentation URL in the public sources, renders them in a `rich`
    panel, and follows it with a note on how to show the information again.

    Parameters
    ----------
    name : str
        Dataset name; used as the key into both `load_citation_sources()`
        and `load_public_sources()`.
    location : optional
        Where the dataset was saved. When given, the message is phrased as a
        download notice and the location is printed at the end; when `None`,
        it is phrased as plain citation information.

    Raises
    ------
    KeyError
        If `name` is missing from the citation sources or public sources.
    """
    content = load_citation_sources()[name]
    license_info = content["license"]
    citation = content["citation"]

    # Title, reused as the panel subtitle.
    title_text = Text(f"Dataset: {name}", style="bold cyan")

    if location is not None:
        location_text = Text.assemble(("You have just downloaded ", "bold cyan"), (name, "green"))
    else:
        # Trailing space keeps the phrase from running into the dataset name.
        location_text = Text.assemble(("Citation information for ", "bold cyan"), (name, "green"))

    _LICENSE_TO_URL = {
        "CC BY-SA 4.0": "https://creativecommons.org/licenses/by-sa/4.0/",
        "CC BY-SA 3.0": "https://creativecommons.org/licenses/by-sa/3.0/",
        "CC BY-NC 3.0": "https://creativecommons.org/licenses/by-nc/3.0/",
        "CC BY-NC SA 3.0": "https://creativecommons.org/licenses/by-nc-sa/3.0/",
        "CC BY-NC 4.0": "https://creativecommons.org/licenses/by-nc/4.0/",
        "CC BY-NC-SA 4.0": "https://creativecommons.org/licenses/by-nc-sa/4.0/",
        "CC BY 4.0": "https://creativecommons.org/licenses/by/4.0/deed.en",
        "MIT": "https://opensource.org/licenses/MIT",
        "GPL-3.0": "https://opensource.org/licenses/GPL-3.0",
        "US Public Domain": "https://www.usa.gov/government-works",
        "CC0: Public Domain": "https://creativecommons.org/publicdomain/zero/1.0/",
        "Apache 2.0": "https://www.apache.org/licenses/LICENSE-2.0",
    }

    # License message. The "learn more" parts default to empty so they are
    # always bound, including when the dataset has no license or the license
    # has no known URL.
    license_more_info = ""
    license_url = ""
    if not license_info:
        license_msg = Text("License: None specified", style="yellow")
    else:
        license_msg = Text.assemble(("This dataset is licensed under ", "bold cyan"), (f"{license_info}", "green"))
        if license_info in _LICENSE_TO_URL:
            license_more_info = Text(" To learn more about this license, visit ", style="bold cyan")
            license_url = Text(_LICENSE_TO_URL[license_info], justify="center", style="white")

    # Citation message.
    if citation == "":
        citation_msg = Text("This dataset has no associated citation.", style="yellow")
        citation_url = ""
    else:
        citation_msg = Text("When using this dataset, please cite the following: \n", style="bold cyan")
        citation_url = Text(citation, justify="center", style="white")

    # Dataset documentation message.
    docs = load_public_sources()[name]["docs_url"]
    docs_msg = Text("You can find additional information about this dataset at: ", style="bold cyan")
    docs_url = Text(docs, justify="center", style="white")

    combined_message = Text.assemble(
        title_text,
        "\n\n",
        location_text,
        "\n\n",
        license_msg,
        " ",
        license_more_info,
        license_url,
        "\n\n",
        citation_msg,
        citation_url,
        "\n\n",
        docs_msg,
        docs_url,
    )
    console = Console()

    # Main panel with license, citation and documentation details.
    panel = Panel(
        combined_message,
        title="Copyright, Citation, and Documentation Information",
        subtitle=title_text,
        border_style="bright_yellow",
        highlight=True,
        expand=False,  # Prevent unnecessary whitespace
    )
    console.print(panel)

    # Instructions on how to show this message again.
    instructions = Text.assemble(
        "\nThis message will ",
        ("not ", "bold"),
        "be automatically shown again. To view this message again, in an AgMLDataLoader run ",
        "`loader.info.citation_summary()`. ",
        "Otherwise, you can use `agml.data.source(<dataset_name>).citation_summary()`.",
    )
    warning_panel = Panel(
        instructions,
        title="Note",
        border_style="yellow",
        expand=False,
        highlight=True,
    )
    console.print(warning_panel)

    if location is not None:
        # Built as a `Text` rather than a markup string so that square
        # brackets in the path are not interpreted as rich markup.
        console.print(Text(f"\n  You can find your dataset at {location}.", style="bold"))
Loading
Loading