2 changes: 0 additions & 2 deletions brainio-test/MANIFEST.in

This file was deleted.

Empty file.
24 changes: 0 additions & 24 deletions brainio-test/brainio_test/entrypoint.py

This file was deleted.

13 changes: 0 additions & 13 deletions brainio-test/brainio_test/lookup.csv

This file was deleted.

10 changes: 0 additions & 10 deletions brainio-test/brainio_test/lookup2.csv

This file was deleted.

18 changes: 0 additions & 18 deletions brainio-test/setup.py

This file was deleted.

8 changes: 6 additions & 2 deletions brainio/__init__.py
@@ -1,3 +1,7 @@
from .fetch import get_assembly, get_stimulus_set
from .lookup import get_catalog, list_stimulus_sets, list_assemblies, list_catalogs
# Import functions that depend on external dependencies (boto3) only when available
try:
    from .fetch import get_assembly, get_stimulus_set
except ImportError:
    # External dependencies not available - functions will raise NotImplementedError anyway
    pass
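In effect, importing brainio no longer fails when boto3 and the rest of the optional S3 stack are missing; the guarded helpers are simply not exported. A minimal sketch of that behavior, assuming an environment without the optional dependencies:

```python
# Sketch: brainio installed without its optional S3 dependencies (assumption for illustration).
import brainio

# The package imports cleanly; the guarded names are absent rather than the import failing.
print(hasattr(brainio, "get_assembly"))  # expected: False when the optional import was skipped
```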

Binary file added brainio/__pycache__/__init__.cpython-311.pyc
Binary file not shown.
Binary file added brainio/__pycache__/assemblies.cpython-311.pyc
Binary file not shown.
Binary file added brainio/__pycache__/fetch.cpython-311.pyc
Binary file not shown.
49 changes: 0 additions & 49 deletions brainio/catalogs.py

This file was deleted.

53 changes: 21 additions & 32 deletions brainio/fetch.py
@@ -13,7 +13,7 @@

import brainio.assemblies as assemblies
import brainio.stimuli as stimuli
from brainio.lookup import lookup_assembly, lookup_stimulus_set, sha1_hash
from brainio.lookup import sha1_hash
from brainio.stimuli import StimulusSetLoader

BRAINIO_HOME = 'BRAINIO_HOME'
Expand Down Expand Up @@ -157,42 +157,31 @@ def resolve_stimulus_set_class(class_name):


def get_assembly(identifier):
    assembly_lookup = lookup_assembly(identifier)
    file_path = fetch_file(location_type=assembly_lookup['location_type'],
                           location=assembly_lookup['location'], sha1=assembly_lookup['sha1'])
    stimulus_set = get_stimulus_set(assembly_lookup['stimulus_set_identifier'])
    cls = resolve_assembly_class(assembly_lookup['class'])
    loader = cls.get_loader_class()(
        cls=cls,
        file_path=file_path,
        stimulus_set_identifier=assembly_lookup['stimulus_set_identifier'],
        stimulus_set=stimulus_set,
    """
    DEPRECATED: The catalog-based assembly loading has been removed.

    Use direct loading methods instead:
    - For S3: Use brainscore_vision.data_helpers.s3.load_assembly_from_s3()
    - For files: Use DataAssembly.from_files()
    """
    raise NotImplementedError(
        "get_assembly() has been deprecated. The catalog system has been removed. "
        "Use direct loading methods like load_assembly_from_s3() or DataAssembly.from_files() instead."
    )
    assembly = loader.load()
    assembly.attrs['identifier'] = identifier
    return assembly


def get_stimulus_set(identifier):
    csv_lookup, zip_lookup = lookup_stimulus_set(identifier)
    csv_path = fetch_file(location_type=csv_lookup['location_type'], location=csv_lookup['location'],
                          sha1=csv_lookup['sha1'])
    zip_path = fetch_file(location_type=zip_lookup['location_type'], location=zip_lookup['location'],
                          sha1=zip_lookup['sha1'])
    stimuli_directory = unzip(zip_path)
    loader = StimulusSetLoader(
        csv_path=csv_path,
        stimuli_directory=stimuli_directory,
        cls=resolve_stimulus_set_class(csv_lookup['class'])
    """
    DEPRECATED: The catalog-based stimulus set loading has been removed.

    Use direct loading methods instead:
    - For S3: Use brainscore_vision.data_helpers.s3.load_stimulus_set_from_s3()
    - For files: Use StimulusSet.from_files()
    """
    raise NotImplementedError(
        "get_stimulus_set() has been deprecated. The catalog system has been removed. "
        "Use direct loading methods like load_stimulus_set_from_s3() or StimulusSet.from_files() instead."
    )
    stimulus_set = loader.load()
    stimulus_set.identifier = identifier
    # ensure perfect overlap
    stimuli_paths = [Path(stimuli_directory) / local_path for local_path in os.listdir(stimuli_directory)
                     if not local_path.endswith('.zip') and not local_path.endswith('.csv')]
    assert set(stimulus_set.stimulus_paths.values()) == set(stimuli_paths), \
        "Inconsistency: unzipped stimuli paths do not match csv paths"
    return stimulus_set
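Both entry points keep their names so existing imports still resolve, but any call now fails fast with a pointer to the replacements. A minimal sketch of what a caller sees after this change (the identifier below is hypothetical, and the replacement loaders' signatures live in brainscore_vision rather than in this diff):

```python
# Sketch of the post-change behavior; "example.Assembly2024" is a made-up identifier.
from brainio.fetch import get_assembly

try:
    get_assembly("example.Assembly2024")
except NotImplementedError as error:
    # The message directs callers to load_assembly_from_s3() / DataAssembly.from_files();
    # those helpers are not defined in this diff.
    print(error)
```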


def fullname(obj):
170 changes: 2 additions & 168 deletions brainio/lookup.py
@@ -1,177 +1,11 @@
import hashlib
import logging

import entrypoints
import numpy as np
import pandas as pd

from brainio.catalogs import Catalog, SOURCE_CATALOG

ENTRYPOINT = "brainio_lookups"
TYPE_ASSEMBLY = 'assembly'
TYPE_STIMULUS_SET = 'stimulus_set'
_catalogs = {}

_logger = logging.getLogger(__name__)


def list_catalogs():
    return sorted(list(entrypoints.get_group_named(ENTRYPOINT).keys()))


def _load_catalog(identifier, entry_point):
    catalog = entry_point.load()()
    assert isinstance(catalog, Catalog)
    assert catalog.identifier == identifier
    return catalog


def _load_installed_catalogs():
    installed_catalogs = entrypoints.get_group_named(ENTRYPOINT)
    _logger.debug(f"Loading catalog from entrypoints")
    print(f"Loading catalog from entrypoints")
    for k, v in installed_catalogs.items():
        catalog = _load_catalog(k, v)
        _catalogs[k] = catalog
    return _catalogs


def get_catalog(identifier):
    catalogs = get_catalogs()
    return catalogs[identifier]


def get_catalogs():
    if not _catalogs:
        _load_installed_catalogs()
    return _catalogs


def combined_catalog():
    source_catalogs = get_catalogs()
    target_catalogs = {}
    for identifier, source_catalog in source_catalogs.items():
        target_catalog = source_catalog.copy()
        target_catalog[SOURCE_CATALOG] = identifier
        target_catalogs[identifier] = target_catalog
    concat_catalogs = pd.concat(target_catalogs.values(), ignore_index=True)
    return concat_catalogs


def list_stimulus_sets():
    combined = combined_catalog()
    stimuli_rows = combined[combined['lookup_type'] == TYPE_STIMULUS_SET]
    return sorted(list(set(stimuli_rows['identifier'])))


def list_assemblies():
    combined = combined_catalog()
    assembly_rows = combined[combined['lookup_type'] == TYPE_ASSEMBLY]
    return sorted(list(set(assembly_rows['identifier'])))


def lookup_stimulus_set(identifier):
    combined = combined_catalog()
    lookup = combined[(combined['identifier'] == identifier) & (combined['lookup_type'] == TYPE_STIMULUS_SET)]
    if len(lookup) == 0:
        raise StimulusSetLookupError(f"Stimulus set {identifier} not found")
    csv_lookup = _lookup_stimulus_set_filtered(lookup, filter_func=_is_csv_lookup, label="CSV")
    zip_lookup = _lookup_stimulus_set_filtered(lookup, filter_func=_is_zip_lookup, label="ZIP")
    return csv_lookup, zip_lookup


def _lookup_stimulus_set_filtered(lookup, filter_func, label):
    cols = [n for n in lookup.columns if n != SOURCE_CATALOG]
    # filter for csv vs. zip
    # if there are any groups of rows where every field except source is the same,
    # we only want one from each group
    filtered_rows = lookup[lookup.apply(filter_func, axis=1)].drop_duplicates(subset=cols)
    identifier = lookup.iloc[0]['identifier']
    if len(filtered_rows) == 0:
        raise StimulusSetLookupError(f"{label} for stimulus set {identifier} not found")
    if len(filtered_rows) > 1:  # there were multiple rows but not all identical
        raise RuntimeError(
            f"Internal data inconsistency: Found more than 2 lookup rows for stimulus_set {label} for identifier {identifier}")
    assert len(filtered_rows) == 1
    return filtered_rows.squeeze()


def lookup_assembly(identifier):
    combined = combined_catalog()
    lookup = combined[(combined['identifier'] == identifier) & (combined['lookup_type'] == TYPE_ASSEMBLY)]
    if len(lookup) == 0:
        raise AssemblyLookupError(f"Data assembly {identifier} not found")
    cols = [n for n in lookup.columns if n != SOURCE_CATALOG]
    # if there are any groups of rows where every field except source is the same,
    # we only want one from each group
    de_dupe = lookup.drop_duplicates(subset=cols)
    if len(de_dupe) > 1:  # there were multiple rows but not all identical
        raise RuntimeError(f"Internal data inconsistency: Found multiple lookup rows for identifier {identifier}")
    assert len(de_dupe) == 1
    return de_dupe.squeeze()


class StimulusSetLookupError(KeyError):
    pass


class AssemblyLookupError(KeyError):
    pass


def append(catalog_identifier, object_identifier, cls, lookup_type,
           bucket_name, sha1, s3_key, stimulus_set_identifier=None):
    catalogs = get_catalogs()
    catalog = catalogs[catalog_identifier]
    catalog_path = catalog.attrs['source_path']
    _logger.debug(f"Adding {lookup_type} {object_identifier} to catalog {catalog_identifier}")
    object_lookup = {
        'identifier': object_identifier,
        'lookup_type': lookup_type,
        'class': cls,
        'location_type': "S3",
        'location': f"https://{bucket_name}.s3.amazonaws.com/{s3_key}",
        'sha1': sha1,
        'stimulus_set_identifier': stimulus_set_identifier,
    }
    # check duplicates
    assert object_lookup['lookup_type'] in [TYPE_ASSEMBLY, TYPE_STIMULUS_SET]
    duplicates = catalog[(catalog['identifier'] == object_lookup['identifier']) &
                         (catalog['lookup_type'] == object_lookup['lookup_type'])]
    if len(duplicates) > 0:
        if object_lookup['lookup_type'] == TYPE_ASSEMBLY:
            raise ValueError(f"Trying to add duplicate identifier {object_lookup['identifier']}, "
                             f"existing \n{duplicates.to_string()}")
        elif object_lookup['lookup_type'] == TYPE_STIMULUS_SET:
            if len(duplicates) == 1 and duplicates.squeeze()['identifier'] == object_lookup['identifier'] and (
                    (_is_csv_lookup(duplicates.squeeze()) and _is_zip_lookup(object_lookup)) or
                    (_is_zip_lookup(duplicates.squeeze()) and _is_csv_lookup(object_lookup))):
                pass  # all good, we're just adding the second part of a stimulus set
            else:
                raise ValueError(
                    f"Trying to add duplicate identifier {object_lookup['identifier']}, existing {duplicates}")
    # append and save
    add_lookup = pd.DataFrame({key: [value] for key, value in object_lookup.items()})
    catalog = pd.concat((catalog, add_lookup))
    catalog.attrs['source_path'] = catalog_path  # explicitly set since concat does not always preserve
    catalog.to_csv(catalog_path, index=False)
    _catalogs[catalog_identifier] = catalog
    return catalog


def _is_csv_lookup(data_row):
    return data_row['lookup_type'] == TYPE_STIMULUS_SET \
        and data_row['location'].endswith('.csv') \
        and data_row['class'] not in [None, np.nan]


def _is_zip_lookup(data_row):
    return data_row['lookup_type'] == TYPE_STIMULUS_SET \
        and data_row['location'].endswith('.zip') \
        and data_row['class'] in [None, np.nan]


def sha1_hash(path, buffer_size=64 * 2 ** 10):
    """Calculate SHA1 hash of a file."""
    _logger.debug(f'BEGIN sha1_hash on {path}')
    sha1 = hashlib.sha1()
    with open(path, "rb") as f:
@@ -180,4 +14,4 @@ def sha1_hash(path, buffer_size=64 * 2 ** 10):
            sha1.update(buffer)
            buffer = f.read(buffer_size)
    _logger.debug(f'END sha1_hash on {path}')
    return sha1.hexdigest()
    return sha1.hexdigest()
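Part of sha1_hash is collapsed behind the fold above. For reference, a self-contained sketch of how the retained helper plausibly reads in full after this PR; the initial read and the `while` loop header are assumptions inferred from the visible chunked-update lines.

```python
import hashlib
import logging

_logger = logging.getLogger(__name__)


def sha1_hash(path, buffer_size=64 * 2 ** 10):
    """Calculate SHA1 hash of a file by reading it in buffer_size chunks."""
    _logger.debug(f'BEGIN sha1_hash on {path}')
    sha1 = hashlib.sha1()
    with open(path, "rb") as f:
        buffer = f.read(buffer_size)   # assumed: initial read hidden behind the fold
        while len(buffer) > 0:         # assumed loop header; only its body is visible in the diff
            sha1.update(buffer)
            buffer = f.read(buffer_size)
    _logger.debug(f'END sha1_hash on {path}')
    return sha1.hexdigest()
```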