diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c340f25..8007bd3 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -38,7 +38,6 @@ jobs:
           ubuntu-24.04,
           macos-13,
           windows-2022,
-          windows-2019,
         ]
       python-version: ["3.11"]
     runs-on: ${{ matrix.os }}
diff --git a/.gitignore b/.gitignore
index 3eeb70f..cf9aeb9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,14 @@
-#mac files
+# claude
+CLAUDE.md
+.claude/
+claude_output/
+claude_logs/
+
+# mac files
 **/.DS_Store
 
 # Dataset directory
-data/
+./data/
 
 # logs
 **/logs/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0961d95..e45b42f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,4 +1,4 @@
-exclude: ^docs/|devcontainer.json|.*/snapshots/
+exclude: ^docs/|devcontainer.json|.*/snapshots/|mkdocs.yml
 default_stages: [commit]
 
 default_language_version:
@@ -62,6 +62,7 @@ repos:
   rev: v1.15.0
   hooks:
   - id: mypy
+    additional_dependencies: [types-requests]
 
 - repo: https://github.com/markdownlint/markdownlint
   rev: v0.12.0
diff --git a/docs/brentlab_yeastresources_collection.md b/docs/brentlab_yeastresources_collection.md
new file mode 100644
index 0000000..4f49a02
--- /dev/null
+++ b/docs/brentlab_yeastresources_collection.md
@@ -0,0 +1,378 @@
# BrentLab Yeast Resources Collection

This document describes the BrentLab yeast resources collection on HuggingFace as an example implementation of the [datacard specifications](huggingface_datacard.md). This collection demonstrates best practices for organizing transcription factor binding and perturbation datasets for *Saccharomyces cerevisiae*.

## Collection Overview

The BrentLab yeast resources collection contains 11 datasets related to yeast transcription factor binding and gene expression regulation:

1. **barkai_compendium** - ChEC-seq binding data across multiple GEO series
2. **callingcards** - Calling Cards transposon-based binding data
3. **hackett_2020** - TF overexpression with nutrient limitation
4. **harbison_2004** - ChIP-chip binding across 14 environmental conditions
5. **hu_2007_reimand_2010** - TF knockout expression data
6. **hughes_2006** - TF perturbation screen (overexpression and knockout)
7. **kemmeren_2014** - TF deletion expression profiling
8. **mahendrawada_2025** - ChEC-seq and nascent RNA-seq data
9. **rossi_2021** - ChIP-exo binding data
10. **yeast_comparative_analysis** - Cross-dataset comparative analyses
11. **yeast_genome_resources** - Reference genomic features

## Standardized Media Names

The collection uses standardized media names to facilitate cross-dataset queries.
When specifying media in datacards, use these canonical names:

### Rich Media

- **YPD** (Yeast extract Peptone Dextrose)
  - Carbon source: 2% D-glucose
  - Nitrogen sources: 1% yeast extract, 2% peptone
  - Standard rich medium for yeast growth

- **yeast_extract_peptone**
  - Base medium without specified carbon source
  - Used with galactose (YPGal) or raffinose (YPRaff)

### Minimal/Defined Media

- **minimal** or **minimal_glucose**
  - Minimal defined medium with glucose as carbon source
  - Nitrogen source varies by experiment

- **synthetic_complete** or **synthetic_complete_dextrose**
  - Defined medium with complete amino acid supplementation
  - Carbon source: typically 2% D-glucose
  - Nitrogen source: yeast nitrogen base + amino acid dropout mix

- **synthetic_complete_minus_X**
  - Synthetic complete medium lacking specific nutrient(s)
  - Examples: `synthetic_complete_minus_thiamine`, `synthetic_complete_minus_phosphate`
  - Used for nutrient deprivation experiments

- **selective_medium**
  - Defined medium for plasmid selection
  - Specific composition varies by selection markers

## Standardized Strain Backgrounds

The collection primarily uses these strain backgrounds:

- **BY4741** - MATa his3Δ1 leu2Δ0 met15Δ0 ura3Δ0
  - Used in: hu_2007_reimand_2010, kemmeren_2014

- **W303** - Common alternative strain background
  - Used in: harbison_2004 (derivative Z1256)

- **S288C** - Reference genome strain
  - Used in: various datasets

Strain background can be specified as a string or as a detailed object:

```yaml
# Simple string
experimental_conditions:
  strain_background: BY4741

# Detailed specification
experimental_conditions:
  strain_background:
    genotype: BY4741
    mating_type: MATa
    markers:
      - his3Δ1
      - leu2Δ0
      - met15Δ0
      - ura3Δ0
    source: Open_Biosystems
    description: Knockout strains for nonessential transcription factors
```

## Standard Experimental Conditions

### Growth Temperature

The standard growth temperature across the collection is **30°C** unless otherwise noted.

Exceptions:
- **rossi_2021**: 25°C baseline, with 37°C heat shock for some samples
- **hu_2007_reimand_2010**: Heat shock at 39°C for heat shock response TFs
- **callingcards**: Experiments are performed at room temperature (~22-25°C)

### Growth Phase

Common growth phase specifications are listed below. These labels are taken from the
original publications; in some cases, the OD600 at harvest is also noted.

- **early_log_phase**
- **mid_log_phase**
- **late_log_phase**
- **stationary_phase** - e.g., barkai_compendium, where cultures are grown overnight and
  harvested at very high density (OD600 4.0).

Example:
```yaml
experimental_conditions:
  growth_phase_at_harvest:
    stage: mid_log_phase
    od600: 0.6
    od600_tolerance: 0.1
```

### Cultivation Methods

Standard cultivation methods used:

- **liquid_culture** - Standard batch culture in flasks
- **batch** - Batch culture
- **plate** - Growth on agar plates
- **chemostat** - Continuous culture (hackett_2020)

## Concentration Specifications

**Always use `concentration_percent`** for all concentration specifications.
Convert other units to percentage:

- **mg/ml to percent**: divide by 10 (e.g., 5 mg/ml = 0.5%)
- **g/L to percent**: divide by 10 (e.g., 6.71 g/L = 0.671%)
- **Molar to percent**: convert using the molecular weight
  - Example: 100 nM rapamycin (MW 914.2 g/mol) = 9.142e-6%

### Examples from the Collection

```yaml
# Yeast nitrogen base: 6.71 g/L = 0.671%
- compound: yeast_nitrogen_base
  concentration_percent: 0.671

# Alpha factor: 5 mg/ml = 0.5%
- compound: alpha_factor_pheromone
  concentration_percent: 0.5

# Rapamycin: 100 nM = 9.142e-6%
chemical_treatment:
  compound: rapamycin
  concentration_percent: 9.142e-6
```

## Field Naming Conventions

The collection follows these field naming conventions:

### Gene/Feature Identifiers

- **regulator_locus_tag**: Systematic ID of the regulatory factor (e.g., "YJR060W")
- **regulator_symbol**: Common name of the regulatory factor (e.g., "CBF1")
- **target_locus_tag**: Systematic ID of the target gene
- **target_symbol**: Common name of the target gene

All locus tags and symbols join to the **yeast_genome_resources** dataset.

### Quantitative Measurement Examples

Common measurement field names:

- **effect**, **log2fc**, **log2_ratio** - Log fold change measurements
- **pvalue**, **pval**, **p_value** - Statistical significance
- **padj**, **adj_p_value** - FDR-adjusted p-values
- **binding_score**, **peak_score** - Binding strength metrics
- **enrichment** - Enrichment ratios

### Experimental Metadata Examples

- **sample_id** - Unique sample identifier (integer)
- **db_id** - Legacy database identifier (deprecated, do not use)
- **batch** - Experimental batch identifier
- **replicate** - Biological replicate number
- **time** - Timepoint in timecourse experiments

## Dataset Type Usage Examples

### genomic_features

**yeast_genome_resources** provides reference annotations:
- Gene coordinates and strand information
- Systematic IDs (locus_tag) and common names (symbol)
- Feature types (gene, ncRNA_gene, tRNA_gene, etc.)

Used for joining regulator/target identifiers across all other datasets.

### annotated_features

The most common dataset type in the collection. Examples:

- **hackett_2020**: TF overexpression with timecourse measurements
- **harbison_2004**: ChIP-chip binding with condition field definitions
- **kemmeren_2014**: TF deletion expression data
- **mahendrawada_2025**: ChEC-seq binding scores

Typical structure: regulator × target × measurements, with optional condition fields.
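
As a rough illustration of this structure, the sketch below queries a local parquet
export of an annotated_features config with DuckDB. The file name is hypothetical, and
the columns follow the naming conventions above (`regulator_symbol`, `target_symbol`,
`log2_ratio`, `padj`).

```python
import duckdb

con = duckdb.connect()

# Hypothetical local export of an annotated_features config:
# one row per regulator x target per sample, with quantitative measures.
hits = con.execute(
    """
    SELECT regulator_symbol, target_symbol, log2_ratio, padj
    FROM read_parquet('kemmeren_2014.parquet')  -- assumed local file
    WHERE padj < 0.05                           -- keep significant targets
    ORDER BY abs(log2_ratio) DESC
    """
).df()
print(hits.head())
```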
+ +### genome_map + +Position-level data, typically partitioned by sample or accession: + +- **barkai_compendium**: ChEC-seq pileup data partitioned by Series/Accession +- **rossi_2021**: ChIP-exo 5' tag coverage partitioned by sample +- **callingcards**: Transposon insertion density partitioned by batch + +### metadata + +Separate metadata configs or embedded metadata via `metadata_fields`: + +**Separate config example** (barkai_compendium): +```yaml +- config_name: GSE178430_metadata + dataset_type: metadata + applies_to: ["genomic_coverage"] +``` + +**Embedded metadata example** (harbison_2004): +```yaml +- config_name: harbison_2004 + dataset_type: annotated_features + metadata_fields: ["regulator_locus_tag", "regulator_symbol", "condition"] +``` + +### comparative + +**yeast_comparative_analysis** provides cross-dataset analysis results: + +- **dto config**: Direct Target Overlap analysis comparing binding and perturbation experiments +- Uses `source_sample` role for composite identifiers +- Format: `"repo_id;config_name;sample_id"` (semicolon-separated) +- Contains 8 quantitative measures: rank thresholds, set sizes, FDR, p-values +- Partitioned by binding_repo_dataset and perturbation_repo_dataset + +**Composite Sample Identifiers**: +Comparative datasets use composite identifiers to reference samples from other datasets: +- `binding_id`: Points to a binding experiment (e.g., `BrentLab/callingcards;annotated_features;1`) +- `perturbation_id`: Points to a perturbation experiment (e.g., `BrentLab/hackett_2020;hackett_2020;200`) + +**Typical structure**: source_sample_1 x source_sample_2 x ... x measurements + +**Use case**: Answer questions like "Which binding experiments show significant overlap with perturbation effects?" + +## Categorical Condition Definitions + +Many datasets define categorical experimental conditions using the `definitions` field. + +### harbison_2004 Environmental Conditions + +14 conditions with detailed specifications: +- **YPD** (rich media baseline) +- **SM** (amino acid starvation) +- **RAPA** (rapamycin treatment) +- **H2O2Hi**, **H2O2Lo** (oxidative stress) +- **HEAT** (heat shock) +- **GAL**, **RAFF** (alternative carbon sources) +- And 6 more... + +Each condition definition includes media composition, temperature, growth phase, and treatments. + +### hackett_2020 Nutrient Limitations + +```yaml +restriction: + definitions: + P: # Phosphate limitation + media: + phosphate_source: + - compound: potassium_phosphate_monobasic + concentration_percent: 0.002 + N: # Nitrogen limitation + media: + nitrogen_source: + - compound: ammonium_sulfate + concentration_percent: 0.004 + M: # Undefined limitation + description: "Not defined in the paper" +``` + +### hu_2007_reimand_2010 Treatment Conditions + +```yaml +heat_shock: + definitions: + true: + temperature_celsius: 39 + duration_minutes: 15 + false: + description: Standard growth conditions at 30°C +``` + +## Partitioning Strategies + +Large genome_map datasets use partitioning: + +**barkai_compendium** - Two-level partitioning: +```yaml +partitioning: + partition_by: ["Series", "Accession"] + path_template: "genome_map/*/*/part-0.parquet" +``` + +**callingcards** - Batch partitioning: +```yaml +partitioning: + enabled: true + partition_by: ["batch"] + path_template: "genome_map/batch={batch}/*.parquet" +``` + +## Collection-Wide Best Practices + +### 1. Omit unspecified fields with a comment + +`tfbpapi` will handle adding "unspecified" to fields which are not common across +datasets. 

```yaml
# CORRECT
experimental_conditions:
  temperature_celsius: 30
  # cultivation_method is not noted in the paper and is omitted

# INCORRECT
experimental_conditions:
  temperature_celsius: unspecified
```

### 2. Document Source Publications

If the original paper used other units, such as g/L, convert the value to
`concentration_percent` and add a comment with the original value and units.

```yaml
carbon_source:
  - compound: D-glucose
    # Saldanha et al 2004: 10 g/L
    concentration_percent: 1
```

### 3. Use Standard Field Roles

Apply semantic roles consistently:
- `regulator_identifier` - for regulator fields
- `target_identifier` - for target fields
- `quantitative_measure` - for measurements
- `experimental_condition` - for condition fields
- `genomic_coordinate` - for positional data

### 4. Provide sample_id

All annotated_features datasets should include `sample_id` to uniquely identify experimental samples. This enables cross-dataset joining and metadata management.

### 5. Specify metadata_fields or applies_to

For datasets with metadata, either:
- Use `metadata_fields` to extract metadata from the data itself, OR
- Create a separate metadata config with an `applies_to` field

### 6. Use Consistent Gene Identifiers

All regulator/target identifiers must be joinable to **yeast_genome_resources**:
- Use current systematic IDs (ORF names)
- Include both locus_tag and symbol fields
- Mark with appropriate roles
diff --git a/docs/datacard.md b/docs/datacard.md
new file mode 100644
index 0000000..cfab1f1
--- /dev/null
+++ b/docs/datacard.md
@@ -0,0 +1,6 @@
+# DataCard
+
+::: tfbpapi.datacard.DataCard
+    options:
+      show_root_heading: true
+      show_source: true
diff --git a/docs/errors.md b/docs/errors.md
new file mode 100644
index 0000000..6ba92ff
--- /dev/null
+++ b/docs/errors.md
@@ -0,0 +1,28 @@
+# Custom Exceptions
+
+## HfDataFetchError
+
+::: tfbpapi.errors.HfDataFetchError
+    options:
+      show_root_heading: true
+      show_source: true
+
+Raised when HuggingFace API requests fail during data fetching operations.
+
+## DataCardError
+
+::: tfbpapi.errors.DataCardError
+    options:
+      show_root_heading: true
+      show_source: true
+
+Base exception for DataCard operations.
+
+## DataCardValidationError
+
+::: tfbpapi.errors.DataCardValidationError
+    options:
+      show_root_heading: true
+      show_source: true
+
+Raised when dataset card validation fails during parsing or loading.
\ No newline at end of file
diff --git a/docs/fetchers.md b/docs/fetchers.md
new file mode 100644
index 0000000..2901a79
--- /dev/null
+++ b/docs/fetchers.md
@@ -0,0 +1,16 @@
+# Data Fetchers
+
+::: tfbpapi.fetchers.HfDataCardFetcher
+    options:
+      show_root_heading: true
+      show_source: true
+
+::: tfbpapi.fetchers.HfRepoStructureFetcher
+    options:
+      show_root_heading: true
+      show_source: true
+
+::: tfbpapi.fetchers.HfSizeInfoFetcher
+    options:
+      show_root_heading: true
+      show_source: true
diff --git a/docs/hf_cache_manager.md b/docs/hf_cache_manager.md
new file mode 100644
index 0000000..752b712
--- /dev/null
+++ b/docs/hf_cache_manager.md
@@ -0,0 +1,6 @@
+# HfCacheManager
+
+::: tfbpapi.hf_cache_manager.HfCacheManager
+    options:
+      show_root_heading: true
+      show_source: true
diff --git a/docs/huggingface_datacard.md b/docs/huggingface_datacard.md
new file mode 100644
index 0000000..d56c771
--- /dev/null
+++ b/docs/huggingface_datacard.md
@@ -0,0 +1,496 @@
# HuggingFace Dataset Card Format

This document describes the expected YAML metadata format for HuggingFace dataset
repositories used with the tfbpapi package. The metadata is defined in a YAML block at
the top of the repository's README.md file and provides structured information about
the dataset configuration and contents.

This documentation is intended for developers preparing or augmenting a HuggingFace
dataset repository to be compatible with tfbpapi. Before reading, please review the
[BrentLab/hackett_2020](https://huggingface.co/datasets/BrentLab/hackett_2020/blob/main/README.md)
datacard as an example of a complete implementation of a simple repository. After
reviewing Hackett 2020 and this documentation, it might be helpful to review a more
complex example such as:

- [BrentLab/barkai_compendium](https://huggingface.co/datasets/BrentLab/barkai_compendium):
  This contains a `genome_map` partitioned dataset with separate metadata applied via
  the `applies_to` field.
- [BrentLab/rossi_2021](https://huggingface.co/datasets/BrentLab/rossi_2021):
  This contains multiple `annotated_features` datasets with embedded metadata.
- [BrentLab/yeast_genomic_features](https://huggingface.co/datasets/BrentLab/yeast_genomic_features):
  This contains a simple `genomic_features` dataset used as a reference for other
  datasets in the collection.

## Dataset Types

The `dataset_type` field is a property of each config (hierarchically under
`config_name`). `tfbpapi` recognizes the following dataset types:

### 1. `genomic_features`
Static information about genomic features (genes, promoters, etc.)
- **Use case**: Gene annotations, regulatory classifications, static feature data
- **Structure**: One row per genomic feature
- **Required fields**: Usually includes gene identifiers, coordinates, classifications

### 2. `annotated_features`
Quantitative data associated with genomic features. A `sample_id` field should exist
to identify individual experiments performed under a single set of conditions (a shape
check is sketched below this list).
- **Use case**: Expression data, binding scores, differential expression results
- **Structure**: Each sample will have one row per genomic feature measured. The
  role `quantitative_measure` should be used to identify measurement columns.
- **Common fields**: `regulator_*`, `target_*` fields with the roles
  `regulator_identifier` and `target_identifier` respectively. Fields with the role
  `quantitative_measure` for measurements.
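
As a quick sanity check on this expected shape, the sketch below verifies that a local
annotated_features export carries a `sample_id` field and one row per feature within
each sample. The file path is hypothetical, and `target_locus_tag` is assumed to be the
feature identifier.

```python
import pandas as pd

# Hypothetical local export of an annotated_features config.
df = pd.read_parquet("annotated_features.parquet")

# sample_id is expected so that individual experiments can be identified.
assert "sample_id" in df.columns, "annotated_features requires a sample_id field"

# Within each sample there should be one row per measured feature.
dups = df.duplicated(subset=["sample_id", "target_locus_tag"]).sum()
print(f"{df['sample_id'].nunique()} samples, {dups} duplicated sample/target rows")
```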

### 3. `genome_map`
Position-level data across genomic coordinates
- **Use case**: Signal tracks, coverage data, genome-wide binding profiles
- **Structure**: Position-value pairs, often large datasets
- **Required fields**: `chr` (chromosome), `pos` (position), signal values

### 4. `metadata`
Experimental metadata and sample descriptions
- **Use case**: Sample information, experimental conditions, protocol details. Note
  that this can also include per-sample QC metrics. For cross-sample QC or analysis,
  see [comparative](#5-comparative) below.
- **Structure**: One row per sample
- **Common fields**: Sample identifiers, experimental conditions, publication info
- **Special field**: `applies_to` - Optional list of config names this metadata applies to

### 5. `comparative`

Quality control metrics, validation results, and cross-dataset analysis outputs.

**Use cases**:
- Cross-dataset quality assessments and validation metrics
- Analysis results relating samples across datasets or repositories
- Comparative analyses (e.g., binding vs expression correlation)

**Structure**: One row represents an observation on two or more samples. Note that the
name of the column containing the sample references is not specified. However, the
role and format of the sample references are strictly defined. See
[Defining Sample References](#defining-sample-references) below.

#### Defining Sample References

The name of the field that contains the sample reference is user-defined. However,
the contents of that field, and its role, must be as follows:

- **`source_sample`**: Fields containing composite sample identifiers. These must be in
  the format `"repo_id;config_name;sample_id"` (semicolon-separated).

Examples:
- `"BrentLab/harbison_2004;harbison_2004;CBF1_YPD"`
- `"BrentLab/kemmeren_2014;kemmeren_2014;sample_42"`

## Experimental Conditions

Experimental conditions can be specified in three ways (a sketch of how the levels
combine follows this list):

1. **Top-level** `experimental_conditions`: Apply to all configs in the repository.
   Use when experimental parameters are common across all datasets. This occurs
   at the same level as `configs`.
2. **Config-level** `experimental_conditions`: Apply to a specific config
   ([dataset](#dataset)). Use when certain datasets have experimental parameters that
   are not shared by all other datasets in the [repository](#huggingface-repo), but
   are common to all [samples](#sample) within that dataset.
3. **Field-level** with `role: experimental_condition` ([feature-roles](#feature-roles)): For
   per-sample or per-measurement variation in experimental conditions stored as
   data columns. This is specified in the
   `dataset_info.features` ([feature-definitions](#feature-definitions))
   section of a config. Categorical `experimental_condition` fields are defined in
   [categorical fields with value definitions](#categorical-fields-with-value-definitions).
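
Conceptually, the three levels behave like dictionaries merged from broadest to most
specific. The sketch below is illustrative only; the condition values are hypothetical,
not taken from a real datacard.

```python
# Broadest to most specific; later merges win on key collisions.
top_level = {"temperature_celsius": 30}         # repository-wide conditions
config_level = {"strain_background": "BY4741"}  # one config's conditions
field_level = {"temperature_celsius": 37}       # e.g., from a heat_shock definition

effective = {**top_level, **config_level, **field_level}
print(effective)  # {'temperature_celsius': 37, 'strain_background': 'BY4741'}
```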

The priority of experimental conditions is:

field-level > config-level > top-level

**Example of all three methods:**
```yaml
# Top-level experimental conditions (apply to all [datasets](#dataset) in the repo)
experimental_conditions:
  temperature_celsius: 30
configs:
- config_name: overexpression_data
  description: TF overexpression perturbation data
  dataset_type: annotated_features
  # The overexpression_data [dataset](#dataset) has an additional experimental
  # condition that is specific to this dataset
  experimental_conditions:
    strain_background: "BY4741"
  data_files:
  - split: train
    path: overexpression.parquet
  dataset_info:
    features:
    - name: time
      dtype: float
      description: Time point in minutes
      role: experimental_condition
    - name: mechanism
      dtype: string
      description: Induction mechanism (GEV or ZEV)
      role: experimental_condition
      definitions:
        GEV:
          perturbation_method:
            type: inducible_overexpression
            system: GEV
            inducer: beta-estradiol
            description: "Galactose-inducible estrogen receptor-VP16 fusion system"
        ZEV:
          perturbation_method:
            type: inducible_overexpression
            system: ZEV
            inducer: beta-estradiol
            description: >-
              Z3 (synthetic zinc finger)-estrogen receptor-VP16 fusion system
    - name: log2_ratio
      dtype: float
      description: Log2 fold change
      role: quantitative_measure
```

## Feature Definitions

Each config must include detailed feature definitions in `dataset_info.features`:
```yaml
dataset_info:
  features:
  - name: field_name # Column name in the data
    dtype: string # Data type (string, int64, float64, etc.)
    description: "Detailed description of what this field contains"
    role: "target_identifier" # Optional: semantic role of the feature
```

### Categorical Fields with Value Definitions

For fields with `role: experimental_condition` that contain categorical values, you can
provide structured definitions for each value using the `definitions` field. This allows
machine-parsable specification of what each condition value means experimentally:
```yaml
- name: condition
  dtype:
    class_label:
      names: ["standard", "heat_shock"]
  role: experimental_condition
  description: Growth condition of the sample
  definitions:
    standard:
      media:
        name: synthetic_complete
        carbon_source:
          - compound: D-glucose
            concentration_percent: 2
        nitrogen_source:
          - compound: yeast_nitrogen_base
            # lastname et al 2025 used 6.71 g/L
            concentration_percent: 0.671
            specifications:
              - without_amino_acids
              - without_ammonium_sulfate
          - compound: ammonium_sulfate
            # lastname et al 2025 used 5 g/L
            concentration_percent: 0.5
          - compound: amino_acid_dropout_mix
            # lastname et al 2025 used 2 g/L
            concentration_percent: 0.2
    heat_shock:
      temperature_celsius: 37
      duration_minutes: 10
```

Each key in `definitions` must correspond to a possible value in the field.
The structure under each value provides experimental parameters specific to that
condition, using the same nested format as `experimental_conditions` at the config or
top level.

### Naming Conventions

**Gene/Feature Identifiers:**
- `(regulator/target)_locus_tag`: Systematic gene identifiers (e.g., "YJR060W"). Must
  be able to join to a genomic_features dataset. If none is specified,
  then BrentLab/yeast_genomic_features is used.
- `(regulator/target)_symbol`: Standard gene symbols (e.g., "CBF1"). Must be able to
  join to a genomic_features dataset. If none is specified, then
  BrentLab/yeast_genomic_features is used (a joinability check is sketched below).
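
One way to check this joinability requirement, sketched with DuckDB and hypothetical
local files standing in for a data config and the genomic_features reference:

```python
import duckdb

con = duckdb.connect()

# Left-join regulator identifiers against the reference annotations;
# rows with no match flag locus tags that fail to join.
unmatched = con.execute(
    """
    SELECT d.regulator_locus_tag
    FROM read_parquet('binding_data.parquet') AS d           -- assumed data export
    LEFT JOIN read_parquet('genomic_features.parquet') AS g  -- assumed reference
        ON d.regulator_locus_tag = g.locus_tag
    WHERE g.locus_tag IS NULL
    """
).df()
print(f"{len(unmatched)} regulator locus tags fail to join")
```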

**Genomic Coordinates:**
Unless otherwise noted, assume that coordinates are 0-based, half-open intervals.

- `chr`: Chromosome identifier
- `start`, `end`: Genomic coordinates
- `pos`: Single position
- `strand`: Strand information (+ or -)

## Feature Roles

The optional `role` field provides semantic meaning to features, and is especially
useful for annotated_features datasets. tfbpapi recognizes the following roles:
`regulator_identifier`, `target_identifier`, `quantitative_measure`,
`experimental_condition`, `genomic_coordinate`, and `source_sample`.
**NOTE**: `experimental_condition` is a reserved role with additional behavior
as described above.

## Partitioned Datasets

For large datasets (e.g., most genome_map datasets), use partitioning:

```yaml
dataset_info:
  partitioning:
    enabled: true
    partition_by: ["accession"] # Partition column(s)
    path_template: "data/accession={accession}/*.parquet"
```

This allows efficient querying of subsets without loading the entire dataset.

## Metadata

### Metadata Relationships with `applies_to`

For metadata configs, you can explicitly specify which other configs the metadata
applies to using the `applies_to` field. This provides more control than automatic
type-based matching.

```yaml
configs:
# Data configs
- config_name: genome_map_data
  dataset_type: genome_map
  # ... rest of config

- config_name: binding_scores
  dataset_type: annotated_features
  # ... rest of config

- config_name: expression_data
  dataset_type: annotated_features
  # ... rest of config

# Metadata config that applies to multiple data configs
- config_name: repo_metadata
  dataset_type: metadata
  applies_to: ["genome_map_data", "binding_scores", "expression_data"]
  # ... rest of config
```

### Embedded Metadata with `metadata_fields`

When no explicit metadata config exists, you can extract metadata directly from the
dataset's own files using the `metadata_fields` field. This specifies which fields
should be treated as metadata (a rough sketch of the extraction follows).
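
The sketch below approximates the kind of extraction described in the next two
subsections in plain DuckDB terms; the file and field names are taken from the
single-file example below, and nothing here is the package's actual implementation.

```python
import duckdb

con = duckdb.connect()

# Expose the data file as a view, then collect the distinct values of the
# declared metadata_fields into a small, queryable result.
con.execute(
    "CREATE VIEW binding_data AS "
    "SELECT * FROM read_parquet('binding_measurements.parquet')"
)
meta = con.execute(
    "SELECT DISTINCT regulator_symbol, experimental_condition FROM binding_data"
).df()
print(meta)
```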
+ +### Single File Embedded Metadata + +For single parquet files, the system extracts distinct values using `SELECT DISTINCT`: + +```yaml +- config_name: binding_data + dataset_type: annotated_features + metadata_fields: ["regulator_symbol", "experimental_condition"] + data_files: + - split: train + path: binding_measurements.parquet + dataset_info: + features: + - name: regulator_symbol + dtype: string + description: Transcription factor name + - name: experimental_condition + dtype: string + description: Experimental treatment + - name: binding_score + dtype: float64 + description: Quantitative measurement +``` + +### Partitioned Dataset Embedded Metadata + +For partitioned datasets, partition values are extracted from directory structure: + +```yaml +- config_name: genome_map_data + dataset_type: genome_map + metadata_fields: ["run_accession", "regulator_symbol"] + data_files: + - split: train + path: genome_map/accession=*/regulator=*/*.parquet + dataset_info: + features: + - name: chr + dtype: string + description: Chromosome + - name: pos + dtype: int32 + description: Position + - name: signal + dtype: float32 + description: Signal intensity + partitioning: + enabled: true + partition_by: ["run_accession", "regulator_symbol"] +``` + +## Data File Organization + +### Single Files +```yaml +data_files: +- split: train + path: single_file.parquet +``` + +### Multiple Files/Partitioned Data +```yaml +data_files: +- split: train + path: data_directory/*/*.parquet # Glob patterns supported +``` + +## Complete Example Structure + +```yaml +license: mit +language: [en] +tags: [biology, genomics, transcription-factors] +pretty_name: "Example Genomics Dataset" +size_categories: [100K 5:\n", + " print(f\" ... and {len(repo_sizes) - 5} more repositories\")\n", + "\n", + "# Calculate total revisions\n", + "total_revisions = sum(len(repo.revisions) for repo in cache_info.repos)\n", + "print(f\"\\nTotal revisions across all repos: {total_revisions}\")\n", + "\n", + "# Show age distribution\n", + "from datetime import datetime\n", + "now = datetime.now().timestamp()\n", + "old_revisions = 0\n", + "for repo in cache_info.repos:\n", + " for rev in repo.revisions:\n", + " age_days = (now - rev.last_modified) / (24 * 3600)\n", + " if age_days > 30:\n", + " old_revisions += 1\n", + "\n", + "print(f\"Revisions older than 30 days: {old_revisions}\")\n", + "print(f\"Recent revisions (≤30 days): {total_revisions - old_revisions}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Querying Loaded Metadata\n", + "\n", + "Once metadata is loaded into DuckDB, we can query it using SQL." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Internal Cache Management Methods\n", + "\n", + "HfCacheManager provides several internal methods that work behind the scenes. Let's explore what these methods do and how they integrate with the caching strategy." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Working with Specific Metadata Configurations\n", + "\n", + "You can also retrieve metadata for specific configurations rather than all at once." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HfCacheManager Internal Methods:\n", + "===================================\n", + "\n", + "1. 
_get_metadata_for_config(config)\n", + " → Implements the 3-case strategy for a specific configuration\n", + " → Returns detailed result with strategy used and success status\n", + "\n", + "2. _check_metadata_exists_in_duckdb(table_name)\n", + " → Case 1: Checks if metadata table already exists in DuckDB\n", + " → Fast check using information_schema.tables\n", + "\n", + "3. _load_metadata_from_cache(config, table_name)\n", + " → Case 2: Attempts to load from local HuggingFace cache\n", + " → Uses try_to_load_from_cache() to find cached files\n", + "\n", + "4. _download_and_load_metadata(config, table_name)\n", + " → Case 3: Downloads from HuggingFace Hub if not cached\n", + " → Uses snapshot_download() for efficient file retrieval\n", + "\n", + "5. _create_duckdb_table_from_files(file_paths, table_name)\n", + " → Creates DuckDB views from parquet files\n", + " → Handles both single files and multiple files efficiently\n", + "\n", + "6. _extract_embedded_metadata_field(data_table, field, metadata_table)\n", + " → Extracts metadata fields from data tables\n", + " → Creates separate queryable metadata views\n", + "\n", + "These methods work together to provide:\n", + "• Transparent caching that 'just works'\n", + "• Minimal network usage through intelligent fallbacks\n", + "• Fast metadata access via DuckDB views\n", + "• Automatic handling of different file structures\n" + ] + } + ], + "source": [ + "# Demonstrate understanding of internal cache methods\n", + "print(\"HfCacheManager Internal Methods:\")\n", + "print(\"=\" * 35)\n", + "\n", + "print(\"\\n1. _get_metadata_for_config(config)\")\n", + "print(\" → Implements the 3-case strategy for a specific configuration\")\n", + "print(\" → Returns detailed result with strategy used and success status\")\n", + "\n", + "print(\"\\n2. _check_metadata_exists_in_duckdb(table_name)\")\n", + "print(\" → Case 1: Checks if metadata table already exists in DuckDB\")\n", + "print(\" → Fast check using information_schema.tables\")\n", + "\n", + "print(\"\\n3. _load_metadata_from_cache(config, table_name)\")\n", + "print(\" → Case 2: Attempts to load from local HuggingFace cache\")\n", + "print(\" → Uses try_to_load_from_cache() to find cached files\")\n", + "\n", + "print(\"\\n4. _download_and_load_metadata(config, table_name)\")\n", + "print(\" → Case 3: Downloads from HuggingFace Hub if not cached\")\n", + "print(\" → Uses snapshot_download() for efficient file retrieval\")\n", + "\n", + "print(\"\\n5. _create_duckdb_table_from_files(file_paths, table_name)\")\n", + "print(\" → Creates DuckDB views from parquet files\")\n", + "print(\" → Handles both single files and multiple files efficiently\")\n", + "\n", + "print(\"\\n6. _extract_embedded_metadata_field(data_table, field, metadata_table)\")\n", + "print(\" → Extracts metadata fields from data tables\")\n", + "print(\" → Creates separate queryable metadata views\")\n", + "\n", + "print(\"\\nThese methods work together to provide:\")\n", + "print(\"• Transparent caching that 'just works'\")\n", + "print(\"• Minimal network usage through intelligent fallbacks\")\n", + "print(\"• Fast metadata access via DuckDB views\")\n", + "print(\"• Automatic handling of different file structures\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Extracting Embedded Metadata\n", + "\n", + "Some datasets have metadata embedded within their data files. The HfCacheManager can extract this embedded metadata into separate, queryable tables." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Embedded Metadata Extraction\n", + "\n", + "One unique feature of HfCacheManager is the ability to extract embedded metadata fields from data tables into separate, queryable metadata tables." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Demonstrate embedded metadata extraction concept\n", + "print(\"Embedded Metadata Extraction:\")\n", + "print(\"=\" * 35)\n", + "\n", + "print(\"\\nScenario: You have a data table with embedded metadata fields\")\n", + "print(\"Example: genomics data with 'experimental_condition' field\")\n", + "\n", + "# Create sample data to demonstrate the concept\n", + "conn.execute(\"\"\"\n", + " CREATE TABLE sample_genomics_data AS \n", + " SELECT \n", + " 'gene_' || (row_number() OVER()) as gene_id,\n", + " random() * 1000 as expression_value,\n", + " CASE \n", + " WHEN (row_number() OVER()) % 4 = 0 THEN 'control'\n", + " WHEN (row_number() OVER()) % 4 = 1 THEN 'treatment_A'\n", + " WHEN (row_number() OVER()) % 4 = 2 THEN 'treatment_B'\n", + " ELSE 'stress_condition'\n", + " END as experimental_condition,\n", + " CASE \n", + " WHEN (row_number() OVER()) % 3 = 0 THEN 'timepoint_0h'\n", + " WHEN (row_number() OVER()) % 3 = 1 THEN 'timepoint_6h'\n", + " ELSE 'timepoint_24h'\n", + " END as timepoint\n", + " FROM range(100)\n", + "\"\"\")\n", + "\n", + "print(\"✓ Created sample genomics data with embedded metadata fields\")\n", + "\n", + "# Show the data structure\n", + "sample_data = conn.execute(\n", + " \"SELECT * FROM sample_genomics_data LIMIT 5\"\n", + ").fetchall()\n", + "\n", + "print(f\"\\nSample data structure:\")\n", + "print(\"gene_id | expression_value | experimental_condition | timepoint\")\n", + "print(\"-\" * 65)\n", + "for row in sample_data:\n", + " print(f\"{row[0]:8} | {row[1]:15.1f} | {row[2]:20} | {row[3]}\")\n", + "\n", + "print(f\"\\nEmbedded metadata fields identified:\")\n", + "print(\"• experimental_condition: Contains treatment/control information\")\n", + "print(\"• timepoint: Contains temporal sampling information\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Use HfCacheManager to extract embedded metadata\n", + "print(\"Using HfCacheManager for Metadata Extraction:\")\n", + "print(\"=\" * 50)\n", + "\n", + "# Extract experimental_condition metadata\n", + "success1 = cache_manager._extract_embedded_metadata_field(\n", + " 'sample_genomics_data', \n", + " 'experimental_condition', \n", + " 'metadata_experimental_conditions'\n", + ")\n", + "\n", + "# Extract timepoint metadata \n", + "success2 = cache_manager._extract_embedded_metadata_field(\n", + " 'sample_genomics_data',\n", + " 'timepoint', \n", + " 'metadata_timepoints'\n", + ")\n", + "\n", + "print(f\"Experimental condition extraction: {'✓ Success' if success1 else '✗ Failed'}\")\n", + "print(f\"Timepoint extraction: {'✓ Success' if success2 else '✗ Failed'}\")\n", + "\n", + "# Show extracted metadata tables\n", + "if success1:\n", + " print(f\"\\nExtracted experimental conditions:\")\n", + " conditions = conn.execute(\n", + " \"SELECT value, count FROM metadata_experimental_conditions ORDER BY count DESC\"\n", + " ).fetchall()\n", + " \n", + " for condition, count in conditions:\n", + " print(f\" • {condition}: {count} samples\")\n", + "\n", + "if success2:\n", + " print(f\"\\nExtracted timepoints:\")\n", + " timepoints = conn.execute(\n", + " \"SELECT value, count FROM metadata_timepoints ORDER BY count DESC\"\n", + " ).fetchall()\n", + " 
\n", + " for timepoint, count in timepoints:\n", + " print(f\" • {timepoint}: {count} samples\")\n", + "\n", + "print(f\"\\nBenefits of extraction:\")\n", + "print(\"• Separate queryable metadata tables\")\n", + "print(\"• Fast metadata-based filtering and analysis\") \n", + "print(\"• Clear separation of data and metadata concerns\")\n", + "print(\"• Reusable metadata across different analyses\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current HuggingFace Cache Status:\n", + "===================================\n", + "Total size: 5.5G\n", + "Number of repositories: 11\n", + "\n", + "Repository breakdown:\n", + " • BrentLab/yeast_comparative_analysis: 166.1K (1 revisions)\n", + " • BrentLab/yeast_genome_resources: 114.5K (7 revisions)\n", + " • BrentLab/barkai_compendium: 3.6G (1 revisions)\n", + " • BrentLab/kemmeren_2014: 646.2M (3 revisions)\n", + " • BrentLab/hu_2007_reimand_2010: 42.7M (1 revisions)\n", + " ... and 6 more repositories\n", + "\n", + "Target repository (BrentLab/mahendrawada_2025) cache info:\n", + " Size: 94.3M\n", + " Revisions: 4\n", + " Latest revision: af5ac9dc\n", + " Last modified: 1763578870.280984\n" + ] + } + ], + "source": [ + "from huggingface_hub import scan_cache_dir\n", + "\n", + "# Get current cache information \n", + "cache_info = scan_cache_dir()\n", + "\n", + "print(\"Current HuggingFace Cache Status:\")\n", + "print(\"=\" * 35)\n", + "print(f\"Total size: {cache_info.size_on_disk_str}\")\n", + "print(f\"Number of repositories: {len(cache_info.repos)}\")\n", + "\n", + "print(\"\\nRepository breakdown:\")\n", + "for repo in list(cache_info.repos)[:5]: # Show first 5 repos\n", + " print(f\" • {repo.repo_id}: {repo.size_on_disk_str} ({len(repo.revisions)} revisions)\")\n", + "\n", + "if len(cache_info.repos) > 5:\n", + " print(f\" ... and {len(cache_info.repos) - 5} more repositories\")\n", + "\n", + "# Show target repository if it exists in cache\n", + "target_repo = None\n", + "for repo in cache_info.repos:\n", + " if repo.repo_id == cache_manager.repo_id:\n", + " target_repo = repo\n", + " break\n", + "\n", + "if target_repo:\n", + " print(f\"\\nTarget repository ({cache_manager.repo_id}) cache info:\")\n", + " print(f\" Size: {target_repo.size_on_disk_str}\")\n", + " print(f\" Revisions: {len(target_repo.revisions)}\")\n", + " if target_repo.revisions:\n", + " latest_rev = max(target_repo.revisions, key=lambda r: r.last_modified)\n", + " print(f\" Latest revision: {latest_rev.commit_hash[:8]}\")\n", + " print(f\" Last modified: {latest_rev.last_modified}\")\n", + "else:\n", + " print(f\"\\nTarget repository ({cache_manager.repo_id}) not found in cache.\")\n", + " print(\"It may need to be downloaded first.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cache Cleanup by Age" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cleaning cache by age (30+ days old):\n", + "========================================\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Found 41 old revisions. Will free 4.7G\n", + "INFO:__main__:Dry run completed. 
Use dry_run=False to execute deletion\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Cleanup strategy created:\n", + "Expected space freed: 4.7G\n", + "Items to delete: 46\n", + "\n", + "Breakdown of items to delete:\n", + " • Blob files: 27\n", + " • Reference files: 0\n", + " • Repository directories: 7\n", + " • Snapshot directories: 12\n", + "\n", + "Sample blob files to delete:\n", + " • /home/chase/.cache/huggingface/hub/datasets--BrentLab--harbison_2004/blobs/b5fbd9e98fd8ddadeeb5631e3b6f5055e917c98d\n", + " • /home/chase/.cache/huggingface/hub/datasets--BrentLab--hackett_2020/blobs/a85bd6b418d9644d9adaa1269c27f97469a4aaee51af63cf1aa041f62cd8ba2c\n", + " • /home/chase/.cache/huggingface/hub/datasets--BrentLab--hackett_2020/blobs/c3e72ccb1b8deba4bbfd18abe6081de7ec3914d9\n", + " ... and 24 more blob files\n" + ] + } + ], + "source": [ + "# Clean cache entries older than 30 days (dry run)\n", + "print(\"Cleaning cache by age (30+ days old):\")\n", + "print(\"=\" * 40)\n", + "\n", + "age_cleanup = cache_manager.clean_cache_by_age(\n", + " max_age_days=30,\n", + " dry_run=True # Set to False to actually execute\n", + ")\n", + "\n", + "print(f\"\\nCleanup strategy created:\")\n", + "print(f\"Expected space freed: {age_cleanup.expected_freed_size_str}\")\n", + "\n", + "# Count total items to delete across all categories\n", + "total_items = len(age_cleanup.blobs) + len(age_cleanup.refs) + len(age_cleanup.repos) + len(age_cleanup.snapshots)\n", + "print(f\"Items to delete: {total_items}\")\n", + "\n", + "# Show breakdown of what would be deleted\n", + "if total_items > 0:\n", + " print(f\"\\nBreakdown of items to delete:\")\n", + " print(f\" • Blob files: {len(age_cleanup.blobs)}\")\n", + " print(f\" • Reference files: {len(age_cleanup.refs)}\")\n", + " print(f\" • Repository directories: {len(age_cleanup.repos)}\")\n", + " print(f\" • Snapshot directories: {len(age_cleanup.snapshots)}\")\n", + " \n", + " # Show some example items\n", + " if age_cleanup.blobs:\n", + " print(f\"\\nSample blob files to delete:\")\n", + " for item in list(age_cleanup.blobs)[:3]:\n", + " print(f\" • {item}\")\n", + " if len(age_cleanup.blobs) > 3:\n", + " print(f\" ... and {len(age_cleanup.blobs) - 3} more blob files\")\n", + "else:\n", + " print(\"No old files found for cleanup.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cache Cleanup by Size" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cleaning cache to target size: 1GB\n", + "========================================\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Selected 17 revisions for deletion. Will free 3.8G\n", + "INFO:__main__:Dry run completed. Use dry_run=False to execute deletion\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Size-based cleanup strategy:\n", + "Expected space freed: 3.8G\n", + "Items to delete: 85\n", + "\n", + "Comparing cleanup strategies for 1GB:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Selected 17 revisions for deletion. Will free 3.8G\n", + "INFO:__main__:Dry run completed. 
Use dry_run=False to execute deletion\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " • oldest_first : 3.8G (85 items)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Selected 4 revisions for deletion. Will free 4.0G\n", + "INFO:__main__:Dry run completed. Use dry_run=False to execute deletion\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " • largest_first : 4.0G (8 items)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Selected 17 revisions for deletion. Will free 3.8G\n", + "INFO:__main__:Dry run completed. Use dry_run=False to execute deletion\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " • least_used : 3.8G (85 items)\n" + ] + } + ], + "source": [ + "# Clean cache to target size (dry run)\n", + "target_size = \"1GB\"\n", + "print(f\"Cleaning cache to target size: {target_size}\")\n", + "print(\"=\" * 40)\n", + "\n", + "size_cleanup = cache_manager.clean_cache_by_size(\n", + " target_size=target_size,\n", + " strategy=\"oldest_first\", # Can be: oldest_first, largest_first, least_used\n", + " dry_run=True\n", + ")\n", + "\n", + "print(f\"\\nSize-based cleanup strategy:\")\n", + "print(f\"Expected space freed: {size_cleanup.expected_freed_size_str}\")\n", + "\n", + "# Count total items to delete across all categories\n", + "total_items = len(size_cleanup.blobs) + len(size_cleanup.refs) + len(size_cleanup.repos) + len(size_cleanup.snapshots)\n", + "print(f\"Items to delete: {total_items}\")\n", + "\n", + "# Compare different strategies\n", + "strategies = [\"oldest_first\", \"largest_first\", \"least_used\"]\n", + "print(f\"\\nComparing cleanup strategies for {target_size}:\")\n", + "\n", + "for strategy in strategies:\n", + " try:\n", + " strategy_result = cache_manager.clean_cache_by_size(\n", + " target_size=target_size,\n", + " strategy=strategy,\n", + " dry_run=True\n", + " )\n", + " strategy_total = (len(strategy_result.blobs) + len(strategy_result.refs) + \n", + " len(strategy_result.repos) + len(strategy_result.snapshots))\n", + " print(f\" • {strategy:15}: {strategy_result.expected_freed_size_str:>8} \"\n", + " f\"({strategy_total} items)\")\n", + " except Exception as e:\n", + " print(f\" • {strategy:15}: Error - {e}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cleaning Unused Revisions" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cleaning unused revisions (keep latest 2 per repo):\n", + "==================================================\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Found 31 unused revisions. Will free 642.9M\n", + "INFO:__main__:Dry run completed. 
Use dry_run=False to execute deletion\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Revision cleanup strategy:\n", + "Expected space freed: 642.9M\n", + "Items to delete: 118\n", + "\n", + "Breakdown of cleanup:\n", + " • Blob files: 87\n", + " • Reference files: 0\n", + " • Repository directories: 0\n", + " • Snapshot directories: 31\n", + "\n", + "Per-repository revision analysis:\n", + "\n", + " • BrentLab/yeast_comparative_analysis:\n", + " Total revisions: 1\n", + " Would keep: 1\n", + " Would delete: 0\n", + " Keep: ac03d065 (modified: 1767824941.5531375)\n", + "\n", + " • BrentLab/yeast_genome_resources:\n", + " Total revisions: 7\n", + " Would keep: 2\n", + " Would delete: 5\n", + " Keep: 42beb284 (modified: 1758155946.5549896)\n", + " Keep: 15fdb72f (modified: 1755819093.2306638)\n", + " Delete: 7441b9a8 (modified: 1755816785.6988702)\n", + "\n", + " • BrentLab/barkai_compendium:\n", + " Total revisions: 1\n", + " Would keep: 1\n", + " Would delete: 0\n", + " Keep: a987ef37 (modified: 1756926783.3167186)\n" + ] + } + ], + "source": [ + "# Clean unused revisions, keeping only the latest 2 per repository\n", + "print(\"Cleaning unused revisions (keep latest 2 per repo):\")\n", + "print(\"=\" * 50)\n", + "\n", + "revision_cleanup = cache_manager.clean_unused_revisions(\n", + " keep_latest=2,\n", + " dry_run=True\n", + ")\n", + "\n", + "print(f\"\\nRevision cleanup strategy:\")\n", + "print(f\"Expected space freed: {revision_cleanup.expected_freed_size_str}\")\n", + "\n", + "# Count total items to delete across all categories\n", + "total_items = len(revision_cleanup.blobs) + len(revision_cleanup.refs) + len(revision_cleanup.repos) + len(revision_cleanup.snapshots)\n", + "print(f\"Items to delete: {total_items}\")\n", + "\n", + "# Show breakdown\n", + "if total_items > 0:\n", + " print(f\"\\nBreakdown of cleanup:\")\n", + " print(f\" • Blob files: {len(revision_cleanup.blobs)}\")\n", + " print(f\" • Reference files: {len(revision_cleanup.refs)}\") \n", + " print(f\" • Repository directories: {len(revision_cleanup.repos)}\")\n", + " print(f\" • Snapshot directories: {len(revision_cleanup.snapshots)}\")\n", + "\n", + "# Show repository-specific breakdown\n", + "cache_info = scan_cache_dir()\n", + "if cache_info.repos:\n", + " print(\"\\nPer-repository revision analysis:\")\n", + " for repo in list(cache_info.repos)[:3]:\n", + " print(f\"\\n • {repo.repo_id}:\")\n", + " print(f\" Total revisions: {len(repo.revisions)}\")\n", + " print(f\" Would keep: {min(2, len(repo.revisions))}\")\n", + " print(f\" Would delete: {max(0, len(repo.revisions) - 2)}\")\n", + " \n", + " # Show revision details\n", + " sorted_revisions = sorted(repo.revisions, key=lambda r: r.last_modified, reverse=True)\n", + " for i, rev in enumerate(sorted_revisions[:2]):\n", + " print(f\" Keep: {rev.commit_hash[:8]} (modified: {rev.last_modified})\")\n", + " \n", + " for rev in sorted_revisions[2:3]: # Show one that would be deleted\n", + " print(f\" Delete: {rev.commit_hash[:8]} (modified: {rev.last_modified})\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Automated Cache Management" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Starting automated cache cleanup...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Automated cache cleanup (comprehensive):\n", + 
"========================================\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Found 41 old revisions. Will free 4.7G\n", + "INFO:__main__:Dry run completed. Use dry_run=False to execute deletion\n", + "INFO:__main__:Found 31 unused revisions. Will free 642.9M\n", + "INFO:__main__:Dry run completed. Use dry_run=False to execute deletion\n", + "INFO:__main__:Selected 9 revisions for deletion. Will free 2.8M\n", + "INFO:__main__:Dry run completed. Use dry_run=False to execute deletion\n", + "INFO:__main__:Automated cleanup complete. Total freed: 5.0GB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Automated cleanup executed 3 strategies:\n", + " 1. Strategy freed: 4.7G\n", + " 2. Strategy freed: 642.9M\n", + " 3. Strategy freed: 2.8M\n", + "\n", + "Total space that would be freed: 5.0GB\n", + "Cache size after cleanup: 129.8MB\n" + ] + } + ], + "source": [ + "# Automated cache cleanup with multiple strategies\n", + "print(\"Automated cache cleanup (comprehensive):\")\n", + "print(\"=\" * 40)\n", + "\n", + "auto_cleanup = cache_manager.auto_clean_cache(\n", + " max_age_days=30, # Remove anything older than 30 days\n", + " max_total_size=\"5GB\", # Target maximum cache size\n", + " keep_latest_per_repo=2, # Keep 2 latest revisions per repo\n", + " dry_run=True # Dry run for safety\n", + ")\n", + "\n", + "print(f\"\\nAutomated cleanup executed {len(auto_cleanup)} strategies:\")\n", + "\n", + "total_freed = 0\n", + "for i, strategy in enumerate(auto_cleanup, 1):\n", + " print(f\" {i}. Strategy freed: {strategy.expected_freed_size_str}\")\n", + " total_freed += strategy.expected_freed_size\n", + "\n", + "print(f\"\\nTotal space that would be freed: {cache_manager._format_bytes(total_freed)}\")\n", + "\n", + "# Calculate final cache size\n", + "current_cache = scan_cache_dir()\n", + "final_size = current_cache.size_on_disk - total_freed\n", + "print(f\"Cache size after cleanup: {cache_manager._format_bytes(max(0, final_size))}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Best Practices and Performance Tips\n", + "\n", + "Here are some best practices for using HfCacheManager effectively:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Performance Best Practices" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Performance Demonstration: Cache Management Benefits\n", + "=======================================================\n", + "\n", + "Demonstrating cache cleanup performance...\n", + "\n", + "1. Cache scanning performance:\n", + " Time to scan cache: 0.096 seconds\n", + " Repositories found: 11\n", + " Total cache size: 5.5G\n", + "\n", + "2. Cleanup strategy creation performance:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Found 41 old revisions. Will free 4.7G\n", + "INFO:__main__:Dry run completed. Use dry_run=False to execute deletion\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Age cleanup strategy: 0.094 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:Selected 17 revisions for deletion. Will free 3.8G\n", + "INFO:__main__:Dry run completed. Use dry_run=False to execute deletion\n", + "INFO:__main__:Found 31 unused revisions. Will free 642.9M\n", + "INFO:__main__:Dry run completed. 
Use dry_run=False to execute deletion\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Size cleanup strategy: 0.093 seconds\n", + " Revision cleanup strategy: 0.100 seconds\n", + "\n", + "Performance insights:\n", + "• Cache scanning is fast: 0.096s for 11 repos\n", + "• Cleanup strategy creation is efficient\n", + "• Dry runs allow safe preview of cleanup operations\n", + "• Multiple strategies can be compared quickly\n" + ] + } + ], + "source": [ + "import time\n", + "\n", + "print(\"Performance Demonstration: Cache Management Benefits\")\n", + "print(\"=\" * 55)\n", + "\n", + "print(\"\\nDemonstrating cache cleanup performance...\")\n", + "\n", + "# Show performance of cache scanning and cleanup strategy creation\n", + "print(\"\\n1. Cache scanning performance:\")\n", + "start_time = time.time()\n", + "cache_info = scan_cache_dir()\n", + "scan_time = time.time() - start_time\n", + "print(f\" Time to scan cache: {scan_time:.3f} seconds\")\n", + "print(f\" Repositories found: {len(cache_info.repos)}\")\n", + "print(f\" Total cache size: {cache_info.size_on_disk_str}\")\n", + "\n", + "# Show performance of cleanup strategy creation\n", + "print(\"\\n2. Cleanup strategy creation performance:\")\n", + "\n", + "start_time = time.time()\n", + "age_strategy = cache_manager.clean_cache_by_age(max_age_days=30, dry_run=True)\n", + "age_time = time.time() - start_time\n", + "print(f\" Age cleanup strategy: {age_time:.3f} seconds\")\n", + "\n", + "start_time = time.time()\n", + "size_strategy = cache_manager.clean_cache_by_size(target_size=\"1GB\", dry_run=True)\n", + "size_time = time.time() - start_time\n", + "print(f\" Size cleanup strategy: {size_time:.3f} seconds\")\n", + "\n", + "start_time = time.time()\n", + "revision_strategy = cache_manager.clean_unused_revisions(keep_latest=2, dry_run=True)\n", + "revision_time = time.time() - start_time\n", + "print(f\" Revision cleanup strategy: {revision_time:.3f} seconds\")\n", + "\n", + "print(f\"\\nPerformance insights:\")\n", + "print(f\"• Cache scanning is fast: {scan_time:.3f}s for {len(cache_info.repos)} repos\")\n", + "print(f\"• Cleanup strategy creation is efficient\")\n", + "print(f\"• Dry runs allow safe preview of cleanup operations\")\n", + "print(f\"• Multiple strategies can be compared quickly\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Memory and Storage Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Memory and Storage Optimization Tips:\n", + "========================================\n", + "\n", + "1. DuckDB Views vs Tables:\n", + " • HfCacheManager creates VIEWS by default (not tables)\n", + " • Views reference original parquet files without duplication\n", + " • This saves storage space while enabling fast SQL queries\n", + "\n", + "2. Metadata-First Workflow:\n", + " • Load metadata first to understand data structure\n", + " • Use metadata to filter and select specific data subsets\n", + " • Avoid loading entire datasets when only portions are needed\n", + "\n", + "3. Cache Management Strategy:\n", + " • Run automated cleanup regularly\n", + " • Keep cache size reasonable for your system\n", + " • Prioritize keeping recent and frequently-used datasets\n" + ] + } + ], + "source": [ + "print(\"Memory and Storage Optimization Tips:\")\n", + "print(\"=\" * 40)\n", + "\n", + "print(\"\\n1. 
DuckDB Views vs Tables:\")\n", + "print(\" • HfCacheManager creates VIEWS by default (not tables)\")\n", + "print(\" • Views reference original parquet files without duplication\")\n", + "print(\" • This saves storage space while enabling fast SQL queries\")\n", + "\n", + "print(\"\\n2. Metadata-First Workflow:\")\n", + "print(\" • Load metadata first to understand data structure\")\n", + "print(\" • Use metadata to filter and select specific data subsets\")\n", + "print(\" • Avoid loading entire datasets when only portions are needed\")\n", + "\n", + "print(\"\\n3. Cache Management Strategy:\")\n", + "print(\" • Run automated cleanup regularly\")\n", + "print(\" • Keep cache size reasonable for your system\")\n", + "print(\" • Prioritize keeping recent and frequently-used datasets\")\n", + "\n", + "# Demonstrate DuckDB view benefits\n", + "tables_info = conn.execute(\n", + " \"SELECT table_name, table_type FROM information_schema.tables WHERE table_name LIKE 'metadata_%'\"\n", + ").fetchall()\n", + "\n", + "if tables_info:\n", + " print(f\"\\nCurrent DuckDB objects ({len(tables_info)} total):\")\n", + " for table_name, table_type in tables_info:\n", + " print(f\" • {table_name}: {table_type}\")\n", + " \n", + " view_count = sum(1 for _, table_type in tables_info if table_type == 'VIEW')\n", + " print(f\"\\n {view_count} views created (space-efficient!)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. Integration with Other Components\n", + "\n", + "The HfCacheManager works seamlessly with other components in the tfbpapi ecosystem." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HfCacheManager Integration Workflow:\n", + "========================================\n", + "\n", + "1. Cache Management Setup:\n", + " from tfbpapi.HfCacheManager import HfCacheManager\n", + " cache_mgr = HfCacheManager(repo_id, duckdb_conn)\n", + " # Inherits all DataCard functionality + cache management\n", + "\n", + "2. Proactive Cache Cleanup:\n", + " # Clean before large operations\n", + " cache_mgr.auto_clean_cache(max_total_size='5GB', dry_run=False)\n", + " # Or use specific strategies\n", + " cache_mgr.clean_cache_by_age(max_age_days=30)\n", + "\n", + "3. Data Loading with Cache Awareness:\n", + " # The 3-case strategy works automatically with HfQueryAPI\n", + " from tfbpapi import HfQueryAPI\n", + " query_api = HfQueryAPI(repo_id, duckdb_conn)\n", + " # Metadata loading uses cache manager's strategy\n", + " data_df = query_api.get_pandas('config_name')\n", + "\n", + "4. Embedded Metadata Extraction:\n", + " # Extract metadata fields after data loading\n", + " cache_mgr._extract_embedded_metadata_field(\n", + " 'data_table_name', 'metadata_field', 'metadata_table_name')\n", + "\n", + "5. Regular Cache Maintenance:\n", + " # Schedule regular cleanup\n", + " cache_mgr.clean_unused_revisions(keep_latest=2)\n", + " cache_mgr.clean_cache_by_size('10GB', strategy='oldest_first')\n", + "\n", + "Current Session State:\n", + "Repository: BrentLab/mahendrawada_2025\n", + "DuckDB tables: 0\n", + "HF cache size: 5.5G\n", + "Cache repositories: 11\n" + ] + } + ], + "source": [ + "print(\"HfCacheManager Integration Workflow:\")\n", + "print(\"=\" * 40)\n", + "\n", + "print(\"\\n1. 
Cache Management Setup:\")\n", + "print(\" from tfbpapi.HfCacheManager import HfCacheManager\")\n", + "print(\" cache_mgr = HfCacheManager(repo_id, duckdb_conn)\")\n", + "print(\" # Inherits all DataCard functionality + cache management\")\n", + "\n", + "print(\"\\n2. Proactive Cache Cleanup:\")\n", + "print(\" # Clean before large operations\")\n", + "print(\" cache_mgr.auto_clean_cache(max_total_size='5GB', dry_run=False)\")\n", + "print(\" # Or use specific strategies\")\n", + "print(\" cache_mgr.clean_cache_by_age(max_age_days=30)\")\n", + "\n", + "print(\"\\n3. Data Loading with Cache Awareness:\")\n", + "print(\" # The 3-case strategy works automatically with HfQueryAPI\")\n", + "print(\" from tfbpapi import HfQueryAPI\")\n", + "print(\" query_api = HfQueryAPI(repo_id, duckdb_conn)\")\n", + "print(\" # Metadata loading uses cache manager's strategy\")\n", + "print(\" data_df = query_api.get_pandas('config_name')\")\n", + "\n", + "print(\"\\n4. Embedded Metadata Extraction:\")\n", + "print(\" # Extract metadata fields after data loading\")\n", + "print(\" cache_mgr._extract_embedded_metadata_field(\")\n", + "print(\" 'data_table_name', 'metadata_field', 'metadata_table_name')\")\n", + "\n", + "print(\"\\n5. Regular Cache Maintenance:\")\n", + "print(\" # Schedule regular cleanup\")\n", + "print(\" cache_mgr.clean_unused_revisions(keep_latest=2)\")\n", + "print(\" cache_mgr.clean_cache_by_size('10GB', strategy='oldest_first')\")\n", + "\n", + "# Show current state\n", + "print(f\"\\nCurrent Session State:\")\n", + "print(f\"Repository: {cache_manager.repo_id}\")\n", + "print(f\"DuckDB tables: {len(conn.execute('SELECT table_name FROM information_schema.tables').fetchall())}\")\n", + "\n", + "cache_info = scan_cache_dir()\n", + "print(f\"HF cache size: {cache_info.size_on_disk_str}\")\n", + "print(f\"Cache repositories: {len(cache_info.repos)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 9. Troubleshooting and Error Handling\n", + "\n", + "The HfCacheManager includes comprehensive error handling and diagnostic capabilities." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cache Management Troubleshooting:\n", + "===================================\n", + "\n", + "1. Import and Setup Issues:\n", + " • Ensure correct import: from tfbpapi.HfCacheManager import HfCacheManager\n", + " • Verify DuckDB connection: conn = duckdb.connect(':memory:')\n", + " • Check repository access permissions\n", + "\n", + "2. Cache Space and Performance Issues:\n", + " Current cache size: 5.5G\n", + " • Use auto_clean_cache() for automated management\n", + " • Monitor cache growth with scan_cache_dir()\n", + " • Set appropriate size limits for your system\n", + "\n", + "3. Cache Cleanup Issues:\n", + " • Use dry_run=True first to preview changes\n", + " • Check disk permissions for cache directory\n", + " • Verify no active processes are using cached files\n", + "\n", + "4. DuckDB Integration Issues:\n", + " • Ensure DuckDB connection is active\n", + " • Check memory limits for in-memory databases\n", + " • Verify table names don't conflict\n", + "\n", + "Cache Health Check:\n", + "✓ DuckDB connection: DuckDB OK\n", + "✓ Cache access: 11 repositories found\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:__main__:No old revisions found to delete\n", + "INFO:__main__:Found 0 old revisions. 
Will free 0.0\n", + "INFO:__main__:Dry run completed. Use dry_run=False to execute deletion\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Cache cleanup methods: Working\n", + "\n", + "Current Status:\n", + "Repository: BrentLab/mahendrawada_2025\n", + "Logger configured: True\n", + "Cache management ready: ✓\n" + ] + } + ], + "source": [ + "print(\"Cache Management Troubleshooting:\")\n", + "print(\"=\" * 35)\n", + "\n", + "print(\"\\n1. Import and Setup Issues:\")\n", + "print(\" • Ensure correct import: from tfbpapi.HfCacheManager import HfCacheManager\")\n", + "print(\" • Verify DuckDB connection: conn = duckdb.connect(':memory:')\")\n", + "print(\" • Check repository access permissions\")\n", + "\n", + "print(\"\\n2. Cache Space and Performance Issues:\")\n", + "try:\n", + " cache_info = scan_cache_dir()\n", + " print(f\" Current cache size: {cache_info.size_on_disk_str}\")\n", + " print(\" • Use auto_clean_cache() for automated management\")\n", + " print(\" • Monitor cache growth with scan_cache_dir()\")\n", + " print(\" • Set appropriate size limits for your system\")\n", + " \n", + " # Show if cache is getting large\n", + " total_gb = cache_info.size_on_disk / (1024**3)\n", + " if total_gb > 10:\n", + " print(f\" ⚠️ Large cache detected ({total_gb:.1f}GB) - consider cleanup\")\n", + " \n", + "except Exception as e:\n", + " print(f\" Cache scan error: {e}\")\n", + "\n", + "print(\"\\n3. Cache Cleanup Issues:\")\n", + "print(\" • Use dry_run=True first to preview changes\")\n", + "print(\" • Check disk permissions for cache directory\")\n", + "print(\" • Verify no active processes are using cached files\")\n", + "\n", + "print(\"\\n4. DuckDB Integration Issues:\")\n", + "print(\" • Ensure DuckDB connection is active\")\n", + "print(\" • Check memory limits for in-memory databases\")\n", + "print(\" • Verify table names don't conflict\")\n", + "\n", + "# Perform health checks\n", + "print(f\"\\nCache Health Check:\")\n", + "\n", + "# Test DuckDB\n", + "try:\n", + " test_result = conn.execute(\"SELECT 'DuckDB OK' as status\").fetchone()\n", + " print(f\"✓ DuckDB connection: {test_result[0]}\")\n", + "except Exception as e:\n", + " print(f\"✗ DuckDB connection: {e}\")\n", + "\n", + "# Test cache access\n", + "try:\n", + " cache_info = scan_cache_dir()\n", + " print(f\"✓ Cache access: {len(cache_info.repos)} repositories found\")\n", + "except Exception as e:\n", + " print(f\"✗ Cache access: {e}\")\n", + "\n", + "# Test cache manager methods\n", + "try:\n", + " test_cleanup = cache_manager.clean_cache_by_age(max_age_days=999, dry_run=True)\n", + " print(f\"✓ Cache cleanup methods: Working\")\n", + "except Exception as e:\n", + " print(f\"✗ Cache cleanup methods: {e}\")\n", + "\n", + "print(f\"\\nCurrent Status:\")\n", + "print(f\"Repository: {cache_manager.repo_id}\")\n", + "print(f\"Logger configured: {cache_manager.logger is not None}\")\n", + "print(f\"Cache management ready: ✓\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tfbpapi-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/tutorials/datacard_tutorial.ipynb b/docs/tutorials/datacard_tutorial.ipynb new file mode 100644 index 0000000..1556a1c --- 
/dev/null +++ b/docs/tutorials/datacard_tutorial.ipynb @@ -0,0 +1,606 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DataCard Tutorial: Exploring HuggingFace Dataset Metadata\n", + "\n", + "The `DataCard` class provides an interface for exploring HuggingFace dataset metadata without loading the actual genomic data. This is particularly useful for:\n", + "\n", + "- Understanding dataset structure and available configurations\n", + "- Exploring experimental conditions at all hierarchy levels\n", + "- Discovering metadata relationships\n", + "- Planning data analysis workflows and metadata table creation\n", + "\n", + "In this tutorial, we'll explore the **BrentLab/harbison_2004** dataset, which contains ChIP-chip data for transcription factor binding across 14 environmental conditions in yeast." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Instantiating a DataCard Object" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Repository: BrentLab/harbison_2004\n" + ] + } + ], + "source": [ + "from tfbpapi.datacard import DataCard\n", + "\n", + "card = DataCard('BrentLab/harbison_2004')\n", + "\n", + "print(f\"Repository: {card.repo_id}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Repository Overview\n", + "\n", + "Let's start by getting a high-level overview of the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Repository Information:\n", + "========================================\n", + "repo_id : BrentLab/harbison_2004\n", + "pretty_name : Harbison, 2004 ChIP-chip\n", + "license : mit\n", + "tags : ['genomics', 'yeast', 'transcription', 'binding']\n", + "language : ['en']\n", + "size_categories : ['1M\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "" + ], + "text/plain": [ + " sample_id regulator_locus_tag regulator_symbol condition carbon_source \\\n", + "0 1 YSC0017 MATA1 YPD glucose \n", + "1 2 YAL051W OAF1 YPD glucose \n", + "2 3 YBL005W PDR3 YPD glucose \n", + "3 4 YBL008W HIR1 YPD glucose \n", + "4 5 YBL021C HAP3 YPD glucose \n", + "\n", + " temperature_celsius dataset_id \n", + "0 30.0 BrentLab/harbison_2004/harbison_2004 \n", + "1 30.0 BrentLab/harbison_2004/harbison_2004 \n", + "2 30.0 BrentLab/harbison_2004/harbison_2004 \n", + "3 30.0 BrentLab/harbison_2004/harbison_2004 \n", + "4 30.0 BrentLab/harbison_2004/harbison_2004 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Query all datasets for samples grown on glucose\n", + "glucose_samples = vdb.query(filters={\"carbon_source\": \"glucose\"})\n", + "\n", + "print(f\"Found {len(glucose_samples)} samples with glucose\")\n", + "print(f\"\\nColumns: {list(glucose_samples.columns)}\")\n", + "print(f\"\\nFirst few rows:\")\n", + "glucose_samples.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Query Specific Datasets\n", + "\n", + "Limit your query to specific datasets using the `datasets` parameter." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 310 samples from harbison_2004\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "sample_id", + "rawType": "int32", + "type": "integer" + }, + { + "name": "regulator_locus_tag", + "rawType": "object", + "type": "string" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "condition", + "rawType": "object", + "type": "string" + }, + { + "name": "carbon_source", + "rawType": "object", + "type": "string" + }, + { + "name": "temperature_celsius", + "rawType": "float64", + "type": "float" + }, + { + "name": "dataset_id", + "rawType": "object", + "type": "string" + } + ], + "ref": "0c9ebf95-0bf1-46d7-83ee-57b87c5def44", + "rows": [ + [ + "0", + "1", + "YSC0017", + "MATA1", + "YPD", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "1", + "2", + "YAL051W", + "OAF1", + "YPD", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "2", + "3", + "YBL005W", + "PDR3", + "YPD", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "3", + "4", + "YBL008W", + "HIR1", + "YPD", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "4", + "5", + "YBL021C", + "HAP3", + "YPD", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ] + ], + "shape": { + "columns": 7, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idregulator_locus_tagregulator_symbolconditioncarbon_sourcetemperature_celsiusdataset_id
01YSC0017MATA1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
12YAL051WOAF1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
23YBL005WPDR3YPDglucose30.0BrentLab/harbison_2004/harbison_2004
34YBL008WHIR1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
45YBL021CHAP3YPDglucose30.0BrentLab/harbison_2004/harbison_2004
\n", + "
" + ], + "text/plain": [ + " sample_id regulator_locus_tag regulator_symbol condition carbon_source \\\n", + "0 1 YSC0017 MATA1 YPD glucose \n", + "1 2 YAL051W OAF1 YPD glucose \n", + "2 3 YBL005W PDR3 YPD glucose \n", + "3 4 YBL008W HIR1 YPD glucose \n", + "4 5 YBL021C HAP3 YPD glucose \n", + "\n", + " temperature_celsius dataset_id \n", + "0 30.0 BrentLab/harbison_2004/harbison_2004 \n", + "1 30.0 BrentLab/harbison_2004/harbison_2004 \n", + "2 30.0 BrentLab/harbison_2004/harbison_2004 \n", + "3 30.0 BrentLab/harbison_2004/harbison_2004 \n", + "4 30.0 BrentLab/harbison_2004/harbison_2004 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Query only harbison_2004\n", + "harbison_glucose = vdb.query(\n", + " filters={\"carbon_source\": \"glucose\"},\n", + " datasets=[(\"BrentLab/harbison_2004\", \"harbison_2004\")]\n", + ")\n", + "\n", + "print(f\"Found {len(harbison_glucose)} samples from harbison_2004\")\n", + "harbison_glucose.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Select Specific Fields\n", + "\n", + "Return only the fields you need with the `fields` parameter." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Columns: ['sample_id', 'carbon_source', 'temperature_celsius', 'dataset_id']\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "sample_id", + "rawType": "int32", + "type": "integer" + }, + { + "name": "carbon_source", + "rawType": "object", + "type": "string" + }, + { + "name": "temperature_celsius", + "rawType": "float64", + "type": "float" + }, + { + "name": "dataset_id", + "rawType": "object", + "type": "string" + } + ], + "ref": "38c6fa44-08cf-4751-9476-7bca3cb1c41c", + "rows": [ + [ + "0", + "1", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "1", + "2", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "2", + "3", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "3", + "4", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "4", + "5", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ] + ], + "shape": { + "columns": 4, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idcarbon_sourcetemperature_celsiusdataset_id
01glucose30.0BrentLab/harbison_2004/harbison_2004
12glucose30.0BrentLab/harbison_2004/harbison_2004
23glucose30.0BrentLab/harbison_2004/harbison_2004
34glucose30.0BrentLab/harbison_2004/harbison_2004
45glucose30.0BrentLab/harbison_2004/harbison_2004
\n", + "
" + ], + "text/plain": [ + " sample_id carbon_source temperature_celsius \\\n", + "0 1 glucose 30.0 \n", + "1 2 glucose 30.0 \n", + "2 3 glucose 30.0 \n", + "3 4 glucose 30.0 \n", + "4 5 glucose 30.0 \n", + "\n", + " dataset_id \n", + "0 BrentLab/harbison_2004/harbison_2004 \n", + "1 BrentLab/harbison_2004/harbison_2004 \n", + "2 BrentLab/harbison_2004/harbison_2004 \n", + "3 BrentLab/harbison_2004/harbison_2004 \n", + "4 BrentLab/harbison_2004/harbison_2004 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get just sample_id, carbon_source, and temperature\n", + "minimal_data = vdb.query(\n", + " filters={\"carbon_source\": \"glucose\"},\n", + " fields=[\"sample_id\", \"carbon_source\", \"temperature_celsius\"]\n", + ")\n", + "\n", + "print(f\"Columns: {list(minimal_data.columns)}\")\n", + "minimal_data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Advanced Queries\n", + "\n", + "VirtualDB supports more sophisticated query patterns." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Multiple Filter Conditions" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 1791 samples with glucose at 30C\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "sample_id", + "rawType": "int32", + "type": "integer" + }, + { + "name": "regulator_locus_tag", + "rawType": "object", + "type": "string" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "condition", + "rawType": "object", + "type": "string" + }, + { + "name": "carbon_source", + "rawType": "object", + "type": "string" + }, + { + "name": "temperature_celsius", + "rawType": "float64", + "type": "float" + }, + { + "name": "dataset_id", + "rawType": "object", + "type": "string" + } + ], + "ref": "357f69ed-ad79-4401-8458-5a1cc48f14c5", + "rows": [ + [ + "0", + "1", + "YSC0017", + "MATA1", + "YPD", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "1", + "2", + "YAL051W", + "OAF1", + "YPD", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "2", + "3", + "YBL005W", + "PDR3", + "YPD", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "3", + "4", + "YBL008W", + "HIR1", + "YPD", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "4", + "5", + "YBL021C", + "HAP3", + "YPD", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ] + ], + "shape": { + "columns": 7, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idregulator_locus_tagregulator_symbolconditioncarbon_sourcetemperature_celsiusdataset_id
01YSC0017MATA1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
12YAL051WOAF1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
23YBL005WPDR3YPDglucose30.0BrentLab/harbison_2004/harbison_2004
34YBL008WHIR1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
45YBL021CHAP3YPDglucose30.0BrentLab/harbison_2004/harbison_2004
\n", + "
" + ], + "text/plain": [ + " sample_id regulator_locus_tag regulator_symbol condition carbon_source \\\n", + "0 1 YSC0017 MATA1 YPD glucose \n", + "1 2 YAL051W OAF1 YPD glucose \n", + "2 3 YBL005W PDR3 YPD glucose \n", + "3 4 YBL008W HIR1 YPD glucose \n", + "4 5 YBL021C HAP3 YPD glucose \n", + "\n", + " temperature_celsius dataset_id \n", + "0 30.0 BrentLab/harbison_2004/harbison_2004 \n", + "1 30.0 BrentLab/harbison_2004/harbison_2004 \n", + "2 30.0 BrentLab/harbison_2004/harbison_2004 \n", + "3 30.0 BrentLab/harbison_2004/harbison_2004 \n", + "4 30.0 BrentLab/harbison_2004/harbison_2004 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Samples with glucose at 30C\n", + "glucose_30c = vdb.query(\n", + " filters={\n", + " \"carbon_source\": \"glucose\",\n", + " \"temperature_celsius\": 30\n", + " }\n", + ")\n", + "\n", + "print(f\"Found {len(glucose_30c)} samples with glucose at 30C\")\n", + "glucose_30c.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Numeric Range Queries" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 1833 samples at >= 30C\n", + "Found 1833 samples between 28-32C\n" + ] + } + ], + "source": [ + "# Samples at temperature >= 30C\n", + "warm_samples = vdb.query(\n", + " filters={\"temperature_celsius\": (\">=\", 30)}\n", + ")\n", + "\n", + "print(f\"Found {len(warm_samples)} samples at >= 30C\")\n", + "\n", + "# Samples between 28C and 32C\n", + "moderate_temp = vdb.query(\n", + " filters={\"temperature_celsius\": (\"between\", 28, 32)}\n", + ")\n", + "\n", + "print(f\"Found {len(moderate_temp)} samples between 28-32C\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Factor Alias Expansion\n", + "\n", + "When you query for a normalized value, VirtualDB automatically expands to all original aliases." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " sample_id regulator_locus_tag regulator_symbol condition carbon_source \\\n", + "0 68 YDR277C MTH1 GAL galactose \n", + "1 112 YGL035C MIG1 GAL galactose \n", + "2 197 YKL038W RGT1 GAL galactose \n", + "3 335 YPL248C GAL4 GAL galactose \n", + "\n", + " temperature_celsius dataset_id \n", + "0 30.0 BrentLab/harbison_2004/harbison_2004 \n", + "1 30.0 BrentLab/harbison_2004/harbison_2004 \n", + "2 30.0 BrentLab/harbison_2004/harbison_2004 \n", + "3 30.0 BrentLab/harbison_2004/harbison_2004 \n" + ] + } + ], + "source": [ + "# Query for \"galactose\" matches \"D-galactose\", \"gal\", and \"galactose\"\n", + "galactose_samples = vdb.query(filters={\"carbon_source\": \"galactose\"})\n", + "\n", + "print(galactose_samples)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Complete Data Retrieval\n", + "\n", + "By default, `query()` returns sample-level metadata (one row per sample). \n", + "Set `complete=True` to get all measurements (many rows per sample)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Complete data: 1930060 rows\n", + "Columns: ['sample_id', 'db_id', 'target_locus_tag', 'target_symbol', 'effect', 'pvalue', 'regulator_locus_tag', 'regulator_symbol', 'condition', 'carbon_source', 'temperature_celsius', 'dataset_id']\n", + "\n", + "First few measurements:\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "sample_id", + "rawType": "int32", + "type": "integer" + }, + { + "name": "db_id", + "rawType": "float64", + "type": "float" + }, + { + "name": "target_locus_tag", + "rawType": "object", + "type": "string" + }, + { + "name": "target_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "effect", + "rawType": "float64", + "type": "float" + }, + { + "name": "pvalue", + "rawType": "float64", + "type": "float" + }, + { + "name": "regulator_locus_tag", + "rawType": "object", + "type": "string" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "condition", + "rawType": "object", + "type": "string" + }, + { + "name": "carbon_source", + "rawType": "object", + "type": "string" + }, + { + "name": "temperature_celsius", + "rawType": "float64", + "type": "float" + }, + { + "name": "dataset_id", + "rawType": "object", + "type": "string" + } + ], + "ref": "0b10b74e-6f1a-42af-8654-7811d039bfac", + "rows": [ + [ + "0", + "1", + "0.0", + "YAL001C", + "TFC3", + "1.697754", + "0.068704735", + "YSC0017", + "MATA1", + "YPD", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "1", + "1", + "0.0", + "YAL002W", + "VPS8", + null, + null, + "YSC0017", + "MATA1", + "YPD", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "2", + "1", + "0.0", + "YAL003W", + "EFB1", + null, + null, + "YSC0017", + "MATA1", + "YPD", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "3", + "1", + "0.0", + "YAL004W", + "YAL004W", + "0.74534215", + "0.83592938", + "YSC0017", + "MATA1", + "YPD", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "4", + "1", + "0.0", + "YAL005C", + "SSA1", + null, + null, + "YSC0017", + "MATA1", + "YPD", + "glucose", + "30.0", + "BrentLab/harbison_2004/harbison_2004" + ] + ], + "shape": { + "columns": 12, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_iddb_idtarget_locus_tagtarget_symboleffectpvalueregulator_locus_tagregulator_symbolconditioncarbon_sourcetemperature_celsiusdataset_id
010.0YAL001CTFC31.6977540.068705YSC0017MATA1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
110.0YAL002WVPS8NaNNaNYSC0017MATA1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
210.0YAL003WEFB1NaNNaNYSC0017MATA1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
310.0YAL004WYAL004W0.7453420.835929YSC0017MATA1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
410.0YAL005CSSA1NaNNaNYSC0017MATA1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
\n", + "
" + ], + "text/plain": [ + " sample_id db_id target_locus_tag target_symbol effect pvalue \\\n", + "0 1 0.0 YAL001C TFC3 1.697754 0.068705 \n", + "1 1 0.0 YAL002W VPS8 NaN NaN \n", + "2 1 0.0 YAL003W EFB1 NaN NaN \n", + "3 1 0.0 YAL004W YAL004W 0.745342 0.835929 \n", + "4 1 0.0 YAL005C SSA1 NaN NaN \n", + "\n", + " regulator_locus_tag regulator_symbol condition carbon_source \\\n", + "0 YSC0017 MATA1 YPD glucose \n", + "1 YSC0017 MATA1 YPD glucose \n", + "2 YSC0017 MATA1 YPD glucose \n", + "3 YSC0017 MATA1 YPD glucose \n", + "4 YSC0017 MATA1 YPD glucose \n", + "\n", + " temperature_celsius dataset_id \n", + "0 30.0 BrentLab/harbison_2004/harbison_2004 \n", + "1 30.0 BrentLab/harbison_2004/harbison_2004 \n", + "2 30.0 BrentLab/harbison_2004/harbison_2004 \n", + "3 30.0 BrentLab/harbison_2004/harbison_2004 \n", + "4 30.0 BrentLab/harbison_2004/harbison_2004 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get complete data with measurements\n", + "complete_data = vdb.query(\n", + " filters={\"carbon_source\": \"glucose\"},\n", + " datasets=[(\"BrentLab/harbison_2004\", \"harbison_2004\")],\n", + " complete=True\n", + ")\n", + "\n", + "print(f\"Complete data: {len(complete_data)} rows\")\n", + "print(f\"Columns: {list(complete_data.columns)}\")\n", + "print(\"\\nFirst few measurements:\")\n", + "complete_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Binding data: 1930060 measurements\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "sample_id", + "rawType": "int32", + "type": "integer" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "target_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "effect", + "rawType": "float64", + "type": "float" + }, + { + "name": "pvalue", + "rawType": "float64", + "type": "float" + }, + { + "name": "dataset_id", + "rawType": "object", + "type": "string" + } + ], + "ref": "0b7cb890-7e9c-44d2-9ef5-59374bcf3a8a", + "rows": [ + [ + "0", + "2", + "OAF1", + "TFC3", + "1.5895642", + "0.088986168", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "1", + "2", + "OAF1", + "VPS8", + "1.1413208", + "0.32480496", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "2", + "2", + "OAF1", + "EFB1", + "0.72911994", + "0.87882413", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "3", + "2", + "OAF1", + "YAL004W", + "1.1679044", + "0.28225283", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "4", + "2", + "OAF1", + "SSA1", + "0.72911994", + "0.87882413", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "5", + "2", + "OAF1", + "ERP2", + "1.0508274", + "0.43070675", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "6", + "2", + "OAF1", + "FUN14", + "1.3478761", + "0.15551056", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "7", + "2", + "OAF1", + "SPO7", + "0.93967306", + "0.57823415", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "8", + "2", + "OAF1", + "MDM10", + "0.93967306", + "0.57823415", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "9", + "2", + "OAF1", + "SWC3", + "0.86566703", + "0.6711192", + "BrentLab/harbison_2004/harbison_2004" + ] + ], + "shape": { + "columns": 6, + "rows": 10 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idregulator_symboltarget_symboleffectpvaluedataset_id
02OAF1TFC31.5895640.088986BrentLab/harbison_2004/harbison_2004
12OAF1VPS81.1413210.324805BrentLab/harbison_2004/harbison_2004
22OAF1EFB10.7291200.878824BrentLab/harbison_2004/harbison_2004
32OAF1YAL004W1.1679040.282253BrentLab/harbison_2004/harbison_2004
42OAF1SSA10.7291200.878824BrentLab/harbison_2004/harbison_2004
52OAF1ERP21.0508270.430707BrentLab/harbison_2004/harbison_2004
62OAF1FUN141.3478760.155511BrentLab/harbison_2004/harbison_2004
72OAF1SPO70.9396730.578234BrentLab/harbison_2004/harbison_2004
82OAF1MDM100.9396730.578234BrentLab/harbison_2004/harbison_2004
92OAF1SWC30.8656670.671119BrentLab/harbison_2004/harbison_2004
\n", + "
" + ], + "text/plain": [ + " sample_id regulator_symbol target_symbol effect pvalue \\\n", + "0 2 OAF1 TFC3 1.589564 0.088986 \n", + "1 2 OAF1 VPS8 1.141321 0.324805 \n", + "2 2 OAF1 EFB1 0.729120 0.878824 \n", + "3 2 OAF1 YAL004W 1.167904 0.282253 \n", + "4 2 OAF1 SSA1 0.729120 0.878824 \n", + "5 2 OAF1 ERP2 1.050827 0.430707 \n", + "6 2 OAF1 FUN14 1.347876 0.155511 \n", + "7 2 OAF1 SPO7 0.939673 0.578234 \n", + "8 2 OAF1 MDM10 0.939673 0.578234 \n", + "9 2 OAF1 SWC3 0.865667 0.671119 \n", + "\n", + " dataset_id \n", + "0 BrentLab/harbison_2004/harbison_2004 \n", + "1 BrentLab/harbison_2004/harbison_2004 \n", + "2 BrentLab/harbison_2004/harbison_2004 \n", + "3 BrentLab/harbison_2004/harbison_2004 \n", + "4 BrentLab/harbison_2004/harbison_2004 \n", + "5 BrentLab/harbison_2004/harbison_2004 \n", + "6 BrentLab/harbison_2004/harbison_2004 \n", + "7 BrentLab/harbison_2004/harbison_2004 \n", + "8 BrentLab/harbison_2004/harbison_2004 \n", + "9 BrentLab/harbison_2004/harbison_2004 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can combine complete=True with field selection\n", + "# Get just the binding data columns\n", + "binding_data = vdb.query(\n", + " filters={\"carbon_source\": \"glucose\"},\n", + " datasets=[(\"BrentLab/harbison_2004\", \"harbison_2004\")],\n", + " fields=[\"sample_id\", \"regulator_symbol\", \"target_symbol\", \"effect\", \"pvalue\"],\n", + " complete=True\n", + ")\n", + "\n", + "print(f\"Binding data: {len(binding_data)} measurements\")\n", + "binding_data.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example analysis\n", + "\n", + "The following is an example of using VirtualDB to extract and summarize data across\n", + "datasets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sample counts by dataset and carbon source:\n", + " dataset_id carbon_source num_samples\n", + "BrentLab/harbison_2004/harbison_2004 galactose 4\n", + "BrentLab/harbison_2004/harbison_2004 glucose 310\n", + "BrentLab/harbison_2004/harbison_2004 raffinose 1\n", + "BrentLab/harbison_2004/harbison_2004 unspecified 37\n", + "BrentLab/kemmeren_2014/kemmeren_2014 glucose 1487\n" + ] + } + ], + "source": [ + "# Compare number of samples by carbon source across datasets\n", + "\n", + "# Get all samples\n", + "all_samples = vdb.query()\n", + "\n", + "# Count by dataset and carbon source\n", + "summary = all_samples.groupby(['dataset_id', 'carbon_source']).size()\n", + "summary = summary.reset_index(name='num_samples')\n", + "\n", + "print(\"Sample counts by dataset and carbon source:\")\n", + "print(summary.to_string(index=False))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Glucose samples by temperature:\n", + " 30.0C: 1791 samples\n" + ] + } + ], + "source": [ + "# Compare glucose experiments at different temperatures\n", + "\n", + "glucose_by_temp = vdb.query(\n", + " filters={\"carbon_source\": \"glucose\"},\n", + " fields=[\"sample_id\", \"temperature_celsius\", \"environmental_condition\"]\n", + ")\n", + "\n", + "# Count samples by temperature\n", + "temp_counts = glucose_by_temp['temperature_celsius'].value_counts().sort_index()\n", + "\n", + "print(\"Glucose samples by temperature:\")\n", + "for temp, count in temp_counts.items():\n", + " print(f\" {temp}C: {count} samples\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 18678 FHL1 binding measurements in glucose\n", + "Significant targets: 379\n", + "\n", + "Top 10 targets by effect size:\n", + "target_symbol effect pvalue\n", + " RPS5 24.145013 9.739702e-09\n", + " RPL11A 20.585725 1.232356e-08\n", + " PRE2 20.585725 1.232356e-08\n", + " SRF1 20.342898 1.226799e-08\n", + " SLX8 20.057080 1.513076e-08\n", + " RPL23B 20.057080 1.513076e-08\n", + " RPL40A 19.262139 1.761808e-08\n", + " MLP2 19.262139 1.761808e-08\n", + " RPS6A 18.704379 1.544172e-08\n", + " RPL22A 17.926705 1.560357e-08\n" + ] + } + ], + "source": [ + "# Get binding data for a specific regulator across datasets\n", + "\n", + "# Query for FHL1 binding in glucose conditions\n", + "fhl1_binding = vdb.query(\n", + " filters={\n", + " \"carbon_source\": \"glucose\",\n", + " \"regulator_symbol\": \"FHL1\"\n", + " },\n", + " fields=[\"sample_id\", \"regulator_symbol\", \"target_symbol\", \"effect\", \"pvalue\"],\n", + " complete=True\n", + ")\n", + "\n", + "print(f\"Found {len(fhl1_binding)} FHL1 binding measurements in glucose\")\n", + "\n", + "# Find significant targets (p < 0.001)\n", + "significant = fhl1_binding[fhl1_binding['pvalue'] < 0.001]\n", + "print(f\"Significant targets: {len(significant)}\")\n", + "\n", + "# Top 10 by effect size\n", + "top_targets = significant.nlargest(10, 'effect')[['target_symbol', 'effect', 'pvalue']]\n", + "print(\"\\nTop 10 targets by effect size:\")\n", + "print(top_targets.to_string(index=False))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Querying Comparative Datasets\n", + "\n", + "Comparative datasets like DTO 
(Direct Target Overlap) contain analysis results that relate samples across multiple datasets. These datasets can be queried directly to find significant cross-dataset relationships." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 65536.00it/s]\n", + "Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 57325.34it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 32 FHL1 binding measurements\n", + "\n", + "Columns: ['sample_id', 'regulator_symbol', 'condition', 'dto_fdr', 'perturbation_id', 'dataset_id']\n", + "\n", + "Rows with DTO data: 4\n", + "\n", + "First few results:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "sample_id", + "rawType": "int32", + "type": "integer" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "condition", + "rawType": "object", + "type": "string" + }, + { + "name": "dto_fdr", + "rawType": "float64", + "type": "float" + }, + { + "name": "perturbation_id", + "rawType": "object", + "type": "string" + }, + { + "name": "dataset_id", + "rawType": "object", + "type": "string" + } + ], + "ref": "a0eb6112-b457-4642-add7-4bcd5068e495", + "rows": [ + [ + "0", + "345", + "FHL1", + "H2O2Hi", + "0.4549087454017032", + "BrentLab/Hackett_2020;hackett_2020;1666", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "1", + "345", + "FHL1", + "H2O2Hi", + null, + "BrentLab/Hackett_2020;hackett_2020;1665", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "2", + "345", + "FHL1", + "H2O2Hi", + null, + "BrentLab/Hackett_2020;hackett_2020;1667", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "3", + "345", + "FHL1", + "H2O2Hi", + null, + "BrentLab/Hackett_2020;hackett_2020;1669", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "4", + "345", + "FHL1", + "H2O2Hi", + null, + "BrentLab/Hackett_2020;hackett_2020;1663", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "5", + "345", + "FHL1", + "H2O2Hi", + null, + "BrentLab/Hackett_2020;hackett_2020;1664", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "6", + "345", + "FHL1", + "H2O2Hi", + null, + "BrentLab/Hackett_2020;hackett_2020;1670", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "7", + "345", + "FHL1", + "H2O2Hi", + null, + "BrentLab/Hackett_2020;hackett_2020;1668", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "8", + "346", + "FHL1", + "RAPA", + null, + "BrentLab/Hackett_2020;hackett_2020;1667", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "9", + "346", + "FHL1", + "RAPA", + null, + "BrentLab/Hackett_2020;hackett_2020;1663", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "10", + "346", + "FHL1", + "RAPA", + null, + "BrentLab/Hackett_2020;hackett_2020;1670", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "11", + "346", + "FHL1", + "RAPA", + null, + "BrentLab/Hackett_2020;hackett_2020;1668", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "12", + "346", + "FHL1", + "RAPA", + "0.0", + "BrentLab/Hackett_2020;hackett_2020;1666", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "13", + "346", + "FHL1", + "RAPA", + null, + "BrentLab/Hackett_2020;hackett_2020;1669", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "14", + 
"346", + "FHL1", + "RAPA", + null, + "BrentLab/Hackett_2020;hackett_2020;1664", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "15", + "346", + "FHL1", + "RAPA", + null, + "BrentLab/Hackett_2020;hackett_2020;1665", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "16", + "347", + "FHL1", + "SM", + null, + "BrentLab/Hackett_2020;hackett_2020;1667", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "17", + "347", + "FHL1", + "SM", + "0.0221957781456953", + "BrentLab/Hackett_2020;hackett_2020;1666", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "18", + "347", + "FHL1", + "SM", + null, + "BrentLab/Hackett_2020;hackett_2020;1669", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "19", + "347", + "FHL1", + "SM", + null, + "BrentLab/Hackett_2020;hackett_2020;1664", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "20", + "347", + "FHL1", + "SM", + null, + "BrentLab/Hackett_2020;hackett_2020;1663", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "21", + "347", + "FHL1", + "SM", + null, + "BrentLab/Hackett_2020;hackett_2020;1670", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "22", + "347", + "FHL1", + "SM", + null, + "BrentLab/Hackett_2020;hackett_2020;1668", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "23", + "347", + "FHL1", + "SM", + null, + "BrentLab/Hackett_2020;hackett_2020;1665", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "24", + "348", + "FHL1", + "YPD", + null, + "BrentLab/Hackett_2020;hackett_2020;1664", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "25", + "348", + "FHL1", + "YPD", + "0.089578429724277", + "BrentLab/Hackett_2020;hackett_2020;1666", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "26", + "348", + "FHL1", + "YPD", + null, + "BrentLab/Hackett_2020;hackett_2020;1663", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "27", + "348", + "FHL1", + "YPD", + null, + "BrentLab/Hackett_2020;hackett_2020;1667", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "28", + "348", + "FHL1", + "YPD", + null, + "BrentLab/Hackett_2020;hackett_2020;1669", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "29", + "348", + "FHL1", + "YPD", + null, + "BrentLab/Hackett_2020;hackett_2020;1665", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "30", + "348", + "FHL1", + "YPD", + null, + "BrentLab/Hackett_2020;hackett_2020;1670", + "BrentLab/harbison_2004/harbison_2004" + ], + [ + "31", + "348", + "FHL1", + "YPD", + null, + "BrentLab/Hackett_2020;hackett_2020;1668", + "BrentLab/harbison_2004/harbison_2004" + ] + ], + "shape": { + "columns": 6, + "rows": 32 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idregulator_symbolconditiondto_fdrperturbation_iddataset_id
0345FHL1H2O2Hi0.454909BrentLab/Hackett_2020;hackett_2020;1666BrentLab/harbison_2004/harbison_2004
1345FHL1H2O2HiNaNBrentLab/Hackett_2020;hackett_2020;1665BrentLab/harbison_2004/harbison_2004
2345FHL1H2O2HiNaNBrentLab/Hackett_2020;hackett_2020;1667BrentLab/harbison_2004/harbison_2004
3345FHL1H2O2HiNaNBrentLab/Hackett_2020;hackett_2020;1669BrentLab/harbison_2004/harbison_2004
4345FHL1H2O2HiNaNBrentLab/Hackett_2020;hackett_2020;1663BrentLab/harbison_2004/harbison_2004
5345FHL1H2O2HiNaNBrentLab/Hackett_2020;hackett_2020;1664BrentLab/harbison_2004/harbison_2004
6345FHL1H2O2HiNaNBrentLab/Hackett_2020;hackett_2020;1670BrentLab/harbison_2004/harbison_2004
7345FHL1H2O2HiNaNBrentLab/Hackett_2020;hackett_2020;1668BrentLab/harbison_2004/harbison_2004
8346FHL1RAPANaNBrentLab/Hackett_2020;hackett_2020;1667BrentLab/harbison_2004/harbison_2004
9346FHL1RAPANaNBrentLab/Hackett_2020;hackett_2020;1663BrentLab/harbison_2004/harbison_2004
10346FHL1RAPANaNBrentLab/Hackett_2020;hackett_2020;1670BrentLab/harbison_2004/harbison_2004
11346FHL1RAPANaNBrentLab/Hackett_2020;hackett_2020;1668BrentLab/harbison_2004/harbison_2004
12346FHL1RAPA0.000000BrentLab/Hackett_2020;hackett_2020;1666BrentLab/harbison_2004/harbison_2004
13346FHL1RAPANaNBrentLab/Hackett_2020;hackett_2020;1669BrentLab/harbison_2004/harbison_2004
14346FHL1RAPANaNBrentLab/Hackett_2020;hackett_2020;1664BrentLab/harbison_2004/harbison_2004
15346FHL1RAPANaNBrentLab/Hackett_2020;hackett_2020;1665BrentLab/harbison_2004/harbison_2004
16347FHL1SMNaNBrentLab/Hackett_2020;hackett_2020;1667BrentLab/harbison_2004/harbison_2004
17347FHL1SM0.022196BrentLab/Hackett_2020;hackett_2020;1666BrentLab/harbison_2004/harbison_2004
18347FHL1SMNaNBrentLab/Hackett_2020;hackett_2020;1669BrentLab/harbison_2004/harbison_2004
19347FHL1SMNaNBrentLab/Hackett_2020;hackett_2020;1664BrentLab/harbison_2004/harbison_2004
20347FHL1SMNaNBrentLab/Hackett_2020;hackett_2020;1663BrentLab/harbison_2004/harbison_2004
21347FHL1SMNaNBrentLab/Hackett_2020;hackett_2020;1670BrentLab/harbison_2004/harbison_2004
22347FHL1SMNaNBrentLab/Hackett_2020;hackett_2020;1668BrentLab/harbison_2004/harbison_2004
23347FHL1SMNaNBrentLab/Hackett_2020;hackett_2020;1665BrentLab/harbison_2004/harbison_2004
24348FHL1YPDNaNBrentLab/Hackett_2020;hackett_2020;1664BrentLab/harbison_2004/harbison_2004
25348FHL1YPD0.089578BrentLab/Hackett_2020;hackett_2020;1666BrentLab/harbison_2004/harbison_2004
26348FHL1YPDNaNBrentLab/Hackett_2020;hackett_2020;1663BrentLab/harbison_2004/harbison_2004
27348FHL1YPDNaNBrentLab/Hackett_2020;hackett_2020;1667BrentLab/harbison_2004/harbison_2004
28348FHL1YPDNaNBrentLab/Hackett_2020;hackett_2020;1669BrentLab/harbison_2004/harbison_2004
29348FHL1YPDNaNBrentLab/Hackett_2020;hackett_2020;1665BrentLab/harbison_2004/harbison_2004
30348FHL1YPDNaNBrentLab/Hackett_2020;hackett_2020;1670BrentLab/harbison_2004/harbison_2004
31348FHL1YPDNaNBrentLab/Hackett_2020;hackett_2020;1668BrentLab/harbison_2004/harbison_2004
\n", + "
" + ], + "text/plain": [ + " sample_id regulator_symbol condition dto_fdr \\\n", + "0 345 FHL1 H2O2Hi 0.454909 \n", + "1 345 FHL1 H2O2Hi NaN \n", + "2 345 FHL1 H2O2Hi NaN \n", + "3 345 FHL1 H2O2Hi NaN \n", + "4 345 FHL1 H2O2Hi NaN \n", + "5 345 FHL1 H2O2Hi NaN \n", + "6 345 FHL1 H2O2Hi NaN \n", + "7 345 FHL1 H2O2Hi NaN \n", + "8 346 FHL1 RAPA NaN \n", + "9 346 FHL1 RAPA NaN \n", + "10 346 FHL1 RAPA NaN \n", + "11 346 FHL1 RAPA NaN \n", + "12 346 FHL1 RAPA 0.000000 \n", + "13 346 FHL1 RAPA NaN \n", + "14 346 FHL1 RAPA NaN \n", + "15 346 FHL1 RAPA NaN \n", + "16 347 FHL1 SM NaN \n", + "17 347 FHL1 SM 0.022196 \n", + "18 347 FHL1 SM NaN \n", + "19 347 FHL1 SM NaN \n", + "20 347 FHL1 SM NaN \n", + "21 347 FHL1 SM NaN \n", + "22 347 FHL1 SM NaN \n", + "23 347 FHL1 SM NaN \n", + "24 348 FHL1 YPD NaN \n", + "25 348 FHL1 YPD 0.089578 \n", + "26 348 FHL1 YPD NaN \n", + "27 348 FHL1 YPD NaN \n", + "28 348 FHL1 YPD NaN \n", + "29 348 FHL1 YPD NaN \n", + "30 348 FHL1 YPD NaN \n", + "31 348 FHL1 YPD NaN \n", + "\n", + " perturbation_id \\\n", + "0 BrentLab/Hackett_2020;hackett_2020;1666 \n", + "1 BrentLab/Hackett_2020;hackett_2020;1665 \n", + "2 BrentLab/Hackett_2020;hackett_2020;1667 \n", + "3 BrentLab/Hackett_2020;hackett_2020;1669 \n", + "4 BrentLab/Hackett_2020;hackett_2020;1663 \n", + "5 BrentLab/Hackett_2020;hackett_2020;1664 \n", + "6 BrentLab/Hackett_2020;hackett_2020;1670 \n", + "7 BrentLab/Hackett_2020;hackett_2020;1668 \n", + "8 BrentLab/Hackett_2020;hackett_2020;1667 \n", + "9 BrentLab/Hackett_2020;hackett_2020;1663 \n", + "10 BrentLab/Hackett_2020;hackett_2020;1670 \n", + "11 BrentLab/Hackett_2020;hackett_2020;1668 \n", + "12 BrentLab/Hackett_2020;hackett_2020;1666 \n", + "13 BrentLab/Hackett_2020;hackett_2020;1669 \n", + "14 BrentLab/Hackett_2020;hackett_2020;1664 \n", + "15 BrentLab/Hackett_2020;hackett_2020;1665 \n", + "16 BrentLab/Hackett_2020;hackett_2020;1667 \n", + "17 BrentLab/Hackett_2020;hackett_2020;1666 \n", + "18 BrentLab/Hackett_2020;hackett_2020;1669 \n", + "19 BrentLab/Hackett_2020;hackett_2020;1664 \n", + "20 BrentLab/Hackett_2020;hackett_2020;1663 \n", + "21 BrentLab/Hackett_2020;hackett_2020;1670 \n", + "22 BrentLab/Hackett_2020;hackett_2020;1668 \n", + "23 BrentLab/Hackett_2020;hackett_2020;1665 \n", + "24 BrentLab/Hackett_2020;hackett_2020;1664 \n", + "25 BrentLab/Hackett_2020;hackett_2020;1666 \n", + "26 BrentLab/Hackett_2020;hackett_2020;1663 \n", + "27 BrentLab/Hackett_2020;hackett_2020;1667 \n", + "28 BrentLab/Hackett_2020;hackett_2020;1669 \n", + "29 BrentLab/Hackett_2020;hackett_2020;1665 \n", + "30 BrentLab/Hackett_2020;hackett_2020;1670 \n", + "31 BrentLab/Hackett_2020;hackett_2020;1668 \n", + "\n", + " dataset_id \n", + "0 BrentLab/harbison_2004/harbison_2004 \n", + "1 BrentLab/harbison_2004/harbison_2004 \n", + "2 BrentLab/harbison_2004/harbison_2004 \n", + "3 BrentLab/harbison_2004/harbison_2004 \n", + "4 BrentLab/harbison_2004/harbison_2004 \n", + "5 BrentLab/harbison_2004/harbison_2004 \n", + "6 BrentLab/harbison_2004/harbison_2004 \n", + "7 BrentLab/harbison_2004/harbison_2004 \n", + "8 BrentLab/harbison_2004/harbison_2004 \n", + "9 BrentLab/harbison_2004/harbison_2004 \n", + "10 BrentLab/harbison_2004/harbison_2004 \n", + "11 BrentLab/harbison_2004/harbison_2004 \n", + "12 BrentLab/harbison_2004/harbison_2004 \n", + "13 BrentLab/harbison_2004/harbison_2004 \n", + "14 BrentLab/harbison_2004/harbison_2004 \n", + "15 BrentLab/harbison_2004/harbison_2004 \n", + "16 BrentLab/harbison_2004/harbison_2004 \n", + "17 BrentLab/harbison_2004/harbison_2004 \n", + 
"18 BrentLab/harbison_2004/harbison_2004 \n", + "19 BrentLab/harbison_2004/harbison_2004 \n", + "20 BrentLab/harbison_2004/harbison_2004 \n", + "21 BrentLab/harbison_2004/harbison_2004 \n", + "22 BrentLab/harbison_2004/harbison_2004 \n", + "23 BrentLab/harbison_2004/harbison_2004 \n", + "24 BrentLab/harbison_2004/harbison_2004 \n", + "25 BrentLab/harbison_2004/harbison_2004 \n", + "26 BrentLab/harbison_2004/harbison_2004 \n", + "27 BrentLab/harbison_2004/harbison_2004 \n", + "28 BrentLab/harbison_2004/harbison_2004 \n", + "29 BrentLab/harbison_2004/harbison_2004 \n", + "30 BrentLab/harbison_2004/harbison_2004 \n", + "31 BrentLab/harbison_2004/harbison_2004 " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Query harbison_2004 binding data enriched with DTO metrics\n", + "# This demonstrates field-based joins: requesting dto_fdr field\n", + "# while querying the primary binding dataset\n", + "\n", + "binding_with_dto = vdb.query(\n", + " datasets=[(\"BrentLab/harbison_2004\", \"harbison_2004\")],\n", + " filters={\"regulator_symbol\": \"FHL1\"},\n", + " fields=[\"sample_id\", \"regulator_symbol\", \"condition\", \"dto_fdr\", \"binding_id\", \"perturbation_id\"],\n", + ")\n", + "\n", + "print(f\"Found {len(binding_with_dto)} FHL1 binding measurements\")\n", + "print(f\"\\nColumns: {list(binding_with_dto.columns)}\")\n", + "print(f\"\\nRows with DTO data: {binding_with_dto['dto_fdr'].notna().sum()}\")\n", + "print(f\"\\nFirst few results:\")\n", + "binding_with_dto" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 122760.12it/s]\n", + "Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 35951.18it/s]\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "sample_id", + "rawType": "int32", + "type": "integer" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "perturbation_id", + "rawType": "object", + "type": "string" + }, + { + "name": "dto_empirical_pvalue", + "rawType": "float64", + "type": "float" + }, + { + "name": "dataset_id", + "rawType": "object", + "type": "string" + } + ], + "ref": "f666fc22-ce67-46fc-80bb-c44baafdf799", + "rows": [ + [ + "0", + "347", + "FHL1", + "BrentLab/Hackett_2020;hackett_2020;1666", + "0.297", + "BrentLab/harbison_2004/harbison_2004" + ] + ], + "shape": { + "columns": 5, + "rows": 1 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idregulator_symbolperturbation_iddto_empirical_pvaluedataset_id
0347FHL1BrentLab/Hackett_2020;hackett_2020;16660.297BrentLab/harbison_2004/harbison_2004
\n", + "
" + ], + "text/plain": [ + " sample_id regulator_symbol perturbation_id \\\n", + "0 347 FHL1 BrentLab/Hackett_2020;hackett_2020;1666 \n", + "\n", + " dto_empirical_pvalue dataset_id \n", + "0 0.297 BrentLab/harbison_2004/harbison_2004 " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can also filter on comparative dataset fields\n", + "# This returns only binding measurements with significant DTO results\n", + "\n", + "significant_dtos = vdb.query(\n", + " datasets=[(\"BrentLab/harbison_2004\", \"harbison_2004\")],\n", + " filters={\n", + " \"regulator_symbol\": \"FHL1\",\n", + " # the threshold is high here b/c FHL1 didn't have significant results in harbison\n", + " \"dto_empirical_pvalue\": (\"<\", 0.5)\n", + " },\n", + " fields=[\"sample_id\", \"regulator_symbol\", \"target_symbol\", \"perturbation_id\", \"dto_empirical_pvalue\"],\n", + ")\n", + "\n", + "significant_dtos" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tfbpapi-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/virtual_database_concepts.md b/docs/virtual_database_concepts.md new file mode 100644 index 0000000..55fe6c9 --- /dev/null +++ b/docs/virtual_database_concepts.md @@ -0,0 +1,525 @@ +# Virtual Database + +VirtualDB provides a unified query interface across heterogeneous datasets with +different experimental condition structures and terminologies. Each dataset +defines experimental conditions in its own way, with properties stored at +different hierarchy levels (repository, dataset, or field) and using different +naming conventions. VirtualDB uses an external YAML configuration to map these +varying structures to a common schema, normalize factor level names (e.g., +"D-glucose", "dextrose", "glu" all become "glucose"), and enable cross-dataset +queries with standardized field names and values. + +## Configuration Structure + +This is a basic example of a VirtualDB configuration YAML file: + +```yaml +repositories: + # Each repository defines a "table" in the virtual database + BrentLab/harbison_2004: + # REQUIRED: Specify which field is the sample identifier. At this level, it means + # that all datasets have a field `sample_id` that uniquely identifies samples. 
+    sample_id:
+      field: sample_id
+    # Repository-wide properties (apply to all datasets in this repository)
+    nitrogen_source:
+      path: media.nitrogen_source.name
+
+    dataset:
+      # Each dataset gets its own view with standardized fields
+      harbison_2004:
+        # Dataset-specific properties (constant for all samples)
+        phosphate_source:
+          path: media.phosphate_source.compound
+
+        # Field-level properties (vary per sample)
+        carbon_source:
+          field: condition
+          path: media.carbon_source.compound
+          dtype: string  # Optional: specify data type
+
+        # Field without path (column alias with normalization)
+        environmental_condition:
+          field: condition
+
+        # If there is a `comparative_analysis` dataset that you want to link to
+        # a given dataset, you can declare it at the dataset level. For more
+        # information, see the section 'Comparative Datasets in VirtualDB'
+        comparative_analyses:
+          # specify the comparative analysis repo
+          - repo: BrentLab/yeast_comparative_analysis
+            # and dataset
+            dataset: dto
+            # and the field in the comparative analysis that links back to this
+            # dataset. Note that this field should have role `source_sample` and
+            # should therefore be formatted as `repo_id;config_name;sample_id`,
+            # where the sample_id is derived from the field specified for this
+            # dataset in the `sample_id` mapping above.
+            via_field: perturbation_id
+
+  BrentLab/kemmeren_2014:
+    dataset:
+      kemmeren_2014:
+        # REQUIRED: If `sample_id` isn't defined at the repo level, then it must be
+        # defined at the dataset level for each dataset in the repo
+        sample_id:
+          field: sample_id
+        # Same logical fields, different physical paths
+        carbon_source:
+          path: media.carbon_source.compound
+          dtype: string
+        temperature_celsius:
+          path: temperature_celsius
+          dtype: numeric  # Enables numeric filtering with comparison operators
+
+# ===== Normalization Rules =====
+# Map varying terminologies to standardized values
+factor_aliases:
+  carbon_source:
+    glucose: [D-glucose, glu, dextrose]
+    galactose: [D-galactose, gal]
+
+# Handle missing values with defaults
+missing_value_labels:
+  carbon_source: "unspecified"
+
+# ===== Documentation =====
+description:
+  carbon_source: The carbon source provided to the cells during growth
+```
+
+### Property Hierarchy
+
+Properties are extracted at three hierarchy levels:
+
+1. **Repository-wide**: Common to all datasets in a repository
+   - Paths relative to repository-level `experimental_conditions`
+   - Example: `path: media.nitrogen_source.name`
+
+2. **Dataset-specific**: Specific to one dataset configuration
+   - Paths relative to config-level `experimental_conditions`
+   - Example: `path: media.phosphate_source.compound`
+
+3. **Field-level**: Vary per sample, defined in field definitions
+   - `field` specifies which field to extract from
+   - `path` is relative to the field definitions (not `experimental_conditions`)
+   - Example: `field: condition, path: media.carbon_source.compound`
+
+**Special case**: A field without a path creates a column alias.
+- `field: condition` (no path) renames the `condition` column and enables normalization
+
+### Path Resolution
+
+Paths use dot notation to navigate nested structures:
+
+**Repository/Dataset-level** (automatically prepends `experimental_conditions.`):
+- `path: temperature_celsius` → `experimental_conditions.temperature_celsius`
+- `path: media.carbon_source.compound` →
+  `experimental_conditions.media.carbon_source.compound`
+
+**Field-level** (paths relative to field definitions):
+- `field: condition, path: media.carbon_source.compound` → looks in field
+  `condition`'s definitions and navigates to `media.carbon_source.compound`
+
+### Data Type Specifications
+
+Field mappings support an optional `dtype` parameter to ensure proper type handling
+during metadata extraction and query filtering.
+
+**Supported dtypes**:
+
+- `string` - Text data (default if not specified)
+- `numeric` - Numeric values (integers or floating-point numbers)
+- `bool` - Boolean values (true/false)
+
+**When to use dtype**:
+
+1. **Numeric filtering**: Required for fields used with comparison operators
+   (`<`, `>`, `<=`, `>=`, `between`)
+2. **Type consistency**: When source data might otherwise be extracted with the
+   wrong type
+3. **Performance**: Helps with query optimization and prevents type mismatches
+
+**Type conversion process**:
+
+Type conversion happens during metadata extraction:
+
+1. Extract the value from the source using the path
+2. Convert it to the specified dtype, if provided
+3. Store it in the metadata DataFrame with the correct type
+
+**Example - The problem**:
+
+```python
+# Without dtype: temperature extracted as string "30"
+# Comparison fails or produces incorrect results
+df = vdb.query(filters={"temperature_celsius": (">", 25)})
+# String comparison: "30" > 25 evaluates incorrectly
+```
+
+**Example - The solution**:
+
+```yaml
+temperature_celsius:
+  path: temperature_celsius
+  dtype: numeric  # Ensures numeric type for proper comparison
+```
+
+```python
+# With dtype: temperature extracted as numeric 30.0
+# Comparison works correctly
+df = vdb.query(filters={"temperature_celsius": (">", 25)})
+# Numeric comparison: 30.0 > 25 is True (correct!)
+```
+
+**Usage examples**:
+
+```yaml
+repositories:
+  BrentLab/example:
+    dataset:
+      example_dataset:
+        # String field for categorical data
+        strain_background:
+          path: strain_background
+          dtype: string
+
+        # Numeric field for quantitative filtering
+        temperature_celsius:
+          path: temperature_celsius
+          dtype: numeric
+
+        # Numeric field for concentration measurements
+        drug_concentration_um:
+          path: drug_treatment.concentration_um
+          dtype: numeric
+
+        # Boolean field
+        is_heat_shock:
+          path: is_heat_shock
+          dtype: bool
+```
+
+## VirtualDB Structure
+
+VirtualDB maintains a collection of dataset-specific metadata tables, one per
+configured dataset. Each table has the same structure (standardized schema) but
+contains data specific to that dataset.
+
+Unless directed otherwise, these tables are not stored on disk; they are generated
+by querying the source parquet files. Think of them as typical database views.
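+
+Each view's metadata columns are populated by resolving the configured dot-notation
+paths against nested `experimental_conditions` structures. As a rough illustration,
+the sketch below mirrors the documented `get_nested_value` helper; this is a minimal
+sketch assuming a plain nested-dict input, and the actual signature in
+`tfbpapi.virtual_db` may differ:
+
+```python
+from typing import Any
+
+
+def get_nested_value(data: dict[str, Any], path: str, default: Any = None) -> Any:
+    """Walk a nested dict with a dot-notation path, e.g. 'media.carbon_source.compound'."""
+    current: Any = data
+    for key in path.split("."):
+        # Stop and return the default as soon as the path cannot be followed
+        if not isinstance(current, dict) or key not in current:
+            return default
+        current = current[key]
+    return current
+
+
+conditions = {"media": {"carbon_source": {"compound": "D-glucose"}}}
+print(get_nested_value(conditions, "media.carbon_source.compound"))  # D-glucose
+print(get_nested_value(conditions, "media.nitrogen_source.name", "unspecified"))  # unspecified
+```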
+ +### Internal Structure + +```python +{ + # Primary datasets with sample_id + ("BrentLab/harbison_2004", "harbison_2004"): DataFrame( + # Columns: sample_id, carbon_source, temperature_celsius, nitrogen_source, ... + # Values: Normalized according to factor_aliases + # Example rows: + # sample_id carbon_source temperature_celsius nitrogen_source + # harbison_001 glucose 30 yeast nitrogen base + # harbison_002 galactose 30 yeast nitrogen base + ), + + ("BrentLab/kemmeren_2014", "kemmeren_2014"): DataFrame( + # Columns: sample_id, carbon_source, temperature_celsius, ... + # Note: Different physical source paths, same logical schema + # Example rows: + # sample_id carbon_source temperature_celsius + # kemmeren_001 glucose 30 + # kemmeren_002 raffinose 30 + ), + + # Comparative datasets with parsed composite identifiers + ("BrentLab/yeast_comparative_analysis", "dto"): DataFrame( + # Original composite ID columns preserved + # Columns: binding_id, perturbation_id, dto_fdr, dto_empirical_pvalue, ... + # Example rows: + # binding_id perturbation_id dto_fdr + # BrentLab/harbison_2004;harbison_2004;harbison_001 BrentLab/kemmeren_2014;kemmeren_2014;sample_42 0.001 + # BrentLab/harbison_2004;harbison_2004;harbison_002 BrentLab/kemmeren_2014;kemmeren_2014;sample_43 0.045 + # + # When materialized with foreign keys, additional parsed columns are created: + # Columns: binding_id, binding_repo_id, binding_config_name, binding_sample_id, + # perturbation_id, perturbation_repo_id, perturbation_config_name, perturbation_sample_id, + # dto_fdr, dto_empirical_pvalue, ... + # Example rows: + # binding_repo_id binding_config_name binding_sample_id dto_fdr + # BrentLab/harbison_2004 harbison_2004 harbison_001 0.001 + # BrentLab/harbison_2004 harbison_2004 harbison_002 0.045 + ) +} +``` + +### View Materialization + +Tables can be cached for faster subsequent queries via materialization: + +```python +# Cache all views for faster subsequent queries +vdb.materialize_views() + +# Cache specific datasets +vdb.materialize([("BrentLab/harbison_2004", "harbison_2004")]) + +# Invalidate cache (e.g., after data updates) +vdb.invalidate_cache() +vdb.invalidate_cache([("BrentLab/harbison_2004", "harbison_2004")]) +``` + +Materialized views are stored locally and reused for queries. + +## VirtualDB Interface + +### Schema Discovery + +**List all queryable fields**: +```python +from tfbpapi.virtual_db import VirtualDB + +vdb = VirtualDB("config.yaml") + +# All fields defined in any dataset +fields = vdb.get_fields() +# ["carbon_source", "temperature_celsius", "nitrogen_source", "phosphate_source", ...] + +# Fields present in ALL datasets (common fields) +common = vdb.get_common_fields() +# ["carbon_source", "temperature_celsius"] + +# Fields for specific dataset +dataset_fields = vdb.get_fields("BrentLab/harbison_2004", "harbison_2004") +# ["carbon_source", "temperature_celsius", "nitrogen_source", "phosphate_source"] +``` + +**Discover valid values for fields**: +```python +# Unique values across all datasets (normalized) +values = vdb.get_unique_values("carbon_source") +# ["glucose", "galactose", "raffinose", "unspecified"] + +# Values broken down by dataset +values_by_dataset = vdb.get_unique_values("carbon_source", by_dataset=True) +# { +# "BrentLab/harbison_2004": ["glucose", "galactose"], +# "BrentLab/kemmeren_2014": ["glucose", "raffinose"] +# } +``` + +### Querying Data + +The `query()` method is the primary interface for retrieving data from VirtualDB. 
+
+**Basic usage** (sample-level, all fields):
+
+```python
+# Query across all configured datasets
+# Returns one row per sample with all configured fields
+df = vdb.query(filters={"carbon_source": "glucose"})
+# DataFrame: sample_id, carbon_source, temperature_celsius, nitrogen_source, ...
+```
+
+**Query specific datasets**:
+
+```python
+# Limit the query to specific datasets
+df = vdb.query(
+    filters={"carbon_source": "glucose", "temperature_celsius": 30},
+    datasets=[("BrentLab/harbison_2004", "harbison_2004")]
+)
+```
+
+**Select specific fields**:
+
+```python
+# Return only the specified fields
+df = vdb.query(
+    filters={"carbon_source": "glucose"},
+    fields=["sample_id", "carbon_source", "temperature_celsius"]
+)
+# DataFrame: sample_id, carbon_source, temperature_celsius
+```
+
+**Complete data** (measurement-level):
+
+```python
+# Set complete=True to get all measurements, not just sample-level metadata
+# Returns many rows per sample (one per target/feature/coordinate)
+df = vdb.query(
+    filters={"carbon_source": "glucose"},
+    complete=True
+)
+# DataFrame: sample_id, target, value, carbon_source, temperature_celsius, ...
+# For annotated_features: target-level data for all matching samples
+# For genome_map: coordinate-level data for all matching samples
+
+# Can be combined with field selection
+df = vdb.query(
+    filters={"carbon_source": "glucose"},
+    fields=["sample_id", "target", "effect"],
+    complete=True
+)
+# DataFrame: sample_id, target, effect
+```
+
+### Factor Alias Expansion
+
+When querying with aliased values, VirtualDB automatically expands the query to all
+original values specified in the configuration:
+
+```python
+# User queries for the normalized value
+df = vdb.query(filters={"carbon_source": "galactose"})
+
+# Internally expands to all aliases
+# WHERE carbon_source IN ('D-galactose', 'gal', 'galactose')
+```
+
+### Numeric Field Filtering
+
+Numeric fields support exact matching and range queries:
+
+```python
+# Exact match
+df = vdb.query(filters={"temperature_celsius": 30})
+
+# Range query
+df = vdb.query(filters={"temperature_celsius": (">=", 28)})
+
+# Range query with "between"; inclusive of the boundaries, i.e. [28, 32]
+df = vdb.query(filters={"temperature_celsius": ("between", 28, 32)})
+
+# Missing value labels. This is analogous to how factor_aliases work: the query
+# returns rows where temperature_celsius is missing (None/Null/NaN) and/or where
+# the value matches the specified label, in this case "room". If the missing value
+# label is a character value and the field is numeric, only missing values are
+# matched.
+df = vdb.query(filters={"temperature_celsius": "room"})
+# Matches samples where temperature is None/missing
+```
+
+## Comparative Datasets in VirtualDB
+
+Comparative datasets differ from other dataset types in that they represent
+relationships between samples across datasets rather than individual samples.
+Each row relates two or more samples from other datasets.
+
+### Structure
+
+Comparative datasets use `source_sample` fields instead of a single `sample_id`:
+
+- Multiple fields with `role: source_sample`
+- Each contains a composite identifier: `"repo_id;config_name;sample_id"`
+- Example: `binding_id = "BrentLab/harbison_2004;harbison_2004;42"`
+
+### Querying Comparative Data
+
+Comparative datasets can be queried in two ways: **direct queries** for analysis
+results, and **field-based queries** to enrich primary dataset queries with
+comparative metrics.
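+
+Both query modes rely on the composite identifiers described under Structure above.
+As a minimal sketch of how such an identifier might be split into the parsed columns
+shown in the Internal Structure section (a hypothetical helper for illustration; the
+actual parsing inside VirtualDB may differ):
+
+```python
+from typing import NamedTuple
+
+
+class SourceSample(NamedTuple):
+    repo_id: str
+    config_name: str
+    sample_id: str
+
+
+def parse_composite_id(value: str) -> SourceSample:
+    """Split 'repo_id;config_name;sample_id' into its three components."""
+    repo_id, config_name, sample_id = value.split(";", maxsplit=2)
+    return SourceSample(repo_id, config_name, sample_id)
+
+
+print(parse_composite_id("BrentLab/harbison_2004;harbison_2004;42"))
+# SourceSample(repo_id='BrentLab/harbison_2004', config_name='harbison_2004', sample_id='42')
+```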
+ +#### Direct Queries + +Query the comparative dataset directly to find analysis results: + +```python +# Find significant DTO results across all experiments +dto_results = vdb.query( + datasets=[("BrentLab/yeast_comparative_analysis", "dto")], + filters={"dto_fdr": ("<", 0.05)}, + complete=True +) +# Returns: binding_id, perturbation_id, dto_fdr, dto_empirical_pvalue, +# binding_rank_threshold, perturbation_rank_threshold, ... + +# Filter by source dataset +dto_for_harbison = vdb.query( + datasets=[("BrentLab/yeast_comparative_analysis", "dto")], + filters={"binding_id": ("contains", "harbison_2004")}, + complete=True +) + +# Combine filters on both metrics and source samples +high_quality_dto = vdb.query( + datasets=[("BrentLab/yeast_comparative_analysis", "dto")], + filters={ + "dto_fdr": ("<", 0.01), + "binding_id": ("contains", "callingcards") + }, + complete=True +) +``` + +#### Field-based Queries + +```python +# Query binding data, automatically include DTO metrics +binding_with_dto = vdb.query( + datasets=[("BrentLab/callingcards", "annotated_features")], + filters={"regulator_locus_tag": "YJR060W"}, + fields=["sample_id", "target_locus_tag", "binding_score", "dto_fdr"], + complete=True +) +# Returns binding data WITH dto_fdr joined automatically via composite ID + +# Query perturbation data, include derived significance field +perturbation_with_significance = vdb.query( + datasets=[("BrentLab/hackett_2020", "hackett_2020")], + filters={"regulator_locus_tag": "YJR060W"}, + fields=["sample_id", "target_locus_tag", "log2fc", "is_significant"], + complete=True +) +# Returns perturbation data WITH is_significant (computed from dto_fdr < 0.05) +``` + +### Configuration + +Comparative datasets work differently - +**primary datasets declare which comparative datasets reference them**: + +```yaml +repositories: + # Primary dataset (e.g., binding data) + BrentLab/callingcards: + dataset: + annotated_features: + # REQUIRED: Specify which field is the sample identifier + sample_id: + field: sample_id + + # OPTIONAL: Declare comparative analyses that include this dataset + comparative_analyses: + - repo: BrentLab/yeast_comparative_analysis + dataset: dto + via_field: binding_id + # VirtualDB knows composite format: "BrentLab/callingcards;annotated_features;" + + # Regular fields + regulator_locus_tag: + field: regulator_locus_tag + # ... other fields + + # Another primary dataset (e.g., perturbation data) + BrentLab/hu_2007_reimand_2010: + dataset: + data: + sample_id: + field: sample_id + + comparative_analyses: + - repo: BrentLab/yeast_comparative_analysis + dataset: dto + via_field: perturbation_id + + # Regular fields + # ... 
other fields + + # Comparative dataset - OPTIONAL field mappings for renaming/aliasing + BrentLab/yeast_comparative_analysis: + dataset: + dto: + # Optional: Rename fields for clarity or add derived columns + fdr: + field: dto_fdr # Rename dto_fdr to fdr + + empirical_pvalue: + field: dto_empirical_pvalue # Rename for consistency + + is_significant: + # Derived field: computed from dto_fdr + expression: "dto_fdr < 0.05" +``` + +## See Also +- [DataCard Documentation](huggingface_datacard.md) diff --git a/docs/virtual_db.md b/docs/virtual_db.md new file mode 100644 index 0000000..8fe590e --- /dev/null +++ b/docs/virtual_db.md @@ -0,0 +1,21 @@ +# VirtualDB + +::: tfbpapi.virtual_db.VirtualDB + options: + show_root_heading: true + show_source: true + +## Helper Functions + +::: tfbpapi.virtual_db.get_nested_value + options: + show_root_heading: true + +::: tfbpapi.virtual_db.normalize_value + options: + show_root_heading: true + +## Usage + +For comprehensive usage documentation including comparative datasets, see +[Virtual Database Concepts](virtual_database_concepts.md). diff --git a/mkdocs.yml b/mkdocs.yml index a28f581..0635060 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,45 +1,141 @@ site_name: tfbpapi -site_description: "A collection of objects and functions to work with calling cards sequencing tools" +site_description: "Python API for querying and analyzing genomic datasets from HuggingFace Hub" site_author: "ben mueller , chase mateusiak , michael brent " -site_url: "https://brentlab.github.io/tfbpapi/" +site_url: "https://brentlab.github.io/tfbpapi" repo_url: "https://github.com/brentlab/tfbpapi" -repo_name: "tfbpapi" -edit_uri: "edit/master/docs/" +repo_name: "brentlab/tfbpapi" +edit_uri: "edit/main/docs/" watch: ['tfbpapi', 'docs'] theme: name: material + palette: + # Palette toggle for light mode + - media: "(prefers-color-scheme: light)" + scheme: default + primary: indigo + accent: indigo + toggle: + icon: material/brightness-7 + name: Switch to dark mode + # Palette toggle for dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: indigo + accent: indigo + toggle: + icon: material/brightness-4 + name: Switch to light mode + features: + - navigation.tabs + - navigation.sections + - navigation.expand + - navigation.path + - navigation.top + - search.highlight + - search.share + - search.suggest + - content.code.copy + - content.code.select + - content.code.annotate + - content.action.edit + - content.action.view + icon: + repo: fontawesome/brands/github + edit: material/pencil + view: material/eye plugins: -- search -- autorefs -- section-index -- mkdocs-jupyter: + - search: + separator: '[\s\-,:!=\[\]()"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])' + - autorefs + - section-index + - mkdocs-jupyter: remove_tag_config: - remove_input_tags: - - hide - remove_output_tags: - - hide -- mkdocstrings: - handlers: - python: - paths: [tfbpapi] # search packages in the src folder - merge_init_into_class: True - options: - docstring_style: 'sphinx' + remove_input_tags: + - hide + remove_output_tags: + - hide + execute: false + allow_errors: false + - mkdocstrings: + handlers: + python: + paths: [.] 
+ inventories: + - https://docs.python.org/3/objects.inv + - https://numpy.org/doc/stable/objects.inv + - https://pandas.pydata.org/docs/objects.inv + options: + docstring_style: sphinx + show_source: true + show_root_heading: true + show_root_toc_entry: true + show_symbol_type_heading: true + show_symbol_type_toc: true + signature_crossrefs: true markdown_extensions: + - abbr + - admonition + - attr_list + - def_list + - footnotes + - md_in_html - smarty + - tables - toc: - permalink: True + permalink: true + title: On this page - sane_lists - pymdownx.arithmatex: generic: true + - pymdownx.betterem: + smart_enable: all + - pymdownx.caret + - pymdownx.details + - pymdownx.emoji: + emoji_generator: !!python/name:material.extensions.emoji.to_svg + emoji_index: !!python/name:material.extensions.emoji.twemoji + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.keys + - pymdownx.magiclink: + normalize_issue_symbols: true + repo_url_shorthand: true + user: brentlab + repo: tfbpapi + - pymdownx.mark + - pymdownx.smartsymbols + - pymdownx.snippets: + auto_append: + - includes/mkdocs.md - pymdownx.superfences: custom_fences: - name: mermaid class: mermaid - format: "!!python/name:pymdownx.superfences.fence_code_format" + format: !!python/name:pymdownx.superfences.fence_code_format + - pymdownx.tabbed: + alternate_style: true + combine_header_slug: true + slugify: !!python/object/apply:pymdownx.slugs.slugify + kwds: + case: lower + - pymdownx.tasklist: + custom_checkbox: true + - pymdownx.tilde + +extra: + social: + - icon: fontawesome/brands/github + link: https://github.com/brentlab/tfbpapi + name: GitHub Repository + version: + provider: mike + default: latest extra_javascript: - javascripts/mathjax.js @@ -47,36 +143,29 @@ extra_javascript: - https://cdn.jsdelivr.net/npm/mermaid/dist/mermaid.min.js - js/init-mermaid.js +extra_css: + - stylesheets/extra.css + nav: -- Home: index.md -- Tutorials: - - Database Interface: tutorials/database_interface.ipynb - - LassoCV: tutorials/lassoCV.ipynb - - Interactor Modeling Workflow: tutorials/interactor_modeling_workflow.ipynb -- API: - - Models: - - Overview: ml_models/index.md - - SigmoidModel: ml_models/SigmoidModel.md - - Lasso Modeling: ml_models/lasso_modeling.md - - Database Interface: - - Records Only Classes: - - interface/BindingManualQCAPI.md - - interface/DataSourceAPI.md - - interface/DtoAPI.md - - interface/ExpressionManualQCAPI.md - - interface/FileFormatAPI.md - - interface/GenomicFeatureAPI.md - - interface/RegulatorAPI.md - - Records and Files Classes: - - BindingAPI: interface/BindingAPI.md - - BindingConcatenatedAPI: interface/BindingConcatenatedAPI.md - - CallingCardsBackgroundAPI: interface/CallingCardsBackgroundAPI.md - - ExpressionAPI: interface/ExpressionAPI.md - - PromoterSetAPI: interface/PromoterSetAPI.md - - PromoterSetSigAPI: interface/PromoterSetSigAPI.md - - Developer Classes: - - interface/AbstractAPI.md - - interface/AbstractRecordsAndFilesAPI.md - - interface/AbstractRecordsOnlyAPI.md - - interface/Cache.md - - interface/ParamsDict.md + - Home: index.md + - Tutorials: + - "Getting Started": + - "DataCard: Exploring Datasets": tutorials/datacard_tutorial.ipynb + - "Cache Management": tutorials/cache_manager_tutorial.ipynb + - "Querying Data": + - "VirtualDB: Unified Cross-Dataset Queries": tutorials/virtual_db_tutorial.ipynb + - Concepts: + - "Virtual Database Design": virtual_database_concepts.md + - API Reference: + - Core: + - VirtualDB: 
virtual_db.md + - DataCard: datacard.md + - HfCacheManager: hf_cache_manager.md + - Models and Configuration: + - Pydantic Models: models.md + - Fetchers: fetchers.md + - Error Handling: + - Custom Exceptions: errors.md + - HuggingFace Configuration: + - HuggingFace Dataset Card Format: huggingface_datacard.md + - BrentLab Collection: brentlab_yeastresources_collection.md diff --git a/pyproject.toml b/pyproject.toml index 27e077a..e3710f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,21 +9,25 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.11" requests = "^2.32.3" -aiohttp = "^3.11.18" -cachetools = "^5.5.2" -scikit-learn = "^1.6.1" -requests-toolbelt = "^1.0.0" -responses = "^0.25.7" -aioresponses = "^0.7.8" -numpy = "^2.2.5" -dotenv = "^0.9.9" pandas = "^2.3.1" +huggingface-hub = "^0.34.4" +duckdb = "^1.3.2" +pydantic = "^2.11.9" [tool.poetry.group.dev.dependencies] pytest = "^8.3.5" -pytest-snapshot = "^0.9.0" pytest-asyncio = "^0.26.0" +types-requests = "^2.32.4.20250809" +mkdocs = "^1.6.1" +mkdocs-material = "^9.6.19" +mkdocs-autorefs = "^1.4.3" +mkdocs-section-index = "^0.3.10" +mkdocs-jupyter = "^0.25.1" +mkdocstrings = {extras = ["python"], version = "^0.30.0"} +matplotlib = "^3.10.6" +seaborn = "^0.13.2" +types-pyyaml = "^6.0.12.20250915" [tool.pytest.ini_options] diff --git a/tfbpapi/AbstractAPI.py b/tfbpapi/AbstractAPI.py deleted file mode 100644 index 19c4eb6..0000000 --- a/tfbpapi/AbstractAPI.py +++ /dev/null @@ -1,230 +0,0 @@ -import logging -import os -from abc import ABC, abstractmethod -from collections.abc import Coroutine -from typing import Any - -import pandas as pd -import requests # type: ignore - -from tfbpapi.Cache import Cache -from tfbpapi.ParamsDict import ParamsDict - - -class AbstractAPI(ABC): - """ - Abstract base class for creating API clients that require token authentication. - - This class provides a template for connecting to a cache for caching API responses, - validating parameters against a list of valid keys, and provides an interface for - CRUD operations. - - """ - - def __init__( - self, - url: str = "", - token: str = "", - **kwargs, - ): - """ - Initialize the API client. - - :param url: The API endpoint URL. Defaults to the `BASE_URL` - environment variable. - :param token: The authentication token. Defaults to the `TOKEN` - environment variable. - :param valid_param_keys: A list of valid parameter keys for the API. - :param params: A ParamsDict object containing parameters for the API request. - :param cache: a Cache object for caching API responses. - :param kwargs: Additional keyword arguments that may be passed on to the - ParamsDict and Cache constructors. 
- - """ - self.logger = logging.getLogger(self.__class__.__name__) - self._token = token or os.getenv("TOKEN", "") - self.url = url or os.getenv("BASE_URL", "") - self.params = ParamsDict( - params=kwargs.pop("params", {}), - valid_keys=kwargs.pop("valid_keys", []), - ) - self.cache = Cache( - maxsize=kwargs.pop("maxsize", 100), ttl=kwargs.pop("ttl", 300) - ) - - @property - def header(self) -> dict[str, str]: - """The HTTP authorization header.""" - return { - "Authorization": f"token {self.token}", - "Content-Type": "application/json", - } - - @property - def url(self) -> str: - """The URL for the API.""" - return self._url # type: ignore - - @url.setter - def url(self, value: str) -> None: - if not value: - self._url = None - elif hasattr(self, "token") and self.token: - # validate the URL with the new token - self._is_valid_url(value) - self._url = value - else: - self.logger.warning("No token provided: URL un-validated") - self._url = value - - @property - def token(self) -> str: - """The authentication token for the API.""" - return self._token - - @token.setter - def token(self, value: str) -> None: - self._token = value - # validate the URL with the new token - if hasattr(self, "url") and self.url: - self.logger.info("Validating URL with new token") - self._is_valid_url(self.url) - - @property - def cache(self) -> Cache: - """The cache object for caching API responses.""" - return self._cache - - @cache.setter - def cache(self, value: Cache) -> None: - self._cache = value - - @property - def params(self) -> ParamsDict: - """The ParamsDict object containing parameters for the API request.""" - return self._params - - @params.setter - def params(self, value: ParamsDict) -> None: - self._params = value - - def push_params(self, params: dict[str, Any]) -> None: - """Adds or updates parameters in the ParamsDict.""" - try: - self.params.update(params) - except KeyError as e: - self.logger.error(f"Error updating parameters: {e}") - - def pop_params(self, keys: list[str] | None = None) -> None: - """Removes parameters from the ParamsDict.""" - if keys is None: - self.params.clear() - return - if keys is not None and not isinstance(keys, list): - keys = [keys] - for key in keys: - del self.params[key] - - @abstractmethod - def create(self, data: dict[str, Any], **kwargs) -> Any: - """Placeholder for the create method.""" - raise NotImplementedError( - f"`create()` is not implemented for {self.__class__.__name__}" - ) - - @abstractmethod - def read(self, **kwargs) -> Any: - """Placeholder for the read method.""" - raise NotImplementedError( - f"`read()` is not implemented for {self.__class__.__name__}" - ) - - @abstractmethod - def update(self, df: pd.DataFrame, **kwargs) -> Any: - """Placeholder for the update method.""" - raise NotImplementedError( - f"`update()` is not implemented for {self.__class__.__name__}" - ) - - @abstractmethod - def delete(self, id: str, **kwargs) -> Any: - """Placeholder for the delete method.""" - raise NotImplementedError( - f"`delete()` is not implemented for {self.__class__.__name__}" - ) - - @abstractmethod - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - """Placeholder for the submit method.""" - raise NotImplementedError( - f"`submit()` is not implemented for {self.__class__.__name__}" - ) - - @abstractmethod - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Coroutine[Any, Any, Any]: - """Placeholder for the retrieve method.""" - raise NotImplementedError( - f"`retrieve()` is not implemented 
for {self.__class__.__name__}" - ) - - def _is_valid_url(self, url: str) -> None: - """ - Confirms that the URL is valid and the header authorization is appropriate. - - :param url: The URL to validate. - :type url: str - :raises ValueError: If the URL is invalid or the token is not set. - - """ - try: - # note that with allow_redirect=True the result can be a 300 status code - # which is not an error, and then another request to the redirected URL - response = requests.head(url, headers=self.header, allow_redirects=True) - if response.status_code != 200: - raise ValueError("Invalid URL or token provided. Check both.") - except requests.RequestException as e: - raise AttributeError(f"Error validating URL: {e}") from e - except AttributeError as e: - self.logger.error(f"Error validating URL: {e}") - - def _cache_get(self, key: str, default: Any = None) -> Any: - """ - Get a value from the cache if configured. - - :param key: The key to retrieve from the cache. - :type key: str - :param default: The default value to return if the key is not found. - :type default: any, optional - :return: The value from the cache or the default value. - :rtype: any - - """ - return self.cache.get(key, default=default) - - def _cache_set(self, key: str, value: Any) -> None: - """ - Set a value in the cache if configured. - - :param key: The key to set in the cache. - :type key: str - :param value: The value to set in the cache. - :type value: any - - """ - self.cache.set(key, value) - - def _cache_list(self) -> list[str]: - """List keys in the cache if configured.""" - return self.cache.list() - - def _cache_delete(self, key: str) -> None: - """ - Delete a key from the cache if configured. - - :param key: The key to delete from the cache. - :type key: str - - """ - self.cache.delete(key) diff --git a/tfbpapi/AbstractRecordsAndFilesAPI.py b/tfbpapi/AbstractRecordsAndFilesAPI.py deleted file mode 100644 index 87f99ad..0000000 --- a/tfbpapi/AbstractRecordsAndFilesAPI.py +++ /dev/null @@ -1,314 +0,0 @@ -import csv -import gzip -import os -import tarfile -import tempfile -from collections.abc import Callable -from io import BytesIO -from typing import Any - -import aiohttp -import pandas as pd - -from tfbpapi.AbstractAPI import AbstractAPI - - -class AbstractRecordsAndFilesAPI(AbstractAPI): - """ - Abstract class to interact with both the records and the data stored in the `file` - field. - - The return for this class must be records, against the `/export` - endpoint when `retrieve_files` is False. When `retrieve_files` is True, the cache - should be checked first. If the file doesn't exist there, it should be retrieved - from the database against the `/record_table_and_files` endpoint. The file should - be a tarball with the metadata.csv and the file associated with the record, - where the file is named according to the `id` field in metadata.csv. Data files - should be `.csv.gz`. - - """ - - def __init__(self, **kwargs): - """ - Initialize the AbstractRecordsAndFilesAPI object. This will serve as an - interface to an endpoint that can serve both records and files, and cache the - file/retrieve from the cache if it exists. - - :param kwargs: parameters to pass to AbstractAPI. 
- - """ - self.export_url_suffix = kwargs.pop("export_url_suffix", "export") - self.export_files_url_suffix = kwargs.pop( - "export_files_url_suffix", "record_table_and_files" - ) - super().__init__(**kwargs) - - @property - def export_url_suffix(self) -> str: - """The URL suffix for exporting records.""" - return self._export_url_suffix - - @export_url_suffix.setter - def export_url_suffix(self, value: str) -> None: - self._export_url_suffix = value - - @property - def export_files_url_suffix(self) -> str: - """The URL suffix for exporting files.""" - return self._export_files_url_suffix - - @export_files_url_suffix.setter - def export_files_url_suffix(self, value: str) -> None: - self._export_files_url_suffix = value - - def _detect_delimiter(self, file_path: str, sample_size: int = 1024) -> str: - """ - Detect the delimiter of a CSV file. - - :param file_path: The path to the CSV file. - :type file_path: str - :param sample_size: The number of bytes to read from the file to detect the - delimiter. Defaults to 1024. - :type sample_size: int - :return: The delimiter of the CSV file. - :rtype: str - :raises FileNotFoundError: If the file does not exist. - :raises gzip.BadGzipFile: If the file is not a valid gzip file. - :raises _csv.Error: If the CSV sniffer cannot determine the delimiter. - - """ - try: - # by default, open() uses newline=False, which opens the file - # in universal newline mode and translates all new line characters - # to '\n' - file = ( - gzip.open(file_path, "rt") - if file_path.endswith(".gz") - else open(file_path) - ) - except FileNotFoundError as exc: - raise FileNotFoundError(f"File {file_path} not found.") from exc - - sample = file.read(sample_size) - - # In order to avoid errors in the csv sniffer, attempt to find the - # last newline character in the string - last_newline_index = sample.rfind("\n") - # if a newline character is found, trim the sample to the last newline - if last_newline_index != -1: - # Trim to the last complete line - sample = sample[:last_newline_index] - - sniffer = csv.Sniffer() - dialect = sniffer.sniff(sample) - delimiter = dialect.delimiter - - file.close() - - return delimiter - - async def read( - self, - callback: Callable[ - [pd.DataFrame, dict[str, Any] | None, Any], Any - ] = lambda metadata, data, cache, **kwargs: ( - {"metadata": metadata, "data": data} - ), - retrieve_files: bool = False, - **kwargs, - ) -> Any: - """ - Retrieve data from the endpoint according to the `retrieve_files` parameter. If - `retrieve_files` is False, the records will be returned as a dataframe. If - `retrieve_files` is True, the files associated with the records will be - retrieved either from the local cache or from the database. Note that a user can - select which effect_colname and pvalue_colname is used for a genomicfile (see - database documentation for more details). If one or both of those are present in - the params, and retrieve_file is true, then that column name is added to the - cache_key. Eg if record 1 is being retrieved from mcisaac data with - effect_colname "log2_raio", then the cache_key for that data will be - "1_log2_ratio". The default effect colname, which is set by the database, will - be stored with only the record id as the cache_key. - - :param callback: The function to call with the metadata. Signature must - include `metadata`, `data`, and `cache`. - :type callback: Callable[[pd.DataFrame, dict[str, Any] | None, Any], Any] - :param retrieve_files: Boolean. Whether to retrieve the files associated with - the records. 
Defaults to False. - :type retrieve_files: bool - :param kwargs: The following kwargs are used by the read() function. Any - others are passed onto the callback function - - timeout: The timeout for the GET request. Defaults to 120. - - :return: The result of the callback function. - :rtype: Any - - :raises ValueError: If the callback function does not have the correct - signature. - :raises aiohttp.ClientError: If there is an error in the GET request. - :raises pd.errors.ParserError: If there is an error reading the request - - """ - if not callable(callback) or {"metadata", "data", "cache"} - set( - callback.__code__.co_varnames - ): - raise ValueError( - "The callback must be a callable function with `metadata`, `data`, ", - "and `cache` as parameters.", - ) - - export_url = f"{self.url.rstrip('/')}/{self.export_url_suffix}" - self.logger.debug("read() export_url: %s", export_url) - - timeout = aiohttp.ClientTimeout(kwargs.pop("timeout", 120)) - async with aiohttp.ClientSession(timeout=timeout) as session: - try: - async with session.get( - export_url, headers=self.header, params=self.params - ) as response: - response.raise_for_status() - content = await response.content.read() - with gzip.GzipFile(fileobj=BytesIO(content)) as f: - records_df = pd.read_csv(f) - - if not retrieve_files: - return callback(records_df, None, self.cache, **kwargs) - else: - data_list = await self._retrieve_files(session, records_df) - return callback( - records_df, - data_list, - self.cache, - **kwargs, - ) - - except aiohttp.ClientError as e: - self.logger.error(f"Error in GET request: {e}") - raise - except pd.errors.ParserError as e: - self.logger.error(f"Error reading request content: {e}") - raise - - async def _retrieve_files( - self, session: aiohttp.ClientSession, records_df: pd.DataFrame - ) -> dict[str, pd.DataFrame]: - """ - Retrieve files associated with the records either from the local cache or from - the database. - - :param session: The aiohttp ClientSession. - :type session: aiohttp.ClientSession - :param records_df: The DataFrame containing the records. - :type records_df: pd.DataFrame - :return: A dictionary where the keys are record IDs and the values are - DataFrames of the associated files. - :rtype: dict[str, pd.DataFrame] - - """ - data_list = {} - for record_id in records_df["id"]: - data_list[str(record_id)] = await self._retrieve_file(session, record_id) - return data_list - - async def _retrieve_file( - self, session: aiohttp.ClientSession, record_id: int - ) -> pd.DataFrame: - """ - Retrieve a file associated with a record either from the local cache or from the - database. - - :param session: The aiohttp ClientSession. - :type session: aiohttp.ClientSession - :param record_id: The ID of the record. - :type record_id: int - :return: A DataFrame containing the file's data. - :rtype: pd.DataFrame - :raises FileNotFoundError: If the file is not found in the tar archive. - :raises ValueError: If the delimiter is not supported. 
- - """ - export_files_url = f"{self.url.rstrip('/')}/{self.export_files_url_suffix}" - self.logger.debug("_retrieve_file() export_url: %s", export_files_url) - - # set key for local cache - cache_key = str(record_id) - if "effect_colname" in self.params: - cache_key += f"_{self.params['effect_colname']}" - if "pvalue_colname" in self.params: - cache_key += f"_{self.params['pvalue_colname']}" - cached_data = self._cache_get(cache_key) - if cached_data is not None: - self.logger.info(f"cache_key {cache_key} retrieved from cache.") - return pd.read_json(BytesIO(cached_data.encode())) - else: - self.logger.debug(f"cache_key {cache_key} not found in cache.") - - try: - header = self.header.copy() - header["Content-Type"] = "application/gzip" - retrieve_files_params = self.params.copy() - retrieve_files_params.update({"id": record_id}) - async with session.get( - export_files_url, - headers=header, - params=retrieve_files_params, - timeout=120, - ) as response: - response.raise_for_status() - tar_data = await response.read() - - # Create a temporary file for the tarball - tar_file = tempfile.NamedTemporaryFile(delete=False, suffix=".tar.gz") - try: - tar_file.write(tar_data) - tar_file.flush() - tar_file.seek(0) - - # Create a temporary directory for extraction - with tempfile.TemporaryDirectory() as extract_dir: - # Open the tar file and log its contents - with tarfile.open(fileobj=tar_file, mode="r:gz") as tar: - tar_members = tar.getmembers() - self.logger.debug( - f"Tar file contains: " - f"{[member.name for member in tar_members]}", - ) - - # Find the specific file to extract - csv_filename = f"{record_id}.csv.gz" - member = next( - (m for m in tar_members if m.name == csv_filename), None - ) - if member is None: - raise FileNotFoundError( - f"{csv_filename} not found in tar archive" - ) - - # Extract only the specific member - tar.extract(member, path=extract_dir) - - # Read the extracted CSV file - csv_path = os.path.join(extract_dir, csv_filename) - self.logger.debug(f"Extracted file: {csv_path}") - - delimiter = self._detect_delimiter(csv_path) - - # raise an error if the delimiter is not a "," or a "\t" - if delimiter not in [",", "\t"]: - raise ValueError( - f"Delimiter {delimiter} is not supported. " - "Supported delimiters are ',' and '\\t'." - ) - - df = pd.read_csv(csv_path, delimiter=delimiter) - - # Store the data in the cache - self.logger.debug(f"Storing {cache_key} in cache.") - self._cache_set(cache_key, df.to_json()) - finally: - os.unlink(tar_file.name) - - return df - except Exception as e: - self.logger.error(f"Error retrieving file for cache_key {cache_key}: {e}") - raise diff --git a/tfbpapi/AbstractRecordsOnlyAPI.py b/tfbpapi/AbstractRecordsOnlyAPI.py deleted file mode 100644 index 1751ec7..0000000 --- a/tfbpapi/AbstractRecordsOnlyAPI.py +++ /dev/null @@ -1,82 +0,0 @@ -import gzip -import logging -from collections.abc import Callable -from io import BytesIO -from typing import Any - -import aiohttp -import pandas as pd - -from tfbpapi.AbstractAPI import AbstractAPI - - -class AbstractRecordsOnlyAPI(AbstractAPI): - """Abstract class for CRUD operations on records-only (no file storage) - endpoints.""" - - def __init__(self, **kwargs): - """ - Initialize the RecordsOnlyAPI object. - - :param kwargs: Additional parameters to pass to AbstractAPI. 
- - """ - self.logger = logging.getLogger(__name__) - super().__init__(**kwargs) - - async def read( - self, - callback: Callable[ - [pd.DataFrame, dict[str, Any] | None, Any], Any - ] = lambda metadata, data, cache, **kwargs: { - "metadata": metadata, - "data": data, - }, - export_url_suffix="export", - **kwargs, - ) -> Any: - """ - Retrieve data from the endpoint. The data will be returned as a dataframe. The - callback function must take metadata, data, and cache as parameters. - - :param callback: The function to call with the data. Signature must - include `metadata`, `data`, and `cache` as parameters. - :param export_url_suffix: The URL suffix for the export endpoint. This will - return a response object with a csv file. - :param kwargs: This can be used to pass "params" to the request to use in place - of `self.params`. If those are passed, they will be popped off and then - the remaining kwargs will be passed to the callback function - - """ - if not callable(callback) or {"metadata", "data", "cache"} - set( - callback.__code__.co_varnames - ): - raise ValueError( - "The callback must be a callable function with `metadata`,", - "`data`, and `cache` as parameters.", - ) - - export_url = f"{self.url.rstrip('/')}/{export_url_suffix}" - self.logger.debug("read() export_url: %s", export_url) - - async with aiohttp.ClientSession() as session: - try: - # note that the url and the export suffix are joined such that - # the url is stripped of any trailing slashes and the export suffix is - # added without a leading slash - async with session.get( - export_url, - headers=self.header, - params=kwargs.pop("params", self.params), - ) as response: - response.raise_for_status() - content = await response.content.read() - with gzip.GzipFile(fileobj=BytesIO(content)) as f: - records_df = pd.read_csv(f) - return callback(records_df, None, self.cache, **kwargs) - except aiohttp.ClientError as e: - self.logger.error(f"Error in GET request: {e}") - raise - except pd.errors.ParserError as e: - self.logger.error(f"Error reading request content: {e}") - raise diff --git a/tfbpapi/BindingAPI.py b/tfbpapi/BindingAPI.py deleted file mode 100644 index b766b37..0000000 --- a/tfbpapi/BindingAPI.py +++ /dev/null @@ -1,62 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsAndFilesAPI import ( - AbstractRecordsAndFilesAPI, -) - - -class BindingAPI(AbstractRecordsAndFilesAPI): - """Class to interact with the BindingAPI endpoint.""" - - def __init__(self, **kwargs) -> None: - """ - Initialize the BindingAPI object. - - :param kwargs: parameters to pass through AbstractRecordsAndFilesAPI to - AbstractAPI. 
- - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - [ - "id", - "regulator", - "regulator_locus_tag", - "regulator_symbol", - "batch", - "replicate", - "source", - "source_name", - "source_orig_id", - "strain", - "condition", - "lab", - "assay", - "workflow", - "data_usable", - ], - ) - - url = kwargs.pop("url", os.getenv("BINDING_URL", None)) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The BindingAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The BindingAPI does not support update.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The BindingAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The BindingAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError("The BindingAPI does not support retrieve.") diff --git a/tfbpapi/BindingConcatenatedAPI.py b/tfbpapi/BindingConcatenatedAPI.py deleted file mode 100644 index 1ad6aff..0000000 --- a/tfbpapi/BindingConcatenatedAPI.py +++ /dev/null @@ -1,62 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsAndFilesAPI import ( - AbstractRecordsAndFilesAPI, -) - - -class BindingConcatenatedAPI(AbstractRecordsAndFilesAPI): - """Class to interact with the BindingConcatenatedAPI endpoint.""" - - def __init__(self, **kwargs) -> None: - """ - Initialize the BindingConcatenatedAPI object. - - :param kwargs: parameters to pass through AbstractRecordsAndFilesAPI to - AbstractAPI. - - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - [ - "id", - "regulator", - "regulator_locus_tag", - "regulator_symbol", - "batch", - "replicate", - "source", - "strain", - "condition", - "lab", - "assay", - "workflow", - "data_usable", - ], - ) - - url = kwargs.pop("url", os.getenv("BINDINGCONCATENATED_URL", None)) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The BindingConcatenatedAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The BindingConcatenatedAPI does not support update.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The BindingConcatenatedAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The BindingConcatenatedAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError( - "The BindingConcatenatedAPI does not support retrieve." - ) diff --git a/tfbpapi/BindingManualQCAPI.py b/tfbpapi/BindingManualQCAPI.py deleted file mode 100644 index df4169b..0000000 --- a/tfbpapi/BindingManualQCAPI.py +++ /dev/null @@ -1,106 +0,0 @@ -import os -from typing import Any - -import pandas as pd -import requests # type: ignore - -from tfbpapi.AbstractRecordsOnlyAPI import AbstractRecordsOnlyAPI - - -class BindingManualQCAPI(AbstractRecordsOnlyAPI): - """A class to interact with the BindingManualQCAPI endpoint.""" - - def __init__(self, **kwargs): - """ - Initialize the BindingManualQCAPI object. 
- - :param kwargs: parameters to pass to AbstractAPI via AbstractRecordsOnlyAPI. - - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - [ - "id", - "binding", - "best_datatype", - "data_usable", - "passing_replicate", - "rank_recall", - "regulator", - "regulator_locus_tag", - "regulator_symbol", - "batch", - "source", - ], - ) - - url = kwargs.pop("url", os.getenv("BINDINGMANUALQC_URL", None)) - if not url: - raise AttributeError( - "url must be provided or the environmental variable ", - "`BINDINGMANUALQC_URL` must be set", - ) - - self.bulk_update_url_suffix = kwargs.pop( - "bulk_update_url_suffix", "bulk-update" - ) - - super().__init__(url=url, valid_param_keys=valid_param_keys, **kwargs) - - @property - def bulk_update_url_suffix(self) -> str: - """The URL suffix for updating multiple records in the same request.""" - return self._bulk_update_url_suffix - - @bulk_update_url_suffix.setter - def bulk_update_url_suffix(self, value: str) -> None: - self._bulk_update_url_suffix = value - - def update(self, df: pd.DataFrame, **kwargs: Any) -> requests.Response: - """ - Update the records in the database. - - :param df: The DataFrame containing the records to update. - :type df: pd.DataFrame - :param kwargs: Additional fields to include in the payload. - :type kwargs: Any - :return: The response from the POST request. - :rtype: requests.Response - :raises requests.RequestException: If the request fails. - - """ - bulk_update_url = ( - f"{self.url.rstrip('/')}/{self.bulk_update_url_suffix.rstrip('/')}/" - ) - - self.logger.debug("bulk_update_url: %s", bulk_update_url) - - # Include additional fields in the payload if provided - payload = {"data": df.to_dict(orient="records")} - payload.update(kwargs) - - try: - response = requests.post( - bulk_update_url, - headers=self.header, - json=payload, - ) - response.raise_for_status() - return response - except requests.RequestException as e: - self.logger.error(f"Error in POST request: {e}") - raise - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The BindingManualQCAPI does not support create.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The BindingManualQCAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The BindingManualQCAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError("The BindingManualQCAPI does not support retrieve.") diff --git a/tfbpapi/Cache.py b/tfbpapi/Cache.py deleted file mode 100644 index 366604d..0000000 --- a/tfbpapi/Cache.py +++ /dev/null @@ -1,29 +0,0 @@ -import logging -from typing import Any - -from cachetools import TTLCache # type: ignore - - -class Cache: - """A caching class that uses cachetools for TTL caching with an LRU eviction - policy.""" - - def __init__(self, maxsize: int = 100, ttl: int = 300): - self.ttl_cache = TTLCache(maxsize=maxsize, ttl=ttl) - self.logger = logging.getLogger(__name__) - - def get(self, key: str, default: Any = None) -> Any: - """Get a value from the cache.""" - return self.ttl_cache.get(key, default) - - def set(self, key: str, value: Any) -> None: - """Set a value in the cache.""" - self.ttl_cache[key] = value - - def list(self) -> list[str]: - """List all keys in the cache.""" - return list(self.ttl_cache.keys()) - - def delete(self, key: str) -> None: - """Delete a key from the cache.""" - self.ttl_cache.pop(key, None) 
diff --git a/tfbpapi/CallingCardsBackgroundAPI.py b/tfbpapi/CallingCardsBackgroundAPI.py deleted file mode 100644 index f5b7668..0000000 --- a/tfbpapi/CallingCardsBackgroundAPI.py +++ /dev/null @@ -1,56 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsAndFilesAPI import ( - AbstractRecordsAndFilesAPI, -) - - -class CallingCardsBackgroundAPI(AbstractRecordsAndFilesAPI): - """Class to interact with the CallingCardsBackgroundAPI endpoint.""" - - def __init__(self, **kwargs) -> None: - """ - Initialize the CallingCardsBackgroundAPI object. - - :param kwargs: parameters to pass through AbstractRecordsAndFilesAPI to - AbstractAPI. - - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - ["id", "name"], - ) - - url = kwargs.pop("url", os.getenv("CALLINGCARDSBACKGROUND_URL", None)) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError( - "The CallingCardsBackgroundAPI does not support create." - ) - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError( - "The CallingCardsBackgroundAPI does not support update." - ) - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError( - "The CallingCardsBackgroundAPI does not support delete." - ) - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError( - "The CallingCardsBackgroundAPI does not support submit." - ) - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError( - "The CallingCardsBackgroundAPI does not support retrieve." - ) diff --git a/tfbpapi/DataSourceAPI.py b/tfbpapi/DataSourceAPI.py deleted file mode 100644 index 0d00785..0000000 --- a/tfbpapi/DataSourceAPI.py +++ /dev/null @@ -1,48 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsOnlyAPI import AbstractRecordsOnlyAPI - - -class DataSourceAPI(AbstractRecordsOnlyAPI): - """A class to interact with the DataSourceAPI endpoint.""" - - def __init__(self, **kwargs): - """ - Initialize the DataSourceAPI object. - - :param kwargs: parameters to pass to AbstractAPI via AbstractRecordsOnlyAPI. 
- - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - ["id", "fileformat_id", "fileformat", "lab", "assay", "workflow"], - ) - - url = kwargs.pop("url", os.getenv("DATASOURCE_URL", None)) - if not url: - raise AttributeError( - "url must be provided or the environmental variable ", - "`DATASOURCE_URL` must be set", - ) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The DataSourceAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The DataSourceAPI does not support update.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The DataSourceAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The DataSourceAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError("The DataSourceAPI does not support retrieve.") diff --git a/tfbpapi/DtoAPI.py b/tfbpapi/DtoAPI.py deleted file mode 100644 index bc8d404..0000000 --- a/tfbpapi/DtoAPI.py +++ /dev/null @@ -1,295 +0,0 @@ -import asyncio -import json -import os -import time -from typing import Any - -import aiohttp -import pandas as pd -import requests # type: ignore - -from tfbpapi.AbstractRecordsOnlyAPI import AbstractRecordsOnlyAPI - - -class DtoAPI(AbstractRecordsOnlyAPI): - """ - A class to interact with the DTO API. - - Retrieves dto data from the database. - - """ - - def __init__(self, **kwargs) -> None: - """ - Initialize the DTO object. This will serve as an interface to the DTO endpoint - of both the database and the application cache. - - :param url: The URL of the DTO API - :param kwargs: Additional parameters to pass to AbstractAPI. - - """ - - self.bulk_update_url_suffix = kwargs.pop( - "bulk_update_url_suffix", "bulk-update" - ) - - super().__init__( - url=kwargs.pop("url", os.getenv("DTO_URL", "")), - **kwargs, - ) - - async def read(self, *args, **kwargs) -> Any: - """ - Override the read() method to use a custom callback that parses metadata. - - :param callback: The function to call with the metadata. Defaults to parsing - metadata. - :type callback: Callable[[pd.DataFrame, dict[str, Any] | None, Any], Any] - :return: The result of the callback function. - :rtype: Any - - """ - - # Define the default callback - def dto_callback(metadata, data, cache, **kwargs): - return {"metadata": self.parse_metadata(metadata), "data": data} - - # Explicitly set the callback argument to dto_callback - kwargs["callback"] = dto_callback - - # Call the superclass method with updated kwargs - return await super().read(*args, **kwargs) - - async def submit( - self, - post_dict: dict[str, Any], - **kwargs, - ) -> Any: - """ - Submit a DTO task to the DTO API. - - :param post_dict: The dictionary to submit to the DTO API. The typing needs to - be adjusted -- it can take a list of dictionaries to submit a batch. - :return: The group_task_id of the submitted task. 
- - """ - # make a post request with the post_dict to dto_url - dto_url = f"{self.url.rstrip('/')}/submit/" - self.logger.debug("dto_url: %s", dto_url) - - async with aiohttp.ClientSession() as session: - async with session.post( - dto_url, headers=self.header, json=post_dict - ) as response: - try: - response.raise_for_status() - except aiohttp.ClientResponseError as e: - self.logger.error( - "Failed to submit DTO task: Status %s, Reason %s", - e.status, - e.message, - ) - raise - result = await response.json() - try: - return result["group_task_id"] - except KeyError: - self.logger.error( - "Expected 'group_task_id' in response: %s", json.dumps(result) - ) - raise - - async def retrieve( - self, - group_task_id: str, - timeout: int = 300, - polling_interval: int = 2, - **kwargs, - ) -> dict[str, pd.DataFrame]: - """ - Periodically check the task status and retrieve the result when the task - completes. - - :param group_task_id: The task ID to retrieve results for. - :param timeout: The maximum time to wait for the task to complete (in seconds). - :param polling_interval: The time to wait between status checks (in seconds). - :return: Records from the DTO API of the successfully completed task. - - """ - # Start time for timeout check - start_time = time.time() - - # Task status URL - status_url = f"{self.url.rstrip('/')}/status/" - - while True: - async with aiohttp.ClientSession() as session: - # Send a GET request to check the task status - async with session.get( - status_url, - headers=self.header, - params={"group_task_id": group_task_id}, - ) as response: - response.raise_for_status() # Raise an error for bad status codes - status_response = await response.json() - - # Check if the task is complete - if status_response.get("status") == "SUCCESS": - - if error_tasks := status_response.get("error_tasks"): - self.logger.error( - f"Tasks {group_task_id} failed: {error_tasks}" - ) - if success_tasks := status_response.get("success_pks"): - params = {"id": ",".join(str(pk) for pk in success_tasks)} - return await self.read(params=params) - elif status_response.get("status") == "FAILURE": - raise Exception( - f"Task {group_task_id} failed: {status_response}" - ) - - # Check if we have reached the timeout - elapsed_time = time.time() - start_time - if elapsed_time > timeout: - raise TimeoutError( - f"Task {group_task_id} did not " - "complete within {timeout} seconds." - ) - - # Wait for the specified polling interval before checking again - await asyncio.sleep(polling_interval) - - def create(self, data: dict[str, Any], **kwargs) -> requests.Response: - raise NotImplementedError("The DTO does not support create.") - - def update(self, df: pd.DataFrame, **kwargs: Any) -> requests.Response: - """ - Update the records in the database. - - :param df: The DataFrame containing the records to update. - :type df: pd.DataFrame - :param kwargs: Additional fields to include in the payload. - :type kwargs: Any - :return: The response from the POST request. - :rtype: requests.Response - :raises requests.RequestException: If the request fails. 
- - """ - bulk_update_url = ( - f"{self.url.rstrip('/')}/{self.bulk_update_url_suffix.rstrip('/')}/" - ) - - self.logger.debug("bulk_update_url: %s", bulk_update_url) - - # Include additional fields in the payload if provided - payload = {"data": df.to_dict(orient="records")} - payload.update(kwargs) - - try: - response = requests.post( - bulk_update_url, - headers=self.header, - json=payload, - ) - response.raise_for_status() - return response - except requests.RequestException as e: - self.logger.error(f"Error in POST request: {e}") - raise - - def delete(self, id: str, **kwargs) -> Any: - """ - Delete a DTO record from the database. - - :param id: The ID of the DTO record to delete. - :return: A dictionary with a status message indicating success or failure. - - """ - # Include the Authorization header with the token - headers = kwargs.get("headers", {}) - headers["Authorization"] = f"Token {self.token}" - - # Make the DELETE request with the updated headers - response = requests.delete(f"{self.url}/{id}/", headers=headers, **kwargs) - - if response.status_code == 204: - return {"status": "success", "message": "DTO deleted successfully."} - - # Raise an error if the response indicates failure - response.raise_for_status() - - def parse_metadata(self, metadata: pd.DataFrame) -> pd.DataFrame: - """ - Parse the metadata from the DTO API. - - :param metadata: The metadata DataFrame to parse. - :return: The parsed metadata DataFrame. - :raises KeyError: If the metadata DataFrame is missing required columns. - - """ - if metadata.empty: - self.logger.warning("Metadata is empty") - return metadata - - output_columns = [ - "id", - "promotersetsig", - "expression", - "regulator_symbol", - "binding_source", - "expression_source", - "passing_fdr", - "passing_pvalue", - ] - - # required columns are "result" and output_columns - missing_req_columns = [ - col for col in ["result"] + output_columns if col not in metadata.columns - ] - if missing_req_columns: - raise KeyError( - "Metadata is missing required columns: " - "{', '.join(missing_req_columns)}" - ) - - dto_results_list = [] - - # Check and rename keys, logging a warning if a key is missing - keys_to_rename = { - "rank1": "binding_rank_threshold", - "rank2": "perturbation_rank_threshold", - "set1_len": "binding_set_size", - "set2_len": "perturbation_set_size", - } - - for _, row in metadata.iterrows(): - dto_results = json.loads(row.result.replace("'", '"')) - - for old_key, new_key in keys_to_rename.items(): - if old_key in dto_results: - dto_results[new_key] = dto_results.pop(old_key) - else: - self.logger.warning( - f"Key '{old_key}' missing in row with id '{row.id}'." 
- ) - - dto_results["id"] = row.id - dto_results["promotersetsig"] = row.promotersetsig - dto_results["expression"] = row.expression - dto_results["regulator_symbol"] = row.regulator_symbol - dto_results["binding_source"] = row.binding_source - dto_results["expression_source"] = row.expression_source - dto_results["passing_fdr"] = row.passing_fdr - dto_results["passing_pvalue"] = row.passing_pvalue - - dto_results_list.append(dto_results) - - # Create DataFrame - result_df = pd.DataFrame(dto_results_list) - - # Reorder columns: output_columns first, followed by others - reordered_columns = output_columns + [ - col for col in result_df.columns if col not in output_columns - ] - - return result_df.loc[:, reordered_columns] diff --git a/tfbpapi/ExpressionAPI.py b/tfbpapi/ExpressionAPI.py deleted file mode 100644 index c61e1f7..0000000 --- a/tfbpapi/ExpressionAPI.py +++ /dev/null @@ -1,66 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsAndFilesAPI import ( - AbstractRecordsAndFilesAPI, -) - - -class ExpressionAPI(AbstractRecordsAndFilesAPI): - """Class to interact with the ExpressionAPI endpoint.""" - - def __init__(self, **kwargs) -> None: - """ - Initialize the ExpressionAPI object. - - :param kwargs: parameters to pass through AbstractRecordsAndFilesAPI to - AbstractAPI. - - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - [ - "id", - "regulator", - "regulator_locus_tag", - "regulator_symbol", - "batch", - "control", - "mechanism", - "restriction", - "time", - "strain", - "source", - "source_name", - "source_time", - "lab", - "assay", - "workflow", - "effect_colname", - "pvalue_colname", - "preferred_replicate", - ], - ) - - url = kwargs.pop("url", os.getenv("EXPRESSION_URL", None)) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The ExpressionAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The ExpressionAPI does not support update.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The ExpressionAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The ExpressionAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError("The ExpressionAPI does not support retrieve.") diff --git a/tfbpapi/ExpressionManualQCAPI.py b/tfbpapi/ExpressionManualQCAPI.py deleted file mode 100644 index 80023e6..0000000 --- a/tfbpapi/ExpressionManualQCAPI.py +++ /dev/null @@ -1,103 +0,0 @@ -import os -from typing import Any - -import pandas as pd -import requests # type: ignore - -from tfbpapi.AbstractRecordsOnlyAPI import AbstractRecordsOnlyAPI - - -class ExpressionManualQCAPI(AbstractRecordsOnlyAPI): - """A class to interact with the ExpressionManualQCAPI endpoint.""" - - def __init__(self, **kwargs): - """ - Initialize the ExpressionManualQCAPI object. - - :param kwargs: parameters to pass to AbstractAPI via AbstractRecordsOnlyAPI. 
- - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - [ - "id", - "expression", - "strain_verified", - "regulator_locus_tag", - "regulator_symbol", - "batch", - "replicate", - "control", - "mechanism", - "restriction", - "time", - "source", - "lab", - "assay", - "workflow", - ], - ) - - url = kwargs.pop("url", os.getenv("EXPRESSIONMANUALQC_URL", None)) - if not url: - raise AttributeError( - "url must be provided or the environmental variable ", - "`EXPRESSIONMANUALQC_URL` must be set", - ) - - self.bulk_update_url_suffix = kwargs.pop( - "bulk_update_url_suffix", "bulk-update" - ) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The ExpressionManualQCAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs: Any) -> requests.Response: - """ - Update the records in the database. - - :param df: The DataFrame containing the records to update. - :type df: pd.DataFrame - :param kwargs: Additional fields to include in the payload. - :type kwargs: Any - :return: The response from the POST request. - :rtype: requests.Response - :raises requests.RequestException: If the request fails. - - """ - bulk_update_url = ( - f"{self.url.rstrip('/')}/{self.bulk_update_url_suffix.rstrip('/')}/" - ) - - self.logger.debug("bulk_update_url: %s", bulk_update_url) - - # Include additional fields in the payload if provided - payload = {"data": df.to_dict(orient="records")} - payload.update(kwargs) - - try: - response = requests.post( - bulk_update_url, - headers=self.header, - json=payload, - ) - response.raise_for_status() - return response - except requests.RequestException as e: - self.logger.error(f"Error in POST request: {e}") - raise - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The ExpressionManualQCAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The ExpressionManualQCAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError( - "The ExpressionManualQCAPI does not support retrieve." - ) diff --git a/tfbpapi/FileFormatAPI.py b/tfbpapi/FileFormatAPI.py deleted file mode 100644 index bccdcc1..0000000 --- a/tfbpapi/FileFormatAPI.py +++ /dev/null @@ -1,57 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsOnlyAPI import AbstractRecordsOnlyAPI - - -class FileFormatAPI(AbstractRecordsOnlyAPI): - """A class to interact with the FileFormatAPI endpoint.""" - - def __init__(self, **kwargs): - """ - Initialize the FileFormatAPI object. - - :param kwargs: parameters to pass to AbstractAPI via AbstractRecordsOnlyAPI. 
- - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - [ - "fileformat", - "fields", - "separator", - "feature_identifier_col", - "effect_col", - "default_effect_threshold", - "pval_col", - "default_pvalue_threshold", - ], - ) - - url = kwargs.pop("url", os.getenv("FILEFORMAT_URL", None)) - if not url: - raise AttributeError( - "url must be provided or the environmental variable ", - "`FILEFORMAT_URL` must be set", - ) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The FileFormatAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The FileFormatAPI does not support update.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The FileFormatAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The FileFormatAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError("The FileFormatAPI does not support retrieve.") diff --git a/tfbpapi/GenomicFeatureAPI.py b/tfbpapi/GenomicFeatureAPI.py deleted file mode 100644 index 499cb6c..0000000 --- a/tfbpapi/GenomicFeatureAPI.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsOnlyAPI import AbstractRecordsOnlyAPI - - -class GenomicFeatureAPI(AbstractRecordsOnlyAPI): - """A class to interact with the GenomicFeatureAPI endpoint.""" - - def __init__(self, **kwargs): - """ - Initialize the GenomicFeatureAPI object. - - :param kwargs: parameters to pass to AbstractAPI via AbstractRecordsOnlyAPI. - - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - [ - "id", - "chr", - "start", - "end", - "strand", - "type", - "locus_tag", - "symbol", - "source", - "alias", - "note", - ], - ) - - url = kwargs.pop("url", os.getenv("GENOMICFEATURE_URL", None)) - if not url: - raise AttributeError( - "url must be provided or the environmental variable ", - "`GENOMICFEATURE_URL` must be set", - ) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The GenomicFeatureAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The GenomicFeatureAPI does not support update.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The GenomicFeatureAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The GenomicFeatureAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError("The GenomicFeatureAPI does not support retrieve.") diff --git a/tfbpapi/ParamsDict.py b/tfbpapi/ParamsDict.py deleted file mode 100644 index 19f7470..0000000 --- a/tfbpapi/ParamsDict.py +++ /dev/null @@ -1,156 +0,0 @@ -from typing import Any, Union - - -class ParamsDict(dict): - """ - A dictionary subclass that ensures all keys are strings and supports multiple key- - value assignments at once, with validation against a list of valid keys. 
- - This class is designed to be used for passing parameters to HTTP requests and - extends the base dictionary class, ensuring that insertion order is preserved. - - """ - - def __init__(self, params: dict[str, Any] = {}, valid_keys: list[str] = []) -> None: - """ - Initialize the ParamsDict with optional initial parameters and valid keys. - - :param params: A dictionary of initial parameters. All keys must be strings. - :type params: dict, optional - :param valid_keys: A list of valid keys for validation. - :type valid_keys: list of str, optional - :raises ValueError: If `params` is not a dictionary or if any of the keys - are not strings. - - """ - params = params or {} - valid_keys = valid_keys or [] - if not isinstance(params, dict): - raise ValueError("params must be a dictionary") - if len(params) > 0 and not all(isinstance(k, str) for k in params.keys()): - raise ValueError("params must be a dictionary with string keys") - super().__init__(params) - self._valid_keys = valid_keys - - def __setitem__(self, key: str | list[str], value: Any | list[Any]) -> None: - """ - Set a parameter value or multiple parameter values. - - :param key: The parameter key or a list of parameter keys. - :type key: str or list of str - :param value: The parameter value or a list of parameter values. - :type value: any or list of any - :raises ValueError: If the length of `key` and `value` lists do not match. - :raises KeyError: If `key` is not a string or a list of strings. - - """ - if isinstance(key, str): - self._validate_key(key) - super().__setitem__(key, value) - elif isinstance(key, list) and isinstance(value, list): - if len(key) != len(value): - raise ValueError("Length of keys and values must match") - for k, v in zip(key, value): - if not isinstance(k, str): - raise KeyError("All keys must be strings") - self._validate_key(k) - super().__setitem__(k, v) - else: - raise KeyError("Key must be a string or list of strings") - - def __getitem__(self, key: str | list[str]) -> Union[Any, "ParamsDict"]: - """ - Get a parameter value or a new ParamsDict with specified keys. - - :param key: The parameter key or a list of parameter keys. - :type key: str or list of str - :return: The parameter value or a new ParamsDict with the specified keys. - :rtype: any or ParamsDict - :raises KeyError: If `key` is not a string or a list of strings. - - """ - if isinstance(key, str): - return super().__getitem__(key) - elif isinstance(key, list): - return ParamsDict({k: dict.__getitem__(self, k) for k in key if k in self}) - else: - raise KeyError("Key must be a string or list of strings") - - def __delitem__(self, key: str) -> None: - """ - Delete a parameter by key. - - :param key: The parameter key. - :type key: str - :raises KeyError: If `key` is not a string. - - """ - if isinstance(key, str): - super().__delitem__(key) - else: - raise KeyError("Key must be a string") - - def __repr__(self) -> str: - """ - Return a string representation of the ParamsDict. - - :return: A string representation of the ParamsDict. - :rtype: str - - """ - return f"ParamsDict({super().__repr__()})" - - def __str__(self) -> str: - """ - Return a human-readable string representation of the ParamsDict. - - :return: A human-readable string representation of the ParamsDict. 
- :rtype: str - - """ - return ", ".join(f"{k}: {v}" for k, v in self.items()) - - def update(self, *args, **kwargs) -> None: - """Update the ParamsDict with the key/value pairs from other, overwriting - existing keys.""" - if args: - other = args[0] - if isinstance(other, dict): - [self._validate_key(k) for k in other.keys()] - for key, value in other.items(): - self.__setitem__(key, value) - else: - [self._validate_key(k) for k, _ in other] - for key, value in other: - self.__setitem__(key, value) - [self._validate_key(k) for k in kwargs.keys()] - for key, value in kwargs.items(): - self.__setitem__(key, value) - - def as_dict(self) -> dict: - """ - Convert the ParamsDict to a standard dictionary. - - :return: A standard dictionary with the same items as the ParamsDict. - :rtype: dict - - """ - return dict(self) - - def _validate_key(self, key: str) -> bool: - """Validate that the key is in the list of valid keys.""" - if self._valid_keys and key not in self._valid_keys: - raise KeyError(f"Invalid parameter key provided: {key}") - return True - - @property - def valid_keys(self) -> list[str]: - """Get the list of valid keys.""" - return self._valid_keys - - @valid_keys.setter - def valid_keys(self, keys: list[str]) -> None: - """Set the list of valid keys.""" - if not all(isinstance(k, str) for k in keys): - raise ValueError("valid_keys must be a list of strings") - self._valid_keys = keys diff --git a/tfbpapi/PromoterSetAPI.py b/tfbpapi/PromoterSetAPI.py deleted file mode 100644 index f747497..0000000 --- a/tfbpapi/PromoterSetAPI.py +++ /dev/null @@ -1,46 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsAndFilesAPI import ( - AbstractRecordsAndFilesAPI, -) - - -class PromoterSetAPI(AbstractRecordsAndFilesAPI): - """Class to interact with the PromoterSetAPI endpoint.""" - - def __init__(self, **kwargs) -> None: - """ - Initialize the PromoterSetAPI object. - - :param kwargs: parameters to pass through AbstractRecordsAndFilesAPI to - AbstractAPI. - - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - ["id", "name"], - ) - - url = kwargs.pop("url", os.getenv("PROMOTERSET_URL", None)) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The PromoterSetAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The PromoterSetAPI does not support update.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The PromoterSetAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The PromoterSetAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError("The PromoterSetAPI does not support retrieve.") diff --git a/tfbpapi/PromoterSetSigAPI.py b/tfbpapi/PromoterSetSigAPI.py deleted file mode 100644 index e75609e..0000000 --- a/tfbpapi/PromoterSetSigAPI.py +++ /dev/null @@ -1,68 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsAndFilesAPI import ( - AbstractRecordsAndFilesAPI, -) - - -class PromoterSetSigAPI(AbstractRecordsAndFilesAPI): - """Class to interact with the PromoterSetSigAPI endpoint.""" - - def __init__(self, **kwargs) -> None: - """ - Initialize the PromoterSetSigAPI object. 
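# A short sketch of the ParamsDict semantics defined above: keys are checked
# against valid_keys, paired lists assign several keys at once, and indexing
# with a list of keys returns a new ParamsDict subset.
from tfbpapi.ParamsDict import ParamsDict

params = ParamsDict({"id": 1}, valid_keys=["id", "name", "lab"])
params[["name", "lab"]] = ["cbf1", "brent"]  # paired multi-key assignment
subset = params[["id", "name"]]  # ParamsDict({'id': 1, 'name': 'cbf1'})
# params["bogus"] = 1            # would raise KeyError: invalid parameter key
print(subset.as_dict())          # {'id': 1, 'name': 'cbf1'}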
- - :param kwargs: parameters to pass through AbstractRecordsAndFilesAPI to - AbstractAPI. - - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - [ - "id", - "single_binding", - "composite_binding", - "promoter", - "promoter_name", - "background", - "background_name", - "regulator_locus_tag", - "regulator_symbol", - "batch", - "replicate", - "source", - "source_name", - "lab", - "assay", - "workflow", - "data_usable", - "aggregated", - "condition", - "deduplicate", - "preferred_replicate", - ], - ) - - url = kwargs.pop("url", os.getenv("PROMOTERSETSIG_URL", None)) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The PromoterSetSigAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The PromoterSetSigAPI does not support update.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The PromoterSetSigAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The PromoterSetSigAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError("The PromoterSetSigAPI does not support retrieve.") diff --git a/tfbpapi/RankResponseAPI.py b/tfbpapi/RankResponseAPI.py deleted file mode 100644 index 6ed3330..0000000 --- a/tfbpapi/RankResponseAPI.py +++ /dev/null @@ -1,286 +0,0 @@ -import asyncio -import json -import os -import tarfile -import tempfile -import time -from typing import Any - -import aiohttp -import pandas as pd -from requests import Response, delete, post # type: ignore -from requests_toolbelt import MultipartEncoder - -from tfbpapi.AbstractRecordsAndFilesAPI import ( - AbstractRecordsAndFilesAPI, -) - - -class RankResponseAPI(AbstractRecordsAndFilesAPI): - """ - A class to interact with the Rank Response API. - - Retrieves rank response data from the database. - - """ - - def __init__(self, **kwargs) -> None: - """ - Initialize the RankResponseAPI object. This will serve as an interface to the - RankResponse endpoint of both the database and the application cache. - - :param url: The URL of the Rank Response API - :param kwargs: Additional parameters to pass to AbstractAPI. - - """ - super().__init__( - url=kwargs.pop("url", os.getenv("RANKRESPONSE_URL", "")), - **kwargs, - ) - - async def submit( - self, - post_dict: dict[str, Any], - **kwargs, - ) -> Any: - # make a post request with the post_dict to rankresponse_url - rankresponse_url = f"{self.url.rstrip('/')}/submit/" - self.logger.debug("rankresponse_url: %s", rankresponse_url) - - async with aiohttp.ClientSession() as session: - async with session.post( - rankresponse_url, headers=self.header, json=post_dict - ) as response: - response.raise_for_status() - result = await response.json() - try: - return result["group_task_id"] - except KeyError: - self.logger.error( - "Expected 'group_task_id' in response: %s", json.dumps(result) - ) - raise - - async def retrieve( - self, - group_task_id: str, - timeout: int = 300, - polling_interval: int = 2, - **kwargs, - ) -> dict[str, pd.DataFrame]: - """ - Periodically check the task status and retrieve the result when the task - completes. - - :param group_task_id: The task ID to retrieve results for. - :param timeout: The maximum time to wait for the task to complete (in seconds). 
- :param polling_interval: The time to wait between status checks (in seconds). - :return: Extracted files from the result tarball. - - """ - # Start time for timeout check - start_time = time.time() - - # Task status URL - status_url = f"{self.url.rstrip('/')}/status/" - - while True: - async with aiohttp.ClientSession() as session: - # Send a GET request to check the task status - async with session.get( - status_url, - headers=self.header, - params={"group_task_id": group_task_id}, - ) as response: - response.raise_for_status() # Raise an error for bad status codes - status_response = await response.json() - - # Check if the task is complete - if status_response.get("status") == "SUCCESS": - # Fetch and return the tarball - return await self._download_result(group_task_id) - elif status_response.get("status") == "FAILURE": - raise Exception( - f"Task {group_task_id} failed: {status_response}" - ) - - # Check if we have reached the timeout - elapsed_time = time.time() - start_time - if elapsed_time > timeout: - raise TimeoutError( - f"Task {group_task_id} did not " - "complete within {timeout} seconds." - ) - - # Wait for the specified polling interval before checking again - await asyncio.sleep(polling_interval) - - async def _download_result(self, group_task_id: str) -> Any: - """ - Download the result tarball after the task is successful. - - :param group_task_id: The group_task_id to download the results for. - :return: Extracted metadata and data from the tarball. - - """ - download_url = f"{self.url.rstrip('/')}/retrieve_task/" - - async with aiohttp.ClientSession() as session: - async with session.get( - download_url, - headers=self.header, - params={"group_task_id": group_task_id}, - ) as response: - response.raise_for_status() # Ensure request was successful - tar_data = await response.read() - - # Save tarball to a temporary file or return raw tar content - with tempfile.NamedTemporaryFile( - delete=False, suffix=".tar.gz" - ) as temp_file: - temp_file.write(tar_data) - temp_file.flush() - temp_file.seek(0) - - # Extract and return the content of the tarball - return self._extract_files(temp_file.name) - - def _extract_files(self, tar_path: str) -> dict[str, pd.DataFrame]: - """ - Extract metadata and associated files from a tarball. - - :param tar_path: The path to the tarball file. - :return: A tuple of metadata DataFrame and a dictionary of DataFrames for each - file. 
- - """ - with tarfile.open(tar_path, mode="r:gz") as tar: - tar_members = tar.getmembers() - - # Extract metadata.json - metadata_member = next( - (m for m in tar_members if m.name == "metadata.json"), None - ) - if metadata_member is None: - raise FileNotFoundError("metadata.json not found in tar archive") - - extracted_file = tar.extractfile(metadata_member) - if extracted_file is None: - raise FileNotFoundError("Failed to extract metadata.json") - - with extracted_file as f: - metadata_dict = json.load(f) - - metadata_df = pd.DataFrame(metadata_dict.values()) - metadata_df["id"] = metadata_dict.keys() - - # Extract CSV files - data = {} - for rr_id in metadata_df["id"]: - csv_filename = f"{rr_id}.csv.gz" - member = next((m for m in tar_members if m.name == csv_filename), None) - if member is None: - raise FileNotFoundError(f"{csv_filename} not found in tar archive") - - extracted_file = tar.extractfile(member) - if extracted_file is None: - raise FileNotFoundError(f"Failed to extract {csv_filename}") - - with extracted_file as f: - data[rr_id] = pd.read_csv(f, compression="gzip") - return {"metadata": metadata_df, "data": data} - - def create(self, data: dict[str, Any], **kwargs) -> Response: - """ - Create a new RankResponse record by uploading a gzipped CSV file. - - :param data: This should be the fields in the RankREsponse model, eg - "promotersetsig_id", "expression_id" and "parameters". - :param kwargs: Additional parameters to pass to the post. This must include a - DataFrame to upload as a CSV file with the keyword `df`, eg `df=my_df`. - - :return: The result of the post request. - - :raises ValueError: If a DataFrame is not provided in the keyword arguments. - :raises TypeError: If the DataFrame provided is not a pandas DataFrame. - - """ - # ensure that the url ends in a slash - rankresponse_url = f"{self.url.rstrip('/')}/" - df = kwargs.pop("df", None) - - if df is None: - raise ValueError( - "A DataFrame must be provided to create " - "a RankResponse via keyword `df`" - ) - if not isinstance(df, pd.DataFrame): - raise TypeError( - f"Expected a DataFrame for keyword `df`, got {type(df).__name__}" - ) - - # Create a temporary gzipped CSV file from the DataFrame - with tempfile.NamedTemporaryFile(suffix=".csv.gz") as temp_file: - df.to_csv(temp_file.name, compression="gzip", index=False) - - # Prepare the file and metadata for upload - with open(temp_file.name, "rb") as file: - multipart_data = MultipartEncoder( - fields={**data, "file": (temp_file.name, file, "application/gzip")} - ) - headers = {**self.header, "Content-Type": multipart_data.content_type} - - # Send the POST request with custom encoded multipart data - response = post(rankresponse_url, headers=headers, data=multipart_data) - - response.raise_for_status() - return response - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The RankResponseAPI does not support update.") - - def delete(self, id: str = "", **kwargs) -> Any: - """ - Delete one or more records from the database. - - :param id: The ID of the record to delete. However, you can also pass in - `ids` as a list of IDs to delete multiple records. This is why `id` is optional. - If neither `id` nor `ids` is provided, a ValueError is raised. - - :return: A dictionary with a status message indicating success or failure. - - :raises ValueError: If neither `id` nor `ids` is provided. 
- - """ - # Include the Authorization header with the token - headers = kwargs.get("headers", {}) - headers["Authorization"] = f"Token {self.token}" - - ids = kwargs.pop("ids", str(id)) - - # Determine if it's a single ID or multiple - if isinstance(ids, str) and str != "": - # Single ID deletion for backward compatibility - response = delete(f"{self.url}/{ids}/", headers=headers, **kwargs) - elif isinstance(ids, list) and ids: - # Bulk delete with a list of IDs - response = delete( - f"{self.url}/delete/", - headers=headers, - json={"ids": ids}, # Send the list of IDs in the request body - **kwargs, - ) - else: - raise ValueError( - "No ID(s) provided for deletion. Either pass a single ID with " - "`id` or a list of IDs with `ids = [1,2, ...]" - ) - - if response.status_code in [200, 204]: - return { - "status": "success", - "message": "RankResponse(s) deleted successfully.", - } - - # Raise an error if the response indicates failure - response.raise_for_status() diff --git a/tfbpapi/RegulatorAPI.py b/tfbpapi/RegulatorAPI.py deleted file mode 100644 index 675c002..0000000 --- a/tfbpapi/RegulatorAPI.py +++ /dev/null @@ -1,53 +0,0 @@ -import os -from typing import Any - -import pandas as pd - -from tfbpapi.AbstractRecordsOnlyAPI import AbstractRecordsOnlyAPI - - -class RegulatorAPI(AbstractRecordsOnlyAPI): - """A class to interact with the RegulatorAPI endpoint.""" - - def __init__(self, **kwargs): - """ - Initialize the RegulatorAPI object. - - :param kwargs: parameters to pass to AbstractAPI via AbstractRecordsOnlyAPI. - - """ - valid_param_keys = kwargs.pop( - "valid_param_keys", - [ - "id", - "regulator_locus_tag", - "regulator_symbol", - "under_development", - ], - ) - - url = kwargs.pop("url", os.getenv("REGULATOR_URL", None)) - if not url: - raise AttributeError( - "url must be provided or the environmental variable ", - "`REGULATOR_URL` must be set", - ) - - super().__init__(url=url, valid_keys=valid_param_keys, **kwargs) - - def create(self, data: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The RegulatorAPI does not support create.") - - def update(self, df: pd.DataFrame, **kwargs) -> Any: - raise NotImplementedError("The RegulatorAPI does not support update.") - - def delete(self, id: str, **kwargs) -> Any: - raise NotImplementedError("The RegulatorAPI does not support delete.") - - def submit(self, post_dict: dict[str, Any], **kwargs) -> Any: - raise NotImplementedError("The RegulatorAPI does not support submit.") - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - raise NotImplementedError("The RegulatorAPI does not support retrieve.") diff --git a/tfbpapi/UnivariateModelsAPI.py b/tfbpapi/UnivariateModelsAPI.py deleted file mode 100644 index d3bc632..0000000 --- a/tfbpapi/UnivariateModelsAPI.py +++ /dev/null @@ -1,202 +0,0 @@ -import asyncio -import json -import os -import time -from typing import Any - -import aiohttp -import pandas as pd -import requests # type: ignore - -from tfbpapi.AbstractRecordsOnlyAPI import AbstractRecordsOnlyAPI - - -class UnivariateModelsAPI(AbstractRecordsOnlyAPI): - """ - A class to interact with the UnivariateModels API. - - Retrieves univariatemodels data from the database. - - """ - - def __init__(self, **kwargs) -> None: - """ - Initialize the UnivariateModels object. This will serve as an interface to the - UnivariateModels endpoint of both the database and the application cache. 
- - :param url: The URL of the UnivariateModels API - :param kwargs: Additional parameters to pass to AbstractAPI. - - """ - - self.bulk_update_url_suffix = kwargs.pop( - "bulk_update_url_suffix", "bulk-update" - ) - - super().__init__( - url=kwargs.pop("url", os.getenv("UNIVARIATEMODELS_URL", "")), - **kwargs, - ) - - async def submit( - self, - post_dict: dict[str, Any], - **kwargs, - ) -> Any: - """ - Submit a UnivariateModels task to the UnivariateModels API. - - :param post_dict: The dictionary to submit to the UnivariateModels API. The - typing needs to be adjusted -- it can take a list of dictionaries to submit - a batch. - :return: The group_task_id of the submitted task. - - """ - # make a post request with the post_dict to univariatemodels_url - univariatemodels_url = f"{self.url.rstrip('/')}/submit/" - self.logger.debug("univariatemodels_url: %s", univariatemodels_url) - - async with aiohttp.ClientSession() as session: - async with session.post( - univariatemodels_url, headers=self.header, json=post_dict - ) as response: - try: - response.raise_for_status() - except aiohttp.ClientResponseError as e: - self.logger.error( - "Failed to submit UnivariateModels task: Status %s, Reason %s", - e.status, - e.message, - ) - raise - result = await response.json() - try: - return result["group_task_id"] - except KeyError: - self.logger.error( - "Expected 'group_task_id' in response: %s", json.dumps(result) - ) - raise - - async def retrieve( - self, - group_task_id: str, - timeout: int = 300, - polling_interval: int = 2, - **kwargs, - ) -> dict[str, pd.DataFrame]: - """ - Periodically check the task status and retrieve the result when the task - completes. - - :param group_task_id: The task ID to retrieve results for. - :param timeout: The maximum time to wait for the task to complete (in seconds). - :param polling_interval: The time to wait between status checks (in seconds). - :return: Records from the UnivariateModels API of the successfully completed - task. - - """ - # Start time for timeout check - start_time = time.time() - - # Task status URL - status_url = f"{self.url.rstrip('/')}/status/" - - while True: - async with aiohttp.ClientSession() as session: - # Send a GET request to check the task status - async with session.get( - status_url, - headers=self.header, - params={"group_task_id": group_task_id}, - ) as response: - response.raise_for_status() # Raise an error for bad status codes - status_response = await response.json() - - # Check if the task is complete - if status_response.get("status") == "SUCCESS": - - if error_tasks := status_response.get("error_tasks"): - self.logger.error( - f"Tasks {group_task_id} failed: {error_tasks}" - ) - if success_tasks := status_response.get("success_pks"): - params = {"id": ",".join(str(pk) for pk in success_tasks)} - return await self.read(params=params) - elif status_response.get("status") == "FAILURE": - raise Exception( - f"Task {group_task_id} failed: {status_response}" - ) - - # Check if we have reached the timeout - elapsed_time = time.time() - start_time - if elapsed_time > timeout: - raise TimeoutError( - f"Task {group_task_id} did not " - "complete within {timeout} seconds." 
- ) - - # Wait for the specified polling interval before checking again - await asyncio.sleep(polling_interval) - - def create(self, data: dict[str, Any], **kwargs) -> requests.Response: - raise NotImplementedError("The UnivariateModels does not support create.") - - def update(self, df: pd.DataFrame, **kwargs: Any) -> requests.Response: - """ - Update the records in the database. - - :param df: The DataFrame containing the records to update. - :type df: pd.DataFrame - :param kwargs: Additional fields to include in the payload. - :type kwargs: Any - :return: The response from the POST request. - :rtype: requests.Response - :raises requests.RequestException: If the request fails. - - """ - bulk_update_url = ( - f"{self.url.rstrip('/')}/{self.bulk_update_url_suffix.rstrip('/')}/" - ) - - self.logger.debug("bulk_update_url: %s", bulk_update_url) - - # Include additional fields in the payload if provided - payload = {"data": df.to_dict(orient="records")} - payload.update(kwargs) - - try: - response = requests.post( - bulk_update_url, - headers=self.header, - json=payload, - ) - response.raise_for_status() - return response - except requests.RequestException as e: - self.logger.error(f"Error in POST request: {e}") - raise - - def delete(self, id: str, **kwargs) -> Any: - """ - Delete a UnivariateModels record from the database. - - :param id: The ID of the UnivariateModels record to delete. - :return: A dictionary with a status message indicating success or failure. - - """ - # Include the Authorization header with the token - headers = kwargs.get("headers", {}) - headers["Authorization"] = f"Token {self.token}" - - # Make the DELETE request with the updated headers - response = requests.delete(f"{self.url}/{id}/", headers=headers, **kwargs) - - if response.status_code == 204: - return { - "status": "success", - "message": "UnivariateModels deleted successfully.", - } - - # Raise an error if the response indicates failure - response.raise_for_status() diff --git a/tfbpapi/__init__.py b/tfbpapi/__init__.py index 8c0f3be..f9db664 100644 --- a/tfbpapi/__init__.py +++ b/tfbpapi/__init__.py @@ -1,39 +1,33 @@ -from .BindingAPI import BindingAPI -from .BindingConcatenatedAPI import BindingConcatenatedAPI -from .BindingManualQCAPI import BindingManualQCAPI -from .CallingCardsBackgroundAPI import CallingCardsBackgroundAPI -from .DataSourceAPI import DataSourceAPI -from .DtoAPI import DtoAPI -from .ExpressionAPI import ExpressionAPI -from .ExpressionManualQCAPI import ExpressionManualQCAPI -from .FileFormatAPI import FileFormatAPI -from .GenomicFeatureAPI import GenomicFeatureAPI -from .metric_arrays import metric_arrays -from .PromoterSetAPI import PromoterSetAPI -from .PromoterSetSigAPI import PromoterSetSigAPI -from .rank_transforms import shifted_negative_log_ranks, stable_rank, transform -from .RankResponseAPI import RankResponseAPI -from .RegulatorAPI import RegulatorAPI -from .UnivariateModelsAPI import UnivariateModelsAPI +from .datacard import DataCard +from .fetchers import HfDataCardFetcher, HfRepoStructureFetcher, HfSizeInfoFetcher +from .hf_cache_manager import HfCacheManager +from .models import ( + DatasetCard, + DatasetConfig, + DatasetType, + ExtractedMetadata, + FeatureInfo, + MetadataConfig, + MetadataRelationship, + PropertyMapping, + RepositoryConfig, +) +from .virtual_db import VirtualDB __all__ = [ - "BindingAPI", - "BindingConcatenatedAPI", - "BindingManualQCAPI", - "CallingCardsBackgroundAPI", - "DataSourceAPI", - "DtoAPI", - "ExpressionAPI", - "ExpressionManualQCAPI", - 
"FileFormatAPI", - "GenomicFeatureAPI", - "metric_arrays", - "transform", - "PromoterSetAPI", - "PromoterSetSigAPI", - "RankResponseAPI", - "RegulatorAPI", - "stable_rank", - "shifted_negative_log_ranks", - "UnivariateModelsAPI", + "DataCard", + "HfCacheManager", + "HfDataCardFetcher", + "HfRepoStructureFetcher", + "HfSizeInfoFetcher", + "MetadataConfig", + "PropertyMapping", + "RepositoryConfig", + "VirtualDB", + "DatasetCard", + "DatasetConfig", + "DatasetType", + "ExtractedMetadata", + "FeatureInfo", + "MetadataRelationship", ] diff --git a/tfbpapi/constants.py b/tfbpapi/constants.py new file mode 100644 index 0000000..749678f --- /dev/null +++ b/tfbpapi/constants.py @@ -0,0 +1,11 @@ +import os +from pathlib import Path + +from huggingface_hub.constants import HF_HUB_CACHE + +CACHE_DIR = Path(os.getenv("HF_CACHE_DIR", HF_HUB_CACHE)) + + +def get_hf_token() -> str | None: + """Get HuggingFace token from environment variable.""" + return os.getenv("HF_TOKEN") diff --git a/tfbpapi/datacard.py b/tfbpapi/datacard.py new file mode 100644 index 0000000..b8798fc --- /dev/null +++ b/tfbpapi/datacard.py @@ -0,0 +1,492 @@ +""" +DataCard class for parsing and exploring HuggingFace dataset metadata. + +This module provides the DataCard class for parsing HuggingFace dataset cards +into structured Python objects that can be easily explored. The focus is on +enabling users to drill down into the YAML structure to understand: + +- Dataset configurations and their types +- Feature definitions and roles +- Experimental conditions at all hierarchy levels (top/config/field) +- Field-level condition definitions +- Metadata relationships + +Users can then use this information to plan metadata table structures and +data loading strategies. + +""" + +import logging +from typing import Any + +from pydantic import ValidationError + +from tfbpapi.errors import DataCardError, DataCardValidationError, HfDataFetchError +from tfbpapi.fetchers import ( + HfDataCardFetcher, + HfRepoStructureFetcher, + HfSizeInfoFetcher, +) +from tfbpapi.models import ( + DatasetCard, + DatasetConfig, + ExtractedMetadata, + FeatureInfo, + MetadataRelationship, +) + + +class DataCard: + """ + Parser and explorer for HuggingFace dataset metadata. + + The parsed structure uses Pydantic models with `extra="allow"` to accept + arbitrary fields (like experimental_conditions) without requiring code + changes. + + Key capabilities: + - Parse dataset card YAML into structured objects + - Navigate experimental conditions at 3 levels (top/config/field) + - Explore field definitions and roles + - Extract metadata schema for table design + - Discover metadata relationships + + Example: + >>> card = DataCard("BrentLab/harbison_2004") + >>> # Use context manager for config exploration + >>> with card.config("harbison_2004") as cfg: + ... # Get all experimental conditions + ... conds = cfg.experimental_conditions() + ... # Get condition fields with definitions + ... fields = cfg.condition_fields() + ... # Drill down into specific field + ... for name, info in fields.items(): + ... for value, definition in info['definitions'].items(): + ... print(f"{name}={value}: {definition}") + + Example (legacy API still supported): + >>> card = DataCard("BrentLab/harbison_2004") + >>> conditions = card.get_experimental_conditions("harbison_2004") + >>> defs = card.get_field_definitions("harbison_2004", "condition") + + """ + + def __init__(self, repo_id: str, token: str | None = None): + """ + Initialize DataCard for a repository. 
+ + :param repo_id: HuggingFace repository identifier (e.g., "user/dataset") + :param token: Optional HuggingFace token for authentication + + """ + self.repo_id = repo_id + self.token = token + self.logger = logging.getLogger(self.__class__.__name__) + + # Initialize fetchers + self._card_fetcher = HfDataCardFetcher(token=token) + self._structure_fetcher = HfRepoStructureFetcher(token=token) + self._size_fetcher = HfSizeInfoFetcher(token=token) + + # Cache for parsed card + self._dataset_card: DatasetCard | None = None + self._metadata_cache: dict[str, list[ExtractedMetadata]] = {} + + @property + def dataset_card(self) -> DatasetCard: + """Get the validated dataset card.""" + if self._dataset_card is None: + self._load_and_validate_card() + # this is here for type checking purposes. _load_and_validate_card() + # will either set the _dataset_card or raise an error + assert self._dataset_card is not None + return self._dataset_card + + def _load_and_validate_card(self) -> None: + """Load and validate the dataset card from HuggingFace.""" + try: + self.logger.debug(f"Loading dataset card for {self.repo_id}") + card_data = self._card_fetcher.fetch(self.repo_id) + + if not card_data: + raise DataCardValidationError( + f"No dataset card found for {self.repo_id}" + ) + + # Validate using Pydantic model + self._dataset_card = DatasetCard(**card_data) + self.logger.debug(f"Successfully validated dataset card for {self.repo_id}") + + except ValidationError as e: + # Create a more user-friendly error message + error_details = [] + for error in e.errors(): + field_path = " -> ".join(str(x) for x in error["loc"]) + error_type = error["type"] + error_msg = error["msg"] + input_value = error.get("input", "N/A") + + if "dtype" in field_path and error_type == "string_type": + error_details.append( + f"Field '{field_path}': Expected a simple data type " + "string (like 'string', 'int64', 'float64') " + "but got a complex structure. This might be a categorical " + "field with class labels. " + f"Actual value: {input_value}" + ) + else: + error_details.append( + f"Field '{field_path}': {error_msg} (got: {input_value})" + ) + + detailed_msg = ( + f"Dataset card validation failed for {self.repo_id}:\n" + + "\n".join(f" - {detail}" for detail in error_details) + ) + self.logger.error(detailed_msg) + raise DataCardValidationError(detailed_msg) from e + except HfDataFetchError as e: + raise DataCardError(f"Failed to fetch dataset card: {e}") from e + + @property + def configs(self) -> list[DatasetConfig]: + """Get all dataset configurations.""" + return self.dataset_card.configs + + def get_config(self, config_name: str) -> DatasetConfig | None: + """Get a specific configuration by name.""" + return self.dataset_card.get_config_by_name(config_name) + + def get_features(self, config_name: str) -> list[FeatureInfo]: + """ + Get all feature definitions for a configuration. 
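# A sketch of consuming the validation behaviour implemented in
# _load_and_validate_card() above: DataCardValidationError (a subclass of
# DataCardError) carries the per-field messages assembled there, so callers
# can distinguish a malformed card from a fetch failure.
from tfbpapi import DataCard
from tfbpapi.errors import DataCardError, DataCardValidationError

try:
    card = DataCard("BrentLab/harbison_2004")
    configs = card.configs  # first access triggers the lazy load + validation
except DataCardValidationError as err:
    print(f"Card YAML failed validation:\n{err}")
except DataCardError as err:
    print(f"Could not fetch the dataset card: {err}")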
+ + :param config_name: Configuration name + :return: List of FeatureInfo objects + :raises DataCardError: If config not found + + """ + config = self.get_config(config_name) + if not config: + raise DataCardError(f"Configuration '{config_name}' not found") + + return config.dataset_info.features + + def _extract_partition_values( + self, config: DatasetConfig, field_name: str + ) -> set[str]: + """Extract values from partition structure.""" + if ( + not config.dataset_info.partitioning + or not config.dataset_info.partitioning.enabled + ): + return set() + + partition_columns = config.dataset_info.partitioning.partition_by or [] + if field_name not in partition_columns: + return set() + + try: + # Get partition values from repository structure + partition_values = self._structure_fetcher.get_partition_values( + self.repo_id, field_name + ) + return set(partition_values) + except HfDataFetchError: + self.logger.warning(f"Failed to extract partition values for {field_name}") + return set() + + def get_metadata_relationships( + self, refresh_cache: bool = False + ) -> list[MetadataRelationship]: + """ + Get relationships between data configs and their metadata. + + :param refresh_cache: If True, force refresh dataset card from remote + + """ + # Clear cached dataset card if refresh requested + if refresh_cache: + self._dataset_card = None + + relationships = [] + data_configs = self.dataset_card.get_data_configs() + metadata_configs = self.dataset_card.get_metadata_configs() + + for data_config in data_configs: + # Check for explicit applies_to relationships + for meta_config in metadata_configs: + if ( + meta_config.applies_to + and data_config.config_name in meta_config.applies_to + ): + relationships.append( + MetadataRelationship( + data_config=data_config.config_name, + metadata_config=meta_config.config_name, + relationship_type="explicit", + ) + ) + + # Check for embedded metadata (always runs regardless of + # explicit relationships) + if data_config.metadata_fields: + relationships.append( + MetadataRelationship( + data_config=data_config.config_name, + metadata_config=f"{data_config.config_name}_embedded", + relationship_type="embedded", + ) + ) + + return relationships + + def get_repository_info(self) -> dict[str, Any]: + """Get general repository information.""" + card = self.dataset_card + + try: + structure = self._structure_fetcher.fetch(self.repo_id) + total_files = structure.get("total_files", 0) + last_modified = structure.get("last_modified") + except HfDataFetchError: + total_files = None + last_modified = None + + return { + "repo_id": self.repo_id, + "pretty_name": card.pretty_name, + "license": card.license, + "tags": card.tags, + "language": card.language, + "size_categories": card.size_categories, + "num_configs": len(card.configs), + "dataset_types": [config.dataset_type.value for config in card.configs], + "total_files": total_files, + "last_modified": last_modified, + "has_default_config": self.dataset_card.get_default_config() is not None, + } + + def extract_metadata_schema(self, config_name: str) -> dict[str, Any]: + """ + Extract complete metadata schema for planning metadata table structure. + + This is the primary method for understanding what metadata is available and + how to structure it into a metadata table. It consolidates information from + all sources: + + - **Field roles**: Which fields are regulators, targets, conditions, etc. 
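# A sketch of inspecting the relationships computed by
# get_metadata_relationships() above: "explicit" links come from a metadata
# config's applies_to list, while "embedded" links are synthesized for data
# configs that declare their own metadata_fields.
from tfbpapi import DataCard

card = DataCard("BrentLab/harbison_2004")
for rel in card.get_metadata_relationships():
    print(f"{rel.data_config} -> {rel.metadata_config} ({rel.relationship_type})")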
+ - **Top-level conditions**: Repo-wide conditions (constant for all samples) + - **Config-level conditions**: Config-specific conditions + (constant for this config) + - **Field-level definitions**: Per-sample condition definitions + + The returned schema provides all the information needed to: + 1. Identify sample identifier fields (regulator_identifier, etc.) + 2. Determine which conditions are constant vs. variable + 3. Access condition definitions for creating flattened columns + 4. Plan metadata table structure + + :param config_name: Configuration name to extract schema for + :return: Dict with comprehensive schema including: + - regulator_fields: List of regulator identifier field names + - target_fields: List of target identifier field names + - condition_fields: List of experimental_condition field names + - condition_definitions: Dict mapping field -> value -> definition + - top_level_conditions: Dict of repo-wide conditions + - config_level_conditions: Dict of config-specific conditions + :raises DataCardError: If configuration not found + + Example: + >>> schema = card.extract_metadata_schema('harbison_2004') + >>> # Identify identifier fields + >>> print(f"Regulator fields: {schema['regulator_fields']}") + >>> # Check for constant conditions + >>> if schema['top_level_conditions']: + ... print("Has repo-wide constant conditions") + >>> # Get field-level definitions for metadata table + >>> for field in schema['condition_fields']: + ... defs = schema['condition_definitions'][field] + ... print(f"{field} has {len(defs)} levels") + + """ + config = self.get_config(config_name) + if not config: + raise DataCardError(f"Configuration '{config_name}' not found") + + schema: dict[str, Any] = { + "regulator_fields": [], # Fields with role=regulator_identifier + "target_fields": [], # Fields with role=target_identifier + "condition_fields": [], # Fields with role=experimental_condition + "condition_definitions": {}, # Field-level condition details + "top_level_conditions": None, # Repo-level conditions + "config_level_conditions": None, # Config-level conditions + } + + for feature in config.dataset_info.features: + if feature.role == "regulator_identifier": + schema["regulator_fields"].append(feature.name) + elif feature.role == "target_identifier": + schema["target_fields"].append(feature.name) + elif feature.role == "experimental_condition": + schema["condition_fields"].append(feature.name) + if feature.definitions: + schema["condition_definitions"][feature.name] = feature.definitions + + # Add top-level conditions (applies to all configs/samples) + # Stored in model_extra as dict + if self.dataset_card.model_extra: + top_level = self.dataset_card.model_extra.get("experimental_conditions") + if top_level: + schema["top_level_conditions"] = top_level + + # Add config-level conditions (applies to this config's samples) + # Stored in model_extra as dict + if config.model_extra: + config_level = config.model_extra.get("experimental_conditions") + if config_level: + schema["config_level_conditions"] = config_level + + return schema + + def get_experimental_conditions( + self, config_name: str | None = None + ) -> dict[str, Any]: + """ + Get experimental conditions with proper hierarchy handling. 
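# A sketch (not part of the module) of the metadata-table planning step the
# extract_metadata_schema() docstring describes: flatten the per-field
# condition definitions into one row per (field, value) pair.
import pandas as pd

from tfbpapi import DataCard

card = DataCard("BrentLab/harbison_2004")
schema = card.extract_metadata_schema("harbison_2004")
rows = [
    {"field": field, "value": value, "definition": definition}
    for field, defs in schema["condition_definitions"].items()
    for value, definition in defs.items()
]
metadata_table = pd.DataFrame(rows, columns=["field", "value", "definition"])
print(metadata_table.head())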
+ + This method enables drilling down into the experimental conditions hierarchy: + - Top-level (repo-wide): Common to all configs/samples + - Config-level: Specific to a config, common to its samples + - Field-level: Per-sample variation (use get_field_definitions instead) + + Returns experimental conditions at the appropriate level: + - If config_name is None: returns top-level (repo-wide) conditions only + - If config_name is provided: returns merged (top + config) conditions + + All conditions are returned as flexible dicts that preserve the original + YAML structure. Navigate nested dicts to access specific values. + + :param config_name: Optional config name. If provided, merges top + and config levels + :return: Dict of experimental conditions (empty dict if none defined) + + Example: + >>> # Get top-level conditions + >>> top = card.get_experimental_conditions() + >>> temp = top.get('temperature_celsius', 30) + >>> + >>> # Get merged conditions for a config + >>> merged = card.get_experimental_conditions('config_name') + >>> media = merged.get('media', {}) + >>> media_name = media.get('name', 'unspecified') + + """ + # Get top-level conditions (stored in model_extra) + top_level = ( + self.dataset_card.model_extra.get("experimental_conditions", {}) + if self.dataset_card.model_extra + else {} + ) + + # If no config specified, return top-level only + if config_name is None: + return top_level.copy() if isinstance(top_level, dict) else {} + + # Get config-level conditions + config = self.get_config(config_name) + if not config: + raise DataCardError(f"Configuration '{config_name}' not found") + + config_level = ( + config.model_extra.get("experimental_conditions", {}) + if config.model_extra + else {} + ) + + # Merge: config-level overrides top-level + merged = {} + if isinstance(top_level, dict): + merged.update(top_level) + if isinstance(config_level, dict): + merged.update(config_level) + + return merged + + def get_field_definitions( + self, config_name: str, field_name: str + ) -> dict[str, Any]: + """ + Get definitions for a specific field (field-level conditions). + + This is the third level of the experimental conditions hierarchy - conditions + that vary per sample. Returns a dict mapping each possible field value to its + detailed specification. + + For fields with role=experimental_condition, the definitions typically include + nested structures like media composition, temperature, treatments, etc. that + define what each categorical value means experimentally. 
+
+        :param config_name: Configuration name
+        :param field_name: Field name (typically has role=experimental_condition)
+        :return: Dict mapping field values to their definition dicts
+            (empty if no definitions)
+        :raises DataCardError: If config or field not found
+
+        Example:
+            >>> # Get condition definitions
+            >>> defs = card.get_field_definitions('harbison_2004', 'condition')
+            >>> # defs = {'YPD': {...}, 'HEAT': {...}, ...}
+            >>>
+            >>> # Drill down into a specific condition
+            >>> ypd = defs['YPD']
+            >>> env_conds = ypd.get('environmental_conditions', {})
+            >>> media = env_conds.get('media', {})
+            >>> media_name = media.get('name')
+
+        """
+        config = self.get_config(config_name)
+        if not config:
+            raise DataCardError(f"Configuration '{config_name}' not found")
+
+        # Find the feature
+        feature = None
+        for f in config.dataset_info.features:
+            if f.name == field_name:
+                feature = f
+                break
+
+        if not feature:
+            raise DataCardError(
+                f"Field '{field_name}' not found in config '{config_name}'"
+            )
+
+        # Return definitions if present, otherwise empty dict
+        return feature.definitions if feature.definitions else {}
+
+    def summary(self) -> str:
+        """Get a human-readable summary of the dataset."""
+        card = self.dataset_card
+        info = self.get_repository_info()
+
+        lines = [
+            f"Dataset: {card.pretty_name or self.repo_id}",
+            f"Repository: {self.repo_id}",
+            f"License: {card.license or 'Not specified'}",
+            f"Configurations: {len(card.configs)}",
+            f"Dataset Types: {', '.join(info['dataset_types'])}",
+        ]
+
+        if card.tags:
+            lines.append(f"Tags: {', '.join(card.tags)}")
+
+        # Add config summaries
+        lines.append("\nConfigurations:")
+        for config in card.configs:
+            default_mark = " (default)" if config.default else ""
+            lines.append(
+                f"  - {config.config_name}: {config.dataset_type.value}{default_mark}"
+            )
+            lines.append(f"    {config.description}")
+
+        return "\n".join(lines)
diff --git a/tfbpapi/errors.py b/tfbpapi/errors.py
new file mode 100644
index 0000000..cbacc92
--- /dev/null
+++ b/tfbpapi/errors.py
@@ -0,0 +1,39 @@
+"""Custom exception classes for dataset management."""
+
+from typing import Any
+
+
+class HfDataFetchError(Exception):
+    """Raised when HuggingFace API requests fail."""
+
+    def __init__(
+        self,
+        message: str,
+        repo_id: str | None = None,
+        status_code: int | None = None,
+        endpoint: str | None = None,
+    ):
+        super().__init__(message)
+        self.repo_id = repo_id
+        self.status_code = status_code
+        self.endpoint = endpoint
+
+
+class DataCardError(Exception):
+    """Base exception for DataCard operations."""
+
+    pass
+
+
+class DataCardValidationError(DataCardError):
+    """Exception raised when dataset card validation fails."""
+
+    def __init__(
+        self,
+        message: str,
+        repo_id: str | None = None,
+        validation_errors: list[Any] | None = None,
+    ):
+        super().__init__(message)
+        self.repo_id = repo_id
+        self.validation_errors = validation_errors or []
diff --git a/tfbpapi/fetchers.py b/tfbpapi/fetchers.py
new file mode 100644
index 0000000..c8d978f
--- /dev/null
+++ b/tfbpapi/fetchers.py
@@ -0,0 +1,244 @@
+"""Data fetchers for HuggingFace Hub integration."""
+
+import logging
+import re
+from typing import Any
+
+import requests
+from huggingface_hub import DatasetCard, repo_info
+from requests import HTTPError
+
+from tfbpapi.constants import get_hf_token
+from tfbpapi.errors import HfDataFetchError
+
+
+class HfDataCardFetcher:
+    """Handles fetching dataset cards from HuggingFace Hub."""
+
+    def __init__(self, token: str | None = None):
+        """
+        Initialize the fetcher.
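+
+        A typical round trip looks like this (illustrative; whether the card
+        data contains a 'configs' key depends on the card itself):
+
+            >>> fetcher = HfDataCardFetcher()  # token resolved via get_hf_token()
+            >>> card_data = fetcher.fetch("BrentLab/harbison_2004")
+            >>> card_data.get("configs", [])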
+ + :param token: HuggingFace token for authentication + + """ + self.logger = logging.getLogger(self.__class__.__name__) + self.token = token or get_hf_token() + + def fetch(self, repo_id: str, repo_type: str = "dataset") -> dict[str, Any]: + """ + Fetch and return dataset card data. + + :param repo_id: Repository identifier (e.g., "user/dataset") + :param repo_type: Type of repository ("dataset", "model", "space") + :return: Dataset card data as dictionary + :raises HfDataFetchError: If fetching fails + + """ + try: + self.logger.debug(f"Fetching dataset card for {repo_id}") + card = DatasetCard.load(repo_id, repo_type=repo_type, token=self.token) + + if not card.data: + self.logger.warning(f"Dataset card for {repo_id} has no data section") + return {} + + return card.data.to_dict() + + except Exception as e: + error_msg = f"Failed to fetch dataset card for {repo_id}: {e}" + self.logger.error(error_msg) + raise HfDataFetchError(error_msg) from e + + +class HfSizeInfoFetcher: + """Handles fetching size information from HuggingFace Dataset Server API.""" + + def __init__(self, token: str | None = None): + """ + Initialize the fetcher. + + :param token: HuggingFace token for authentication + + """ + self.logger = logging.getLogger(self.__class__.__name__) + self.token = token or get_hf_token() + self.base_url = "https://datasets-server.huggingface.co" + + def _build_headers(self) -> dict[str, str]: + """Build request headers with authentication if available.""" + headers = {"User-Agent": "TFBP-API/1.0"} + if self.token: + headers["Authorization"] = f"Bearer {self.token}" + return headers + + def fetch(self, repo_id: str) -> dict[str, Any]: + """ + Fetch dataset size information. + + :param repo_id: Repository identifier (e.g., "user/dataset") + :return: Size information as dictionary + :raises HfDataFetchError: If fetching fails + + """ + url = f"{self.base_url}/size" + params = {"dataset": repo_id} + headers = self._build_headers() + + try: + self.logger.debug(f"Fetching size info for {repo_id}") + response = requests.get(url, params=params, headers=headers, timeout=30) + response.raise_for_status() + + data = response.json() + self.logger.debug(f"Size info fetched successfully for {repo_id}") + return data + + except HTTPError as e: + if e.response.status_code == 404: + error_msg = f"Dataset {repo_id} not found" + elif e.response.status_code == 403: + error_msg = ( + f"Access denied to dataset {repo_id} (check token permissions)" + ) + else: + error_msg = f"HTTP error fetching size for {repo_id}: {e}" + + self.logger.error(error_msg) + raise HfDataFetchError(error_msg) from e + + except requests.RequestException as e: + error_msg = f"Request failed fetching size for {repo_id}: {e}" + self.logger.error(error_msg) + raise HfDataFetchError(error_msg) from e + + except ValueError as e: + error_msg = f"Invalid JSON response fetching size for {repo_id}: {e}" + self.logger.error(error_msg) + raise HfDataFetchError(error_msg) from e + + +class HfRepoStructureFetcher: + """Handles fetching repository structure from HuggingFace Hub.""" + + def __init__(self, token: str | None = None): + """ + Initialize the fetcher. + + :param token: HuggingFace token for authentication + + """ + self.logger = logging.getLogger(self.__class__.__name__) + self.token = token or get_hf_token() + self._cached_structure: dict[str, dict[str, Any]] = {} + + def fetch(self, repo_id: str, force_refresh: bool = False) -> dict[str, Any]: + """ + Fetch repository structure information. 
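+
+        The result is a plain dict; its keys are sketched below (repo name
+        illustrative):
+
+            >>> structure = fetcher.fetch("BrentLab/harbison_2004")
+            >>> sorted(structure.keys())
+            ['files', 'last_modified', 'partitions', 'repo_id', 'total_files']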
+ + :param repo_id: Repository identifier (e.g., "user/dataset") + :param force_refresh: If True, bypass cache and fetch fresh data + :return: Repository structure information + :raises HfDataFetchError: If fetching fails + + """ + # Check cache first unless force refresh is requested + if not force_refresh and repo_id in self._cached_structure: + self.logger.debug(f"Using cached repo structure for {repo_id}") + return self._cached_structure[repo_id] + + try: + self.logger.debug(f"Fetching repo structure for {repo_id}") + info = repo_info(repo_id=repo_id, repo_type="dataset", token=self.token) + + # Extract file structure + files = [] + partitions: dict[str, set] = {} + + for sibling in info.siblings or []: + file_info = { + "path": sibling.rfilename, + "size": sibling.size, + "is_lfs": sibling.lfs is not None, + } + files.append(file_info) + + # Extract partition information from file paths + self._extract_partition_info(sibling.rfilename, partitions) + + result = { + "repo_id": repo_id, + "files": files, + "partitions": partitions, + "total_files": len(files), + "last_modified": ( + info.last_modified.isoformat() if info.last_modified else None + ), + } + + # Cache the result + self._cached_structure[repo_id] = result + return result + + except Exception as e: + error_msg = f"Failed to fetch repo structure for {repo_id}: {e}" + self.logger.error(error_msg) + raise HfDataFetchError(error_msg) from e + + def _extract_partition_info( + self, file_path: str, partitions: dict[str, set[str]] + ) -> None: + """ + Extract partition information from file paths. + + :param file_path: Path to analyze for partitions + :param partitions: Dictionary to update with partition info + + """ + # Look for partition patterns like "column=value" in path + partition_pattern = r"([^/=]+)=([^/]+)" + matches = re.findall(partition_pattern, file_path) + + for column, value in matches: + if column not in partitions: + partitions[column] = set() + partitions[column].add(value) + + def get_partition_values( + self, repo_id: str, partition_column: str, force_refresh: bool = False + ) -> list[str]: + """ + Get all values for a specific partition column. + + :param repo_id: Repository identifier + :param partition_column: Name of the partition column + :param force_refresh: If True, bypass cache and fetch fresh data + :return: List of unique partition values + :raises HfDataFetchError: If fetching fails + + """ + structure = self.fetch(repo_id, force_refresh=force_refresh) + partition_values = structure.get("partitions", {}).get(partition_column, set()) + return sorted(list(partition_values)) + + def get_dataset_files( + self, repo_id: str, path_pattern: str | None = None, force_refresh: bool = False + ) -> list[dict[str, Any]]: + """ + Get dataset files, optionally filtered by path pattern. 
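+
+        For example, to keep only parquet files (the pattern is an ordinary
+        regex, applied with re.search):
+
+            >>> files = fetcher.get_dataset_files(
+            ...     "BrentLab/harbison_2004", path_pattern=r"\.parquet$"
+            ... )
+            >>> [f["path"] for f in files]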
+ + :param repo_id: Repository identifier + :param path_pattern: Optional regex pattern to filter files + :param force_refresh: If True, bypass cache and fetch fresh data + :return: List of matching files + :raises HfDataFetchError: If fetching fails + + """ + structure = self.fetch(repo_id, force_refresh=force_refresh) + files = structure["files"] + + if path_pattern: + pattern = re.compile(path_pattern) + files = [f for f in files if pattern.search(f["path"])] + + return files diff --git a/tfbpapi/hf_cache_manager.py b/tfbpapi/hf_cache_manager.py new file mode 100644 index 0000000..26ca708 --- /dev/null +++ b/tfbpapi/hf_cache_manager.py @@ -0,0 +1,631 @@ +import logging +from datetime import datetime, timedelta +from pathlib import Path +from typing import Any, Literal + +import duckdb +from huggingface_hub import scan_cache_dir, try_to_load_from_cache +from huggingface_hub.utils import DeleteCacheStrategy + +from tfbpapi.datacard import DataCard + + +class HfCacheManager(DataCard): + """Enhanced cache management for Hugging Face Hub with metadata-focused + retrieval.""" + + def __init__( + self, + repo_id: str, + duckdb_conn: duckdb.DuckDBPyConnection, + token: str | None = None, + logger: logging.Logger | None = None, + ): + super().__init__(repo_id, token) + self.duckdb_conn = duckdb_conn + self.logger = logger or logging.getLogger(__name__) + + def _get_metadata_for_config( + self, config, force_refresh: bool = False + ) -> dict[str, Any]: + """ + Get metadata for a specific configuration using 3-case strategy. + + :param config: Configuration object to process + :param force_refresh: If True, skip cache checks and download fresh from remote + + """ + config_result = { + "config_name": config.config_name, + "strategy": None, + "table_name": None, + "success": False, + "message": "", + } + + table_name = f"metadata_{config.config_name}" + + try: + # Skip cache checks if force_refresh is True + if not force_refresh: + # Case 1: Check if metadata already exists in DuckDB + if self._check_metadata_exists_in_duckdb(table_name): + config_result.update( + { + "strategy": "duckdb_exists", + "table_name": table_name, + "success": True, + "message": f"Metadata table {table_name} " + "already exists in DuckDB", + } + ) + return config_result + + # Case 2: Check if HF data is in cache, create DuckDB representation + if self._load_metadata_from_cache(config, table_name): + config_result.update( + { + "strategy": "cache_loaded", + "table_name": table_name, + "success": True, + "message": "Loaded metadata from cache " + f"into table {table_name}", + } + ) + return config_result + + # Case 3: Download from HF (explicit vs embedded) + if self._download_and_load_metadata(config, table_name): + config_result.update( + { + "strategy": "downloaded", + "table_name": table_name, + "success": True, + "message": "Downloaded and loaded metadata " + f"into table {table_name}", + } + ) + return config_result + + config_result["message"] = ( + f"Failed to retrieve metadata for {config.config_name}" + ) + + except Exception as e: + config_result["message"] = f"Error processing {config.config_name}: {e}" + self.logger.error(f"Error in metadata config {config.config_name}: {e}") + + return config_result + + def _check_metadata_exists_in_duckdb(self, table_name: str) -> bool: + """Case 1: Check if metadata table already exists in DuckDB database.""" + try: + # Query information schema to check if table exists + result = self.duckdb_conn.execute( + "SELECT table_name FROM information_schema.tables WHERE table_name = 
?", + [table_name], + ).fetchone() + + exists = result is not None + if exists: + self.logger.debug(f"Table {table_name} already exists in DuckDB") + return exists + + except Exception as e: + self.logger.debug(f"Error checking DuckDB table existence: {e}") + return False + + def _load_metadata_from_cache(self, config, table_name: str) -> bool: + """Case 2: HF data in cache, create DuckDB representation.""" + try: + # Check if metadata files are cached locally + cached_files = [] + for data_file in config.data_files: + cached_path = try_to_load_from_cache( + repo_id=self.repo_id, + filename=data_file.path, + repo_type="dataset", + ) + + if isinstance(cached_path, str) and Path(cached_path).exists(): + cached_files.append(cached_path) + + if not cached_files: + self.logger.debug(f"No cached files found for {config.config_name}") + return False + + # Load cached parquet files into DuckDB + self._create_duckdb_table_from_files( + cached_files, table_name, config.config_name + ) + self.logger.info( + f"Loaded {len(cached_files)} cached files into {table_name}" + ) + return True + + except Exception as e: + self.logger.debug(f"Error loading from cache for {config.config_name}: {e}") + return False + + def _download_and_load_metadata(self, config, table_name: str) -> bool: + """Case 3: Download from HF (explicit vs embedded).""" + try: + from huggingface_hub import snapshot_download + + # Download specific files for this metadata config + file_patterns = [data_file.path for data_file in config.data_files] + + downloaded_path = snapshot_download( + repo_id=self.repo_id, + repo_type="dataset", + allow_patterns=file_patterns, + token=self.token, + ) + + # Find downloaded parquet files + downloaded_files = [] + for pattern in file_patterns: + file_path = Path(downloaded_path) / pattern + if file_path.exists() and file_path.suffix == ".parquet": + downloaded_files.append(str(file_path)) + else: + # Handle wildcard patterns, including nested wildcards + if "*" in pattern: + # Use glob on the full pattern relative to downloaded_path + base_path = Path(downloaded_path) + matching_files = list(base_path.glob(pattern)) + downloaded_files.extend( + [str(f) for f in matching_files if f.suffix == ".parquet"] + ) + else: + # Handle non-wildcard patterns that might be directories + parent_dir = Path(downloaded_path) / Path(pattern).parent + if parent_dir.exists(): + downloaded_files.extend( + [str(f) for f in parent_dir.glob("*.parquet")] + ) + + if not downloaded_files: + self.logger.warning( + f"No parquet files found after download for {config.config_name}" + ) + return False + + # Load downloaded files into DuckDB + self._create_duckdb_table_from_files( + downloaded_files, table_name, config.config_name + ) + self.logger.info( + f"Downloaded and loaded {len(downloaded_files)} files into {table_name}" + ) + return True + + except Exception as e: + self.logger.error( + f"Error downloading metadata for {config.config_name}: {e}" + ) + return False + + def _create_duckdb_table_from_files( + self, file_paths: list[str], table_name: str, config_name: str + ) -> None: + """Create DuckDB table/view from parquet files.""" + if len(file_paths) == 1: + # Single file + create_sql = f""" + CREATE OR REPLACE VIEW {table_name} AS + SELECT * FROM read_parquet('{file_paths[0]}') + """ + else: + # Multiple files + files_str = "', '".join(file_paths) + create_sql = f""" + CREATE OR REPLACE VIEW {table_name} AS + SELECT * FROM read_parquet(['{files_str}']) + """ + + self.duckdb_conn.execute(create_sql) + self.logger.debug( + 
f"Created DuckDB view {table_name} from {len(file_paths)} files" + ) + + # Validate source_sample fields if they exist + self._validate_source_sample_fields(table_name, config_name) + + def _validate_source_sample_fields(self, table_name: str, config_name: str) -> None: + """ + Validate source_sample fields have correct format. + + Composite sample identifiers must be in the format: + "repo_id;config_name;sample_id" (exactly 3 semicolon-separated parts) + + """ + config = self.get_config(config_name) + + # Find all source_sample fields + source_sample_fields = [ + f.name + for f in config.dataset_info.features # type: ignore + if f.role == "source_sample" + ] + + if not source_sample_fields: + return # No validation needed + + # For each field, validate format + for field_name in source_sample_fields: + query = f""" + SELECT {field_name}, + LENGTH({field_name}) - LENGTH(REPLACE({field_name}, ';', '')) + AS semicolon_count + FROM {table_name} + WHERE semicolon_count != 2 + LIMIT 1 + """ + result = self.duckdb_conn.execute(query).fetchone() + + if result: + raise ValueError( + f"Invalid format in field '{field_name}' " + f"with role='source_sample'. " + f"Expected 'repo_id;config_name;sample_id' " + f"(3 semicolon-separated parts), " + f"but found: '{result[0]}'" + ) + + def _extract_embedded_metadata_field( + self, data_table_name: str, field_name: str, metadata_table_name: str + ) -> bool: + """Extract a specific metadata field from a data table.""" + try: + # Create a metadata view with unique values from the specified field + extract_sql = f""" + CREATE OR REPLACE VIEW {metadata_table_name} AS + SELECT DISTINCT {field_name} as value, COUNT(*) as count + FROM {data_table_name} + WHERE {field_name} IS NOT NULL + GROUP BY {field_name} + ORDER BY count DESC + """ + + self.duckdb_conn.execute(extract_sql) + + # Verify the table was created and has data + count_result = self.duckdb_conn.execute( + f"SELECT COUNT(*) FROM {metadata_table_name}" + ).fetchone() + + if count_result and count_result[0] > 0: + self.logger.info( + f"Extracted {count_result[0]} unique values for {field_name} " + f"into {metadata_table_name}" + ) + return True + else: + self.logger.warning(f"No data found for field {field_name}") + return False + + except Exception as e: + self.logger.error(f"Error extracting field {field_name}: {e}") + return False + + def clean_cache_by_age( + self, + max_age_days: int = 30, + dry_run: bool = True, + ) -> DeleteCacheStrategy: + """ + Clean cache entries older than specified age. + + :param max_age_days: Remove revisions older than this many days + :param dry_run: If True, show what would be deleted without executing + size_threshold: Only delete if total cache size exceeds this (e.g., "10GB") + + :return: DeleteCacheStrategy object that can be executed + + """ + cache_info = scan_cache_dir() + cutoff_date = datetime.now() - timedelta(days=max_age_days) + + old_revisions = [] + for repo in cache_info.repos: + for revision in repo.revisions: + # Check if revision is older than cutoff + revision_date = datetime.fromtimestamp(revision.last_modified) + if revision_date < cutoff_date: + old_revisions.append(revision.commit_hash) + self.logger.debug( + f"Marking for deletion: {revision.commit_hash} " + f"(last modified: {revision.last_modified})" + ) + + if not old_revisions: + self.logger.info("No old revisions found to delete") + # return None + + delete_strategy = cache_info.delete_revisions(*old_revisions) + + self.logger.info( + f"Found {len(old_revisions)} old revisions. 
" + f"Will free {delete_strategy.expected_freed_size_str}" + ) + + if not dry_run: + delete_strategy.execute() + self.logger.info( + f"Cache cleanup completed. Freed " + f"{delete_strategy.expected_freed_size_str}" + ) + else: + self.logger.info("Dry run completed. Use dry_run=False to execute deletion") + + return delete_strategy + + def clean_cache_by_size( + self, + target_size: str, + strategy: Literal[ + "oldest_first", "largest_first", "least_used" + ] = "oldest_first", + dry_run: bool = True, + ) -> DeleteCacheStrategy: + """ + Clean cache to reach target size by removing revisions. + + :param target_size: Target cache size (e.g., "5GB", "500MB") + :param strategy: Deletion strategy - "oldest_first", "largest_first", + "least_used" + :param dry_run: If True, show what would be deleted without executing + + :return: DeleteCacheStrategy object that can be executed + + """ + cache_info = scan_cache_dir() + current_size = cache_info.size_on_disk + target_bytes = self._parse_size_string(target_size) + + if current_size <= target_bytes: + self.logger.info( + f"Cache size ({cache_info.size_on_disk_str}) already below " + f"target ({target_size})" + ) + + bytes_to_free = current_size - target_bytes + + # Get all revisions sorted by strategy + all_revisions = [] + for repo in cache_info.repos: + for revision in repo.revisions: + all_revisions.append(revision) + + # Sort revisions based on strategy + if strategy == "oldest_first": + all_revisions.sort(key=lambda r: r.last_modified) + elif strategy == "largest_first": + all_revisions.sort(key=lambda r: r.size_on_disk, reverse=True) + elif strategy == "least_used": + # Use last_modified as proxy for usage + all_revisions.sort(key=lambda r: r.last_modified) + else: + raise ValueError(f"Unknown strategy: {strategy}") + + # Select revisions to delete + revisions_to_delete = [] + freed_bytes = 0 + + for revision in all_revisions: + if freed_bytes >= bytes_to_free: + break + revisions_to_delete.append(revision.commit_hash) + freed_bytes += revision.size_on_disk + + if not revisions_to_delete: + self.logger.warning("No revisions selected for deletion") + + delete_strategy = cache_info.delete_revisions(*revisions_to_delete) + + self.logger.info( + f"Selected {len(revisions_to_delete)} revisions for deletion. " + f"Will free {delete_strategy.expected_freed_size_str}" + ) + + if not dry_run: + delete_strategy.execute() + self.logger.info( + f"Cache cleanup completed. Freed " + f"{delete_strategy.expected_freed_size_str}" + ) + else: + self.logger.info("Dry run completed. Use dry_run=False to execute deletion") + + return delete_strategy + + def clean_unused_revisions( + self, keep_latest: int = 2, dry_run: bool = True + ) -> DeleteCacheStrategy: + """ + Clean unused revisions, keeping only the latest N revisions per repo. 
+ + :param keep_latest: Number of latest revisions to keep per repo + :param dry_run: If True, show what would be deleted without executing + :return: DeleteCacheStrategy object that can be executed + + """ + cache_info = scan_cache_dir() + revisions_to_delete = [] + + for repo in cache_info.repos: + # Sort revisions by last modified (newest first) + sorted_revisions = sorted( + repo.revisions, key=lambda r: r.last_modified, reverse=True + ) + + # Keep the latest N, mark the rest for deletion + if len(sorted_revisions) > keep_latest: + old_revisions = sorted_revisions[keep_latest:] + for revision in old_revisions: + revisions_to_delete.append(revision.commit_hash) + self.logger.debug( + f"Marking old revision for deletion: {repo.repo_id} - " + f"{revision.commit_hash}" + ) + + delete_strategy = cache_info.delete_revisions(*revisions_to_delete) + + self.logger.info( + f"Found {len(revisions_to_delete)} unused revisions. " + f"Will free {delete_strategy.expected_freed_size_str}" + ) + + if not dry_run: + delete_strategy.execute() + self.logger.info( + f"Cache cleanup completed. Freed " + f"{delete_strategy.expected_freed_size_str}" + ) + else: + self.logger.info("Dry run completed. Use dry_run=False to execute deletion") + + return delete_strategy + + def auto_clean_cache( + self, + max_age_days: int = 30, + max_total_size: str = "10GB", + keep_latest_per_repo: int = 2, + dry_run: bool = True, + ) -> list[DeleteCacheStrategy]: + """ + Automated cache cleaning with multiple strategies. + + :param max_age_days: Remove revisions older than this + :param max_total_size: Target maximum cache size + :param keep_latest_per_repo: Keep this many latest revisions per repo + :param dry_run: If True, show what would be deleted without executing + :return: List of DeleteCacheStrategy objects that were executed + + """ + strategies_executed = [] + + self.logger.info("Starting automated cache cleanup...") + + # Step 1: Remove very old revisions + strategy = self.clean_cache_by_age(max_age_days=max_age_days, dry_run=dry_run) + if strategy: + strategies_executed.append(strategy) + + # Step 2: Remove unused revisions (keep only latest per repo) + strategy = self.clean_unused_revisions( + keep_latest=keep_latest_per_repo, dry_run=dry_run + ) + if strategy: + strategies_executed.append(strategy) + + # Step 3: If still over size limit, remove more aggressively + cache_info = scan_cache_dir() + if cache_info.size_on_disk > self._parse_size_string(max_total_size): + strategy = self.clean_cache_by_size( + target_size=max_total_size, strategy="oldest_first", dry_run=dry_run + ) + if strategy: + strategies_executed.append(strategy) + + total_freed = sum(s.expected_freed_size for s in strategies_executed) + self.logger.info( + f"Automated cleanup complete. 
Total freed: " + f"{self._format_bytes(total_freed)}" + ) + + return strategies_executed + + def _parse_size_string(self, size_str: str) -> int: + """Parse size string like '10GB' to bytes.""" + size_str = size_str.upper().strip() + + # Check longer units first to avoid partial matches + multipliers = {"TB": 1024**4, "GB": 1024**3, "MB": 1024**2, "KB": 1024, "B": 1} + + for unit, multiplier in multipliers.items(): + if size_str.endswith(unit): + number = float(size_str[: -len(unit)]) + return int(number * multiplier) + + # If no unit specified, assume bytes + return int(size_str) + + def _format_bytes(self, bytes_size: int) -> str: + """Format bytes into human readable string.""" + if bytes_size == 0: + return "0B" + + # iterate over common units, dividing by 1024 each time, to find an + # appropriate unit. Default to TB if the size is very large + size = float(bytes_size) + for unit in ["B", "KB", "MB", "GB", "TB"]: + if size < 1024.0: + return f"{size:.1f}{unit}" + size /= 1024.0 + return f"{size:.1f}TB" + + def query(self, sql: str, config_name: str, refresh_cache: bool = False) -> Any: + """ + Execute SQL query against a specific dataset configuration. + + Loads the specified configuration and executes the SQL query. + Automatically replaces the config name in the SQL with the actual + table name for user convenience. + + :param sql: SQL query to execute + :param config_name: Configuration name to query (table will be loaded + if needed) + :param refresh_cache: If True, force refresh from remote instead of + using cache + :return: DataFrame with query results + :raises ValueError: If config_name not found or query fails + + Example: + mgr = HfCacheManager("BrentLab/harbison_2004", duckdb.connect()) + df = mgr.query( + "SELECT DISTINCT sample_id FROM harbison_2004", + "harbison_2004" + ) + + """ + # Validate config exists + if config_name not in [c.config_name for c in self.configs]: + available_configs = [c.config_name for c in self.configs] + raise ValueError( + f"Config '{config_name}' not found. 
" + f"Available configs: {available_configs}" + ) + + # Load the configuration data + config = self.get_config(config_name) + if not config: + raise ValueError(f"Could not retrieve config '{config_name}'") + + config_result = self._get_metadata_for_config( + config, force_refresh=refresh_cache + ) + if not config_result.get("success", False): + raise ValueError( + f"Failed to load data for config '{config_name}': " + f"{config_result.get('message', 'Unknown error')}" + ) + + table_name = config_result.get("table_name") + if not table_name: + raise ValueError(f"No table available for config '{config_name}'") + + # Replace config name with actual table name in SQL for user convenience + modified_sql = sql.replace(config_name, table_name) + + # Execute query + try: + result = self.duckdb_conn.execute(modified_sql).fetchdf() + self.logger.debug(f"Query executed successfully on {config_name}") + return result + except Exception as e: + self.logger.error(f"Query execution failed: {e}") + self.logger.error(f"SQL: {modified_sql}") + raise ValueError(f"Query execution failed: {e}") from e diff --git a/tfbpapi/metric_arrays.py b/tfbpapi/metric_arrays.py deleted file mode 100644 index 2bfaf14..0000000 --- a/tfbpapi/metric_arrays.py +++ /dev/null @@ -1,162 +0,0 @@ -import logging -from collections.abc import Callable - -import pandas as pd - -logger = logging.getLogger(__name__) - - -def metric_arrays( - res_dict: dict[str, pd.DataFrame | dict[str, pd.DataFrame]], - metrics_dict: dict[str, Callable], - rownames: str = "target_symbol", - colnames: str = "regulator_symbol", - row_dedup_func: Callable | None = None, - drop_incomplete_rows: bool = True, -) -> dict[str, pd.DataFrame]: - """ - Extract specified metrics from an AbstractRecordsAndFilesAPI instance's - read(retrieve_files=True) results object. - - :param res_dict: The output of an AbstractRecordsAndFiles instance. - :param metrics_dict: A dictionary where the keys are metrics and the values are - functions to apply to rows in the event that there are multiple rows with - the same rownames. Set to None to raise error if duplicate rownames are found. - :param rownames: Column name to use for row labels. - :param colnames: Column name to use for column labels. - :param drop_incomplete_rows: When True, drops rows and columns with all NaN values. - - :return: A dictionary where the metric is the key and the value is a DataFrame. - The column values are metric values, and the column names correspond - to `colnames` in the metadata DataFrame. 
- - :raises AttributeError: If the values in `colnames` or `rownames` are not unique - :raises KeyError: If the res_dict does not have keys 'metadata' and 'data' - :raises KeyError: If the data dictionary does not have the same keys as the 'id' - column - :raises ValueError: If the metadata does not have an 'id' column - :raises ValueError: If either the metadata or the data dictionary values are not - DataFrames - :raises ValueError: If the `colnames` is not in the res_dict metadata - :raises ValueError: If the `rownames` is not in the res_dict data - :raises ValueError: If the metrics are not in the data dictionary - - """ - - # Check required keys - if not all(k in res_dict for k in ["metadata", "data"]): - raise KeyError("res_dict must have keys 'metadata' and 'data'") - - metadata: pd.DataFrame = res_dict["metadata"] - - # Verify 'id' in metadata - if "id" not in metadata.columns: - raise ValueError("metadata must have an 'id' column") - - # Check for missing keys in 'data' - missing_keys = [k for k in metadata["id"] if str(k) not in res_dict["data"]] - if missing_keys: - raise KeyError( - f"Data dictionary must have the same keys as the 'id' " - f"column. Missing keys: {missing_keys}" - ) - - # Ensure all data dictionary values are DataFrames - if not all(isinstance(v, pd.DataFrame) for v in res_dict["data"].values()): - raise ValueError("All values in the data dictionary must be DataFrames") - - # Verify rownames in data and colnames in metadata - if colnames not in metadata.columns: - raise ValueError(f"colnames '{colnames}' not in metadata") - data_with_missing_rownames = [ - id for id, df in res_dict["data"].items() if rownames not in df.columns - ] - if data_with_missing_rownames: - raise ValueError( - f"rownames '{rownames}' not in data for ids: {data_with_missing_rownames}" - ) - - # Factorize unique row and column labels - row_labels = pd.Index( - {item for df in res_dict["data"].values() for item in df[rownames].unique()} - ) - - # Initialize output dictionary with NaN DataFrames for each metric - output_dict = { - m: pd.DataFrame(index=pd.Index(row_labels, name=rownames)) - for m in metrics_dict.keys() - } - - # Populate DataFrames with metric values - info_msgs = set() - for _, row in metadata.iterrows(): - try: - data = res_dict["data"][row["id"]] - except KeyError: - info_msgs.add("casting `id` to str to extract data from res_dict['data']") - data = res_dict["data"][str(row["id"])] - - for metric, row_dedup_func in metrics_dict.items(): - # Filter data to include only the rownames and metric columns - if metric not in data.columns: - raise ValueError( - f"Metric '{metric}' not found in data for id '{row['id']}'" - ) - - metric_data = data[[rownames, metric]] - - # Handle deduplication if row_dedup_func is provided - if row_dedup_func is not None: - metric_data = ( - metric_data.groupby(rownames)[metric] - .apply(row_dedup_func) - .reset_index() - ) - else: - # Ensure no duplicates exist if no deduplication function is provided - if metric_data[rownames].duplicated().any(): - raise ValueError( - f"Duplicate entries found for metric '{metric}' " - f"in id '{row['id']}' without dedup_func" - ) - - # test if row[colnames] is already in output_dict[metric]. 
If it is, add a - # replicate suffix and try again, Continue doing this until the column name - # is unique - colname = row[colnames] - suffix = 2 - while colname in output_dict[metric].columns: - colname = f"{row[colnames]}_rep{suffix}" - suffix += 1 - if suffix > 2: - info_msgs.add( - f"Column name '{row[colnames]}' already exists in " - f"output DataFrame for metric '{metric}'. " - f"Renaming to '{colname}'" - ) - # Join metric data with output DataFrame for the metric - output_dict[metric] = output_dict[metric].join( - metric_data.set_index(rownames).rename(columns={metric: colname}), - how="left", - ) - logger.info("; ".join(info_msgs)) - - # Drop incomplete rows and columns if drop_incomplete_rows is True - if drop_incomplete_rows: - for metric, df in output_dict.items(): - # Drop rows and columns where all values are NaN - initial_shape = df.shape - output_dict[metric] = df.dropna(axis=0) - final_shape = output_dict[metric].shape - - dropped_rows = initial_shape[0] - final_shape[0] - dropped_columns = initial_shape[1] - final_shape[1] - - if dropped_rows > 0 or dropped_columns > 0: - logger.warning( - f"{dropped_rows} rows and {dropped_columns} " - f"columns with incomplete " - f"records were dropped for metric '{metric}'." - ) - - return output_dict diff --git a/tfbpapi/models.py b/tfbpapi/models.py new file mode 100644 index 0000000..bb86f2e --- /dev/null +++ b/tfbpapi/models.py @@ -0,0 +1,734 @@ +""" +Pydantic models for dataset card validation and metadata configuration. + +These models provide minimal structure for parsing HuggingFace dataset cards while +remaining flexible enough to accommodate diverse experimental systems. Most fields use +extra="allow" to accept domain-specific additions without requiring code changes. + +Also includes models for VirtualDB metadata normalization configuration. + +""" + +from enum import Enum +from pathlib import Path +from typing import Any + +import yaml # type: ignore[import-untyped] +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator + + +class DatasetType(str, Enum): + """Supported dataset types.""" + + GENOMIC_FEATURES = "genomic_features" + ANNOTATED_FEATURES = "annotated_features" + GENOME_MAP = "genome_map" + METADATA = "metadata" + COMPARATIVE = "comparative" + + +class FeatureInfo(BaseModel): + """ + Information about a dataset feature/column. + + Minimal required fields with flexible dtype handling. + + """ + + name: str = Field(..., description="Column name in the data") + dtype: str | dict[str, Any] = Field( + ..., + description="Data type (string, int64, float64, etc.) or class_label dict", + ) + description: str = Field(..., description="Description of the field") + role: str | None = Field( + default=None, + description="Optional semantic role. 
'experimental_condition' " + "has special behavior.", + ) + definitions: dict[str, Any] | None = Field( + default=None, + description="For experimental_condition fields: definitions per value", + ) + + +class PartitioningInfo(BaseModel): + """Partitioning configuration for datasets.""" + + enabled: bool = Field(default=False, description="Whether partitioning is enabled") + partition_by: list[str] | None = Field( + default=None, description="Partition column names" + ) + path_template: str | None = Field( + default=None, description="Path template for partitioned files" + ) + + +class DatasetInfo(BaseModel): + """Dataset structure information.""" + + features: list[FeatureInfo] = Field(..., description="Feature definitions") + partitioning: PartitioningInfo | None = Field( + default=None, description="Partitioning configuration" + ) + + +class DataFileInfo(BaseModel): + """Information about data files.""" + + split: str = Field(default="train", description="Dataset split name") + path: str = Field(..., description="Path to data file(s)") + + +class DatasetConfig(BaseModel): + """ + Configuration for a dataset within a repository. + + Uses extra="allow" to accept arbitrary experimental_conditions and other fields. + + """ + + config_name: str = Field(..., description="Unique configuration identifier") + description: str = Field(..., description="Human-readable description") + dataset_type: DatasetType = Field(..., description="Type of dataset") + default: bool = Field( + default=False, description="Whether this is the default config" + ) + applies_to: list[str] | None = Field( + default=None, description="Configs this metadata applies to" + ) + metadata_fields: list[str] | None = Field( + default=None, description="Fields for embedded metadata extraction" + ) + data_files: list[DataFileInfo] = Field(..., description="Data file information") + dataset_info: DatasetInfo = Field(..., description="Dataset structure information") + + model_config = ConfigDict(extra="allow") + + @field_validator("applies_to") + @classmethod + def applies_to_only_for_metadata(cls, v, info): + """Validate that applies_to is only used for metadata or comparative configs.""" + if v is not None: + dataset_type = info.data.get("dataset_type") + if dataset_type not in (DatasetType.METADATA, DatasetType.COMPARATIVE): + raise ValueError( + "applies_to field is only valid " + "for metadata and comparative dataset types" + ) + return v + + @field_validator("metadata_fields") + @classmethod + def metadata_fields_validation(cls, v): + """Validate metadata_fields usage.""" + if v is not None and len(v) == 0: + raise ValueError("metadata_fields cannot be empty list, use None instead") + return v + + +class DatasetCard(BaseModel): + """ + Complete dataset card model. + + Uses extra="allow" to accept arbitrary top-level metadata and + experimental_conditions. 
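+
+    A minimal card body might look like this (a sketch; the config shown is
+    hypothetical and the feature list abbreviated):
+
+    ```yaml
+    configs:
+      - config_name: harbison_2004
+        description: ChIP-chip binding data
+        dataset_type: annotated_features
+        default: true
+        data_files:
+          - split: train
+            path: data/*.parquet
+        dataset_info:
+          features:
+            - name: regulator_symbol
+              dtype: string
+              description: Regulator gene symbol
+    ```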
+ + """ + + configs: list[DatasetConfig] = Field(..., description="Dataset configurations") + + model_config = ConfigDict(extra="allow") + + @field_validator("configs") + @classmethod + def configs_not_empty(cls, v): + """Ensure at least one config is present.""" + if not v: + raise ValueError("At least one dataset configuration is required") + return v + + @field_validator("configs") + @classmethod + def unique_config_names(cls, v): + """Ensure config names are unique.""" + names = [config.config_name for config in v] + if len(names) != len(set(names)): + raise ValueError("Configuration names must be unique") + return v + + @field_validator("configs") + @classmethod + def at_most_one_default(cls, v): + """Ensure at most one config is marked as default.""" + defaults = [config for config in v if config.default] + if len(defaults) > 1: + raise ValueError("At most one configuration can be marked as default") + return v + + def get_config_by_name(self, name: str) -> DatasetConfig | None: + """Get a configuration by name.""" + for config in self.configs: + if config.config_name == name: + return config + return None + + def get_configs_by_type(self, dataset_type: DatasetType) -> list[DatasetConfig]: + """Get all configurations of a specific type.""" + return [ + config for config in self.configs if config.dataset_type == dataset_type + ] + + def get_default_config(self) -> DatasetConfig | None: + """Get the default configuration if one exists.""" + defaults = [config for config in self.configs if config.default] + return defaults[0] if defaults else None + + def get_data_configs(self) -> list[DatasetConfig]: + """Get all non-metadata configurations.""" + return [ + config + for config in self.configs + if config.dataset_type != DatasetType.METADATA + ] + + def get_metadata_configs(self) -> list[DatasetConfig]: + """Get all metadata configurations.""" + return [ + config + for config in self.configs + if config.dataset_type == DatasetType.METADATA + ] + + +class ExtractedMetadata(BaseModel): + """Metadata extracted from datasets.""" + + config_name: str = Field(..., description="Source configuration name") + field_name: str = Field( + ..., description="Field name the metadata was extracted from" + ) + values: set[str] = Field(..., description="Unique values found") + extraction_method: str = Field(..., description="How the metadata was extracted") + + model_config = ConfigDict( + # Allow sets in JSON serialization + json_encoders={set: list} + ) + + +class MetadataRelationship(BaseModel): + """Relationship between a data config and its metadata.""" + + data_config: str = Field(..., description="Data configuration name") + metadata_config: str = Field(..., description="Metadata configuration name") + relationship_type: str = Field( + ..., description="Type of relationship (explicit, embedded)" + ) + + +# ============================================================================ +# VirtualDB Metadata Configuration Models +# ============================================================================ + + +class ComparativeAnalysis(BaseModel): + """ + Reference to a comparative dataset that includes this dataset. + + Comparative datasets relate samples across multiple source datasets. + This model specifies which comparative dataset references the current + dataset and through which field (via_field). 
+ + Attributes: + repo: HuggingFace repository ID of the comparative dataset + dataset: Config name of the comparative dataset + via_field: Field in the comparative dataset containing composite + identifiers that reference this dataset's samples. + Format: "repo_id;config_name;sample_id" + + Example: + ```python + # In BrentLab/callingcards config + ComparativeAnalysis( + repo="BrentLab/yeast_comparative_analysis", + dataset="dto", + via_field="binding_id" + ) + # Means: dto dataset has a binding_id field with values like: + # "BrentLab/callingcards;annotated_features;123" + ``` + + """ + + repo: str = Field(..., description="Comparative dataset repository ID") + dataset: str = Field(..., description="Comparative dataset config name") + via_field: str = Field( + ..., description="Field containing composite sample identifiers" + ) + + +class PropertyMapping(BaseModel): + """ + Mapping specification for a single property. + + Attributes: + path: Optional dot-notation path to the property value. + For repo/config-level: relative to experimental_conditions + For field-level: relative to field definitions + When omitted with field specified, creates a column alias. + field: Optional field name for field-level properties. + When specified, looks in this field's definitions. + When omitted, looks in repo/config-level experimental_conditions. + expression: Optional SQL expression for derived/computed fields. + When specified, creates a computed column. + Cannot be used with field or path. + dtype: Optional data type specification for type conversion. + Supported values: 'string', 'numeric', 'bool'. + When specified, extracted values are converted to this type. + + Examples: + Field-level property with path: + PropertyMapping(field="condition", path="media.carbon_source") + + Repo/config-level property: + PropertyMapping(path="temperature_celsius") + + Field-level column alias (no path): + PropertyMapping(field="condition") + + Derived field with expression: + PropertyMapping(expression="dto_fdr < 0.05") + + """ + + field: str | None = Field(None, description="Field name for field-level properties") + path: str | None = Field(None, description="Dot-notation path to property") + expression: str | None = Field( + None, description="SQL expression for derived fields" + ) + dtype: str | None = Field( + None, description="Data type for conversion: 'string', 'numeric', or 'bool'" + ) + + @field_validator("path") + @classmethod + def validate_path(cls, v: str | None) -> str | None: + """Ensure path is not just whitespace if provided.""" + if v is not None and not v.strip(): + raise ValueError("path cannot be empty or whitespace") + return v.strip() if v else None + + @field_validator("field") + @classmethod + def validate_field(cls, v: str | None) -> str | None: + """Ensure field is not empty string if provided.""" + if v is not None and not v.strip(): + raise ValueError("field cannot be empty or whitespace") + return v.strip() if v else None + + @field_validator("expression") + @classmethod + def validate_expression(cls, v: str | None) -> str | None: + """Ensure expression is not empty string if provided.""" + if v is not None and not v.strip(): + raise ValueError("expression cannot be empty or whitespace") + return v.strip() if v else None + + @model_validator(mode="after") + def validate_at_least_one_specified(self) -> "PropertyMapping": + """Ensure at least one field type is specified and mutually exclusive.""" + if self.expression is not None: + if self.field is not None or self.path is not None: + 
raise ValueError( + "expression cannot be used with field or path - " + "derived fields are computed, not extracted" + ) + elif self.field is None and self.path is None: + raise ValueError( + "At least one of 'field', 'path', or 'expression' must be specified" + ) + return self + + +class DatasetVirtualDBConfig(BaseModel): + """ + VirtualDB configuration for a specific dataset within a repository. + + Attributes: + sample_id: Mapping for the sample identifier field (required for + primary datasets) + comparative_analyses: Optional list of comparative datasets that + reference this dataset + properties: Property mappings for this specific dataset (field names to + PropertyMapping) + + Example: + ```yaml + # In BrentLab/callingcards config + annotated_features: + sample_id: + field: sample_id + comparative_analyses: + - repo: BrentLab/yeast_comparative_analysis + dataset: dto + via_field: binding_id + regulator_locus_tag: + field: regulator_locus_tag + dto_fdr: # Field from comparative dataset, optional renaming + field: dto_fdr + ``` + + """ + + sample_id: PropertyMapping | None = Field( + None, description="Mapping for sample identifier field" + ) + comparative_analyses: list[ComparativeAnalysis] = Field( + default_factory=list, + description="Comparative datasets referencing this dataset", + ) + # Allow additional property mappings via extra fields + model_config = ConfigDict(extra="allow") + + @model_validator(mode="before") + @classmethod + def parse_property_mappings(cls, data: Any) -> Any: + """Parse extra fields as PropertyMapping objects.""" + if not isinstance(data, dict): + return data + + # Process all fields except sample_id and comparative_analyses + result = {} + for key, value in data.items(): + if key in ("sample_id", "comparative_analyses"): + # These are typed fields, let Pydantic handle them + result[key] = value + elif isinstance(value, dict): + # Assume it's a PropertyMapping + try: + result[key] = PropertyMapping.model_validate(value) + except Exception as e: + raise ValueError( + f"Invalid PropertyMapping for field '{key}': {e}" + ) from e + else: + # Already parsed or wrong type + result[key] = value + + return result + + +class RepositoryConfig(BaseModel): + """ + Configuration for a single repository. Eg BrentLab/harbison_2004. 
+ + Attributes: + properties: Repo-wide property mappings that apply to all datasets + dataset: Dataset-specific configurations including sample_id, + comparative_analyses, and property mappings + + Example: + ```python + config = RepositoryConfig( + properties={ + "temperature_celsius": PropertyMapping(path="temperature_celsius") + }, + dataset={ + "dataset_name": DatasetVirtualDBConfig( + sample_id=PropertyMapping(field="sample_id"), + comparative_analyses=[ + ComparativeAnalysis( + repo="BrentLab/yeast_comparative_analysis", + dataset="dto", + via_field="binding_id" + ) + ], + # Additional property mappings via extra fields + **{"carbon_source": PropertyMapping( + field="condition", + path="media.carbon_source" + )} + ) + } + ) + ``` + + """ + + properties: dict[str, PropertyMapping] = Field( + default_factory=dict, description="Repo-wide property mappings" + ) + dataset: dict[str, DatasetVirtualDBConfig] | None = Field( + None, description="Dataset-specific configurations" + ) + + @model_validator(mode="before") + @classmethod + def parse_structure(cls, data: Any) -> Any: + """Parse raw dict structure into typed objects.""" + if not isinstance(data, dict): + return data + + # Extract and parse dataset section + dataset_section = data.get("dataset") + parsed_datasets: dict[str, DatasetVirtualDBConfig] | None = None + + if dataset_section: + if not isinstance(dataset_section, dict): + raise ValueError("'dataset' key must contain a dict") + + parsed_datasets = {} + for dataset_name, config_dict in dataset_section.items(): + if not isinstance(config_dict, dict): + raise ValueError(f"Dataset '{dataset_name}' must contain a dict") + + # Parse DatasetVirtualDBConfig + # The config_dict may contain: + # - sample_id (PropertyMapping) + # - comparative_analyses (list[ComparativeAnalysis]) + # - Other fields as PropertyMappings (via extra="allow") + try: + parsed_datasets[dataset_name] = ( + DatasetVirtualDBConfig.model_validate(config_dict) + ) + except Exception as e: + raise ValueError( + f"Invalid configuration for dataset '{dataset_name}': {e}" + ) from e + + # Parse repo-wide properties (all keys except 'dataset') + parsed_properties = {} + for key, value in data.items(): + if key == "dataset": + continue + + try: + parsed_properties[key] = PropertyMapping.model_validate(value) + except Exception as e: + raise ValueError(f"Invalid repo-wide property '{key}': {e}") from e + + return {"properties": parsed_properties, "dataset": parsed_datasets} + + +class MetadataConfig(BaseModel): + """ + Configuration for building standardized metadata tables. + + Specifies optional alias mappings for normalizing factor levels across + heterogeneous datasets, plus property path mappings for each repository. + + Attributes: + factor_aliases: Optional mappings of standardized names to actual values. 
+ Example: {"carbon_source": + {"glucose": ["D-glucose", "dextrose"]}} + missing_value_labels: Labels for missing values by property name + description: Human-readable descriptions for each property + repositories: Dict mapping repository IDs to their configurations + + Example: + ```yaml + repositories: + BrentLab/harbison_2004: + dataset: + harbison_2004: + carbon_source: + field: condition + path: media.carbon_source + + BrentLab/kemmeren_2014: + temperature: + path: temperature_celsius + dataset: + kemmeren_2014: + carbon_source: + path: media.carbon_source + + factor_aliases: + carbon_source: + glucose: ["D-glucose", "dextrose"] + galactose: ["D-galactose", "Galactose"] + + missing_value_labels: + carbon_source: "unspecified" + + description: + carbon_source: "Carbon source in growth media" + ``` + + """ + + factor_aliases: dict[str, dict[str, list[Any]]] = Field( + default_factory=dict, + description="Optional alias mappings for normalizing factor levels", + ) + missing_value_labels: dict[str, str] = Field( + default_factory=dict, + description="Labels for missing values by property name", + ) + description: dict[str, str] = Field( + default_factory=dict, + description="Human-readable descriptions for each property", + ) + repositories: dict[str, RepositoryConfig] = Field( + ..., description="Repository configurations keyed by repo ID" + ) + + @field_validator("missing_value_labels", mode="before") + @classmethod + def validate_missing_value_labels(cls, v: Any) -> dict[str, str]: + """Validate missing value labels structure, filtering out None values.""" + if not v: + return {} + if not isinstance(v, dict): + raise ValueError("missing_value_labels must be a dict") + # Filter out None values that may come from empty YAML values + return {k: val for k, val in v.items() if val is not None} + + @field_validator("description", mode="before") + @classmethod + def validate_description(cls, v: Any) -> dict[str, str]: + """Validate description structure, filtering out None values.""" + if not v: + return {} + if not isinstance(v, dict): + raise ValueError("description must be a dict") + # Filter out None values that may come from empty YAML values + return {k: val for k, val in v.items() if val is not None} + + @field_validator("factor_aliases") + @classmethod + def validate_factor_aliases( + cls, v: dict[str, dict[str, list[Any]]] + ) -> dict[str, dict[str, list[Any]]]: + """Validate factor alias structure.""" + # Empty is OK - aliases are optional + if not v: + return v + + for prop_name, aliases in v.items(): + if not isinstance(aliases, dict): + raise ValueError( + f"Property '{prop_name}' aliases must be a dict, " + f"got {type(aliases).__name__}" + ) + + # Validate each alias mapping + for alias_name, actual_values in aliases.items(): + if not isinstance(actual_values, list): + raise ValueError( + f"Alias '{alias_name}' for '{prop_name}' must map " + f"to a list of values" + ) + if not actual_values: + raise ValueError( + f"Alias '{alias_name}' for '{prop_name}' cannot " + f"have empty value list" + ) + for val in actual_values: + if not isinstance(val, (str, int, float, bool)): + raise ValueError( + f"Alias '{alias_name}' for '{prop_name}' contains " + f"invalid value type: {type(val).__name__}" + ) + + return v + + @model_validator(mode="before") + @classmethod + def parse_repositories(cls, data: Any) -> Any: + """Parse repository configurations from 'repositories' key.""" + if not isinstance(data, dict): + return data + + # Extract repositories from 'repositories' key + 
repositories_data = data.get("repositories", {}) + + if not repositories_data: + raise ValueError( + "Configuration must have a 'repositories' key " + "with at least one repository" + ) + + if not isinstance(repositories_data, dict): + raise ValueError("'repositories' key must contain a dict") + + repositories = {} + for repo_id, repo_config in repositories_data.items(): + try: + repositories[repo_id] = RepositoryConfig.model_validate(repo_config) + except Exception as e: + raise ValueError( + f"Invalid configuration for repository '{repo_id}': {e}" + ) from e + + return { + "factor_aliases": data.get("factor_aliases", {}), + "missing_value_labels": data.get("missing_value_labels", {}), + "description": data.get("description", {}), + "repositories": repositories, + } + + @classmethod + def from_yaml(cls, path: Path | str) -> "MetadataConfig": + """ + Load and validate configuration from YAML file. + + :param path: Path to YAML configuration file + :return: Validated MetadataConfig instance + :raises FileNotFoundError: If file doesn't exist + :raises ValueError: If configuration is invalid + + """ + path = Path(path) + + if not path.exists(): + raise FileNotFoundError(f"Configuration file not found: {path}") + + with open(path) as f: + data = yaml.safe_load(f) + + if not isinstance(data, dict): + raise ValueError("Configuration must be a YAML dict") + + return cls.model_validate(data) + + def get_repository_config(self, repo_id: str) -> RepositoryConfig | None: + """ + Get configuration for a specific repository. + + :param repo_id: Repository ID (e.g., "BrentLab/harbison_2004") + :return: RepositoryConfig instance or None if not found + + """ + return self.repositories.get(repo_id) + + def get_property_mappings( + self, repo_id: str, config_name: str + ) -> dict[str, PropertyMapping]: + """ + Get merged property mappings for a repo/dataset combination. + + Merges repo-wide and dataset-specific mappings, with dataset-specific taking + precedence. + + :param repo_id: Repository ID + :param config_name: Dataset/config name + :return: Dict mapping property names to PropertyMapping objects + + """ + repo_config = self.get_repository_config(repo_id) + if not repo_config: + return {} + + # Start with repo-wide properties + mappings: dict[str, PropertyMapping] = dict(repo_config.properties) + + # Override with dataset-specific properties + if repo_config.dataset and config_name in repo_config.dataset: + dataset_config = repo_config.dataset[config_name] + # DatasetVirtualDBConfig stores property mappings in model_extra + if hasattr(dataset_config, "model_extra") and dataset_config.model_extra: + mappings.update(dataset_config.model_extra) + + return mappings diff --git a/tfbpapi/rank_transforms.py b/tfbpapi/rank_transforms.py deleted file mode 100644 index 9e4c672..0000000 --- a/tfbpapi/rank_transforms.py +++ /dev/null @@ -1,154 +0,0 @@ -import numpy as np -from scipy.stats import rankdata - - -def shifted_negative_log_ranks(ranks: np.ndarray) -> np.ndarray: - """ - Transforms ranks to negative log10 values and shifts such that the lowest value is - 0. - - :param ranks: A vector of ranks - :return np.ndarray: A vector of negative log10 transformed ranks shifted such that - the lowest value is 0 - :raises ValueError: If the ranks are not numeric. 
- - """ - if not np.issubdtype(ranks.dtype, np.number): - raise ValueError("`ranks` must be a numeric") - max_rank = np.max(ranks) - log_max_rank = np.log10(max_rank) - return -1 * np.log10(ranks) + log_max_rank - - -def stable_rank( - pvalue_vector: np.ndarray, enrichment_vector: np.ndarray, method="average" -) -> np.ndarray: - """ - Ranks data by primary_column, breaking ties based on secondary_column. The expected - primary and secondary columns are 'pvalue' and 'enrichment', respectively. Then the - ranks are transformed to negative log10 values and shifted such that the lowest - value is 0 and the highest value is log10(min_rank). - - :param pvalue_vector: A vector of pvalues - :param enrichment_vector: A vector of enrichment values corresponding to the pvalues - :param method: The method to use for final ranking. Default is "average". - See `rankdata` - - :return np.ndarray: A vector of negative log10 transformed ranks shifted such that - the lowest value is 0 and the highest value is log10(min_rank) - :raises ValueError: If the primary or secondary column is not numeric. - - """ - - # Check if primary and secondary columns are numeric - if not np.issubdtype(pvalue_vector.dtype, np.number): - raise ValueError("`primary_vector` must be a numeric") - if not np.issubdtype(enrichment_vector.dtype, np.number): - raise ValueError("`secondary_vector` must be a numeric") - - # Step 1: Rank by primary_column - # note that this will now always be an integer, unlike average which could return - # decimal values making adding the secondary rank more difficult - primary_rank = rankdata(pvalue_vector, method="min") - - # Step 2: Identify ties in primary_rank - unique_ranks = np.unique(primary_rank) - - # Step 3: Adjust ranks within ties using secondary ranking - adjusted_primary_rank = primary_rank.astype( - float - ) # Convert to float for adjustments - - for unique_rank in unique_ranks: - # Get indices where primary_rank == unique_rank - tie_indices = np.where(primary_rank == unique_rank)[0] - - if len(tie_indices) > 1: # Only adjust if there are ties - # Rank within the tie group by secondary_column - # (descending if higher is better) - tie_secondary_values = enrichment_vector[tie_indices] - secondary_rank_within_ties = rankdata( - -tie_secondary_values, method="average" - ) - - # Calculate dynamic scale factor to ensure adjustments are < 1. Since the - # primary_rank is an integer, adding a number less than 1 will not affect - # rank relative to the other groups. - max_secondary_rank = np.max(secondary_rank_within_ties) - scale_factor = ( - 0.9 / max_secondary_rank - ) # Keep scale factor slightly below 1/max rank - - # multiple the secondary_rank_within_ties values by 0.1 and add this value - # to the adjusted_primary_rank_values. This will rank the tied primary - # values by the secondary values, but not affect the overall primary rank - # outside of the tie group - # think about this scale factor - adjusted_primary_rank[tie_indices] += ( - secondary_rank_within_ties * scale_factor - ) - - # Step 4: Final rank based on the adjusted primary ranks - final_ranks = rankdata(adjusted_primary_rank, method=method) - - return final_ranks - - -def rank_by_pvalue(pvalue_vector: np.ndarray, method="average") -> np.ndarray: - """ - This expects a vector of pvalues, returns a vector of ranks where the lowest pvalue - has the lowest rank. 
-
-    :param pvalue_vector: A vector of pvalues
-    :param enrichment_vector: A vector of enrichment values corresponding to the pvalues
-    :param method: The method to use for ranking. Default is "average". See `rankdata`
-    :return np.ndarray: A vector of negative log10 transformed ranks shifted such that
-        the lowest value is 0 and the highest value is log10(min_rank)
-    :raises ValueError: If the primary or secondary column is not numeric.
-
-    """
-
-    # Check if primary and secondary columns are numeric
-    if not np.issubdtype(pvalue_vector.dtype, np.number):
-        raise ValueError("`primary_vector` must be a numeric")
-
-    # Step 1: Rank by primary_column
-    # note that this will now always be an integer, unlike average which could return
-    # decimal values making adding the secondary rank more difficult
-    return rankdata(pvalue_vector, method=method)
-
-
-def transform(
-    pvalue_vector: np.ndarray,
-    enrichment_vector: np.ndarray,
-    use_enrichment: bool = True,
-    negative_log_shift: bool = True,
-    **kwargs,
-) -> np.ndarray:
-    """
-    This calls the rank() function and then transforms the ranks to negative log10
-    values and shifts to the right such that the lowest value (largest rank, least
-    important) is 0.
-
-    :param pvalue_vector: A vector of pvalues
-    :param enrichment_vector: A vector of enrichment values corresponding to the pvalues
-    :param use_enrichment: Set to True to use the enrichment vector to break ties.
-        Default is True. If False, pvalues will be ranked directly with method="average'
-    :param negative_log_shift: Set to True to shift the ranks to the right such that the
-        lowest value (largest rank, least important) is 0. Default is True.
-    :param kwargs: Additional keyword arguments to pass to the rank() function (e.g.
-        method="min")
-    :return np.ndarray: A vector of negative log10 transformed ranks shifted such that
-        the lowest value is 0 and the highest value is log10(min_rank)
-    :raises ValueError: If the primary or secondary column is not numeric.
-
-    """
-    if use_enrichment:
-        ranks = stable_rank(pvalue_vector, enrichment_vector, **kwargs)
-    else:
-        ranks = rank_by_pvalue(pvalue_vector, **kwargs)
-
-    if negative_log_shift:
-        return shifted_negative_log_ranks(ranks)
-    else:
-        return ranks
diff --git a/tfbpapi/tests/conftest.py b/tfbpapi/tests/conftest.py
new file mode 100644
index 0000000..55c1082
--- /dev/null
+++ b/tfbpapi/tests/conftest.py
@@ -0,0 +1,1465 @@
+import pickle
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+
+
+@pytest.fixture
+def mock_cache_info():
+    """Load real cache data from pickle file."""
+    cache_file = Path(__file__).parent / "data" / "cache_info.pkl"
+
+    if not cache_file.exists():
+        pytest.skip(
+            "cache_info.pkl not found. Run the cache data generation script first."
+ ) + + with open(cache_file, "rb") as f: + return pickle.load(f) + + +@pytest.fixture +def mock_scan_cache_dir(mock_cache_info): + """Mock scan_cache_dir to return our pickled cache data.""" + with patch("huggingface_hub.scan_cache_dir", return_value=mock_cache_info): + yield mock_cache_info + + +# ============================================================================ +# Datainfo Fixtures (merged from tests/datainfo/conftest.py) +# ============================================================================ + + +@pytest.fixture +def sample_dataset_card_data(): + """Sample dataset card data for testing.""" + return { + "license": "mit", + "language": ["en"], + "tags": ["biology", "genomics", "yeast"], + "pretty_name": "Test Genomics Dataset", + "size_categories": ["100K log2(1.7) & " + "pval < 0.05). Note that " + "there is a slight " + "difference when " + "calculating from the data " + "provided here, I believe " + "due to a difference in " + "the way the targets are " + "parsed and filtered (some " + "ORFs that have since been " + "removed from the " + "annotations are removed). " + "I didn't investigate this " + "closely, though.", + "role": "experimental_condition", + }, + { + "name": "profile_first_published", + "dtype": "string", + "description": "citation or reference " + "indicating where this " + "expression profile was " + "first published", + "role": "experimental_condition", + }, + { + "name": "chase_notes", + "dtype": "string", + "description": "notes added during data " + "curation and parsing", + }, + ] + }, + } + ], + } diff --git a/tfbpapi/tests/conftests.py b/tfbpapi/tests/conftests.py deleted file mode 100644 index e69de29..0000000 diff --git a/tfbpapi/tests/example_datacards.py b/tfbpapi/tests/example_datacards.py new file mode 100644 index 0000000..36b023f --- /dev/null +++ b/tfbpapi/tests/example_datacards.py @@ -0,0 +1,510 @@ +# flake8: noqa +""" +Three diverse datacard examples for testing datacard parsing and database construction. + +These examples capture different patterns of experimental condition specification: +1. Top-level conditions with field-level variations (minimal media) +2. Complex field-level definitions with multiple environmental conditions +3. 
Partitioned dataset with separate metadata configs using applies_to + +""" + +EXAMPLE_1_SIMPLE_TOPLEVEL = """--- +license: mit +language: + - en +tags: + - genomics + - yeast + - transcription +pretty_name: "Example Dataset 1 - TF Perturbation" +size_categories: + - 100K- + Systematic gene identifier of the ChIP-targeted transcription factor + role: regulator_identifier + - name: regulator_symbol + dtype: string + description: Standard gene symbol of the ChIP-targeted transcription factor + role: regulator_identifier + - name: target_locus_tag + dtype: string + description: Systematic gene identifier of the target gene + role: target_identifier + - name: target_symbol + dtype: string + description: Standard gene symbol of the target gene + role: target_identifier + - name: condition + dtype: + class_label: + names: ["YPD", "galactose", "heat_shock", "oxidative_stress", + "amino_acid_starvation"] + description: Environmental or stress condition of the experiment + role: experimental_condition + definitions: + YPD: + description: Rich media baseline condition + environmental_conditions: + temperature_celsius: 30 + cultivation_method: liquid_culture + growth_phase_at_harvest: + od600: 0.6 + stage: mid_log_phase + media: + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + galactose: + description: Alternative carbon source condition + environmental_conditions: + temperature_celsius: 30 + cultivation_method: liquid_culture + growth_phase_at_harvest: + od600: 0.6 + stage: mid_log_phase + media: + name: YPD + carbon_source: + - compound: D-galactose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + heat_shock: + description: Temperature stress condition + environmental_conditions: + temperature_celsius: 37 + cultivation_method: liquid_culture + growth_phase_at_harvest: + od600: 0.6 + stage: mid_log_phase + media: + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + heat_treatment: + duration_minutes: 15 + oxidative_stress: + description: Hydrogen peroxide stress condition + environmental_conditions: + temperature_celsius: 30 + cultivation_method: liquid_culture + growth_phase_at_harvest: + od600: 0.6 + stage: mid_log_phase + media: + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + chemical_treatment: + compound: hydrogen_peroxide + concentration_percent: 0.004 + duration_minutes: 20 + amino_acid_starvation: + description: Amino acid starvation via chemical inhibition + environmental_conditions: + temperature_celsius: 30 + cultivation_method: liquid_culture + growth_phase_at_harvest: + od600: 0.5 + stage: mid_log_phase + media: + name: synthetic_complete + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_nitrogen_base + # 6.71 g/L + concentration_percent: 0.671 + specifications: + - without_amino_acids + - without_ammonium_sulfate + - compound: ammonium_sulfate + # 5 g/L + concentration_percent: 0.5 + - compound: amino_acid_dropout_mix + # 2 g/L + concentration_percent: 0.2 + 
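            # g/L amounts in the comments above are expressed as percent w/v
            # by dividing by 10 (e.g. 6.71 g/L yeast nitrogen base -> 0.671)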
chemical_treatment: + compound: 3-amino-1,2,4-triazole + concentration_percent: 0.01 + duration_hours: 1 + - name: binding_score + dtype: float64 + description: ChIP-seq binding enrichment score + role: quantitative_measure + - name: peak_pvalue + dtype: float64 + description: Statistical significance of binding peak + role: quantitative_measure + - name: peak_qvalue + dtype: float64 + description: FDR-adjusted p-value for binding peak + role: quantitative_measure +--- +""" + + +EXAMPLE_3_PARTITIONED_WITH_METADATA = """--- +license: mit +language: + - en +tags: + - genomics + - yeast + - binding + - genome-wide + - chec-seq +pretty_name: "Example Dataset 3 - Genome Coverage Compendium" +size_categories: + - 10M- + unique identifier for a specific sample. The sample ID identifies a unique + (regulator_locus_tag, time, mechanism, restriction, date, strain) tuple. + - name: db_id + dtype: integer + description: >- + an old unique identifer, for use internally only. Deprecated and will be removed eventually. + Do not use in analysis. db_id = 0, for GEV and Z3EV, means that those samples are not + included in the original DB. + - name: regulator_locus_tag + dtype: string + description: >- + induced transcriptional regulator systematic ID. + See hf/BrentLab/yeast_genome_resources + role: regulator_identifier + - name: regulator_symbol + dtype: string + description: >- + induced transcriptional regulator common name. If no common name exists, + then the `regulator_locus_tag` is used. + role: regulator_identifier + - name: target_locus_tag + dtype: string + description: >- + The systematic ID of the feature to which the effect/pvalue is assigned. + See hf/BrentLab/yeast_genome_resources + role: target_identifier + - name: target_symbol + dtype: string + description: >- + The common name of the feature to which the effect/pvalue is assigned. + If there is no common name, the `target_locus_tag` is used. 
+ role: target_identifier + - name: time + dtype: float + description: time point (minutes) + role: experimental_condition + - name: mechanism + dtype: + class_label: + names: ["GEV", "ZEV"] + description: Synthetic TF induction system (GEV or ZEV) + role: experimental_condition + definitions: + GEV: + perturbation_method: + type: inducible_overexpression + system: GEV + inducer: beta-estradiol + description: "Galactose-inducible estrogen receptor-VP16 fusion system" + ZEV: + perturbation_method: + type: inducible_overexpression + system: ZEV + inducer: beta-estradiol + description: "Z3 (synthetic zinc finger)-estrogen receptor-VP16 fusion system" + - name: restriction + dtype: + class_label: + names: ["M", "N", "P"] + description: >- + nutrient limitation, one of P (phosphate limitation (20 mg/l).), + N (Nitrogen‐limited cultures were maintained at 40 mg/l ammonium sulfate) or + M (Not defined in the paper or on the Calico website) + role: experimental_condition + definitions: + P: + media: + nitrogen_source: + - compound: ammonium_sulfate + # Saldanha et al 2004: 5 g/l + concentration_percent: 0.5 + phosphate_source: + - compound: potassium_phosphate_monobasic + # Hackett et al 2020: 20 mg/l + concentration_percent: 0.002 + N: + media: + nitrogen_source: + - compound: ammonium_sulfate + # Hackett et al 2020: 40 mg/l + concentration_percent: 0.004 + M: + description: "Not defined in the paper or on the Calico website" + - name: date + dtype: string + description: date performed + role: experimental_condition + - name: strain + dtype: string + description: strain name + role: experimental_condition + - name: green_median + dtype: float + description: median of green (reference) channel fluorescence + role: quantitative_measure + - name: red_median + dtype: float + description: median of red (experimental) channel fluorescence + role: quantitative_measure + - name: log2_ratio + dtype: float + description: log2(red / green) subtracting value at time zero + role: quantitative_measure + - name: log2_cleaned_ratio + dtype: float + description: Non-specific stress response and prominent outliers removed + role: quantitative_measure + - name: log2_noise_model + dtype: float + description: estimated noise standard deviation + role: quantitative_measure + - name: log2_cleaned_ratio_zth2d + dtype: float + description: >- + cleaned timecourses hard-thresholded based on + multiple observations (or last observation) passing the noise model + role: quantitative_measure + - name: log2_selected_timecourses + dtype: float + description: >- + cleaned timecourses hard-thresholded based on single observations + passing noise model and impulse evaluation of biological feasibility + role: quantitative_measure + - name: log2_shrunken_timecourses + dtype: float + description: >- + selected timecourses with observation-level shrinkage based on + local FDR (false discovery rate). Most users of the data will want + to use this column. + role: quantitative_measure +--- + +# harbison_2004 +--- +license: mit +language: + - en +tags: + - genomics + - yeast + - transcription + - binding +pretty_name: "Harbison, 2004 ChIP-chip" +size_categories: + - 1M- + Environmental condition of the experiment. Nearly all of the 204 regulators + have a YPD condition, and some have others in addition. 
+ role: experimental_condition + definitions: + YPD: + description: Rich media baseline condition + # Harbison et al 2004: grown at 30°C (from HEAT condition context) + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.8 + od600: 0.8 + media: + # Harbison et al 2004: 1% yeast extract / 2% peptone / 2% glucose + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + SM: + description: Amino acid starvation stress condition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.6 + od600: 0.6 + media: + # Harbison et al 2004: synthetic complete medium + name: synthetic_complete + carbon_source: unspecified + nitrogen_source: unspecified + chemical_treatment: + compound: sulfometuron_methyl + # Harbison et al 2004: 0.2 mg/ml + concentration_percent: 0.02 + duration_hours: 2 + RAPA: + description: Nutrient deprivation via TOR inhibition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.8 + od600: 0.8 + media: + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + chemical_treatment: + compound: rapamycin + # Harbison et al 2004: 100 nM + concentration_percent: 9.142e-6 + duration_minutes: 20 + H2O2Hi: + description: High oxidative stress condition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.5 + od600: 0.5 + media: + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + chemical_treatment: + compound: hydrogen_peroxide + # Harbison et al 2004: 4 mM + concentration_percent: 0.0136 + duration_minutes: 30 + H2O2Lo: + description: Moderate oxidative stress condition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.5 + od600: 0.5 + media: + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + chemical_treatment: + compound: hydrogen_peroxide + # Harbison et al 2004: 0.4 mM + concentration_percent: 0.00136 + duration_minutes: 20 + Acid: + description: Acidic pH stress condition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.5 + od600: 0.5 + media: + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + chemical_treatment: + compound: succinic_acid + # Harbison et al 2004: 0.05 M to reach pH 4.0 + concentration_percent: 0.59 + target_pH: 4.0 + duration_minutes: 30 + Alpha: + description: Mating pheromone induction condition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.8 + od600: 0.8 + media: + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + 
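              # nitrogen sources below repeat the standard YPD recipe
              # (1% yeast extract, 2% peptone)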
nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + chemical_treatment: + compound: alpha_factor_pheromone + # Harbison et al 2004: 5 mg/ml + concentration_percent: 0.5 + duration_minutes: 30 + BUT14: + description: Long-term filamentation induction with butanol + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.8 + od600: 0.8 + media: + # Harbison et al 2004: YPD containing 1% butanol + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + additives: + - compound: butanol + concentration_percent: 1 + incubation_duration_hours: 14 + BUT90: + description: Short-term filamentation induction with butanol + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.8 + od600: 0.8 + media: + # Harbison et al 2004: YPD containing 1% butanol + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + additives: + - compound: butanol + concentration_percent: 1 + incubation_duration_minutes: 90 + "Thi-": + description: Vitamin B1 deprivation stress condition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.8 + od600: 0.8 + media: + # Harbison et al 2004: synthetic complete medium lacking thiamin + name: synthetic_complete_minus_thiamine + carbon_source: unspecified + nitrogen_source: unspecified + GAL: + description: Galactose-based growth medium condition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.8 + od600: 0.8 + media: + # Harbison et al 2004: YEP medium supplemented with galactose (2%) + name: yeast_extract_peptone + carbon_source: + - compound: D-galactose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: unspecified + - compound: peptone + concentration_percent: unspecified + HEAT: + description: Heat shock stress condition + # Harbison et al 2004: grown at 30°C, shifted to 37°C for 45 min + initial_temperature_celsius: 30 + temperature_shift_celsius: 37 + temperature_shift_duration_minutes: 45 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.5 + od600: 0.5 + media: + # Harbison et al 2004: YPD + name: YPD + carbon_source: + - compound: D-glucose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: 1 + - compound: peptone + concentration_percent: 2 + "Pi-": + description: Phosphate deprivation stress condition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.8 + od600: 0.8 + media: + # Harbison et al 2004: synthetic complete medium lacking phosphate + name: synthetic_complete_minus_phosphate + carbon_source: unspecified + nitrogen_source: unspecified + RAFF: + description: Raffinose-based growth medium condition + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Harbison et al 2004: OD600 ~0.8 + od600: 0.8 + media: + # Harbison et al 2004: YEP medium supplemented with raffinose (2%) + name: yeast_extract_peptone 
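              # YEP base medium; yeast extract and peptone amounts are not
              # given in the source, so their concentrations are left
              # unspecified below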
+ carbon_source: + - compound: D-raffinose + concentration_percent: 2 + nitrogen_source: + - compound: yeast_extract + concentration_percent: unspecified + - compound: peptone + concentration_percent: unspecified + - name: regulator_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the ChIPd transcription factor + role: regulator_identifier + - name: regulator_symbol + dtype: string + description: Standard gene symbol of the ChIPd transcription factor + role: regulator_identifier + - name: target_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the target gene measured + role: target_identifier + - name: target_symbol + dtype: string + description: Standard gene symbol of the target gene measured + role: target_identifier + - name: effect + dtype: float64 + description: The chip channel ratio (effect size) + role: quantitative_measure + - name: pvalue + dtype: float64 + description: pvalue of the chip channel ratio (effect) + role: quantitative_measure +--- + +# hu_2007_reimand_2010 +--- +license: mit +language: + - en +tags: + - genomics + - yeast + - transcription + - perturbation + - response + - knockout + - TFKO +pretty_name: Hu 2007/Reimand 2010 TFKO +size_categories: + - 1M- + an old unique identifer, for use internally only. Deprecated and will be removed eventually. + Do not use in analysis. + - name: regulator_locus_tag + dtype: string + description: induced transcriptional regulator systematic ID. See hf/BrentLab/yeast_genome_resources + role: regulator_identifier + - name: regulator_symbol + dtype: string + description: induced transcriptional regulator common name. If no common name exists, then the `regulator_locus_tag` is used. + role: regulator_identifier + - name: target_locus_tag + dtype: string + description: The systematic ID of the feature to which the effect/pvalue is assigned. See hf/BrentLab/yeast_genome_resources + role: target_identifier + - name: target_symbol + dtype: string + description: The common name of the feature to which the effect/pvalue is assigned. If there is no common name, the `target_locus_tag` is used. + role: target_identifier + - name: effect + dtype: float + description: >- + log fold change of mutant vs wt. From the remaind methods: Differential expression + was calculated using a moderated eBayes t-test as implemented in the Limma + Bioconductor package + role: quantitative_measure + - name: pval + dtype: float + description: P-values were FDR-adjusted across the whole microarray dataset to correct for multiple testing + role: quantitative_measure + - name: average_od_of_replicates + dtype: float + description: average OD of the replicates at harvest + - name: heat_shock + dtype: bool + description: >- + `True` if the regulator strain was subjected to heat shock treatment. + Applied to 22 transcription factors implicated in heat shock response. 
+ `False` otherwise + role: experimental_condition + definitions: + true: + # Hu et al 2007: "15-min heat shock at 39°C" + temperature_celsius: 39 + duration_minutes: 15 + strain_background: + genotype: BY4741 + mating_type: MATa + markers: + - his3Δ1 + - leu2Δ0 + - met15Δ0 + - ura3Δ0 + source: Open_Biosystems + description: Knockout strains for nonessential transcription factors + false: + description: Standard growth conditions at 30°C + strain_background: + genotype: BY4741 + mating_type: MATa + markers: + - his3Δ1 + - leu2Δ0 + - met15Δ0 + - ura3Δ0 + source: Open_Biosystems + description: Knockout strains for nonessential transcription factors + - name: tetracycline_treatment + dtype: bool + description: >- + `True` if the regulator strain was treated with doxycycline to repress + TetO7-promoter regulated essential transcription factors. Applied to 6 + essential transcription factors. `False` for untreated control condition. + role: experimental_condition + definitions: + true: + drug_treatment: + compound: doxycycline + # Hu et al 2007: 10 mg/ml + concentration_percent: 1 + duration_hours_min: 14 + duration_hours_max: 16 + strain_background: + genotype: BY4741_derivative + mating_type: MATa + markers: + - URA3::CMV-tTA + - his3Δ1 + - leu2Δ0 + - met15Δ0 + source: Open_Biosystems + description: Essential transcription factors with TetO7-promoter regulation + false: + description: No doxycycline treatment; TetO7 promoter active + strain_background: + genotype: BY4741_derivative + mating_type: MATa + markers: + - URA3::CMV-tTA + - his3Δ1 + - leu2Δ0 + - met15Δ0 + source: Open_Biosystems + description: Essential transcription factors with TetO7-promoter regulation +--- + +# hughes_2006 +--- +license: mit +language: +- en +tags: +- biology +- genomics +- yeast +- transcription-factors +- gene-expression +- perturbation-screen +- overexpression +- knockout +- microarray +- functional-genomics +pretty_name: "Hughes 2006 Yeast Transcription Factor Perturbation Dataset" +size_categories: +- 100K- + unique identifier for a specific sample. The sample ID identifies + a unique regulator_locus_tag and can be used to join to the + other datasets in this repo, including the metadata + - name: regulator_locus_tag + dtype: string + role: identifier + description: >- + Systematic gene name (ORF identifier) of the + transcription factor + - name: regulator_symbol + dtype: string + description: Standard gene symbol of the transcription factor + - name: found_domain + dtype: string + description: >- + Identified DNA-binding domain(s) or protein family classification + - name: sgd_description + dtype: string + description: >- + Functional description from Saccharomyces Genome Database (SGD) + - name: essential + dtype: bool + description: >- + Boolean indicating whether the gene is essential for viability + - name: oe_passed_qc + dtype: bool + description: >- + Boolean indicating whether overexpression experiments passed + quality control + - name: del_passed_qc + dtype: bool + description: >- + Boolean indicating whether deletion experiments passed + quality control + +- config_name: overexpression + description: Overexpression perturbation normalized log2 fold changes + dataset_type: annotated_features + data_files: + - split: train + path: overexpression.parquet + # temperature and growth phase are unspecified. 
nitrogen_source is + # also unspecified + media: + # Hughes et al 2006: "selective medium supplemented with 2% raffinose" + name: selective_medium + carbon_source: + - compound: D-raffinose + # Hughes et al 2006: 2% raffinose + concentration_percent: 2 + induction: + # Hughes et al 2006: "induction with 2% galactose for 3 h" + inducer: + compound: D-galactose + concentration_percent: 2 + duration_hours: 3 + dataset_info: + features: + - name: sample_id + dtype: integer + description: >- + unique identifier for a specific sample. The sample ID identifies + a unique regulator_locus_tag and can be used to join to the + other datasets in this repo, including the metadata + - name: regulator_locus_tag + dtype: string + description: >- + Systematic gene name (ORF identifier) of the + perturbed transcription factor + role: regulator_identifier + - name: regulator_symbol + dtype: string + description: Standard gene symbol of the perturbed transcription factor + - name: target_locus_tag + dtype: string + description: >- + Systematic gene name (ORF identifier) of the + target gene measured + role: target_identifier + - name: target_symbol + dtype: string + description: Standard gene symbol of the target gene measured + role: target_identifier + - name: dye_plus + dtype: float64 + role: quantitative_measure + description: >- + Normalized log2 fold change for positive (+) dye orientation. + Positive values indicate upregulation in response to overexpression. + - name: dye_minus + dtype: float64 + role: quantitative_measure + description: >- + Normalized log2 fold change for negative (-) dye orientation. + Positive values indicate upregulation in response to overexpression. + - name: mean_norm_log2fc + dtype: float64 + role: quantitative_measure + description: >- + Average log2 fold change across dye orientations, + providing a dye-independent estimate of gene expression + change upon transcription factor overexpression. + +- config_name: knockout + description: Deletion/knockout perturbation normalized log2 fold changes + dataset_type: annotated_features + data_files: + - split: train + path: knockout.parquet + experimental_conditions: + temperature_celsius: unspecified + cultivation_method: unspecified + media: + # Hughes et al 2006: "synthetic medium supplemented with 2% dextrose" + name: synthetic_medium + carbon_source: + - compound: D-glucose + # Hughes et al 2006: 2% dextrose + concentration_percent: 2 + nitrogen_source: unspecified + dataset_info: + features: + - name: sample_id + dtype: integer + description: >- + unique identifier for a specific sample. The sample ID identifies + a unique regulator_locus_tag and can be used to join to the + other datasets in this repo, including the metadata + - name: regulator_locus_tag + dtype: string + description: >- + Systematic gene name (ORF identifier) of the perturbed + transcription factor + role: regulator_identifier + - name: regulator_symbol + dtype: string + description: Standard gene symbol of the perturbed transcription factor + role: regulator_identifier + - name: target_locus_tag + dtype: string + description: >- + Systematic gene name (ORF identifier) of the + target gene measured + role: target_identifier + - name: target_symbol + dtype: string + description: Standard gene symbol of the target gene measured + role: target_identifier + - name: dye_plus + dtype: float64 + description: >- + Normalized log2 fold change for positive (+) dye orientation. + Positive values indicate upregulation in response to deletion. 
+ role: quantitative_measure + - name: dye_minus + dtype: float64 + description: >- + Normalized log2 fold change for negative (-) dye orientation. + Positive values indicate upregulation in response to deletion. + role: quantitative_measure + - name: mean_norm_log2fc + dtype: float64 + description: >- + Average log2 fold change across dye orientations, providing a + dye-independent estimate of gene expression change upon + transcription factor deletion. + role: quantitative_measure +--- + +# kemmeren_2014 +--- +license: mit +language: +- en +tags: +- genomics +- yeast +- transcription +- perturbation +- response +- knockout +- TFKO +pretty_name: "Kemmeren, 2014 Overexpression" +size_categories: +- 1M- + Transcriptional regulator overexpression perturbation data with + differential expression measurements + dataset_type: annotated_features + default: true + metadata_fields: ["regulator_locus_tag", "regulator_symbol"] + data_files: + - split: train + path: kemmeren_2014.parquet + dataset_info: + features: + - name: sample_id + dtype: integer + description: >- + unique identifier for a specific sample. + The sample ID identifies a unique regulator. + - name: db_id + dtype: integer + description: >- + an old unique identifer, for use internally only. Deprecated and will be removed eventually. + Do not use in analysis. db_id = 0 for loci that were originally parsed incorrectly. + - name: regulator_locus_tag + dtype: string + description: >- + induced transcriptional regulator systematic ID. + See hf/BrentLab/yeast_genome_resources + role: regulator_identifier + - name: regulator_symbol + dtype: string + description: >- + induced transcriptional regulator common name. + If no common name exists, then the `regulator_locus_tag` is used. + role: regulator_identifier + - name: reporterId + dtype: string + description: probe ID as reported from the original data + - name: target_locus_tag + dtype: string + description: >- + The systematic ID of the feature to which the effect/pvalue is assigned. + See hf/BrentLab/yeast_genome_resources + role: target_identifier + - name: target_symbol + dtype: string + description: >- + The common name of the feature to which the effect/pvalue is assigned. + If there is no common name, the `target_locus_tag` is used. + role: target_identifier + - name: M + dtype: float64 + description: log₂ fold change (mutant vs wildtype) + role: quantitative_measure + - name: Madj + dtype: float64 + description: >- + M value with the cell cycle signal removed + (see paper cited in the introduction above) + role: quantitative_measure + - name: A + dtype: float64 + description: >- + average log2 intensity of the two channels, a proxy for expression level + (This is a guess based on microarray convention -- not specified on holstege site) + role: quantitative_measure + - name: pval + dtype: float64 + description: significance of the modeled effect (M), from limma + role: quantitative_measure + - name: variable_in_wt + dtype: string + description: >- + True if the given locus is variable in the WT condition. + Recommended to remove these from analysis. False otherwise. + See Holstege website for more information + role: experimental_condition + - name: multiple_probes + dtype: string + description: >- + True if there is more than one probe associated with + the same genomic locus. False otherwise + role: experimental_condition + - name: kemmeren_regulator + dtype: string + description: >- + True if the regulator is one of the regulators studied in the + original Kemmeren et al. 
(2014) global regulator study. False otherwise + role: experimental_condition + - name: regulator_desc + dtype: string + description: >- + functional description of the induced regulator + from the original paper supplement + role: experimental_condition + - name: functional_category + dtype: string + description: functional classification of the regulator from the original paper supplement + role: experimental_condition + - name: slides + dtype: string + description: identifier(s) for the microarray slide(s) used in this experiment + role: experimental_condition + - name: mating_type + dtype: string + description: mating type of the strain background used in the experiment + role: experimental_condition + - name: source_of_deletion_mutants + dtype: string + description: origin of the strain + role: experimental_condition + - name: primary_hybsets + dtype: string + description: identifier for the primary hybridization set to which this sample belongs + role: experimental_condition + - name: responsive_non_responsive + dtype: string + description: >- + classification of the regulator as responsive or not to the + deletion from the original paper supplement + role: experimental_condition + - name: nr_sign_changes + dtype: integer + description: >- + number of significant changes in expression detected for the regulator locus tag (abs(M) > log2(1.7) & pval < 0.05). + Note that there is a slight difference when calculating from the data provided here, I believe due to a difference in + the way the targets are parsed and filtered (some ORFs that have since been removed from the annotations are removed). + I didn't investigate this closely, though. + role: experimental_condition + - name: profile_first_published + dtype: string + description: citation or reference indicating where this expression profile was first published + role: experimental_condition + - name: chase_notes + dtype: string + description: notes added during data curation and parsing +--- + +# mahendrawada_2025 +--- +license: mit +language: +- en +tags: +- biology +- genomics +- yeast +- transcription-factors +- gene-expression +- binding +- chec +- perturbation +- rnaseq +- nascent rnaseq +pretty_name: "Mahendrawada 2025 ChEC-seq and Nascent RNA-seq data" +size_categories: +- 100K- + unique identifier for a specific sample, which uniquely identifies one of the 178 TFs. + Across datasets in this repo, the a given sample_id identifies the same regulator. 
+ - name: regulator_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the transcription factor + - name: regulator_symbol + dtype: string + description: Standard gene symbol of the transcription factor + - name: target_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the target gene + - name: target_symbol + dtype: string + description: Standard gene symbol of the target gene + - name: peak_score + dtype: float64 + description: ChEC signal around peak center (sum of ChEC signal from -150 to +150 bp from peak summit) normalized to Drosophila spike-in control + - name: processing_method + dtype: string + description: Method used for peak calling and quantification (original authors) + +- config_name: reprocessed_chec_seq + description: ChEC-seq transcription factor binding data reprocessed with updated peak calling methodology + dataset_type: annotated_features + data_files: + - split: train + path: chec_reprocessed_mahendrawada_2025.parquet + dataset_info: + features: + - name: sample_id + dtype: integer + description: >- + unique identifier for a specific sample, which uniquely identifies one of the 178 TFs. + Across datasets in this repo, the a given sample_id identifies the same regulator. + - name: regulator_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the transcription factor + - name: regulator_symbol + dtype: string + description: Standard gene symbol of the transcription factor + - name: target_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the target gene + - name: target_symbol + dtype: string + description: Standard gene symbol of the target gene + - name: enrichment + dtype: float64 + description: ratio of experimental insertions to background insertions + - name: poisson_pval + dtype: float64 + description: enrichment poisson pvalue + +- config_name: reprocessed_diffcontrol_5prime + description: Comparing two different sets of control replicates, m2025 from the Mahendrawada 2025 paper, and h2021 from a previous paper from the Hahn lab + dataset_type: annotated_features + metadata_fields: + - control_source + - condition + - regulator_locus_tag + experimental_conditions: + # Mahendrawada et al 2025: "30 °C culture" + temperature_celsius: 30 + cultivation_method: unspecified + growth_phase_at_harvest: + # Mahendrawada et al 2025: "A600 of ~1.0" + od600: 1.0 + media: + # Mahendrawada et al 2025: "synthetic complete (SC) media" + name: synthetic_complete + carbon_source: unspecified + nitrogen_source: + - compound: yeast_nitrogen_base + # Mahendrawada et al 2025: 1.7 g/L (without ammonium sulfate or amino acids (BD Difco)) + concentration_percent: 0.17 + specifications: + - without_ammonium_sulfate + - without_amino_acids + - compound: ammonium_sulfate + # Mahendrawada et al 2025: 5 g/L + concentration_percent: 0.5 + - compound: amino_acid_dropout_mix + # Mahendrawada et al 2025: 0.6 g/L + concentration_percent: 0.06 + - compound: adenine_sulfate + # Mahendrawada et al 2025: 40 μg/ml = 0.04 g/L + concentration_percent: 0.004 + - compound: uracil + # Mahendrawada et al 2025: 2 μg/ml = 0.002 g/L + concentration_percent: 0.0002 + data_files: + - split: train + path: reprocess_diffcontrol_5prime.parquet + dataset_info: + features: + - name: control_source + dtype: string + description: Source identifier for the control dataset (m2025 or h2021) + - name: condition + dtype: string + description: Experimental condition. 'standard' is YPD. 
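      # the remaining features give the promoter interval and the DESeq2
      # statistics for each control_source/regulator comparison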
+ - name: regulator_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the transcription factor + - name: target_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the target gene + - name: chr + dtype: string + description: Chromosome name of the promoter/target region + - name: start + dtype: int64 + description: Start coordinate of the promoter region + - name: end + dtype: int64 + description: End coordinate of the promoter region + - name: strand + dtype: string + description: Strand orientation (+ or -) of the promoter/target + - name: input_vs_target_log2_fold_change + dtype: float64 + description: Log2 fold change of TF-tagged sample vs control (from DESeq2) + - name: input_vs_target_p_value + dtype: float64 + description: P-value for differential enrichment (from DESeq2) + - name: input_vs_target_adj_p_value + dtype: float64 + description: Adjusted p-value (FDR-corrected) for differential enrichment (from DESeq2) + +- config_name: rna_seq + description: Nascent RNA-seq differential expression data following transcription factor depletion using 4TU metabolic labeling + dataset_type: annotated_features + metadata_fields: + - regulator_locus_tag + - regulator_symbol + data_files: + - split: train + path: rnaseq_mahendrawada_2025.parquet + dataset_info: + features: + - name: sample_id + dtype: integer + description: >- + unique identifier for a specific sample, which uniquely identifies one of the 178 TFs. + Across datasets in this repo, the a given sample_id identifies the same regulator. + - name: db_id + dtype: integer + description: >- + an old unique identifer, for use internally only. Deprecated and will be removed eventually. + Do not use in analysis. + - name: regulator_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the depleted transcription factor + - name: regulator_symbol + dtype: string + description: Standard gene symbol of the depleted transcription factor + - name: target_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the differentially expressed target gene + - name: target_symbol + dtype: string + description: Standard gene symbol of the differentially expressed target gene + - name: log2fc + dtype: float64 + description: Log2 fold change (IAA/DMSO) for significantly affected genes (DESeq2, padj <0.1, FC >= 1.3) +--- + +# rossi_2021 +--- +license: mit +tags: +- transcription-factor +- binding +- chipexo +- genomics +- biology +language: +- en +pretty_name: Rossi ChIP-exo 2021 +experimental_conditions: + temperature_celsius: 25 + cultivation_method: unspecified + growth_phase_at_harvest: + phase: mid_log + od600: 0.8 + media: + name: yeast_peptone_dextrose + carbon_source: + - compound: D-glucose + concentration_percent: unspecified + nitrogen_source: + - compound: yeast_extract + concentration_percent: unspecified + - compound: peptone + concentration_percent: unspecified + + # Heat shock applied only to SAGA strains + # note that im not sure which strains this + # applies to -- it is a TODO to better + # document this + heat_shock: + induced: true + temperature_celsius: 37 + duration_minutes: 6 + pre_induction_temperature_celsius: 25 + method: equal_volume_medium_transfer +configs: +- config_name: metadata + description: Metadata describing the tagged regulator in each experiment + dataset_type: metadata + data_files: + - split: train + path: rossi_2021_metadata.parquet + dataset_info: + features: + - name: regulator_locus_tag + dtype: 
string + description: Systematic gene name (ORF identifier) of the transcription factor + - name: regulator_symbol + dtype: string + description: Standard gene symbol of the transcription factor + - name: run_accession + dtype: string + description: GEO run accession identifier for the sample + - name: yeastepigenome_id + dtype: string + description: Sample identifier used by yeastepigenome.org +- config_name: genome_map + description: "ChIP-exo 5' tag coverage data partitioned by sample accession" + dataset_type: genome_map + data_files: + - split: train + path: genome_map/*/*.parquet + dataset_info: + features: + - name: chr + dtype: string + description: Chromosome name (e.g., chrI, chrII, etc.) + - name: pos + dtype: int32 + description: "Genomic position of the 5' tag" + - name: pileup + dtype: int32 + description: "Depth of coverage (number of 5' tags) at this genomic position" +- config_name: rossi_annotated_features + description: ChIP-exo regulator-target binding features with peak statistics + dataset_type: annotated_features + default: true + metadata_fields: + - regulator_locus_tag + - regulator_symbol + - target_locus_tag + data_files: + - split: train + path: yeastepigenome_annotatedfeatures.parquet + dataset_info: + features: + - name: sample_id + dtype: int32 + description: >- + Unique identifier for each ChIP-exo experimental sample. + - name: pss_id + dtype: float64 + description: >- + Current brentlab promotersetsig table id. This will eventually be removed. + - name: binding_id + dtype: float64 + description: >- + Current brentlab binding table id. This will eventually be removed. + - name: yeastepigenome_id + dtype: float64 + description: >- + Unique identifier in the yeastepigenome database. + - name: regulator_locus_tag + dtype: string + description: >- + Systematic ORF name of the regulator. + role: regulator_identifier + - name: regulator_symbol + dtype: string + description: >- + Common gene name of the regulator. + role: regulator_identifier + - name: target_locus_tag + dtype: string + description: >- + The systematic ID of the feature to which the effect/pvalue is + assigned. See hf/BrentLab/yeast_genome_resources + role: target_identifier + - name: target_symbol + dtype: string + description: >- + The common name of the feature to which the effect/pvalue is + assigned. If there is no common name, the `target_locus_tag` is + used. + role: target_identifier + - name: n_sig_peaks + dtype: float64 + description: >- + Number of peaks in the promoter region of the the target gene + role: quantitative_measure + - name: max_fc + dtype: float64 + description: >- + If there are multiple peaks in the promoter region, then the maximum is + reported. Otherwise, it is the fold change of the single peak in the + promoter. + role: quantitative_measure + - name: min_pval + dtype: float64 + description: >- + The most significant p-value among peaks for this interaction. 
+ role: quantitative_measure +- config_name: reprocess_annotatedfeatures + description: >- + Annotated features reprocessed with updated peak + calling methodology + dataset_type: annotated_features + data_files: + - split: train + path: reprocess_annotatedfeatures.parquet + dataset_info: + features: + - name: regulator_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the transcription factor + - name: regulator_symbol + dtype: string + description: Standard gene symbol of the transcription factor + - name: target_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the target gene + - name: target_symbol + dtype: string + description: Standard gene symbol of the target gene + - name: baseMean + dtype: float64 + description: Average of normalized count values, dividing by size factors, taken over all samples + - name: log2FoldChange + dtype: float64 + description: Log2 fold change between comparison and control groups + - name: lfcSE + dtype: float64 + description: Standard error estimate for the log2 fold change estimate + - name: stat + dtype: float64 + description: Value of the test statistic for the gene + - name: pvalue + dtype: float64 + description: P-value of the test for the gene + - name: padj + dtype: float64 + description: Adjusted p-value for multiple testing for the gene +- config_name: reprocess_annotatedfeatures_tagcounts + description: Another version of the reprocessed data, quantified similarly to Calling Cards + dataset_type: annotated_features + data_files: + - split: train + path: reprocess_annotatedfeatures_tagcounts.parquet + dataset_info: + features: + - name: regulator_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the transcription factor + role: regulator_identifier + - name: target_locus_tag + dtype: string + description: Systematic gene name (ORF identifier) of the target gene + role: target_identifier + - name: rank + dtype: int64 + description: Rank (ties method min rank) of the peak based on pvalue with ties broken by enrichment. Largest rank is most significant. + - name: control_count + dtype: int64 + description: Number of tags in the control condition + - name: experimental_count + dtype: int64 + description: Number of tags in the experimental condition + - name: mu + dtype: float64 + description: Expected count under the null hypothesis (control_count + 1) * (experimental_total_tags / control_total_tags) + - name: enrichment + dtype: float64 + description: Enrichment ratio of experimental over control. 
(experimental_counts / experimental_total) / (control_counts + pseudocount) / control_total + role: quantitative_measure + - name: log2_enrichment + dtype: float64 + description: Log2-transformed enrichment ratio + role: quantitative_measure + - name: neg_log10_pvalue + dtype: float64 + description: Negative log10 of the p-value for binding significance + role: quantitative_measure + - name: neg_log10_qvalue + dtype: float64 + description: Negative log10 of the FDR-adjusted q-value + role: quantitative_measure +--- + +# yeast_genome_resources +--- +license: mit +pretty_name: BrentLab Yeast Genome Resources +language: + - en +dataset_info: + features: + - name: start + dtype: int32 + description: Start coordinate (1-based, **inclusive**) + - name: end + dtype: int32 + description: End coordinate (1-based, **inclusive**) + - name: strand + dtype: string + levels: + - + + - "-" + description: Strand of feature + - name: type + dtype: string + levels: + - gene + - ncRNA_gene + - tRNA_gene + - snoRNA_gene + - transposable_element_gene + - pseudogene + - telomerase_RNA_gene + - snRNA_gene + - rRNA_gene + - blocked_reading_frame + description: classification of feature + - name: locus_tag + dtype: string + description: Systematic ID of feature + - name: symbol + dtype: string + description: Common name of feature + - name: alias + dtype: string + description: Alternative names of feature, typically alternative symbols + - name: source + dtype: string + description: Annotation file version/origin of the feature + - name: note + dtype: string + description: Additional feature information, typically the description from the + SGD gff/gtf + partitioning: + keys: + - name: chr + dtype: string + levels: + - chrI + - chrII + - chrVII + - chrV + - chrIII + - chrIV + - chrVIII + - chrVI + - chrX + - chrIX + - chrXI + - chrXIV + - chrXII + - chrXIII + - chrXV + - chrXVI + - chrM +configs: + - config_name: features + default: true + data_files: + - split: train + path: + - features/*/part-0.parquet +--- diff --git a/tfbpapi/tests/snapshots/__init__.py b/tfbpapi/tests/snapshots/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tfbpapi/tests/snapshots/promotersetsig_records_and_files.tar.gz b/tfbpapi/tests/snapshots/promotersetsig_records_and_files.tar.gz deleted file mode 100644 index bde8021..0000000 Binary files a/tfbpapi/tests/snapshots/promotersetsig_records_and_files.tar.gz and /dev/null differ diff --git a/tfbpapi/tests/snapshots/snap_test_AbstractAPI.py b/tfbpapi/tests/snapshots/snap_test_AbstractAPI.py deleted file mode 100644 index 8444992..0000000 --- a/tfbpapi/tests/snapshots/snap_test_AbstractAPI.py +++ /dev/null @@ -1,20 +0,0 @@ -# -*- coding: utf-8 -*- -# snapshottest: v1 - https://goo.gl/zC4yUc -from __future__ import unicode_literals - -from snapshottest import Snapshot - - -snapshots = Snapshot() - -snapshots['test_cache_operations cache_get_after_delete'] = 'None' - -snapshots['test_cache_operations cache_get_after_set'] = 'test_value' - -snapshots['test_cache_operations cache_list'] = "['test_key']" - -snapshots['test_pop_params pop_params_after_all_removed'] = '{}' - -snapshots['test_pop_params pop_params_after_one_removed'] = '{"param2": "value2"}' - -snapshots['test_push_params push_params'] = '{"param1": "value1", "param2": "value2"}' diff --git a/tfbpapi/tests/snapshots/snap_test_AbstractRecordsAndFilesAPI.py b/tfbpapi/tests/snapshots/snap_test_AbstractRecordsAndFilesAPI.py deleted file mode 100644 index 807cb7d..0000000 --- 
a/tfbpapi/tests/snapshots/snap_test_AbstractRecordsAndFilesAPI.py +++ /dev/null @@ -1,15 +0,0 @@ -# snapshottest: v1 - https://goo.gl/zC4yUc - -from snapshottest import Snapshot - -snapshots = Snapshot() - -snapshots[ - "test_save_response_records_and_files 1" -] = """id,uploader_id,upload_date,modifier_id,modified_date,binding_id,promoter_id,background_id,fileformat_id,file -10690,1,2024-03-26,1,2024-03-26 14:28:43.825628+00:00,4079,4,6,5,promotersetsig/10690.csv.gz -10694,1,2024-03-26,1,2024-03-26 14:28:44.739775+00:00,4083,4,6,5,promotersetsig/10694.csv.gz -10754,1,2024-03-26,1,2024-03-26 14:29:01.837335+00:00,4143,4,6,5,promotersetsig/10754.csv.gz -10929,1,2024-03-26,1,2024-03-26 14:29:45.379790+00:00,4318,4,6,5,promotersetsig/10929.csv.gz -10939,1,2024-03-26,1,2024-03-26 14:29:47.853980+00:00,4327,4,6,5,promotersetsig/10939.csv.gz -""" diff --git a/tfbpapi/tests/snapshots/test_AbstractAPI/test_cache_operations/cache_get_after_delete b/tfbpapi/tests/snapshots/test_AbstractAPI/test_cache_operations/cache_get_after_delete deleted file mode 100644 index 4af1832..0000000 --- a/tfbpapi/tests/snapshots/test_AbstractAPI/test_cache_operations/cache_get_after_delete +++ /dev/null @@ -1 +0,0 @@ -None \ No newline at end of file diff --git a/tfbpapi/tests/snapshots/test_AbstractAPI/test_cache_operations/cache_get_after_set b/tfbpapi/tests/snapshots/test_AbstractAPI/test_cache_operations/cache_get_after_set deleted file mode 100644 index fff1c65..0000000 --- a/tfbpapi/tests/snapshots/test_AbstractAPI/test_cache_operations/cache_get_after_set +++ /dev/null @@ -1 +0,0 @@ -test_value \ No newline at end of file diff --git a/tfbpapi/tests/snapshots/test_AbstractAPI/test_cache_operations/cache_list b/tfbpapi/tests/snapshots/test_AbstractAPI/test_cache_operations/cache_list deleted file mode 100644 index 1950491..0000000 --- a/tfbpapi/tests/snapshots/test_AbstractAPI/test_cache_operations/cache_list +++ /dev/null @@ -1 +0,0 @@ -['test_key'] \ No newline at end of file diff --git a/tfbpapi/tests/snapshots/test_AbstractAPI/test_pop_params/pop_params_after_all_removed b/tfbpapi/tests/snapshots/test_AbstractAPI/test_pop_params/pop_params_after_all_removed deleted file mode 100644 index 9e26dfe..0000000 --- a/tfbpapi/tests/snapshots/test_AbstractAPI/test_pop_params/pop_params_after_all_removed +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/tfbpapi/tests/snapshots/test_AbstractAPI/test_pop_params/pop_params_after_one_removed b/tfbpapi/tests/snapshots/test_AbstractAPI/test_pop_params/pop_params_after_one_removed deleted file mode 100644 index cab5c0c..0000000 --- a/tfbpapi/tests/snapshots/test_AbstractAPI/test_pop_params/pop_params_after_one_removed +++ /dev/null @@ -1 +0,0 @@ -{"param2": "value2"} \ No newline at end of file diff --git a/tfbpapi/tests/snapshots/test_AbstractAPI/test_push_params/push_params b/tfbpapi/tests/snapshots/test_AbstractAPI/test_push_params/push_params deleted file mode 100644 index 21d59b6..0000000 --- a/tfbpapi/tests/snapshots/test_AbstractAPI/test_push_params/push_params +++ /dev/null @@ -1 +0,0 @@ -{"param1": "value1", "param2": "value2"} \ No newline at end of file diff --git a/tfbpapi/tests/test_AbstractAPI.py b/tfbpapi/tests/test_AbstractAPI.py deleted file mode 100644 index 84a643d..0000000 --- a/tfbpapi/tests/test_AbstractAPI.py +++ /dev/null @@ -1,94 +0,0 @@ -import json -from typing import Any - -import pytest -import responses - -from tfbpapi.AbstractAPI import AbstractAPI -from tfbpapi.ParamsDict import ParamsDict - - -class 
ConcreteAPI(AbstractAPI): - """Concrete implementation of AbstractAPI for testing purposes.""" - - def create(self, data: dict[str, Any], **kwargs) -> Any: - pass # Implement for testing if necessary - - def read(self, **kwargs) -> dict[str, Any]: - return {"id": id} # Mock implementation for testing - - def update(self, df: Any, **kwargs) -> Any: - pass # Implement for testing if necessary - - def delete(self, id: str, **kwargs) -> Any: - pass # Implement for testing if necessary - - def submit(self, post_dict: dict, **kwargs) -> Any: - pass # Implement for testing if necessary - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - pass # Implement for testing if necessary - - -@pytest.fixture -@responses.activate -def api_client(): - valid_url = "https://valid.url" - responses.add(responses.HEAD, valid_url, status=200) - return ConcreteAPI(url=valid_url, token="token") - - -def test_initialize(snapshot, api_client): - assert api_client.url == "https://valid.url" - assert api_client.token == "token" - assert isinstance(api_client.params, ParamsDict) - - -def test_push_params(snapshot, api_client): - params = {"param1": "value1", "param2": "value2"} - api_client.push_params(params) - # Serialize the dictionary to a JSON string for comparison - params_as_json = json.dumps(api_client.params.as_dict(), sort_keys=True) - snapshot.assert_match(params_as_json, "push_params") - - -def test_pop_params(snapshot, api_client): - params = {"param1": "value1", "param2": "value2"} - api_client.push_params(params) - api_client.pop_params(["param1"]) - params_as_json1 = json.dumps(api_client.params.as_dict(), sort_keys=True) - snapshot.assert_match(params_as_json1, "pop_params_after_one_removed") - api_client.pop_params() - params_as_json2 = json.dumps(api_client.params.as_dict(), sort_keys=True) - snapshot.assert_match(params_as_json2, "pop_params_after_all_removed") - - -@responses.activate -def test_is_valid_url(api_client): - invalid_url = "https://invalid.url" - - responses.add(responses.HEAD, invalid_url, status=404) - - with pytest.raises(ValueError): - api_client.url = invalid_url - - -def test_cache_operations(snapshot, api_client): - key = "test_key" - value = "test_value" - - api_client._cache_set(key, value) - snapshot.assert_match(str(api_client._cache_get(key)), "cache_get_after_set") - - keys = api_client._cache_list() - snapshot.assert_match(str(keys), "cache_list") - - api_client._cache_delete(key) - snapshot.assert_match(str(api_client._cache_get(key)), "cache_get_after_delete") - snapshot.assert_match(str(api_client._cache_get(key)), "cache_get_after_delete") - - -if __name__ == "__main__": - pytest.main() diff --git a/tfbpapi/tests/test_AbstractRecordsAndFilesAPI.py b/tfbpapi/tests/test_AbstractRecordsAndFilesAPI.py deleted file mode 100644 index 1c64a39..0000000 --- a/tfbpapi/tests/test_AbstractRecordsAndFilesAPI.py +++ /dev/null @@ -1,284 +0,0 @@ -import gzip -from io import BytesIO -from tempfile import NamedTemporaryFile -from typing import Any - -import pandas as pd -import pytest -import responses -from aioresponses import aioresponses - -from tfbpapi.AbstractRecordsAndFilesAPI import ( - AbstractRecordsAndFilesAPI, -) - -# The following test is commented out because it requires a running server -- this is -# how I retrieved the data for the tests below. 
The data is saved in the snapshot -# directory -# -# @pytest.mark.asyncio -# async def test_save_response_records_and_files(snapshot): -# async with aiohttp.ClientSession() as session: -# url = "http://127.0.0.1:8001/api/promotersetsig/export" -# async with session.get( -# url, -# headers={ -# "Authorization": f"token {os.getenv('TOKEN')}", -# "Content-Type": "application/json", -# }, -# params={ -# "regulator_symbol": "HAP5", -# "workflow": "nf_core_callingcards_dev", -# "data_usable": "pass", -# }, -# ) as response: -# response.raise_for_status() -# response_text = await response.text() -# snapshot.assert_match(response_text) -# assert response.status == 200 - - -# @pytest.mark.asyncio -# async def test_save_response_records_and_files(): -# async with aiohttp.ClientSession() as session: -# url = "http://127.0.0.1:8001/api/promotersetsig/record_table_and_files" -# async with session.get( -# url, -# headers={ -# "Authorization": f"token {os.getenv('TOKEN')}", -# "Content-Type": "application/gzip", -# }, -# params={ -# "regulator_symbol": "HAP5", -# "workflow": "nf_core_callingcards_dev", -# "data_usable": "pass", -# }, -# ) as response: -# response.raise_for_status() -# response_content = await response.read() -# with open("saved_response.tar.gz", "wb") as f: -# f.write(response_content) -# assert response.status == 200 - - -def promotersetsig_csv_gzip() -> bytes: - # Define the data as a dictionary - data = { - "id": [10690, 10694, 10754, 10929, 10939], - "uploader_id": [1, 1, 1, 1, 1], - "upload_date": ["2024-03-26"] * 5, - "modifier_id": [1, 1, 1, 1, 1], - "modified_date": [ - "2024-03-26 14:28:43.825628+00:00", - "2024-03-26 14:28:44.739775+00:00", - "2024-03-26 14:29:01.837335+00:00", - "2024-03-26 14:29:45.379790+00:00", - "2024-03-26 14:29:47.853980+00:00", - ], - "binding_id": [4079, 4083, 4143, 4318, 4327], - "promoter_id": [4, 4, 4, 4, 4], - "background_id": [6, 6, 6, 6, 6], - "fileformat_id": [5, 5, 5, 5, 5], - "file": [ - "promotersetsig/10690.csv.gz", - "promotersetsig/10694.csv.gz", - "promotersetsig/10754.csv.gz", - "promotersetsig/10929.csv.gz", - "promotersetsig/10939.csv.gz", - ], - } - - # Create a DataFrame - df = pd.DataFrame(data) - - # Convert the DataFrame to CSV and compress it using gzip - csv_buffer = BytesIO() - with gzip.GzipFile(fileobj=csv_buffer, mode="w") as gz: - df.to_csv(gz, index=False) - - # Get the gzipped data as bytes - return csv_buffer.getvalue() - - -class ConcreteRecordsAndFilesAPI(AbstractRecordsAndFilesAPI): - """Concrete implementation of AbstractRecordsAndFilesAPI for testing purposes.""" - - def create(self, data: dict[str, Any], **kwargs) -> Any: - pass - - def update(self, df: Any, **kwargs) -> Any: - pass - - def delete(self, id: str, **kwargs) -> Any: - pass - - def submit(self, post_dict: dict, **kwargs) -> Any: - pass # Implement for testing if necessary - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - pass # Implement for testing if necessary - - -@pytest.fixture -@responses.activate -def api_client(): - valid_url = "http://127.0.0.1:8001/api/promotersetsig" - responses.add(responses.HEAD, valid_url, status=200) - return ConcreteRecordsAndFilesAPI(url=valid_url, token="my_token") - - -@pytest.mark.asyncio -async def test_read_without_files(snapshot, api_client): - with aioresponses() as m: - # Mock the HTTP response with the saved snapshot response - m.get( - "http://127.0.0.1:8001/api/promotersetsig/export", - status=200, - body=promotersetsig_csv_gzip(), - 
headers={"Content-Type": "application/gzip"}, - ) - - result = await api_client.read() - assert isinstance(result.get("metadata"), pd.DataFrame) - assert result.get("metadata").shape == ( - 5, - 10, - ) - - -# chatGPT and I went through many iterations of trying to mock two endpoints at once. -# no success. the retrieve_files is untested outside of the tutorial notebook as a -# result -# -# @pytest.mark.asyncio -# async def test_read_with_responses(snapshot, api_client): -# with responses.RequestsMock() as rsps: -# # Mock the /export endpoint -# rsps.add( -# responses.GET, -# "http://127.0.0.1:8001/api/promotersetsig/export", -# body=promotersetsig_csv_gzip(), -# status=200, -# content_type="text/csv", -# ) - -# # Path to the tar.gz file -# tar_gz_file_path = os.path.join( -# os.path.dirname(__file__), -# "snapshots", -# "promotersetsig_records_and_files.tar.gz", -# ) - -# # Read the content of the tar.gz file -# with open(tar_gz_file_path, "rb") as tar_gz_file: -# tar_gz_content = tar_gz_file.read() - -# # Mock the /record_table_and_files endpoint -# rsps.add( -# responses.GET, -# "http://127.0.0.1:8001/api/promotersetsig/record_table_and_files", -# body=tar_gz_content, -# status=200, -# content_type="application/gzip", -# ) - -# # Helper function to create a mock ClientResponse -# async def create_mock_response(url, method, body, content_type, status): -# return MockClientResponse( -# method, URL(url), status, {"Content-Type": content_type}, body -# ) - -# # Patch aiohttp.ClientSession.get to use our mocked responses -# async def mock_get(self, url, **kwargs): -# if "export" in url: -# return await create_mock_response( -# url, -# "GET", -# promotersetsig_csv_gzip().encode(), -# "text/csv", -# 200, -# ) -# elif "record_table_and_files" in url: -# return await create_mock_response( -# url, -# "GET", -# tar_gz_content, -# "application/gzip", -# 200, -# ) -# else: -# raise ValueError("Unexpected URL") - -# with patch("aiohttp.ClientSession.get", new=mock_get): -# # Test the read method without retrieving files -# result = await api_client.read() -# assert isinstance(result.get("metadata"), pd.DataFrame) -# assert result.get("metadata").shape == (5, 10) - -# # Test the read method with retrieving files -# result = await api_client.read(retrieve_files=True) -# assert isinstance(result.get("metadata"), pd.DataFrame) -# assert result.get("metadata").shape == (5, 10) -# assert isinstance(result.get("data"), dict) -# assert len(result.get("data")) == 5 -# assert all(isinstance(v, pd.DataFrame) \ -# for v in result.get("data").values()) - -# test the _detect_delimiter method #### - - -def test_detect_delimiter_errors(api_client): - # test that a FileNotFound error is raised if the file does not exist - with pytest.raises(FileNotFoundError): - api_client._detect_delimiter("non_existent_file.csv") - - with NamedTemporaryFile(mode="w", suffix=".csv.gz") as tmpfile: - tmpfile.write("col1,col2,col3\nval1,val2,val3") - tmpfile.flush() - tmpfile_path = tmpfile.name - - with pytest.raises(gzip.BadGzipFile): - api_client._detect_delimiter(tmpfile_path) - - -def test_comma_delimiter(api_client): - with NamedTemporaryFile(mode="w", suffix=".csv") as tmpfile: - tmpfile.write("col1,col2,col3\nval1,val2,val3") - tmpfile.flush() - tmpfile_path = tmpfile.name - - delimiter = api_client._detect_delimiter(tmpfile_path) - assert delimiter == "," - - -def test_tab_delimiter(api_client): - with NamedTemporaryFile(mode="w", suffix=".csv") as tmpfile: - tmpfile.write("col1\tcol2\tcol3\nval1\tval2\tval3") - 
tmpfile.flush() - tmpfile_path = tmpfile.name - - delimiter = api_client._detect_delimiter(tmpfile_path) - assert delimiter == "\t" - - -def test_space_delimiter(api_client): - with NamedTemporaryFile(mode="w", suffix=".csv") as tmpfile: - tmpfile.write("col1 col2 col3\nval1 val2 val3") - tmpfile.flush() - tmpfile_path = tmpfile.name - - delimiter = api_client._detect_delimiter(tmpfile_path) - assert delimiter == " " - - -def test_gzipped_file(api_client): - with NamedTemporaryFile(suffix=".csv.gz") as tmpfile: - with gzip.open(tmpfile.name, "wt") as gzfile: - gzfile.write("col1,col2,col3\nval1,val2,val3") - gzfile.flush() - tmpfile_path = tmpfile.name - - delimiter = api_client._detect_delimiter(tmpfile_path) - assert delimiter == "," diff --git a/tfbpapi/tests/test_AbstractRecordsOnlyAPI.py b/tfbpapi/tests/test_AbstractRecordsOnlyAPI.py deleted file mode 100644 index 1def39a..0000000 --- a/tfbpapi/tests/test_AbstractRecordsOnlyAPI.py +++ /dev/null @@ -1,71 +0,0 @@ -import gzip -from typing import Any - -import pandas as pd -import pytest -import responses -from aioresponses import aioresponses - -from tfbpapi.AbstractRecordsOnlyAPI import AbstractRecordsOnlyAPI - - -class ConcreteAPI(AbstractRecordsOnlyAPI): - """Concrete implementation of AbstractRecordsOnlyAPI for testing purposes.""" - - def create(self, data: dict[str, Any], **kwargs) -> Any: - pass # Implement for testing if necessary - - def update(self, df: Any, **kwargs) -> Any: - pass # Implement for testing if necessary - - def delete(self, id: str, **kwargs) -> Any: - pass # Implement for testing if necessary - - def submit(self, post_dict: dict, **kwargs) -> Any: - pass # Implement for testing if necessary - - def retrieve( - self, group_task_id: str, timeout: int, polling_interval: int, **kwargs - ) -> Any: - pass # Implement for testing if necessary - - -@pytest.fixture -@responses.activate -def api_client(): - valid_url = "https://example.com/api/endpoint" - responses.add(responses.HEAD, valid_url, status=200) - return ConcreteAPI(url=valid_url, token="my_token") - - -@pytest.mark.asyncio -async def test_read(snapshot, api_client): - with aioresponses() as m: - # Mocking the response - mocked_csv = ( - "id,uploader_id,upload_date,modifier_id,modified_date,binding_id,promoter_id,background_id,fileformat_id,file\n" # noqa: E501 - "10690,1,2024-03-26,1,2024-03-26 14:28:43.825628+00:00,4079,4,6,5,promotersetsig/10690.csv.gz\n" # noqa: E501 - "10694,1,2024-03-26,1,2024-03-26 14:28:44.739775+00:00,4083,4,6,5,promotersetsig/10694.csv.gz\n" # noqa: E501 - "10754,1,2024-03-26,1,2024-03-26 14:29:01.837335+00:00,4143,4,6,5,promotersetsig/10754.csv.gz\n" # noqa: E501 - "10929,1,2024-03-26,1,2024-03-26 14:29:45.379790+00:00,4318,4,6,5,promotersetsig/10929.csv.gz\n" # noqa: E501 - "10939,1,2024-03-26,1,2024-03-26 14:29:47.853980+00:00,4327,4,6,5,promotersetsig/10939.csv.gz" # noqa: E501 - ) - - # Convert to bytes and gzip the content - gzipped_csv = gzip.compress(mocked_csv.encode("utf-8")) - - m.get( - "https://example.com/api/endpoint/export", - status=200, - body=gzipped_csv, - headers={"Content-Type": "application/gzip"}, - ) - - result = await api_client.read() - assert isinstance(result, dict) - assert isinstance(result.get("metadata"), pd.DataFrame) - assert result.get("metadata").shape == (5, 10) # type: ignore - - -if __name__ == "__main__": - pytest.main() diff --git a/tfbpapi/tests/test_Cache.py b/tfbpapi/tests/test_Cache.py deleted file mode 100644 index a84eb37..0000000 --- a/tfbpapi/tests/test_Cache.py +++ /dev/null @@ 
-1,66 +0,0 @@ -import time - -import pytest - -from tfbpapi.Cache import Cache - - -def test_cache_set_and_get(): - cache = Cache() - cache.set("key1", "value1") - assert cache.get("key1") == "value1" - assert cache.get("key2", "default_value") == "default_value" - - -def test_cache_list(): - cache = Cache() - cache.set("key1", "value1") - cache.set("key2", "value2") - keys = cache.list() - assert "key1" in keys - assert "key2" in keys - - -def test_cache_delete(): - cache = Cache() - cache.set("key1", "value1") - cache.set("key2", "value2") - cache.delete("key1") - assert cache.get("key1") is None - assert cache.get("key2") == "value2" - - -def test_cache_ttl(): - cache = Cache(ttl=1) # TTL set to 1 second - cache.set("key1", "value1") - time.sleep(1.5) # Wait for TTL to expire - assert cache.get("key1") is None # Should be None after TTL expiry - - -def test_cache_lru(): - cache = Cache(maxsize=2) - cache.set("key1", "value1") - cache.set("key2", "value2") - cache.set("key3", "value3") # This should evict "key1" if LRU works - assert cache.get("key1") is None - assert cache.get("key2") == "value2" - assert cache.get("key3") == "value3" - - -def test_separate_cache_instances(): - cache1 = Cache() - cache2 = Cache() - - cache1.set("key1", "value1") - cache2.set("key2", "value2") - - # Ensure they don't share state - assert cache1.get("key1") == "value1" - assert cache1.get("key2") is None - - assert cache2.get("key2") == "value2" - assert cache2.get("key1") is None - - -if __name__ == "__main__": - pytest.main() diff --git a/tfbpapi/tests/test_ParamsDict.py b/tfbpapi/tests/test_ParamsDict.py deleted file mode 100644 index ee5a246..0000000 --- a/tfbpapi/tests/test_ParamsDict.py +++ /dev/null @@ -1,96 +0,0 @@ -import pytest -import requests # type: ignore -import responses - -from tfbpapi.ParamsDict import ParamsDict - - -def test_initialization(): - params = ParamsDict({"b": 2, "a": 1}, valid_keys=["a", "b"]) - assert params == {"a": 1, "b": 2} - - -def test_getitem(): - params = ParamsDict({"a": 1, "b": 2}, valid_keys=["a", "b"]) - assert params["a"] == 1 - assert params[["a", "b"]] == ParamsDict({"a": 1, "b": 2}) - with pytest.raises(KeyError): - _ = params["123"] # Changed from 123 to '123' - - -def test_setitem(): - params = ParamsDict({"a": 1}, valid_keys=["a", "b", "c", "d"]) - params.update({"b": 2}) - assert params == {"a": 1, "b": 2} - - params[["c", "d"]] = [3, 4] - assert params == {"a": 1, "b": 2, "c": 3, "d": 4} - - with pytest.raises(ValueError): - params[["e", "f"]] = [5] - - with pytest.raises(KeyError): - params[123] = 5 # type: ignore - - with pytest.raises(KeyError): - params.update({"d": 4, "e": 5}) - - -def test_delitem(): - params = ParamsDict({"a": 1, "b": 2}, valid_keys=["a", "b"]) - del params["a"] - assert params == {"b": 2} - with pytest.raises(KeyError): - del params["123"] # Changed from 123 to '123' - - -def test_repr(): - params = ParamsDict({"a": 1, "b": 2}, valid_keys=["a", "b"]) - assert repr(params) == "ParamsDict({'a': 1, 'b': 2})" - - -def test_str(): - params = ParamsDict({"a": 1, "b": 2}, valid_keys=["a", "b"]) - assert str(params) == "a: 1, b: 2" - - -def test_len(): - params = ParamsDict({"a": 1, "b": 2}, valid_keys=["a", "b", "c"]) - assert len(params) == 2 - params["c"] = 3 - assert len(params) == 3 - - -def test_keys_values_items(): - params = ParamsDict({"a": 1, "b": 2}, valid_keys=["a", "b"]) - assert set(params.keys()) == {"a", "b"} - assert set(params.values()) == {1, 2} - assert set(params.items()) == {("a", 1), ("b", 2)} - - -def 
test_clear(): - params = ParamsDict({"a": 1, "b": 2}, valid_keys=["a", "b"]) - params.clear() - assert len(params) == 0 - - -def test_as_dict(): - params = ParamsDict({"a": 1, "b": 2}, valid_keys=["a", "b"]) - assert params.as_dict() == {"a": 1, "b": 2} - - -@responses.activate -def test_requests_integration(): - params = ParamsDict({"a": 1, "b": 2}, valid_keys=["a", "b"]) - - url = "https://httpbin.org/get" - responses.add(responses.GET, url, json={"args": {"a": "1", "b": "2"}}, status=200) - - response = requests.get(url, params=params) - assert response.status_code == 200 - response_json = response.json() - assert response_json["args"] == {"a": "1", "b": "2"} - - -if __name__ == "__main__": - pytest.main() diff --git a/tfbpapi/tests/test_datacard.py b/tfbpapi/tests/test_datacard.py new file mode 100644 index 0000000..01b2c0b --- /dev/null +++ b/tfbpapi/tests/test_datacard.py @@ -0,0 +1,449 @@ +"""Tests for the DataCard class.""" + +from unittest.mock import Mock, patch + +import pytest + +from tfbpapi import DataCard +from tfbpapi.errors import DataCardError, DataCardValidationError, HfDataFetchError +from tfbpapi.models import DatasetType + + +class TestDataCard: + """Test suite for DataCard class.""" + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_init( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + test_token, + ): + """Test DataCard initialization.""" + datacard = DataCard(test_repo_id, token=test_token) + + assert datacard.repo_id == test_repo_id + assert datacard.token == test_token + assert datacard._dataset_card is None + assert datacard._metadata_cache == {} + + # Check that fetchers were initialized + mock_card_fetcher.assert_called_once_with(token=test_token) + mock_structure_fetcher.assert_called_once_with(token=test_token) + mock_size_fetcher.assert_called_once_with(token=test_token) + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_init_without_token( + self, mock_size_fetcher, mock_structure_fetcher, mock_card_fetcher, test_repo_id + ): + """Test DataCard initialization without token.""" + datacard = DataCard(test_repo_id) + + assert datacard.repo_id == test_repo_id + assert datacard.token is None + + # Check that fetchers were initialized without token + mock_card_fetcher.assert_called_once_with(token=None) + mock_structure_fetcher.assert_called_once_with(token=None) + mock_size_fetcher.assert_called_once_with(token=None) + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_load_and_validate_card_success( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Test successful card loading and validation.""" + # Setup mock + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + + # Access dataset_card property to trigger loading + card = datacard.dataset_card + + assert card is not None + assert len(card.configs) == 4 + assert card.pretty_name == "Test Genomics Dataset" + mock_fetcher_instance.fetch.assert_called_once_with(test_repo_id) + + @patch("tfbpapi.datacard.HfDataCardFetcher") + 
@patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_load_card_no_data( + self, mock_size_fetcher, mock_structure_fetcher, mock_card_fetcher, test_repo_id + ): + """Test handling when no dataset card is found.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = {} + + datacard = DataCard(test_repo_id) + + with pytest.raises(DataCardValidationError, match="No dataset card found"): + _ = datacard.dataset_card + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_load_card_validation_error( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + invalid_dataset_card_data, + ): + """Test handling of validation errors.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = invalid_dataset_card_data + + datacard = DataCard(test_repo_id) + + with pytest.raises( + DataCardValidationError, match="Dataset card validation failed" + ): + _ = datacard.dataset_card + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_load_card_fetch_error( + self, mock_size_fetcher, mock_structure_fetcher, mock_card_fetcher, test_repo_id + ): + """Test handling of fetch errors.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.side_effect = HfDataFetchError("Fetch failed") + + datacard = DataCard(test_repo_id) + + with pytest.raises(DataCardError, match="Failed to fetch dataset card"): + _ = datacard.dataset_card + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_configs_property( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Test getting all configurations via property.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + configs = datacard.configs + + assert len(configs) == 4 + config_names = [config.config_name for config in configs] + assert "genomic_features" in config_names + assert "binding_data" in config_names + assert "genome_map_data" in config_names + assert "experiment_metadata" in config_names + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_get_config_by_name( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Test getting a specific configuration by name.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + + config = datacard.get_config("binding_data") + assert config is not None + assert config.config_name == "binding_data" + assert config.dataset_type == DatasetType.ANNOTATED_FEATURES + + # Test non-existent config + assert datacard.get_config("nonexistent") is None + + 
@patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_get_metadata_relationships( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Test getting metadata relationships.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + + relationships = datacard.get_metadata_relationships() + + # Should have explicit relationship between binding_data and experiment_metadata + explicit_rels = [r for r in relationships if r.relationship_type == "explicit"] + assert len(explicit_rels) == 1 + assert explicit_rels[0].data_config == "binding_data" + assert explicit_rels[0].metadata_config == "experiment_metadata" + + # Should have embedded relationship for binding_data (has metadata_fields) + embedded_rels = [r for r in relationships if r.relationship_type == "embedded"] + assert len(embedded_rels) == 1 + assert embedded_rels[0].data_config == "binding_data" + assert embedded_rels[0].metadata_config == "binding_data_embedded" + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_get_repository_info_success( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + sample_repo_structure, + ): + """Test getting repository information.""" + mock_card_fetcher_instance = Mock() + mock_structure_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_card_fetcher_instance + mock_structure_fetcher.return_value = mock_structure_fetcher_instance + + mock_card_fetcher_instance.fetch.return_value = sample_dataset_card_data + mock_structure_fetcher_instance.fetch.return_value = sample_repo_structure + + datacard = DataCard(test_repo_id) + + info = datacard.get_repository_info() + + assert info["repo_id"] == test_repo_id + assert info["pretty_name"] == "Test Genomics Dataset" + assert info["license"] == "mit" + assert info["num_configs"] == 4 + assert "genomic_features" in info["dataset_types"] + assert "annotated_features" in info["dataset_types"] + assert "genome_map" in info["dataset_types"] + assert "metadata" in info["dataset_types"] + assert info["total_files"] == 5 + assert info["last_modified"] == "2023-12-01T10:30:00Z" + assert info["has_default_config"] is True + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_get_repository_info_fetch_error( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Test getting repository info when structure fetch fails.""" + mock_card_fetcher_instance = Mock() + mock_structure_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_card_fetcher_instance + mock_structure_fetcher.return_value = mock_structure_fetcher_instance + + mock_card_fetcher_instance.fetch.return_value = sample_dataset_card_data + mock_structure_fetcher_instance.fetch.side_effect = HfDataFetchError( + "Structure fetch failed" + ) + + datacard = DataCard(test_repo_id) + + info = datacard.get_repository_info() + + assert info["repo_id"] == test_repo_id + assert info["total_files"] is None + assert info["last_modified"] 
is None + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_summary( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + sample_repo_structure, + ): + """Test getting a summary of the dataset.""" + mock_card_fetcher_instance = Mock() + mock_structure_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_card_fetcher_instance + mock_structure_fetcher.return_value = mock_structure_fetcher_instance + + mock_card_fetcher_instance.fetch.return_value = sample_dataset_card_data + mock_structure_fetcher_instance.fetch.return_value = sample_repo_structure + + datacard = DataCard(test_repo_id) + + summary = datacard.summary() + + assert "Dataset: Test Genomics Dataset" in summary + assert f"Repository: {test_repo_id}" in summary + assert "License: mit" in summary + assert "Configurations: 4" in summary + assert "genomic_features" in summary + assert "binding_data" in summary + assert "genome_map_data" in summary + assert "experiment_metadata" in summary + assert "(default)" in summary # genomic_features is marked as default + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_extract_partition_values( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Test extracting partition values.""" + mock_card_fetcher_instance = Mock() + mock_structure_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_card_fetcher_instance + mock_structure_fetcher.return_value = mock_structure_fetcher_instance + + mock_card_fetcher_instance.fetch.return_value = sample_dataset_card_data + mock_structure_fetcher_instance.get_partition_values.return_value = [ + "TF1", + "TF2", + "TF3", + ] + + datacard = DataCard(test_repo_id) + + # Get the genome_map_data config which has partitioning enabled + config = datacard.get_config("genome_map_data") + assert config is not None + assert config.dataset_info.partitioning.enabled is True + + values = datacard._extract_partition_values(config, "regulator") + assert values == {"TF1", "TF2", "TF3"} + mock_structure_fetcher_instance.get_partition_values.assert_called_once_with( + test_repo_id, "regulator" + ) + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_extract_partition_values_no_partitioning( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Test extracting partition values when partitioning is disabled.""" + mock_card_fetcher_instance = Mock() + mock_structure_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_card_fetcher_instance + mock_structure_fetcher.return_value = mock_structure_fetcher_instance + + mock_card_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + + # Get a config without partitioning + config = datacard.get_config("genomic_features") + assert config is not None + assert config.dataset_info.partitioning is None + + values = datacard._extract_partition_values(config, "some_field") + assert values == set() + mock_structure_fetcher_instance.get_partition_values.assert_not_called() + + @patch("tfbpapi.datacard.HfDataCardFetcher") + 
@patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_extract_partition_values_field_not_in_partitions( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Test extracting partition values when field is not a partition column.""" + mock_card_fetcher_instance = Mock() + mock_structure_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_card_fetcher_instance + mock_structure_fetcher.return_value = mock_structure_fetcher_instance + + mock_card_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + + # Get the genome_map_data config which has partitioning enabled + config = datacard.get_config("genome_map_data") + assert config is not None + + # Try to extract values for a field that's not in partition_by + values = datacard._extract_partition_values(config, "not_a_partition_field") + assert values == set() + mock_structure_fetcher_instance.get_partition_values.assert_not_called() + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_extract_partition_values_fetch_error( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Test extracting partition values when fetch fails.""" + mock_card_fetcher_instance = Mock() + mock_structure_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_card_fetcher_instance + mock_structure_fetcher.return_value = mock_structure_fetcher_instance + + mock_card_fetcher_instance.fetch.return_value = sample_dataset_card_data + mock_structure_fetcher_instance.get_partition_values.side_effect = ( + HfDataFetchError("Fetch failed") + ) + + datacard = DataCard(test_repo_id) + + config = datacard.get_config("genome_map_data") + values = datacard._extract_partition_values(config, "regulator") + + # Should return empty set on error + assert values == set() diff --git a/tfbpapi/tests/test_datacard_parsing.py b/tfbpapi/tests/test_datacard_parsing.py new file mode 100644 index 0000000..5d2210e --- /dev/null +++ b/tfbpapi/tests/test_datacard_parsing.py @@ -0,0 +1,169 @@ +"""Test script to verify datacard parsing with new environmental_conditions.""" + +import yaml + +from tfbpapi.models import DatasetCard +from tfbpapi.tests.example_datacards import ( + EXAMPLE_1_SIMPLE_TOPLEVEL, + EXAMPLE_2_COMPLEX_FIELD_DEFINITIONS, + EXAMPLE_3_PARTITIONED_WITH_METADATA, +) + + +def test_example_1(): + """Test parsing example 1: simple top-level conditions.""" + print("=" * 80) + print("Testing Example 1: Simple Top-Level Conditions") + print("=" * 80) + + # Extract YAML from markdown + yaml_content = EXAMPLE_1_SIMPLE_TOPLEVEL.split("---")[1] + data = yaml.safe_load(yaml_content) + + try: + card = DatasetCard(**data) + print("✓ Successfully parsed Example 1") + print(f" - Configs: {len(card.configs)}") + print( + " - Top-level experimental_conditions: " + f"{card.experimental_conditions is not None}" + ) + + if card.experimental_conditions: + env_cond = card.experimental_conditions.environmental_conditions + if env_cond: + print(f" - Temperature: {env_cond.temperature_celsius}°C") + print(f" - Cultivation: {env_cond.cultivation_method}") + if env_cond.media: + print(f" - Media: {env_cond.media.name}") + print(f" - Carbon sources: {len(env_cond.media.carbon_source)}") + print( + f" - Nitrogen sources: 
{len(env_cond.media.nitrogen_source)}" + ) + + # Check field-level definitions + config = card.configs[0] + for feature in config.dataset_info.features: + if feature.definitions: + print( + f" - Feature '{feature.name}' has " + f"{len(feature.definitions)} definitions" + ) + for def_name in feature.definitions.keys(): + print(f" - {def_name}") + + print() + return True + except Exception as e: + print(f"✗ Failed to parse Example 1: {e}") + import traceback + + traceback.print_exc() + print() + return False + + +def test_example_2(): + """Test parsing example 2: complex field-level definitions.""" + print("=" * 80) + print("Testing Example 2: Complex Field-Level Definitions") + print("=" * 80) + + yaml_content = EXAMPLE_2_COMPLEX_FIELD_DEFINITIONS.split("---")[1] + data = yaml.safe_load(yaml_content) + + try: + card = DatasetCard(**data) + print("✓ Successfully parsed Example 2") + print(f" - Configs: {len(card.configs)}") + print(f" - Strain information: {card.strain_information is not None}") + + # Check field-level definitions + config = card.configs[0] + for feature in config.dataset_info.features: + if feature.definitions: + print( + f" - Feature '{feature.name}' has " + f"{len(feature.definitions)} definitions:" + ) + for def_name, def_value in feature.definitions.items(): + print(f" - {def_name}") + if "environmental_conditions" in def_value: + env = def_value["environmental_conditions"] + if "temperature_celsius" in env: + print(f" Temperature: {env['temperature_celsius']}°C") + if "media" in env: + print(f" Media: {env['media']['name']}") + + print() + return True + except Exception as e: + print(f"✗ Failed to parse Example 2: {e}") + import traceback + + traceback.print_exc() + print() + return False + + +def test_example_3(): + """Test parsing example 3: partitioned with metadata.""" + print("=" * 80) + print("Testing Example 3: Partitioned with Metadata") + print("=" * 80) + + yaml_content = EXAMPLE_3_PARTITIONED_WITH_METADATA.split("---")[1] + data = yaml.safe_load(yaml_content) + + try: + card = DatasetCard(**data) + print("✓ Successfully parsed Example 3") + print(f" - Configs: {len(card.configs)}") + print( + " - Top-level experimental_conditions: " + f"{card.experimental_conditions is not None}" + ) + + if card.experimental_conditions: + env_cond = card.experimental_conditions.environmental_conditions + if env_cond and env_cond.media: + print(f" - Top-level media: {env_cond.media.name}") + + # Check config-level experimental_conditions + for config in card.configs: + if config.experimental_conditions: + print(f" - Config '{config.config_name}' has experimental_conditions") + env_cond = config.experimental_conditions.environmental_conditions + if env_cond and env_cond.media: + print(f" - Media: {env_cond.media.name}") + print(f" - Temperature: {env_cond.temperature_celsius}°C") + + print() + return True + except Exception as e: + print(f"✗ Failed to parse Example 3: {e}") + import traceback + + traceback.print_exc() + print() + return False + + +if __name__ == "__main__": + results = [] + + results.append(test_example_1()) + results.append(test_example_2()) + results.append(test_example_3()) + + print("=" * 80) + print("Summary") + print("=" * 80) + print(f"Passed: {sum(results)}/{len(results)}") + + if all(results): + print("\n✓ All tests passed!") + exit(0) + else: + print("\n✗ Some tests failed") + exit(1) diff --git a/tfbpapi/tests/test_fetchers.py b/tfbpapi/tests/test_fetchers.py new file mode 100644 index 0000000..ac350f5 --- /dev/null +++ 
b/tfbpapi/tests/test_fetchers.py @@ -0,0 +1,435 @@ +"""Tests for datainfo fetcher classes.""" + +from unittest.mock import Mock, patch + +import pytest +import requests +from requests import HTTPError + +from tfbpapi.fetchers import ( + HfDataCardFetcher, + HfRepoStructureFetcher, + HfSizeInfoFetcher, +) +from tfbpapi.errors import HfDataFetchError + + +class TestHfDataCardFetcher: + """Test HfDataCardFetcher class.""" + + def test_init_with_token(self, test_token): + """Test initialization with token.""" + fetcher = HfDataCardFetcher(token=test_token) + assert fetcher.token == test_token + + def test_init_without_token(self): + """Test initialization without token.""" + with patch.dict("os.environ", {}, clear=True): + fetcher = HfDataCardFetcher() + assert fetcher.token is None + + def test_init_with_env_token(self, test_token): + """Test initialization with environment token.""" + with patch.dict("os.environ", {"HF_TOKEN": test_token}): + fetcher = HfDataCardFetcher() + assert fetcher.token == test_token + + @patch("tfbpapi.fetchers.DatasetCard") + def test_fetch_success( + self, mock_dataset_card, test_repo_id, sample_dataset_card_data + ): + """Test successful dataset card fetch.""" + # Setup mock + mock_card = Mock() + mock_card.data.to_dict.return_value = sample_dataset_card_data + mock_dataset_card.load.return_value = mock_card + + fetcher = HfDataCardFetcher(token="test_token") + result = fetcher.fetch(test_repo_id) + + assert result == sample_dataset_card_data + mock_dataset_card.load.assert_called_once_with( + test_repo_id, repo_type="dataset", token="test_token" + ) + + @patch("tfbpapi.fetchers.DatasetCard") + def test_fetch_no_data_section(self, mock_dataset_card, test_repo_id): + """Test fetch when dataset card has no data section.""" + # Setup mock with no data + mock_card = Mock() + mock_card.data = None + mock_dataset_card.load.return_value = mock_card + + fetcher = HfDataCardFetcher() + result = fetcher.fetch(test_repo_id) + + assert result == {} + + @patch("tfbpapi.fetchers.DatasetCard") + def test_fetch_exception(self, mock_dataset_card, test_repo_id): + """Test fetch when DatasetCard.load raises exception.""" + mock_dataset_card.load.side_effect = Exception("API Error") + + fetcher = HfDataCardFetcher() + + with pytest.raises(HfDataFetchError, match="Failed to fetch dataset card"): + fetcher.fetch(test_repo_id) + + def test_fetch_different_repo_types(self, sample_dataset_card_data): + """Test fetch with different repository types.""" + with patch("tfbpapi.fetchers.DatasetCard") as mock_dataset_card: + mock_card = Mock() + mock_card.data.to_dict.return_value = sample_dataset_card_data + mock_dataset_card.load.return_value = mock_card + + fetcher = HfDataCardFetcher() + + # Test with model repo + fetcher.fetch("test/repo", repo_type="model") + mock_dataset_card.load.assert_called_with( + "test/repo", repo_type="model", token=None + ) + + # Test with space repo + fetcher.fetch("test/repo", repo_type="space") + mock_dataset_card.load.assert_called_with( + "test/repo", repo_type="space", token=None + ) + + +class TestHfSizeInfoFetcher: + """Test HfSizeInfoFetcher class.""" + + def test_init(self, test_token): + """Test initialization.""" + fetcher = HfSizeInfoFetcher(token=test_token) + assert fetcher.token == test_token + assert fetcher.base_url == "https://datasets-server.huggingface.co" + + def test_build_headers_with_token(self, test_token): + """Test building headers with token.""" + fetcher = HfSizeInfoFetcher(token=test_token) + headers = fetcher._build_headers() + + 
assert headers["User-Agent"] == "TFBP-API/1.0" + assert headers["Authorization"] == f"Bearer {test_token}" + + def test_build_headers_without_token(self): + """Test building headers without token.""" + fetcher = HfSizeInfoFetcher() + headers = fetcher._build_headers() + + assert headers["User-Agent"] == "TFBP-API/1.0" + assert "Authorization" not in headers + + @patch("tfbpapi.fetchers.requests.get") + def test_fetch_success(self, mock_get, test_repo_id, sample_size_info): + """Test successful size info fetch.""" + # Setup mock response + mock_response = Mock() + mock_response.json.return_value = sample_size_info + mock_get.return_value = mock_response + + fetcher = HfSizeInfoFetcher(token="test_token") + result = fetcher.fetch(test_repo_id) + + assert result == sample_size_info + mock_get.assert_called_once() + + # Check call arguments + call_args = mock_get.call_args + assert call_args[1]["params"]["dataset"] == test_repo_id + assert call_args[1]["headers"]["Authorization"] == "Bearer test_token" + assert call_args[1]["timeout"] == 30 + + @patch("tfbpapi.fetchers.requests.get") + def test_fetch_404_error(self, mock_get, test_repo_id): + """Test fetch with 404 error.""" + # Setup mock 404 response + mock_response = Mock() + mock_response.status_code = 404 + error = HTTPError(response=mock_response) + mock_get.side_effect = error + + fetcher = HfSizeInfoFetcher() + + with pytest.raises(HfDataFetchError, match="Dataset .* not found"): + fetcher.fetch(test_repo_id) + + @patch("tfbpapi.fetchers.requests.get") + def test_fetch_403_error(self, mock_get, test_repo_id): + """Test fetch with 403 error.""" + # Setup mock 403 response + mock_response = Mock() + mock_response.status_code = 403 + error = HTTPError(response=mock_response) + mock_get.side_effect = error + + fetcher = HfSizeInfoFetcher() + + with pytest.raises( + HfDataFetchError, match="Access denied.*check token permissions" + ): + fetcher.fetch(test_repo_id) + + @patch("tfbpapi.fetchers.requests.get") + def test_fetch_other_http_error(self, mock_get, test_repo_id): + """Test fetch with other HTTP error.""" + # Setup mock 500 response + mock_response = Mock() + mock_response.status_code = 500 + error = HTTPError(response=mock_response) + mock_get.side_effect = error + + fetcher = HfSizeInfoFetcher() + + with pytest.raises(HfDataFetchError, match="HTTP error fetching size"): + fetcher.fetch(test_repo_id) + + @patch("tfbpapi.fetchers.requests.get") + def test_fetch_request_exception(self, mock_get, test_repo_id): + """Test fetch with request exception.""" + mock_get.side_effect = requests.RequestException("Network error") + + fetcher = HfSizeInfoFetcher() + + with pytest.raises(HfDataFetchError, match="Request failed fetching size"): + fetcher.fetch(test_repo_id) + + @patch("tfbpapi.fetchers.requests.get") + def test_fetch_json_decode_error(self, mock_get, test_repo_id): + """Test fetch with JSON decode error.""" + # Setup mock response with invalid JSON + mock_response = Mock() + mock_response.json.side_effect = ValueError("Invalid JSON") + mock_get.return_value = mock_response + + fetcher = HfSizeInfoFetcher() + + with pytest.raises(HfDataFetchError, match="Invalid JSON response"): + fetcher.fetch(test_repo_id) + + +class TestHfRepoStructureFetcher: + """Test HfRepoStructureFetcher class.""" + + def test_init(self, test_token): + """Test initialization.""" + fetcher = HfRepoStructureFetcher(token=test_token) + assert fetcher.token == test_token + assert fetcher._cached_structure == {} + + @patch("tfbpapi.fetchers.repo_info") + def 
test_fetch_success(self, mock_repo_info, test_repo_id, sample_repo_structure): + """Test successful repository structure fetch.""" + # Setup mock repo info + mock_info = Mock() + mock_info.siblings = [ + Mock(rfilename="features.parquet", size=2048000, lfs=Mock()), + Mock(rfilename="binding/part1.parquet", size=1024000, lfs=Mock()), + Mock( + rfilename="tracks/regulator=TF1/experiment=exp1/data.parquet", + size=5120000, + lfs=Mock(), + ), + ] + mock_info.last_modified.isoformat.return_value = "2023-12-01T10:30:00Z" + mock_repo_info.return_value = mock_info + + fetcher = HfRepoStructureFetcher(token="test_token") + result = fetcher.fetch(test_repo_id) + + assert result["repo_id"] == test_repo_id + assert result["total_files"] == 3 + assert len(result["files"]) == 3 + assert result["last_modified"] == "2023-12-01T10:30:00Z" + + # Check that repo_info was called correctly + mock_repo_info.assert_called_once_with( + repo_id=test_repo_id, repo_type="dataset", token="test_token" + ) + + @patch("tfbpapi.fetchers.repo_info") + def test_fetch_with_caching(self, mock_repo_info, test_repo_id): + """Test fetch with caching behavior.""" + # Setup mock + mock_info = Mock() + mock_info.siblings = [] + mock_info.last_modified = None + mock_repo_info.return_value = mock_info + + fetcher = HfRepoStructureFetcher() + + # First fetch + result1 = fetcher.fetch(test_repo_id) + assert mock_repo_info.call_count == 1 + + # Second fetch should use cache + result2 = fetcher.fetch(test_repo_id) + assert mock_repo_info.call_count == 1 # Not called again + assert result1 == result2 + + # Force refresh should call API again + result3 = fetcher.fetch(test_repo_id, force_refresh=True) + assert mock_repo_info.call_count == 2 + + @patch("tfbpapi.fetchers.repo_info") + def test_fetch_siblings_none(self, mock_repo_info, test_repo_id): + """Test fetch when siblings is None.""" + # Setup mock with None siblings + mock_info = Mock() + mock_info.siblings = None + mock_info.last_modified = None + mock_repo_info.return_value = mock_info + + fetcher = HfRepoStructureFetcher() + result = fetcher.fetch(test_repo_id) + + assert result["total_files"] == 0 + assert result["files"] == [] + assert result["partitions"] == {} + + @patch("tfbpapi.fetchers.repo_info") + def test_fetch_exception(self, mock_repo_info, test_repo_id): + """Test fetch when repo_info raises exception.""" + mock_repo_info.side_effect = Exception("API Error") + + fetcher = HfRepoStructureFetcher() + + with pytest.raises(HfDataFetchError, match="Failed to fetch repo structure"): + fetcher.fetch(test_repo_id) + + def test_extract_partition_info(self): + """Test extracting partition information from file paths.""" + fetcher = HfRepoStructureFetcher() + partitions = {} + + # Test normal partition pattern + fetcher._extract_partition_info( + "data/regulator=TF1/condition=control/file.parquet", partitions + ) + assert "regulator" in partitions + assert "TF1" in partitions["regulator"] + assert "condition" in partitions + assert "control" in partitions["condition"] + + # Test multiple values for same partition + fetcher._extract_partition_info( + "data/regulator=TF2/condition=treatment/file.parquet", partitions + ) + assert len(partitions["regulator"]) == 2 + assert "TF2" in partitions["regulator"] + assert "treatment" in partitions["condition"] + + # Test file without partitions + fetcher._extract_partition_info("simple_file.parquet", partitions) + # partitions dict should remain unchanged + assert len(partitions) == 2 + + @patch("tfbpapi.fetchers.repo_info") + def 
test_get_partition_values_success(self, mock_repo_info, test_repo_id): + """Test getting partition values for a specific column.""" + # Setup mock with partitioned files + mock_info = Mock() + mock_info.siblings = [ + Mock(rfilename="data/regulator=TF1/file1.parquet", size=1000, lfs=None), + Mock(rfilename="data/regulator=TF2/file2.parquet", size=1000, lfs=None), + Mock(rfilename="data/regulator=TF3/file3.parquet", size=1000, lfs=None), + ] + mock_info.last_modified = None + mock_repo_info.return_value = mock_info + + fetcher = HfRepoStructureFetcher() + values = fetcher.get_partition_values(test_repo_id, "regulator") + + assert values == ["TF1", "TF2", "TF3"] # Should be sorted + + @patch("tfbpapi.fetchers.repo_info") + def test_get_partition_values_no_partitions(self, mock_repo_info, test_repo_id): + """Test getting partition values when no partitions exist.""" + # Setup mock with no partitioned files + mock_info = Mock() + mock_info.siblings = [ + Mock(rfilename="simple_file.parquet", size=1000, lfs=None), + ] + mock_info.last_modified = None + mock_repo_info.return_value = mock_info + + fetcher = HfRepoStructureFetcher() + values = fetcher.get_partition_values(test_repo_id, "regulator") + + assert values == [] + + @patch("tfbpapi.fetchers.repo_info") + def test_get_dataset_files_all(self, mock_repo_info, test_repo_id): + """Test getting all dataset files.""" + # Setup mock + mock_info = Mock() + mock_info.siblings = [ + Mock(rfilename="file1.parquet", size=1000, lfs=None), + Mock(rfilename="file2.parquet", size=2000, lfs=Mock()), + ] + mock_info.last_modified = None + mock_repo_info.return_value = mock_info + + fetcher = HfRepoStructureFetcher() + files = fetcher.get_dataset_files(test_repo_id) + + assert len(files) == 2 + assert files[0]["path"] == "file1.parquet" + assert files[0]["size"] == 1000 + assert files[0]["is_lfs"] is False + + assert files[1]["path"] == "file2.parquet" + assert files[1]["size"] == 2000 + assert files[1]["is_lfs"] is True + + @patch("tfbpapi.fetchers.repo_info") + def test_get_dataset_files_with_pattern(self, mock_repo_info, test_repo_id): + """Test getting dataset files with path pattern filter.""" + # Setup mock + mock_info = Mock() + mock_info.siblings = [ + Mock(rfilename="data/file1.parquet", size=1000, lfs=None), + Mock(rfilename="metadata/info.json", size=500, lfs=None), + Mock(rfilename="data/file2.parquet", size=2000, lfs=None), + ] + mock_info.last_modified = None + mock_repo_info.return_value = mock_info + + fetcher = HfRepoStructureFetcher() + files = fetcher.get_dataset_files(test_repo_id, path_pattern=r".*\.parquet$") + + assert len(files) == 2 + assert all(f["path"].endswith(".parquet") for f in files) + + def test_get_dataset_files_uses_cache(self): + """Test that get_dataset_files uses fetch caching.""" + fetcher = HfRepoStructureFetcher() + + with patch.object(fetcher, "fetch") as mock_fetch: + mock_fetch.return_value = {"files": []} + + # First call + fetcher.get_dataset_files("test/repo") + mock_fetch.assert_called_with("test/repo", force_refresh=False) + + # Second call with force_refresh + fetcher.get_dataset_files("test/repo", force_refresh=True) + mock_fetch.assert_called_with("test/repo", force_refresh=True) + + def test_get_partition_values_uses_cache(self): + """Test that get_partition_values uses fetch caching.""" + fetcher = HfRepoStructureFetcher() + + with patch.object(fetcher, "fetch") as mock_fetch: + mock_fetch.return_value = {"partitions": {"regulator": {"TF1", "TF2"}}} + + # First call + result = 
fetcher.get_partition_values("test/repo", "regulator") + mock_fetch.assert_called_with("test/repo", force_refresh=False) + assert result == ["TF1", "TF2"] + + # Second call with force_refresh + fetcher.get_partition_values("test/repo", "regulator", force_refresh=True) + mock_fetch.assert_called_with("test/repo", force_refresh=True) diff --git a/tfbpapi/tests/test_hf_cache_manager.py b/tfbpapi/tests/test_hf_cache_manager.py new file mode 100644 index 0000000..aa395df --- /dev/null +++ b/tfbpapi/tests/test_hf_cache_manager.py @@ -0,0 +1,783 @@ +"""Comprehensive tests for HfCacheManager class.""" + +import logging +from datetime import datetime, timedelta +from unittest.mock import Mock, patch + +import duckdb +import pytest + +from tfbpapi.hf_cache_manager import HfCacheManager +from tfbpapi.models import DatasetType + + +class TestHfCacheManagerInit: + """Test HfCacheManager initialization.""" + + def test_init_basic(self): + """Test basic initialization.""" + conn = duckdb.connect(":memory:") + repo_id = "test/repo" + + with patch( + "tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None + ) as mock_datacard_init: + cache_manager = HfCacheManager(repo_id, conn) + # Manually set the properties that would normally + # be set by DataCard.__init__ + cache_manager.repo_id = repo_id + cache_manager.token = None + + assert cache_manager.repo_id == repo_id + assert cache_manager.duckdb_conn == conn + assert cache_manager.token is None + assert cache_manager.logger is not None + # DataCard should be initialized as parent + mock_datacard_init.assert_called_once_with(repo_id, None) + + def test_init_with_token_and_logger(self): + """Test initialization with token and custom logger.""" + conn = duckdb.connect(":memory:") + repo_id = "test/repo" + token = "test_token" + logger = logging.getLogger("test_logger") + + with patch( + "tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None + ) as mock_datacard_init: + cache_manager = HfCacheManager(repo_id, conn, token=token, logger=logger) + # Manually set the properties that would + # normally be set by DataCard.__init__ + cache_manager.repo_id = repo_id + cache_manager.token = token + + assert cache_manager.repo_id == repo_id + assert cache_manager.duckdb_conn == conn + assert cache_manager.token == token + assert cache_manager.logger == logger + # DataCard should be initialized as parent with token + mock_datacard_init.assert_called_once_with(repo_id, token) + + +class TestHfCacheManagerDatacard: + """Test DataCard integration since HfCacheManager now inherits from DataCard.""" + + def test_datacard_inheritance(self): + """Test that HfCacheManager properly inherits from DataCard.""" + conn = duckdb.connect(":memory:") + repo_id = "test/repo" + token = "test_token" + + with patch( + "tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None + ) as mock_datacard_init: + cache_manager = HfCacheManager(repo_id, conn, token=token) + + # DataCard should be initialized during construction + mock_datacard_init.assert_called_once_with(repo_id, token) + + # Should have DataCard methods available (they exist on the class) + assert hasattr(cache_manager, "get_config") + + +class TestHfCacheManagerDuckDBOperations: + """Test DuckDB operations that are still part of HfCacheManager.""" + + @patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None) + def test_create_duckdb_table_from_files_single_file( + self, mock_datacard_init, tmpdir + ): + """Test creating DuckDB table from single parquet file.""" + # Create a mock parquet file + 
parquet_file = tmpdir.join("test.parquet") + parquet_file.write("dummy_content") + + # Use a separate cache manager with mock connection for this test + mock_conn = Mock() + test_cache_manager = HfCacheManager("test/repo", mock_conn) + + # Mock the validation method since we're testing table creation + test_cache_manager._validate_source_sample_fields = Mock() # type: ignore + + test_cache_manager._create_duckdb_table_from_files( + [str(parquet_file)], "test_table", "test_config" + ) + + mock_conn.execute.assert_called_once() + sql_call = mock_conn.execute.call_args[0][0] + assert "CREATE OR REPLACE VIEW test_table" in sql_call + assert str(parquet_file) in sql_call + + @patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None) + def test_create_duckdb_table_from_files_multiple_files( + self, mock_datacard_init, tmpdir + ): + """Test creating DuckDB table from multiple parquet files.""" + # Create mock parquet files + file1 = tmpdir.join("test1.parquet") + file1.write("dummy_content1") + file2 = tmpdir.join("test2.parquet") + file2.write("dummy_content2") + + files = [str(file1), str(file2)] + + # Use a separate cache manager with mock connection for this test + mock_conn = Mock() + test_cache_manager = HfCacheManager("test/repo", mock_conn) + + # Mock the validation method since we're testing table creation + test_cache_manager._validate_source_sample_fields = Mock() # type: ignore + + test_cache_manager._create_duckdb_table_from_files( + files, "test_table", "test_config" + ) + + mock_conn.execute.assert_called_once() + sql_call = mock_conn.execute.call_args[0][0] + assert "CREATE OR REPLACE VIEW test_table" in sql_call + assert str(file1) in sql_call + assert str(file2) in sql_call + + +class TestHfCacheManagerCacheManagement: + """Test cache management functionality.""" + + def setup_method(self): + """Set up test fixtures.""" + with patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None): + self.conn = duckdb.connect(":memory:") + self.repo_id = "test/repo" + self.cache_manager = HfCacheManager(self.repo_id, self.conn) + + def test_parse_size_string(self): + """Test size string parsing.""" + assert self.cache_manager._parse_size_string("10KB") == 10 * 1024 + assert self.cache_manager._parse_size_string("5MB") == 5 * 1024**2 + assert self.cache_manager._parse_size_string("2GB") == 2 * 1024**3 + assert self.cache_manager._parse_size_string("1TB") == 1 * 1024**4 + assert self.cache_manager._parse_size_string("500") == 500 + assert self.cache_manager._parse_size_string("10.5GB") == int(10.5 * 1024**3) + + def test_format_bytes(self): + """Test byte formatting.""" + assert self.cache_manager._format_bytes(0) == "0B" + assert self.cache_manager._format_bytes(1023) == "1023.0B" + assert self.cache_manager._format_bytes(1024) == "1.0KB" + assert self.cache_manager._format_bytes(1024**2) == "1.0MB" + assert self.cache_manager._format_bytes(1024**3) == "1.0GB" + assert self.cache_manager._format_bytes(1024**4) == "1.0TB" + + @patch("tfbpapi.hf_cache_manager.scan_cache_dir") + def test_clean_cache_by_age(self, mock_scan_cache_dir): + """Test age-based cache cleaning.""" + # Setup mock cache info + mock_cache_info = Mock() + mock_revision = Mock() + mock_revision.commit_hash = "abc123" + mock_revision.last_modified = (datetime.now() - timedelta(days=35)).timestamp() + + mock_repo = Mock() + mock_repo.revisions = [mock_revision] + + mock_cache_info.repos = [mock_repo] + mock_delete_strategy = Mock() + mock_delete_strategy.expected_freed_size_str = "100MB" + 
mock_cache_info.delete_revisions.return_value = mock_delete_strategy + + mock_scan_cache_dir.return_value = mock_cache_info + + result = self.cache_manager.clean_cache_by_age(max_age_days=30, dry_run=True) + + assert result == mock_delete_strategy + mock_cache_info.delete_revisions.assert_called_once_with("abc123") + + @patch("tfbpapi.hf_cache_manager.scan_cache_dir") + def test_clean_cache_by_age_no_old_revisions(self, mock_scan_cache_dir): + """Test age-based cleaning when no old revisions exist.""" + mock_cache_info = Mock() + mock_revision = Mock() + mock_revision.commit_hash = "abc123" + mock_revision.last_modified = datetime.now().timestamp() # Recent + + mock_repo = Mock() + mock_repo.revisions = [mock_revision] + + mock_cache_info.repos = [mock_repo] + mock_delete_strategy = Mock() + mock_delete_strategy.expected_freed_size_str = "0B" + mock_cache_info.delete_revisions.return_value = mock_delete_strategy + + mock_scan_cache_dir.return_value = mock_cache_info + + result = self.cache_manager.clean_cache_by_age(max_age_days=30, dry_run=True) + + # Should still return a strategy, but with empty revisions + assert result == mock_delete_strategy + mock_cache_info.delete_revisions.assert_called_once_with() + + @patch("tfbpapi.hf_cache_manager.scan_cache_dir") + def test_clean_cache_by_size(self, mock_scan_cache_dir): + """Test size-based cache cleaning.""" + # Setup mock cache info + mock_cache_info = Mock() + mock_cache_info.size_on_disk = 5 * 1024**3 # 5GB + mock_cache_info.size_on_disk_str = "5.0GB" + + mock_revision = Mock() + mock_revision.commit_hash = "abc123" + mock_revision.last_modified = datetime.now().timestamp() + mock_revision.size_on_disk = 2 * 1024**3 # 2GB + + mock_repo = Mock() + mock_repo.revisions = [mock_revision] + + mock_cache_info.repos = [mock_repo] + mock_delete_strategy = Mock() + mock_delete_strategy.expected_freed_size_str = "2GB" + mock_cache_info.delete_revisions.return_value = mock_delete_strategy + + mock_scan_cache_dir.return_value = mock_cache_info + + result = self.cache_manager.clean_cache_by_size( + target_size="3GB", strategy="oldest_first", dry_run=True + ) + + assert result == mock_delete_strategy + mock_cache_info.delete_revisions.assert_called_once() + + @patch("tfbpapi.hf_cache_manager.scan_cache_dir") + def test_clean_cache_by_size_already_under_target(self, mock_scan_cache_dir): + """Test size-based cleaning when already under target.""" + mock_cache_info = Mock() + mock_cache_info.size_on_disk = 1 * 1024**3 # 1GB + mock_cache_info.size_on_disk_str = "1.0GB" + mock_cache_info.repos = [] + + mock_delete_strategy = Mock() + mock_delete_strategy.expected_freed_size_str = "0B" + mock_cache_info.delete_revisions.return_value = mock_delete_strategy + + mock_scan_cache_dir.return_value = mock_cache_info + + result = self.cache_manager.clean_cache_by_size( + target_size="2GB", strategy="oldest_first", dry_run=True + ) + + assert result == mock_delete_strategy + + @patch("tfbpapi.hf_cache_manager.scan_cache_dir") + def test_clean_unused_revisions(self, mock_scan_cache_dir): + """Test cleaning unused revisions.""" + # Setup mock with multiple revisions + mock_cache_info = Mock() + + mock_revision1 = Mock() + mock_revision1.commit_hash = "abc123" + mock_revision1.last_modified = (datetime.now() - timedelta(days=1)).timestamp() + + mock_revision2 = Mock() + mock_revision2.commit_hash = "def456" + mock_revision2.last_modified = (datetime.now() - timedelta(days=10)).timestamp() + + mock_revision3 = Mock() + mock_revision3.commit_hash = "ghi789" + 
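+        # Revisions are 1, 10, and 20 days old; with keep_latest=2 only the
+        # oldest ("ghi789", dated just below) should be scheduled for deletion.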
mock_revision3.last_modified = (datetime.now() - timedelta(days=20)).timestamp() + + mock_repo = Mock() + mock_repo.revisions = [mock_revision1, mock_revision2, mock_revision3] + + mock_cache_info.repos = [mock_repo] + mock_delete_strategy = Mock() + mock_delete_strategy.expected_freed_size_str = "1GB" + mock_cache_info.delete_revisions.return_value = mock_delete_strategy + + mock_scan_cache_dir.return_value = mock_cache_info + + result = self.cache_manager.clean_unused_revisions(keep_latest=2, dry_run=True) + + assert result == mock_delete_strategy + # Should delete oldest revision (ghi789) + mock_cache_info.delete_revisions.assert_called_once_with("ghi789") + + @patch("tfbpapi.hf_cache_manager.scan_cache_dir") + def test_auto_clean_cache(self, mock_scan_cache_dir): + """Test automated cache cleaning.""" + mock_cache_info = Mock() + mock_cache_info.size_on_disk = 10 * 1024**3 # 10GB + mock_cache_info.repos = [] + + mock_delete_strategy = Mock() + mock_delete_strategy.expected_freed_size = 1 * 1024**3 # 1GB + mock_delete_strategy.expected_freed_size_str = "1GB" + + mock_scan_cache_dir.return_value = mock_cache_info + + with patch.object( + self.cache_manager, "clean_cache_by_age", return_value=mock_delete_strategy + ): + with patch.object( + self.cache_manager, + "clean_unused_revisions", + return_value=mock_delete_strategy, + ): + with patch.object( + self.cache_manager, + "clean_cache_by_size", + return_value=mock_delete_strategy, + ): + result = self.cache_manager.auto_clean_cache( + max_age_days=30, + max_total_size="5GB", + keep_latest_per_repo=2, + dry_run=True, + ) + + assert ( + len(result) == 3 + ) # All three cleanup strategies should be executed + assert all(strategy == mock_delete_strategy for strategy in result) + + +class TestHfCacheManagerErrorHandling: + """Test error handling and edge cases.""" + + def setup_method(self): + """Set up test fixtures.""" + with patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None): + self.conn = duckdb.connect(":memory:") + self.repo_id = "test/repo" + self.cache_manager = HfCacheManager(self.repo_id, self.conn) + + def test_parse_size_string_invalid_input(self): + """Test error handling for invalid size strings.""" + with pytest.raises(ValueError): + self.cache_manager._parse_size_string("invalid") + + @patch("tfbpapi.hf_cache_manager.scan_cache_dir") + def test_clean_cache_invalid_strategy(self, mock_scan_cache_dir): + """Test error handling for invalid cleanup strategy.""" + mock_cache_info = Mock() + mock_cache_info.size_on_disk = 5 * 1024**3 + mock_cache_info.repos = [] + mock_scan_cache_dir.return_value = mock_cache_info + + with pytest.raises(ValueError, match="Unknown strategy"): + self.cache_manager.clean_cache_by_size( + target_size="1GB", + strategy="invalid_strategy", # type: ignore[arg-type] + dry_run=True, + ) + + +class TestHfCacheManagerIntegration: + """Integration tests with real DuckDB operations.""" + + def setup_method(self): + """Set up test fixtures.""" + with patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None): + self.conn = duckdb.connect(":memory:") + self.repo_id = "test/repo" + self.cache_manager = HfCacheManager(self.repo_id, self.conn) + + def test_metadata_workflow_integration(self, tmpdir): + """Test complete metadata workflow with real files.""" + # Create temporary parquet file content + metadata_file = tmpdir.join("metadata.parquet") + metadata_file.write("dummy_parquet_content") + + # Test the core table creation functionality + mock_conn = Mock() + test_cache_manager = 
HfCacheManager("test/repo", mock_conn) + + # Mock the validation method since we're testing table creation + test_cache_manager._validate_source_sample_fields = Mock() # type: ignore + + # Test _create_duckdb_table_from_files directly + test_cache_manager._create_duckdb_table_from_files( + [str(metadata_file)], "metadata_test_metadata", "test_metadata" + ) + + # Verify the SQL was generated correctly + mock_conn.execute.assert_called_once() + sql_call = mock_conn.execute.call_args[0][0] + assert "CREATE OR REPLACE VIEW metadata_test_metadata" in sql_call + assert str(metadata_file) in sql_call + + def test_embedded_metadata_workflow_integration(self): + """Test complete embedded metadata workflow with real DuckDB operations.""" + # Create real test data in DuckDB + self.conn.execute( + """ + CREATE TABLE test_data AS + SELECT + 'gene_' || (row_number() OVER()) as gene_id, + CASE + WHEN (row_number() OVER()) % 3 = 0 THEN 'treatment_A' + WHEN (row_number() OVER()) % 3 = 1 THEN 'treatment_B' + ELSE 'control' + END as experimental_condition, + random() * 1000 as expression_value + FROM range(30) + """ + ) + + # Extract embedded metadata + result = self.cache_manager._extract_embedded_metadata_field( + "test_data", "experimental_condition", "metadata_test_condition" + ) + + assert result is True + + # Verify the metadata table was created correctly + metadata_results = self.conn.execute( + "SELECT value, count FROM metadata_test_condition ORDER BY count DESC" + ).fetchall() + + assert len(metadata_results) == 3 # Three unique conditions + + # Check that the counts make sense (should be 10 each for 30 total rows) + total_count = sum(row[1] for row in metadata_results) + assert total_count == 30 + + # Check that conditions are as expected + conditions = {row[0] for row in metadata_results} + assert conditions == {"treatment_A", "treatment_B", "control"} + + def test_table_existence_checking_integration(self): + """Test table existence checking with real DuckDB operations.""" + # Test non-existent table + assert ( + self.cache_manager._check_metadata_exists_in_duckdb("nonexistent_table") + is False + ) + + # Create a real table + self.conn.execute("CREATE TABLE test_table (id INTEGER, name TEXT)") + + # Test existing table + assert self.cache_manager._check_metadata_exists_in_duckdb("test_table") is True + + # Test with view + self.conn.execute("CREATE VIEW test_view AS SELECT * FROM test_table") + assert self.cache_manager._check_metadata_exists_in_duckdb("test_view") is True + + +# Fixtures for common test data +@pytest.fixture +def sample_metadata_config(): + """Sample metadata configuration for testing.""" + return Mock( + config_name="test_metadata", + description="Test metadata configuration", + data_files=[Mock(path="metadata.parquet")], + applies_to=["data_config"], + ) + + +@pytest.fixture +def sample_data_config(): + """Sample data configuration for testing.""" + return Mock( + config_name="test_data", + metadata_fields=["condition", "replicate"], + dataset_type=DatasetType.ANNOTATED_FEATURES, + ) + + +@pytest.fixture +def mock_cache_revision(): + """Mock cache revision for testing.""" + revision = Mock() + revision.commit_hash = "abc123def456" + revision.last_modified = datetime.now().timestamp() + revision.size_on_disk = 1024 * 1024 * 100 # 100MB + return revision + + +@pytest.fixture +def mock_cache_repo(mock_cache_revision): + """Mock cache repository for testing.""" + repo = Mock() + repo.repo_id = "test/repository" + repo.revisions = [mock_cache_revision] + repo.size_on_disk = 1024 
* 1024 * 100 # 100MB + repo.size_on_disk_str = "100.0MB" + return repo + + +@pytest.fixture +def mock_cache_info(mock_cache_repo): + """Mock cache info for testing.""" + cache_info = Mock() + cache_info.cache_dir = "/tmp/cache" + cache_info.repos = [mock_cache_repo] + cache_info.size_on_disk = 1024 * 1024 * 100 # 100MB + cache_info.size_on_disk_str = "100.0MB" + + # Mock delete_revisions method + def mock_delete_revisions(*revision_hashes): + strategy = Mock() + strategy.expected_freed_size = ( + len(revision_hashes) * 1024 * 1024 * 50 + ) # 50MB per revision + strategy.expected_freed_size_str = f"{len(revision_hashes) * 50}.0MB" + strategy.delete_content = list(revision_hashes) + strategy.execute = Mock() + return strategy + + cache_info.delete_revisions = mock_delete_revisions + return cache_info + + +class TestSourceSampleValidation: + """Test validation of source_sample field format.""" + + def setup_method(self): + """Set up test fixtures.""" + self.conn = duckdb.connect(":memory:") + self.repo_id = "test/repo" + + def test_valid_source_sample_format(self, tmpdir): + """Test that valid source_sample format passes validation.""" + # Create parquet file with valid composite identifiers + parquet_file = tmpdir.join("valid_data.parquet") + self.conn.execute( + f""" + COPY ( + SELECT + 'BrentLab/harbison_2004;harbison_2004;CBF1_YPD' + as binding_sample_ref, + 'gene_' || (row_number() OVER()) as target_locus_tag, + random() * 100 as binding_score + FROM range(5) + ) TO '{parquet_file}' (FORMAT PARQUET) + """ + ) + + # Create mock datacard with source_sample field + mock_feature = Mock() + mock_feature.name = "binding_sample_ref" + mock_feature.role = "source_sample" + + mock_dataset_info = Mock() + mock_dataset_info.features = [mock_feature] + + mock_config = Mock() + mock_config.config_name = "test_config" + mock_config.dataset_info = mock_dataset_info + + with patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None): + cache_manager = HfCacheManager(self.repo_id, self.conn) + cache_manager.get_config = Mock(return_value=mock_config) # type: ignore + + # Should not raise any error + cache_manager._create_duckdb_table_from_files( + [str(parquet_file)], "test_table", "test_config" + ) + + def test_invalid_source_sample_two_parts(self, tmpdir): + """Test that source_sample with only 2 parts raises ValueError.""" + # Create parquet file with invalid format (only 2 parts) + parquet_file = tmpdir.join("invalid_data.parquet") + self.conn.execute( + f""" + COPY ( + SELECT + 'BrentLab/harbison_2004;CBF1_YPD' as binding_sample_ref, + 'gene_' || (row_number() OVER()) as target_locus_tag, + random() * 100 as binding_score + FROM range(5) + ) TO '{parquet_file}' (FORMAT PARQUET) + """ + ) + + # Create mock datacard with source_sample field + mock_feature = Mock() + mock_feature.name = "binding_sample_ref" + mock_feature.role = "source_sample" + + mock_dataset_info = Mock() + mock_dataset_info.features = [mock_feature] + + mock_config = Mock() + mock_config.config_name = "test_config" + mock_config.dataset_info = mock_dataset_info + + with patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None): + cache_manager = HfCacheManager(self.repo_id, self.conn) + cache_manager.get_config = Mock(return_value=mock_config) # type: ignore + + # Should raise ValueError with clear message + with pytest.raises(ValueError) as exc_info: + cache_manager._create_duckdb_table_from_files( + [str(parquet_file)], "test_table", "test_config" + ) + + error_msg = str(exc_info.value) + assert "Invalid 
format in field 'binding_sample_ref'" in error_msg + assert "role='source_sample'" in error_msg + assert "3 semicolon-separated parts" in error_msg + assert "BrentLab/harbison_2004;CBF1_YPD" in error_msg + + def test_invalid_source_sample_one_part(self, tmpdir): + """Test that source_sample with only 1 part raises ValueError.""" + # Create parquet file with invalid format (only 1 part) + parquet_file = tmpdir.join("invalid_data.parquet") + self.conn.execute( + f""" + COPY ( + SELECT + 'CBF1_YPD' as binding_sample_ref, + 'gene_' || (row_number() OVER()) as target_locus_tag, + random() * 100 as binding_score + FROM range(5) + ) TO '{parquet_file}' (FORMAT PARQUET) + """ + ) + + # Create mock datacard with source_sample field + mock_feature = Mock() + mock_feature.name = "binding_sample_ref" + mock_feature.role = "source_sample" + + mock_dataset_info = Mock() + mock_dataset_info.features = [mock_feature] + + mock_config = Mock() + mock_config.config_name = "test_config" + mock_config.dataset_info = mock_dataset_info + + with patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None): + cache_manager = HfCacheManager(self.repo_id, self.conn) + cache_manager.get_config = Mock(return_value=mock_config) # type: ignore + + # Should raise ValueError + with pytest.raises(ValueError) as exc_info: + cache_manager._create_duckdb_table_from_files( + [str(parquet_file)], "test_table", "test_config" + ) + + error_msg = str(exc_info.value) + assert "Invalid format in field 'binding_sample_ref'" in error_msg + assert "CBF1_YPD" in error_msg + + def test_invalid_source_sample_four_parts(self, tmpdir): + """Test that source_sample with 4 parts raises ValueError.""" + # Create parquet file with invalid format (4 parts) + parquet_file = tmpdir.join("invalid_data.parquet") + self.conn.execute( + f""" + COPY ( + SELECT + 'a;b;c;d' as binding_sample_ref, + 'gene_' || (row_number() OVER()) as target_locus_tag, + random() * 100 as binding_score + FROM range(5) + ) TO '{parquet_file}' (FORMAT PARQUET) + """ + ) + + # Create mock datacard with source_sample field + mock_feature = Mock() + mock_feature.name = "binding_sample_ref" + mock_feature.role = "source_sample" + + mock_dataset_info = Mock() + mock_dataset_info.features = [mock_feature] + + mock_config = Mock() + mock_config.config_name = "test_config" + mock_config.dataset_info = mock_dataset_info + + with patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None): + cache_manager = HfCacheManager(self.repo_id, self.conn) + cache_manager.get_config = Mock(return_value=mock_config) # type: ignore + + # Should raise ValueError + with pytest.raises(ValueError) as exc_info: + cache_manager._create_duckdb_table_from_files( + [str(parquet_file)], "test_table", "test_config" + ) + + error_msg = str(exc_info.value) + assert "Invalid format in field 'binding_sample_ref'" in error_msg + assert "a;b;c;d" in error_msg + + def test_no_source_sample_fields(self, tmpdir): + """Test that validation is skipped when no source_sample fields exist.""" + # Create parquet file with normal data + parquet_file = tmpdir.join("normal_data.parquet") + self.conn.execute( + f""" + COPY ( + SELECT + 'gene_' || (row_number() OVER()) as target_locus_tag, + random() * 100 as expression_value + FROM range(5) + ) TO '{parquet_file}' (FORMAT PARQUET) + """ + ) + + # Create mock datacard without source_sample fields + mock_feature = Mock() + mock_feature.name = "target_locus_tag" + mock_feature.role = "target_identifier" + + mock_dataset_info = Mock() + 
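+        # Only one semicolon-separated part; the valid examples in this suite
+        # use three (repo id, dataset name, sample id), so "CBF1_YPD" alone
+        # should be rejected.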
mock_dataset_info.features = [mock_feature] + + mock_config = Mock() + mock_config.config_name = "test_config" + mock_config.dataset_info = mock_dataset_info + + with patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None): + cache_manager = HfCacheManager(self.repo_id, self.conn) + cache_manager.get_config = Mock(return_value=mock_config) # type: ignore + + # Should not raise any error + cache_manager._create_duckdb_table_from_files( + [str(parquet_file)], "test_table", "test_config" + ) + + def test_multiple_source_sample_fields(self, tmpdir): + """Test validation with multiple source_sample fields.""" + # Create parquet file with multiple composite identifier fields + parquet_file = tmpdir.join("multi_ref_data.parquet") + self.conn.execute( + f""" + COPY ( + SELECT + 'BrentLab/harbison_2004;harbison_2004;CBF1_YPD' + as binding_sample_ref, + 'BrentLab/kemmeren_2014;kemmeren_2014;sample_42' + as expression_sample_ref, + 'gene_' || (row_number() OVER()) as target_locus_tag + FROM range(5) + ) TO '{parquet_file}' (FORMAT PARQUET) + """ + ) + + # Create mock datacard with multiple source_sample fields + mock_feature1 = Mock() + mock_feature1.name = "binding_sample_ref" + mock_feature1.role = "source_sample" + + mock_feature2 = Mock() + mock_feature2.name = "expression_sample_ref" + mock_feature2.role = "source_sample" + + mock_dataset_info = Mock() + mock_dataset_info.features = [mock_feature1, mock_feature2] + + mock_config = Mock() + mock_config.config_name = "test_config" + mock_config.dataset_info = mock_dataset_info + + with patch("tfbpapi.hf_cache_manager.DataCard.__init__", return_value=None): + cache_manager = HfCacheManager(self.repo_id, self.conn) + cache_manager.get_config = Mock(return_value=mock_config) # type: ignore + + # Both fields are valid - should not raise + cache_manager._create_duckdb_table_from_files( + [str(parquet_file)], "test_table", "test_config" + ) diff --git a/tfbpapi/tests/test_metadata_config_models.py b/tfbpapi/tests/test_metadata_config_models.py new file mode 100644 index 0000000..1697930 --- /dev/null +++ b/tfbpapi/tests/test_metadata_config_models.py @@ -0,0 +1,514 @@ +""" +Tests for metadata configuration Pydantic models. + +Tests validation, error messages, and config loading for MetadataBuilder. 
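+
+An illustrative (non-normative) sketch of the YAML shape these tests
+exercise; each name below appears in a test case in this module:
+
+    factor_aliases:
+      carbon_source:
+        glucose: ["D-glucose", "dextrose"]
+    repositories:
+      BrentLab/test:
+        temperature: {path: temperature_celsius}  # repo-wide mapping
+        dataset:
+          test_dataset:
+            carbon_source: {field: condition, path: media.carbon_source}
+            dto_fdr: {expression: "dto_fdr < 0.05"}  # derived field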
+ +""" + +import pytest +import yaml # type: ignore +from pydantic import ValidationError + +from tfbpapi.models import ( + MetadataConfig, + PropertyMapping, + RepositoryConfig, +) + + +class TestPropertyMapping: + """Tests for PropertyMapping model.""" + + def test_valid_field_level_mapping(self): + """Test valid field-level property mapping.""" + mapping = PropertyMapping(field="condition", path="media.carbon_source") + assert mapping.field == "condition" + assert mapping.path == "media.carbon_source" + + def test_valid_repo_level_mapping(self): + """Test valid repo-level property mapping (no field).""" + mapping = PropertyMapping(path="temperature_celsius") + assert mapping.field is None + assert mapping.path == "temperature_celsius" + + def test_invalid_empty_path(self): + """Test that empty path is rejected.""" + with pytest.raises(ValidationError) as exc_info: + PropertyMapping(path="") + assert "path cannot be empty" in str(exc_info.value) + + def test_invalid_whitespace_path(self): + """Test that whitespace-only path is rejected.""" + with pytest.raises(ValidationError) as exc_info: + PropertyMapping(path=" ") + assert "path cannot be empty" in str(exc_info.value) + + def test_invalid_empty_field(self): + """Test that empty field string is rejected.""" + with pytest.raises(ValidationError) as exc_info: + PropertyMapping(field="", path="media.carbon_source") + assert "field cannot be empty" in str(exc_info.value) + + def test_path_whitespace_stripped(self): + """Test that path whitespace is stripped.""" + mapping = PropertyMapping(path=" media.carbon_source ") + assert mapping.path == "media.carbon_source" + + def test_valid_field_only_mapping(self): + """Test valid field-only mapping (column alias).""" + mapping = PropertyMapping(field="condition") + assert mapping.field == "condition" + assert mapping.path is None + + def test_invalid_neither_field_nor_path(self): + """Test that at least one of field, path, or expression is required.""" + with pytest.raises(ValidationError) as exc_info: + PropertyMapping() + assert ( + "At least one of 'field', 'path', or 'expression' must be specified" + in str(exc_info.value) + ) + + def test_valid_expression_only(self): + """Test valid expression-only mapping (derived field).""" + mapping = PropertyMapping(expression="dto_fdr < 0.05") + assert mapping.expression == "dto_fdr < 0.05" + assert mapping.field is None + assert mapping.path is None + + def test_invalid_expression_with_field(self): + """Test that expression cannot be combined with field.""" + with pytest.raises(ValidationError) as exc_info: + PropertyMapping(expression="dto_fdr < 0.05", field="sample_id") + assert "expression cannot be used with field or path" in str(exc_info.value) + + def test_invalid_expression_with_path(self): + """Test that expression cannot be combined with path.""" + with pytest.raises(ValidationError) as exc_info: + PropertyMapping(expression="dto_fdr < 0.05", path="media.carbon_source") + assert "expression cannot be used with field or path" in str(exc_info.value) + + +class TestComparativeAnalysis: + """Tests for ComparativeAnalysis model.""" + + def test_valid_comparative_analysis(self): + """Test valid comparative analysis configuration.""" + from tfbpapi.models import ComparativeAnalysis + + ca = ComparativeAnalysis( + repo="BrentLab/yeast_comparative_analysis", + dataset="dto", + via_field="binding_id", + ) + assert ca.repo == "BrentLab/yeast_comparative_analysis" + assert ca.dataset == "dto" + assert ca.via_field == "binding_id" + + +class 
TestDatasetVirtualDBConfig: + """Tests for DatasetVirtualDBConfig model.""" + + def test_valid_config_with_sample_id(self): + """Test valid dataset config with sample_id.""" + from tfbpapi.models import DatasetVirtualDBConfig, PropertyMapping + + config = DatasetVirtualDBConfig(sample_id=PropertyMapping(field="sample_id")) + assert config.sample_id is not None + assert config.sample_id.field == "sample_id" + + def test_valid_config_with_comparative_analyses(self): + """Test valid dataset config with comparative analyses.""" + from tfbpapi.models import DatasetVirtualDBConfig + + config_dict = { + "sample_id": {"field": "sample_id"}, + "comparative_analyses": [ + { + "repo": "BrentLab/yeast_comparative_analysis", + "dataset": "dto", + "via_field": "binding_id", + } + ], + } + config = DatasetVirtualDBConfig.model_validate(config_dict) + assert config.sample_id is not None + assert len(config.comparative_analyses) == 1 + assert ( + config.comparative_analyses[0].repo == "BrentLab/yeast_comparative_analysis" + ) + + def test_config_with_extra_property_mappings(self): + """Test that extra fields are parsed as PropertyMappings.""" + from tfbpapi.models import DatasetVirtualDBConfig + + config_dict = { + "sample_id": {"field": "sample_id"}, + "regulator_locus_tag": {"field": "regulator_locus_tag"}, + "dto_fdr": {"expression": "dto_fdr < 0.05"}, + } + config = DatasetVirtualDBConfig.model_validate(config_dict) + + # Access extra fields via model_extra + assert "regulator_locus_tag" in config.model_extra + assert "dto_fdr" in config.model_extra + + +class TestRepositoryConfig: + """Tests for RepositoryConfig model.""" + + def test_valid_repo_config_with_datasets(self): + """Test valid repository config with dataset section.""" + config_data = { + "temperature_celsius": {"path": "temperature_celsius"}, + "dataset": { + "dataset1": { + "carbon_source": { + "field": "condition", + "path": "media.carbon_source", + } + } + }, + } + config = RepositoryConfig.model_validate(config_data) + assert config.dataset is not None + assert "dataset1" in config.dataset + + def test_valid_repo_config_no_datasets(self): + """Test valid repository config without dataset section.""" + config_data = {"temperature_celsius": {"path": "temperature_celsius"}} + config = RepositoryConfig.model_validate(config_data) + assert config.dataset is None + + def test_invalid_dataset_not_dict(self): + """Test that dataset section must be a dict.""" + config_data = {"dataset": "not a dict"} + with pytest.raises(ValidationError) as exc_info: + RepositoryConfig.model_validate(config_data) + assert "'dataset' key must contain a dict" in str(exc_info.value) + + def test_valid_field_only_property(self): + """Test that field-only properties are valid (column aliases).""" + config_data = { + "dataset": {"dataset1": {"carbon_source": {"field": "condition"}}} + } + config = RepositoryConfig.model_validate(config_data) + assert config.dataset is not None + assert "dataset1" in config.dataset + # Access extra field via model_extra + dataset_config = config.dataset["dataset1"] + assert "carbon_source" in dataset_config.model_extra + assert dataset_config.model_extra["carbon_source"].field == "condition" + assert dataset_config.model_extra["carbon_source"].path is None + + def test_valid_repo_wide_field_only_property(self): + """Test that repo-wide field-only properties are valid.""" + config_data = {"environmental_condition": {"field": "condition"}} + config = RepositoryConfig.model_validate(config_data) + assert "environmental_condition" in 
config.properties + assert config.properties["environmental_condition"].field == "condition" + assert config.properties["environmental_condition"].path is None + + +class TestMetadataConfig: + """Tests for MetadataConfig model.""" + + def test_valid_config_with_aliases(self, tmp_path): + """Test valid config with factor aliases.""" + config_data = { + "factor_aliases": { + "carbon_source": { + "glucose": ["D-glucose", "dextrose"], + "galactose": ["D-galactose", "Galactose"], + } + }, + "repositories": { + "BrentLab/test": { + "dataset": { + "test": {"carbon_source": {"path": "media.carbon_source"}} + } + } + }, + } + + config_path = tmp_path / "config.yaml" + with open(config_path, "w") as f: + yaml.dump(config_data, f) + + config = MetadataConfig.from_yaml(config_path) + assert "carbon_source" in config.factor_aliases + assert "glucose" in config.factor_aliases["carbon_source"] + assert config.factor_aliases["carbon_source"]["glucose"] == [ + "D-glucose", + "dextrose", + ] + + def test_valid_config_without_aliases(self, tmp_path): + """Test that factor_aliases is optional.""" + config_data = { + "repositories": { + "BrentLab/test": { + "dataset": { + "test": {"carbon_source": {"path": "media.carbon_source"}} + } + } + } + } + + config_path = tmp_path / "config.yaml" + with open(config_path, "w") as f: + yaml.dump(config_data, f) + + config = MetadataConfig.from_yaml(config_path) + assert config.factor_aliases == {} + + def test_valid_config_empty_aliases(self, tmp_path): + """Test that empty factor_aliases dict is allowed.""" + config_data = { + "factor_aliases": {}, + "repositories": { + "BrentLab/test": { + "dataset": { + "test": {"carbon_source": {"path": "media.carbon_source"}} + } + } + }, + } + + config_path = tmp_path / "config.yaml" + with open(config_path, "w") as f: + yaml.dump(config_data, f) + + config = MetadataConfig.from_yaml(config_path) + assert config.factor_aliases == {} + + def test_invalid_alias_not_dict(self): + """Test that property aliases must be a dict.""" + config_data = { + "factor_aliases": { + "carbon_source": ["D-glucose"] # Should be dict, not list + }, + "repositories": { + "BrentLab/test": {"dataset": {"test": {"prop": {"path": "path"}}}} + }, + } + + with pytest.raises(ValidationError) as exc_info: + MetadataConfig.model_validate(config_data) + # Pydantic catches this with type validation before our custom validator + assert "valid dictionary" in str(exc_info.value) or "must be a dict" in str( + exc_info.value + ) + + def test_invalid_alias_value_not_list(self): + """Test that alias values must be lists.""" + config_data = { + "factor_aliases": { + "carbon_source": {"glucose": "D-glucose"} # Should be list, not string + }, + "repositories": { + "BrentLab/test": {"dataset": {"test": {"prop": {"path": "path"}}}} + }, + } + + with pytest.raises(ValidationError) as exc_info: + MetadataConfig.model_validate(config_data) + # Pydantic catches this with type validation before our custom validator + assert "valid list" in str(exc_info.value) or "must map to a list" in str( + exc_info.value + ) + + def test_invalid_alias_empty_list(self): + """Test that alias value lists cannot be empty.""" + config_data = { + "factor_aliases": {"carbon_source": {"glucose": []}}, + "repositories": { + "BrentLab/test": {"dataset": {"test": {"prop": {"path": "path"}}}} + }, + } + + with pytest.raises(ValidationError) as exc_info: + MetadataConfig.model_validate(config_data) + assert "cannot have empty value list" in str(exc_info.value) + + def 
test_aliases_allow_numeric_values(self): + """Test that aliases can map to numeric values.""" + config_data = { + "factor_aliases": { + "temperature_celsius": { + "thirty": [30, "30"], # Integer and string + "thirty_seven": [37, 37.0], # Integer and float + } + }, + "repositories": { + "BrentLab/test": { + "dataset": { + "test": {"temperature": {"path": "temperature_celsius"}} + } + } + }, + } + + config = MetadataConfig.model_validate(config_data) + assert config.factor_aliases["temperature_celsius"]["thirty"] == [30, "30"] + assert config.factor_aliases["temperature_celsius"]["thirty_seven"] == [ + 37, + 37.0, + ] + + def test_invalid_no_repositories(self): + """Test that at least one repository is required.""" + config_data = {"factor_aliases": {"carbon_source": {"glucose": ["D-glucose"]}}} + with pytest.raises(ValidationError) as exc_info: + MetadataConfig.model_validate(config_data) + assert "at least one repository" in str(exc_info.value) + + def test_get_repository_config(self, tmp_path): + """Test get_repository_config method.""" + config_data = { + "factor_aliases": {"carbon_source": {"glucose": ["D-glucose"]}}, + "repositories": { + "BrentLab/harbison_2004": { + "dataset": { + "harbison_2004": { + "carbon_source": { + "field": "condition", + "path": "media.carbon_source", + } + } + } + } + }, + } + + config_path = tmp_path / "config.yaml" + with open(config_path, "w") as f: + yaml.dump(config_data, f) + + config = MetadataConfig.from_yaml(config_path) + repo_config = config.get_repository_config("BrentLab/harbison_2004") + assert repo_config is not None + assert isinstance(repo_config, RepositoryConfig) + assert repo_config.dataset is not None + assert "harbison_2004" in repo_config.dataset + + # Non-existent repo + assert config.get_repository_config("BrentLab/nonexistent") is None + + def test_get_property_mappings(self, tmp_path): + """Test get_property_mappings method.""" + config_data = { + "factor_aliases": { + "carbon_source": {"glucose": ["D-glucose"]}, + "temperature": {"thirty": [30]}, + }, + "repositories": { + "BrentLab/kemmeren_2014": { + "temperature": {"path": "temperature_celsius"}, # Repo-wide + "dataset": { + "kemmeren_2014": { + "carbon_source": {"path": "media.carbon_source"} + } + }, + } + }, + } + + config_path = tmp_path / "config.yaml" + with open(config_path, "w") as f: + yaml.dump(config_data, f) + + config = MetadataConfig.from_yaml(config_path) + mappings = config.get_property_mappings( + "BrentLab/kemmeren_2014", "kemmeren_2014" + ) + + # Should have both repo-wide and dataset-specific + assert "temperature" in mappings + assert "carbon_source" in mappings + # Mappings are PropertyMapping objects, not dicts + assert isinstance(mappings["temperature"], PropertyMapping) + assert mappings["temperature"].path == "temperature_celsius" + assert mappings["carbon_source"].path == "media.carbon_source" + + def test_dataset_specific_overrides_repo_wide(self, tmp_path): + """Test that dataset-specific mappings override repo-wide.""" + config_data = { + "repositories": { + "BrentLab/test": { + "carbon_source": {"path": "repo.level.path"}, # Repo-wide + "dataset": { + "test_dataset": { + "carbon_source": {"path": "dataset.level.path"} # Override + } + }, + } + }, + } + + config_path = tmp_path / "config.yaml" + with open(config_path, "w") as f: + yaml.dump(config_data, f) + + config = MetadataConfig.from_yaml(config_path) + mappings = config.get_property_mappings("BrentLab/test", "test_dataset") + + # Dataset-specific should win + assert 
mappings["carbon_source"].path == "dataset.level.path" + + def test_file_not_found(self): + """Test that FileNotFoundError is raised for missing file.""" + with pytest.raises(FileNotFoundError): + MetadataConfig.from_yaml("/nonexistent/path/config.yaml") + + def test_invalid_yaml_structure(self, tmp_path): + """Test that non-dict YAML is rejected.""" + config_path = tmp_path / "config.yaml" + with open(config_path, "w") as f: + f.write("- not\\n- a\\n- dict\\n") + + with pytest.raises(ValueError) as exc_info: + MetadataConfig.from_yaml(config_path) + assert "Configuration must be a YAML dict" in str(exc_info.value) + + def test_nested_alias_property_names(self, tmp_path): + """Test that alias property names can use dot notation.""" + config_data = { + "factor_aliases": { + "carbon_source": {"glucose": ["D-glucose"]}, + "carbon_source.concentration_percent": {"two_percent": [2]}, + "carbon_source.specifications": {"no_aa": ["without_amino_acids"]}, + }, + "repositories": { + "BrentLab/test": { + "dataset": { + "test": { + "carbon_source": { + "field": "condition", + "path": "media.carbon_source", + } + } + } + } + }, + } + + config_path = tmp_path / "config.yaml" + with open(config_path, "w") as f: + yaml.dump(config_data, f) + + config = MetadataConfig.from_yaml(config_path) + + # All alias properties should be preserved + assert "carbon_source" in config.factor_aliases + assert "carbon_source.concentration_percent" in config.factor_aliases + assert "carbon_source.specifications" in config.factor_aliases + + # Values should be correct + assert config.factor_aliases["carbon_source"]["glucose"] == ["D-glucose"] + assert config.factor_aliases["carbon_source.concentration_percent"][ + "two_percent" + ] == [2] + assert config.factor_aliases["carbon_source.specifications"]["no_aa"] == [ + "without_amino_acids" + ] diff --git a/tfbpapi/tests/test_metric_arrays.py b/tfbpapi/tests/test_metric_arrays.py deleted file mode 100644 index 45a8203..0000000 --- a/tfbpapi/tests/test_metric_arrays.py +++ /dev/null @@ -1,194 +0,0 @@ -import logging - -import numpy as np -import pandas as pd -import pytest - -from tfbpapi.metric_arrays import metric_arrays - - -def test_metric_arrays_expected_result(caplog): - res_dict = { - "metadata": pd.DataFrame( - { - "id": ["A", "B"], - "regulator_symbol": ["tf1", "tf2"], - } - ), - "data": { - "A": pd.DataFrame( - { - "target_symbol": ["gene1", "gene2"], - "metric1": [1.0, 2.0], - } - ), - "B": pd.DataFrame( - { - "target_symbol": ["gene2", "gene1"], - "metric1": [3.0, 4.0], - } - ), - }, - } - metrics_dict = {"metric1": np.mean} - - # Run function - with caplog.at_level(logging.WARNING): - output_dict = metric_arrays(res_dict, metrics_dict) - - # Check expected result for metric1 - # order based on the index of output_dict['metrics1'] since the ordering of - # the rows is random due to the set operation - expected_df = pd.DataFrame( - {"tf1": [1.0, 2.0], "tf2": [4.0, 3.0]}, - index=pd.Index(["gene1", "gene2"], name="target_symbol"), - ).reindex(output_dict["metric1"].index) - - pd.testing.assert_frame_equal(output_dict["metric1"], expected_df) - - # Check no warning since there are no incomplete rows or columns - assert "incomplete" not in caplog.text - - -def test_metric_arrays_missing_data(caplog): - res_dict = { - "metadata": pd.DataFrame( - { - "id": ["A", "B"], - "regulator_symbol": ["tf1", "tf2"], - } - ), - "data": { - "A": pd.DataFrame( - { - "target_symbol": ["gene1", "gene2"], - "metric1": [1.0, 2.0], - } - ), - "B": pd.DataFrame( - { - "target_symbol": 
["gene1", "gene3"], - "metric1": [5.0, 3.0], - } - ), - }, - } - metrics_dict = {"metric1": np.mean} - - # Run function with incomplete row dropping - with caplog.at_level(logging.WARNING): - output_dict1 = metric_arrays(res_dict, metrics_dict, drop_incomplete_rows=False) - - # Check result for metric1 with "gene2" dropped due to missing data in B - # sort based on output_dict['metric1'] index since - # the ordering of the rows is random - expected_df1 = pd.DataFrame( - {"tf1": [1.0, 2.0, np.nan], "tf2": [5.0, np.nan, 3.0]}, - index=pd.Index(["gene1", "gene2", "gene3"], name="target_symbol"), - ).reindex(output_dict1["metric1"].index) - - pd.testing.assert_frame_equal(output_dict1["metric1"], expected_df1) - - # Run function with incomplete row dropping - with caplog.at_level(logging.WARNING): - output_dict2 = metric_arrays(res_dict, metrics_dict, drop_incomplete_rows=True) - - # Check result for metric1 with "gene2" dropped due to missing data in B - expected_df2 = pd.DataFrame( - {"tf1": [1.0], "tf2": [5.0]}, - index=pd.Index(["gene1"], name="target_symbol"), - ).reindex(output_dict2["metric1"].index) - - pd.testing.assert_frame_equal(output_dict2["metric1"], expected_df2) - - # Check warning for incomplete rows - assert "2 rows and 0 columns with incomplete records were dropped" in caplog.text - - -def test_metric_arrays_missing_keys(): - res_dict = { - "metadata": pd.DataFrame( - {"id": ["A"], "target_symbol": ["gene1"], "regulator_symbol": ["tf1"]} - ), - # Missing data for id "A" - "data": {}, - } - metrics_dict = {"metric1": np.mean} - - # Expect a KeyError for missing data keys - with pytest.raises(KeyError, match="Data dictionary must have the same keys"): - metric_arrays(res_dict, metrics_dict) - - -def test_metric_arrays_non_dataframe_value(): - res_dict = { - "metadata": pd.DataFrame( - {"id": ["A"], "target_symbol": ["gene1"], "regulator_symbol": ["tf1"]} - ), - "data": {"A": [1, 2, 3]}, # Invalid non-DataFrame entry - } - metrics_dict = {"metric1": np.mean} - - # Expect ValueError when data dictionary values are not DataFrames - with pytest.raises( - ValueError, match="All values in the data dictionary must be DataFrames" - ): - metric_arrays(res_dict, metrics_dict) - - -def test_metric_arrays_duplicate_rows_without_dedup_func(): - res_dict = { - "metadata": pd.DataFrame( - { - "id": ["A"], - "target_symbol": ["gene1"], - "regulator_symbol": ["tf1"], - } - ), - "data": { - "A": pd.DataFrame( - { - "target_symbol": ["gene1", "gene1"], - "metric1": [1.0, 2.0], - } - ), - }, - } - metrics_dict = {"metric1": None} # No deduplication function provided - - # Expect a ValueError due to duplicate rows without deduplication function - # - with pytest.raises( - ValueError, match="Duplicate entries found for metric 'metric1'" - ): - metric_arrays(res_dict, metrics_dict) # type: ignore - - -def test_metric_arrays_deduplication_function(): - res_dict = { - "metadata": pd.DataFrame( - { - "id": ["A"], - "target_symbol": ["gene1"], - "regulator_symbol": ["tf1"], - } - ), - "data": { - "A": pd.DataFrame( - { - "target_symbol": ["gene1", "gene1"], - "metric1": [1.0, 2.0], - } - ), - }, - } - metrics_dict = {"metric1": np.mean} # Deduplication function to average duplicates - - # Run function with deduplication - output_dict = metric_arrays(res_dict, metrics_dict) - - # Check that duplicates were averaged correctly - expected_df = pd.DataFrame( - {"tf1": [1.5]}, pd.Index(["gene1"], name="target_symbol") - ) - pd.testing.assert_frame_equal(output_dict["metric1"], expected_df) diff --git 
a/tfbpapi/tests/test_models.py b/tfbpapi/tests/test_models.py new file mode 100644 index 0000000..1771c4d --- /dev/null +++ b/tfbpapi/tests/test_models.py @@ -0,0 +1,577 @@ +""" +Tests for datainfo Pydantic models. + +These tests validate the minimal, flexible models that parse HuggingFace dataset cards. + +""" + +import pytest +from pydantic import ValidationError + +from tfbpapi.models import ( + DataFileInfo, + DatasetCard, + DatasetConfig, + DatasetInfo, + DatasetType, + ExtractedMetadata, + FeatureInfo, + MetadataRelationship, + PartitioningInfo, +) + + +class TestDatasetType: + """Tests for DatasetType enum.""" + + def test_dataset_type_values(self): + """Test that all expected dataset types are defined.""" + assert DatasetType.GENOMIC_FEATURES == "genomic_features" + assert DatasetType.ANNOTATED_FEATURES == "annotated_features" + assert DatasetType.GENOME_MAP == "genome_map" + assert DatasetType.METADATA == "metadata" + assert DatasetType.COMPARATIVE == "comparative" + + def test_dataset_type_from_string(self): + """Test creating DatasetType from string.""" + dt = DatasetType("genomic_features") + assert dt == DatasetType.GENOMIC_FEATURES + + def test_invalid_dataset_type(self): + """Test that invalid dataset type raises error.""" + with pytest.raises(ValueError): + DatasetType("invalid_type") + + +class TestFeatureInfo: + """Tests for FeatureInfo model.""" + + def test_minimal_feature_info(self): + """Test creating FeatureInfo with minimal fields.""" + feature = FeatureInfo( + name="gene_id", dtype="string", description="Gene identifier" + ) + assert feature.name == "gene_id" + assert feature.dtype == "string" + assert feature.description == "Gene identifier" + assert feature.role is None + assert feature.definitions is None + + def test_feature_info_with_role(self): + """Test FeatureInfo with role field.""" + feature = FeatureInfo( + name="condition", + dtype="string", + description="Experimental condition", + role="experimental_condition", + ) + assert feature.role == "experimental_condition" + + def test_feature_info_with_definitions(self): + """Test FeatureInfo with definitions for experimental_condition.""" + feature = FeatureInfo( + name="condition", + dtype={"class_label": {"names": ["control", "treated"]}}, + description="Treatment condition", + role="experimental_condition", + definitions={ + "control": {"temperature_celsius": 30}, + "treated": {"temperature_celsius": 37}, + }, + ) + assert feature.definitions is not None + assert "control" in feature.definitions + assert feature.definitions["control"]["temperature_celsius"] == 30 + + def test_feature_info_with_dict_dtype(self): + """Test FeatureInfo with class_label dtype.""" + feature = FeatureInfo( + name="category", + dtype={"class_label": {"names": ["A", "B", "C"]}}, + description="Categorical field", + ) + assert isinstance(feature.dtype, dict) + assert "class_label" in feature.dtype + + +class TestPartitioningInfo: + """Tests for PartitioningInfo model.""" + + def test_default_partitioning_info(self): + """Test PartitioningInfo with defaults.""" + partitioning = PartitioningInfo() + assert partitioning.enabled is False + assert partitioning.partition_by is None + assert partitioning.path_template is None + + def test_enabled_partitioning_info(self): + """Test PartitioningInfo with partitioning enabled.""" + partitioning = PartitioningInfo( + enabled=True, + partition_by=["accession"], + path_template="data/accession={accession}/*.parquet", + ) + assert partitioning.enabled is True + assert partitioning.partition_by 
== ["accession"] + assert partitioning.path_template == "data/accession={accession}/*.parquet" + + +class TestDataFileInfo: + """Tests for DataFileInfo model.""" + + def test_default_data_file_info(self): + """Test DataFileInfo with default split.""" + data_file = DataFileInfo(path="data.parquet") + assert data_file.split == "train" + assert data_file.path == "data.parquet" + + def test_custom_data_file_info(self): + """Test DataFileInfo with custom split.""" + data_file = DataFileInfo(split="test", path="test_data.parquet") + assert data_file.split == "test" + assert data_file.path == "test_data.parquet" + + +class TestDatasetInfo: + """Tests for DatasetInfo model.""" + + def test_minimal_dataset_info(self): + """Test DatasetInfo with minimal features.""" + dataset_info = DatasetInfo( + features=[ + FeatureInfo( + name="gene_id", dtype="string", description="Gene identifier" + ) + ] + ) + assert len(dataset_info.features) == 1 + assert dataset_info.partitioning is None + + def test_dataset_info_with_partitioning(self): + """Test DatasetInfo with partitioning.""" + dataset_info = DatasetInfo( + features=[ + FeatureInfo(name="chr", dtype="string", description="Chromosome"), + FeatureInfo(name="pos", dtype="int32", description="Position"), + ], + partitioning=PartitioningInfo(enabled=True, partition_by=["chr"]), + ) + assert len(dataset_info.features) == 2 + assert dataset_info.partitioning.enabled is True # type: ignore + + +class TestDatasetConfig: + """Tests for DatasetConfig model.""" + + def test_minimal_dataset_config(self): + """Test DatasetConfig with minimal required fields.""" + config = DatasetConfig( + config_name="test_data", + description="Test dataset", + dataset_type=DatasetType.ANNOTATED_FEATURES, + data_files=[DataFileInfo(path="data.parquet")], + dataset_info=DatasetInfo( + features=[FeatureInfo(name="id", dtype="string", description="ID")] + ), + ) + assert config.config_name == "test_data" + assert config.dataset_type == DatasetType.ANNOTATED_FEATURES + assert config.default is False + assert config.applies_to is None + assert config.metadata_fields is None + + def test_dataset_config_with_applies_to(self): + """Test DatasetConfig with applies_to for metadata.""" + config = DatasetConfig( + config_name="metadata", + description="Metadata", + dataset_type=DatasetType.METADATA, + applies_to=["data_config_1", "data_config_2"], + data_files=[DataFileInfo(path="metadata.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo( + name="sample_id", dtype="string", description="Sample ID" + ) + ] + ), + ) + assert config.applies_to == ["data_config_1", "data_config_2"] + + def test_dataset_config_applies_to_validation_error(self): + """Test that applies_to raises error for non-metadata configs.""" + with pytest.raises(ValidationError): + DatasetConfig( + config_name="data", + description="Data", + dataset_type=DatasetType.ANNOTATED_FEATURES, + applies_to=["other_config"], + data_files=[DataFileInfo(path="data.parquet")], + dataset_info=DatasetInfo( + features=[FeatureInfo(name="id", dtype="string", description="ID")] + ), + ) + + def test_dataset_config_with_metadata_fields(self): + """Test DatasetConfig with metadata_fields.""" + config = DatasetConfig( + config_name="data", + description="Data", + dataset_type=DatasetType.ANNOTATED_FEATURES, + metadata_fields=["regulator_symbol", "condition"], + data_files=[DataFileInfo(path="data.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo( + name="regulator_symbol", dtype="string", description="TF symbol" + 
), + FeatureInfo( + name="condition", dtype="string", description="Condition" + ), + ] + ), + ) + assert config.metadata_fields == ["regulator_symbol", "condition"] + + def test_dataset_config_empty_metadata_fields_error(self): + """Test that empty metadata_fields raises error.""" + with pytest.raises(ValidationError): + DatasetConfig( + config_name="data", + description="Data", + dataset_type=DatasetType.ANNOTATED_FEATURES, + metadata_fields=[], + data_files=[DataFileInfo(path="data.parquet")], + dataset_info=DatasetInfo( + features=[FeatureInfo(name="id", dtype="string", description="ID")] + ), + ) + + def test_dataset_config_accepts_extra_fields(self): + """Test that DatasetConfig accepts extra fields like experimental_conditions.""" + config_data = { + "config_name": "data", + "description": "Data", + "dataset_type": "annotated_features", + "experimental_conditions": { + "temperature_celsius": 30, + "media": {"name": "YPD"}, + }, + "data_files": [{"path": "data.parquet"}], + "dataset_info": { + "features": [{"name": "id", "dtype": "string", "description": "ID"}] + }, + } + config = DatasetConfig(**config_data) + assert hasattr(config, "model_extra") + assert "experimental_conditions" in config.model_extra + + +class TestDatasetCard: + """Tests for DatasetCard model.""" + + def test_minimal_dataset_card(self): + """Test DatasetCard with minimal structure.""" + card = DatasetCard( + configs=[ + DatasetConfig( + config_name="data", + description="Data", + dataset_type=DatasetType.ANNOTATED_FEATURES, + data_files=[DataFileInfo(path="data.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ) + ] + ) + assert len(card.configs) == 1 + + def test_dataset_card_accepts_extra_fields(self): + """Test that DatasetCard accepts extra top-level fields.""" + card_data = { + "license": "mit", + "pretty_name": "Test Dataset", + "tags": ["biology", "genomics"], + "experimental_conditions": {"strain_background": "BY4741"}, + "configs": [ + { + "config_name": "data", + "description": "Data", + "dataset_type": "annotated_features", + "data_files": [{"path": "data.parquet"}], + "dataset_info": { + "features": [ + {"name": "id", "dtype": "string", "description": "ID"} + ] + }, + } + ], + } + card = DatasetCard(**card_data) + assert hasattr(card, "model_extra") + assert "license" in card.model_extra + assert "experimental_conditions" in card.model_extra + + def test_empty_configs_error(self): + """Test that empty configs raises error.""" + with pytest.raises(ValidationError): + DatasetCard(configs=[]) + + def test_duplicate_config_names_error(self): + """Test that duplicate config names raises error.""" + with pytest.raises(ValidationError): + DatasetCard( + configs=[ + DatasetConfig( + config_name="data", + description="Data 1", + dataset_type=DatasetType.ANNOTATED_FEATURES, + data_files=[DataFileInfo(path="data1.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + DatasetConfig( + config_name="data", + description="Data 2", + dataset_type=DatasetType.ANNOTATED_FEATURES, + data_files=[DataFileInfo(path="data2.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + ] + ) + + def test_multiple_default_configs_error(self): + """Test that multiple default configs raises error.""" + with pytest.raises(ValidationError): + DatasetCard( + configs=[ + DatasetConfig( + config_name="data1", + 
description="Data 1", + dataset_type=DatasetType.ANNOTATED_FEATURES, + default=True, + data_files=[DataFileInfo(path="data1.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + DatasetConfig( + config_name="data2", + description="Data 2", + dataset_type=DatasetType.ANNOTATED_FEATURES, + default=True, + data_files=[DataFileInfo(path="data2.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + ] + ) + + def test_get_config_by_name(self): + """Test get_config_by_name method.""" + card = DatasetCard( + configs=[ + DatasetConfig( + config_name="data1", + description="Data 1", + dataset_type=DatasetType.ANNOTATED_FEATURES, + data_files=[DataFileInfo(path="data1.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + DatasetConfig( + config_name="data2", + description="Data 2", + dataset_type=DatasetType.METADATA, + data_files=[DataFileInfo(path="data2.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + ] + ) + config = card.get_config_by_name("data1") + assert config is not None + assert config.config_name == "data1" + assert card.get_config_by_name("nonexistent") is None + + def test_get_configs_by_type(self): + """Test get_configs_by_type method.""" + card = DatasetCard( + configs=[ + DatasetConfig( + config_name="data", + description="Data", + dataset_type=DatasetType.ANNOTATED_FEATURES, + data_files=[DataFileInfo(path="data.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + DatasetConfig( + config_name="metadata", + description="Metadata", + dataset_type=DatasetType.METADATA, + data_files=[DataFileInfo(path="metadata.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + ] + ) + data_configs = card.get_configs_by_type(DatasetType.ANNOTATED_FEATURES) + assert len(data_configs) == 1 + assert data_configs[0].config_name == "data" + + def test_get_default_config(self): + """Test get_default_config method.""" + card = DatasetCard( + configs=[ + DatasetConfig( + config_name="data1", + description="Data 1", + dataset_type=DatasetType.ANNOTATED_FEATURES, + data_files=[DataFileInfo(path="data1.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + DatasetConfig( + config_name="data2", + description="Data 2", + dataset_type=DatasetType.ANNOTATED_FEATURES, + default=True, + data_files=[DataFileInfo(path="data2.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + ] + ) + default = card.get_default_config() + assert default is not None + assert default.config_name == "data2" + + def test_get_data_configs(self): + """Test get_data_configs method.""" + card = DatasetCard( + configs=[ + DatasetConfig( + config_name="data", + description="Data", + dataset_type=DatasetType.ANNOTATED_FEATURES, + data_files=[DataFileInfo(path="data.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + DatasetConfig( + config_name="metadata", + description="Metadata", + dataset_type=DatasetType.METADATA, + data_files=[DataFileInfo(path="metadata.parquet")], + 
dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + ] + ) + data_configs = card.get_data_configs() + assert len(data_configs) == 1 + assert data_configs[0].dataset_type != DatasetType.METADATA + + def test_get_metadata_configs(self): + """Test get_metadata_configs method.""" + card = DatasetCard( + configs=[ + DatasetConfig( + config_name="data", + description="Data", + dataset_type=DatasetType.ANNOTATED_FEATURES, + data_files=[DataFileInfo(path="data.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + DatasetConfig( + config_name="metadata", + description="Metadata", + dataset_type=DatasetType.METADATA, + data_files=[DataFileInfo(path="metadata.parquet")], + dataset_info=DatasetInfo( + features=[ + FeatureInfo(name="id", dtype="string", description="ID") + ] + ), + ), + ] + ) + metadata_configs = card.get_metadata_configs() + assert len(metadata_configs) == 1 + assert metadata_configs[0].dataset_type == DatasetType.METADATA + + +class TestExtractedMetadata: + """Tests for ExtractedMetadata model.""" + + def test_extracted_metadata_creation(self): + """Test creating ExtractedMetadata.""" + metadata = ExtractedMetadata( + config_name="test_config", + field_name="regulator_symbol", + values={"CBF1", "GAL4", "GCN4"}, + extraction_method="distinct", + ) + assert metadata.config_name == "test_config" + assert metadata.field_name == "regulator_symbol" + assert len(metadata.values) == 3 + assert "CBF1" in metadata.values + + +class TestMetadataRelationship: + """Tests for MetadataRelationship model.""" + + def test_metadata_relationship_creation(self): + """Test creating MetadataRelationship.""" + relationship = MetadataRelationship( + data_config="binding_data", + metadata_config="experiment_metadata", + relationship_type="explicit", + ) + assert relationship.data_config == "binding_data" + assert relationship.metadata_config == "experiment_metadata" + assert relationship.relationship_type == "explicit" diff --git a/tfbpapi/tests/test_rank_transforms.py b/tfbpapi/tests/test_rank_transforms.py deleted file mode 100644 index 31dbeaa..0000000 --- a/tfbpapi/tests/test_rank_transforms.py +++ /dev/null @@ -1,80 +0,0 @@ -import numpy as np -from scipy.stats import rankdata - -from tfbpapi.rank_transforms import ( - shifted_negative_log_ranks, - transform, -) - - -def test_shifted_negative_log_ranks_basic(): - ranks = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) - expected_log_ranks = -1 * np.log10(ranks) + np.log10(np.max(ranks)) - - actual_log_ranks = shifted_negative_log_ranks(ranks) - np.testing.assert_array_almost_equal(actual_log_ranks, expected_log_ranks) - - -def test_shifted_negative_log_ranks_with_ties(): - ranks = np.array([1.0, 2.5, 2.5, 3.0, 4.0]) - expected_log_ranks = -1 * np.log10(ranks) + np.log10(np.max(ranks)) - - actual_log_ranks = shifted_negative_log_ranks(ranks) - np.testing.assert_array_almost_equal(actual_log_ranks, expected_log_ranks) - - -def test_negative_log_transform_basic(): - pvalues = np.array([0.01, 0.05, 0.01, 0.02, 0.05]) - enrichment = np.array([5.0, 3.0, 6.0, 4.0, 4.5]) - - # Expected ranks based on pvalue (primary) with enrichment (secondary) tie-breaking - expected_ranks = np.array([2.0, 5.0, 1.0, 3.0, 4.0]) - expected_log_ranks = -1 * np.log10(expected_ranks) + np.log10( - np.max(expected_ranks) - ) - - actual_log_ranks = transform(pvalues, enrichment) - np.testing.assert_array_almost_equal(actual_log_ranks, expected_log_ranks) - - -def 
test_all_ties_in_primary_column(): - pvalues = np.array([0.01, 0.01, 0.01, 0.01]) - enrichment = np.array([10.0, 20.0, 15.0, 5.0]) - - # With all pvalues tied, the ranking should depend solely - # on enrichment (higher is better) - expected_secondary_ranks = rankdata(-enrichment, method="average") - expected_log_ranks = -1 * np.log10(expected_secondary_ranks) + np.log10( - np.max(expected_secondary_ranks) - ) - - actual_log_ranks = transform(pvalues, enrichment) - np.testing.assert_array_almost_equal(actual_log_ranks, expected_log_ranks) - - -def test_no_ties_in_primary_column(): - pvalues = np.array([0.01, 0.02, 0.03, 0.04]) - enrichment = np.array([5.0, 10.0, 15.0, 20.0]) - - # With no ties in pvalue, the secondary column should have no effect - expected_ranks = rankdata(pvalues, method="average") - expected_log_ranks = -1 * np.log10(expected_ranks) + np.log10( - np.max(expected_ranks) - ) - - actual_log_ranks = transform(pvalues, enrichment) - np.testing.assert_array_almost_equal(actual_log_ranks, expected_log_ranks) - - -def test_tied_in_both_pvalue_and_enrichment(): - pvalues = np.array([0.01, 0.05, 0.01, 0.02, 0.05]) - enrichment = np.array([5.0, 3.0, 5.0, 4.0, 3.0]) - - # With ties in both primary and secondary columns - expected_ranks = np.array([1.5, 4.5, 1.5, 3.0, 4.5]) - expected_log_ranks = -1 * np.log10(expected_ranks) + np.log10( - np.max(expected_ranks) - ) - - actual_log_ranks = transform(pvalues, enrichment) - np.testing.assert_array_almost_equal(actual_log_ranks, expected_log_ranks) diff --git a/tfbpapi/tests/test_real_datacards.py b/tfbpapi/tests/test_real_datacards.py new file mode 100644 index 0000000..cd07626 --- /dev/null +++ b/tfbpapi/tests/test_real_datacards.py @@ -0,0 +1,706 @@ +""" +Test real datacards from the HuggingFace collection. + +This test suite validates that all real datacards from the BrentLab collection parse +correctly with the updated models.py and specification. 
+ +""" + +import warnings + +import pytest +import yaml + +from tfbpapi.models import DatasetCard + +# Real datacard YAML strings from the collection +BARKAI_COMPENDIUM = """ +license: mit +language: +- en +tags: +- transcription-factor +- binding +- chec-seq +- genomics +- biology +pretty_name: Barkai ChEC-seq Compendium +size_categories: + - 100M 0 + + # Verify config has required fields + config = card.configs[0] + assert config.config_name is not None + assert config.dataset_type is not None + assert config.dataset_info is not None + assert config.dataset_info.features is not None + assert len(config.dataset_info.features) > 0 + + +def test_harbison_2004_condition_definitions(): + """Test that harbison_2004 field-level definitions parse correctly.""" + data = yaml.safe_load(HARBISON_2004) + card = DatasetCard(**data) + + # Find the config + config = card.configs[0] + assert config.config_name == "harbison_2004" + + # Find condition feature + condition_feature = next( + f for f in config.dataset_info.features if f.name == "condition" + ) + + # Should have definitions + assert condition_feature.definitions is not None + assert "YPD" in condition_feature.definitions + assert "Acid" in condition_feature.definitions + assert "BUT14" in condition_feature.definitions + + # YPD definition should have environmental conditions + ypd_def = condition_feature.definitions["YPD"] + assert "environmental_conditions" in ypd_def + + # Acid definition should have target_pH in chemical_treatment + acid_def = condition_feature.definitions["Acid"] + assert "environmental_conditions" in acid_def + assert "chemical_treatment" in acid_def["environmental_conditions"] + assert "target_pH" in acid_def["environmental_conditions"]["chemical_treatment"] + + # BUT14 should have media additives + but14_def = condition_feature.definitions["BUT14"] + assert "environmental_conditions" in but14_def + assert "media" in but14_def["environmental_conditions"] + assert "additives" in but14_def["environmental_conditions"]["media"] + + +def test_hughes_2006_induction(): + """Test that hughes_2006 induction field parses correctly.""" + data = yaml.safe_load(HUGHES_2006) + card = DatasetCard(**data) + + # Check experimental conditions (stored as dict in model_extra) + assert card.configs[0].model_extra is not None + assert "experimental_conditions" in card.configs[0].model_extra + exp_conds = card.configs[0].model_extra["experimental_conditions"] + + # Check induction field + assert "induction" in exp_conds + induction = exp_conds["induction"] + assert "inducer" in induction + assert induction["inducer"]["compound"] == "D-galactose" + assert induction["duration_hours"] == 3 + + +def test_kemmeren_2014_growth_phase(): + """Test that kemmeren_2014 growth phase with od600_tolerance parses correctly.""" + data = yaml.safe_load(KEMMEREN_2014) + card = DatasetCard(**data) + + # Check growth phase (stored as dict in model_extra) + assert card.model_extra is not None + assert "experimental_conditions" in card.model_extra + exp_conds = card.model_extra["experimental_conditions"] + + assert "growth_phase_at_harvest" in exp_conds + growth_phase = exp_conds["growth_phase_at_harvest"] + assert growth_phase["phase"] == "early_mid_log" + assert growth_phase["od600"] == 0.6 + assert growth_phase["od600_tolerance"] == 0.1 + + +def test_hu_2007_strain_background_in_definitions(): + """Test that strain_background in field definitions parses correctly.""" + data = yaml.safe_load(HU_2007) + card = DatasetCard(**data) + + # Find heat_shock feature + 
config = card.configs[0] + heat_shock_feature = next( + f for f in config.dataset_info.features if f.name == "heat_shock" + ) + + # Check definitions + assert heat_shock_feature.definitions is not None + assert "true" in heat_shock_feature.definitions + + # Check strain_background in definition + true_def = heat_shock_feature.definitions["true"] + assert "strain_background" in true_def + + +def test_field_role_validation(): + """Test that role field accepts any string value.""" + # This should parse successfully with any role string + data = yaml.safe_load(CALLINGCARDS) + card = DatasetCard(**data) + + # Find a feature with a role + config = card.configs[0] + regulator_feature = next( + f for f in config.dataset_info.features if f.name == "regulator_locus_tag" + ) + + # Verify role is a string (not an enum) + assert regulator_feature.role == "regulator_identifier" + assert isinstance(regulator_feature.role, str) + + +def test_concentration_fields(): + """Test that various concentration fields parse correctly.""" + data = yaml.safe_load(KEMMEREN_2014) + card = DatasetCard(**data) + + # Check media compounds (stored as dict in model_extra) + assert card.model_extra is not None + assert "experimental_conditions" in card.model_extra + exp_conds = card.model_extra["experimental_conditions"] + assert "media" in exp_conds + media = exp_conds["media"] + + # Check carbon source + assert "carbon_source" in media + carbon_sources = media["carbon_source"] + assert len(carbon_sources) > 0 + carbon = carbon_sources[0] + assert carbon["concentration_percent"] is not None + + # Check nitrogen source with specifications + assert "nitrogen_source" in media + nitrogen_sources = media["nitrogen_source"] + assert len(nitrogen_sources) > 0 + nitrogen = nitrogen_sources[0] + assert nitrogen["specifications"] is not None + assert "without_amino_acids" in nitrogen["specifications"] + + +def test_extra_fields_do_not_raise_errors(): + """Test that extra fields are accepted (with warnings) but don't raise errors.""" + # All real datacards should parse without ValidationError + # even if they have extra fields + datacards = [ + BARKAI_COMPENDIUM, + CALLINGCARDS, + HARBISON_2004, + HU_2007, + HUGHES_2006, + KEMMEREN_2014, + MAHENDRAWADA_2025, + ROSSI_2021, + ] + + for datacard_yaml in datacards: + data = yaml.safe_load(datacard_yaml) + # This should not raise ValidationError + card = DatasetCard(**data) + assert card is not None + + +def test_empty_nitrogen_source_list(): + """Test that empty nitrogen_source lists are accepted.""" + data = yaml.safe_load(BARKAI_COMPENDIUM) + card = DatasetCard(**data) + + # Check that nitrogen_source is an empty list (stored as dict in model_extra) + assert card.model_extra is not None + assert "experimental_conditions" in card.model_extra + exp_conds = card.model_extra["experimental_conditions"] + assert "media" in exp_conds + media = exp_conds["media"] + assert media["nitrogen_source"] == [] + + +def test_media_additives(): + """Test that media additives parse correctly.""" + data = yaml.safe_load(HARBISON_2004) + card = DatasetCard(**data) + + # Find BUT14 condition definition + config = card.configs[0] + condition_feature = next( + f for f in config.dataset_info.features if f.name == "condition" + ) + but14_def = condition_feature.definitions["BUT14"] + + # Check additives + env_conds_dict = but14_def["environmental_conditions"] + media = env_conds_dict["media"] + assert "additives" in media + additives = media["additives"] + assert len(additives) > 0 + assert 
additives[0]["compound"] == "butanol" + assert additives[0]["concentration_percent"] == 1 + + +def test_strain_background_formats(): + """Test that strain_background accepts both string and dict formats.""" + # String format + data1 = yaml.safe_load(BARKAI_COMPENDIUM) + card1 = DatasetCard(**data1) + assert card1.model_extra is not None + assert "experimental_conditions" in card1.model_extra + exp_conds1 = card1.model_extra["experimental_conditions"] + assert exp_conds1["strain_background"] == "BY4741" + + # String format in rossi + data2 = yaml.safe_load(ROSSI_2021) + card2 = DatasetCard(**data2) + assert card2.model_extra is not None + assert "experimental_conditions" in card2.model_extra + exp_conds2 = card2.model_extra["experimental_conditions"] + assert exp_conds2["strain_background"] == "W303" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tfbpapi/tests/test_virtual_db.py b/tfbpapi/tests/test_virtual_db.py new file mode 100644 index 0000000..1293bf9 --- /dev/null +++ b/tfbpapi/tests/test_virtual_db.py @@ -0,0 +1,695 @@ +""" +Tests for VirtualDB unified query interface. + +Tests configuration loading, schema discovery, querying, filtering, and caching. + +""" + +import tempfile +from pathlib import Path + +import pandas as pd +import pytest +import yaml # type: ignore + +from tfbpapi.virtual_db import VirtualDB, get_nested_value, normalize_value + + +class TestHelperFunctions: + """Tests for helper functions.""" + + def test_get_nested_value_simple(self): + """Test simple nested dict navigation.""" + data = {"media": {"name": "YPD"}} + result = get_nested_value(data, "media.name") + assert result == "YPD" + + def test_get_nested_value_missing_key(self): + """Test that missing keys return None.""" + data = {"media": {"name": "YPD"}} + result = get_nested_value(data, "media.carbon_source") + assert result is None + + def test_get_nested_value_list_extraction(self): + """Test extracting property from list of dicts.""" + data = { + "media": { + "carbon_source": [{"compound": "glucose"}, {"compound": "galactose"}] + } + } + result = get_nested_value(data, "media.carbon_source.compound") + assert result == ["glucose", "galactose"] + + def test_get_nested_value_non_dict(self): + """Test that non-dict input returns None.""" + result = get_nested_value("not a dict", "path") # type: ignore + assert result is None + + def test_normalize_value_exact_match(self): + """Test exact alias match.""" + aliases = {"glucose": ["D-glucose", "dextrose"]} + result = normalize_value("D-glucose", aliases) + assert result == "glucose" + + def test_normalize_value_case_insensitive(self): + """Test case-insensitive matching.""" + aliases = {"glucose": ["D-glucose", "dextrose"]} + result = normalize_value("DEXTROSE", aliases) + assert result == "glucose" + + def test_normalize_value_no_match(self): + """Test pass-through when no alias matches.""" + aliases = {"glucose": ["D-glucose"]} + result = normalize_value("maltose", aliases) + assert result == "maltose" + + def test_normalize_value_no_aliases(self): + """Test pass-through when no aliases provided.""" + result = normalize_value("D-glucose", None) + assert result == "D-glucose" + + def test_normalize_value_missing_value_label(self): + """Test missing value handling.""" + result = normalize_value(None, None, "unspecified") + assert result == "unspecified" + + def test_normalize_value_missing_value_no_label(self): + """Test missing value without label.""" + result = normalize_value(None, None) + assert result == "None" + + +class 
TestVirtualDBConfig: + """Tests for VirtualDB configuration loading.""" + + def create_test_config(self, **overrides): + """Helper to create test configuration file.""" + config = { + "factor_aliases": { + "carbon_source": { + "glucose": ["D-glucose", "dextrose"], + "galactose": ["D-galactose", "Galactose"], + } + }, + "missing_value_labels": {"carbon_source": "unspecified"}, + "description": {"carbon_source": "Carbon source in growth media"}, + "repositories": { + "BrentLab/test_repo": { + "temperature_celsius": {"path": "temperature_celsius"}, + "dataset": { + "test_dataset": { + "carbon_source": { + "field": "condition", + "path": "media.carbon_source.compound", + } + } + }, + } + }, + } + config.update(overrides) + return config + + def test_init_with_valid_config(self): + """Test VirtualDB initialization with valid config.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + yaml.dump(self.create_test_config(), f) + config_path = f.name + + try: + vdb = VirtualDB(config_path) + assert vdb.config is not None + assert vdb.token is None + assert len(vdb.cache) == 0 + finally: + Path(config_path).unlink() + + def test_init_with_token(self): + """Test VirtualDB initialization with HF token.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + yaml.dump(self.create_test_config(), f) + config_path = f.name + + try: + vdb = VirtualDB(config_path, token="test_token") + assert vdb.token == "test_token" + finally: + Path(config_path).unlink() + + def test_init_missing_config_file(self): + """Test error when config file doesn't exist.""" + with pytest.raises(FileNotFoundError): + VirtualDB("/nonexistent/path.yaml") + + def test_repr(self): + """Test string representation.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + yaml.dump(self.create_test_config(), f) + config_path = f.name + + try: + vdb = VirtualDB(config_path) + repr_str = repr(vdb) + assert "VirtualDB" in repr_str + assert "1 repositories" in repr_str + assert "1 datasets configured" in repr_str + assert "0 views cached" in repr_str + finally: + Path(config_path).unlink() + + +class TestSchemaDiscovery: + """Tests for schema discovery methods.""" + + def create_multi_dataset_config(self): + """Create config with multiple datasets.""" + return { + "factor_aliases": {}, + "repositories": { + "BrentLab/repo1": { + "temperature_celsius": {"path": "temperature_celsius"}, + "dataset": { + "dataset1": { + "carbon_source": { + "field": "condition", + "path": "media.carbon_source", + } + } + }, + }, + "BrentLab/repo2": { + "nitrogen_source": {"path": "media.nitrogen_source"}, + "dataset": { + "dataset2": { + "carbon_source": {"path": "media.carbon_source"}, + "temperature_celsius": {"path": "temperature_celsius"}, + } + }, + }, + }, + } + + def test_get_fields_all_datasets(self): + """Test getting all fields across all datasets.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + yaml.dump(self.create_multi_dataset_config(), f) + config_path = f.name + + try: + vdb = VirtualDB(config_path) + fields = vdb.get_fields() + assert "carbon_source" in fields + assert "temperature_celsius" in fields + assert "nitrogen_source" in fields + assert fields == sorted(fields) # Should be sorted + finally: + Path(config_path).unlink() + + def test_get_fields_specific_dataset(self): + """Test getting fields for specific dataset.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + 
yaml.dump(self.create_multi_dataset_config(), f) + config_path = f.name + + try: + vdb = VirtualDB(config_path) + fields = vdb.get_fields("BrentLab/repo1", "dataset1") + assert "carbon_source" in fields + assert "temperature_celsius" in fields + # nitrogen_source is in repo2, not repo1 + assert "nitrogen_source" not in fields + finally: + Path(config_path).unlink() + + def test_get_fields_invalid_partial_args(self): + """Test error when only one of repo_id/config_name provided.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + yaml.dump(self.create_multi_dataset_config(), f) + config_path = f.name + + try: + vdb = VirtualDB(config_path) + with pytest.raises(ValueError, match="Both repo_id and config_name"): + vdb.get_fields(repo_id="BrentLab/repo1") + finally: + Path(config_path).unlink() + + def test_get_common_fields(self): + """Test getting fields common to all datasets.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + yaml.dump(self.create_multi_dataset_config(), f) + config_path = f.name + + try: + vdb = VirtualDB(config_path) + common = vdb.get_common_fields() + # Both datasets have carbon_source and temperature_celsius + assert "carbon_source" in common + assert "temperature_celsius" in common + # nitrogen_source is only in repo2 + assert "nitrogen_source" not in common + finally: + Path(config_path).unlink() + + +class TestCaching: + """Tests for view materialization and caching.""" + + def create_simple_config(self): + """Create simple config for testing.""" + return { + "factor_aliases": {}, + "repositories": { + "BrentLab/test_repo": { + "dataset": { + "test_dataset": { + "carbon_source": {"path": "media.carbon_source"} + } + } + } + }, + } + + def test_invalidate_cache_all(self): + """Test invalidating all cache.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + yaml.dump(self.create_simple_config(), f) + config_path = f.name + + try: + vdb = VirtualDB(config_path) + # Manually add to cache + vdb.cache[("BrentLab/test_repo", "test_dataset")] = pd.DataFrame() + assert len(vdb.cache) == 1 + + vdb.invalidate_cache() + assert len(vdb.cache) == 0 + finally: + Path(config_path).unlink() + + def test_invalidate_cache_specific(self): + """Test invalidating specific dataset cache.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + yaml.dump(self.create_simple_config(), f) + config_path = f.name + + try: + vdb = VirtualDB(config_path) + # Add multiple entries to cache + vdb.cache[("BrentLab/test_repo", "test_dataset")] = pd.DataFrame() + vdb.cache[("BrentLab/other_repo", "other_dataset")] = pd.DataFrame() + assert len(vdb.cache) == 2 + + vdb.invalidate_cache([("BrentLab/test_repo", "test_dataset")]) + assert len(vdb.cache) == 1 + assert ("BrentLab/other_repo", "other_dataset") in vdb.cache + finally: + Path(config_path).unlink() + + +class TestFiltering: + """Tests for filter application logic.""" + + def test_apply_filters_exact_match(self): + """Test exact value matching in filters.""" + df = pd.DataFrame( + { + "sample_id": ["s1", "s2", "s3"], + "carbon_source": ["glucose", "galactose", "glucose"], + } + ) + + # Create minimal VirtualDB instance + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + config = { + "repositories": { + "BrentLab/test": { + "dataset": { + "test": {"carbon_source": {"path": "media.carbon_source"}} + } + } + } + } + yaml.dump(config, f) + config_path = f.name + + try: + vdb = 
VirtualDB(config_path) + filtered = vdb._apply_filters( + df, {"carbon_source": "glucose"}, "BrentLab/test", "test" + ) + assert len(filtered) == 2 + assert all(filtered["carbon_source"] == "glucose") + finally: + Path(config_path).unlink() + + def test_apply_filters_numeric_range(self): + """Test numeric range filtering.""" + df = pd.DataFrame( + {"sample_id": ["s1", "s2", "s3"], "temperature_celsius": [25, 30, 37]} + ) + + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + config = { + "repositories": { + "BrentLab/test": { + "dataset": { + "test": { + "temperature_celsius": {"path": "temperature_celsius"} + } + } + } + } + } + yaml.dump(config, f) + config_path = f.name + + try: + vdb = VirtualDB(config_path) + + # Test >= operator + filtered = vdb._apply_filters( + df, {"temperature_celsius": (">=", 30)}, "BrentLab/test", "test" + ) + assert len(filtered) == 2 + assert all(filtered["temperature_celsius"] >= 30) + + # Test between operator + filtered = vdb._apply_filters( + df, + {"temperature_celsius": ("between", 28, 32)}, + "BrentLab/test", + "test", + ) + assert len(filtered) == 1 + assert filtered.iloc[0]["temperature_celsius"] == 30 + finally: + Path(config_path).unlink() + + def test_apply_filters_with_alias_expansion(self): + """Test filter with alias expansion.""" + df = pd.DataFrame( + { + "sample_id": ["s1", "s2", "s3"], + "carbon_source": ["glucose", "D-glucose", "galactose"], + } + ) + + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + config = { + "factor_aliases": { + "carbon_source": {"glucose": ["D-glucose", "dextrose", "glucose"]} + }, + "repositories": { + "BrentLab/test": { + "dataset": { + "test": {"carbon_source": {"path": "media.carbon_source"}} + } + } + }, + } + yaml.dump(config, f) + config_path = f.name + + try: + vdb = VirtualDB(config_path) + filtered = vdb._apply_filters( + df, {"carbon_source": "glucose"}, "BrentLab/test", "test" + ) + # Should match both "glucose" and "D-glucose" due to alias expansion + assert len(filtered) == 2 + finally: + Path(config_path).unlink() + + +class TestExtraction: + """Tests for metadata extraction methods.""" + + def test_add_field_metadata(self): + """Test adding field-level metadata to DataFrame.""" + df = pd.DataFrame({"sample_id": ["s1", "s2"], "condition": ["YPD", "YPG"]}) + + field_metadata = { + "YPD": {"carbon_source": ["glucose"], "growth_media": ["YPD"]}, + "YPG": {"carbon_source": ["glycerol"], "growth_media": ["YPG"]}, + } + + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + config = { + "repositories": { + "BrentLab/test": { + "dataset": { + "test": {"carbon_source": {"path": "media.carbon_source"}} + } + } + } + } + yaml.dump(config, f) + config_path = f.name + + try: + vdb = VirtualDB(config_path) + result = vdb._add_field_metadata(df, field_metadata) + + assert "carbon_source" in result.columns + assert "growth_media" in result.columns + assert ( + result.loc[result["condition"] == "YPD", "carbon_source"].iloc[0] + == "glucose" + ) + assert ( + result.loc[result["condition"] == "YPG", "carbon_source"].iloc[0] + == "glycerol" + ) + finally: + Path(config_path).unlink() + + +class TestQuery: + """Tests for query method - requires mocking HfQueryAPI.""" + + def test_query_empty_result(self): + """Test query with no matching datasets.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + config = { + "repositories": { + "BrentLab/test": { + "dataset": { + "test": {"carbon_source": 
{"path": "media.carbon_source"}} + } + } + } + } + yaml.dump(config, f) + config_path = f.name + + try: + vdb = VirtualDB(config_path) + # Query with non-configured dataset should return empty + result = vdb.query(datasets=[("BrentLab/other", "other")]) + assert isinstance(result, pd.DataFrame) + assert result.empty + finally: + Path(config_path).unlink() + + +class TestComparativeDatasets: + """Tests for comparative dataset field-based joins.""" + + def test_parse_composite_identifier(self): + """Test parsing composite identifiers.""" + composite_id = "BrentLab/harbison_2004;harbison_2004;sample_42" + repo, config, sample = VirtualDB._parse_composite_identifier(composite_id) + assert repo == "BrentLab/harbison_2004" + assert config == "harbison_2004" + assert sample == "sample_42" + + def test_parse_composite_identifier_invalid(self): + """Test that invalid composite IDs raise errors.""" + with pytest.raises(ValueError, match="Invalid composite ID format"): + VirtualDB._parse_composite_identifier("invalid:format") + + def test_get_comparative_fields_for_dataset(self): + """Test getting comparative fields mapping.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + config = { + "repositories": { + "BrentLab/primary": { + "dataset": { + "primary_data": { + "sample_id": {"field": "sample_id"}, + "comparative_analyses": [ + { + "repo": "BrentLab/comparative", + "dataset": "comp_data", + "via_field": "binding_id", + } + ], + } + } + }, + "BrentLab/comparative": { + "dataset": { + "comp_data": { + "dto_fdr": {"field": "dto_fdr"}, + "dto_pvalue": {"field": "dto_empirical_pvalue"}, + } + } + }, + } + } + yaml.dump(config, f) + config_path = f.name + + try: + vdb = VirtualDB(config_path) + field_mapping = vdb._get_comparative_fields_for_dataset( + "BrentLab/primary", "primary_data" + ) + + # Should have dto_fdr and dto_pvalue, but NOT binding_id (via_field) + assert "dto_fdr" in field_mapping + assert "dto_pvalue" in field_mapping + assert "binding_id" not in field_mapping + + # Check mapping structure + assert field_mapping["dto_fdr"]["comp_repo"] == "BrentLab/comparative" + assert field_mapping["dto_fdr"]["comp_dataset"] == "comp_data" + assert field_mapping["dto_fdr"]["via_field"] == "binding_id" + finally: + Path(config_path).unlink() + + def test_get_comparative_fields_no_links(self): + """Test that datasets without comparative links return empty mapping.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + config = { + "repositories": { + "BrentLab/primary": { + "dataset": { + "primary_data": {"sample_id": {"field": "sample_id"}} + } + } + } + } + yaml.dump(config, f) + config_path = f.name + + try: + vdb = VirtualDB(config_path) + field_mapping = vdb._get_comparative_fields_for_dataset( + "BrentLab/primary", "primary_data" + ) + assert field_mapping == {} + finally: + Path(config_path).unlink() + + def test_get_comparative_analyses(self): + """Test getting comparative analysis relationships.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + config = { + "repositories": { + "BrentLab/primary": { + "dataset": { + "primary_data": { + "sample_id": {"field": "sample_id"}, + "comparative_analyses": [ + { + "repo": "BrentLab/comparative", + "dataset": "comp_data", + "via_field": "binding_id", + } + ], + } + } + }, + "BrentLab/comparative": { + "dataset": {"comp_data": {"dto_fdr": {"field": "dto_fdr"}}} + }, + } + } + yaml.dump(config, f) + config_path = f.name + + try: + vdb = 
VirtualDB(config_path)
+            info = vdb.get_comparative_analyses()
+
+            # Check primary to comparative mapping
+            assert "BrentLab/primary/primary_data" in info["primary_to_comparative"]
+            links = info["primary_to_comparative"]["BrentLab/primary/primary_data"]
+            assert len(links) == 1
+            assert links[0]["comparative_repo"] == "BrentLab/comparative"
+            assert links[0]["comparative_dataset"] == "comp_data"
+            assert links[0]["via_field"] == "binding_id"
+
+            # Check comparative fields
+            assert "BrentLab/comparative/comp_data" in info["comparative_fields"]
+            assert (
+                "dto_fdr"
+                in info["comparative_fields"]["BrentLab/comparative/comp_data"]
+            )
+        finally:
+            Path(config_path).unlink()
+
+    def test_get_comparative_analyses_filtered(self):
+        """Test filtering comparative analyses by repo and config."""
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+            config = {
+                "repositories": {
+                    "BrentLab/primary1": {
+                        "dataset": {
+                            "data1": {
+                                "sample_id": {"field": "sample_id"},
+                                "comparative_analyses": [
+                                    {
+                                        "repo": "BrentLab/comp",
+                                        "dataset": "comp_data",
+                                        "via_field": "id1",
+                                    }
+                                ],
+                            }
+                        }
+                    },
+                    "BrentLab/primary2": {
+                        "dataset": {
+                            "data2": {
+                                "sample_id": {"field": "sample_id"},
+                                "comparative_analyses": [
+                                    {
+                                        "repo": "BrentLab/comp",
+                                        "dataset": "comp_data",
+                                        "via_field": "id2",
+                                    }
+                                ],
+                            }
+                        }
+                    },
+                }
+            }
+            yaml.dump(config, f)
+            config_path = f.name
+
+        try:
+            vdb = VirtualDB(config_path)
+
+            # Get all
+            all_info = vdb.get_comparative_analyses()
+            assert len(all_info["primary_to_comparative"]) == 2
+
+            # Filter by repo and config
+            filtered = vdb.get_comparative_analyses("BrentLab/primary1", "data1")
+            assert len(filtered["primary_to_comparative"]) == 1
+            assert "BrentLab/primary1/data1" in filtered["primary_to_comparative"]
+
+            # Filter by repo only
+            repo_filtered = vdb.get_comparative_analyses("BrentLab/primary2")
+            assert len(repo_filtered["primary_to_comparative"]) == 1
+            assert "BrentLab/primary2/data2" in repo_filtered["primary_to_comparative"]
+        finally:
+            Path(config_path).unlink()
+
+
+# Note: Full integration tests with real HuggingFace datasets would go here
+# but are excluded as they require network access and specific test datasets.
+# These tests cover the core logic and would be supplemented with integration
+# tests using the actual sample config and real datasets like harbison_2004.
diff --git a/tfbpapi/virtual_db.py b/tfbpapi/virtual_db.py
new file mode 100644
index 0000000..f6dd12e
--- /dev/null
+++ b/tfbpapi/virtual_db.py
@@ -0,0 +1,1345 @@
+"""
+VirtualDB provides a unified query interface across heterogeneous datasets.
+
+This module enables cross-dataset queries with standardized field names and values,
+mapping varying experimental condition structures to a common schema through external
+YAML configuration.
+
+Key Components:
+- VirtualDB: Main interface for unified cross-dataset queries
+- Helper functions: get_nested_value(), normalize_value() for metadata extraction
+- Configuration-driven schema via models.MetadataConfig
+
+Example Usage:
+    >>> from tfbpapi.virtual_db import VirtualDB
+    >>> vdb = VirtualDB("config.yaml")
+    >>>
+    >>> # Discover available fields
+    >>> fields = vdb.get_fields()
+    >>> print(fields)  # ["carbon_source", "temperature_celsius", ...]
+    >>>
+    >>> # Query across datasets
+    >>> df = vdb.query(
+    ...     filters={"carbon_source": "glucose", "temperature_celsius": 30},
+    ...     fields=["sample_id", "carbon_source", "temperature_celsius"]
+    ...
) + >>> + >>> # Get complete data with measurements + >>> df = vdb.query( + ... filters={"carbon_source": "glucose"}, + ... complete=True + ... ) + +""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import duckdb +import pandas as pd + +from tfbpapi.datacard import DataCard +from tfbpapi.errors import DataCardError +from tfbpapi.hf_cache_manager import HfCacheManager +from tfbpapi.models import MetadataConfig, PropertyMapping + + +def get_nested_value(data: dict, path: str) -> Any: + """ + Navigate nested dict/list using dot notation. + + Handles missing intermediate keys gracefully by returning None. + Supports extracting properties from lists of dicts. + + :param data: Dictionary to navigate + :param path: Dot-separated path (e.g., "media.carbon_source.compound") + :return: Value at path or None if not found + + Examples: + Simple nested dict: + get_nested_value({"media": {"name": "YPD"}}, "media.name") + Returns: "YPD" + + List of dicts - extract property from each item: + get_nested_value( + {"media": {"carbon_source": [{"compound": "glucose"}, + {"compound": "galactose"}]}}, + "media.carbon_source.compound" + ) + Returns: ["glucose", "galactose"] + + """ + if not isinstance(data, dict): + return None + + keys = path.split(".") + current = data + + for i, key in enumerate(keys): + if isinstance(current, dict): + if key not in current: + return None + current = current[key] + elif isinstance(current, list): + # If current is a list and we have more keys, + # extract property from each item + if i < len(keys): + # Extract the remaining path from each list item + remaining_path = ".".join(keys[i:]) + results = [] + for item in current: + if isinstance(item, dict): + val = get_nested_value(item, remaining_path) + if val is not None: + results.append(val) + return results if results else None + else: + return None + + return current + + +def normalize_value( + actual_value: Any, + aliases: dict[str, list[Any]] | None, + missing_value_label: str | None = None, +) -> str: + """ + Normalize a value using optional alias mappings (case-insensitive). + + Returns the alias name if a match is found, otherwise returns the + original value as a string. Handles missing values by returning + the configured missing_value_label. + + :param actual_value: The value from the data to normalize + :param aliases: Optional dict mapping alias names to lists of actual values. 
+ Example: {"glucose": ["D-glucose", "dextrose"]} + :param missing_value_label: Label to use for None/missing values + :return: Alias name if match found, missing_value_label if None, + otherwise str(actual_value) + + Examples: + With aliases - exact match: + normalize_value("D-glucose", {"glucose": ["D-glucose", "dextrose"]}) + Returns: "glucose" + + With aliases - case-insensitive match: + normalize_value("DEXTROSE", {"glucose": ["D-glucose", "dextrose"]}) + Returns: "glucose" + + Missing value: + normalize_value(None, None, "unspecified") + Returns: "unspecified" + + No alias match - pass through: + normalize_value("maltose", {"glucose": ["D-glucose"]}) + Returns: "maltose" + + """ + # Handle None/missing values + if actual_value is None: + return missing_value_label if missing_value_label else "None" + + if aliases is None: + return str(actual_value) + + # Convert to string for comparison (case-insensitive) + actual_str = str(actual_value).lower() + + # Check each alias mapping + for alias_name, actual_values in aliases.items(): + for val in actual_values: + if str(val).lower() == actual_str: + return alias_name + + # No match found - pass through original value + return str(actual_value) + + +class VirtualDB: + """ + Unified query interface across heterogeneous datasets. + + VirtualDB provides a virtual database layer over multiple HuggingFace datasets, + allowing cross-dataset queries with standardized field names and normalized values. + Each configured dataset becomes a view with a common schema defined by external + YAML configuration. + + The YAML configuration specifies: + 1. Property mappings: How to extract each field from dataset structures + 2. Factor aliases: Normalize varying terminologies to standard values + 3. Missing value labels: Handle missing data consistently + 4. Descriptions: Document each field's semantics + + Attributes: + config: MetadataConfig instance with all configuration + token: Optional HuggingFace token for private datasets + cache: Dict mapping (repo_id, config_name) to cached DataFrame views + + """ + + def __init__(self, config_path: Path | str, token: str | None = None): + """ + Initialize VirtualDB with configuration and optional auth token. + + :param config_path: Path to YAML configuration file + :param token: Optional HuggingFace token for private datasets + :raises FileNotFoundError: If config file doesn't exist + :raises ValueError: If configuration is invalid + + """ + self.config = MetadataConfig.from_yaml(config_path) + self.token = token + self.cache: dict[tuple[str, str], pd.DataFrame] = {} + # Build mapping of comparative dataset references + self._comparative_links = self._build_comparative_links() + + def get_fields( + self, repo_id: str | None = None, config_name: str | None = None + ) -> list[str]: + """ + Get list of queryable fields. 
+ + :param repo_id: Optional repository ID to filter to specific dataset + :param config_name: Optional config name (required if repo_id provided) + :return: List of field names + + Examples: + All fields across all datasets: + fields = vdb.get_fields() + + Fields for specific dataset: + fields = vdb.get_fields("BrentLab/harbison_2004", "harbison_2004") + + """ + if repo_id is not None and config_name is not None: + # Get fields for specific dataset + mappings = self.config.get_property_mappings(repo_id, config_name) + return sorted(mappings.keys()) + + if repo_id is not None or config_name is not None: + raise ValueError( + "Both repo_id and config_name must be provided, or neither" + ) + + # Get all fields across all datasets + all_fields: set[str] = set() + for repo_id, repo_config in self.config.repositories.items(): + # Add repo-wide fields + all_fields.update(repo_config.properties.keys()) + # Add dataset-specific fields + if repo_config.dataset: + for dataset_config in repo_config.dataset.values(): + # DatasetVirtualDBConfig stores property mappings in model_extra + if ( + hasattr(dataset_config, "model_extra") + and dataset_config.model_extra + ): + all_fields.update(dataset_config.model_extra.keys()) + # Also include special fields if they exist + if dataset_config.sample_id: + all_fields.add("sample_id") + + return sorted(all_fields) + + def get_common_fields(self) -> list[str]: + """ + Get fields present in ALL configured datasets. + + :return: List of field names common to all datasets + + Example: + common = vdb.get_common_fields() + # ["carbon_source", "temperature_celsius"] + + """ + if not self.config.repositories: + return [] + + # Get field sets for each dataset + dataset_fields: list[set[str]] = [] + for repo_id, repo_config in self.config.repositories.items(): + if repo_config.dataset: + for config_name in repo_config.dataset.keys(): + mappings = self.config.get_property_mappings(repo_id, config_name) + dataset_fields.append(set(mappings.keys())) + + if not dataset_fields: + return [] + + # Return intersection + common = set.intersection(*dataset_fields) + return sorted(common) + + def get_unique_values( + self, field: str, by_dataset: bool = False + ) -> list[str] | dict[str, list[str]]: + """ + Get unique values for a field across datasets (with normalization). 
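+
+        Returned values are normalized through the configured factor_aliases
+        before deduplication, so dataset-specific spellings collapse to their
+        canonical alias names. A sketch, assuming the alias mapping
+        {"glucose": ["D-glucose", "dextrose"]} used in the tests above:
+
+            vdb.get_unique_values("carbon_source")
+            # ["galactose", "glucose"]  (never "D-glucose" or "dextrose")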
+ + :param field: Field name to get values for + :param by_dataset: If True, return dict keyed by dataset identifier + :return: List of unique normalized values, or dict if by_dataset=True + + Examples: + All unique values: + values = vdb.get_unique_values("carbon_source") + # ["glucose", "galactose", "raffinose"] + + Values by dataset: + values = vdb.get_unique_values("carbon_source", by_dataset=True) + # {"BrentLab/harbison_2004": ["glucose", "galactose"], + # "BrentLab/kemmeren_2014": ["glucose", "raffinose"]} + + """ + if by_dataset: + result: dict[str, list[str]] = {} + else: + all_values: set[str] = set() + + # Query each dataset that has this field + for repo_id, repo_config in self.config.repositories.items(): + if repo_config.dataset: + for config_name in repo_config.dataset.keys(): + mappings = self.config.get_property_mappings(repo_id, config_name) + if field not in mappings: + continue + + # Build metadata table for this dataset + metadata_df = self._build_metadata_table(repo_id, config_name) + if metadata_df.empty or field not in metadata_df.columns: + continue + + # Get unique values (already normalized) + unique_vals = metadata_df[field].dropna().unique().tolist() + + if by_dataset: + dataset_key = f"{repo_id}/{config_name}" + result[dataset_key] = sorted(unique_vals) + else: + all_values.update(unique_vals) + + if by_dataset: + return result + else: + return sorted(all_values) + + def get_comparative_analyses( + self, repo_id: str | None = None, config_name: str | None = None + ) -> dict[str, Any]: + """ + Get information about comparative analysis relationships. + + Returns information about which comparative datasets are available + and how they link to primary datasets. Useful for discovering + what cross-dataset analyses can be performed. 
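+
+        A sketch of walking the returned structure (dataset names are
+        hypothetical, mirroring the test fixtures above):
+
+            info = vdb.get_comparative_analyses()
+            for primary, links in info["primary_to_comparative"].items():
+                for link in links:
+                    print(primary, "->", link["comparative_repo"],
+                          link["comparative_dataset"], "via", link["via_field"])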
+ + :param repo_id: Optional repository ID to filter to specific repo + :param config_name: Optional config name (requires repo_id) + :return: Dictionary with two keys: + - "primary_to_comparative": Maps primary datasets to their + comparative analyses + - "comparative_fields": Maps comparative datasets to fields + available for joining + :raises ValueError: If config_name provided without repo_id + + Examples: + Get all comparative analysis relationships: + info = vdb.get_comparative_analyses() + + Get relationships for specific primary dataset: + info = vdb.get_comparative_analyses( + "BrentLab/callingcards", "annotated_features" + ) + + """ + if config_name and not repo_id: + raise ValueError("repo_id required when config_name is specified") + + primary_to_comparative: dict[str, list[dict[str, str]]] = {} + comparative_fields: dict[str, list[str]] = {} + + # Filter links based on parameters + if repo_id and config_name: + # Specific dataset requested + links_to_process = { + (repo_id, config_name): self._comparative_links.get( + (repo_id, config_name), {} + ) + } + elif repo_id: + # All configs in specific repo + links_to_process = { + k: v for k, v in self._comparative_links.items() if k[0] == repo_id + } + else: + # All links + links_to_process = self._comparative_links + + # Build primary to comparative mapping + for (prim_repo, prim_config), link_info in links_to_process.items(): + if "comparative_analyses" not in link_info: + continue + + dataset_key = f"{prim_repo}/{prim_config}" + primary_to_comparative[dataset_key] = [] + + for ca in link_info["comparative_analyses"]: + primary_to_comparative[dataset_key].append( + { + "comparative_repo": ca["repo"], + "comparative_dataset": ca["dataset"], + "via_field": ca["via_field"], + } + ) + + # Track which fields are available from comparative datasets + comp_key = f"{ca['repo']}/{ca['dataset']}" + if comp_key not in comparative_fields: + # Get fields from the comparative dataset + # First try config mappings + comp_fields = self.get_fields(ca["repo"], ca["dataset"]) + + # If no mappings, get actual fields from DataCard + if not comp_fields: + try: + card = DataCard(ca["repo"], token=self.token) + config = card.get_config(ca["dataset"]) + if config and config.dataset_info: + comp_fields = [ + f.name for f in config.dataset_info.features + ] + except Exception: + comp_fields = [] + + comparative_fields[comp_key] = comp_fields + + return { + "primary_to_comparative": primary_to_comparative, + "comparative_fields": comparative_fields, + } + + def query( + self, + filters: dict[str, Any] | None = None, + datasets: list[tuple[str, str]] | None = None, + fields: list[str] | None = None, + complete: bool = False, + ) -> pd.DataFrame: + """ + Query VirtualDB with optional filters and field selection. 
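+
+        Filter values may be exact values (expanded through factor_aliases) or
+        operator tuples for numeric fields, as exercised in the filtering
+        tests above; a sketch:
+
+            df = vdb.query(filters={"temperature_celsius": (">=", 30)})
+            df = vdb.query(filters={"temperature_celsius": ("between", 28, 32)})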
+
+        :param filters: Dict of field:value pairs to filter on
+        :param datasets: List of (repo_id, config_name) tuples to query (None = all)
+        :param fields: List of field names to return (None = all)
+        :param complete: If True, return measurement-level data; if False, sample-level
+        :return: DataFrame with query results
+
+        Examples:
+            Basic query across all datasets:
+                df = vdb.query(filters={"carbon_source": "glucose"})
+
+            Query specific datasets with field selection:
+                df = vdb.query(
+                    filters={"carbon_source": "glucose", "temperature_celsius": 30},
+                    datasets=[("BrentLab/harbison_2004", "harbison_2004")],
+                    fields=["sample_id", "carbon_source", "temperature_celsius"]
+                )
+
+            Complete data with measurements:
+                df = vdb.query(
+                    filters={"carbon_source": "glucose"},
+                    complete=True
+                )
+
+        """
+        # Determine which datasets to query
+        if datasets is None:
+            # Query all configured datasets
+            datasets = []
+            for repo_id, repo_config in self.config.repositories.items():
+                if repo_config.dataset:
+                    for config_name in repo_config.dataset.keys():
+                        datasets.append((repo_id, config_name))
+
+        if not datasets:
+            return pd.DataFrame()
+
+        # Query each dataset
+        results: list[pd.DataFrame] = []
+        for repo_id, config_name in datasets:
+            # Build metadata table
+            metadata_df = self._build_metadata_table(repo_id, config_name)
+            if metadata_df.empty:
+                continue
+
+            # Separate filters into primary and comparative
+            primary_filters = {}
+            comparative_filters = {}
+            if filters:
+                # Get comparative field mapping
+                comp_field_mapping = self._get_comparative_fields_for_dataset(
+                    repo_id, config_name
+                )
+                for field, value in filters.items():
+                    if field in comp_field_mapping:
+                        comparative_filters[field] = value
+                    else:
+                        primary_filters[field] = value
+
+            # Apply primary filters first
+            if primary_filters:
+                metadata_df = self._apply_filters(
+                    metadata_df, primary_filters, repo_id, config_name
+                )
+
+            # Enrich with comparative data if needed
+            # IMPORTANT: Do this BEFORE getting complete data so comparative fields
+            # are joined at the sample level, not measurement level
+            # This happens when: fields are requested from comparative datasets
+            # OR when filtering on comparative fields
+            if fields or comparative_filters:
+                comp_field_mapping = self._get_comparative_fields_for_dataset(
+                    repo_id, config_name
+                )
+                requested_comp_fields = [
+                    f for f in (fields or []) if f in comp_field_mapping
+                ]
+                # Also need fields that are filtered on
+                filtered_comp_fields = [
+                    f for f in comparative_filters.keys() if f in comp_field_mapping
+                ]
+                all_comp_fields = list(
+                    set(requested_comp_fields + filtered_comp_fields)
+                )
+                if all_comp_fields:
+                    metadata_df = self._enrich_with_comparative_data(
+                        metadata_df, repo_id, config_name, all_comp_fields
+                    )
+
+            # Apply comparative filters after enrichment
+            if comparative_filters:
+                metadata_df = self._apply_filters(
+                    metadata_df, comparative_filters, repo_id, config_name
+                )
+
+            # If complete=True, join with full data
+            # Do this AFTER comparative enrichment so DTO fields are already added
+            if complete:
+                sample_ids = metadata_df["sample_id"].tolist()
+                if sample_ids:
+                    full_df = self._get_complete_data(
+                        repo_id, config_name, sample_ids, metadata_df
+                    )
+                    if not full_df.empty:
+                        metadata_df = full_df
+
+            # Select requested fields
+            if fields:
+                # Keep sample_id and any dataset identifier columns
+                keep_cols = ["sample_id"]
+                if "dataset_id" in metadata_df.columns:
+                    keep_cols.append("dataset_id")
+                # Add requested fields that exist
+                for field in fields:
+                    if field
in metadata_df.columns and field not in keep_cols: + keep_cols.append(field) + metadata_df = metadata_df[keep_cols].copy() + + # Add dataset identifier + if "dataset_id" not in metadata_df.columns: + metadata_df = metadata_df.copy() + metadata_df["dataset_id"] = f"{repo_id}/{config_name}" + + results.append(metadata_df) + + if not results: + return pd.DataFrame() + + # Concatenate results, filling NaN for missing columns + return pd.concat(results, ignore_index=True, sort=False) + + def materialize_views(self, datasets: list[tuple[str, str]] | None = None) -> None: + """ + Build and cache metadata DataFrames for faster subsequent queries. + + :param datasets: List of (repo_id, config_name) tuples to materialize + (None = materialize all) + + Example: + vdb.materialize_views() # Cache all datasets + vdb.materialize_views([("BrentLab/harbison_2004", "harbison_2004")]) + + """ + if datasets is None: + # Materialize all configured datasets + datasets = [] + for repo_id, repo_config in self.config.repositories.items(): + if repo_config.dataset: + for config_name in repo_config.dataset.keys(): + datasets.append((repo_id, config_name)) + + for repo_id, config_name in datasets: + # Build and cache + self._build_metadata_table(repo_id, config_name, use_cache=False) + + def invalidate_cache(self, datasets: list[tuple[str, str]] | None = None) -> None: + """ + Clear cached metadata DataFrames. + + :param datasets: List of (repo_id, config_name) tuples to invalidate + (None = invalidate all) + + Example: + vdb.invalidate_cache() # Clear all cache + vdb.invalidate_cache([("BrentLab/harbison_2004", "harbison_2004")]) + + """ + if datasets is None: + self.cache.clear() + else: + for dataset_key in datasets: + if dataset_key in self.cache: + del self.cache[dataset_key] + + def _build_comparative_links(self) -> dict[tuple[str, str], dict[str, Any]]: + """ + Build mapping of primary datasets to their comparative dataset references. + + Returns dict keyed by (repo_id, config_name) with value being dict: { + "comparative_analyses": [ { "repo": comparative_repo_id, + "dataset": comparative_config_name, "via_field": + field_name_with_composite_ids } ] } + + """ + links: dict[tuple[str, str], dict[str, Any]] = {} + + for repo_id, repo_config in self.config.repositories.items(): + if not repo_config.dataset: + continue + + for config_name, dataset_config in repo_config.dataset.items(): + if dataset_config.comparative_analyses: + links[(repo_id, config_name)] = { + "comparative_analyses": [ + { + "repo": ca.repo, + "dataset": ca.dataset, + "via_field": ca.via_field, + } + for ca in dataset_config.comparative_analyses + ] + } + + return links + + def _get_comparative_fields_for_dataset( + self, repo_id: str, config_name: str + ) -> dict[str, dict[str, str]]: + """ + Get mapping of comparative fields available for a primary dataset. 
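+
+        query() uses this mapping to split user filters into primary and
+        comparative groups before enrichment; a sketch of that split
+        (field names are hypothetical):
+
+            mapping = vdb._get_comparative_fields_for_dataset(repo, config)
+            comparative = {f: v for f, v in filters.items() if f in mapping}
+            primary = {f: v for f, v in filters.items() if f not in mapping}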
+ + :param repo_id: Primary dataset repository ID + :param config_name: Primary dataset config name + :return: Dict mapping field_name to comparative dataset info + {field_name: { + "comp_repo": comparative_repo_id, + "comp_dataset": comparative_dataset_name, + "via_field": field_with_composite_ids + }} + + Example: + For callingcards dataset linked to DTO via binding_id: + { + "dto_fdr": { + "comp_repo": "BrentLab/yeast_comparative_analysis", + "comp_dataset": "dto", + "via_field": "binding_id" + }, + "dto_empirical_pvalue": {...} + } + + """ + field_mapping: dict[str, dict[str, str]] = {} + + # Get comparative analyses for this dataset + links = self._comparative_links.get((repo_id, config_name), {}) + if "comparative_analyses" not in links: + return field_mapping + + # For each comparative dataset, get its fields + for ca in links["comparative_analyses"]: + comp_repo = ca["repo"] + comp_dataset = ca["dataset"] + via_field = ca["via_field"] + + # Get fields from comparative dataset + comp_fields = self.get_fields(comp_repo, comp_dataset) + + # If no fields from config, try DataCard + if not comp_fields: + try: + from tfbpapi.datacard import DataCard + + card = DataCard(comp_repo, token=self.token) + config = card.get_config(comp_dataset) + if config and config.dataset_info: + comp_fields = [f.name for f in config.dataset_info.features] + except Exception: + comp_fields = [] + + # Map each field to this comparative dataset + for field_name in comp_fields: + # Skip the via_field itself (it's the join key) + if field_name == via_field: + continue + + field_mapping[field_name] = { + "comp_repo": comp_repo, + "comp_dataset": comp_dataset, + "via_field": via_field, + } + + return field_mapping + + def _enrich_with_comparative_data( + self, + primary_df: pd.DataFrame, + repo_id: str, + config_name: str, + requested_fields: list[str], + ) -> pd.DataFrame: + """ + Enrich primary dataset with fields from comparative datasets. 
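+
+        Each comparative row carries a composite identifier in its via_field
+        ("repo_id;config_name;sample_id"); rows whose first two components
+        match this dataset are left-joined onto primary_df by sample_id. A
+        condensed sketch of the effective join performed below:
+
+            comp_df["_join_sample_id"] = comp_df[via_field].apply(extract_sample_id)
+            result = primary_df.merge(
+                comp_df.drop(columns=[via_field]),
+                left_on="sample_id", right_on="_join_sample_id", how="left",
+            )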
+ + :param primary_df: Primary dataset DataFrame with sample_id column + :param repo_id: Primary dataset repository ID + :param config_name: Primary dataset config name + :param requested_fields: List of field names requested by user + :return: DataFrame enriched with comparative fields + + """ + # Get mapping of which fields come from which comparative datasets + comp_field_mapping = self._get_comparative_fields_for_dataset( + repo_id, config_name + ) + + if not comp_field_mapping: + return primary_df + + # Find which requested fields are from comparative datasets + comp_fields_to_fetch = [f for f in requested_fields if f in comp_field_mapping] + + if not comp_fields_to_fetch: + return primary_df + + # Group fields by comparative dataset to minimize queries + by_comp_dataset: dict[tuple[str, str, str], list[str]] = {} + for field in comp_fields_to_fetch: + info = comp_field_mapping[field] + key = (info["comp_repo"], info["comp_dataset"], info["via_field"]) + if key not in by_comp_dataset: + by_comp_dataset[key] = [] + by_comp_dataset[key].append(field) + + # For each comparative dataset, load and join + result_df = primary_df.copy() + + for (comp_repo, comp_dataset, via_field), fields in by_comp_dataset.items(): + try: + # Load comparative dataset using HfCacheManager + # but query the raw data table instead of metadata view + from tfbpapi.hf_cache_manager import HfCacheManager + + comp_cache_mgr = HfCacheManager( + comp_repo, duckdb_conn=duckdb.connect(":memory:"), token=self.token + ) + + # Get the config to load data + comp_config = comp_cache_mgr.get_config(comp_dataset) + if not comp_config: + continue + + # Load the data (this will download and register parquet files) + result = comp_cache_mgr._get_metadata_for_config(comp_config) + if not result.get("success", False): + continue + + # Now query the raw data table directly (not the metadata view) + # The raw table name is config_name without "metadata_" prefix + select_fields = [via_field] + fields + columns = ", ".join(select_fields) + + # Query the actual parquet data by creating a view from the files + try: + # Get file paths that were loaded + import glob + + from huggingface_hub import snapshot_download + + cache_dir = snapshot_download( + repo_id=comp_repo, + repo_type="dataset", + allow_patterns=f"{comp_dataset}/**/*.parquet", + token=self.token, + ) + + parquet_files = glob.glob( + f"{cache_dir}/{comp_dataset}/**/*.parquet", recursive=True + ) + + if not parquet_files: + continue + + # Create a temporary view from parquet files + temp_view = f"temp_{comp_dataset}_raw" + files_sql = ", ".join([f"'{f}'" for f in parquet_files]) + comp_cache_mgr.duckdb_conn.execute( + f"CREATE OR REPLACE VIEW {temp_view} AS " + f"SELECT * FROM read_parquet([{files_sql}])" + ) + + # Query the view + sql = f"SELECT {columns} FROM {temp_view}" + comp_df = comp_cache_mgr.duckdb_conn.execute(sql).fetchdf() + + except Exception: + # If direct parquet loading fails, skip this comparative dataset + continue + + if comp_df.empty: + continue + + # Parse composite identifiers to extract sample_id + # via_field contains values like + # "BrentLab/harbison_2004;harbison_2004;123" + # We need to extract the third component and match on + # current repo/config + def extract_sample_id(composite_id: str) -> str | None: + """Extract sample_id if composite matches current dataset.""" + if pd.isna(composite_id): + return None + try: + parts = composite_id.split(";") + if len(parts) != 3: + return None + # Check if this composite ID references our dataset + if 
parts[0] == repo_id and parts[1] == config_name: + return parts[2] + return None + except Exception: + return None + + comp_df["_join_sample_id"] = comp_df[via_field].apply(extract_sample_id) + + # Convert _join_sample_id to match primary_df sample_id dtype + # This handles cases where sample_id is int but composite has string + if "_join_sample_id" in comp_df.columns: + primary_dtype = primary_df["sample_id"].dtype + if pd.api.types.is_integer_dtype(primary_dtype): + # Convert to numeric, coercing errors to NaN + comp_df["_join_sample_id"] = pd.to_numeric( + comp_df["_join_sample_id"], errors="coerce" + ) + elif pd.api.types.is_string_dtype(primary_dtype): + comp_df["_join_sample_id"] = comp_df["_join_sample_id"].astype( + str + ) + + # Filter to only rows that match our dataset + comp_df = comp_df[comp_df["_join_sample_id"].notna()].copy() + + if comp_df.empty: + continue + + # Drop the via_field column (we don't need it in results) + comp_df = comp_df.drop(columns=[via_field]) + + # Merge with primary data + result_df = result_df.merge( + comp_df, left_on="sample_id", right_on="_join_sample_id", how="left" + ) + + # Drop the temporary join column + result_df = result_df.drop(columns=["_join_sample_id"]) + + except Exception: + # If enrichment fails for this comparative dataset, continue + continue + + return result_df + + @staticmethod + def _parse_composite_identifier(composite_id: str) -> tuple[str, str, str]: + """ + Parse composite sample identifier into components. + + :param composite_id: Composite ID in format "repo_id;config_name;sample_id" + :return: Tuple of (repo_id, config_name, sample_id) + + Example: + _parse_composite_identifier( + "BrentLab/harbison_2004;harbison_2004;sample_42" + ) + Returns: ("BrentLab/harbison_2004", "harbison_2004", "sample_42") + + """ + parts = composite_id.split(";") + if len(parts) != 3: + raise ValueError( + f"Invalid composite ID format: {composite_id}. " + "Expected 'repo_id;config_name;sample_id'" + ) + return parts[0], parts[1], parts[2] + + def _build_metadata_table( + self, repo_id: str, config_name: str, use_cache: bool = True + ) -> pd.DataFrame: + """ + Build metadata table for a single dataset. + + Extracts sample-level metadata from experimental conditions hierarchy and field + definitions, with normalization and missing value handling. 
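+
+        A condensed sketch of the construction pipeline implemented below
+        (repo-level metadata is broadcast as constant columns at the end):
+
+            repo_meta = self._extract_repo_level(card, config_name, mappings)
+            field_meta = self._extract_field_level(card, config_name, mappings)
+            df = cache_mgr.query(sql, config_name)              # SELECT DISTINCT ...
+            df = df.groupby("sample_id").first().reset_index()  # one row per sample
+            df = self._add_field_metadata(df, field_meta)
+            df = self._apply_column_dtypes(df, mappings)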
+ + :param repo_id: Repository ID + :param config_name: Configuration name + :param use_cache: Whether to use/update cache + :return: DataFrame with one row per sample_id + + """ + cache_key = (repo_id, config_name) + + # Check cache + if use_cache and cache_key in self.cache: + return self.cache[cache_key] + + try: + # Load DataCard and CacheManager + card = DataCard(repo_id, token=self.token) + cache_mgr = HfCacheManager( + repo_id, duckdb_conn=duckdb.connect(":memory:"), token=self.token + ) + + # Get property mappings + property_mappings = self.config.get_property_mappings(repo_id, config_name) + if not property_mappings: + return pd.DataFrame() + + # Extract repo/config-level metadata + repo_metadata = self._extract_repo_level( + card, config_name, property_mappings + ) + + # Extract field-level metadata + field_metadata = self._extract_field_level( + card, config_name, property_mappings + ) + + # Get sample-level data from HuggingFace + config = card.get_config(config_name) + + # Check if this is a comparative dataset + from tfbpapi.models import DatasetType + + is_comparative = ( + config + and hasattr(config, "dataset_type") + and config.dataset_type == DatasetType.COMPARATIVE + ) + + if config and hasattr(config, "metadata_fields") and config.metadata_fields: + # Select only metadata fields + columns = ", ".join(config.metadata_fields) + if not is_comparative and "sample_id" not in config.metadata_fields: + columns = f"sample_id, {columns}" + sql = f"SELECT DISTINCT {columns} FROM {config_name}" + else: + # No metadata_fields specified, select all + sql = f"SELECT DISTINCT * FROM {config_name}" + + df = cache_mgr.query(sql, config_name) + + # For non-comparative datasets: one row per sample_id + # For comparative datasets: keep all rows (each row is a relationship) + if not is_comparative and "sample_id" in df.columns: + df = df.groupby("sample_id").first().reset_index() + + # Add repo-level metadata as columns + for prop_name, values in repo_metadata.items(): + # Use first value (repo-level properties are constant) + df[prop_name] = values[0] if values else None + + # Add field-level metadata + if field_metadata: + df = self._add_field_metadata(df, field_metadata) + + # Apply dtype conversions to DataFrame columns + df = self._apply_column_dtypes(df, property_mappings) + + # Cache result + if use_cache: + self.cache[cache_key] = df + + return df + + except Exception as e: + # Log error for debugging with full traceback + import traceback + + print(f"Error downloading metadata for {config_name}: {e}") + traceback.print_exc() + # Return empty DataFrame on error + return pd.DataFrame() + + def _apply_column_dtypes( + self, df: pd.DataFrame, property_mappings: dict[str, PropertyMapping] + ) -> pd.DataFrame: + """ + Apply dtype conversions to DataFrame columns based on property mappings. 
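+
+        Conversions are best-effort: "numeric" uses pd.to_numeric with
+        errors="coerce", so unparseable entries become NaN rather than
+        raising. A sketch:
+
+            pd.to_numeric(pd.Series(["30", "oops"]), errors="coerce")
+            # 0    30.0
+            # 1     NaN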
+
+        :param df: DataFrame to apply conversions to
+        :param property_mappings: Property mappings with dtype specifications
+        :return: DataFrame with converted column dtypes
+
+        """
+        for prop_name, mapping in property_mappings.items():
+            # Skip if no dtype specified or column doesn't exist
+            if not mapping.dtype or prop_name not in df.columns:
+                continue
+
+            # Convert column dtype
+            try:
+                if mapping.dtype == "numeric":
+                    df[prop_name] = pd.to_numeric(df[prop_name], errors="coerce")
+                elif mapping.dtype == "bool":
+                    df[prop_name] = df[prop_name].astype(bool)
+                elif mapping.dtype == "string":
+                    df[prop_name] = df[prop_name].astype(str)
+            except (ValueError, TypeError):
+                # Conversion failed, leave as is
+                pass
+
+        return df
+
+    def _convert_dtype(self, value: Any, dtype: str) -> Any:
+        """
+        Convert value to specified data type.
+
+        :param value: The value to convert to a given `dtype`
+        :param dtype: Target data type ("numeric", "bool", "string")
+
+        :return: Converted value or None if conversion fails
+
+        """
+        if value is None:
+            return None
+
+        try:
+            if dtype == "numeric":
+                # Try float first (handles both int and float)
+                return float(value)
+            elif dtype == "bool":
+                return bool(value)
+            elif dtype == "string":
+                return str(value)
+            else:
+                # Unknown dtype, pass through unchanged
+                return value
+        except (ValueError, TypeError):
+            # Conversion failed, return None
+            return None
+
+    def _extract_repo_level(
+        self,
+        card: DataCard,
+        config_name: str,
+        property_mappings: dict[str, PropertyMapping],
+    ) -> dict[str, list[str]]:
+        """
+        Extract and normalize repo/config-level metadata.
+
+        :param card: DataCard instance
+        :param config_name: Configuration name
+        :param property_mappings: Property mappings for this dataset
+        :return: Dict mapping property names to normalized values
+
+        """
+        metadata: dict[str, list[str]] = {}
+
+        # Get experimental conditions
+        try:
+            conditions = card.get_experimental_conditions(config_name)
+        except DataCardError:
+            conditions = {}
+
+        if not conditions:
+            return metadata
+
+        # Extract each mapped property
+        for prop_name, mapping in property_mappings.items():
+            # Skip field-level mappings
+            if mapping.field is not None:
+                continue
+
+            # Build full path
+            # Note: `conditions` is already the experimental_conditions dict,
+            # so we don't add the prefix
+            full_path = mapping.path
+
+            # Get value at path
+            value = get_nested_value(conditions, full_path)  # type: ignore
+
+            # Handle missing values
+            missing_label = self.config.missing_value_labels.get(prop_name)
+            if value is None:
+                if missing_label:
+                    metadata[prop_name] = [missing_label]
+                continue
+
+            # Ensure value is a list
+            actual_values = [value] if not isinstance(value, list) else value
+
+            # Apply dtype conversion if specified
+            if mapping.dtype:
+                actual_values = [
+                    self._convert_dtype(v, mapping.dtype) for v in actual_values
+                ]
+
+            # Normalize using aliases
+            aliases = self.config.factor_aliases.get(prop_name)
+            normalized_values = [
+                normalize_value(v, aliases, missing_label) for v in actual_values
+            ]
+
+            metadata[prop_name] = normalized_values
+
+        return metadata
+
+    def _extract_field_level(
+        self,
+        card: DataCard,
+        config_name: str,
+        property_mappings: dict[str, PropertyMapping],
+    ) -> dict[str, dict[str, Any]]:
+        """
+        Extract and normalize field-level metadata.
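+
+        Example return value (illustrative; the field values and property
+        names are hypothetical):
+
+            {
+                "YPD": {"carbon_source": ["glucose"]},
+                "minimal_glucose": {"carbon_source": ["glucose"]},
+            }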
+
+        :param card: DataCard instance
+        :param config_name: Configuration name
+        :param property_mappings: Property mappings for this dataset
+        :return: Dict mapping field values to their normalized metadata
+
+        """
+        field_metadata: dict[str, dict[str, Any]] = {}
+
+        # Group property mappings by field
+        field_mappings: dict[str, dict[str, PropertyMapping]] = {}
+        for prop_name, mapping in property_mappings.items():
+            # Only process if field is specified AND path exists
+            # (no path means it's just a column alias, not metadata extraction)
+            if mapping.field is not None and mapping.path is not None:
+                field_name = mapping.field
+                if field_name not in field_mappings:
+                    field_mappings[field_name] = {}
+                field_mappings[field_name][prop_name] = mapping
+
+        # Process each field that has mappings
+        for field_name, prop_mappings_dict in field_mappings.items():
+            # Get field definitions
+            definitions = card.get_field_definitions(config_name, field_name)
+            if not definitions:
+                continue
+
+            # Extract metadata for each field value
+            for field_value, definition in definitions.items():
+                if field_value not in field_metadata:
+                    field_metadata[field_value] = {}
+
+                for prop_name, mapping in prop_mappings_dict.items():
+                    # Get value at path
+                    value = get_nested_value(definition, mapping.path)  # type: ignore
+
+                    # Handle missing values
+                    missing_label = self.config.missing_value_labels.get(prop_name)
+                    if value is None:
+                        if missing_label:
+                            field_metadata[field_value][prop_name] = [missing_label]
+                        continue
+
+                    # Ensure value is a list
+                    actual_values = [value] if not isinstance(value, list) else value
+
+                    # Apply dtype conversion if specified
+                    if mapping.dtype:
+                        actual_values = [
+                            self._convert_dtype(v, mapping.dtype) for v in actual_values
+                        ]
+
+                    # Normalize using aliases
+                    aliases = self.config.factor_aliases.get(prop_name)
+                    normalized_values = [
+                        normalize_value(v, aliases, missing_label)
+                        for v in actual_values
+                    ]
+
+                    field_metadata[field_value][prop_name] = normalized_values
+
+        return field_metadata
+
+    def _add_field_metadata(
+        self, df: pd.DataFrame, field_metadata: dict[str, dict[str, Any]]
+    ) -> pd.DataFrame:
+        """
+        Add columns from field-level metadata to DataFrame.
+
+        :param df: DataFrame with base sample metadata
+        :param field_metadata: Dict mapping field values to their properties
+        :return: DataFrame with additional property columns
+
+        """
+        # For each field value, add its properties as columns
+        for field_value, properties in field_metadata.items():
+            for prop_name, prop_values in properties.items():
+                # Initialize column if needed
+                if prop_name not in df.columns:
+                    df[prop_name] = None
+
+                # Find rows where any column matches field_value
+                for col in df.columns:
+                    if col in [prop_name, "sample_id", "dataset_id"]:
+                        continue
+                    mask = df[col] == field_value
+                    if mask.any():
+                        # Set property value (take first from list)
+                        value = prop_values[0] if prop_values else None
+                        df.loc[mask, prop_name] = value
+
+        return df
+
+    def _apply_filters(
+        self,
+        df: pd.DataFrame,
+        filters: dict[str, Any],
+        repo_id: str,
+        config_name: str,
+    ) -> pd.DataFrame:
+        """
+        Apply filters to DataFrame with alias expansion and numeric handling.
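+
+        Example filters (illustrative; the field names are hypothetical):
+
+            {
+                "media": "YPD",  # exact match, expanded via factor_aliases
+                "temperature": ("between", 25, 37),  # inclusive numeric range
+                "od600": (">=", 0.6),  # comparison operator
+            }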
+
+        :param df: DataFrame to filter
+        :param filters: Dict of field:value pairs
+        :param repo_id: Repository ID (for alias lookup)
+        :param config_name: Config name (for alias lookup)
+        :return: Filtered DataFrame
+
+        """
+        for field, filter_value in filters.items():
+            if field not in df.columns:
+                continue
+
+            # Handle numeric range filters
+            if isinstance(filter_value, tuple):
+                operator = filter_value[0]
+                if operator == "between" and len(filter_value) == 3:
+                    df = df[
+                        (df[field] >= filter_value[1]) & (df[field] <= filter_value[2])
+                    ]
+                elif operator in (">=", ">", "<=", "<", "==", "!="):
+                    if operator == ">=":
+                        df = df[df[field] >= filter_value[1]]
+                    elif operator == ">":
+                        df = df[df[field] > filter_value[1]]
+                    elif operator == "<=":
+                        df = df[df[field] <= filter_value[1]]
+                    elif operator == "<":
+                        df = df[df[field] < filter_value[1]]
+                    elif operator == "==":
+                        df = df[df[field] == filter_value[1]]
+                    elif operator == "!=":
+                        df = df[df[field] != filter_value[1]]
+            else:
+                # Exact match with alias expansion
+                aliases = self.config.factor_aliases.get(field)
+                if aliases:
+                    # Expand filter value to all aliases
+                    expanded_values = [filter_value]
+                    for alias_name, actual_values in aliases.items():
+                        if alias_name == filter_value:
+                            # Add all actual values for this alias
+                            expanded_values.extend([str(v) for v in actual_values])
+                    df = df[df[field].isin(expanded_values)]
+                else:
+                    # No aliases, exact match
+                    df = df[df[field] == filter_value]
+
+        return df
+
+    def _get_complete_data(
+        self,
+        repo_id: str,
+        config_name: str,
+        sample_ids: list[str],
+        metadata_df: pd.DataFrame,
+    ) -> pd.DataFrame:
+        """
+        Get complete data (with measurements) for sample_ids.
+
+        Uses a WHERE sample_id IN (...) clause for efficient retrieval.
+
+        :param repo_id: Repository ID
+        :param config_name: Configuration name
+        :param sample_ids: List of sample IDs to retrieve
+        :param metadata_df: Metadata DataFrame to merge with
+        :return: DataFrame with measurements and metadata
+
+        """
+        try:
+            cache_mgr = HfCacheManager(
+                repo_id, duckdb_conn=duckdb.connect(":memory:"), token=self.token
+            )
+
+            # Build IN clause
+            sample_id_list = ", ".join([f"'{sid}'" for sid in sample_ids])
+            sql = f"""
+                SELECT *
+                FROM {config_name}
+                WHERE sample_id IN ({sample_id_list})
+            """
+
+            full_df = cache_mgr.query(sql, config_name)
+
+            # Merge with metadata (metadata_df has normalized fields)
+            # Drop metadata columns from full_df to avoid duplicates
+            metadata_cols = [
+                col
+                for col in metadata_df.columns
+                if col not in ["sample_id", "dataset_id"]
+            ]
+            full_df = full_df.drop(
+                columns=[c for c in metadata_cols if c in full_df.columns],
+                errors="ignore",
+            )
+
+            # Merge on sample_id
+            result = full_df.merge(metadata_df, on="sample_id", how="left")
+
+            return result
+
+        except Exception:
+            return pd.DataFrame()
+
+    def __repr__(self) -> str:
+        """String representation."""
+        n_repos = len(self.config.repositories)
+        n_datasets = sum(
+            len(rc.dataset) if rc.dataset else 0
+            for rc in self.config.repositories.values()
+        )
+        n_cached = len(self.cache)
+        return (
+            f"VirtualDB({n_repos} repositories, {n_datasets} datasets configured, "
+            f"{n_cached} views cached)"
+        )
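+
+
+# Usage sketch (illustrative, not a tested example; assumes a VirtualDB was
+# constructed elsewhere from a collection configuration):
+#
+#     vdb = VirtualDB(config)
+#     print(vdb)
+#     # e.g. "VirtualDB(11 repositories, 11 datasets configured, 0 views cached)"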