Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
74c511e
phase 1 refactor creating individual modules for each data_portal_view
petercarbsmith Apr 4, 2026
73f8623
phase 2 all imports are verified working
petercarbsmith Apr 4, 2026
4fc807a
Phase 3 & 4: Add migration templates and comprehensive documentation
petercarbsmith Apr 4, 2026
02de1c1
refactor: Switch to raw SQL snapshots for materialized view migrations
petercarbsmith Apr 6, 2026
d292dfc
feat: Phase 5 - Consolidate 8 remaining views into single migration
petercarbsmith Apr 7, 2026
90bb531
fix: Correct column name in mv_biomass_county_production view
petercarbsmith Apr 7, 2026
a367562
fix: Replace bulk GRANT with individual view permissions
petercarbsmith Apr 7, 2026
f2efc34
fix: Add timezone configuration to Prefect containers
petercarbsmith Apr 7, 2026
967f810
finally have immutable view and index creation. New tables from Mei P…
petercarbsmith Apr 7, 2026
d550641
cleaning up some documentation
petercarbsmith Apr 7, 2026
2f19df1
adding qc filtering to views to not include fail results
petercarbsmith Apr 7, 2026
c72e37e
fixing migration issue with squashed data_portal stuff
petercarbsmith Apr 7, 2026
cc11e75
Add b3f2d1c8e9a0 api_key table migration in correct sequence and upda…
petercarbsmith Apr 7, 2026
82a28db
Merge branch 'main' of https://github.com/sustainability-software-lab…
petercarbsmith Apr 7, 2026
c90a158
Phase 5: Complete Field Sample ETL v03 Testing & Validation
petercarbsmith Apr 8, 2026
3304b0b
Phase 6: Remove old samplemetadata extractor and v01/v02 transforms
petercarbsmith Apr 8, 2026
36c5a47
fix: Apply pre-commit formatting corrections
petercarbsmith Apr 8, 2026
ab72cd9
fixing refresh_views issue with no unique constraint on some views
petercarbsmith Apr 8, 2026
e4e753f
fixing up some pretreatment etl problems.
petercarbsmith Apr 9, 2026
e8788b6
phase one of new etl plan. Creates sql models and migrations
petercarbsmith Apr 9, 2026
0370d73
feat: Implement Phase 2 Resource Images ETL pipeline
petercarbsmith Apr 9, 2026
109f510
final fix to fermentation_record and resource_image. Flows now work a…
petercarbsmith Apr 9, 2026
9565352
feat: etl pipeline for county ag report record built and working well
petercarbsmith Apr 9, 2026
268c55a
bug: fixing dataset in observation to populate for county reports
petercarbsmith Apr 9, 2026
4320bd6
adding ag report test and turning all the flows back on
petercarbsmith Apr 9, 2026
6743407
fix-fermentation record duplicate issue and mounting volumes to docke…
petercarbsmith Apr 10, 2026
ecd888c
turning back on all flows, fixing county_ag_report
petercarbsmith Apr 10, 2026
2ab2525
fixing tests for test_fermenetation
petercarbsmith Apr 10, 2026
0f86863
feat: consolidate etl_fixes and rebase onto upstream main
petercarbsmith Apr 10, 2026
fdd7570
implementing strain normalization for fermentation_record
petercarbsmith Apr 13, 2026
bf884c8
bug: attempting to fix migrations CI failure
petercarbsmith Apr 13, 2026
1a03b6c
bug: it was a gitignore problem! Sorry about that. Everything should b…
petercarbsmith Apr 14, 2026
3a320cb
addressing reviewer comments to clean up
petercarbsmith Apr 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -87,5 +87,5 @@ scripts/check_pretreatment_duplicates.py
# hatch-vcs generated version files
_version.py

# analysis environment
analysis
# analysis environment (only ignore the BioCirv AI submodule workspace)
analysis/biocirv-ai/
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""Add fermentation method fields, resource_image, and county_ag_report_record tables

Revision ID: bd227e99e006
Revises: 9e8f7a6b5c52
Create Date: 2026-04-09 14:09:11.091043

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa
import sqlmodel

# revision identifiers, used by Alembic.
revision: str = 'bd227e99e006'
down_revision: Union[str, Sequence[str], None] = '9e8f7a6b5c52'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Upgrade schema.

    Creates the ``resource_image`` and ``county_ag_report_record`` tables,
    then adds three foreign keys on ``fermentation_record`` and a unique
    constraint on ``strain.name``.
    """
    auto_string = sqlmodel.sql.sqltypes.AutoString

    def shared_columns():
        # The five leading columns both new tables start with. Built anew on
        # every call so each table receives its own Column objects.
        return [
            sa.Column("id", sa.Integer(), nullable=False),
            sa.Column("created_at", sa.DateTime(), nullable=True),
            sa.Column("updated_at", sa.DateTime(), nullable=True),
            sa.Column("etl_run_id", sa.Integer(), nullable=True),
            sa.Column("lineage_group_id", sa.Integer(), nullable=True),
        ]

    op.create_table(
        "resource_image",
        *shared_columns(),
        sa.Column("resource_id", sa.Integer(), nullable=False),
        sa.Column("resource_name", auto_string(), nullable=True),
        sa.Column("image_url", auto_string(), nullable=True),
        sa.Column("sort_order", sa.Integer(), nullable=True),
        sa.ForeignKeyConstraint(["etl_run_id"], ["etl_run.id"]),
        sa.ForeignKeyConstraint(["resource_id"], ["resource.id"]),
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint(
            "resource_id",
            "image_url",
            name="resource_image_resource_id_image_url_key",
        ),
    )

    op.create_table(
        "county_ag_report_record",
        *shared_columns(),
        sa.Column("record_id", auto_string(), nullable=False),
        sa.Column("geoid", auto_string(), nullable=True),
        sa.Column("primary_ag_product_id", sa.Integer(), nullable=True),
        sa.Column("description", auto_string(), nullable=True),
        sa.Column("resource_type", auto_string(), nullable=True),
        sa.Column("data_year", sa.Integer(), nullable=True),
        sa.Column("data_source_id", sa.Integer(), nullable=True),
        sa.Column("produced_nsjv", sa.Boolean(), nullable=True),
        sa.Column("processed_nsjv", sa.Boolean(), nullable=True),
        sa.Column("note", auto_string(), nullable=True),
        sa.Column("prodn_value_note", auto_string(), nullable=True),
        sa.ForeignKeyConstraint(["data_source_id"], ["data_source.id"]),
        sa.ForeignKeyConstraint(["etl_run_id"], ["etl_run.id"]),
        sa.ForeignKeyConstraint(["geoid"], ["place.geoid"]),
        sa.ForeignKeyConstraint(["primary_ag_product_id"], ["primary_ag_product.id"]),
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint("record_id"),
    )

    # (constraint name, referenced table, local column) for each foreign key
    # added to fermentation_record; every one targets the referenced "id".
    fermentation_foreign_keys = (
        ("fermentation_record_pretreatment_method_id_fkey", "method", "pretreatment_method_id"),
        ("fermentation_record_eh_method_id_fkey", "method", "eh_method_id"),
        ("fermentation_record_strain_id_fkey", "strain", "strain_id"),
    )
    for fk_name, referred_table, local_column in fermentation_foreign_keys:
        op.create_foreign_key(
            fk_name,
            "fermentation_record",
            referred_table,
            [local_column],
            ["id"],
        )

    op.create_unique_constraint("strain_name_key", "strain", ["name"])


def downgrade() -> None:
    """Downgrade schema.

    Drops the unique constraint on ``strain.name``, the three foreign keys
    on ``fermentation_record``, and then the ``county_ag_report_record`` and
    ``resource_image`` tables.
    """
    op.drop_constraint("strain_name_key", "strain", type_="unique")

    fermentation_foreign_keys = (
        "fermentation_record_strain_id_fkey",
        "fermentation_record_pretreatment_method_id_fkey",
        "fermentation_record_eh_method_id_fkey",
    )
    for fk_name in fermentation_foreign_keys:
        op.drop_constraint(fk_name, "fermentation_record", type_="foreignkey")

    for table_name in ("county_ag_report_record", "resource_image"):
        op.drop_table(table_name)
94 changes: 0 additions & 94 deletions plans/biocirv_materialized_views_revision.md

This file was deleted.

2 changes: 2 additions & 0 deletions resources/docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ services:
- ../../alembic.ini:/app/alembic.ini
- ../../src/ca_biositing/pipeline/ca_biositing:/app/.pixi/envs/etl/lib/python3.12/site-packages/ca_biositing
- ../../src/ca_biositing/datamodels/ca_biositing/datamodels:/app/.pixi/envs/etl/lib/python3.12/site-packages/ca_biositing/datamodels
- ../../src/ca_biositing/pipeline/ca_biositing:/app/.pixi/envs/etl/lib/python3.13/site-packages/ca_biositing
- ../../src/ca_biositing/datamodels/ca_biositing/datamodels:/app/.pixi/envs/etl/lib/python3.13/site-packages/ca_biositing/datamodels
depends_on:
prefect-server:
condition: service_healthy
Expand Down
3 changes: 2 additions & 1 deletion resources/prefect/run_prefect_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@
"samples": "ca_biositing.pipeline.flows.samples_etl.samples_etl_flow",
"analysis_records": "ca_biositing.pipeline.flows.analysis_records.analysis_records_flow",
"aim2_bioconversion": "ca_biositing.pipeline.flows.aim2_bioconversion.aim2_bioconversion_flow",
"county_ag_report": "ca_biositing.pipeline.flows.county_ag_report_etl.county_ag_report_flow",
"usda_etl": "ca_biositing.pipeline.flows.usda_etl.usda_etl_flow",
"landiq": "ca_biositing.pipeline.flows.landiq_etl.landiq_etl_flow",
"billion_ton": "ca_biositing.pipeline.flows.billion_ton_etl.billion_ton_etl_flow",
#"field_sample": "ca_biositing.pipeline.flows.field_sample_etl.field_sample_etl_flow",
"field_sample": "ca_biositing.pipeline.flows.field_sample_etl.field_sample_etl_flow",
#"prepared_sample": "ca_biositing.pipeline.flows.prepared_sample_etl.prepared_sample_etl_flow",
"thermochem": "ca_biositing.pipeline.flows.thermochem_etl.thermochem_etl_flow",
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from .experiment_equipment import DeconVessel, Equipment, Experiment, ExperimentAnalysis, ExperimentEquipment, ExperimentMethod, ExperimentPreparedSample

# External Data
from .external_data import BillionTon2023Record, LandiqRecord, LandiqResourceMapping, Polygon, ResourceUsdaCommodityMap, UsdaCensusRecord, UsdaCommodity, UsdaDomain, UsdaMarketRecord, UsdaMarketReport, UsdaStatisticCategory, UsdaSurveyProgram, UsdaSurveyRecord, UsdaTermMap
from .external_data import BillionTon2023Record, CountyAgReportRecord, LandiqRecord, LandiqResourceMapping, Polygon, ResourceUsdaCommodityMap, UsdaCensusRecord, UsdaCommodity, UsdaDomain, UsdaMarketRecord, UsdaMarketReport, UsdaStatisticCategory, UsdaSurveyProgram, UsdaSurveyRecord, UsdaTermMap

# Field Sampling
from .field_sampling import AgTreatment, CollectionMethod, FieldSample, FieldSampleCondition, FieldStorageMethod, HarvestMethod, LocationSoilType, PhysicalCharacteristic, ProcessingMethod, SoilType
Expand All @@ -41,7 +41,7 @@
from .places import LocationAddress, Place

# Resource Information
from .resource_information import PrimaryAgProduct, Resource, ResourceAvailability, ResourceClass, ResourceCounterfactual, ResourceMorphology, ResourceSubclass, ResourcePriceRecord, ResourceTransportRecord, ResourceStorageRecord, ResourceEndUseRecord, ResourceProductionRecord
from .resource_information import PrimaryAgProduct, Resource, ResourceAvailability, ResourceClass, ResourceCounterfactual, ResourceImage, ResourceMorphology, ResourceSubclass, ResourcePriceRecord, ResourceTransportRecord, ResourceStorageRecord, ResourceEndUseRecord, ResourceProductionRecord

# Sample Preparation
from .sample_preparation import PreparationMethod, PreparationMethodAbbreviation, PreparedSample
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
class FermentationRecord(Aim2RecordBase, table=True):
__tablename__ = "fermentation_record"

strain_id: Optional[int] = Field(default=None)
pretreatment_method_id: Optional[int] = Field(default=None)
eh_method_id: Optional[int] = Field(default=None)
strain_id: Optional[int] = Field(default=None, foreign_key="strain.id")
pretreatment_method_id: Optional[int] = Field(default=None, foreign_key="method.id")
eh_method_id: Optional[int] = Field(default=None, foreign_key="method.id")
well_position: Optional[str] = Field(default=None)
vessel_id: Optional[int] = Field(default=None, foreign_key="decon_vessel.id")
analyte_detection_equipment_id: Optional[int] = Field(default=None)
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from ..base import LookupBase
from sqlmodel import Field, SQLModel
from sqlmodel import Field
from typing import Optional


class Strain(LookupBase, table=True):
    """``LookupBase`` model mapped to the ``strain`` table."""

    __tablename__ = "strain"

    # Optional, but unique whenever a value is present.
    name: Optional[str] = Field(unique=True, default=None)
    # Plain integer column; no foreign key is declared for it here.
    parent_strain_id: Optional[int] = Field(default=None)
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .billion_ton import BillionTon2023Record
from .county_ag_report_record import CountyAgReportRecord
from .landiq_record import LandiqRecord
from .landiq_resource_mapping import LandiqResourceMapping
from .polygon import Polygon
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from ..base import BaseEntity
from sqlmodel import Field, Relationship
from typing import Optional


class CountyAgReportRecord(BaseEntity, table=True):
    """``BaseEntity`` model mapped to the ``county_ag_report_record`` table.

    ``record_id`` is required and unique; every other column declared here
    is optional and defaults to ``None``.
    """

    __tablename__ = "county_ag_report_record"

    # Required, unique identifier of the record.
    record_id: str = Field(unique=True, nullable=False)
    # References place.geoid.
    geoid: Optional[str] = Field(foreign_key="place.geoid", default=None)
    # References primary_ag_product.id.
    primary_ag_product_id: Optional[int] = Field(
        foreign_key="primary_ag_product.id",
        default=None,
    )
    description: Optional[str] = Field(default=None)
    resource_type: Optional[str] = Field(default=None)
    data_year: Optional[int] = Field(default=None)
    # References data_source.id.
    data_source_id: Optional[int] = Field(
        foreign_key="data_source.id",
        default=None,
    )
    produced_nsjv: Optional[bool] = Field(default=None)
    processed_nsjv: Optional[bool] = Field(default=None)
    note: Optional[str] = Field(default=None)
    prodn_value_note: Optional[str] = Field(default=None)

    # Relationships to the rows referenced by the foreign keys above.
    place: Optional["Place"] = Relationship()
    primary_ag_product: Optional["PrimaryAgProduct"] = Relationship()
    data_source: Optional["DataSource"] = Relationship()
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .resource_counterfactual import ResourceCounterfactual
from .resource import ResourceMorphology
from .resource import ResourceSubclass
from .resource_image import ResourceImage
from .resource_price_record import ResourcePriceRecord
from .resource_transport_record import ResourceTransportRecord
from .resource_storage_record import ResourceStorageRecord
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from ..base import BaseEntity
from sqlmodel import Field, Relationship
from typing import Optional
from sqlalchemy import UniqueConstraint


class ResourceImage(BaseEntity, table=True):
    """``BaseEntity`` model mapped to the ``resource_image`` table.

    A given ``(resource_id, image_url)`` pair may be stored only once.
    """

    __tablename__ = "resource_image"
    __table_args__ = (
        UniqueConstraint(
            "resource_id",
            "image_url",
            name="resource_image_resource_id_image_url_key",
        ),
    )

    # Required reference to resource.id.
    resource_id: int = Field(foreign_key="resource.id")
    resource_name: Optional[str] = Field(default=None)
    image_url: Optional[str] = Field(default=None)
    # NOTE(review): presumably the display position among a resource's
    # images; confirm against the consumer of this table.
    sort_order: Optional[int] = Field(default=None)

    # Relationship to the row referenced by resource_id.
    resource: Optional["Resource"] = Relationship()
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
"""
ETL Extract: County Ag Reports
"""

from .factory import create_extractor

# Name of the source workbook shared by the three extractors below.
GSHEET_NAME = "Aim 1-Feedstock Collection and Processing Data-BioCirV"

# One extractor per worksheet, each built by the shared factory.
# NOTE(review): no task_name is passed here, so all three rely on the
# factory's default naming; confirm the defaults are distinct per worksheet.
primary_products = create_extractor(GSHEET_NAME, "07.7-Primary_products")
pp_production_value = create_extractor(GSHEET_NAME, "07.7a-PP_Prodn_Value")
pp_data_sources = create_extractor(GSHEET_NAME, "07.7b-PP_Data_sources")
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""
Factory extractor for 04_Producers worksheet from SampleMetadata_v03-BioCirV.
This worksheet contains producer/origin information and extended sample metadata:
- Sample_name: Unique sample identifier (join key)
- Resource, ProviderCode, FV_Date_Time: Redundant copies from 01_Sample_IDs
- Producer: Producer name (identifies the source organization)
- Prod_Location: Producer location name (maps to field_sample_storage_location_id)
- Prod_Street, Prod_City, Prod_Zip: Producer address components
- Prod_Date: Production date
- Harvest_Method: Method used for harvesting
- Treatment: Treatment applied to the sample
- Soil_Type: Type of soil at production location
- Crop_Variety, Crop_Cultivar: Variety and cultivar information
- Production_Notes: Notes about the production process
- Other metadata: Additional extended fields for sample context
This extractor provides producer/origin context and addresses for
field_sample_storage_location_id creation via LocationAddress.
"""

from .factory import create_extractor

# Source workbook and the worksheet within it that this module reads.
GSHEET_NAME = "SampleMetadata_v03-BioCirV"
WORKSHEET_NAME = "04_Producers"

# Create the extract task using the factory pattern; the task is given the
# explicit name "extract_producers".
extract = create_extractor(GSHEET_NAME, WORKSHEET_NAME, task_name="extract_producers")
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""
Factory extractor for 03_Qty_FieldStorage worksheet from SampleMetadata_v03-BioCirV.

This worksheet contains sample quantity and field storage information:
- Sample_name: Unique sample identifier (join key)
- Resource, ProviderCode, FV_Date_Time: Redundant copies from 01_Sample_IDs
- Sample_Container: Container type and size (e.g., "Bucket (5 gal.)", "Core", "Bale")
* Used for amount_collected_unit_id extraction (unit is embedded in this field)
- Qty: Amount collected (maps to amount_collected)
- Qty_Unit: Explicit unit column (if present; otherwise extract from Sample_Container)
- Primary_Collector: Collector identifier (maps to collector_id via Contact lookup)
- Collection_Team: Team members involved in collection
- Destination_Lab: Lab where sample was sent
- FieldStorage_Location: Storage location name (maps to field_storage_location_id)
- FieldStorage_Conditions: Storage conditions (temperature, humidity, etc.)
- FieldStorage_Duration: Duration stored in field
- Other metadata: Comments, dates, etc.

This extractor provides quantity, unit, and field storage context for collected samples.
"""

from .factory import create_extractor

# Source workbook and the worksheet within it that this module reads.
GSHEET_NAME = "SampleMetadata_v03-BioCirV"
WORKSHEET_NAME = "03_Qty_FieldStorage"

# Create the extract task using the factory pattern; the task is given the
# explicit name "extract_qty_field_storage".
extract = create_extractor(GSHEET_NAME, WORKSHEET_NAME, task_name="extract_qty_field_storage")
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""
ETL Extract: Resource Images
"""

from .factory import create_extractor

# Source workbook and the worksheet within it that this module reads.
GSHEET_NAME = "Aim 1-Feedstock Collection and Processing Data-BioCirV"
WORKSHEET_NAME = "08.0_Resource_images"

# Build the extract task with the shared factory.
# NOTE(review): no task_name is passed, so the factory's default name is
# used; confirm that is intended.
extract = create_extractor(GSHEET_NAME, WORKSHEET_NAME)
Loading
Loading