class FileRecordHash(SQLModel, table=True):
    """
    Hash values for file records.

    Supports multiple hash algorithms (md5, sha256, etag, etc.) per file:
    each row stores one algorithm/value pair belonging to one ``FileRecord``.
    """
    __tablename__ = "filerecordhash"

    # Surrogate primary key, generated client-side with uuid4.
    id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True)
    # Owning file record (foreign key to filerecord.id).
    file_record_id: uuid.UUID = Field(foreign_key="filerecord.id", nullable=False)
    # Name of the hash algorithm this row holds a value for.
    algorithm: str = Field(max_length=50, nullable=False)
    # The hash string itself, limited to 128 characters.
    value: str = Field(max_length=128, nullable=False)

    # Relationship back to parent
    file_record: "FileRecord" = Relationship(back_populates="hashes")

    # A file may carry at most one value per algorithm.
    __table_args__ = (
        UniqueConstraint("file_record_id", "algorithm", name="uq_filerecordhash_file_algorithm"),
    )
class FileRecord(SQLModel, table=True):
    """
    Metadata record for files stored in external locations (S3, etc.).

    Uses polymorphic association via entity_type and entity_id to link
    to parent entities (QCRecord, Sample, etc.) without hard FK constraints.
    Because there is no foreign key on ``entity_id``, the database cannot
    cascade deletes from the parent entity; code that deletes a parent must
    delete its file records explicitly.
    """
    __tablename__ = "filerecord"

    # Surrogate primary key, generated client-side with uuid4.
    id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True)
    # Discriminator naming the kind of parent entity.
    # NOTE(review): the migration in this series creates this column as
    # AutoString(length=50); confirm the enum-typed field maps to a
    # compatible column type.
    entity_type: FileRecordEntityType = Field(nullable=False)
    # Primary key of the parent entity (no FK constraint, see class docstring).
    entity_id: uuid.UUID = Field(nullable=False)
    # Location of the file, up to 1024 characters.
    uri: str = Field(max_length=1024, nullable=False)
    # NOTE(review): the migration in this series declares `size` as BigInteger;
    # confirm the ORM-side type for a plain `int` field matches.
    size: int | None = Field(default=None)  # File size in bytes
    created_on: datetime | None = Field(default=None)  # File creation timestamp

    # Relationships to child tables.
    # "all, delete-orphan" makes the ORM delete child rows together with the
    # file record and when they are removed from these collections.
    hashes: List["FileRecordHash"] = Relationship(
        back_populates="file_record",
        sa_relationship_kwargs={"cascade": "all, delete-orphan"}
    )
    tags: List["FileRecordTag"] = Relationship(
        back_populates="file_record",
        sa_relationship_kwargs={"cascade": "all, delete-orphan"}
    )
    samples: List["FileRecordSample"] = Relationship(
        back_populates="file_record",
        sa_relationship_kwargs={"cascade": "all, delete-orphan"}
    )

    model_config = ConfigDict(from_attributes=True)
class FileRecordPublic(SQLModel):
    """
    Public representation of a file record.

    Carries the file's own attributes plus its hashes, tags and sample
    associations; the polymorphic link fields (``entity_type`` and
    ``entity_id``) of the table model are not part of this model.
    """
    # Identifier of the file record.
    id: uuid.UUID
    # Location of the file.
    uri: str
    # File size; None when not recorded.
    size: int | None
    # File creation timestamp; None when not recorded.
    created_on: datetime | None
    # One entry per stored hash algorithm.
    hashes: List[HashPublic]
    # Key-value tags attached to the file.
    tags: List[TagPublic]
    # Samples associated with the file (may be empty).
    samples: List[SamplePublic]
class QCRecordMetadata(SQLModel, table=True):
    """
    Key-value store for pipeline-level metadata.

    Examples: pipeline name, version, configuration parameters.
    Each row holds one key/value pair belonging to one ``QCRecord``.
    """
    __tablename__ = "qcrecordmetadata"

    # Surrogate primary key, generated client-side with uuid4.
    id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True)
    # Owning QC record (foreign key to qcrecord.id).
    qcrecord_id: uuid.UUID = Field(foreign_key="qcrecord.id", nullable=False)
    # Metadata key, up to 255 characters.
    key: str = Field(max_length=255, nullable=False)
    # Metadata value, always stored as a string.
    value: str = Field(nullable=False)

    # Relationship back to parent
    qcrecord: "QCRecord" = Relationship(back_populates="metadata")

    # A QC record may carry each key at most once.
    __table_args__ = (
        UniqueConstraint("qcrecord_id", "key", name="uq_qcrecordmetadata_record_key"),
    )
class QCRecord(SQLModel, table=True):
    """
    Main QC record entity - one per pipeline execution per project.

    Multiple records per project are allowed for versioning (history).
    The created_on timestamp differentiates versions.
    """
    __tablename__ = "qcrecord"
    # NOTE(review): presumably read by a shared search helper elsewhere in the
    # project; nothing in this module uses it - confirm.
    __searchable__ = ["project_id"]

    # Surrogate primary key, generated client-side with uuid4.
    id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True)
    # Defaults to the current time as a timezone-aware UTC datetime.
    # NOTE(review): the migration in this series creates this column as a plain
    # sa.DateTime(); confirm how the timezone information is persisted.
    created_on: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        nullable=False
    )
    # Identifier of the creator, up to 100 characters.
    created_by: str = Field(max_length=100, nullable=False)
    # Project this record belongs to; indexed for lookups by project.
    project_id: str = Field(max_length=50, nullable=False, index=True)

    # Relationships to child tables.
    # "all, delete-orphan" makes the ORM delete child rows with the record.
    # NOTE(review): `metadata` is a reserved attribute name on SQLAlchemy
    # declarative classes, and SQLModel itself exposes `SQLModel.metadata`;
    # confirm this relationship name maps without error - TODO verify.
    metadata: List["QCRecordMetadata"] = Relationship(
        back_populates="qcrecord",
        sa_relationship_kwargs={"cascade": "all, delete-orphan"}
    )
    metrics: List["QCMetric"] = Relationship(
        back_populates="qcrecord",
        sa_relationship_kwargs={"cascade": "all, delete-orphan"}
    )

    model_config = ConfigDict(from_attributes=True)
class QCRecordSearchRequest(SQLModel):
    """
    Request model for searching QC records.

    ``page`` and ``per_page`` are validated to be at least 1. The search
    service paginates by slicing with ``(page - 1) * per_page``, so zero or
    negative values would otherwise produce negative or empty slices and
    return the wrong rows instead of an error.
    """
    # Flexible filtering; keys are interpreted by the search service.
    filter_on: dict | None = None
    # 1-based page number.
    page: int = Field(default=1, ge=1)
    # Number of records per page; must be positive.
    per_page: int = Field(default=100, ge=1)
    # Return only newest version per project.
    latest: bool = True

    # Reject unknown request fields instead of silently ignoring them.
    model_config = ConfigDict(extra="forbid")
b/alembic/versions/f1a2b3c4d5e6_add_qcmetrics_and_filerecord_tables.py new file mode 100644 index 0000000..4c489f2 --- /dev/null +++ b/alembic/versions/f1a2b3c4d5e6_add_qcmetrics_and_filerecord_tables.py @@ -0,0 +1,161 @@ +"""Add QCMetrics and FileRecord tables + +Revision ID: f1a2b3c4d5e6 +Revises: e158df5a8df1 +Create Date: 2026-01-29 16:45:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +import sqlmodel + + +# revision identifiers, used by Alembic. +revision: str = 'f1a2b3c4d5e6' +down_revision: Union[str, Sequence[str], None] = 'e158df5a8df1' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Create QCMetrics and FileRecord tables.""" + + # ======================================================================== + # FileRecord Tables (reusable across QCRecord, Sample, etc.) + # ======================================================================== + + # filerecord - main file metadata table + op.create_table( + 'filerecord', + sa.Column('id', sa.Uuid(), nullable=False), + sa.Column('entity_type', sqlmodel.sql.sqltypes.AutoString(length=50), nullable=False), + sa.Column('entity_id', sa.Uuid(), nullable=False), + sa.Column('uri', sqlmodel.sql.sqltypes.AutoString(length=1024), nullable=False), + sa.Column('size', sa.BigInteger(), nullable=True), + sa.Column('created_on', sa.DateTime(), nullable=True), + sa.PrimaryKeyConstraint('id') + ) + op.create_index( + 'ix_filerecord_entity', + 'filerecord', + ['entity_type', 'entity_id'] + ) + + # filerecordhash - hash values for files + op.create_table( + 'filerecordhash', + sa.Column('id', sa.Uuid(), nullable=False), + sa.Column('file_record_id', sa.Uuid(), nullable=False), + sa.Column('algorithm', sqlmodel.sql.sqltypes.AutoString(length=50), nullable=False), + sa.Column('value', sqlmodel.sql.sqltypes.AutoString(length=128), nullable=False), + 
sa.ForeignKeyConstraint(['file_record_id'], ['filerecord.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('file_record_id', 'algorithm', name='uq_filerecordhash_file_algorithm') + ) + + # filerecordtag - key-value tags for files + op.create_table( + 'filerecordtag', + sa.Column('id', sa.Uuid(), nullable=False), + sa.Column('file_record_id', sa.Uuid(), nullable=False), + sa.Column('key', sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False), + sa.Column('value', sa.Text(), nullable=False), + sa.ForeignKeyConstraint(['file_record_id'], ['filerecord.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('file_record_id', 'key', name='uq_filerecordtag_file_key') + ) + + # filerecordsample - sample associations for files + op.create_table( + 'filerecordsample', + sa.Column('id', sa.Uuid(), nullable=False), + sa.Column('file_record_id', sa.Uuid(), nullable=False), + sa.Column('sample_name', sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False), + sa.Column('role', sqlmodel.sql.sqltypes.AutoString(length=50), nullable=True), + sa.ForeignKeyConstraint(['file_record_id'], ['filerecord.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('file_record_id', 'sample_name', name='uq_filerecordsample_file_sample') + ) + + # ======================================================================== + # QCRecord Tables + # ======================================================================== + + # qcrecord - main QC record table + op.create_table( + 'qcrecord', + sa.Column('id', sa.Uuid(), nullable=False), + sa.Column('created_on', sa.DateTime(), nullable=False), + sa.Column('created_by', sqlmodel.sql.sqltypes.AutoString(length=100), nullable=False), + sa.Column('project_id', sqlmodel.sql.sqltypes.AutoString(length=50), nullable=False), + sa.PrimaryKeyConstraint('id') + ) + op.create_index('ix_qcrecord_project_id', 'qcrecord', ['project_id']) + + # qcrecordmetadata - 
pipeline-level metadata + op.create_table( + 'qcrecordmetadata', + sa.Column('id', sa.Uuid(), nullable=False), + sa.Column('qcrecord_id', sa.Uuid(), nullable=False), + sa.Column('key', sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False), + sa.Column('value', sa.Text(), nullable=False), + sa.ForeignKeyConstraint(['qcrecord_id'], ['qcrecord.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('qcrecord_id', 'key', name='uq_qcrecordmetadata_record_key') + ) + + # qcmetric - named metric groups + op.create_table( + 'qcmetric', + sa.Column('id', sa.Uuid(), nullable=False), + sa.Column('qcrecord_id', sa.Uuid(), nullable=False), + sa.Column('name', sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False), + sa.ForeignKeyConstraint(['qcrecord_id'], ['qcrecord.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('qcrecord_id', 'name', name='uq_qcmetric_record_name') + ) + + # qcmetricvalue - metric values + op.create_table( + 'qcmetricvalue', + sa.Column('id', sa.Uuid(), nullable=False), + sa.Column('qc_metric_id', sa.Uuid(), nullable=False), + sa.Column('key', sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False), + sa.Column('value', sa.Text(), nullable=False), + sa.ForeignKeyConstraint(['qc_metric_id'], ['qcmetric.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('qc_metric_id', 'key', name='uq_qcmetricvalue_metric_key') + ) + + # qcmetricsample - sample associations for metrics + op.create_table( + 'qcmetricsample', + sa.Column('id', sa.Uuid(), nullable=False), + sa.Column('qc_metric_id', sa.Uuid(), nullable=False), + sa.Column('sample_name', sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False), + sa.Column('role', sqlmodel.sql.sqltypes.AutoString(length=50), nullable=True), + sa.ForeignKeyConstraint(['qc_metric_id'], ['qcmetric.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('qc_metric_id', 'sample_name', 
def downgrade() -> None:
    """
    Drop QCMetrics and FileRecord tables.

    Tables are dropped child-first so that no table is removed while another
    table still holds a foreign key to it; each explicitly created index is
    dropped just before its table.
    """

    # Drop QCRecord tables (in reverse order of creation)
    # qcmetricsample and qcmetricvalue reference qcmetric, which references qcrecord.
    op.drop_table('qcmetricsample')
    op.drop_table('qcmetricvalue')
    op.drop_table('qcmetric')
    # qcrecordmetadata references qcrecord, so it goes before the parent too.
    op.drop_table('qcrecordmetadata')
    op.drop_index('ix_qcrecord_project_id', table_name='qcrecord')
    op.drop_table('qcrecord')

    # Drop FileRecord tables
    # The three child tables reference filerecord and are dropped first.
    op.drop_table('filerecordsample')
    op.drop_table('filerecordtag')
    op.drop_table('filerecordhash')
    op.drop_index('ix_filerecord_entity', table_name='filerecord')
    op.drop_table('filerecord')
def create_qcrecord(
    session: Session,
    qcrecord_create: QCRecordCreate,
    created_by: str,
) -> QCRecordPublic:
    """
    Create a new QC record with all associated data.

    Handles both the new explicit format (metrics with samples) and
    the legacy ES format (sample_level_metrics dict). If the duplicate check
    finds an equivalent record for the project, that record is returned and
    nothing is written.

    Args:
        session: Database session used for all reads and writes.
        qcrecord_create: Validated request payload.
        created_by: Identifier stored as the record's creator.

    Returns:
        The public representation of the created (or already existing) record.

    Raises:
        HTTPException: 409 if the payload violates a uniqueness constraint
            (for example two metrics with the same name in one record).
    """
    # Function-scoped import, matching the local-import style used elsewhere
    # in this module.
    from sqlalchemy.exc import IntegrityError

    existing = _check_duplicate_record(session, qcrecord_create)
    if existing:
        logger.info(
            "Equivalent QC record already exists for project %s: %s",
            qcrecord_create.project_id,
            existing.id
        )
        return _qcrecord_to_public(session, existing)

    try:
        qcrecord = QCRecord(
            created_on=datetime.now(timezone.utc),
            created_by=created_by,
            project_id=qcrecord_create.project_id,
        )
        session.add(qcrecord)
        session.flush()  # Assigns qcrecord.id for the child rows below

        # Pipeline-level metadata; values are stored as strings.
        session.add_all([
            QCRecordMetadata(qcrecord_id=qcrecord.id, key=key, value=str(value))
            for key, value in (qcrecord_create.metadata or {}).items()
        ])

        # Metrics in the new explicit format.
        for metric_input in qcrecord_create.metrics or []:
            _create_metric(session, qcrecord.id, metric_input)

        # Legacy sample_level_metrics format (ES compatibility):
        # converted to one single-sample metric group per sample.
        legacy_metrics = qcrecord_create.sample_level_metrics or {}
        for sample_name, metrics_dict in legacy_metrics.items():
            legacy_metric = MetricInput(
                name=f"sample_metrics_{sample_name}",
                samples=[{"sample_name": sample_name}],
                values=metrics_dict,
            )
            _create_metric(session, qcrecord.id, legacy_metric)

        # Output files are linked polymorphically to this record.
        for file_create in qcrecord_create.output_files or []:
            _create_file_record(
                session,
                entity_type=FileRecordEntityType.QCRECORD,
                entity_id=qcrecord.id,
                file_create=file_create,
            )

        session.commit()
    except IntegrityError as exc:
        # A unique constraint was violated; undo the partial inserts and
        # report a client error instead of an unhandled 500.
        session.rollback()
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail=(
                "QC record for project "
                f"{qcrecord_create.project_id} contains conflicting entries"
            ),
        ) from exc
    except Exception:
        # Never leave the session holding a half-written record.
        session.rollback()
        raise

    session.refresh(qcrecord)

    logger.info(
        "Created QC record %s for project %s by %s",
        qcrecord.id,
        qcrecord.project_id,
        created_by
    )

    return _qcrecord_to_public(session, qcrecord)
def _check_duplicate_record(
    session: Session,
    qcrecord_create: QCRecordCreate,
) -> QCRecord | None:
    """
    Check if an equivalent QC record already exists.

    Only the newest record of the project is considered. It counts as a
    duplicate when its metadata, its metric groups (name, sample associations
    and values) and its output files (uri, size, hashes, tags and sample
    associations) all match the request. Comparing metadata alone would treat
    every submission with the same (or no) metadata as a duplicate and
    silently discard its new metrics and files.

    File ``created_on`` timestamps are not compared.

    Returns the existing record if found, None otherwise.
    """
    # Function-scoped import, matching the local-import style used elsewhere
    # in this module.
    from collections import Counter

    # Only the newest record is needed, so do not load the whole history.
    latest = session.exec(
        select(QCRecord)
        .where(QCRecord.project_id == qcrecord_create.project_id)
        .order_by(col(QCRecord.created_on).desc())
    ).first()
    if latest is None:
        return None

    # --- Metadata (values are stored as strings) ---
    stored_metadata = {
        entry.key: entry.value
        for entry in session.exec(
            select(QCRecordMetadata).where(
                QCRecordMetadata.qcrecord_id == latest.id
            )
        ).all()
    }
    requested_metadata = {
        key: str(value)
        for key, value in (qcrecord_create.metadata or {}).items()
    }
    if stored_metadata != requested_metadata:
        return None

    # --- Metrics: name -> (sample associations, values) ---
    requested_metrics = {
        metric.name: (
            frozenset((s.sample_name, s.role) for s in metric.samples or []),
            frozenset((key, str(value)) for key, value in metric.values.items()),
        )
        for metric in qcrecord_create.metrics or []
    }
    # Legacy payloads are stored as one single-sample group per sample.
    legacy_metrics = qcrecord_create.sample_level_metrics or {}
    for sample_name, values in legacy_metrics.items():
        requested_metrics[f"sample_metrics_{sample_name}"] = (
            frozenset({(sample_name, None)}),
            frozenset((key, str(value)) for key, value in values.items()),
        )

    stored_metrics = {}
    for metric in session.exec(
        select(QCMetric).where(QCMetric.qcrecord_id == latest.id)
    ).all():
        metric_samples = session.exec(
            select(QCMetricSample).where(QCMetricSample.qc_metric_id == metric.id)
        ).all()
        metric_values = session.exec(
            select(QCMetricValue).where(QCMetricValue.qc_metric_id == metric.id)
        ).all()
        stored_metrics[metric.name] = (
            frozenset((s.sample_name, s.role) for s in metric_samples),
            frozenset((v.key, v.value) for v in metric_values),
        )
    if stored_metrics != requested_metrics:
        return None

    # --- Output files, compared as a multiset of signatures ---
    requested_files = Counter(
        (
            file_create.uri,
            file_create.size,
            frozenset((file_create.hash or {}).items()),
            frozenset(
                (key, str(value))
                for key, value in (file_create.tags or {}).items()
            ),
            frozenset(
                (s.sample_name, s.role) for s in file_create.samples or []
            ),
        )
        for file_create in qcrecord_create.output_files or []
    )

    stored_files = Counter()
    for file_record in session.exec(
        select(FileRecord).where(
            FileRecord.entity_type == FileRecordEntityType.QCRECORD,
            FileRecord.entity_id == latest.id
        )
    ).all():
        file_hashes = session.exec(
            select(FileRecordHash).where(
                FileRecordHash.file_record_id == file_record.id
            )
        ).all()
        file_tags = session.exec(
            select(FileRecordTag).where(
                FileRecordTag.file_record_id == file_record.id
            )
        ).all()
        file_samples = session.exec(
            select(FileRecordSample).where(
                FileRecordSample.file_record_id == file_record.id
            )
        ).all()
        stored_files[(
            file_record.uri,
            file_record.size,
            frozenset((h.algorithm, h.value) for h in file_hashes),
            frozenset((t.key, t.value) for t in file_tags),
            frozenset((s.sample_name, s.role) for s in file_samples),
        )] += 1

    if stored_files != requested_files:
        return None

    return latest
def get_qcrecord_by_id(session: Session, qcrecord_id: str) -> QCRecordPublic:
    """
    Fetch one QC record by its identifier.

    Args:
        session: Database session.
        qcrecord_id: Record identifier as a UUID string.

    Returns:
        The public representation of the record.

    Raises:
        HTTPException: 400 if ``qcrecord_id`` is not a valid UUID string,
            404 if no record has that identifier.
    """
    from uuid import UUID

    # Parse first so a malformed identifier is reported as a client error.
    try:
        parsed_id = UUID(qcrecord_id)
    except ValueError as exc:
        bad_request = HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Invalid UUID format: {qcrecord_id}"
        )
        raise bad_request from exc

    found = session.get(QCRecord, parsed_id)
    if found:
        return _qcrecord_to_public(session, found)

    raise HTTPException(
        status_code=status.HTTP_404_NOT_FOUND,
        detail=f"QC record not found: {qcrecord_id}"
    )
+ session.delete(record) + session.commit() + + logger.info("Deleted QC record %s", qcrecord_id) + + return {"status": "deleted", "id": qcrecord_id} + + +def _qcrecord_to_public(session: Session, record: QCRecord) -> QCRecordPublic: + """Convert a QCRecord database object to public format.""" + # Get metadata + metadata_entries = session.exec( + select(QCRecordMetadata).where( + QCRecordMetadata.qcrecord_id == record.id + ) + ).all() + + metadata = [ + MetadataKeyValue(key=m.key, value=m.value) + for m in metadata_entries + ] + + # Get metrics + metric_entries = session.exec( + select(QCMetric).where(QCMetric.qcrecord_id == record.id) + ).all() + + metrics = [] + for metric in metric_entries: + # Get metric values + values = session.exec( + select(QCMetricValue).where(QCMetricValue.qc_metric_id == metric.id) + ).all() + + # Get metric samples + samples = session.exec( + select(QCMetricSample).where(QCMetricSample.qc_metric_id == metric.id) + ).all() + + metrics.append(MetricPublic( + name=metric.name, + samples=[ + MetricSamplePublic(sample_name=s.sample_name, role=s.role) + for s in samples + ], + values=[ + MetricValuePublic(key=v.key, value=v.value) + for v in values + ], + )) + + # Get file records + file_records = session.exec( + select(FileRecord).where( + FileRecord.entity_type == FileRecordEntityType.QCRECORD, + FileRecord.entity_id == record.id + ) + ).all() + + output_files = [] + for file_record in file_records: + # Get hashes + hashes = session.exec( + select(FileRecordHash).where( + FileRecordHash.file_record_id == file_record.id + ) + ).all() + + # Get tags + tags = session.exec( + select(FileRecordTag).where( + FileRecordTag.file_record_id == file_record.id + ) + ).all() + + # Get samples + samples = session.exec( + select(FileRecordSample).where( + FileRecordSample.file_record_id == file_record.id + ) + ).all() + + output_files.append(FileRecordPublic( + id=file_record.id, + uri=file_record.uri, + size=file_record.size, + 
created_on=file_record.created_on, + hashes=[HashPublic(algorithm=h.algorithm, value=h.value) for h in hashes], + tags=[TagPublic(key=t.key, value=t.value) for t in tags], + samples=[SamplePublic(sample_name=s.sample_name, role=s.role) for s in samples], + )) + + return QCRecordPublic( + id=record.id, + created_on=record.created_on, + created_by=record.created_by, + project_id=record.project_id, + metadata=metadata, + metrics=metrics, + output_files=output_files, + ) From 91407637f8bef75bf5a53559bbb4df4c495067ea Mon Sep 17 00:00:00 2001 From: vasques1 Date: Thu, 29 Jan 2026 19:02:54 -0500 Subject: [PATCH 05/13] add qcmetrics routes --- api/qcmetrics/routes.py | 206 ++++++++++++++++++++++++++++++++++++++++ main.py | 1 + 2 files changed, 207 insertions(+) create mode 100644 api/qcmetrics/routes.py diff --git a/api/qcmetrics/routes.py b/api/qcmetrics/routes.py new file mode 100644 index 0000000..29c18d3 --- /dev/null +++ b/api/qcmetrics/routes.py @@ -0,0 +1,206 @@ +""" +Routes/endpoints for the QCMetrics API. + +Provides endpoints for creating, searching, and deleting QC records. +""" + +from fastapi import APIRouter, Depends, Query, status +from typing import Optional + +from api.qcmetrics.models import ( + QCRecordCreate, + QCRecordPublic, + QCRecordsPublic, + QCRecordSearchRequest, +) +from api.qcmetrics import services +from core.deps import SessionDep + +router = APIRouter(prefix="/qcmetrics", tags=["QC Metrics"]) + + +@router.post( + "", + response_model=QCRecordPublic, + status_code=status.HTTP_201_CREATED, + summary="Create a new QC record", +) +def create_qcrecord( + session: SessionDep, + qcrecord_create: QCRecordCreate, + created_by: str = Query( + ..., + description="Username of the person creating this record" + ), +) -> QCRecordPublic: + """ + Create a new QC record with metrics and output files. + + The record stores quality control metrics from a pipeline execution. 
+ + **Request body format:** + + ```json + { + "project_id": "P-1234", + "metadata": { + "pipeline": "RNA-Seq", + "version": "2.0.0" + }, + "metrics": [ + { + "name": "alignment_stats", + "samples": [{"sample_name": "Sample1"}], + "values": {"reads": "50000000", "alignment_rate": "95.5"} + } + ], + "output_files": [ + { + "uri": "s3://bucket/path/file.bam", + "size": 123456789, + "samples": [{"sample_name": "Sample1"}], + "hash": {"md5": "abc123..."}, + "tags": {"type": "alignment"} + } + ] + } + ``` + + **Sample association patterns:** + - **Workflow-level**: Omit `samples` array (applies to entire pipeline run) + - **Single sample**: One entry in `samples` array + - **Sample pair**: Two entries with roles, e.g., `[{"sample_name": "T1", "role": "tumor"}, {"sample_name": "N1", "role": "normal"}]` + + **Duplicate detection:** + If an equivalent record already exists for the project (same metadata), + the existing record is returned instead of creating a duplicate. + """ + return services.create_qcrecord(session, qcrecord_create, created_by) + + +@router.get( + "/search", + response_model=QCRecordsPublic, + summary="Search QC records (GET)", +) +def search_qcrecords_get( + session: SessionDep, + project_id: Optional[str] = Query(None, description="Filter by project ID"), + latest: bool = Query(True, description="Return only newest record per project"), + page: int = Query(1, ge=1, description="Page number"), + per_page: int = Query(100, ge=1, le=1000, description="Results per page"), +) -> QCRecordsPublic: + """ + Search QC records using query parameters. 
+ + **Parameters:** + - `project_id`: Filter to specific project(s) + - `latest`: If true (default), returns only the most recent QC record per project + - `page`: Page number for pagination (starts at 1) + - `per_page`: Number of results per page (max 1000) + + **Example:** + ``` + GET /api/v1/qcmetrics/search?project_id=P-1234&latest=true + ``` + """ + filter_on = {} + if project_id: + filter_on["project_id"] = project_id + + return services.search_qcrecords( + session, + filter_on=filter_on, + page=page, + per_page=per_page, + latest=latest, + ) + + +@router.post( + "/search", + response_model=QCRecordsPublic, + summary="Search QC records (POST)", +) +def search_qcrecords_post( + session: SessionDep, + search_request: QCRecordSearchRequest, +) -> QCRecordsPublic: + """ + Search QC records using a JSON body for advanced filtering. + + **Request body format:** + + ```json + { + "filter_on": { + "project_id": "P-1234", + "metadata": { + "pipeline": "RNA-Seq" + } + }, + "page": 1, + "per_page": 100, + "latest": true + } + ``` + + **Filter options:** + - `project_id`: Single value or list of project IDs + - `metadata`: Key-value pairs to match against pipeline metadata + + **Pagination:** + - `page`: Page number (starts at 1) + - `per_page`: Results per page (max 1000) + + **Latest filtering:** + - `latest: true` (default): Returns only the newest QC record per project + - `latest: false`: Returns all matching records (full history) + """ + return services.search_qcrecords( + session, + filter_on=search_request.filter_on, + page=search_request.page, + per_page=search_request.per_page, + latest=search_request.latest, + ) + + +@router.get( + "/{qcrecord_id}", + response_model=QCRecordPublic, + summary="Get QC record by ID", +) +def get_qcrecord( + session: SessionDep, + qcrecord_id: str, +) -> QCRecordPublic: + """ + Retrieve a specific QC record by its UUID. + + Returns the full QC record including metadata, metrics, and output files. 
+ """ + return services.get_qcrecord_by_id(session, qcrecord_id) + + +@router.delete( + "/{qcrecord_id}", + status_code=status.HTTP_200_OK, + summary="Delete QC record", +) +def delete_qcrecord( + session: SessionDep, + qcrecord_id: str, +) -> dict: + """ + Delete a QC record and all associated data. + + This permanently removes: + - The QC record + - All associated metadata + - All associated metrics and metric values + - All associated output file records + + **Warning:** This action cannot be undone. + """ + return services.delete_qcrecord(session, qcrecord_id) diff --git a/main.py b/main.py index d9efa23..37b4192 100644 --- a/main.py +++ b/main.py @@ -18,6 +18,7 @@ from api.vendors.routes import router as vendors_router from api.workflow.routes import router as workflow_router from api.manifest.routes import router as manifest_router +from api.qcmetrics.routes import router as qcmetrics_router # Customize route id's From b4a6ec94470767c8b9b772465b666d6c53abae2f Mon Sep 17 00:00:00 2001 From: vasques1 Date: Thu, 29 Jan 2026 19:10:58 -0500 Subject: [PATCH 06/13] revise field metadata -> pipeline_metadata to appease SQLAlchemy - E sqlalchemy.exc.InvalidRequestError: Attribute name 'metadata' is reserved when using the Declarative API. 
--- api/qcmetrics/models.py | 4 ++-- main.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/api/qcmetrics/models.py b/api/qcmetrics/models.py index c980850..c2f909f 100644 --- a/api/qcmetrics/models.py +++ b/api/qcmetrics/models.py @@ -35,7 +35,7 @@ class QCRecordMetadata(SQLModel, table=True): value: str = Field(nullable=False) # Relationship back to parent - qcrecord: "QCRecord" = Relationship(back_populates="metadata") + qcrecord: "QCRecord" = Relationship(back_populates="pipeline_metadata") __table_args__ = ( UniqueConstraint("qcrecord_id", "key", name="uq_qcrecordmetadata_record_key"), @@ -136,7 +136,7 @@ class QCRecord(SQLModel, table=True): project_id: str = Field(max_length=50, nullable=False, index=True) # Relationships to child tables - metadata: List["QCRecordMetadata"] = Relationship( + pipeline_metadata: List["QCRecordMetadata"] = Relationship( back_populates="qcrecord", sa_relationship_kwargs={"cascade": "all, delete-orphan"} ) diff --git a/main.py b/main.py index 37b4192..4879ea4 100644 --- a/main.py +++ b/main.py @@ -69,6 +69,7 @@ def health_check(): app.include_router(vendors_router, prefix=API_PREFIX) app.include_router(manifest_router, prefix=API_PREFIX) app.include_router(workflow_router, prefix=API_PREFIX) +app.include_router(qcmetrics_router, prefix=API_PREFIX) if __name__ == "__main__": From 0bdc2a2d24c2bf151c0eff48461556229c5a44f0 Mon Sep 17 00:00:00 2001 From: vasques1 Date: Thu, 29 Jan 2026 19:11:14 -0500 Subject: [PATCH 07/13] add test suite --- tests/api/test_qcmetrics.py | 453 ++++++++++++++++++++++++++++++++++++ 1 file changed, 453 insertions(+) create mode 100644 tests/api/test_qcmetrics.py diff --git a/tests/api/test_qcmetrics.py b/tests/api/test_qcmetrics.py new file mode 100644 index 0000000..de1d69d --- /dev/null +++ b/tests/api/test_qcmetrics.py @@ -0,0 +1,453 @@ +""" +Tests for the QCMetrics API. 
+""" + +from fastapi.testclient import TestClient +from sqlmodel import Session + + +def test_create_qcrecord_basic(client: TestClient, session: Session): + """ + Test creating a basic QC record with metadata only. + """ + qcrecord_data = { + "project_id": "P-TEST-001", + "metadata": { + "pipeline": "RNA-Seq", + "version": "2.0.0" + } + } + + response = client.post( + "/api/v1/qcmetrics?created_by=test_user", + json=qcrecord_data + ) + assert response.status_code == 201 + + data = response.json() + assert data["project_id"] == "P-TEST-001" + assert data["created_by"] == "test_user" + assert len(data["metadata"]) == 2 + + # Check metadata values + metadata_dict = {m["key"]: m["value"] for m in data["metadata"]} + assert metadata_dict["pipeline"] == "RNA-Seq" + assert metadata_dict["version"] == "2.0.0" + + +def test_create_qcrecord_with_single_sample_metrics(client: TestClient, session: Session): + """ + Test creating a QC record with single-sample metrics. + """ + qcrecord_data = { + "project_id": "P-TEST-002", + "metadata": { + "pipeline": "WES" + }, + "metrics": [ + { + "name": "alignment_stats", + "samples": [{"sample_name": "Sample1"}], + "values": { + "total_reads": "50000000", + "mapped_reads": "48500000", + "alignment_rate": "97.0" + } + } + ] + } + + response = client.post( + "/api/v1/qcmetrics?created_by=test_user", + json=qcrecord_data + ) + assert response.status_code == 201 + + data = response.json() + assert len(data["metrics"]) == 1 + + metric = data["metrics"][0] + assert metric["name"] == "alignment_stats" + assert len(metric["samples"]) == 1 + assert metric["samples"][0]["sample_name"] == "Sample1" + + # Check metric values + values_dict = {v["key"]: v["value"] for v in metric["values"]} + assert values_dict["total_reads"] == "50000000" + assert values_dict["alignment_rate"] == "97.0" + + +def test_create_qcrecord_with_paired_sample_metrics(client: TestClient, session: Session): + """ + Test creating a QC record with tumor/normal paired metrics. 
+ """ + qcrecord_data = { + "project_id": "P-TEST-003", + "metadata": { + "pipeline": "Somatic" + }, + "metrics": [ + { + "name": "somatic_variants", + "samples": [ + {"sample_name": "Sample1", "role": "tumor"}, + {"sample_name": "Sample2", "role": "normal"} + ], + "values": { + "snv_count": "15234", + "indel_count": "1523", + "tmb": "8.5" + } + } + ] + } + + response = client.post( + "/api/v1/qcmetrics?created_by=test_user", + json=qcrecord_data + ) + assert response.status_code == 201 + + data = response.json() + metric = data["metrics"][0] + + # Check paired samples with roles + assert len(metric["samples"]) == 2 + samples_by_role = {s["role"]: s["sample_name"] for s in metric["samples"]} + assert samples_by_role["tumor"] == "Sample1" + assert samples_by_role["normal"] == "Sample2" + + +def test_create_qcrecord_with_workflow_level_metrics(client: TestClient, session: Session): + """ + Test creating a QC record with workflow-level metrics (no samples). + """ + qcrecord_data = { + "project_id": "P-TEST-004", + "metadata": { + "pipeline": "RNA-Seq" + }, + "metrics": [ + { + "name": "pipeline_summary", + "values": { + "total_samples_processed": "48", + "samples_passed_qc": "46", + "pipeline_runtime_hours": "12.5" + } + } + ] + } + + response = client.post( + "/api/v1/qcmetrics?created_by=test_user", + json=qcrecord_data + ) + assert response.status_code == 201 + + data = response.json() + metric = data["metrics"][0] + + # Workflow-level metrics have no samples + assert len(metric["samples"]) == 0 + + values_dict = {v["key"]: v["value"] for v in metric["values"]} + assert values_dict["total_samples_processed"] == "48" + + +def test_create_qcrecord_with_output_files(client: TestClient, session: Session): + """ + Test creating a QC record with output files. 
+ """ + qcrecord_data = { + "project_id": "P-TEST-005", + "metadata": { + "pipeline": "WGS" + }, + "output_files": [ + { + "uri": "s3://bucket/Sample1.bam", + "size": 123456789, + "samples": [{"sample_name": "Sample1"}], + "hash": {"md5": "abc123def456"}, + "tags": {"type": "alignment", "format": "bam"} + }, + { + "uri": "s3://bucket/expression_matrix.tsv", + "size": 5678901, + "hash": {"sha256": "xyz789"}, + "tags": {"type": "expression"} + } + ] + } + + response = client.post( + "/api/v1/qcmetrics?created_by=test_user", + json=qcrecord_data + ) + assert response.status_code == 201 + + data = response.json() + assert len(data["output_files"]) == 2 + + # Check first file (single sample) + bam_file = next(f for f in data["output_files"] if "bam" in f["uri"]) + assert bam_file["size"] == 123456789 + assert len(bam_file["samples"]) == 1 + assert bam_file["samples"][0]["sample_name"] == "Sample1" + + # Check hashes + hashes_dict = {h["algorithm"]: h["value"] for h in bam_file["hashes"]} + assert hashes_dict["md5"] == "abc123def456" + + # Check tags + tags_dict = {t["key"]: t["value"] for t in bam_file["tags"]} + assert tags_dict["type"] == "alignment" + + # Check second file (workflow-level, no samples) + matrix_file = next(f for f in data["output_files"] if "matrix" in f["uri"]) + assert len(matrix_file["samples"]) == 0 + + +def test_search_qcrecords_empty(client: TestClient, session: Session): + """ + Test searching QC records when none exist. + """ + response = client.get("/api/v1/qcmetrics/search") + assert response.status_code == 200 + + data = response.json() + assert data["total"] == 0 + assert data["data"] == [] + + +def test_search_qcrecords_by_project_id(client: TestClient, session: Session): + """ + Test searching QC records by project ID. 
+ """ + # Create a QC record + qcrecord_data = { + "project_id": "P-SEARCH-001", + "metadata": {"pipeline": "RNA-Seq"} + } + client.post("/api/v1/qcmetrics?created_by=test_user", json=qcrecord_data) + + # Search for it + response = client.get("/api/v1/qcmetrics/search?project_id=P-SEARCH-001") + assert response.status_code == 200 + + data = response.json() + assert data["total"] == 1 + assert data["data"][0]["project_id"] == "P-SEARCH-001" + + +def test_search_qcrecords_latest_only(client: TestClient, session: Session): + """ + Test that latest=true returns only the newest record per project. + """ + # Create two QC records for the same project + qcrecord_data_1 = { + "project_id": "P-LATEST-001", + "metadata": {"version": "1.0"} + } + client.post("/api/v1/qcmetrics?created_by=test_user", json=qcrecord_data_1) + + qcrecord_data_2 = { + "project_id": "P-LATEST-001", + "metadata": {"version": "2.0"} # Different metadata, so not a duplicate + } + client.post("/api/v1/qcmetrics?created_by=test_user", json=qcrecord_data_2) + + # Search with latest=true (default) + response = client.get("/api/v1/qcmetrics/search?project_id=P-LATEST-001&latest=true") + assert response.status_code == 200 + + data = response.json() + assert data["total"] == 1 + + # Should be version 2.0 (the latest) + metadata_dict = {m["key"]: m["value"] for m in data["data"][0]["metadata"]} + assert metadata_dict["version"] == "2.0" + + +def test_search_qcrecords_all_versions(client: TestClient, session: Session): + """ + Test that latest=false returns all versions. 
+ """ + # Create two QC records for the same project + qcrecord_data_1 = { + "project_id": "P-ALLVER-001", + "metadata": {"version": "1.0"} + } + client.post("/api/v1/qcmetrics?created_by=test_user", json=qcrecord_data_1) + + qcrecord_data_2 = { + "project_id": "P-ALLVER-001", + "metadata": {"version": "2.0"} + } + client.post("/api/v1/qcmetrics?created_by=test_user", json=qcrecord_data_2) + + # Search with latest=false + response = client.get("/api/v1/qcmetrics/search?project_id=P-ALLVER-001&latest=false") + assert response.status_code == 200 + + data = response.json() + assert data["total"] == 2 + + +def test_search_qcrecords_post_with_metadata_filter(client: TestClient, session: Session): + """ + Test POST search with metadata filtering. + """ + # Create QC records with different pipelines + client.post("/api/v1/qcmetrics?created_by=test_user", json={ + "project_id": "P-META-001", + "metadata": {"pipeline": "RNA-Seq"} + }) + client.post("/api/v1/qcmetrics?created_by=test_user", json={ + "project_id": "P-META-002", + "metadata": {"pipeline": "WES"} + }) + + # Search for RNA-Seq pipeline only + search_request = { + "filter_on": { + "metadata": {"pipeline": "RNA-Seq"} + } + } + response = client.post("/api/v1/qcmetrics/search", json=search_request) + assert response.status_code == 200 + + data = response.json() + assert data["total"] == 1 + assert data["data"][0]["project_id"] == "P-META-001" + + +def test_get_qcrecord_by_id(client: TestClient, session: Session): + """ + Test getting a QC record by its ID. 
+ """ + # Create a QC record + create_response = client.post("/api/v1/qcmetrics?created_by=test_user", json={ + "project_id": "P-GET-001", + "metadata": {"pipeline": "RNA-Seq"} + }) + qcrecord_id = create_response.json()["id"] + + # Get by ID + response = client.get(f"/api/v1/qcmetrics/{qcrecord_id}") + assert response.status_code == 200 + + data = response.json() + assert data["id"] == qcrecord_id + assert data["project_id"] == "P-GET-001" + + +def test_get_qcrecord_not_found(client: TestClient, session: Session): + """ + Test getting a non-existent QC record returns 404. + """ + fake_uuid = "00000000-0000-0000-0000-000000000000" + response = client.get(f"/api/v1/qcmetrics/{fake_uuid}") + assert response.status_code == 404 + + +def test_get_qcrecord_invalid_uuid(client: TestClient, session: Session): + """ + Test getting with an invalid UUID format returns 400. + """ + response = client.get("/api/v1/qcmetrics/not-a-uuid") + assert response.status_code == 400 + + +def test_delete_qcrecord(client: TestClient, session: Session): + """ + Test deleting a QC record. + """ + # Create a QC record + create_response = client.post("/api/v1/qcmetrics?created_by=test_user", json={ + "project_id": "P-DELETE-001", + "metadata": {"pipeline": "RNA-Seq"} + }) + qcrecord_id = create_response.json()["id"] + + # Delete it + response = client.delete(f"/api/v1/qcmetrics/{qcrecord_id}") + assert response.status_code == 200 + assert response.json()["status"] == "deleted" + + # Verify it's gone + get_response = client.get(f"/api/v1/qcmetrics/{qcrecord_id}") + assert get_response.status_code == 404 + + +def test_delete_qcrecord_not_found(client: TestClient, session: Session): + """ + Test deleting a non-existent QC record returns 404. 
+ """ + fake_uuid = "00000000-0000-0000-0000-000000000000" + response = client.delete(f"/api/v1/qcmetrics/{fake_uuid}") + assert response.status_code == 404 + + +def test_duplicate_detection(client: TestClient, session: Session): + """ + Test that equivalent records are detected as duplicates. + """ + qcrecord_data = { + "project_id": "P-DUP-001", + "metadata": {"pipeline": "RNA-Seq", "version": "2.0"} + } + + # Create first record + response1 = client.post("/api/v1/qcmetrics?created_by=user1", json=qcrecord_data) + assert response1.status_code == 201 + id1 = response1.json()["id"] + + # Try to create identical record + response2 = client.post("/api/v1/qcmetrics?created_by=user2", json=qcrecord_data) + assert response2.status_code == 201 + id2 = response2.json()["id"] + + # Should return the same record (duplicate detection) + assert id1 == id2 + + +def test_legacy_sample_level_metrics_format(client: TestClient, session: Session): + """ + Test backward compatibility with the legacy ES format (sample_level_metrics). 
+ """ + qcrecord_data = { + "project_id": "P-LEGACY-001", + "metadata": {"pipeline": "RNA-Seq"}, + "sample_level_metrics": { + "Sample1": { + "reads": "50000000", + "alignment_rate": "95.5" + }, + "Sample2": { + "reads": "45000000", + "alignment_rate": "93.2" + } + } + } + + response = client.post( + "/api/v1/qcmetrics?created_by=test_user", + json=qcrecord_data + ) + assert response.status_code == 201 + + data = response.json() + + # Legacy format should be converted to metrics + assert len(data["metrics"]) == 2 + + # Check that sample names are preserved + metric_sample_names = set() + for metric in data["metrics"]: + for sample in metric["samples"]: + metric_sample_names.add(sample["sample_name"]) + + assert "Sample1" in metric_sample_names + assert "Sample2" in metric_sample_names From 4d10433532515734c1805b4b6b41c9282cc3104d Mon Sep 17 00:00:00 2001 From: vasques1 Date: Fri, 30 Jan 2026 12:12:44 -0500 Subject: [PATCH 08/13] reorder alphabetically for better swagger docs navigation --- main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 4879ea4..6f5eb3f 100644 --- a/main.py +++ b/main.py @@ -10,15 +10,15 @@ from api.files.routes import router as files_router from api.jobs.routes import router as jobs_router +from api.manifest.routes import router as manifest_router from api.project.routes import router as project_router +from api.qcmetrics.routes import router as qcmetrics_router from api.runs.routes import router as runs_router from api.samples.routes import router as samples_router from api.search.routes import router as search_router from api.settings.routes import router as settings_router from api.vendors.routes import router as vendors_router from api.workflow.routes import router as workflow_router -from api.manifest.routes import router as manifest_router -from api.qcmetrics.routes import router as qcmetrics_router # Customize route id's @@ -61,15 +61,15 @@ def health_check(): 
app.include_router(files_router, prefix=API_PREFIX) app.include_router(jobs_router, prefix=API_PREFIX) +app.include_router(manifest_router, prefix=API_PREFIX) app.include_router(project_router, prefix=API_PREFIX) +app.include_router(qcmetrics_router, prefix=API_PREFIX) app.include_router(runs_router, prefix=API_PREFIX) app.include_router(samples_router, prefix=API_PREFIX) app.include_router(search_router, prefix=API_PREFIX) app.include_router(settings_router, prefix=API_PREFIX) app.include_router(vendors_router, prefix=API_PREFIX) -app.include_router(manifest_router, prefix=API_PREFIX) app.include_router(workflow_router, prefix=API_PREFIX) -app.include_router(qcmetrics_router, prefix=API_PREFIX) if __name__ == "__main__": From 0b0a422cdc4021718a6f291e66d6c7c84f134f2e Mon Sep 17 00:00:00 2001 From: vasques1 Date: Fri, 30 Jan 2026 12:40:02 -0500 Subject: [PATCH 09/13] linting throughout PR --- api/filerecord/models.py | 6 +-- api/qcmetrics/models.py | 8 ++-- api/qcmetrics/routes.py | 37 ++++++++-------- api/qcmetrics/services.py | 86 ++++++++++++++++++------------------- tests/api/test_qcmetrics.py | 76 ++++++++++++++++---------------- 5 files changed, 106 insertions(+), 107 deletions(-) diff --git a/api/filerecord/models.py b/api/filerecord/models.py index fccb644..ae7588f 100644 --- a/api/filerecord/models.py +++ b/api/filerecord/models.py @@ -8,7 +8,7 @@ import uuid from datetime import datetime from enum import Enum -from typing import List, TYPE_CHECKING +from typing import List from sqlmodel import SQLModel, Field, Relationship, UniqueConstraint from pydantic import ConfigDict @@ -67,7 +67,7 @@ class FileRecordTag(SQLModel, table=True): class FileRecordSample(SQLModel, table=True): """ Associates samples with a file record. 
- + Supports: - 0 rows: workflow-level file (e.g., expression matrix) - 1 row: single-sample file (e.g., BAM file) @@ -91,7 +91,7 @@ class FileRecordSample(SQLModel, table=True): class FileRecord(SQLModel, table=True): """ Metadata record for files stored in external locations (S3, etc.). - + Uses polymorphic association via entity_type and entity_id to link to parent entities (QCRecord, Sample, etc.) without hard FK constraints. """ diff --git a/api/qcmetrics/models.py b/api/qcmetrics/models.py index c2f909f..2a23c5a 100644 --- a/api/qcmetrics/models.py +++ b/api/qcmetrics/models.py @@ -65,7 +65,7 @@ class QCMetricValue(SQLModel, table=True): class QCMetricSample(SQLModel, table=True): """ Associates samples with a metric group. - + Supports: - 0 rows: workflow-level metric (e.g., pipeline runtime) - 1 row: single-sample metric (e.g., alignment stats for Sample1) @@ -89,7 +89,7 @@ class QCMetricSample(SQLModel, table=True): class QCMetric(SQLModel, table=True): """ A named group of metrics within a QC record. - + Can be workflow-level (no samples), single-sample, or multi-sample (paired). Examples: alignment_stats, somatic_variants, expression_summary """ @@ -120,7 +120,7 @@ class QCMetric(SQLModel, table=True): class QCRecord(SQLModel, table=True): """ Main QC record entity - one per pipeline execution per project. - + Multiple records per project are allowed for versioning (history). The created_on timestamp differentiates versions. """ @@ -181,7 +181,7 @@ class MetricInput(SQLModel): class QCRecordCreate(SQLModel): """ Request model for creating a QC record. - + Accepts both the new explicit format and backward-compatible formats. """ project_id: str diff --git a/api/qcmetrics/routes.py b/api/qcmetrics/routes.py index 29c18d3..03a8fb3 100644 --- a/api/qcmetrics/routes.py +++ b/api/qcmetrics/routes.py @@ -4,8 +4,8 @@ Provides endpoints for creating, searching, and deleting QC records. 
""" -from fastapi import APIRouter, Depends, Query, status from typing import Optional +from fastapi import APIRouter, Query, status from api.qcmetrics.models import ( QCRecordCreate, @@ -35,11 +35,11 @@ def create_qcrecord( ) -> QCRecordPublic: """ Create a new QC record with metrics and output files. - + The record stores quality control metrics from a pipeline execution. - + **Request body format:** - + ```json { "project_id": "P-1234", @@ -65,12 +65,13 @@ def create_qcrecord( ] } ``` - + **Sample association patterns:** - **Workflow-level**: Omit `samples` array (applies to entire pipeline run) - **Single sample**: One entry in `samples` array - - **Sample pair**: Two entries with roles, e.g., `[{"sample_name": "T1", "role": "tumor"}, {"sample_name": "N1", "role": "normal"}]` - + - **Sample pair**: Two entries with roles, e.g., + `[{"sample_name": "T1", "role": "tumor"}, {"sample_name": "N1", "role": "normal"}]` + **Duplicate detection:** If an equivalent record already exists for the project (same metadata), the existing record is returned instead of creating a duplicate. @@ -92,13 +93,13 @@ def search_qcrecords_get( ) -> QCRecordsPublic: """ Search QC records using query parameters. - + **Parameters:** - `project_id`: Filter to specific project(s) - `latest`: If true (default), returns only the most recent QC record per project - `page`: Page number for pagination (starts at 1) - `per_page`: Number of results per page (max 1000) - + **Example:** ``` GET /api/v1/qcmetrics/search?project_id=P-1234&latest=true @@ -107,7 +108,7 @@ def search_qcrecords_get( filter_on = {} if project_id: filter_on["project_id"] = project_id - + return services.search_qcrecords( session, filter_on=filter_on, @@ -128,9 +129,9 @@ def search_qcrecords_post( ) -> QCRecordsPublic: """ Search QC records using a JSON body for advanced filtering. 
- + **Request body format:** - + ```json { "filter_on": { @@ -144,15 +145,15 @@ def search_qcrecords_post( "latest": true } ``` - + **Filter options:** - `project_id`: Single value or list of project IDs - `metadata`: Key-value pairs to match against pipeline metadata - + **Pagination:** - `page`: Page number (starts at 1) - `per_page`: Results per page (max 1000) - + **Latest filtering:** - `latest: true` (default): Returns only the newest QC record per project - `latest: false`: Returns all matching records (full history) @@ -177,7 +178,7 @@ def get_qcrecord( ) -> QCRecordPublic: """ Retrieve a specific QC record by its UUID. - + Returns the full QC record including metadata, metrics, and output files. """ return services.get_qcrecord_by_id(session, qcrecord_id) @@ -194,13 +195,13 @@ def delete_qcrecord( ) -> dict: """ Delete a QC record and all associated data. - + This permanently removes: - The QC record - All associated metadata - All associated metrics and metric values - All associated output file records - + **Warning:** This action cannot be undone. """ return services.delete_qcrecord(session, qcrecord_id) diff --git a/api/qcmetrics/services.py b/api/qcmetrics/services.py index c4bd136..aad81e9 100644 --- a/api/qcmetrics/services.py +++ b/api/qcmetrics/services.py @@ -6,10 +6,9 @@ import logging from datetime import datetime, timezone -from typing import List +import uuid as uuid_module from fastapi import HTTPException, status from sqlmodel import Session, select, col -from sqlalchemy import func from api.qcmetrics.models import ( QCRecord, @@ -50,7 +49,7 @@ def create_qcrecord( ) -> QCRecordPublic: """ Create a new QC record with all associated data. - + Handles both the new explicit format (metrics with samples) and the legacy ES format (sample_level_metrics dict). 
""" @@ -140,8 +139,10 @@ def _create_metric( for sample_input in metric_input.samples: sample_assoc = QCMetricSample( qc_metric_id=metric.id, - sample_name=sample_input.sample_name if hasattr(sample_input, 'sample_name') else sample_input['sample_name'], - role=sample_input.role if hasattr(sample_input, 'role') else sample_input.get('role'), + sample_name=sample_input.sample_name if hasattr(sample_input, 'sample_name') + else sample_input['sample_name'], + role=sample_input.role if hasattr(sample_input, 'role') + else sample_input.get('role'), ) session.add(sample_assoc) @@ -213,16 +214,16 @@ def _check_duplicate_record( ) -> QCRecord | None: """ Check if an equivalent QC record already exists. - + Returns the existing record if found, None otherwise. """ # Find existing records for this project stmt = select(QCRecord).where( QCRecord.project_id == qcrecord_create.project_id ).order_by(col(QCRecord.created_on).desc()) - + existing_records = session.exec(stmt).all() - + if not existing_records: return None @@ -230,7 +231,7 @@ def _check_duplicate_record( # A full comparison would require comparing all nested data # This is a simplified version that checks metadata keys latest = existing_records[0] - + # Get existing metadata existing_metadata = { m.key: m.value @@ -240,14 +241,14 @@ def _check_duplicate_record( ) ).all() } - + # Compare metadata new_metadata = qcrecord_create.metadata or {} if existing_metadata == {k: str(v) for k, v in new_metadata.items()}: # Metadata matches - could do deeper comparison here # For now, consider it a duplicate if metadata matches return latest - + return None @@ -260,7 +261,7 @@ def search_qcrecords( ) -> QCRecordsPublic: """ Search for QC records with filtering and pagination. 
- + Args: session: Database session filter_on: Dictionary of fields to filter by @@ -269,10 +270,10 @@ def search_qcrecords( latest: If True, return only the newest record per project """ filter_on = filter_on or {} - + # Build base query stmt = select(QCRecord) - + # Apply filters if "project_id" in filter_on: project_ids = filter_on["project_id"] @@ -280,7 +281,7 @@ def search_qcrecords( stmt = stmt.where(col(QCRecord.project_id).in_(project_ids)) else: stmt = stmt.where(QCRecord.project_id == project_ids) - + # Handle metadata filtering if "metadata" in filter_on and isinstance(filter_on["metadata"], dict): for key, value in filter_on["metadata"].items(): @@ -290,13 +291,13 @@ def search_qcrecords( QCRecordMetadata.value == str(value) ) stmt = stmt.where(col(QCRecord.id).in_(subq)) - + # Order by created_on descending stmt = stmt.order_by(col(QCRecord.created_on).desc()) - + # Execute to get all matching records all_records = list(session.exec(stmt).all()) - + # Apply "latest" filter - keep only newest per project if latest: seen_projects = set() @@ -306,16 +307,16 @@ def search_qcrecords( filtered_records.append(record) seen_projects.add(record.project_id) all_records = filtered_records - + # Calculate pagination total = len(all_records) start_idx = (page - 1) * per_page end_idx = start_idx + per_page paginated_records = all_records[start_idx:end_idx] - + # Convert to public format data = [_qcrecord_to_public(session, record) for record in paginated_records] - + return QCRecordsPublic( data=data, total=total, @@ -326,8 +327,7 @@ def search_qcrecords( def get_qcrecord_by_id(session: Session, qcrecord_id: str) -> QCRecordPublic: """Get a single QC record by ID.""" - import uuid as uuid_module - + try: record_uuid = uuid_module.UUID(qcrecord_id) except ValueError as exc: @@ -335,21 +335,19 @@ def get_qcrecord_by_id(session: Session, qcrecord_id: str) -> QCRecordPublic: status_code=status.HTTP_400_BAD_REQUEST, detail=f"Invalid UUID format: {qcrecord_id}" ) from exc 
- + record = session.get(QCRecord, record_uuid) if not record: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=f"QC record not found: {qcrecord_id}" ) - + return _qcrecord_to_public(session, record) def delete_qcrecord(session: Session, qcrecord_id: str) -> dict: """Delete a QC record and all associated data.""" - import uuid as uuid_module - try: record_uuid = uuid_module.UUID(qcrecord_id) except ValueError as exc: @@ -357,14 +355,14 @@ def delete_qcrecord(session: Session, qcrecord_id: str) -> dict: status_code=status.HTTP_400_BAD_REQUEST, detail=f"Invalid UUID format: {qcrecord_id}" ) from exc - + record = session.get(QCRecord, record_uuid) if not record: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=f"QC record not found: {qcrecord_id}" ) - + # Delete associated file records (polymorphic, not cascade) file_records = session.exec( select(FileRecord).where( @@ -372,16 +370,16 @@ def delete_qcrecord(session: Session, qcrecord_id: str) -> dict: FileRecord.entity_id == record_uuid ) ).all() - + for file_record in file_records: session.delete(file_record) - + # Delete the QC record (cascades to metadata, metrics, etc.) 
session.delete(record) session.commit() - + logger.info("Deleted QC record %s", qcrecord_id) - + return {"status": "deleted", "id": qcrecord_id} @@ -393,29 +391,29 @@ def _qcrecord_to_public(session: Session, record: QCRecord) -> QCRecordPublic: QCRecordMetadata.qcrecord_id == record.id ) ).all() - + metadata = [ MetadataKeyValue(key=m.key, value=m.value) for m in metadata_entries ] - + # Get metrics metric_entries = session.exec( select(QCMetric).where(QCMetric.qcrecord_id == record.id) ).all() - + metrics = [] for metric in metric_entries: # Get metric values values = session.exec( select(QCMetricValue).where(QCMetricValue.qc_metric_id == metric.id) ).all() - + # Get metric samples samples = session.exec( select(QCMetricSample).where(QCMetricSample.qc_metric_id == metric.id) ).all() - + metrics.append(MetricPublic( name=metric.name, samples=[ @@ -427,7 +425,7 @@ def _qcrecord_to_public(session: Session, record: QCRecord) -> QCRecordPublic: for v in values ], )) - + # Get file records file_records = session.exec( select(FileRecord).where( @@ -435,7 +433,7 @@ def _qcrecord_to_public(session: Session, record: QCRecord) -> QCRecordPublic: FileRecord.entity_id == record.id ) ).all() - + output_files = [] for file_record in file_records: # Get hashes @@ -444,21 +442,21 @@ def _qcrecord_to_public(session: Session, record: QCRecord) -> QCRecordPublic: FileRecordHash.file_record_id == file_record.id ) ).all() - + # Get tags tags = session.exec( select(FileRecordTag).where( FileRecordTag.file_record_id == file_record.id ) ).all() - + # Get samples samples = session.exec( select(FileRecordSample).where( FileRecordSample.file_record_id == file_record.id ) ).all() - + output_files.append(FileRecordPublic( id=file_record.id, uri=file_record.uri, @@ -468,7 +466,7 @@ def _qcrecord_to_public(session: Session, record: QCRecord) -> QCRecordPublic: tags=[TagPublic(key=t.key, value=t.value) for t in tags], samples=[SamplePublic(sample_name=s.sample_name, role=s.role) for s in 
samples], )) - + return QCRecordPublic( id=record.id, created_on=record.created_on, diff --git a/tests/api/test_qcmetrics.py b/tests/api/test_qcmetrics.py index de1d69d..c422254 100644 --- a/tests/api/test_qcmetrics.py +++ b/tests/api/test_qcmetrics.py @@ -23,12 +23,12 @@ def test_create_qcrecord_basic(client: TestClient, session: Session): json=qcrecord_data ) assert response.status_code == 201 - + data = response.json() assert data["project_id"] == "P-TEST-001" assert data["created_by"] == "test_user" assert len(data["metadata"]) == 2 - + # Check metadata values metadata_dict = {m["key"]: m["value"] for m in data["metadata"]} assert metadata_dict["pipeline"] == "RNA-Seq" @@ -62,15 +62,15 @@ def test_create_qcrecord_with_single_sample_metrics(client: TestClient, session: json=qcrecord_data ) assert response.status_code == 201 - + data = response.json() assert len(data["metrics"]) == 1 - + metric = data["metrics"][0] assert metric["name"] == "alignment_stats" assert len(metric["samples"]) == 1 assert metric["samples"][0]["sample_name"] == "Sample1" - + # Check metric values values_dict = {v["key"]: v["value"] for v in metric["values"]} assert values_dict["total_reads"] == "50000000" @@ -107,10 +107,10 @@ def test_create_qcrecord_with_paired_sample_metrics(client: TestClient, session: json=qcrecord_data ) assert response.status_code == 201 - + data = response.json() metric = data["metrics"][0] - + # Check paired samples with roles assert len(metric["samples"]) == 2 samples_by_role = {s["role"]: s["sample_name"] for s in metric["samples"]} @@ -144,13 +144,13 @@ def test_create_qcrecord_with_workflow_level_metrics(client: TestClient, session json=qcrecord_data ) assert response.status_code == 201 - + data = response.json() metric = data["metrics"][0] - + # Workflow-level metrics have no samples assert len(metric["samples"]) == 0 - + values_dict = {v["key"]: v["value"] for v in metric["values"]} assert values_dict["total_samples_processed"] == "48" @@ -186,24 +186,24 
@@ def test_create_qcrecord_with_output_files(client: TestClient, session: Session) json=qcrecord_data ) assert response.status_code == 201 - + data = response.json() assert len(data["output_files"]) == 2 - + # Check first file (single sample) bam_file = next(f for f in data["output_files"] if "bam" in f["uri"]) assert bam_file["size"] == 123456789 assert len(bam_file["samples"]) == 1 assert bam_file["samples"][0]["sample_name"] == "Sample1" - + # Check hashes hashes_dict = {h["algorithm"]: h["value"] for h in bam_file["hashes"]} assert hashes_dict["md5"] == "abc123def456" - + # Check tags tags_dict = {t["key"]: t["value"] for t in bam_file["tags"]} assert tags_dict["type"] == "alignment" - + # Check second file (workflow-level, no samples) matrix_file = next(f for f in data["output_files"] if "matrix" in f["uri"]) assert len(matrix_file["samples"]) == 0 @@ -215,7 +215,7 @@ def test_search_qcrecords_empty(client: TestClient, session: Session): """ response = client.get("/api/v1/qcmetrics/search") assert response.status_code == 200 - + data = response.json() assert data["total"] == 0 assert data["data"] == [] @@ -231,11 +231,11 @@ def test_search_qcrecords_by_project_id(client: TestClient, session: Session): "metadata": {"pipeline": "RNA-Seq"} } client.post("/api/v1/qcmetrics?created_by=test_user", json=qcrecord_data) - + # Search for it response = client.get("/api/v1/qcmetrics/search?project_id=P-SEARCH-001") assert response.status_code == 200 - + data = response.json() assert data["total"] == 1 assert data["data"][0]["project_id"] == "P-SEARCH-001" @@ -251,20 +251,20 @@ def test_search_qcrecords_latest_only(client: TestClient, session: Session): "metadata": {"version": "1.0"} } client.post("/api/v1/qcmetrics?created_by=test_user", json=qcrecord_data_1) - + qcrecord_data_2 = { "project_id": "P-LATEST-001", "metadata": {"version": "2.0"} # Different metadata, so not a duplicate } client.post("/api/v1/qcmetrics?created_by=test_user", json=qcrecord_data_2) - + # 
Search with latest=true (default) response = client.get("/api/v1/qcmetrics/search?project_id=P-LATEST-001&latest=true") assert response.status_code == 200 - + data = response.json() assert data["total"] == 1 - + # Should be version 2.0 (the latest) metadata_dict = {m["key"]: m["value"] for m in data["data"][0]["metadata"]} assert metadata_dict["version"] == "2.0" @@ -280,17 +280,17 @@ def test_search_qcrecords_all_versions(client: TestClient, session: Session): "metadata": {"version": "1.0"} } client.post("/api/v1/qcmetrics?created_by=test_user", json=qcrecord_data_1) - + qcrecord_data_2 = { "project_id": "P-ALLVER-001", "metadata": {"version": "2.0"} } client.post("/api/v1/qcmetrics?created_by=test_user", json=qcrecord_data_2) - + # Search with latest=false response = client.get("/api/v1/qcmetrics/search?project_id=P-ALLVER-001&latest=false") assert response.status_code == 200 - + data = response.json() assert data["total"] == 2 @@ -308,7 +308,7 @@ def test_search_qcrecords_post_with_metadata_filter(client: TestClient, session: "project_id": "P-META-002", "metadata": {"pipeline": "WES"} }) - + # Search for RNA-Seq pipeline only search_request = { "filter_on": { @@ -317,7 +317,7 @@ def test_search_qcrecords_post_with_metadata_filter(client: TestClient, session: } response = client.post("/api/v1/qcmetrics/search", json=search_request) assert response.status_code == 200 - + data = response.json() assert data["total"] == 1 assert data["data"][0]["project_id"] == "P-META-001" @@ -333,11 +333,11 @@ def test_get_qcrecord_by_id(client: TestClient, session: Session): "metadata": {"pipeline": "RNA-Seq"} }) qcrecord_id = create_response.json()["id"] - + # Get by ID response = client.get(f"/api/v1/qcmetrics/{qcrecord_id}") assert response.status_code == 200 - + data = response.json() assert data["id"] == qcrecord_id assert data["project_id"] == "P-GET-001" @@ -370,12 +370,12 @@ def test_delete_qcrecord(client: TestClient, session: Session): "metadata": {"pipeline": "RNA-Seq"} 
}) qcrecord_id = create_response.json()["id"] - + # Delete it response = client.delete(f"/api/v1/qcmetrics/{qcrecord_id}") assert response.status_code == 200 assert response.json()["status"] == "deleted" - + # Verify it's gone get_response = client.get(f"/api/v1/qcmetrics/{qcrecord_id}") assert get_response.status_code == 404 @@ -398,17 +398,17 @@ def test_duplicate_detection(client: TestClient, session: Session): "project_id": "P-DUP-001", "metadata": {"pipeline": "RNA-Seq", "version": "2.0"} } - + # Create first record response1 = client.post("/api/v1/qcmetrics?created_by=user1", json=qcrecord_data) assert response1.status_code == 201 id1 = response1.json()["id"] - + # Try to create identical record response2 = client.post("/api/v1/qcmetrics?created_by=user2", json=qcrecord_data) assert response2.status_code == 201 id2 = response2.json()["id"] - + # Should return the same record (duplicate detection) assert id1 == id2 @@ -437,17 +437,17 @@ def test_legacy_sample_level_metrics_format(client: TestClient, session: Session json=qcrecord_data ) assert response.status_code == 201 - + data = response.json() - + # Legacy format should be converted to metrics assert len(data["metrics"]) == 2 - + # Check that sample names are preserved metric_sample_names = set() for metric in data["metrics"]: for sample in metric["samples"]: metric_sample_names.add(sample["sample_name"]) - + assert "Sample1" in metric_sample_names assert "Sample2" in metric_sample_names From 4d609bc11d17138d99d247ee96cdf00c1e5e6d92 Mon Sep 17 00:00:00 2001 From: vasques1 Date: Fri, 30 Jan 2026 12:57:20 -0500 Subject: [PATCH 10/13] add comment re: created_by field which should be taken from auth token when this is implemented --- api/qcmetrics/routes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/api/qcmetrics/routes.py b/api/qcmetrics/routes.py index 03a8fb3..58e2be1 100644 --- a/api/qcmetrics/routes.py +++ b/api/qcmetrics/routes.py @@ -38,6 +38,9 @@ def create_qcrecord( The record stores quality 
control metrics from a pipeline execution. + Right now - created_by is just a string username passed as a query parameter; once + authentication is in place, this will be derived from the logged-in user. + **Request body format:** ```json From dc04f2bbe8b401995b04a3761880d55eb3cd7a8a Mon Sep 17 00:00:00 2001 From: vasques1 Date: Fri, 30 Jan 2026 14:50:59 -0500 Subject: [PATCH 11/13] rm handling legacy sample_level_metrics to unify; correctly handle numeric types --- api/qcmetrics/models.py | 8 ++-- api/qcmetrics/routes.py | 34 +++++++++++++- api/qcmetrics/services.py | 17 ++----- tests/api/test_qcmetrics.py | 88 ++++++++++++++++++++++++++++--------- 4 files changed, 106 insertions(+), 41 deletions(-) diff --git a/api/qcmetrics/models.py b/api/qcmetrics/models.py index 2a23c5a..4f112d4 100644 --- a/api/qcmetrics/models.py +++ b/api/qcmetrics/models.py @@ -175,19 +175,19 @@ class MetricInput(SQLModel): """Input model for a metric group.""" name: str samples: List[MetricSampleInput] | None = None - values: dict[str, str] # {"reads": "50000000", "alignment_rate": "95.5"} + values: dict[str, str | int | float] # {"reads": 50000000, "alignment_rate": 95.5} class QCRecordCreate(SQLModel): """ Request model for creating a QC record. - Accepts both the new explicit format and backward-compatible formats. + Uses the explicit metrics format with sample associations supporting + workflow-level, single-sample, and paired-sample (tumor/normal) metrics. 
""" project_id: str metadata: dict[str, str] | None = None # {"pipeline": "RNA-Seq", "version": "2.0"} - metrics: List[MetricInput] | None = None # New format with explicit sample associations - sample_level_metrics: dict[str, dict[str, str]] | None = None # Legacy ES format + metrics: List[MetricInput] | None = None # Metrics with explicit sample associations output_files: List[FileRecordCreate] | None = None model_config = ConfigDict(extra="forbid") diff --git a/api/qcmetrics/routes.py b/api/qcmetrics/routes.py index 58e2be1..9c94b84 100644 --- a/api/qcmetrics/routes.py +++ b/api/qcmetrics/routes.py @@ -38,8 +38,38 @@ def create_qcrecord( The record stores quality control metrics from a pipeline execution. - Right now - created_by is just a string username passed as a query parameter; once - authentication is in place, this will be derived from the logged-in user. + **Note:** Right now `created_by` is just a string username passed as a query parameter; + once authentication is in place, this will be derived from the logged-in user. + + **Example curl command:** + + ```bash + curl -X POST "http://localhost:8000/api/v1/qcmetrics?created_by=jsmith" \\ + -H "Content-Type: application/json" \\ + -d '{ + "project_id": "P-1234", + "metadata": { + "pipeline": "RNA-Seq", + "version": "2.0.0" + }, + "metrics": [ + { + "name": "alignment_stats", + "samples": [{"sample_name": "Sample1"}], + "values": {"reads": "50000000", "alignment_rate": "95.5"} + } + ], + "output_files": [ + { + "uri": "s3://bucket/path/file.bam", + "size": 123456789, + "samples": [{"sample_name": "Sample1"}], + "hash": {"md5": "abc123def456"}, + "tags": {"type": "alignment"} + } + ] + }' + ``` **Request body format:** diff --git a/api/qcmetrics/services.py b/api/qcmetrics/services.py index aad81e9..f4c8d5b 100644 --- a/api/qcmetrics/services.py +++ b/api/qcmetrics/services.py @@ -50,8 +50,8 @@ def create_qcrecord( """ Create a new QC record with all associated data. 
- Handles both the new explicit format (metrics with samples) and - the legacy ES format (sample_level_metrics dict). + Metrics can have numeric values (int, float) which are stored as strings + in the database. """ # Check for duplicate record existing = _check_duplicate_record(session, qcrecord_create) @@ -82,22 +82,11 @@ def create_qcrecord( ) session.add(metadata_entry) - # Add metrics (new format) + # Add metrics if qcrecord_create.metrics: for metric_input in qcrecord_create.metrics: _create_metric(session, qcrecord.id, metric_input) - # Handle legacy sample_level_metrics format (ES compatibility) - if qcrecord_create.sample_level_metrics: - for sample_name, metrics_dict in qcrecord_create.sample_level_metrics.items(): - # Convert to new format: one metric group per sample - metric_input = MetricInput( - name=f"sample_metrics_{sample_name}", - samples=[{"sample_name": sample_name}], - values=metrics_dict, - ) - _create_metric(session, qcrecord.id, metric_input) - # Add output files if qcrecord_create.output_files: for file_create in qcrecord_create.output_files: diff --git a/tests/api/test_qcmetrics.py b/tests/api/test_qcmetrics.py index c422254..1710a20 100644 --- a/tests/api/test_qcmetrics.py +++ b/tests/api/test_qcmetrics.py @@ -413,23 +413,32 @@ def test_duplicate_detection(client: TestClient, session: Session): assert id1 == id2 -def test_legacy_sample_level_metrics_format(client: TestClient, session: Session): +def test_numeric_metric_values(client: TestClient, session: Session): """ - Test backward compatibility with the legacy ES format (sample_level_metrics). + Test that numeric metric values (int, float) are accepted and stored as strings. + + This matches the legacy ES format where values like QC_ForwardReadCount=122483575 + were numeric rather than string. 
""" qcrecord_data = { - "project_id": "P-LEGACY-001", + "project_id": "P-NUMERIC-001", "metadata": {"pipeline": "RNA-Seq"}, - "sample_level_metrics": { - "Sample1": { - "reads": "50000000", - "alignment_rate": "95.5" - }, - "Sample2": { - "reads": "45000000", - "alignment_rate": "93.2" + "metrics": [ + { + "name": "sample_qc_metrics", + "samples": [{"sample_name": "SampleA"}], + "values": { + "QC_ForwardReadCount": 122483575, # int + "QC_ReverseReadCount": 122483575, # int + "QC_FractionContaminatedReads": 0, # int (zero) + "QC_MeanReadLength": 150, # int + "QC_FractionReadsAligned": 0.587, # float + "QC_StrandBalance": 0.5, # float + "QC_Median5Bias": 0.395753, # float + "QC_DynamicRange": 2452.4661796537 # float with high precision + } } - } + ] } response = client.post( @@ -439,15 +448,52 @@ def test_legacy_sample_level_metrics_format(client: TestClient, session: Session assert response.status_code == 201 data = response.json() + assert len(data["metrics"]) == 1 + + metric = data["metrics"][0] + assert metric["name"] == "sample_qc_metrics" + assert len(metric["samples"]) == 1 + assert metric["samples"][0]["sample_name"] == "SampleA" + + # Values should be stored as strings + values_dict = {v["key"]: v["value"] for v in metric["values"]} + assert values_dict["QC_ForwardReadCount"] == "122483575" + assert values_dict["QC_FractionReadsAligned"] == "0.587" + assert values_dict["QC_DynamicRange"] == "2452.4661796537" + + +def test_mixed_string_and_numeric_values(client: TestClient, session: Session): + """ + Test that both string and numeric values can be provided in the same metric. 
+ """ + qcrecord_data = { + "project_id": "P-MIXED-001", + "metadata": {"pipeline": "RNA-Seq"}, + "metrics": [ + { + "name": "alignment_stats", + "samples": [{"sample_name": "Sample1"}], + "values": { + "total_reads": 50000000, # numeric int + "alignment_rate": 97.5, # numeric float + "reference_genome": "GRCh38", # string + "status": "passed" # string + } + } + ] + } - # Legacy format should be converted to metrics - assert len(data["metrics"]) == 2 + response = client.post( + "/api/v1/qcmetrics?created_by=test_user", + json=qcrecord_data + ) + assert response.status_code == 201 - # Check that sample names are preserved - metric_sample_names = set() - for metric in data["metrics"]: - for sample in metric["samples"]: - metric_sample_names.add(sample["sample_name"]) + data = response.json() + values_dict = {v["key"]: v["value"] for v in data["metrics"][0]["values"]} - assert "Sample1" in metric_sample_names - assert "Sample2" in metric_sample_names + # All values should be strings in the response + assert values_dict["total_reads"] == "50000000" + assert values_dict["alignment_rate"] == "97.5" + assert values_dict["reference_genome"] == "GRCh38" + assert values_dict["status"] == "passed" From 306c5d26c3e5bf769b0c3144dbb22604d49064af Mon Sep 17 00:00:00 2001 From: vasques1 Date: Fri, 30 Jan 2026 14:59:36 -0500 Subject: [PATCH 12/13] store metric value type so it can be re-cast on return --- ...5e6_add_qcmetrics_and_filerecord_tables.py | 6 ++- api/qcmetrics/models.py | 8 +++- api/qcmetrics/services.py | 28 +++++++++++++- tests/api/test_qcmetrics.py | 37 ++++++++++++++----- 4 files changed, 65 insertions(+), 14 deletions(-) diff --git a/alembic/versions/f1a2b3c4d5e6_add_qcmetrics_and_filerecord_tables.py b/alembic/versions/f1a2b3c4d5e6_add_qcmetrics_and_filerecord_tables.py index 4c489f2..0389b16 100644 --- a/alembic/versions/f1a2b3c4d5e6_add_qcmetrics_and_filerecord_tables.py +++ b/alembic/versions/f1a2b3c4d5e6_add_qcmetrics_and_filerecord_tables.py @@ -117,13 
+117,17 @@ def upgrade() -> None: sa.UniqueConstraint('qcrecord_id', 'name', name='uq_qcmetric_record_name') ) - # qcmetricvalue - metric values + # qcmetricvalue - metric values with type preservation op.create_table( 'qcmetricvalue', sa.Column('id', sa.Uuid(), nullable=False), sa.Column('qc_metric_id', sa.Uuid(), nullable=False), sa.Column('key', sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False), sa.Column('value', sa.Text(), nullable=False), + sa.Column( + 'value_type', sqlmodel.sql.sqltypes.AutoString(length=10), + nullable=False, server_default='str' + ), sa.ForeignKeyConstraint(['qc_metric_id'], ['qcmetric.id'], ondelete='CASCADE'), sa.PrimaryKeyConstraint('id'), sa.UniqueConstraint('qc_metric_id', 'key', name='uq_qcmetricvalue_metric_key') diff --git a/api/qcmetrics/models.py b/api/qcmetrics/models.py index 4f112d4..8236b5c 100644 --- a/api/qcmetrics/models.py +++ b/api/qcmetrics/models.py @@ -46,6 +46,9 @@ class QCMetricValue(SQLModel, table=True): """ Key-value store for individual metric values within a metric group. Examples: reads=50000000, alignment_rate=95.5, tmb=8.5 + + The value_type column preserves the original Python type so values + can be returned in their original format (int, float, or str). 
""" __tablename__ = "qcmetricvalue" @@ -53,6 +56,7 @@ class QCMetricValue(SQLModel, table=True): qc_metric_id: uuid.UUID = Field(foreign_key="qcmetric.id", nullable=False) key: str = Field(max_length=255, nullable=False) value: str = Field(nullable=False) + value_type: str = Field(max_length=10, default="str") # "str", "int", "float" # Relationship back to parent qc_metric: "QCMetric" = Relationship(back_populates="values") @@ -194,9 +198,9 @@ class QCRecordCreate(SQLModel): class MetricValuePublic(SQLModel): - """Public representation of a metric value.""" + """Public representation of a metric value with original type preserved.""" key: str - value: str + value: str | int | float class MetricSamplePublic(SQLModel): diff --git a/api/qcmetrics/services.py b/api/qcmetrics/services.py index f4c8d5b..04b11fc 100644 --- a/api/qcmetrics/services.py +++ b/api/qcmetrics/services.py @@ -135,12 +135,24 @@ def _create_metric( ) session.add(sample_assoc) - # Add metric values + # Add metric values with type preservation for key, value in metric_input.values.items(): + # Determine the original type + if isinstance(value, bool): + # bool is subclass of int, so check first + value_type = "str" # Store bools as strings + elif isinstance(value, int): + value_type = "int" + elif isinstance(value, float): + value_type = "float" + else: + value_type = "str" + metric_value = QCMetricValue( qc_metric_id=metric.id, key=key, value=str(value), + value_type=value_type, ) session.add(metric_value) @@ -372,6 +384,15 @@ def delete_qcrecord(session: Session, qcrecord_id: str) -> dict: return {"status": "deleted", "id": qcrecord_id} +def _convert_value_to_type(value: str, value_type: str) -> str | int | float: + """Convert a string value back to its original type.""" + if value_type == "int": + return int(value) + elif value_type == "float": + return float(value) + return value + + def _qcrecord_to_public(session: Session, record: QCRecord) -> QCRecordPublic: """Convert a QCRecord database 
object to public format.""" # Get metadata @@ -410,7 +431,10 @@ def _qcrecord_to_public(session: Session, record: QCRecord) -> QCRecordPublic: for s in samples ], values=[ - MetricValuePublic(key=v.key, value=v.value) + MetricValuePublic( + key=v.key, + value=_convert_value_to_type(v.value, v.value_type) + ) for v in values ], )) diff --git a/tests/api/test_qcmetrics.py b/tests/api/test_qcmetrics.py index 1710a20..78496ab 100644 --- a/tests/api/test_qcmetrics.py +++ b/tests/api/test_qcmetrics.py @@ -415,7 +415,8 @@ def test_duplicate_detection(client: TestClient, session: Session): def test_numeric_metric_values(client: TestClient, session: Session): """ - Test that numeric metric values (int, float) are accepted and stored as strings. + Test that numeric metric values (int, float) are accepted and returned + with their original types preserved. This matches the legacy ES format where values like QC_ForwardReadCount=122483575 were numeric rather than string. @@ -455,16 +456,28 @@ def test_numeric_metric_values(client: TestClient, session: Session): assert len(metric["samples"]) == 1 assert metric["samples"][0]["sample_name"] == "SampleA" - # Values should be stored as strings + # Values should be returned with their original types preserved values_dict = {v["key"]: v["value"] for v in metric["values"]} - assert values_dict["QC_ForwardReadCount"] == "122483575" - assert values_dict["QC_FractionReadsAligned"] == "0.587" - assert values_dict["QC_DynamicRange"] == "2452.4661796537" + + # Integer values + assert values_dict["QC_ForwardReadCount"] == 122483575 + assert isinstance(values_dict["QC_ForwardReadCount"], int) + assert values_dict["QC_FractionContaminatedReads"] == 0 + assert isinstance(values_dict["QC_FractionContaminatedReads"], int) + assert values_dict["QC_MeanReadLength"] == 150 + assert isinstance(values_dict["QC_MeanReadLength"], int) + + # Float values + assert values_dict["QC_FractionReadsAligned"] == 0.587 + assert 
isinstance(values_dict["QC_FractionReadsAligned"], float) + assert values_dict["QC_DynamicRange"] == 2452.4661796537 + assert isinstance(values_dict["QC_DynamicRange"], float) def test_mixed_string_and_numeric_values(client: TestClient, session: Session): """ - Test that both string and numeric values can be provided in the same metric. + Test that both string and numeric values can be provided in the same metric, + and each is returned with its original type. """ qcrecord_data = { "project_id": "P-MIXED-001", @@ -492,8 +505,14 @@ def test_mixed_string_and_numeric_values(client: TestClient, session: Session): data = response.json() values_dict = {v["key"]: v["value"] for v in data["metrics"][0]["values"]} - # All values should be strings in the response - assert values_dict["total_reads"] == "50000000" - assert values_dict["alignment_rate"] == "97.5" + # Numeric values returned with original types + assert values_dict["total_reads"] == 50000000 + assert isinstance(values_dict["total_reads"], int) + assert values_dict["alignment_rate"] == 97.5 + assert isinstance(values_dict["alignment_rate"], float) + + # String values remain as strings assert values_dict["reference_genome"] == "GRCh38" + assert isinstance(values_dict["reference_genome"], str) assert values_dict["status"] == "passed" + assert isinstance(values_dict["status"], str) From 75624323a79c74ae0d3eea224d6383c94c46a381 Mon Sep 17 00:00:00 2001 From: vasques1 Date: Fri, 30 Jan 2026 16:00:22 -0500 Subject: [PATCH 13/13] store both string and numeric value representations with index to enable ranged queries for LLM --- ...5e6_add_qcmetrics_and_filerecord_tables.py | 11 +++++-- api/qcmetrics/models.py | 10 ++++-- api/qcmetrics/services.py | 33 ++++++++++++------- 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/alembic/versions/f1a2b3c4d5e6_add_qcmetrics_and_filerecord_tables.py b/alembic/versions/f1a2b3c4d5e6_add_qcmetrics_and_filerecord_tables.py index 0389b16..3d7a8cc 100644 --- 
a/alembic/versions/f1a2b3c4d5e6_add_qcmetrics_and_filerecord_tables.py +++ b/alembic/versions/f1a2b3c4d5e6_add_qcmetrics_and_filerecord_tables.py @@ -117,13 +117,14 @@ def upgrade() -> None: sa.UniqueConstraint('qcrecord_id', 'name', name='uq_qcmetric_record_name') ) - # qcmetricvalue - metric values with type preservation + # qcmetricvalue - metric values with dual storage for string/numeric queries op.create_table( 'qcmetricvalue', sa.Column('id', sa.Uuid(), nullable=False), sa.Column('qc_metric_id', sa.Uuid(), nullable=False), sa.Column('key', sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False), - sa.Column('value', sa.Text(), nullable=False), + sa.Column('value_string', sa.Text(), nullable=False), + sa.Column('value_numeric', sa.Float(), nullable=True), sa.Column( 'value_type', sqlmodel.sql.sqltypes.AutoString(length=10), nullable=False, server_default='str' @@ -132,6 +133,11 @@ def upgrade() -> None: sa.PrimaryKeyConstraint('id'), sa.UniqueConstraint('qc_metric_id', 'key', name='uq_qcmetricvalue_metric_key') ) + # Index on key + value_numeric for efficient numeric range queries + op.create_index( + 'ix_qcmetricvalue_key_numeric', 'qcmetricvalue', + ['key', 'value_numeric'] + ) # qcmetricsample - sample associations for metrics op.create_table( @@ -151,6 +157,7 @@ def downgrade() -> None: # Drop QCRecord tables (in reverse order of creation) op.drop_table('qcmetricsample') + op.drop_index('ix_qcmetricvalue_key_numeric', table_name='qcmetricvalue') op.drop_table('qcmetricvalue') op.drop_table('qcmetric') op.drop_table('qcrecordmetadata') diff --git a/api/qcmetrics/models.py b/api/qcmetrics/models.py index 8236b5c..de81b19 100644 --- a/api/qcmetrics/models.py +++ b/api/qcmetrics/models.py @@ -47,15 +47,19 @@ class QCMetricValue(SQLModel, table=True): Key-value store for individual metric values within a metric group. 
Examples: reads=50000000, alignment_rate=95.5, tmb=8.5 - The value_type column preserves the original Python type so values - can be returned in their original format (int, float, or str). + Stores values in two formats: + - value_string: Always populated, used for string matching and display + - value_numeric: Populated only for int/float types, enables numeric queries + (greater than, less than, range, aggregations) + - value_type: Preserves original Python type ("str", "int", "float") """ __tablename__ = "qcmetricvalue" id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True) qc_metric_id: uuid.UUID = Field(foreign_key="qcmetric.id", nullable=False) key: str = Field(max_length=255, nullable=False) - value: str = Field(nullable=False) + value_string: str = Field(nullable=False) + value_numeric: float | None = Field(default=None, nullable=True) # For numeric queries value_type: str = Field(max_length=10, default="str") # "str", "int", "float" # Relationship back to parent diff --git a/api/qcmetrics/services.py b/api/qcmetrics/services.py index 04b11fc..0115918 100644 --- a/api/qcmetrics/services.py +++ b/api/qcmetrics/services.py @@ -135,23 +135,28 @@ def _create_metric( ) session.add(sample_assoc) - # Add metric values with type preservation + # Add metric values with type preservation and dual storage for key, value in metric_input.values.items(): - # Determine the original type + # Determine the original type and numeric value if isinstance(value, bool): # bool is subclass of int, so check first - value_type = "str" # Store bools as strings + value_type = "str" + value_numeric = None elif isinstance(value, int): value_type = "int" + value_numeric = float(value) # Store as float for consistent numeric ops elif isinstance(value, float): value_type = "float" + value_numeric = value else: value_type = "str" + value_numeric = None metric_value = QCMetricValue( qc_metric_id=metric.id, key=key, - value=str(value), + value_string=str(value), + 
value_numeric=value_numeric, value_type=value_type, ) session.add(metric_value) @@ -384,13 +389,15 @@ def delete_qcrecord(session: Session, qcrecord_id: str) -> dict: return {"status": "deleted", "id": qcrecord_id} -def _convert_value_to_type(value: str, value_type: str) -> str | int | float: - """Convert a string value back to its original type.""" - if value_type == "int": - return int(value) - elif value_type == "float": - return float(value) - return value +def _convert_value_to_type( + value_string: str, value_numeric: float | None, value_type: str +) -> str | int | float: + """Convert stored values back to their original type.""" + if value_type == "int" and value_numeric is not None: + return int(value_numeric) + elif value_type == "float" and value_numeric is not None: + return value_numeric + return value_string def _qcrecord_to_public(session: Session, record: QCRecord) -> QCRecordPublic: @@ -433,7 +440,9 @@ def _qcrecord_to_public(session: Session, record: QCRecord) -> QCRecordPublic: values=[ MetricValuePublic( key=v.key, - value=_convert_value_to_type(v.value, v.value_type) + value=_convert_value_to_type( + v.value_string, v.value_numeric, v.value_type + ) ) for v in values ],