diff --git a/alembic/versions/f1a2b3c4d5e6_add_qcmetrics_and_filerecord_tables.py b/alembic/versions/f1a2b3c4d5e6_add_qcmetrics_and_filerecord_tables.py new file mode 100644 index 0000000..3d7a8cc --- /dev/null +++ b/alembic/versions/f1a2b3c4d5e6_add_qcmetrics_and_filerecord_tables.py @@ -0,0 +1,172 @@ +"""Add QCMetrics and FileRecord tables + +Revision ID: f1a2b3c4d5e6 +Revises: e158df5a8df1 +Create Date: 2026-01-29 16:45:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +import sqlmodel + + +# revision identifiers, used by Alembic. +revision: str = 'f1a2b3c4d5e6' +down_revision: Union[str, Sequence[str], None] = 'e158df5a8df1' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Create QCMetrics and FileRecord tables.""" + + # ======================================================================== + # FileRecord Tables (reusable across QCRecord, Sample, etc.) 
+ # ======================================================================== + + # filerecord - main file metadata table + op.create_table( + 'filerecord', + sa.Column('id', sa.Uuid(), nullable=False), + sa.Column('entity_type', sqlmodel.sql.sqltypes.AutoString(length=50), nullable=False), + sa.Column('entity_id', sa.Uuid(), nullable=False), + sa.Column('uri', sqlmodel.sql.sqltypes.AutoString(length=1024), nullable=False), + sa.Column('size', sa.BigInteger(), nullable=True), + sa.Column('created_on', sa.DateTime(), nullable=True), + sa.PrimaryKeyConstraint('id') + ) + op.create_index( + 'ix_filerecord_entity', + 'filerecord', + ['entity_type', 'entity_id'] + ) + + # filerecordhash - hash values for files + op.create_table( + 'filerecordhash', + sa.Column('id', sa.Uuid(), nullable=False), + sa.Column('file_record_id', sa.Uuid(), nullable=False), + sa.Column('algorithm', sqlmodel.sql.sqltypes.AutoString(length=50), nullable=False), + sa.Column('value', sqlmodel.sql.sqltypes.AutoString(length=128), nullable=False), + sa.ForeignKeyConstraint(['file_record_id'], ['filerecord.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('file_record_id', 'algorithm', name='uq_filerecordhash_file_algorithm') + ) + + # filerecordtag - key-value tags for files + op.create_table( + 'filerecordtag', + sa.Column('id', sa.Uuid(), nullable=False), + sa.Column('file_record_id', sa.Uuid(), nullable=False), + sa.Column('key', sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False), + sa.Column('value', sa.Text(), nullable=False), + sa.ForeignKeyConstraint(['file_record_id'], ['filerecord.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('file_record_id', 'key', name='uq_filerecordtag_file_key') + ) + + # filerecordsample - sample associations for files + op.create_table( + 'filerecordsample', + sa.Column('id', sa.Uuid(), nullable=False), + sa.Column('file_record_id', sa.Uuid(), nullable=False), + sa.Column('sample_name', 
sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False), + sa.Column('role', sqlmodel.sql.sqltypes.AutoString(length=50), nullable=True), + sa.ForeignKeyConstraint(['file_record_id'], ['filerecord.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('file_record_id', 'sample_name', name='uq_filerecordsample_file_sample') + ) + + # ======================================================================== + # QCRecord Tables + # ======================================================================== + + # qcrecord - main QC record table + op.create_table( + 'qcrecord', + sa.Column('id', sa.Uuid(), nullable=False), + sa.Column('created_on', sa.DateTime(), nullable=False), + sa.Column('created_by', sqlmodel.sql.sqltypes.AutoString(length=100), nullable=False), + sa.Column('project_id', sqlmodel.sql.sqltypes.AutoString(length=50), nullable=False), + sa.PrimaryKeyConstraint('id') + ) + op.create_index('ix_qcrecord_project_id', 'qcrecord', ['project_id']) + + # qcrecordmetadata - pipeline-level metadata + op.create_table( + 'qcrecordmetadata', + sa.Column('id', sa.Uuid(), nullable=False), + sa.Column('qcrecord_id', sa.Uuid(), nullable=False), + sa.Column('key', sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False), + sa.Column('value', sa.Text(), nullable=False), + sa.ForeignKeyConstraint(['qcrecord_id'], ['qcrecord.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('qcrecord_id', 'key', name='uq_qcrecordmetadata_record_key') + ) + + # qcmetric - named metric groups + op.create_table( + 'qcmetric', + sa.Column('id', sa.Uuid(), nullable=False), + sa.Column('qcrecord_id', sa.Uuid(), nullable=False), + sa.Column('name', sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False), + sa.ForeignKeyConstraint(['qcrecord_id'], ['qcrecord.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('qcrecord_id', 'name', name='uq_qcmetric_record_name') + ) + + # qcmetricvalue - metric 
def downgrade() -> None:
    """Undo ``upgrade()`` by dropping every table and index it created.

    Child tables are dropped before the tables they reference, and an index
    is dropped immediately before the table it belongs to.
    """
    # Each entry is (table, index to drop first or None). The sequence is the
    # exact teardown order: QCRecord family first, then FileRecord family.
    teardown_plan = (
        ('qcmetricsample', None),
        ('qcmetricvalue', 'ix_qcmetricvalue_key_numeric'),
        ('qcmetric', None),
        ('qcrecordmetadata', None),
        ('qcrecord', 'ix_qcrecord_project_id'),
        ('filerecordsample', None),
        ('filerecordtag', None),
        ('filerecordhash', None),
        ('filerecord', 'ix_filerecord_entity'),
    )

    for table, index in teardown_plan:
        if index is not None:
            op.drop_index(index, table_name=table)
        op.drop_table(table)
class FileRecordTag(SQLModel, table=True):
    """
    Key-value tags for file records.
    Allows arbitrary metadata to be attached to files.

    Each row holds one ``key``/``value`` pair for the file record referenced
    by ``file_record_id``. The unique constraint declared below allows a
    given key at most once per file record.
    """
    __tablename__ = "filerecordtag"

    id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True)
    # NOTE(review): the migration in this diff creates this FK with
    # ondelete='CASCADE'; the model declaration here does not mention it, so
    # ORM-level deletes rely on the cascade configured on FileRecord.tags.
    file_record_id: uuid.UUID = Field(foreign_key="filerecord.id", nullable=False)
    key: str = Field(max_length=255, nullable=False)
    value: str = Field(nullable=False)

    # Relationship back to parent
    file_record: "FileRecord" = Relationship(back_populates="tags")

    __table_args__ = (
        # One value per (file record, key) pair.
        UniqueConstraint("file_record_id", "key", name="uq_filerecordtag_file_key"),
    )
class FileRecord(SQLModel, table=True):
    """
    Metadata record for files stored in external locations (S3, etc.).

    Uses polymorphic association via entity_type and entity_id to link
    to parent entities (QCRecord, Sample, etc.) without hard FK constraints.

    Because there is no foreign key from ``entity_id`` to the owning table,
    the database does not remove file records when the owning entity is
    deleted; that has to be done by application code.
    """
    __tablename__ = "filerecord"

    id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True)
    # NOTE(review): annotated as the FileRecordEntityType enum here, while the
    # migration in this diff creates the column as AutoString(length=50).
    # Confirm both resolve to the same column type on the target database.
    entity_type: FileRecordEntityType = Field(nullable=False)
    # NOTE(review): the migration also creates the composite index
    # ix_filerecord_entity on (entity_type, entity_id); it is not declared on
    # the model, so schema autogeneration may report it as a difference.
    entity_id: uuid.UUID = Field(nullable=False)
    uri: str = Field(max_length=1024, nullable=False)
    size: int | None = Field(default=None)  # File size in bytes
    created_on: datetime | None = Field(default=None)  # File creation timestamp

    # Relationships to child tables
    # "all, delete-orphan" makes the ORM delete child rows together with this
    # record and when they are removed from the collection.
    hashes: List["FileRecordHash"] = Relationship(
        back_populates="file_record",
        sa_relationship_kwargs={"cascade": "all, delete-orphan"}
    )
    tags: List["FileRecordTag"] = Relationship(
        back_populates="file_record",
        sa_relationship_kwargs={"cascade": "all, delete-orphan"}
    )
    samples: List["FileRecordSample"] = Relationship(
        back_populates="file_record",
        sa_relationship_kwargs={"cascade": "all, delete-orphan"}
    )

    model_config = ConfigDict(from_attributes=True)
class FileRecordPublic(SQLModel):
    """
    Public representation of a file record.

    Carries the file's own fields plus its hashes, tags and sample
    associations. The owning entity (``entity_type`` / ``entity_id``) is not
    part of this response model.
    """
    id: uuid.UUID
    uri: str
    # size and created_on have no default: the keys are always present in the
    # response, but their values may be null.
    size: int | None
    created_on: datetime | None
    hashes: List[HashPublic]
    tags: List[TagPublic]
    samples: List[SamplePublic]
class QCMetricValue(SQLModel, table=True):
    """
    Key-value store for individual metric values within a metric group.
    Examples: reads=50000000, alignment_rate=95.5, tmb=8.5

    Each value is stored across three columns:
    - value_string: Always populated, used for string matching and display
    - value_numeric: Populated only for int/float types, enables numeric queries
      (greater than, less than, range, aggregations)
    - value_type: Preserves original Python type ("str", "int", "float")

    A key may appear at most once per metric group (unique constraint on
    ``qc_metric_id`` + ``key`` below).
    """
    __tablename__ = "qcmetricvalue"

    id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True)
    qc_metric_id: uuid.UUID = Field(foreign_key="qcmetric.id", nullable=False)
    key: str = Field(max_length=255, nullable=False)
    value_string: str = Field(nullable=False)
    # NOTE(review): the migration in this diff creates the index
    # ix_qcmetricvalue_key_numeric on (key, value_numeric); it is not declared
    # on the model.
    value_numeric: float | None = Field(default=None, nullable=True)  # For numeric queries
    # NOTE(review): the migration additionally sets server_default='str' on
    # this column; only the Python-side default is declared here.
    value_type: str = Field(max_length=10, default="str")  # "str", "int", "float"

    # Relationship back to parent
    qc_metric: "QCMetric" = Relationship(back_populates="values")

    __table_args__ = (
        UniqueConstraint("qc_metric_id", "key", name="uq_qcmetricvalue_metric_key"),
    )
class QCMetric(SQLModel, table=True):
    """
    A named group of metrics within a QC record.

    Can be workflow-level (no samples), single-sample, or multi-sample (paired).
    Examples: alignment_stats, somatic_variants, expression_summary

    A metric name may appear at most once per QC record (unique constraint on
    ``qcrecord_id`` + ``name`` below).
    """
    __tablename__ = "qcmetric"

    id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True)
    qcrecord_id: uuid.UUID = Field(foreign_key="qcrecord.id", nullable=False)
    name: str = Field(max_length=255, nullable=False)

    # Relationships to child tables
    # "all, delete-orphan" makes the ORM delete values and sample associations
    # together with this metric and when they are removed from the collection.
    values: List["QCMetricValue"] = Relationship(
        back_populates="qc_metric",
        sa_relationship_kwargs={"cascade": "all, delete-orphan"}
    )
    samples: List["QCMetricSample"] = Relationship(
        back_populates="qc_metric",
        sa_relationship_kwargs={"cascade": "all, delete-orphan"}
    )

    # Relationship back to parent
    qcrecord: "QCRecord" = Relationship(back_populates="metrics")

    __table_args__ = (
        UniqueConstraint("qcrecord_id", "name", name="uq_qcmetric_record_name"),
    )
class MetricInput(SQLModel):
    """
    Input model for a metric group.

    ``name`` identifies the group within a QC record. ``samples`` is optional:
    leaving it out (or passing null) describes a workflow-level metric with no
    sample association. ``values`` is required and maps each metric key to a
    string, integer or float value.
    """
    name: str
    samples: List[MetricSampleInput] | None = None
    values: dict[str, str | int | float]  # {"reads": 50000000, "alignment_rate": 95.5}
class QCRecordSearchRequest(SQLModel):
    """
    Request model for searching QC records.

    ``page`` and ``per_page`` carry the same bounds as the query-parameter
    search endpoint (page >= 1, 1 <= per_page <= 1000), so an out-of-range
    value is rejected during request validation instead of reaching the
    service layer, where a non-positive page would turn into negative slice
    indices.
    """
    filter_on: dict | None = None  # Flexible filtering
    page: int = Field(default=1, ge=1)  # 1-based page number
    per_page: int = Field(default=100, ge=1, le=1000)  # Page size, capped at 1000
    latest: bool = True  # Return only newest version per project

    # Unknown top-level keys in the request body are rejected.
    model_config = ConfigDict(extra="forbid")
+""" + +from typing import Optional +from fastapi import APIRouter, Query, status + +from api.qcmetrics.models import ( + QCRecordCreate, + QCRecordPublic, + QCRecordsPublic, + QCRecordSearchRequest, +) +from api.qcmetrics import services +from core.deps import SessionDep + +router = APIRouter(prefix="/qcmetrics", tags=["QC Metrics"]) + + +@router.post( + "", + response_model=QCRecordPublic, + status_code=status.HTTP_201_CREATED, + summary="Create a new QC record", +) +def create_qcrecord( + session: SessionDep, + qcrecord_create: QCRecordCreate, + created_by: str = Query( + ..., + description="Username of the person creating this record" + ), +) -> QCRecordPublic: + """ + Create a new QC record with metrics and output files. + + The record stores quality control metrics from a pipeline execution. + + **Note:** Right now `created_by` is just a string username passed as a query parameter; + once authentication is in place, this will be derived from the logged-in user. + + **Example curl command:** + + ```bash + curl -X POST "http://localhost:8000/api/v1/qcmetrics?created_by=jsmith" \\ + -H "Content-Type: application/json" \\ + -d '{ + "project_id": "P-1234", + "metadata": { + "pipeline": "RNA-Seq", + "version": "2.0.0" + }, + "metrics": [ + { + "name": "alignment_stats", + "samples": [{"sample_name": "Sample1"}], + "values": {"reads": "50000000", "alignment_rate": "95.5"} + } + ], + "output_files": [ + { + "uri": "s3://bucket/path/file.bam", + "size": 123456789, + "samples": [{"sample_name": "Sample1"}], + "hash": {"md5": "abc123def456"}, + "tags": {"type": "alignment"} + } + ] + }' + ``` + + **Request body format:** + + ```json + { + "project_id": "P-1234", + "metadata": { + "pipeline": "RNA-Seq", + "version": "2.0.0" + }, + "metrics": [ + { + "name": "alignment_stats", + "samples": [{"sample_name": "Sample1"}], + "values": {"reads": "50000000", "alignment_rate": "95.5"} + } + ], + "output_files": [ + { + "uri": "s3://bucket/path/file.bam", + "size": 123456789, + 
"samples": [{"sample_name": "Sample1"}], + "hash": {"md5": "abc123..."}, + "tags": {"type": "alignment"} + } + ] + } + ``` + + **Sample association patterns:** + - **Workflow-level**: Omit `samples` array (applies to entire pipeline run) + - **Single sample**: One entry in `samples` array + - **Sample pair**: Two entries with roles, e.g., + `[{"sample_name": "T1", "role": "tumor"}, {"sample_name": "N1", "role": "normal"}]` + + **Duplicate detection:** + If an equivalent record already exists for the project (same metadata), + the existing record is returned instead of creating a duplicate. + """ + return services.create_qcrecord(session, qcrecord_create, created_by) + + +@router.get( + "/search", + response_model=QCRecordsPublic, + summary="Search QC records (GET)", +) +def search_qcrecords_get( + session: SessionDep, + project_id: Optional[str] = Query(None, description="Filter by project ID"), + latest: bool = Query(True, description="Return only newest record per project"), + page: int = Query(1, ge=1, description="Page number"), + per_page: int = Query(100, ge=1, le=1000, description="Results per page"), +) -> QCRecordsPublic: + """ + Search QC records using query parameters. 
@router.post(
    "/search",
    response_model=QCRecordsPublic,
    summary="Search QC records (POST)",
)
def search_qcrecords_post(
    session: SessionDep,
    search_request: QCRecordSearchRequest,
) -> QCRecordsPublic:
    """
    Search QC records using a JSON body for advanced filtering.

    **Request body format:**

    ```json
    {
        "filter_on": {
            "project_id": "P-1234",
            "metadata": {
                "pipeline": "RNA-Seq"
            }
        },
        "page": 1,
        "per_page": 100,
        "latest": true
    }
    ```

    **Filter options:**
    - `project_id`: Single value or list of project IDs
    - `metadata`: Key-value pairs to match against pipeline metadata

    **Pagination:**
    - `page`: Page number (starts at 1)
    - `per_page`: Results per page (max 1000)

    **Latest filtering:**
    - `latest: true` (default): Returns only the newest QC record per project
    - `latest: false`: Returns all matching records (full history)
    """
    # The request fields are forwarded unchanged: this function performs no
    # bounds checking of its own on page / per_page, so the limits described
    # above hold only if QCRecordSearchRequest validation enforces them.
    return services.search_qcrecords(
        session,
        filter_on=search_request.filter_on,
        page=search_request.page,
        per_page=search_request.per_page,
        latest=search_request.latest,
    )
def create_qcrecord(
    session: Session,
    qcrecord_create: QCRecordCreate,
    created_by: str,
) -> QCRecordPublic:
    """
    Persist a QC record together with its metadata, metrics and output files.

    When ``_check_duplicate_record`` reports an existing equivalent record,
    that record is returned and nothing is written. Otherwise the parent row
    and all child rows are added and committed in one transaction.

    Args:
        session: Database session
        qcrecord_create: Validated request payload
        created_by: Username recorded on the new record

    Returns:
        Public representation of the existing or newly created record.
    """
    duplicate = _check_duplicate_record(session, qcrecord_create)
    if duplicate:
        logger.info(
            "Equivalent QC record already exists for project %s: %s",
            qcrecord_create.project_id,
            duplicate.id
        )
        return _qcrecord_to_public(session, duplicate)

    new_record = QCRecord(
        project_id=qcrecord_create.project_id,
        created_by=created_by,
        created_on=datetime.now(timezone.utc),
    )
    session.add(new_record)
    # Flush so the parent row is sent to the database before child rows
    # referencing its id are added.
    session.flush()
    record_id = new_record.id

    # Pipeline-level metadata; values are stored as strings.
    session.add_all(
        QCRecordMetadata(qcrecord_id=record_id, key=meta_key, value=str(meta_value))
        for meta_key, meta_value in (qcrecord_create.metadata or {}).items()
    )

    # Metric groups (each helper call adds the group, its samples and values).
    for metric in qcrecord_create.metrics or ():
        _create_metric(session, record_id, metric)

    # Output files, linked to this record through the polymorphic entity pair.
    for output_file in qcrecord_create.output_files or ():
        _create_file_record(
            session,
            entity_type=FileRecordEntityType.QCRECORD,
            entity_id=record_id,
            file_create=output_file,
        )

    session.commit()
    session.refresh(new_record)

    logger.info(
        "Created QC record %s for project %s by %s",
        new_record.id,
        new_record.project_id,
        created_by
    )

    return _qcrecord_to_public(session, new_record)
def _create_file_record(
    session: Session,
    entity_type: FileRecordEntityType,
    entity_id,
    file_create: FileRecordCreate,
) -> FileRecord:
    """
    Add one file record plus its hash, tag and sample child rows.

    The parent row is added and flushed first; the child rows are then added
    with its id as their foreign key. Nothing is committed here.

    Args:
        session: Database session
        entity_type: Kind of entity that owns the file
        entity_id: ID of the owning entity
        file_create: Validated file payload

    Returns:
        The FileRecord that was added to the session.
    """
    record = FileRecord(
        entity_type=entity_type,
        entity_id=entity_id,
        uri=file_create.uri,
        size=file_create.size,
        created_on=file_create.created_on,
    )
    session.add(record)
    session.flush()
    parent_id = record.id

    # One row per algorithm -> digest pair.
    session.add_all(
        FileRecordHash(file_record_id=parent_id, algorithm=algo, value=digest)
        for algo, digest in (file_create.hash or {}).items()
    )

    # One row per tag; tag values are stored as strings.
    session.add_all(
        FileRecordTag(file_record_id=parent_id, key=tag_key, value=str(tag_value))
        for tag_key, tag_value in (file_create.tags or {}).items()
    )

    # One row per associated sample.
    session.add_all(
        FileRecordSample(
            file_record_id=parent_id,
            sample_name=sample.sample_name,
            role=sample.role,
        )
        for sample in file_create.samples or ()
    )

    return record
def search_qcrecords(
    session: Session,
    filter_on: dict | None = None,
    page: int = 1,
    per_page: int = 100,
    latest: bool = True,
) -> QCRecordsPublic:
    """
    Search for QC records with filtering and pagination.

    Args:
        session: Database session
        filter_on: Dictionary of fields to filter by. Recognised keys are
            "project_id" (a single value, or a list/tuple/set of values) and
            "metadata" (a dict; every key/value pair must match, with values
            compared as strings). Other keys are ignored.
        page: Page number (1-based). Values below 1 are treated as 1.
        per_page: Results per page. Negative values are treated as 0.
        latest: If True, return only the newest record per project

    Returns:
        QCRecordsPublic holding the requested page of records, the total
        number of matches (counted after the "latest" reduction) and the
        effective page / per_page values.
    """
    filter_on = filter_on or {}

    # Normalise paging values. Without this, page <= 0 or a negative per_page
    # yields negative slice indices, which silently select records counted
    # from the END of the result list instead of an empty/first page.
    page = max(page, 1)
    per_page = max(per_page, 0)

    # Build base query
    stmt = select(QCRecord)

    # Apply filters
    if "project_id" in filter_on:
        project_ids = filter_on["project_id"]
        if isinstance(project_ids, (list, tuple, set, frozenset)):
            stmt = stmt.where(col(QCRecord.project_id).in_(list(project_ids)))
        else:
            stmt = stmt.where(QCRecord.project_id == project_ids)

    # Handle metadata filtering: one IN-subquery per key/value pair, so a
    # record must satisfy all of them.
    metadata_filter = filter_on.get("metadata")
    if isinstance(metadata_filter, dict):
        for key, value in metadata_filter.items():
            subq = select(QCRecordMetadata.qcrecord_id).where(
                QCRecordMetadata.key == key,
                QCRecordMetadata.value == str(value)
            )
            stmt = stmt.where(col(QCRecord.id).in_(subq))

    # Newest first; the "latest" reduction below relies on this ordering.
    stmt = stmt.order_by(col(QCRecord.created_on).desc())

    records = list(session.exec(stmt).all())

    # Apply "latest" filter - keep only the first (i.e. newest) record seen
    # for each project. dict preserves insertion order, so the overall
    # newest-first ordering of the survivors is kept.
    if latest:
        newest_per_project: dict = {}
        for record in records:
            newest_per_project.setdefault(record.project_id, record)
        records = list(newest_per_project.values())

    # Paginate in memory
    total = len(records)
    start_idx = (page - 1) * per_page
    page_records = records[start_idx:start_idx + per_page]

    return QCRecordsPublic(
        data=[_qcrecord_to_public(session, record) for record in page_records],
        total=total,
        page=page,
        per_page=per_page,
    )
raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Invalid UUID format: {qcrecord_id}" + ) from exc + + record = session.get(QCRecord, record_uuid) + if not record: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"QC record not found: {qcrecord_id}" + ) + + return _qcrecord_to_public(session, record) + + +def delete_qcrecord(session: Session, qcrecord_id: str) -> dict: + """Delete a QC record and all associated data.""" + try: + record_uuid = uuid_module.UUID(qcrecord_id) + except ValueError as exc: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Invalid UUID format: {qcrecord_id}" + ) from exc + + record = session.get(QCRecord, record_uuid) + if not record: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"QC record not found: {qcrecord_id}" + ) + + # Delete associated file records (polymorphic, not cascade) + file_records = session.exec( + select(FileRecord).where( + FileRecord.entity_type == FileRecordEntityType.QCRECORD, + FileRecord.entity_id == record_uuid + ) + ).all() + + for file_record in file_records: + session.delete(file_record) + + # Delete the QC record (cascades to metadata, metrics, etc.) 
+ session.delete(record) + session.commit() + + logger.info("Deleted QC record %s", qcrecord_id) + + return {"status": "deleted", "id": qcrecord_id} + + +def _convert_value_to_type( + value_string: str, value_numeric: float | None, value_type: str +) -> str | int | float: + """Convert stored values back to their original type.""" + if value_type == "int" and value_numeric is not None: + return int(value_numeric) + elif value_type == "float" and value_numeric is not None: + return value_numeric + return value_string + + +def _qcrecord_to_public(session: Session, record: QCRecord) -> QCRecordPublic: + """Convert a QCRecord database object to public format.""" + # Get metadata + metadata_entries = session.exec( + select(QCRecordMetadata).where( + QCRecordMetadata.qcrecord_id == record.id + ) + ).all() + + metadata = [ + MetadataKeyValue(key=m.key, value=m.value) + for m in metadata_entries + ] + + # Get metrics + metric_entries = session.exec( + select(QCMetric).where(QCMetric.qcrecord_id == record.id) + ).all() + + metrics = [] + for metric in metric_entries: + # Get metric values + values = session.exec( + select(QCMetricValue).where(QCMetricValue.qc_metric_id == metric.id) + ).all() + + # Get metric samples + samples = session.exec( + select(QCMetricSample).where(QCMetricSample.qc_metric_id == metric.id) + ).all() + + metrics.append(MetricPublic( + name=metric.name, + samples=[ + MetricSamplePublic(sample_name=s.sample_name, role=s.role) + for s in samples + ], + values=[ + MetricValuePublic( + key=v.key, + value=_convert_value_to_type( + v.value_string, v.value_numeric, v.value_type + ) + ) + for v in values + ], + )) + + # Get file records + file_records = session.exec( + select(FileRecord).where( + FileRecord.entity_type == FileRecordEntityType.QCRECORD, + FileRecord.entity_id == record.id + ) + ).all() + + output_files = [] + for file_record in file_records: + # Get hashes + hashes = session.exec( + select(FileRecordHash).where( + FileRecordHash.file_record_id 
== file_record.id + ) + ).all() + + # Get tags + tags = session.exec( + select(FileRecordTag).where( + FileRecordTag.file_record_id == file_record.id + ) + ).all() + + # Get samples + samples = session.exec( + select(FileRecordSample).where( + FileRecordSample.file_record_id == file_record.id + ) + ).all() + + output_files.append(FileRecordPublic( + id=file_record.id, + uri=file_record.uri, + size=file_record.size, + created_on=file_record.created_on, + hashes=[HashPublic(algorithm=h.algorithm, value=h.value) for h in hashes], + tags=[TagPublic(key=t.key, value=t.value) for t in tags], + samples=[SamplePublic(sample_name=s.sample_name, role=s.role) for s in samples], + )) + + return QCRecordPublic( + id=record.id, + created_on=record.created_on, + created_by=record.created_by, + project_id=record.project_id, + metadata=metadata, + metrics=metrics, + output_files=output_files, + ) diff --git a/main.py b/main.py index d9efa23..6f5eb3f 100644 --- a/main.py +++ b/main.py @@ -10,14 +10,15 @@ from api.files.routes import router as files_router from api.jobs.routes import router as jobs_router +from api.manifest.routes import router as manifest_router from api.project.routes import router as project_router +from api.qcmetrics.routes import router as qcmetrics_router from api.runs.routes import router as runs_router from api.samples.routes import router as samples_router from api.search.routes import router as search_router from api.settings.routes import router as settings_router from api.vendors.routes import router as vendors_router from api.workflow.routes import router as workflow_router -from api.manifest.routes import router as manifest_router # Customize route id's @@ -60,13 +61,14 @@ def health_check(): app.include_router(files_router, prefix=API_PREFIX) app.include_router(jobs_router, prefix=API_PREFIX) +app.include_router(manifest_router, prefix=API_PREFIX) app.include_router(project_router, prefix=API_PREFIX) +app.include_router(qcmetrics_router, 
def test_create_qcrecord_basic(client: TestClient, session: Session):
    """
    A metadata-only payload is accepted and echoed back in the response.
    """
    expected_metadata = {"pipeline": "RNA-Seq", "version": "2.0.0"}
    payload = {
        "project_id": "P-TEST-001",
        "metadata": dict(expected_metadata),
    }

    response = client.post("/api/v1/qcmetrics?created_by=test_user", json=payload)
    assert response.status_code == 201

    body = response.json()
    assert body["project_id"] == "P-TEST-001"
    assert body["created_by"] == "test_user"

    # Metadata comes back as a list of {"key": ..., "value": ...} entries.
    returned = body["metadata"]
    assert len(returned) == 2
    assert {item["key"]: item["value"] for item in returned} == expected_metadata
def test_create_qcrecord_with_paired_sample_metrics(client: TestClient, session: Session):
    """
    A metric carrying a tumor/normal sample pair keeps each sample's role.
    """
    paired_metric = {
        "name": "somatic_variants",
        "samples": [
            {"sample_name": "Sample1", "role": "tumor"},
            {"sample_name": "Sample2", "role": "normal"},
        ],
        "values": {
            "snv_count": "15234",
            "indel_count": "1523",
            "tmb": "8.5",
        },
    }
    payload = {
        "project_id": "P-TEST-003",
        "metadata": {"pipeline": "Somatic"},
        "metrics": [paired_metric],
    }

    response = client.post("/api/v1/qcmetrics?created_by=test_user", json=payload)
    assert response.status_code == 201

    returned_samples = response.json()["metrics"][0]["samples"]

    # Both samples must come back, each under its own role.
    assert len(returned_samples) == 2
    role_to_sample = {s["role"]: s["sample_name"] for s in returned_samples}
    assert role_to_sample == {"tumor": "Sample1", "normal": "Sample2"}
def test_create_qcrecord_with_output_files(client: TestClient, session: Session):
    """
    Output files are stored with their size, samples, hashes and tags.
    """
    sample_level_file = {
        "uri": "s3://bucket/Sample1.bam",
        "size": 123456789,
        "samples": [{"sample_name": "Sample1"}],
        "hash": {"md5": "abc123def456"},
        "tags": {"type": "alignment", "format": "bam"},
    }
    workflow_level_file = {
        "uri": "s3://bucket/expression_matrix.tsv",
        "size": 5678901,
        "hash": {"sha256": "xyz789"},
        "tags": {"type": "expression"},
    }
    payload = {
        "project_id": "P-TEST-005",
        "metadata": {"pipeline": "WGS"},
        "output_files": [sample_level_file, workflow_level_file],
    }

    response = client.post("/api/v1/qcmetrics?created_by=test_user", json=payload)
    assert response.status_code == 201

    returned_files = response.json()["output_files"]
    assert len(returned_files) == 2

    def file_with(fragment):
        # First returned file whose URI contains the given fragment.
        return next(f for f in returned_files if fragment in f["uri"])

    # Sample-level file: one sample, plus its hash and tag entries.
    bam = file_with("bam")
    assert bam["size"] == 123456789
    assert len(bam["samples"]) == 1
    assert bam["samples"][0]["sample_name"] == "Sample1"

    md5_by_algorithm = {h["algorithm"]: h["value"] for h in bam["hashes"]}
    assert md5_by_algorithm["md5"] == "abc123def456"

    tag_by_key = {t["key"]: t["value"] for t in bam["tags"]}
    assert tag_by_key["type"] == "alignment"

    # Workflow-level file: no sample associations at all.
    matrix = file_with("matrix")
    assert len(matrix["samples"]) == 0
def test_search_qcrecords_all_versions(client: TestClient, session: Session):
    """
    Test that latest=false returns all versions.

    Two records with different metadata are created for the same project, so
    the second is not treated as a duplicate of the first. The search must
    then report both of them.
    """
    for version in ("1.0", "2.0"):
        created = client.post(
            "/api/v1/qcmetrics?created_by=test_user",
            json={
                "project_id": "P-ALLVER-001",
                "metadata": {"version": version},
            },
        )
        # Fail on the setup step itself rather than on a misleading count
        # mismatch further down.
        assert created.status_code == 201

    # Search with latest=false
    response = client.get("/api/v1/qcmetrics/search?project_id=P-ALLVER-001&latest=false")
    assert response.status_code == 200

    data = response.json()
    assert data["total"] == 2

    # Check the identity of the returned records, not just how many there are.
    returned_versions = {
        entry["value"]
        for record in data["data"]
        for entry in record["metadata"]
        if entry["key"] == "version"
    }
    assert returned_versions == {"1.0", "2.0"}
def test_get_qcrecord_by_id(client: TestClient, session: Session):
    """
    A record created through the API can be fetched again by its ID.
    """
    payload = {
        "project_id": "P-GET-001",
        "metadata": {"pipeline": "RNA-Seq"},
    }
    created = client.post("/api/v1/qcmetrics?created_by=test_user", json=payload)
    qcrecord_id = created.json()["id"]

    # Fetch the same record back through the by-ID endpoint.
    response = client.get(f"/api/v1/qcmetrics/{qcrecord_id}")
    assert response.status_code == 200

    fetched = response.json()
    assert fetched["id"] == qcrecord_id
    assert fetched["project_id"] == "P-GET-001"
def test_duplicate_detection(client: TestClient, session: Session):
    """
    Posting the same payload twice yields the same record ID both times.
    """
    payload = {
        "project_id": "P-DUP-001",
        "metadata": {"pipeline": "RNA-Seq", "version": "2.0"},
    }

    # Submit the identical payload as two different users; each call must
    # succeed with 201.
    returned_ids = []
    for author in ("user1", "user2"):
        response = client.post(f"/api/v1/qcmetrics?created_by={author}", json=payload)
        assert response.status_code == 201
        returned_ids.append(response.json()["id"])

    # The second submission must resolve to the record created by the first.
    first_id, second_id = returned_ids
    assert first_id == second_id
def test_mixed_string_and_numeric_values(client: TestClient, session: Session):
    """
    String and numeric values can share one metric, and each comes back
    with the type it was submitted with.
    """
    submitted_values = {
        "total_reads": 50000000,        # numeric int
        "alignment_rate": 97.5,         # numeric float
        "reference_genome": "GRCh38",   # string
        "status": "passed",             # string
    }
    payload = {
        "project_id": "P-MIXED-001",
        "metadata": {"pipeline": "RNA-Seq"},
        "metrics": [
            {
                "name": "alignment_stats",
                "samples": [{"sample_name": "Sample1"}],
                "values": submitted_values,
            }
        ],
    }

    response = client.post("/api/v1/qcmetrics?created_by=test_user", json=payload)
    assert response.status_code == 201

    returned = {
        v["key"]: v["value"]
        for v in response.json()["metrics"][0]["values"]
    }

    # (key, expected value, expected Python type after JSON decoding)
    expectations = (
        ("total_reads", 50000000, int),
        ("alignment_rate", 97.5, float),
        ("reference_genome", "GRCh38", str),
        ("status", "passed", str),
    )
    for key, expected_value, expected_type in expectations:
        assert returned[key] == expected_value
        assert isinstance(returned[key], expected_type)