Skip to content
172 changes: 172 additions & 0 deletions alembic/versions/f1a2b3c4d5e6_add_qcmetrics_and_filerecord_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
"""Add QCMetrics and FileRecord tables

Revision ID: f1a2b3c4d5e6
Revises: e158df5a8df1
Create Date: 2026-01-29 16:45:00.000000

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa
import sqlmodel


# revision identifiers, used by Alembic.
revision: str = 'f1a2b3c4d5e6'
down_revision: Union[str, Sequence[str], None] = 'e158df5a8df1'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Create the FileRecord and QCRecord table families.

    Parent tables are created before the child tables that reference them;
    every child table declares an ``ON DELETE CASCADE`` foreign key to its
    parent's ``id`` column.
    """
    # Short local alias for the SQLModel string column type used throughout.
    auto_string = sqlmodel.sql.sqltypes.AutoString

    # ------------------------------------------------------------------
    # FileRecord family (reusable across QCRecord, Sample, etc.)
    # ------------------------------------------------------------------

    # filerecord: main file metadata table. The owning entity is identified
    # by (entity_type, entity_id); no foreign key is declared for it.
    op.create_table(
        'filerecord',
        sa.Column('id', sa.Uuid(), nullable=False),
        sa.Column('entity_type', auto_string(length=50), nullable=False),
        sa.Column('entity_id', sa.Uuid(), nullable=False),
        sa.Column('uri', auto_string(length=1024), nullable=False),
        sa.Column('size', sa.BigInteger(), nullable=True),
        sa.Column('created_on', sa.DateTime(), nullable=True),
        sa.PrimaryKeyConstraint('id'),
    )
    op.create_index(
        'ix_filerecord_entity',
        'filerecord',
        ['entity_type', 'entity_id'],
    )

    # filerecordhash: hash values for a file, at most one per algorithm.
    op.create_table(
        'filerecordhash',
        sa.Column('id', sa.Uuid(), nullable=False),
        sa.Column('file_record_id', sa.Uuid(), nullable=False),
        sa.Column('algorithm', auto_string(length=50), nullable=False),
        sa.Column('value', auto_string(length=128), nullable=False),
        sa.ForeignKeyConstraint(
            ['file_record_id'], ['filerecord.id'], ondelete='CASCADE'
        ),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint(
            'file_record_id', 'algorithm',
            name='uq_filerecordhash_file_algorithm',
        ),
    )

    # filerecordtag: key-value tags for a file, keys unique per file.
    op.create_table(
        'filerecordtag',
        sa.Column('id', sa.Uuid(), nullable=False),
        sa.Column('file_record_id', sa.Uuid(), nullable=False),
        sa.Column('key', auto_string(length=255), nullable=False),
        sa.Column('value', sa.Text(), nullable=False),
        sa.ForeignKeyConstraint(
            ['file_record_id'], ['filerecord.id'], ondelete='CASCADE'
        ),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint(
            'file_record_id', 'key',
            name='uq_filerecordtag_file_key',
        ),
    )

    # filerecordsample: sample associations for a file, with optional role.
    op.create_table(
        'filerecordsample',
        sa.Column('id', sa.Uuid(), nullable=False),
        sa.Column('file_record_id', sa.Uuid(), nullable=False),
        sa.Column('sample_name', auto_string(length=255), nullable=False),
        sa.Column('role', auto_string(length=50), nullable=True),
        sa.ForeignKeyConstraint(
            ['file_record_id'], ['filerecord.id'], ondelete='CASCADE'
        ),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint(
            'file_record_id', 'sample_name',
            name='uq_filerecordsample_file_sample',
        ),
    )

    # ------------------------------------------------------------------
    # QCRecord family
    # ------------------------------------------------------------------

    # qcrecord: main QC record table, indexed by project_id.
    op.create_table(
        'qcrecord',
        sa.Column('id', sa.Uuid(), nullable=False),
        sa.Column('created_on', sa.DateTime(), nullable=False),
        sa.Column('created_by', auto_string(length=100), nullable=False),
        sa.Column('project_id', auto_string(length=50), nullable=False),
        sa.PrimaryKeyConstraint('id'),
    )
    op.create_index('ix_qcrecord_project_id', 'qcrecord', ['project_id'])

    # qcrecordmetadata: pipeline-level key-value metadata for a QC record.
    op.create_table(
        'qcrecordmetadata',
        sa.Column('id', sa.Uuid(), nullable=False),
        sa.Column('qcrecord_id', sa.Uuid(), nullable=False),
        sa.Column('key', auto_string(length=255), nullable=False),
        sa.Column('value', sa.Text(), nullable=False),
        sa.ForeignKeyConstraint(
            ['qcrecord_id'], ['qcrecord.id'], ondelete='CASCADE'
        ),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint(
            'qcrecord_id', 'key',
            name='uq_qcrecordmetadata_record_key',
        ),
    )

    # qcmetric: named metric groups, names unique within a QC record.
    op.create_table(
        'qcmetric',
        sa.Column('id', sa.Uuid(), nullable=False),
        sa.Column('qcrecord_id', sa.Uuid(), nullable=False),
        sa.Column('name', auto_string(length=255), nullable=False),
        sa.ForeignKeyConstraint(
            ['qcrecord_id'], ['qcrecord.id'], ondelete='CASCADE'
        ),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint(
            'qcrecord_id', 'name',
            name='uq_qcmetric_record_name',
        ),
    )

    # qcmetricvalue: metric values with dual storage - value_string is
    # always present, value_numeric is optional for numeric queries.
    op.create_table(
        'qcmetricvalue',
        sa.Column('id', sa.Uuid(), nullable=False),
        sa.Column('qc_metric_id', sa.Uuid(), nullable=False),
        sa.Column('key', auto_string(length=255), nullable=False),
        sa.Column('value_string', sa.Text(), nullable=False),
        sa.Column('value_numeric', sa.Float(), nullable=True),
        sa.Column(
            'value_type',
            auto_string(length=10),
            nullable=False,
            server_default='str',
        ),
        sa.ForeignKeyConstraint(
            ['qc_metric_id'], ['qcmetric.id'], ondelete='CASCADE'
        ),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint(
            'qc_metric_id', 'key',
            name='uq_qcmetricvalue_metric_key',
        ),
    )
    # Index on key + value_numeric for efficient numeric range queries.
    op.create_index(
        'ix_qcmetricvalue_key_numeric',
        'qcmetricvalue',
        ['key', 'value_numeric'],
    )

    # qcmetricsample: sample associations for a metric, with optional role.
    op.create_table(
        'qcmetricsample',
        sa.Column('id', sa.Uuid(), nullable=False),
        sa.Column('qc_metric_id', sa.Uuid(), nullable=False),
        sa.Column('sample_name', auto_string(length=255), nullable=False),
        sa.Column('role', auto_string(length=50), nullable=True),
        sa.ForeignKeyConstraint(
            ['qc_metric_id'], ['qcmetric.id'], ondelete='CASCADE'
        ),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint(
            'qc_metric_id', 'sample_name',
            name='uq_qcmetricsample_metric_sample',
        ),
    )


def downgrade() -> None:
    """Drop every table and index created by ``upgrade``.

    Child tables are dropped before the parents they reference, and each
    explicitly created index is dropped immediately before its table.
    """
    # (table name, indexes to drop before the table), in execution order:
    # the QCRecord family first, then the FileRecord family.
    teardown_plan = (
        ('qcmetricsample', ()),
        ('qcmetricvalue', ('ix_qcmetricvalue_key_numeric',)),
        ('qcmetric', ()),
        ('qcrecordmetadata', ()),
        ('qcrecord', ('ix_qcrecord_project_id',)),
        ('filerecordsample', ()),
        ('filerecordtag', ()),
        ('filerecordhash', ()),
        ('filerecord', ('ix_filerecord_entity',)),
    )

    for table, indexes in teardown_plan:
        for index in indexes:
            op.drop_index(index, table_name=table)
        op.drop_table(table)
7 changes: 7 additions & 0 deletions api/filerecord/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""
FileRecord module - reusable file metadata records.

This module provides a polymorphic file reference system that can associate
file metadata (URI, size, hashes, tags) with various entity types like
QCRecord, Sample, etc.
"""
185 changes: 185 additions & 0 deletions api/filerecord/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
"""
FileRecord Models - Reusable file metadata records.

These models provide a polymorphic file reference system that can associate
file metadata (URI, size, hashes, tags, samples) with various entity types.
"""

import uuid
from datetime import datetime
from enum import Enum
from typing import List
from sqlmodel import SQLModel, Field, Relationship, UniqueConstraint
from pydantic import ConfigDict


class FileRecordEntityType(str, Enum):
    """Entity types that can have file records associated.

    Members also subclass ``str``, so each one compares equal to its
    string value.
    """
    # The file record belongs to a QC record.
    QCRECORD = "QCRECORD"
    # The file record belongs to a sample.
    SAMPLE = "SAMPLE"


# ============================================================================
# Database Tables
# ============================================================================


class FileRecordHash(SQLModel, table=True):
    """
    A single hash value belonging to a file record.

    A file may carry several hashes (md5, sha256, etag, etc.), but at most
    one per algorithm - see ``uq_filerecordhash_file_algorithm``.
    """
    __tablename__ = "filerecordhash"
    __table_args__ = (
        UniqueConstraint(
            "file_record_id",
            "algorithm",
            name="uq_filerecordhash_file_algorithm",
        ),
    )

    id: uuid.UUID = Field(primary_key=True, default_factory=uuid.uuid4)
    # NOTE(review): the Alembic migration declares this foreign key with
    # ON DELETE CASCADE; no ondelete is set here - confirm whether the model
    # metadata should match.
    file_record_id: uuid.UUID = Field(nullable=False, foreign_key="filerecord.id")
    algorithm: str = Field(nullable=False, max_length=50)
    value: str = Field(nullable=False, max_length=128)

    # Owning file record; the reverse side is FileRecord.hashes.
    file_record: "FileRecord" = Relationship(back_populates="hashes")


class FileRecordTag(SQLModel, table=True):
    """
    A key-value tag attached to a file record.

    Tags carry arbitrary metadata; a key may appear at most once per file -
    see ``uq_filerecordtag_file_key``.
    """
    __tablename__ = "filerecordtag"
    __table_args__ = (
        UniqueConstraint(
            "file_record_id",
            "key",
            name="uq_filerecordtag_file_key",
        ),
    )

    id: uuid.UUID = Field(primary_key=True, default_factory=uuid.uuid4)
    # NOTE(review): the Alembic migration declares this foreign key with
    # ON DELETE CASCADE; no ondelete is set here - confirm whether the model
    # metadata should match.
    file_record_id: uuid.UUID = Field(nullable=False, foreign_key="filerecord.id")
    key: str = Field(nullable=False, max_length=255)
    value: str = Field(nullable=False)

    # Owning file record; the reverse side is FileRecord.tags.
    file_record: "FileRecord" = Relationship(back_populates="tags")


class FileRecordSample(SQLModel, table=True):
    """
    Link between a file record and a sample name.

    The number of rows per file encodes the kind of file:
    - 0 rows: workflow-level file (e.g., expression matrix)
    - 1 row: single-sample file (e.g., BAM file)
    - N rows: multi-sample file with roles (e.g., tumor/normal VCF)

    A sample name may appear at most once per file - see
    ``uq_filerecordsample_file_sample``.
    """
    __tablename__ = "filerecordsample"
    __table_args__ = (
        UniqueConstraint(
            "file_record_id",
            "sample_name",
            name="uq_filerecordsample_file_sample",
        ),
    )

    id: uuid.UUID = Field(primary_key=True, default_factory=uuid.uuid4)
    # NOTE(review): the Alembic migration declares this foreign key with
    # ON DELETE CASCADE; no ondelete is set here - confirm whether the model
    # metadata should match.
    file_record_id: uuid.UUID = Field(nullable=False, foreign_key="filerecord.id")
    sample_name: str = Field(nullable=False, max_length=255)
    # Optional role of the sample within the file, e.g., "tumor", "normal".
    role: str | None = Field(max_length=50, default=None)

    # Owning file record; the reverse side is FileRecord.samples.
    file_record: "FileRecord" = Relationship(back_populates="samples")


class FileRecord(SQLModel, table=True):
    """
    Metadata for one file held in an external location (S3, etc.).

    The owning entity (QCRecord, Sample, etc.) is identified by the
    ``entity_type`` / ``entity_id`` pair - a polymorphic association with no
    hard foreign key to the parent table.
    """
    __tablename__ = "filerecord"

    model_config = ConfigDict(from_attributes=True)

    id: uuid.UUID = Field(primary_key=True, default_factory=uuid.uuid4)
    # NOTE(review): the Alembic migration creates entity_type as a
    # 50-character string column; confirm that the column type derived from
    # this Enum annotation matches it on the target database.
    entity_type: FileRecordEntityType = Field(nullable=False)
    entity_id: uuid.UUID = Field(nullable=False)
    uri: str = Field(nullable=False, max_length=1024)
    size: int | None = Field(default=None)  # File size in bytes
    created_on: datetime | None = Field(default=None)  # File creation timestamp

    # Child collections. Each uses the "all, delete-orphan" cascade, so the
    # ORM deletes children together with the file record and whenever they
    # are removed from the collection.
    hashes: List["FileRecordHash"] = Relationship(
        sa_relationship_kwargs={"cascade": "all, delete-orphan"},
        back_populates="file_record",
    )
    tags: List["FileRecordTag"] = Relationship(
        sa_relationship_kwargs={"cascade": "all, delete-orphan"},
        back_populates="file_record",
    )
    samples: List["FileRecordSample"] = Relationship(
        sa_relationship_kwargs={"cascade": "all, delete-orphan"},
        back_populates="file_record",
    )


# ============================================================================
# Request/Response Models (Pydantic)
# ============================================================================


class HashInput(SQLModel):
    """A single file hash supplied when creating a file record.

    The length limits mirror the ``filerecordhash`` columns (``algorithm``
    50, ``value`` 128), so oversized input is rejected at validation time
    instead of reaching the database.
    """
    algorithm: str = Field(max_length=50)  # e.g., "md5", "sha256"
    value: str = Field(max_length=128)  # Hash value for that algorithm


class TagInput(SQLModel):
    """A single key-value tag supplied when creating a file record.

    ``key`` is limited to 255 characters to mirror the ``filerecordtag.key``
    column, so oversized input is rejected at validation time instead of
    reaching the database. ``value`` maps to a text column and is unbounded.
    """
    key: str = Field(max_length=255)
    value: str


class SampleInput(SQLModel):
    """Sample association supplied when creating a file record.

    The length limits mirror the ``filerecordsample`` columns
    (``sample_name`` 255, ``role`` 50), so oversized input is rejected at
    validation time instead of reaching the database.
    """
    sample_name: str = Field(max_length=255)
    # Optional role of the sample within the file, e.g., "tumor", "normal".
    role: str | None = Field(default=None, max_length=50)


class FileRecordCreate(SQLModel):
    """Request model for creating a file record.

    Unknown fields are rejected (``extra="forbid"``). ``uri`` is limited to
    1024 characters to mirror the ``filerecord.uri`` column, so an oversized
    value fails validation instead of reaching the database.
    """
    uri: str = Field(max_length=1024)
    size: int | None = None  # File size in bytes
    created_on: datetime | None = None  # File creation timestamp
    hash: dict[str, str] | None = None  # {"md5": "abc...", "sha256": "def..."}
    tags: dict[str, str] | None = None  # {"type": "alignment", "format": "bam"}
    samples: List[SampleInput] | None = None  # Sample associations

    model_config = ConfigDict(extra="forbid")


class HashPublic(SQLModel):
    """Public representation of a file hash."""
    # Name of the hash algorithm.
    algorithm: str
    # Hash value for that algorithm.
    value: str


class TagPublic(SQLModel):
    """Public representation of a file tag."""
    # Tag name.
    key: str
    # Tag value.
    value: str


class SamplePublic(SQLModel):
    """Public representation of a sample association."""
    # Name of the associated sample.
    sample_name: str
    # Role of the sample; may be None but has no default, so it must be given.
    role: str | None


class FileRecordPublic(SQLModel):
    """Public representation of a file record.

    Includes the record's own fields plus its hashes, tags and sample
    associations; ``entity_type`` and ``entity_id`` are not part of this
    model.
    """
    id: uuid.UUID
    uri: str
    # May be None but has no default, so it must be given.
    size: int | None
    # May be None but has no default, so it must be given.
    created_on: datetime | None
    hashes: List[HashPublic]
    tags: List[TagPublic]
    samples: List[SamplePublic]
6 changes: 6 additions & 0 deletions api/qcmetrics/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""
QCMetrics module - Quality control metrics from pipeline executions.

This module provides models and APIs for storing and retrieving QC metrics
from bioinformatics pipeline runs.
"""
Loading