diff --git a/alembic/env.py b/alembic/env.py index a4977ad..68f0a1b 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -8,9 +8,10 @@ from core.config import get_settings # Import all models here so that they are registered with SQLModel metadata -from api.samples.models import Sample, SampleAttribute +from api.files.models import File from api.project.models import Project from api.runs.models import SequencingRun +from api.samples.models import Sample, SampleAttribute from api.vendors.models import Vendor from api.workflow.models import Workflow, WorkflowAttribute @@ -33,6 +34,7 @@ # target_metadata = None target_metadata = SQLModel.metadata + # other values from the config, defined by the needs of env.py, # can be acquired: # my_important_option = config.get_main_option("my_important_option") diff --git a/alembic/versions/95817dee746c_add_files_table.py b/alembic/versions/95817dee746c_add_files_table.py new file mode 100644 index 0000000..1c8fa83 --- /dev/null +++ b/alembic/versions/95817dee746c_add_files_table.py @@ -0,0 +1,53 @@ +"""add_files_table + +Revision ID: 95817dee746c +Revises: c955364e5391 +Create Date: 2025-09-20 14:28:45.987919 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +import sqlmodel + + +# revision identifiers, used by Alembic. +revision: str = '95817dee746c' +down_revision: Union[str, Sequence[str], None] = '43c1f122cf7f' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('file', + sa.Column('id', sa.Uuid(), nullable=False), + sa.Column('file_id', sqlmodel.sql.sqltypes.AutoString(length=100), nullable=False), + sa.Column('filename', sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False), + sa.Column('original_filename', sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False), + sa.Column('file_path', sqlmodel.sql.sqltypes.AutoString(length=1024), nullable=False), + sa.Column('file_size', sa.Integer(), nullable=True), + sa.Column('mime_type', sqlmodel.sql.sqltypes.AutoString(length=100), nullable=True), + sa.Column('checksum', sqlmodel.sql.sqltypes.AutoString(length=64), nullable=True), + sa.Column('description', sqlmodel.sql.sqltypes.AutoString(length=1024), nullable=True), + sa.Column('file_type', sa.Enum('FASTQ', 'BAM', 'VCF', 'SAMPLESHEET', 'METRICS', 'REPORT', 'LOG', 'IMAGE', 'DOCUMENT', 'OTHER', name='filetype'), nullable=False), + sa.Column('upload_date', sa.DateTime(), nullable=False), + sa.Column('created_by', sqlmodel.sql.sqltypes.AutoString(length=100), nullable=True), + sa.Column('entity_type', sa.Enum('PROJECT', 'RUN', name='entitytype'), nullable=False), + sa.Column('entity_id', sqlmodel.sql.sqltypes.AutoString(length=100), nullable=False), + sa.Column('storage_backend', sa.Enum('LOCAL', 'S3', 'AZURE', 'GCS', name='storagebackend'), nullable=False), + sa.Column('is_public', sa.Boolean(), nullable=False), + sa.Column('is_archived', sa.Boolean(), nullable=False), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('file_id') + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_table('file') + # ### end Alembic commands ### diff --git a/api/files/models.py b/api/files/models.py index 67423b5..9c56a89 100644 --- a/api/files/models.py +++ b/api/files/models.py @@ -1,7 +1,195 @@ """ Models for the Files API """ -from sqlmodel import SQLModel + +from typing import List +import uuid +from datetime import datetime +from enum import Enum + +from sqlmodel import SQLModel, Field +from pydantic import ConfigDict + + +class FileType(str, Enum): + """File type categories""" + + FASTQ = "fastq" + BAM = "bam" + VCF = "vcf" + SAMPLESHEET = "samplesheet" + METRICS = "metrics" + REPORT = "report" + LOG = "log" + IMAGE = "image" + DOCUMENT = "document" + OTHER = "other" + + +class EntityType(str, Enum): + """Entity types that can have files""" + + PROJECT = "project" + RUN = "run" + + +class StorageBackend(str, Enum): + """Storage backend types""" + + LOCAL = "local" + S3 = "s3" + AZURE = "azure" + GCS = "gcs" + + +class File(SQLModel, table=True): + """Core file entity that can be associated with runs or projects""" + + __searchable__ = ["filename", "description", "file_id"] + + id: uuid.UUID | None = Field(default_factory=uuid.uuid4, primary_key=True) + file_id: str = Field(unique=True, max_length=100) # Human-readable identifier + filename: str = Field(max_length=255) + original_filename: str = Field(max_length=255) + file_path: str = Field(max_length=1024) # Storage path/URI + file_size: int | None = None # Size in bytes + mime_type: str | None = Field(default=None, max_length=100) + checksum: str | None = Field(default=None, max_length=64) # SHA-256 hash + + # Metadata + description: str | None = Field(default=None, max_length=1024) + file_type: FileType = Field(default=FileType.OTHER) + upload_date: datetime = Field(default_factory=datetime.utcnow) + created_by: str | None = Field(default=None, max_length=100) # User identifier + + # Polymorphic associations + entity_type: EntityType # "project" or "run" + entity_id: str = Field(max_length=100) # project_id or run barcode + + # Storage metadata + storage_backend: StorageBackend = Field(default=StorageBackend.LOCAL) + is_public: bool = Field(default=False) + is_archived: bool = Field(default=False) + + model_config = ConfigDict(from_attributes=True) + + def generate_file_id(self) -> str: + """Generate a unique file ID""" + import secrets + import string + + alphabet = string.ascii_letters + string.digits + return "".join(secrets.choice(alphabet) for _ in range(12)) + + +class FileCreate(SQLModel): + """Request model for creating a file""" + + filename: str + original_filename: str | None = None + description: str | None = None + file_type: FileType = FileType.OTHER + entity_type: EntityType + entity_id: str + is_public: bool = False + created_by: str | None = None + + model_config = ConfigDict(extra="forbid") + + +class FileUpdate(SQLModel): + """Request model for updating file metadata""" + + filename: str | None = None + description: str | None = None + file_type: FileType | None = None + is_public: bool | None = None + is_archived: bool | None = None + + model_config = ConfigDict(extra="forbid") + + +class FilePublic(SQLModel): + """Public file representation""" + + file_id: str + filename: str + original_filename: str + file_size: int | None + mime_type: str | None + description: str | None + file_type: FileType + upload_date: datetime + created_by: str | None + entity_type: EntityType + entity_id: str + is_public: bool + is_archived: bool + storage_backend: StorageBackend + checksum: str | None = None + + 
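+# Illustrative query sketch (an assumption added for documentation, not part of
+# the schema itself): files attached to a given project are looked up through
+# the polymorphic entity_type / entity_id pair, e.g.
+#
+#     from sqlmodel import select
+#     stmt = select(File).where(
+#         File.entity_type == EntityType.PROJECT,
+#         File.entity_id == "PROJ001",  # hypothetical project_id
+#     )
+#     project_files = session.exec(stmt).all()
+#
+# The same pattern with EntityType.RUN and a run barcode scopes files to a run.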
+class FilesPublic(SQLModel): + """Paginated file listing""" + + data: List[FilePublic] + total_items: int + total_pages: int + current_page: int + per_page: int + has_next: bool + has_prev: bool + + +class FileUploadRequest(SQLModel): + """Request model for file upload""" + + filename: str + description: str | None = None + file_type: FileType = FileType.OTHER + is_public: bool = False + + model_config = ConfigDict(extra="forbid") + + +class FileUploadResponse(SQLModel): + """Response model for file upload""" + + file_id: str + filename: str + file_size: int | None = None + checksum: str | None = None + upload_date: datetime + message: str = "File uploaded successfully" + + +class FileFilters(SQLModel): + """File filtering options""" + + entity_type: EntityType | None = None + entity_id: str | None = None + file_type: FileType | None = None + mime_type: str | None = None + created_by: str | None = None + is_public: bool | None = None + is_archived: bool | None = None + search_query: str | None = None # Search in filename/description + + model_config = ConfigDict(extra="forbid") + + +class PaginatedFileResponse(SQLModel): + """Paginated response for file listings""" + + data: list[FilePublic] + total_items: int + total_pages: int + current_page: int + per_page: int + has_next: bool + has_prev: bool + + model_config = ConfigDict(from_attributes=True) class FileBrowserFolder(SQLModel): diff --git a/api/files/routes.py b/api/files/routes.py index f5b8690..de7d702 100644 --- a/api/files/routes.py +++ b/api/files/routes.py @@ -1,16 +1,289 @@ """ Routes/endpoints for the Files API """ +from typing import Optional +import io from fastapi import APIRouter, Depends, Query from fastapi.responses import StreamingResponse -import io +from fastapi import ( + APIRouter, + Query, + HTTPException, + status, + UploadFile, + File as FastAPIFile, + Form, +) -from api.files.models import FileBrowserData +from core.deps import get_s3_client, SessionDep +from api.files.models import ( + FileCreate, + FileUpdate, + FilePublic, + PaginatedFileResponse, + FileFilters, + FileType, + EntityType, + FileBrowserData, +) from api.files import services -from core.deps import get_s3_client -router = APIRouter(prefix="/files", tags=["File Endpoints"]) +router = APIRouter(prefix="/files", tags=["Files"]) + + +@router.post( + "", + response_model=FilePublic, + status_code=status.HTTP_201_CREATED, + summary="Create a new file record", +) +def create_file( + session: SessionDep, + filename: str = Form(...), + entity_type: EntityType = Form(...), + entity_id: str = Form(...), + description: Optional[str] = Form(None), + file_type: FileType = Form(FileType.OTHER), + is_public: bool = Form(False), + created_by: Optional[str] = Form(None), + content: Optional[UploadFile] = FastAPIFile(None), + s3_client=Depends(get_s3_client), +) -> FilePublic: + """ + Create a new file record with optional file content upload. + + Storage backend (S3 vs Local) is automatically determined based on + configuration and entity type. + + - **filename**: Name of the file + - **description**: Optional description of the file + - **file_type**: Type of file (fastq, bam, vcf, etc.) 
+ - **entity_type**: Whether this file belongs to a project or run + - **entity_id**: ID of the project or run this file belongs to + - **is_public**: Whether the file is publicly accessible + - **created_by**: User who created the file + """ + # Create FileCreate object from form data + file_in = FileCreate( + filename=filename, + description=description, + file_type=file_type, + entity_type=entity_type, + entity_id=entity_id, + is_public=is_public, + created_by=created_by, + ) + + file_content = None + if content and content.filename: + file_content = content.file.read() + + return services.create_file(session, file_in, file_content, s3_client=s3_client) + + +@router.get( + "/browse", response_model=FileBrowserData, summary="Browse filesystem directory" +) +def browse_filesystem( + directory_path: str = Query( + "", description="Directory path to browse (local path or s3://bucket/key)" + ), + storage_root: str = Query( + "storage", description="Storage root directory (ignored for S3 paths)" + ), +) -> FileBrowserData: + """ + Browse a filesystem directory or S3 bucket and return folders and files in structured format. + + Supports both local filesystem and AWS S3: + - **Local paths**: Relative to storage_root (empty for root) or absolute paths + - **S3 paths**: Use s3://bucket/key format (e.g., s3://my-bucket/path/to/folder/) + - **storage_root**: Base storage directory for local paths (ignored for S3) + + Returns separate arrays for folders and files with name, date, and size information. + + For S3 paths: + - Requires AWS credentials to be configured + - Folders represent S3 prefixes (common prefixes) + - Files show S3 object metadata (size, last modified) + + Examples: + - Local: `/browse?directory_path=project1/data` + - S3: `/browse?directory_path=s3://my-bucket/project1/data/` + """ + return services.browse_filesystem(directory_path, storage_root) + + +@router.get( + "/browse-db", + response_model=FileBrowserData, + summary="List database files in browser format", +) +def list_files_browser_format( + session: SessionDep, + page: int = Query(1, ge=1, description="Page number (1-indexed)"), + per_page: int = Query(20, ge=1, le=100, description="Number of items per page"), + entity_type: Optional[EntityType] = Query( + None, description="Filter by entity type" + ), + entity_id: Optional[str] = Query(None, description="Filter by entity ID"), + file_type: Optional[FileType] = Query(None, description="Filter by file type"), + search: Optional[str] = Query( + None, description="Search in filename and description" + ), + is_public: Optional[bool] = Query( + None, description="Filter by public/private status" + ), + created_by: Optional[str] = Query(None, description="Filter by creator"), +) -> FileBrowserData: + """ + Get database files in FileBrowserData format (files only, no folders). + + This endpoint returns the same file data as the regular list_files endpoint, + but formatted to match the FileBrowserData structure with separate folders and files arrays. + Since database files don't have folder structure, the folders array will be empty. + """ + filters = FileFilters( + entity_type=entity_type, + entity_id=entity_id, + file_type=file_type, + search_query=search, + is_public=is_public, + created_by=created_by, + ) + + return services.list_files_as_browser_data(session, filters, page, per_page) + + +@router.get("/{file_id}", response_model=FilePublic, summary="Get file by ID") +def get_file(session: SessionDep, file_id: str) -> FilePublic: + """ + Get a single file by its file_id. 
+ + - **file_id**: The unique file identifier + """ + try: + return services.get_file(session, file_id) + except Exception as exc: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"File with id {file_id} not found", + ) from exc + + +@router.put("/{file_id}", response_model=FilePublic, summary="Update file metadata") +def update_file( + session: SessionDep, file_id: str, file_update: FileUpdate +) -> FilePublic: + """ + Update file metadata. + + - **file_id**: The unique file identifier + - **file_update**: Fields to update + """ + try: + return services.update_file(session, file_id, file_update) + except Exception as exc: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"File with id {file_id} not found", + ) from exc + + +@router.delete( + "/{file_id}", status_code=status.HTTP_204_NO_CONTENT, summary="Delete file" +) +def delete_file( + session: SessionDep, file_id: str, s3_client=Depends(get_s3_client) +) -> None: + """ + Delete a file and its content from storage (local or S3). + + - **file_id**: The unique file identifier + """ + try: + success = services.delete_file(session, file_id, s3_client=s3_client) + if not success: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"File with id {file_id} not found", + ) + except Exception as exc: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"File with id {file_id} not found", + ) from exc + + +@router.post( + "/{file_id}/content", response_model=FilePublic, summary="Upload file content" +) +def upload_file_content( + session: SessionDep, + file_id: str, + content: UploadFile = FastAPIFile(...), + s3_client=Depends(get_s3_client), +) -> FilePublic: + """ + Upload content for an existing file record. + + Updates file in appropriate storage backend (S3 or local). + + - **file_id**: The unique file identifier + - **content**: The file content to upload + """ + try: + file_content = content.file.read() + return services.update_file_content( + session, file_id, file_content, s3_client=s3_client + ) + except Exception as exc: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"File with id {file_id} not found", + ) from exc + + +@router.get( + "/entity/{entity_type}/{entity_id}", + response_model=PaginatedFileResponse, + summary="List files for a specific entity.", +) +def list_files_for_entity( + session: SessionDep, + entity_type: EntityType, + entity_id: str, + page: int = Query(1, ge=1, description="Page number (1-indexed)"), + per_page: int = Query(20, ge=1, le=100, description="Number of items per page"), + file_type: Optional[FileType] = Query(None, description="Filter by file type"), +) -> PaginatedFileResponse: + """ + Get all files associated with a specific project or run. + This is the same as /api/v1/files, but scoped to a specific entity. + + - **entity_type**: Either "project" or "run" + - **entity_id**: The project ID or run barcode + """ + return services.list_files_for_entity( + session, entity_type, entity_id, page, per_page, file_type + ) + + +@router.get( + "/entity/{entity_type}/{entity_id}/count", summary="Get file count for entity" +) +def get_file_count_for_entity( + session: SessionDep, entity_type: EntityType, entity_id: str +) -> dict: + """ + Get the total number of files for a specific project or run. 
+ + - **entity_type**: Either "project" or "run" + - **entity_id**: The project ID or run barcode + """ + count = services.get_file_count_for_entity(session, entity_type, entity_id) + return {"entity_type": entity_type, "entity_id": entity_id, "file_count": count} @router.get("/list", response_model=FileBrowserData, tags=["File Endpoints"]) diff --git a/api/files/services.py b/api/files/services.py index 126c1dd..6f9c265 100644 --- a/api/files/services.py +++ b/api/files/services.py @@ -1,18 +1,603 @@ """ Services for the Files API """ - +import hashlib +import secrets +import string +import os +import logging +from pathlib import Path +from datetime import datetime +from sqlmodel import select, Session, func +from pydantic import PositiveInt from fastapi import HTTPException, status try: import boto3 - from botocore.exceptions import NoCredentialsError, ClientError + from botocore.exceptions import ClientError, NoCredentialsError BOTO3_AVAILABLE = True except ImportError: BOTO3_AVAILABLE = False -from api.files.models import FileBrowserData, FileBrowserFile, FileBrowserFolder +from api.files.models import ( + File, + FileCreate, + FileUpdate, + FilePublic, + FilesPublic, + FileFilters, + FileType, + EntityType, + StorageBackend, + FileBrowserData, + FileBrowserFolder, + FileBrowserFile, +) + + +def generate_file_id() -> str: + """Generate a unique file ID""" + alphabet = string.ascii_letters + string.digits + return "".join(secrets.choice(alphabet) for _ in range(12)) + + +def generate_file_path( + entity_type: EntityType, entity_id: str, file_type: FileType, filename: str +) -> str: + """Generate a structured file path""" + from datetime import datetime, timezone + + now = datetime.now(timezone.utc) + year = now.strftime("%Y") + month = now.strftime("%m") + + # Create path structure: /{entity_type}/{entity_id}/{file_type}/{year}/{month}/{filename} + path_parts = [entity_type.value, entity_id, file_type.value, year, month, filename] + return "/".join(path_parts) + + +def calculate_file_checksum(file_content: bytes) -> str: + """Calculate SHA-256 checksum of file content""" + return hashlib.sha256(file_content).hexdigest() + + +def get_mime_type(filename: str) -> str: + """Get MIME type based on file extension""" + import mimetypes + + mime_type, _ = mimetypes.guess_type(filename) + return mime_type or "application/octet-stream" + + +def determine_storage_backend( + entity_type: EntityType, entity_id: str +) -> tuple[StorageBackend, str]: + """ + Determine storage backend and base URI from configuration and entity type. + + Args: + entity_type: The entity type (PROJECT or RUN) + entity_id: The entity identifier + + Returns: + Tuple of (StorageBackend, base_uri) + """ + from core.config import get_settings + + settings = get_settings() + + # Check if S3 is configured based on entity type + if entity_type == EntityType.PROJECT: + if settings.DATA_BUCKET_URI.startswith("s3://"): + return StorageBackend.S3, settings.DATA_BUCKET_URI + elif entity_type == EntityType.RUN: + if settings.RESULTS_BUCKET_URI.startswith("s3://"): + return StorageBackend.S3, settings.RESULTS_BUCKET_URI + + # Fallback to local storage + return StorageBackend.LOCAL, "storage" + + +def _upload_to_s3( + file_content: bytes, s3_uri: str, mime_type: str, s3_client=None +) -> bool: + """ + Upload file content to S3. 
+ + Args: + file_content: File bytes to upload + s3_uri: Full S3 URI (s3://bucket/key) + mime_type: Content type for S3 metadata + s3_client: Optional boto3 S3 client + + Returns: + True if successful + + Raises: + HTTPException: If upload fails + """ + if not BOTO3_AVAILABLE: + raise HTTPException( + status_code=status.HTTP_501_NOT_IMPLEMENTED, + detail="boto3 not available for S3 uploads. Install boto3 to enable S3 storage.", + ) + + try: + bucket, key = _parse_s3_path(s3_uri) + + if s3_client is None: + s3_client = boto3.client("s3") + + s3_client.put_object( + Bucket=bucket, + Key=key, + Body=file_content, + ContentType=mime_type, + Metadata={ + "uploaded-by": "ngs360-api", + "upload-timestamp": datetime.utcnow().isoformat(), + }, + ) + + logging.info(f"Successfully uploaded file to S3: {s3_uri}") + return True + + except NoCredentialsError as exc: + logging.error(f"AWS credentials not configured: {exc}") + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="AWS credentials not found. Please configure AWS credentials.", + ) from exc + except ClientError as exc: + error_code = exc.response["Error"]["Code"] + if error_code == "NoSuchBucket": + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"S3 bucket not found in URI: {s3_uri}", + ) from exc + elif error_code == "AccessDenied": + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail=f"Access denied to S3 bucket: {s3_uri}", + ) from exc + else: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"S3 error: {exc.response['Error']['Message']}", + ) from exc + except Exception as exc: + logging.error(f"Failed to upload to S3: {exc}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Unexpected error uploading to S3: {str(exc)}", + ) from exc + + +def _is_valid_storage_path(path: str) -> bool: + """Validate storage path format""" + # Allow S3 paths, local paths, network paths + valid_prefixes = ["s3://", "/", "file://", "smb://", "ftp://"] + return any(path.startswith(prefix) for prefix in valid_prefixes) + + +def _save_samplesheet_to_run_folder( + session: Session, run_barcode: str, file_content: bytes +) -> bool: + """ + Save samplesheet content to the run's folder URI location. + Returns True if successful, False otherwise. 
+ """ + from smart_open import open as smart_open + + try: + # Import here to avoid circular imports + from api.runs.services import get_run + + # Get run information + run = get_run(session, run_barcode) + if not run or not run.run_folder_uri: + logging.warning(f"No run folder URI found for run {run_barcode}") + return False + + # Construct samplesheet path - always use SampleSheet.csv as the standard name + samplesheet_path = f"{run.run_folder_uri.rstrip('/')}/SampleSheet.csv" + + # Validate path format + if not _is_valid_storage_path(samplesheet_path): + logging.warning(f"Invalid storage path format: {samplesheet_path}") + return False + + # Save using smart_open (handles S3, local, network paths) + with smart_open(samplesheet_path, "wb") as f: + f.write(file_content) + + logging.info( + f"Successfully saved samplesheet to run folder: {samplesheet_path}" + ) + return True + + except Exception as e: + # Log error but don't fail the upload + logging.error(f"Failed to save samplesheet to run folder {run_barcode}: {e}") + return False + + +def create_file( + session: Session, + file_create: FileCreate, + file_content: bytes | None = None, + storage_root: str = "storage", + s3_client=None, +) -> File: + """ + Create a new file record and store content to appropriate backend. + + Automatically determines storage backend (S3 vs Local) based on + configuration and entity type. + """ + + # Generate unique file ID + file_id = generate_file_id() + + # Use original_filename if provided, otherwise use filename + original_filename = file_create.original_filename or file_create.filename + + # Determine storage backend automatically from configuration + storage_backend, base_uri = determine_storage_backend( + file_create.entity_type, file_create.entity_id + ) + + # Generate file path structure + relative_file_path = generate_file_path( + file_create.entity_type, + file_create.entity_id, + file_create.file_type, + f"{file_id}_{file_create.filename}", + ) + + # Construct full storage path based on backend + if storage_backend == StorageBackend.S3: + # S3 URI: s3://bucket/entity_type/entity_id/file_type/year/month/filename + storage_path = f"{base_uri.rstrip('/')}/{relative_file_path}" + else: + # Local filesystem: relative path + storage_path = relative_file_path + + # Calculate file metadata if content is provided + file_size = len(file_content) if file_content else None + checksum = calculate_file_checksum(file_content) if file_content else None + mime_type = get_mime_type(file_create.filename) + + # Create file record + file_record = File( + file_id=file_id, + filename=file_create.filename, + original_filename=original_filename, + file_path=storage_path, + file_size=file_size, + mime_type=mime_type, + checksum=checksum, + description=file_create.description, + file_type=file_create.file_type, + created_by=file_create.created_by, + entity_type=file_create.entity_type, + entity_id=file_create.entity_id, + is_public=file_create.is_public, + storage_backend=storage_backend, + ) + + # Store file content based on backend + if file_content: + if storage_backend == StorageBackend.S3: + # Upload to S3 + _upload_to_s3(file_content, storage_path, mime_type, s3_client) + logging.info( + f"File {file_id} uploaded to S3: {storage_path}" + ) + else: + # Write to local filesystem + full_path = Path(storage_root) / relative_file_path + full_path.parent.mkdir(parents=True, exist_ok=True) + with open(full_path, "wb") as f: + f.write(file_content) + logging.info( + f"File {file_id} saved to local storage: {full_path}" + 
) + + # SPECIAL HANDLING: If samplesheet for run, also save to run folder + if ( + file_create.file_type == FileType.SAMPLESHEET + and file_create.entity_type == EntityType.RUN + ): + dual_storage_success = _save_samplesheet_to_run_folder( + session, file_create.entity_id, file_content + ) + # Add note to description about dual storage status + if dual_storage_success: + status_note = "[Dual-stored to run folder]" + else: + status_note = "[Database-only - run folder write failed]" + + if file_record.description: + file_record.description = ( + f"{file_record.description} {status_note}" + ) + else: + file_record.description = status_note + + # Save to database + session.add(file_record) + session.commit() + session.refresh(file_record) + + return file_record + + +def get_file(session: Session, file_id: str) -> File: + """Get a file by its file_id""" + file_record = session.exec(select(File).where(File.file_id == file_id)).first() + + if not file_record: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"File with id {file_id} not found", + ) + + return file_record + + +def get_file_by_id(session: Session, id: str) -> File: + """Get a file by its internal UUID""" + file_record = session.exec(select(File).where(File.id == id)).first() + + if not file_record: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"File with internal id {id} not found", + ) + + return file_record + + +def update_file(session: Session, file_id: str, file_update: FileUpdate) -> File: + """Update file metadata""" + file_record = get_file(session, file_id) + + # Update fields that are provided + update_data = file_update.model_dump(exclude_unset=True) + for field, value in update_data.items(): + setattr(file_record, field, value) + + session.add(file_record) + session.commit() + session.refresh(file_record) + + return file_record + + +def delete_file( + session: Session, + file_id: str, + storage_root: str = "storage", + s3_client=None, +) -> bool: + """Delete a file record and its content from storage (local or S3)""" + file_record = get_file(session, file_id) + + # Delete physical file based on storage backend + if file_record.storage_backend == StorageBackend.S3: + # Delete from S3 + try: + bucket, key = _parse_s3_path(file_record.file_path) + if s3_client is None: + s3_client = boto3.client("s3") + s3_client.delete_object(Bucket=bucket, Key=key) + logging.info(f"Deleted file from S3: {file_record.file_path}") + except Exception as e: + logging.warning( + f"Failed to delete S3 object {file_record.file_path}: {e}" + ) + # Continue with database deletion even if S3 delete fails + else: + # Delete from local filesystem + full_path = Path(storage_root) / file_record.file_path + if full_path.exists(): + full_path.unlink() + + # Try to remove empty directories + try: + full_path.parent.rmdir() + except OSError: + # Directory not empty, that's fine + pass + + # Delete from database + session.delete(file_record) + session.commit() + + return True + + +def get_file_content( + session: Session, + file_id: str, + storage_root: str = "storage", + s3_client=None, +) -> bytes: + """Get file content from storage (local or S3)""" + file_record = get_file(session, file_id) + + if file_record.storage_backend == StorageBackend.S3: + # Download from S3 + try: + file_content, _, _ = download_file( + file_record.file_path, s3_client + ) + return file_content + except HTTPException: + raise + except Exception as e: + logging.error(f"Error downloading from S3: {e}") + raise HTTPException( + 
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Error retrieving file from S3: {str(e)}", + ) from e + else: + # Read from local filesystem + full_path = Path(storage_root) / file_record.file_path + if not full_path.exists(): + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"File content not found at {file_record.file_path}", + ) + + with open(full_path, "rb") as f: + return f.read() + + +def list_files_for_entity( + session: Session, + entity_type: EntityType, + entity_id: str, + page: PositiveInt = 1, + per_page: PositiveInt = 20, + file_type: FileType | None = None, +) -> FilesPublic: + """List files for a specific entity (project or run)""" + filters = FileFilters( + entity_type=entity_type, entity_id=entity_id, file_type=file_type + ) + + return list_files(session=session, filters=filters, page=page, per_page=per_page) + + +def get_file_count_for_entity( + session: Session, entity_type: EntityType, entity_id: str +) -> int: + """Get the count of files for a specific entity""" + count = session.exec( + select(func.count(File.id)).where( + File.entity_type == entity_type, + File.entity_id == entity_id, + ~File.is_archived, + ) + ).one() + + return count + + +def update_file_content( + session: Session, + file_id: str, + content: bytes, + storage_root: str = "storage", + s3_client=None, +) -> File: + """Update file content (local or S3)""" + # Get the file record + file_record = get_file(session, file_id) + + # Calculate new file metadata + file_size = len(content) + checksum = calculate_file_checksum(content) + + # Write content to appropriate storage backend + if file_record.storage_backend == StorageBackend.S3: + # Upload to S3 + mime_type = get_mime_type(file_record.filename) + _upload_to_s3(content, file_record.file_path, mime_type, s3_client) + logging.info(f"Updated file content in S3: {file_record.file_path}") + else: + # Write to local filesystem + storage_path = Path(storage_root) / file_record.file_path + storage_path.parent.mkdir(parents=True, exist_ok=True) + storage_path.write_bytes(content) + logging.info( + f"Updated file content in local storage: {storage_path}" + ) + + # Update file record + file_record.file_size = file_size + file_record.checksum = checksum + + session.add(file_record) + session.commit() + session.refresh(file_record) + + return file_record + + +def browse_filesystem( + directory_path: str, storage_root: str = "storage" +) -> FileBrowserData: + """ + Browse filesystem directory and return structured data. + + Automatically detects if the path is S3 (s3://bucket/key) or local + filesystem and routes to the appropriate handler. 
+ """ + # Check if this is an S3 path + if directory_path.startswith("s3://"): + return _list_s3(directory_path) + + # Handle local filesystem + return _browse_local_filesystem(directory_path, storage_root) + + +def _browse_local_filesystem( + directory_path: str, storage_root: str = "storage" +) -> FileBrowserData: + """Browse local filesystem directory and return structured data""" + + # Construct full path + if os.path.isabs(directory_path): + full_path = Path(directory_path) + else: + full_path = Path(storage_root) / directory_path + + # Check if directory exists and is accessible + if not full_path.exists(): + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"Directory not found: {directory_path}", + ) + + if not full_path.is_dir(): + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Path is not a directory: {directory_path}", + ) + + folders = [] + files = [] + + try: + # List directory contents + for item in full_path.iterdir(): + # Get modification time + stat = item.stat() + mod_time = datetime.fromtimestamp(stat.st_mtime) + date_str = mod_time.strftime("%Y-%m-%d %H:%M:%S") + + if item.is_dir(): + folders.append(FileBrowserFolder(name=item.name, date=date_str)) + else: + files.append( + FileBrowserFile(name=item.name, date=date_str, size=stat.st_size) + ) + + except PermissionError: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail=f"Permission denied accessing directory: {directory_path}", + ) + + # Sort folders and files by name + folders.sort(key=lambda x: x.name.lower()) + files.sort(key=lambda x: x.name.lower()) + + return FileBrowserData(folders=folders, files=files) def _parse_s3_path(s3_path: str) -> tuple[str, str]: @@ -148,6 +733,37 @@ def _list_s3(s3_path: str, s3_client=None) -> FileBrowserData: ) from exc +def list_files_as_browser_data( + session: Session, + filters: FileFilters | None = None, + page: PositiveInt = 1, + per_page: PositiveInt = 20, + sort_by: str = "upload_date", + sort_order: str = "desc", +) -> FileBrowserData: + """List database files in FileBrowserData format (files only, no folders)""" + + # Get files using existing list_files function + files_result = list_files(session, filters, page, per_page, sort_by, sort_order) + + # Convert to FileBrowserFile format + browser_files = [] + for file_record in files_result.data: + # Format date + date_str = file_record.upload_date.strftime("%Y-%m-%d %H:%M:%S") + + browser_files.append( + FileBrowserFile( + name=file_record.filename, + date=date_str, + size=file_record.file_size or 0, + ) + ) + + # No folders for database files + return FileBrowserData(folders=[], files=browser_files) + + def download_file(s3_path: str, s3_client=None) -> tuple[bytes, str, str]: """ Download a file from S3. 
@@ -259,3 +875,4 @@ def list_files(uri: str, s3_client=None) -> FileBrowserData: status_code=status.HTTP_400_BAD_REQUEST, detail=str(e), ) from e + diff --git a/files_api_architecture.md b/files_api_architecture.md new file mode 100644 index 0000000..d457350 --- /dev/null +++ b/files_api_architecture.md @@ -0,0 +1,292 @@ +# Files API Architecture Diagram + +## System Architecture Overview + +```mermaid +graph TB + subgraph "Client Layer" + WEB[Web Frontend] + API_CLIENT[API Clients] + CLI[CLI Tools] + end + + subgraph "API Layer" + ROUTER[FastAPI Router] + AUTH[Authentication] + VALID[Validation] + end + + subgraph "Service Layer" + FILE_SVC[File Service] + STORAGE_SVC[Storage Service] + METADATA_SVC[Metadata Service] + end + + subgraph "Storage Backends" + LOCAL[Local Storage] + S3[AWS S3] + AZURE[Azure Blob] + GCS[Google Cloud] + end + + subgraph "Database Layer" + FILE_TBL[(File Table)] + PROJECT_TBL[(Project Table)] + RUN_TBL[(Run Table)] + PERM_TBL[(Permissions Table)] + end + + subgraph "Search & Index" + OPENSEARCH[OpenSearch] + SEARCH_IDX[File Index] + end + + WEB --> ROUTER + API_CLIENT --> ROUTER + CLI --> ROUTER + + ROUTER --> AUTH + AUTH --> VALID + VALID --> FILE_SVC + + FILE_SVC --> STORAGE_SVC + FILE_SVC --> METADATA_SVC + FILE_SVC --> FILE_TBL + + STORAGE_SVC --> LOCAL + STORAGE_SVC --> S3 + STORAGE_SVC --> AZURE + STORAGE_SVC --> GCS + + FILE_TBL -.-> PROJECT_TBL + FILE_TBL -.-> RUN_TBL + FILE_TBL --> PERM_TBL + + METADATA_SVC --> OPENSEARCH + OPENSEARCH --> SEARCH_IDX +``` + +## Data Model Relationships + +```mermaid +erDiagram + PROJECT { + uuid id PK + string project_id UK + string name + } + + SEQUENCING_RUN { + uuid id PK + string barcode UK + date run_date + string machine_id + string status + } + + FILE { + uuid id PK + string file_id UK + string filename + string original_filename + string file_path + int file_size + string mime_type + string checksum + string description + enum file_type + datetime upload_date + string created_by + enum entity_type + string entity_id FK + enum storage_backend + boolean is_public + boolean is_archived + } + + FILE_PERMISSION { + uuid id PK + uuid file_id FK + string user_id + string group_id + enum permission_type + datetime granted_date + } + + FILE_VERSION { + uuid id PK + uuid file_id FK + int version_number + string file_path + int file_size + string checksum + datetime created_date + boolean is_current + } + + PROJECT ||--o{ FILE : "has files" + SEQUENCING_RUN ||--o{ FILE : "has files" + FILE ||--o{ FILE_PERMISSION : "has permissions" + FILE ||--o{ FILE_VERSION : "has versions" +``` + +## API Endpoint Structure + +```mermaid +graph LR + subgraph "Generic File Operations" + A[GET /files] --> A1[List all files] + B[POST /files] --> B1[Upload file] + C[GET /files/{id}] --> C1[Get file metadata] + D[PUT /files/{id}] --> D1[Update metadata] + E[DELETE /files/{id}] --> E1[Delete file] + F[GET /files/{id}/download] --> F1[Download file] + end + + subgraph "Project File Operations" + G[GET /projects/{id}/files] --> G1[List project files] + H[POST /projects/{id}/files] --> H1[Upload to project] + I[GET /projects/{id}/files/{file_id}] --> I1[Get project file] + end + + subgraph "Run File Operations" + J[GET /runs/{barcode}/files] --> J1[List run files] + K[POST /runs/{barcode}/files] --> K1[Upload to run] + L[GET /runs/{barcode}/files/{file_id}] --> L1[Get run file] + end + + subgraph "Bulk Operations" + M[POST /files/bulk-upload] --> M1[Upload multiple] + N[DELETE /files/bulk-delete] --> N1[Delete multiple] + end +``` + +## File Upload 
Flow + +```mermaid +sequenceDiagram + participant Client + participant API + participant FileService + participant StorageService + participant Database + participant Storage + + Client->>API: POST /projects/{id}/files + API->>API: Validate request + API->>FileService: upload_file() + FileService->>FileService: Generate file_id + FileService->>StorageService: store_file() + StorageService->>Storage: Upload file data + Storage-->>StorageService: Storage URI + StorageService-->>FileService: File path + FileService->>Database: Save file metadata + Database-->>FileService: File record + FileService-->>API: File metadata + API-->>Client: Upload response +``` + +## File Download Flow + +```mermaid +sequenceDiagram + participant Client + participant API + participant FileService + participant StorageService + participant Database + participant Storage + + Client->>API: GET /files/{id}/download + API->>API: Check permissions + API->>FileService: get_download_url() + FileService->>Database: Get file metadata + Database-->>FileService: File record + FileService->>StorageService: generate_download_url() + StorageService->>Storage: Create signed URL + Storage-->>StorageService: Signed URL + StorageService-->>FileService: Download URL + FileService-->>API: Download URL + API-->>Client: Redirect or URL response +``` + +## Storage Strategy + +```mermaid +graph TB + subgraph "File Organization" + ROOT[Storage Root] + ROOT --> PROJECTS[/projects/] + ROOT --> RUNS[/runs/] + + PROJECTS --> PROJ_ID[/{project_id}/] + RUNS --> RUN_ID[/{run_barcode}/] + + PROJ_ID --> PROJ_TYPE[/{file_type}/] + RUN_ID --> RUN_TYPE[/{file_type}/] + + PROJ_TYPE --> PROJ_DATE[/{year}/{month}/] + RUN_TYPE --> RUN_DATE[/{year}/{month}/] + + PROJ_DATE --> PROJ_FILE[/{file_id}_{filename}] + RUN_DATE --> RUN_FILE[/{file_id}_{filename}] + end + + subgraph "Storage Backends" + LOCAL_FS[Local Filesystem] + AWS_S3[AWS S3] + AZURE_BLOB[Azure Blob Storage] + GCP_STORAGE[Google Cloud Storage] + end + + PROJ_FILE -.-> LOCAL_FS + PROJ_FILE -.-> AWS_S3 + PROJ_FILE -.-> AZURE_BLOB + PROJ_FILE -.-> GCP_STORAGE + + RUN_FILE -.-> LOCAL_FS + RUN_FILE -.-> AWS_S3 + RUN_FILE -.-> AZURE_BLOB + RUN_FILE -.-> GCP_STORAGE +``` + +## Security and Access Control + +```mermaid +graph TB + subgraph "Authentication" + USER[User Request] + AUTH_CHECK[Authentication Check] + TOKEN[JWT Token Validation] + end + + subgraph "Authorization" + PERM_CHECK[Permission Check] + ENTITY_ACCESS[Entity Access Check] + FILE_ACCESS[File Access Check] + end + + subgraph "File Operations" + READ_OP[Read Operation] + WRITE_OP[Write Operation] + DELETE_OP[Delete Operation] + end + + USER --> AUTH_CHECK + AUTH_CHECK --> TOKEN + TOKEN --> PERM_CHECK + PERM_CHECK --> ENTITY_ACCESS + ENTITY_ACCESS --> FILE_ACCESS + FILE_ACCESS --> READ_OP + FILE_ACCESS --> WRITE_OP + FILE_ACCESS --> DELETE_OP +``` + +This architecture provides: + +1. **Scalable Design**: Supports multiple storage backends and can handle large file volumes +2. **Flexible Associations**: Files can be linked to any entity type (projects, runs, future entities) +3. **Rich Metadata**: Comprehensive file information and categorization +4. **Security**: Multi-layered permission system +5. **Performance**: Efficient querying and caching strategies +6. 
**Extensibility**: Easy to add new file types, storage backends, and features \ No newline at end of file diff --git a/files_api_design.md b/files_api_design.md new file mode 100644 index 0000000..c594eb1 --- /dev/null +++ b/files_api_design.md @@ -0,0 +1,334 @@ +# Files API Design for NGS360 + +## Overview +Design a unified Files API that supports file operations (list, fetch, upload, delete) for both sequencing runs and projects, with flexible storage backends and comprehensive metadata tracking. + +## Current System Analysis + +### Existing Models +- **Projects**: Have `project_id` (string), `name`, and `attributes` +- **Runs**: Have `id` (UUID), `barcode` (computed), `run_folder_uri`, and various metadata +- **Files API**: Currently has skeleton routes but missing models/services + +## Proposed Architecture + +### 1. Data Model Design + +#### Core File Model +```python +class File(SQLModel, table=True): + """Core file entity that can be associated with runs or projects""" + + id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True) + file_id: str = Field(unique=True) # Human-readable identifier + filename: str = Field(max_length=255) + original_filename: str = Field(max_length=255) + file_path: str = Field(max_length=1024) # Storage path/URI + file_size: int | None = None # Size in bytes + mime_type: str | None = Field(max_length=100) + checksum: str | None = Field(max_length=64) # SHA-256 hash + + # Metadata + description: str | None = Field(max_length=1024) + file_type: FileType = Field(default=FileType.OTHER) + upload_date: datetime = Field(default_factory=datetime.utcnow) + created_by: str | None = Field(max_length=100) # User identifier + + # Polymorphic associations + entity_type: EntityType # "project" or "run" + entity_id: str # project_id or run barcode + + # Storage metadata + storage_backend: StorageBackend = Field(default=StorageBackend.LOCAL) + is_public: bool = Field(default=False) + is_archived: bool = Field(default=False) +``` + +#### Enums +```python +class FileType(str, Enum): + """File type categories""" + FASTQ = "fastq" + BAM = "bam" + VCF = "vcf" + SAMPLESHEET = "samplesheet" + METRICS = "metrics" + REPORT = "report" + LOG = "log" + IMAGE = "image" + DOCUMENT = "document" + OTHER = "other" + +class EntityType(str, Enum): + """Entity types that can have files""" + PROJECT = "project" + RUN = "run" + +class StorageBackend(str, Enum): + """Storage backend types""" + LOCAL = "local" + S3 = "s3" + AZURE = "azure" + GCS = "gcs" +``` + +### 2. API Endpoint Design + +#### Unified Files Endpoints +``` +# Generic file operations +GET /api/v1/files # List all files (with filters) +POST /api/v1/files # Upload file (requires entity association) +GET /api/v1/files/{file_id} # Get file metadata +PUT /api/v1/files/{file_id} # Update file metadata +DELETE /api/v1/files/{file_id} # Delete file +GET /api/v1/files/{file_id}/download # Download file content + +# Entity-specific file operations +GET /api/v1/projects/{project_id}/files # List project files +POST /api/v1/projects/{project_id}/files # Upload file to project +GET /api/v1/projects/{project_id}/files/{file_id} # Get project file + +GET /api/v1/runs/{run_barcode}/files # List run files +POST /api/v1/runs/{run_barcode}/files # Upload file to run +GET /api/v1/runs/{run_barcode}/files/{file_id} # Get run file + +# Bulk operations +POST /api/v1/files/bulk-upload # Upload multiple files +DELETE /api/v1/files/bulk-delete # Delete multiple files +``` + +### 3. 
Request/Response Models + +#### File Upload Request +```python +class FileUploadRequest(SQLModel): + """Request model for file upload""" + filename: str + description: str | None = None + file_type: FileType = FileType.OTHER + is_public: bool = False + tags: List[str] | None = None + +class FileUploadResponse(SQLModel): + """Response model for file upload""" + file_id: str + filename: str + upload_url: str | None = None # For direct upload scenarios + file_size: int | None = None + checksum: str | None = None +``` + +#### File Listing +```python +class FilePublic(SQLModel): + """Public file representation""" + file_id: str + filename: str + original_filename: str + file_size: int | None + mime_type: str | None + description: str | None + file_type: FileType + upload_date: datetime + created_by: str | None + entity_type: EntityType + entity_id: str + is_public: bool + download_url: str | None = None + +class FilesPublic(SQLModel): + """Paginated file listing""" + data: List[FilePublic] + total_items: int + total_pages: int + current_page: int + per_page: int + has_next: bool + has_prev: bool + filters: Dict[str, Any] | None = None +``` + +### 4. Storage Strategy + +#### Multi-Backend Support +```python +class StorageService: + """Abstract storage service interface""" + + async def upload_file(self, file_data: bytes, file_path: str) -> str: + """Upload file and return storage URI""" + pass + + async def download_file(self, file_path: str) -> bytes: + """Download file content""" + pass + + async def delete_file(self, file_path: str) -> bool: + """Delete file from storage""" + pass + + async def get_download_url(self, file_path: str, expires_in: int = 3600) -> str: + """Generate temporary download URL""" + pass + +class LocalStorageService(StorageService): + """Local filesystem storage""" + pass + +class S3StorageService(StorageService): + """AWS S3 storage""" + pass +``` + +#### File Path Strategy +``` +Storage Structure: +/{storage_root}/ + /projects/ + /{project_id}/ + /{file_type}/ + /{year}/{month}/ + /{file_id}_{original_filename} + /runs/ + /{run_barcode}/ + /{file_type}/ + /{year}/{month}/ + /{file_id}_{original_filename} +``` + +### 5. Advanced Features + +#### File Filtering and Search +```python +class FileFilters(SQLModel): + """File filtering options""" + entity_type: EntityType | None = None + entity_id: str | None = None + file_type: FileType | None = None + mime_type: str | None = None + created_by: str | None = None + date_from: datetime | None = None + date_to: datetime | None = None + is_public: bool | None = None + is_archived: bool | None = None + tags: List[str] | None = None + search_query: str | None = None # Search in filename/description +``` + +#### File Versioning (Future Enhancement) +```python +class FileVersion(SQLModel, table=True): + """File version tracking""" + id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True) + file_id: uuid.UUID = Field(foreign_key="file.id") + version_number: int + file_path: str + file_size: int + checksum: str + created_date: datetime = Field(default_factory=datetime.utcnow) + is_current: bool = Field(default=True) +``` + +### 6. 
Security and Access Control + +#### File Access Permissions +```python +class FilePermission(SQLModel, table=True): + """File-level permissions""" + id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True) + file_id: uuid.UUID = Field(foreign_key="file.id") + user_id: str | None = None + group_id: str | None = None + permission_type: PermissionType + granted_date: datetime = Field(default_factory=datetime.utcnow) + +class PermissionType(str, Enum): + READ = "read" + WRITE = "write" + DELETE = "delete" + ADMIN = "admin" +``` + +### 7. Integration Points + +#### With Existing Models +- **Projects**: Add relationship to files via `entity_id` = `project_id` +- **Runs**: Add relationship to files via `entity_id` = `run.barcode` +- **Search**: Include files in OpenSearch indexing for full-text search + +#### Database Relationships +```python +# In Project model +files: List["File"] = Relationship( + sa_relationship_kwargs={ + "primaryjoin": "and_(Project.project_id == File.entity_id, File.entity_type == 'project')", + "foreign_keys": "[File.entity_id]" + } +) + +# In SequencingRun model +files: List["File"] = Relationship( + sa_relationship_kwargs={ + "primaryjoin": "and_(SequencingRun.barcode == File.entity_id, File.entity_type == 'run')", + "foreign_keys": "[File.entity_id]" + } +) +``` + +## Implementation Plan + +### Phase 1: Core Infrastructure +1. Create file models and enums +2. Implement basic storage service (local filesystem) +3. Create core CRUD operations +4. Add database migration + +### Phase 2: API Endpoints +1. Implement generic file endpoints +2. Add entity-specific endpoints +3. Create file upload/download functionality +4. Add comprehensive error handling + +### Phase 3: Advanced Features +1. Add file filtering and search +2. Implement multiple storage backends +3. Add file metadata extraction +4. Create bulk operations + +### Phase 4: Integration & Security +1. Integrate with existing project/run models +2. Add authentication and authorization +3. Implement file permissions +4. Add audit logging + +## Benefits + +1. **Unified Interface**: Single API for all file operations across projects and runs +2. **Flexible Storage**: Support for multiple storage backends (local, S3, etc.) +3. **Rich Metadata**: Comprehensive file metadata and categorization +4. **Scalable**: Designed to handle large numbers of files efficiently +5. **Secure**: Built-in permission system and access controls +6. **Searchable**: Integration with existing search infrastructure +7. **Extensible**: Easy to add new file types and storage backends + +## Example Usage + +```python +# Upload a file to a project +POST /api/v1/projects/PROJ001/files +{ + "filename": "analysis_results.pdf", + "description": "Final analysis report", + "file_type": "report", + "is_public": false +} + +# List all FASTQ files for a run +GET /api/v1/runs/190110_MACHINE123_0001_FLOWCELL123/files?file_type=fastq + +# Search for files across all entities +GET /api/v1/files?search_query=analysis&file_type=report&date_from=2024-01-01 +``` + +This design provides a robust, scalable, and flexible file management system that integrates seamlessly with the existing NGS360 architecture. 
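For illustration, a minimal Python client against the multipart upload endpoint implemented in `api/files/routes.py` might look like the sketch below. The base URL, the `requests` dependency, and the `PROJ001` identifier are assumptions, not part of the design.

```python
import requests  # assumed HTTP client; any equivalent works

BASE_URL = "http://localhost:8000/api/v1"  # assumption: adjust to your deployment

# Upload a report and associate it with a project via the multipart form endpoint
with open("analysis_results.pdf", "rb") as fh:
    response = requests.post(
        f"{BASE_URL}/files",
        data={
            "filename": "analysis_results.pdf",
            "description": "Final analysis report",
            "file_type": "report",
            "entity_type": "project",
            "entity_id": "PROJ001",
            "is_public": "false",
        },
        files={"content": ("analysis_results.pdf", fh, "application/pdf")},
    )

response.raise_for_status()
print(response.json()["file_id"])  # human-readable identifier assigned by the API
```

The `entity_type`/`entity_id` pair in the form data is what ties the uploaded file to a project rather than a run; storage backend selection happens server-side.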
\ No newline at end of file diff --git a/files_api_implementation_plan.md b/files_api_implementation_plan.md new file mode 100644 index 0000000..670cbcf --- /dev/null +++ b/files_api_implementation_plan.md @@ -0,0 +1,428 @@ +# Files API Implementation Roadmap + +## Overview +This document outlines the step-by-step implementation plan for the Files API, organized into phases with specific deliverables and acceptance criteria. + +## Phase 1: Foundation (Week 1-2) + +### 1.1 Database Models and Migrations +**Priority: Critical** + +#### Tasks: +- [ ] Create `api/files/models.py` with core file models +- [ ] Create database migration for file tables +- [ ] Add foreign key relationships to existing models +- [ ] Create enum definitions for file types and storage backends + +#### Deliverables: +```python +# File models with proper relationships +class File(SQLModel, table=True) +class FileType(str, Enum) +class EntityType(str, Enum) +class StorageBackend(str, Enum) + +# Migration script +# alembic/versions/xxx_create_file_tables.py +``` + +#### Acceptance Criteria: +- [ ] Database migration runs successfully +- [ ] All model relationships work correctly +- [ ] Foreign key constraints are properly defined +- [ ] Enum values are correctly stored and retrieved + +### 1.2 Basic Storage Service +**Priority: Critical** + +#### Tasks: +- [ ] Create `api/files/storage.py` with storage interface +- [ ] Implement `LocalStorageService` for filesystem storage +- [ ] Create file path generation utilities +- [ ] Add basic file operations (upload, download, delete) + +#### Deliverables: +```python +# Storage service interface and implementation +class StorageService(ABC) +class LocalStorageService(StorageService) + +# Utility functions +def generate_file_path(entity_type, entity_id, file_type, filename) +def ensure_directory_exists(path) +``` + +#### Acceptance Criteria: +- [ ] Files can be uploaded to local storage +- [ ] Files can be downloaded from local storage +- [ ] File paths are generated consistently +- [ ] Directory structure is created automatically + +### 1.3 Core Service Layer +**Priority: Critical** + +#### Tasks: +- [ ] Create `api/files/services.py` with CRUD operations +- [ ] Implement file metadata management +- [ ] Add file validation and security checks +- [ ] Create file ID generation logic + +#### Deliverables: +```python +# Core file operations +def create_file(session, file_data, entity_type, entity_id) +def get_file(session, file_id) +def update_file(session, file_id, updates) +def delete_file(session, file_id) +def list_files(session, filters, pagination) +``` + +#### Acceptance Criteria: +- [ ] All CRUD operations work correctly +- [ ] File metadata is properly validated +- [ ] Unique file IDs are generated +- [ ] Database transactions are handled properly + +## Phase 2: API Endpoints (Week 3-4) + +### 2.1 Generic File Endpoints +**Priority: High** + +#### Tasks: +- [ ] Update `api/files/routes.py` with complete endpoint set +- [ ] Implement file upload with multipart form data +- [ ] Add file download with proper headers +- [ ] Create file listing with filtering and pagination + +#### Deliverables: +```python +# Complete API endpoints +@router.post("/files") # Upload file +@router.get("/files") # List files +@router.get("/files/{file_id}") # Get file metadata +@router.put("/files/{file_id}") # Update metadata +@router.delete("/files/{file_id}") # Delete file +@router.get("/files/{file_id}/download") # Download file +``` + +#### Acceptance Criteria: +- [ ] File upload works with 
proper validation +- [ ] File download returns correct content and headers +- [ ] File listing supports filtering and pagination +- [ ] All endpoints return proper HTTP status codes +- [ ] Error handling is comprehensive + +### 2.2 Entity-Specific Endpoints +**Priority: High** + +#### Tasks: +- [ ] Add project-specific file endpoints +- [ ] Add run-specific file endpoints +- [ ] Implement entity validation (project/run exists) +- [ ] Add entity-based access control + +#### Deliverables: +```python +# Project file endpoints +@router.get("/projects/{project_id}/files") +@router.post("/projects/{project_id}/files") +@router.get("/projects/{project_id}/files/{file_id}") + +# Run file endpoints +@router.get("/runs/{run_barcode}/files") +@router.post("/runs/{run_barcode}/files") +@router.get("/runs/{run_barcode}/files/{file_id}") +``` + +#### Acceptance Criteria: +- [ ] Entity validation works correctly +- [ ] Files are properly associated with entities +- [ ] Entity-specific file listing works +- [ ] Access control prevents unauthorized access + +### 2.3 Request/Response Models +**Priority: High** + +#### Tasks: +- [ ] Create comprehensive Pydantic models +- [ ] Add proper validation rules +- [ ] Implement response serialization +- [ ] Add API documentation + +#### Deliverables: +```python +# Request/Response models +class FileUploadRequest(SQLModel) +class FileUploadResponse(SQLModel) +class FilePublic(SQLModel) +class FilesPublic(SQLModel) +class FileFilters(SQLModel) +``` + +#### Acceptance Criteria: +- [ ] All models have proper validation +- [ ] API documentation is auto-generated +- [ ] Response serialization works correctly +- [ ] Error responses are properly formatted + +## Phase 3: Advanced Features (Week 5-6) + +### 3.1 File Search and Filtering +**Priority: Medium** + +#### Tasks: +- [ ] Implement advanced file filtering +- [ ] Add full-text search capabilities +- [ ] Integrate with OpenSearch +- [ ] Add file indexing for search + +#### Deliverables: +```python +# Advanced filtering +class FileFilters(SQLModel) +def search_files(session, search_query, filters) +def index_file_for_search(file_metadata) +``` + +#### Acceptance Criteria: +- [ ] Complex filtering works correctly +- [ ] Full-text search returns relevant results +- [ ] Search performance is acceptable +- [ ] Search index stays synchronized + +### 3.2 Multiple Storage Backends +**Priority: Medium** + +#### Tasks: +- [ ] Implement S3StorageService +- [ ] Add storage backend configuration +- [ ] Create storage backend selection logic +- [ ] Add storage backend migration tools + +#### Deliverables: +```python +# Additional storage backends +class S3StorageService(StorageService) +class AzureStorageService(StorageService) +class StorageBackendFactory + +# Configuration +STORAGE_BACKENDS = { + "local": LocalStorageService, + "s3": S3StorageService, + "azure": AzureStorageService +} +``` + +#### Acceptance Criteria: +- [ ] Multiple storage backends work correctly +- [ ] Storage backend can be configured per file +- [ ] File migration between backends is possible +- [ ] All backends support the same interface + +### 3.3 Bulk Operations +**Priority: Medium** + +#### Tasks: +- [ ] Implement bulk file upload +- [ ] Add bulk file deletion +- [ ] Create batch processing utilities +- [ ] Add progress tracking for bulk operations + +#### Deliverables: +```python +# Bulk operations +@router.post("/files/bulk-upload") +@router.delete("/files/bulk-delete") +def process_bulk_upload(files, entity_type, entity_id) +def 
process_bulk_delete(file_ids) +``` + +#### Acceptance Criteria: +- [ ] Bulk upload handles multiple files efficiently +- [ ] Bulk delete is transactional +- [ ] Progress tracking works correctly +- [ ] Error handling for partial failures + +## Phase 4: Integration & Security (Week 7-8) + +### 4.1 Authentication and Authorization +**Priority: High** + +#### Tasks: +- [ ] Integrate with existing auth system +- [ ] Implement file-level permissions +- [ ] Add user/group access control +- [ ] Create permission management endpoints + +#### Deliverables: +```python +# Permission models and services +class FilePermission(SQLModel, table=True) +class PermissionType(str, Enum) +def check_file_permission(user, file_id, permission_type) +def grant_file_permission(file_id, user_id, permission_type) +``` + +#### Acceptance Criteria: +- [ ] Authentication is required for all operations +- [ ] File permissions are enforced correctly +- [ ] Permission inheritance works properly +- [ ] Admin users can manage all files + +### 4.2 Integration with Existing Models +**Priority: High** + +#### Tasks: +- [ ] Add file relationships to Project model +- [ ] Add file relationships to SequencingRun model +- [ ] Update existing API responses to include file counts +- [ ] Create file association utilities + +#### Deliverables: +```python +# Model updates +# In api/project/models.py +files: List["File"] = Relationship(...) + +# In api/runs/models.py +files: List["File"] = Relationship(...) + +# Updated response models +class ProjectPublic(SQLModel): + file_count: int | None = None + +class SequencingRunPublic(SQLModel): + file_count: int | None = None +``` + +#### Acceptance Criteria: +- [ ] File relationships work correctly +- [ ] Existing APIs show file information +- [ ] File counts are accurate +- [ ] No breaking changes to existing APIs + +### 4.3 Audit Logging and Monitoring +**Priority: Medium** + +#### Tasks: +- [ ] Add audit logging for file operations +- [ ] Create file access logs +- [ ] Add monitoring metrics +- [ ] Implement file usage analytics + +#### Deliverables: +```python +# Audit logging +class FileAuditLog(SQLModel, table=True) +def log_file_operation(user, file_id, operation, details) + +# Monitoring metrics +def track_file_upload(file_size, file_type) +def track_file_download(file_id, user_id) +``` + +#### Acceptance Criteria: +- [ ] All file operations are logged +- [ ] Audit logs are searchable +- [ ] Monitoring metrics are collected +- [ ] Usage analytics are available + +## Phase 5: Testing & Documentation (Week 9-10) + +### 5.1 Comprehensive Testing +**Priority: Critical** + +#### Tasks: +- [ ] Create unit tests for all services +- [ ] Add integration tests for API endpoints +- [ ] Create performance tests for file operations +- [ ] Add security tests for access control + +#### Deliverables: +```python +# Test files +tests/api/test_files.py +tests/services/test_file_service.py +tests/storage/test_storage_backends.py +tests/security/test_file_permissions.py +``` + +#### Acceptance Criteria: +- [ ] Test coverage > 90% +- [ ] All tests pass consistently +- [ ] Performance tests meet requirements +- [ ] Security tests validate access control + +### 5.2 API Documentation +**Priority: High** + +#### Tasks: +- [ ] Complete OpenAPI documentation +- [ ] Create usage examples +- [ ] Add integration guides +- [ ] Create troubleshooting documentation + +#### Deliverables: +- Complete API documentation +- Usage examples and tutorials +- Integration guides for different storage backends +- Troubleshooting 
and FAQ documentation
+
+#### Acceptance Criteria:
+- [ ] API documentation is complete and accurate
+- [ ] Examples work correctly
+- [ ] Integration guides are clear
+- [ ] Documentation is accessible to developers
+
+## Success Metrics
+
+### Performance Targets
+- File upload: < 2 seconds for files up to 100MB
+- File download: < 1 second for metadata, streaming for content
+- File listing: < 500ms for paginated results
+- Search: < 1 second for complex queries
+
+### Scalability Targets
+- Support for 10,000+ files per project/run
+- Handle 100+ concurrent file operations
+- Storage backend agnostic (local, S3, Azure, GCS)
+- Horizontal scaling capability
+
+### Security Requirements
+- All file operations require authentication
+- File-level access control
+- Audit logging for all operations
+- Secure file URLs with expiration
+
+## Risk Mitigation
+
+### Technical Risks
+1. **Large file handling**: Implement streaming uploads/downloads
+2. **Storage costs**: Add file lifecycle management
+3. **Performance**: Implement caching and CDN integration
+4. **Data consistency**: Use database transactions and validation
+
+### Operational Risks
+1. **Storage migration**: Create migration tools and procedures
+2. **Backup and recovery**: Implement automated backup strategies
+3. **Monitoring**: Add comprehensive logging and alerting
+4. **Documentation**: Maintain up-to-date documentation
+
+## Dependencies
+
+### External Dependencies
+- FastAPI for API framework
+- SQLModel for database models
+- Alembic for database migrations
+- Boto3 for AWS S3 integration
+- Azure SDK for Azure Blob Storage
+- Google Cloud SDK for GCS
+
+### Internal Dependencies
+- Existing authentication system
+- Database infrastructure
+- OpenSearch for file indexing
+- Monitoring and logging infrastructure
+
+This plan provides a structured approach to building a comprehensive Files API that integrates with the existing NGS360 system and delivers robust file management for both projects and runs.
\ No newline at end of file
diff --git a/main.py b/main.py
index 3c26cb1..6f2fc13 100644
--- a/main.py
+++ b/main.py
@@ -13,9 +13,10 @@
 from api.runs.routes import router as runs_router
 from api.samples.routes import router as samples_router
 from api.search.routes import router as search_router
+from api.files.routes import router as files_router
 from api.tools.routes import router as tools_router
 from api.vendors.routes import router as vendors_router
 from api.workflow.routes import router as workflow_router


 # Customize route id's
diff --git a/scripts/README_S3_Registration.md b/scripts/README_S3_Registration.md
new file mode 100644
index 0000000..0c25204
--- /dev/null
+++ b/scripts/README_S3_Registration.md
@@ -0,0 +1,193 @@
+# S3 Bulk File Registration
+
+This script allows you to register existing S3 files in your database without moving them. Files remain in S3 but become discoverable through the Files API.
+
+## Prerequisites
+
+1. **AWS Credentials**: Configure AWS credentials using one of these methods:
+   - AWS CLI: `aws configure`
+   - Environment variables: `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`
+   - IAM roles (if running on EC2)
+   - AWS credentials file
+
+2. **Required Permissions**: Your AWS credentials need the following S3 permissions:
+   - `s3:ListBucket` - to list objects in buckets
+   - `s3:GetObject` - to read object metadata
+   - `s3:HeadBucket` - to verify bucket access
+
+3. 
**Python Dependencies**: Install boto3 if not already installed: + ```bash + pip install boto3 + ``` + +## Usage Examples + +### 1. Single Bucket Registration + +Register all files from a specific bucket: +```bash +python scripts/register_s3_files.py --bucket my-ngs-data-bucket +``` + +Register files with a specific prefix: +```bash +python scripts/register_s3_files.py --bucket my-bucket --prefix projects/ +``` + +### 2. Dry Run Mode + +See what would be registered without making changes: +```bash +python scripts/register_s3_files.py --bucket my-bucket --dry-run +``` + +### 3. Configuration File Mode + +For multiple buckets or complex setups, use a configuration file: + +1. Create a configuration file (copy from `s3_registration_config.sample.json`): + ```bash + cp scripts/s3_registration_config.sample.json my_s3_config.json + ``` + +2. Edit the configuration file with your bucket details: + ```json + { + "buckets": [ + { + "name": "my-ngs-data-bucket", + "prefix": "projects/", + "entity_patterns": { + "project": "projects/([^/]+)", + "run": "runs/([^/]+)" + } + } + ], + "dry_run": false + } + ``` + +3. Run with the configuration: + ```bash + python scripts/register_s3_files.py --config my_s3_config.json + ``` + +### 4. Generate Sample Configuration + +Create a sample configuration file: +```bash +python scripts/register_s3_files.py --create-config +``` + +## Configuration File Format + +```json +{ + "buckets": [ + { + "name": "bucket-name", + "prefix": "optional/prefix/", + "entity_patterns": { + "project": "regex_pattern_to_extract_project_id", + "run": "regex_pattern_to_extract_run_id" + } + } + ], + "dry_run": true +} +``` + +### Entity Patterns + +Entity patterns are regex patterns used to extract project or run IDs from S3 object keys: + +- `"project": "projects/([^/]+)"` - Extracts project ID from paths like `projects/PROJ001/file.fastq` +- `"run": "runs/([^/]+)"` - Extracts run ID from paths like `runs/RUN123/data.bam` + +The captured group `([^/]+)` becomes the entity ID. 
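+
+For illustration, here is a minimal sketch of how such a pattern resolves an entity ID from an object key (the key and pattern below are made-up examples, not values taken from your buckets):
+
+```python
+import re
+
+# Hypothetical object key and pattern -- adjust both to your bucket layout
+s3_key = "projects/PROJ001/fastq/sample_R1.fastq.gz"
+pattern = r"projects/([^/]+)"
+
+match = re.search(pattern, s3_key)
+if match:
+    print(match.group(1))  # -> "PROJ001", used as the file's entity_id
+```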
+ +## File Type Detection + +The script automatically detects file types based on extensions: + +| Extension | File Type | +|-----------|-----------| +| `.fastq`, `.fastq.gz`, `.fq`, `.fq.gz` | FASTQ | +| `.bam`, `.sam` | BAM | +| `.vcf`, `.vcf.gz` | VCF | +| `.csv`, `.xlsx` | SAMPLESHEET | +| `.json` | METRICS | +| `.html`, `.pdf` | REPORT | +| `.log`, `.txt` | LOG | +| `.png`, `.jpg`, `.jpeg`, `.svg` | IMAGE | +| `.doc`, `.docx`, `.md` | DOCUMENT | +| Others | OTHER | + +## What Gets Registered + +For each S3 object, the script creates a database record with: + +- **File metadata**: filename, size, MIME type, upload date +- **S3 location**: `file_path` set to `s3://bucket/key` +- **Storage backend**: Set to `StorageBackend.S3` +- **Entity association**: Project or run ID extracted from path +- **File type**: Auto-detected from extension +- **Default settings**: `is_public=False`, `created_by="s3_bulk_import"` + +## Error Handling + +The script handles common issues: + +- **Duplicate files**: Skips files already registered (based on S3 URI) +- **Access errors**: Reports permission or bucket access issues +- **Invalid patterns**: Falls back to default entity extraction +- **Database errors**: Rolls back failed registrations + +## Monitoring Progress + +The script provides progress updates every 100 files and a final summary: + +``` +Progress: 500 scanned, 450 registered, 30 skipped, 20 errors +========================================== +REGISTRATION SUMMARY +========================================== +Files scanned: 1000 +Files registered: 850 +Files skipped: 100 (already registered) +Errors: 50 (permission/validation issues) +``` + +## Best Practices + +1. **Start with dry run**: Always test with `--dry-run` first +2. **Use specific prefixes**: Limit scope with `--prefix` to avoid scanning entire buckets +3. **Monitor logs**: Check for permission or pattern matching issues +4. **Backup database**: Consider backing up before large imports +5. **Run incrementally**: Process buckets/prefixes separately for better control + +## Troubleshooting + +### Common Issues + +**"AWS credentials not found"** +- Configure AWS credentials using `aws configure` or environment variables + +**"Access denied to bucket"** +- Verify your AWS credentials have the required S3 permissions +- Check bucket policies and IAM roles + +**"No files registered"** +- Verify bucket name and prefix are correct +- Check entity patterns match your S3 structure +- Use `--dry-run` to see what would be processed + +**"Database connection failed"** +- Ensure your database is running and accessible +- Check `SQLALCHEMY_DATABASE_URI` environment variable + +### Getting Help + +Run the script with `--help` for detailed usage information: +```bash +python scripts/register_s3_files.py --help \ No newline at end of file diff --git a/scripts/register_s3_files.py b/scripts/register_s3_files.py new file mode 100644 index 0000000..070579e --- /dev/null +++ b/scripts/register_s3_files.py @@ -0,0 +1,414 @@ +#!/usr/bin/env python +""" +Bulk registration script for existing S3 files. + +This script scans S3 buckets and registers existing files in the database +without moving them. Files remain in S3 but become discoverable through the API. + +Usage: + PYTHONPATH=. 
+ python scripts/register_s3_files.py --bucket my-bucket --prefix data/ + python scripts/register_s3_files.py --config s3_config.json + python scripts/register_s3_files.py --bucket my-bucket --dry-run +""" + +import argparse +import json +import mimetypes +import re +import sys +from datetime import datetime, timezone +from pathlib import Path +from typing import Dict, Tuple + +import boto3 +from botocore.exceptions import ClientError, NoCredentialsError +from sqlmodel import Session, select + +# Add project root to path for imports +# sys.path.append(str(Path(__file__).parent.parent)) + +from api.files.models import FileCreate, FileType, EntityType, StorageBackend +from api.files.services import create_file +from core.db import get_session +from core.logger import logger + + +class S3FileRegistrar: + """Handles bulk registration of S3 files to the database.""" + + def __init__(self, session: Session, dry_run: bool = False): + self.session = session + self.dry_run = dry_run + self.stats = {"scanned": 0, "registered": 0, "skipped": 0, "errors": 0} + + # File type mapping based on extensions + self.file_type_mapping = { + ".fastq": FileType.FASTQ, + ".fastq.gz": FileType.FASTQ, + ".fq": FileType.FASTQ, + ".fq.gz": FileType.FASTQ, + ".bam": FileType.BAM, + ".sam": FileType.BAM, + ".vcf": FileType.VCF, + ".vcf.gz": FileType.VCF, + ".csv": FileType.SAMPLESHEET, + ".xlsx": FileType.SAMPLESHEET, + ".json": FileType.METRICS, + ".html": FileType.REPORT, + ".pdf": FileType.REPORT, + ".log": FileType.LOG, + ".txt": FileType.LOG, + ".png": FileType.IMAGE, + ".jpg": FileType.IMAGE, + ".jpeg": FileType.IMAGE, + ".svg": FileType.IMAGE, + ".doc": FileType.DOCUMENT, + ".docx": FileType.DOCUMENT, + ".md": FileType.DOCUMENT, + } + + def get_file_type(self, filename: str) -> FileType: + """Determine file type based on filename extension.""" + filename_lower = filename.lower() + + # Check for compound extensions first (e.g., .fastq.gz) + for ext, file_type in self.file_type_mapping.items(): + if filename_lower.endswith(ext): + return file_type + + return FileType.OTHER + + def extract_entity_info( + self, s3_key: str, patterns: Dict[str, str] + ) -> Tuple[EntityType, str]: + """ + Extract entity type and ID from S3 key using regex patterns. + + Args: + s3_key: S3 object key + patterns: Dict of entity_type -> regex pattern + + Returns: + Tuple of (EntityType, entity_id) + """ + for entity_type_str, pattern in patterns.items(): + match = re.search(pattern, s3_key) + if match: + entity_id = match.group(1) + entity_type = ( + EntityType.PROJECT + if entity_type_str == "project" + else EntityType.RUN + ) + return entity_type, entity_id + + # Default fallback - extract from path structure + path_parts = s3_key.split("/") + if len(path_parts) >= 2: + # Assume format like: projects/PROJ001/... or runs/RUN001/... 
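+            # Only reached when no entity_patterns regex matched above;
+            # e.g. "projects/PROJ001/data.fastq" -> (EntityType.PROJECT, "PROJ001")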
+ if path_parts[0].lower() in ["projects", "project"]: + return EntityType.PROJECT, path_parts[1] + elif path_parts[0].lower() in ["runs", "run", "sequencing_runs"]: + return EntityType.RUN, path_parts[1] + + # Final fallback - use first directory as project + return EntityType.PROJECT, path_parts[0] if path_parts else "unknown" + + def file_already_registered(self, s3_uri: str) -> bool: + """Check if file is already registered in database.""" + from api.files.models import File + + existing = self.session.exec( + select(File).where(File.file_path == s3_uri) + ).first() + + return existing is not None + + def register_s3_object( + self, bucket: str, obj: dict, entity_patterns: Dict[str, str] + ) -> bool: + """ + Register a single S3 object in the database. + + Args: + bucket: S3 bucket name + obj: S3 object metadata from boto3 + entity_patterns: Regex patterns for extracting entity info + + Returns: + True if registered successfully, False otherwise + """ + s3_key = obj["Key"] + filename = Path(s3_key).name + s3_uri = f"s3://{bucket}/{s3_key}" + + # Skip directories + if s3_key.endswith("/"): + return False + + # Skip if already registered + if self.file_already_registered(s3_uri): + logger.debug(f"File already registered: {s3_uri}") + self.stats["skipped"] += 1 + return False + + # Extract entity information + entity_type, entity_id = self.extract_entity_info(s3_key, entity_patterns) + + # Determine file type + file_type = self.get_file_type(filename) + + # Get MIME type + mime_type, _ = mimetypes.guess_type(filename) + + # Create file record + file_create = FileCreate( + filename=filename, + original_filename=filename, + description=f"Imported from S3: {s3_key}", + file_type=file_type, + entity_type=entity_type, + entity_id=entity_id, + is_public=False, # Default to private + created_by="s3_bulk_import", + ) + + if self.dry_run: + logger.info( + f"[DRY RUN] Would register: {s3_uri} -> {entity_type.value}/{entity_id}" + ) + self.stats["registered"] += 1 + return True + + try: + # Register file in database + file_record = create_file( + session=self.session, + file_create=file_create, + file_content=None, # No content upload, file stays in S3 + storage_root="storage", # Use default storage settings + ) + + # Update the file record with S3-specific information + file_record.file_path = s3_uri + file_record.storage_backend = StorageBackend.S3 + file_record.file_size = obj.get("Size", 0) + file_record.mime_type = mime_type + file_record.upload_date = obj.get( + "LastModified", datetime.now(timezone.utc) + ) + + self.session.add(file_record) + self.session.commit() + + logger.info(f"Registered: {s3_uri} -> {file_record.file_id}") + self.stats["registered"] += 1 + return True + + except Exception as e: + logger.error(f"Failed to register {s3_uri}: {e}") + self.stats["errors"] += 1 + self.session.rollback() + return False + + def scan_bucket( + self, bucket: str, prefix: str = "", entity_patterns: Dict[str, str] = None + ) -> None: + """ + Scan S3 bucket and register files. 
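+        Objects that are already registered (matched by their s3:// URI) are
+        skipped, and default project/run patterns are used when entity_patterns
+        is None.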
+ + Args: + bucket: S3 bucket name + prefix: S3 key prefix to filter objects + entity_patterns: Regex patterns for extracting entity info + """ + if entity_patterns is None: + entity_patterns = { + "project": r"(?:projects?|proj)/([^/]+)", + "run": r"(?:runs?|sequencing_runs?)/([^/]+)", + } + + try: + s3_client = boto3.client("s3") + + # Test bucket access + try: + s3_client.head_bucket(Bucket=bucket) + except ClientError as e: + error_code = e.response["Error"]["Code"] + if error_code == "404": + logger.error(f"Bucket '{bucket}' does not exist") + elif error_code == "403": + logger.error(f"Access denied to bucket '{bucket}'") + else: + logger.error(f"Error accessing bucket '{bucket}': {e}") + return + + logger.info(f"Scanning S3 bucket: s3://{bucket}/{prefix}") + + # List objects with pagination + paginator = s3_client.get_paginator("list_objects_v2") + page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix) + + for page in page_iterator: + if "Contents" not in page: + continue + + for obj in page["Contents"]: + self.stats["scanned"] += 1 + self.register_s3_object(bucket, obj, entity_patterns) + + # Progress logging + if self.stats["scanned"] % 100 == 0: + logger.info( + f"Progress: {self.stats['scanned']} scanned, " + f"{self.stats['registered']} registered, " + f"{self.stats['skipped']} skipped, " + f"{self.stats['errors']} errors" + ) + + except NoCredentialsError: + logger.error("AWS credentials not found. Please configure AWS credentials.") + except Exception as e: + logger.error(f"Error scanning bucket: {e}") + + def print_summary(self): + """Print registration summary.""" + logger.info("=" * 50) + logger.info("REGISTRATION SUMMARY") + logger.info("=" * 50) + logger.info(f"Files scanned: {self.stats['scanned']}") + logger.info(f"Files registered: {self.stats['registered']}") + logger.info(f"Files skipped: {self.stats['skipped']}") + logger.info(f"Errors: {self.stats['errors']}") + + if self.dry_run: + logger.info("\n*** DRY RUN MODE - No files were actually registered ***") + + +def load_config(config_file: str) -> dict: + """Load configuration from JSON file.""" + try: + with open(config_file, "r") as f: + return json.load(f) + except FileNotFoundError: + logger.error(f"Configuration file not found: {config_file}") + sys.exit(1) + except json.JSONDecodeError as e: + logger.error(f"Invalid JSON in configuration file: {e}") + sys.exit(1) + + +def create_sample_config(): + """Create a sample configuration file.""" + config = { + "buckets": [ + { + "name": "my-ngs-data-bucket", + "prefix": "projects/", + "entity_patterns": { + "project": r"projects/([^/]+)", + "run": r"runs/([^/]+)", + }, + }, + { + "name": "my-runs-bucket", + "prefix": "sequencing_runs/", + "entity_patterns": {"run": r"sequencing_runs/([^/]+)"}, + }, + ], + "dry_run": True, + } + + with open("s3_registration_config.json", "w") as f: + json.dump(config, f, indent=2) + + logger.info("Sample configuration created: s3_registration_config.json") + logger.info("Edit this file with your S3 bucket details and run:") + logger.info( + "python scripts/register_s3_files.py --config s3_registration_config.json" + ) + + +def main(): + parser = argparse.ArgumentParser( + description="Bulk register existing S3 files in the database", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Register all files from a bucket + python scripts/register_s3_files.py --bucket my-bucket + + # Register files with specific prefix + python scripts/register_s3_files.py --bucket my-bucket --prefix projects/ + + 
# Dry run to see what would be registered + python scripts/register_s3_files.py --bucket my-bucket --dry-run + + # Use configuration file for multiple buckets + python scripts/register_s3_files.py --config s3_config.json + + # Create sample configuration file + python scripts/register_s3_files.py --create-config + """, + ) + + parser.add_argument("--bucket", help="S3 bucket name") + parser.add_argument("--prefix", default="", help="S3 key prefix to filter objects") + parser.add_argument("--config", help="JSON configuration file for multiple buckets") + parser.add_argument( + "--dry-run", + action="store_true", + help="Show what would be registered without making changes", + ) + parser.add_argument( + "--create-config", + action="store_true", + help="Create a sample configuration file", + ) + + args = parser.parse_args() + + if args.create_config: + create_sample_config() + return + + if not args.bucket and not args.config: + parser.error("Either --bucket or --config must be specified") + + # Get database session + try: + session = next(get_session()) + except Exception as e: + logger.error(f"Failed to create database session: {e}") + sys.exit(1) + + registrar = S3FileRegistrar(session, dry_run=args.dry_run) + + try: + if args.config: + # Load configuration and process multiple buckets + config = load_config(args.config) + dry_run = config.get("dry_run", args.dry_run) + registrar.dry_run = dry_run + + for bucket_config in config["buckets"]: + bucket = bucket_config["name"] + prefix = bucket_config.get("prefix", "") + entity_patterns = bucket_config.get("entity_patterns") + + logger.info(f"Processing bucket: {bucket}") + registrar.scan_bucket(bucket, prefix, entity_patterns) + + else: + # Single bucket mode + registrar.scan_bucket(args.bucket, args.prefix) + + finally: + registrar.print_summary() + session.close() + + +if __name__ == "__main__": + main() diff --git a/scripts/s3_registration_config.sample.json b/scripts/s3_registration_config.sample.json new file mode 100644 index 0000000..afbab7f --- /dev/null +++ b/scripts/s3_registration_config.sample.json @@ -0,0 +1,20 @@ +{ + "buckets": [ + { + "name": "my-ngs-data-bucket", + "prefix": "projects/", + "entity_patterns": { + "project": "projects/([^/]+)", + "run": "runs/([^/]+)" + } + }, + { + "name": "my-sequencing-runs-bucket", + "prefix": "sequencing_runs/", + "entity_patterns": { + "run": "sequencing_runs/([^/]+)" + } + } + ], + "dry_run": true +} \ No newline at end of file diff --git a/tests/api/test_files.py b/tests/api/test_files.py index 907be26..89ed757 100644 --- a/tests/api/test_files.py +++ b/tests/api/test_files.py @@ -2,6 +2,1154 @@ Test /files endpoint """ +import tempfile +import shutil +from pathlib import Path +from datetime import datetime, timezone + +import pytest +from fastapi import HTTPException +from fastapi.testclient import TestClient +from sqlmodel import Session + +from api.files.models import ( + FileCreate, + FileUpdate, + FileType, + EntityType, + StorageBackend, + FileBrowserData, + FileBrowserFolder, + FileBrowserFile, +) +from api.files.services import ( + create_file, + get_file, + update_file, + delete_file, + list_files, + get_file_content, + list_files_for_entity, + get_file_count_for_entity, + generate_file_id, + generate_file_path, + calculate_file_checksum, + get_mime_type, + browse_filesystem, + list_files_as_browser_data, +) + + +class TestFileModels: + """Test file model functionality""" + + def test_file_type_enum(self): + """Test FileType enum values""" + assert FileType.FASTQ == "fastq" + 
assert FileType.BAM == "bam" + assert FileType.VCF == "vcf" + assert FileType.SAMPLESHEET == "samplesheet" + assert FileType.METRICS == "metrics" + assert FileType.REPORT == "report" + assert FileType.LOG == "log" + assert FileType.IMAGE == "image" + assert FileType.DOCUMENT == "document" + assert FileType.OTHER == "other" + + def test_entity_type_enum(self): + """Test EntityType enum values""" + assert EntityType.PROJECT == "project" + assert EntityType.RUN == "run" + + def test_storage_backend_enum(self): + """Test StorageBackend enum values""" + assert StorageBackend.LOCAL == "local" + assert StorageBackend.S3 == "s3" + assert StorageBackend.AZURE == "azure" + assert StorageBackend.GCS == "gcs" + + def test_file_create_model(self): + """Test FileCreate model validation""" + file_create = FileCreate( + filename="test.txt", + description="Test file", + file_type=FileType.DOCUMENT, + entity_type=EntityType.PROJECT, + entity_id="PROJ001", + is_public=True, + created_by="testuser", + ) + + assert file_create.filename == "test.txt" + assert file_create.description == "Test file" + assert file_create.file_type == FileType.DOCUMENT + assert file_create.entity_type == EntityType.PROJECT + assert file_create.entity_id == "PROJ001" + assert file_create.is_public is True + assert file_create.created_by == "testuser" + + def test_file_update_model(self): + """Test FileUpdate model validation""" + file_update = FileUpdate( + filename="updated.txt", description="Updated description", is_public=False + ) + + assert file_update.filename == "updated.txt" + assert file_update.description == "Updated description" + assert file_update.is_public is False + + +class TestFileServices: + """Test file service functions""" + + @pytest.fixture + def temp_storage(self): + """Create temporary storage directory""" + temp_dir = tempfile.mkdtemp() + yield temp_dir + shutil.rmtree(temp_dir) + + def test_generate_file_id(self): + """Test file ID generation""" + file_id = generate_file_id() + assert len(file_id) == 12 + assert file_id.isalnum() + + # Test uniqueness + file_id2 = generate_file_id() + assert file_id != file_id2 + + def test_generate_file_path(self): + """Test file path generation""" + path = generate_file_path( + EntityType.PROJECT, "PROJ001", FileType.FASTQ, "sample.fastq" + ) + + # Should contain entity type, entity id, file type, year, month, filename + path_parts = path.split("/") + assert len(path_parts) == 6 + assert path_parts[0] == "project" + assert path_parts[1] == "PROJ001" + assert path_parts[2] == "fastq" + assert path_parts[5] == "sample.fastq" + + # Year and month should be current + now = datetime.now(timezone.utc) + assert path_parts[3] == now.strftime("%Y") + assert path_parts[4] == now.strftime("%m") + + def test_calculate_file_checksum(self): + """Test file checksum calculation""" + content = b"Hello, World!" 
+ checksum = calculate_file_checksum(content) + + # Should be SHA-256 hash + assert len(checksum) == 64 + assert ( + checksum + == "dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f" + ) + + def test_get_mime_type(self): + """Test MIME type detection""" + assert get_mime_type("test.txt") == "text/plain" + assert get_mime_type("test.pdf") == "application/pdf" + assert get_mime_type("test.jpg") == "image/jpeg" + assert ( + get_mime_type("test.fastq") == "application/octet-stream" + ) # Unknown extension + + def test_create_file_without_content(self, session: Session, temp_storage): + """Test creating file record without content""" + file_create = FileCreate( + filename="test.txt", + description="Test file", + file_type=FileType.DOCUMENT, + entity_type=EntityType.PROJECT, + entity_id="PROJ001", + created_by="testuser", + ) + + file_record = create_file(session, file_create, storage_root=temp_storage) + + assert file_record.filename == "test.txt" + assert file_record.description == "Test file" + assert file_record.file_type == FileType.DOCUMENT + assert file_record.entity_type == EntityType.PROJECT + assert file_record.entity_id == "PROJ001" + assert file_record.created_by == "testuser" + assert file_record.file_size is None + assert file_record.checksum is None + assert file_record.mime_type == "text/plain" + assert len(file_record.file_id) == 12 + + def test_create_file_with_content(self, session: Session, temp_storage): + """Test creating file record with content""" + content = b"Hello, World!" + file_create = FileCreate( + filename="test.txt", + description="Test file", + file_type=FileType.DOCUMENT, + entity_type=EntityType.PROJECT, + entity_id="PROJ001", + ) + + file_record = create_file( + session, file_create, content, storage_root=temp_storage + ) + + assert file_record.file_size == len(content) + assert file_record.checksum == calculate_file_checksum(content) + + # Check that file was actually written + file_path = Path(temp_storage) / file_record.file_path + assert file_path.exists() + assert file_path.read_bytes() == content + + def test_get_file(self, session: Session, temp_storage): + """Test getting file by file_id""" + file_create = FileCreate( + filename="test.txt", entity_type=EntityType.PROJECT, entity_id="PROJ001" + ) + + created_file = create_file(session, file_create, storage_root=temp_storage) + retrieved_file = get_file(session, created_file.file_id) + + assert retrieved_file.id == created_file.id + assert retrieved_file.file_id == created_file.file_id + assert retrieved_file.filename == created_file.filename + + def test_get_file_not_found(self, session: Session): + """Test getting non-existent file""" + with pytest.raises(Exception) as exc_info: + get_file(session, "nonexistent") + + assert "not found" in str(exc_info.value) + + def test_update_file(self, session: Session, temp_storage): + """Test updating file metadata""" + file_create = FileCreate( + filename="test.txt", + description="Original description", + entity_type=EntityType.PROJECT, + entity_id="PROJ001", + ) + + created_file = create_file(session, file_create, storage_root=temp_storage) + + file_update = FileUpdate( + filename="updated.txt", description="Updated description", is_public=True + ) + + updated_file = update_file(session, created_file.file_id, file_update) + + assert updated_file.filename == "updated.txt" + assert updated_file.description == "Updated description" + assert updated_file.is_public is True + + def test_delete_file(self, session: Session, temp_storage): + """Test 
deleting file and content""" + content = b"Hello, World!" + file_create = FileCreate( + filename="test.txt", entity_type=EntityType.PROJECT, entity_id="PROJ001" + ) + + created_file = create_file( + session, file_create, content, storage_root=temp_storage + ) + file_path = Path(temp_storage) / created_file.file_path + + # Verify file exists + assert file_path.exists() + + # Delete file + result = delete_file(session, created_file.file_id, storage_root=temp_storage) + assert result is True + + # Verify file is deleted + assert not file_path.exists() + + # Verify database record is deleted + with pytest.raises(Exception): + get_file(session, created_file.file_id) + + def test_list_files_empty(self, session: Session): + """Test listing files when none exist""" + result = list_files(session) + + assert result.total_items == 0 + assert result.total_pages == 0 + assert result.current_page == 1 + assert result.per_page == 20 + assert result.has_next is False + assert result.has_prev is False + assert len(result.data) == 0 + + def test_list_files_with_data(self, session: Session, temp_storage): + """Test listing files with data""" + # Create test files + for i in range(3): + file_create = FileCreate( + filename=f"test{i}.txt", + description=f"Test file {i}", + entity_type=EntityType.PROJECT, + entity_id="PROJ001", + file_type=FileType.DOCUMENT, + ) + create_file(session, file_create, storage_root=temp_storage) + + result = list_files(session) + + assert result.total_items == 3 + assert result.total_pages == 1 + assert result.current_page == 1 + assert result.per_page == 20 + assert result.has_next is False + assert result.has_prev is False + assert len(result.data) == 3 + + def test_list_files_for_entity(self, session: Session, temp_storage): + """Test listing files for specific entity""" + # Create files for different entities + for entity_id in ["PROJ001", "PROJ002"]: + for i in range(2): + file_create = FileCreate( + filename=f"test{i}.txt", + entity_type=EntityType.PROJECT, + entity_id=entity_id, + ) + create_file(session, file_create, storage_root=temp_storage) + + # List files for PROJ001 + result = list_files_for_entity(session, EntityType.PROJECT, "PROJ001") + + assert result.total_items == 2 + assert all(file.entity_id == "PROJ001" for file in result.data) + + def test_get_file_count_for_entity(self, session: Session, temp_storage): + """Test getting file count for entity""" + # Create files for entity + for i in range(3): + file_create = FileCreate( + filename=f"test{i}.txt", + entity_type=EntityType.PROJECT, + entity_id="PROJ001", + ) + create_file(session, file_create, storage_root=temp_storage) + + count = get_file_count_for_entity(session, EntityType.PROJECT, "PROJ001") + assert count == 3 + + def test_get_file_content(self, session: Session, temp_storage): + """Test retrieving file content""" + content = b"Hello, World!" 
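+        # Round-trip check: content stored via create_file should be returned by get_file_content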
+ file_create = FileCreate( + filename="test.txt", entity_type=EntityType.PROJECT, entity_id="PROJ001" + ) + + created_file = create_file( + session, file_create, content, storage_root=temp_storage + ) + retrieved_content = get_file_content( + session, created_file.file_id, storage_root=temp_storage + ) + + assert retrieved_content == content + + def test_get_file_content_not_found(self, session: Session, temp_storage): + """Test retrieving content for non-existent file""" + file_create = FileCreate( + filename="test.txt", entity_type=EntityType.PROJECT, entity_id="PROJ001" + ) + + # Create file record without content + created_file = create_file(session, file_create, storage_root=temp_storage) + + with pytest.raises(Exception) as exc_info: + get_file_content(session, created_file.file_id, storage_root=temp_storage) + + assert "not found" in str(exc_info.value) + + +class TestFileAPI: + """Test file API endpoints""" + + def test_create_file_endpoint(self, client: TestClient, session: Session): + """Test file creation endpoint""" + file_data = { + "filename": "test_api.txt", + "description": "Test file via API", + "file_type": "document", + "entity_type": "project", + "entity_id": "PROJ001", + "is_public": "true", # Form data sends as string + "created_by": "api_test_user", + } + + response = client.post("/api/v1/files", data=file_data) + assert response.status_code == 201 + + data = response.json() + assert data["filename"] == "test_api.txt" + assert data["description"] == "Test file via API" + assert data["file_type"] == "document" + assert data["entity_type"] == "project" + assert data["entity_id"] == "PROJ001" + assert data["is_public"] is True + assert data["created_by"] == "api_test_user" + assert "file_id" in data + assert "upload_date" in data + + def test_get_files_endpoint(self, client: TestClient, session: Session): + """Test file listing endpoint""" + # First create some test files + for i in range(3): + file_data = { + "filename": f"test_list_{i}.txt", + "description": f"Test file {i}", + "file_type": "document", + "entity_type": "project", + "entity_id": "PROJ001", + "created_by": "list_test_user", + } + client.post("/api/v1/files", data=file_data) + + # Test basic listing + response = client.get("/api/v1/files") + assert response.status_code == 200 + + data = response.json() + assert "data" in data + assert "total_items" in data + assert "current_page" in data + assert "per_page" in data + assert data["total_items"] >= 3 + assert len(data["data"]) >= 3 + + # Test pagination + response = client.get("/api/v1/files?page=1&per_page=2") + assert response.status_code == 200 + data = response.json() + assert data["current_page"] == 1 + assert data["per_page"] == 2 + assert len(data["data"]) <= 2 + + # Test filtering by entity + response = client.get("/api/v1/files?entity_type=project&entity_id=PROJ001") + assert response.status_code == 200 + data = response.json() + for item in data["data"]: + assert item["entity_type"] == "project" + assert item["entity_id"] == "PROJ001" + + # Test filtering by file type + response = client.get("/api/v1/files?file_type=document") + assert response.status_code == 200 + data = response.json() + for item in data["data"]: + assert item["file_type"] == "document" + + # Test search functionality + response = client.get("/api/v1/files?search=test_list_1") + assert response.status_code == 200 + data = response.json() + assert data["total_items"] >= 1 + + def test_get_file_endpoint(self, client: TestClient, session: Session): + """Test file retrieval 
endpoint""" + # Create a test file + file_data = { + "filename": "test_get.txt", + "description": "Test file for GET", + "file_type": "document", + "entity_type": "project", + "entity_id": "PROJ001", + "created_by": "get_test_user", + } + + create_response = client.post("/api/v1/files", data=file_data) + assert create_response.status_code == 201 + created_file = create_response.json() + file_id = created_file["file_id"] + + # Test successful retrieval + response = client.get(f"/api/v1/files/{file_id}") + assert response.status_code == 200 + + data = response.json() + assert data["file_id"] == file_id + assert data["filename"] == "test_get.txt" + assert data["description"] == "Test file for GET" + + # Test non-existent file + response = client.get("/api/v1/files/non-existent-id") + assert response.status_code == 404 + + def test_update_file_endpoint(self, client: TestClient, session: Session): + """Test file update endpoint""" + # Create a test file + file_data = { + "filename": "test_update.txt", + "description": "Original description", + "file_type": "document", + "entity_type": "project", + "entity_id": "PROJ001", + "created_by": "update_test_user", + } + + create_response = client.post("/api/v1/files", data=file_data) + assert create_response.status_code == 201 + created_file = create_response.json() + file_id = created_file["file_id"] + + # Test successful update + update_data = {"description": "Updated description", "is_public": True} + + response = client.put(f"/api/v1/files/{file_id}", json=update_data) + assert response.status_code == 200 + + data = response.json() + assert data["file_id"] == file_id + assert data["description"] == "Updated description" + assert data["is_public"] is True + assert data["filename"] == "test_update.txt" # Should remain unchanged + + # Test non-existent file update + response = client.put("/api/v1/files/non-existent-id", json=update_data) + assert response.status_code == 404 + + def test_delete_file_endpoint(self, client: TestClient, session: Session): + """Test file deletion endpoint""" + # Create a test file + file_data = { + "filename": "test_delete.txt", + "description": "Test file for deletion", + "file_type": "document", + "entity_type": "project", + "entity_id": "PROJ001", + "created_by": "delete_test_user", + } + + create_response = client.post("/api/v1/files", data=file_data) + assert create_response.status_code == 201 + created_file = create_response.json() + file_id = created_file["file_id"] + + # Test successful deletion + response = client.delete(f"/api/v1/files/{file_id}") + assert response.status_code == 204 + + # Verify file is deleted by trying to get it + get_response = client.get(f"/api/v1/files/{file_id}") + assert get_response.status_code == 404 + + # Test non-existent file deletion + response = client.delete("/api/v1/files/non-existent-id") + assert response.status_code == 404 + + def test_list_files_for_entity_endpoint(self, client: TestClient, session: Session): + """Test listing files for a specific entity""" + # Create files for different entities + entities = [ + ("project", "PROJ001"), + ("project", "PROJ002"), + ("run", "190110_MACHINE123_0001_FLOWCELL123"), + ] + + for entity_type, entity_id in entities: + for i in range(2): + file_data = { + "filename": f"entity_test_{i}.txt", + "description": f"Test file {i} for {entity_type} {entity_id}", + "file_type": "document", + "entity_type": entity_type, + "entity_id": entity_id, + "created_by": "entity_test_user", + } + client.post("/api/v1/files", data=file_data) + + # Test 
listing files for specific project + response = client.get("/api/v1/files/entity/project/PROJ001") + assert response.status_code == 200 + + data = response.json() + assert data["total_items"] == 2 + for item in data["data"]: + assert item["entity_type"] == "project" + assert item["entity_id"] == "PROJ001" + + # Test listing files for specific run + response = client.get( + "/api/v1/files/entity/run/190110_MACHINE123_0001_FLOWCELL123" + ) + assert response.status_code == 200 + + data = response.json() + assert data["total_items"] == 2 + for item in data["data"]: + assert item["entity_type"] == "run" + assert item["entity_id"] == "190110_MACHINE123_0001_FLOWCELL123" + + # Test pagination for entity files + response = client.get("/api/v1/files/entity/project/PROJ001?page=1&per_page=1") + assert response.status_code == 200 + data = response.json() + assert data["current_page"] == 1 + assert data["per_page"] == 1 + assert len(data["data"]) == 1 + + def test_get_file_count_for_entity_endpoint( + self, client: TestClient, session: Session + ): + """Test getting file count for a specific entity""" + # Create files for a specific entity + entity_type = "project" + entity_id = "PROJ_COUNT_TEST" + + for i in range(5): + file_data = { + "filename": f"count_test_{i}.txt", + "description": f"Count test file {i}", + "file_type": "document", + "entity_type": entity_type, + "entity_id": entity_id, + "created_by": "count_test_user", + } + client.post("/api/v1/files", data=file_data) + + # Test file count endpoint + response = client.get(f"/api/v1/files/entity/{entity_type}/{entity_id}/count") + assert response.status_code == 200 + + data = response.json() + assert data["entity_type"] == entity_type + assert data["entity_id"] == entity_id + assert data["file_count"] == 5 + + # Test count for entity with no files + response = client.get("/api/v1/files/entity/project/EMPTY_PROJECT/count") + assert response.status_code == 200 + + data = response.json() + assert data["entity_type"] == "project" + assert data["entity_id"] == "EMPTY_PROJECT" + assert data["file_count"] == 0 + + def test_create_file_with_content_endpoint( + self, client: TestClient, session: Session + ): + """Test file creation with content upload""" + import io + + # Create file data + file_data = { + "filename": "test_with_content.txt", + "description": "Test file with content", + "file_type": "document", + "entity_type": "project", + "entity_id": "PROJ001", + "created_by": "content_test_user", + } + + # Create file content + file_content = b"Hello, this is test content!" 
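+        # Passing files= together with data= makes the TestClient send a multipart/form-data request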
+ files = { + "content": ("test_content.txt", io.BytesIO(file_content), "text/plain") + } + + # Send multipart form data + response = client.post("/api/v1/files", data=file_data, files=files) + assert response.status_code == 201 + + data = response.json() + assert data["filename"] == "test_with_content.txt" + assert data["file_size"] == len(file_content) + assert data["mime_type"] == "text/plain" + + def test_error_handling(self, client: TestClient, session: Session): + """Test API error handling""" + # Test invalid file type + invalid_file_data = { + "filename": "test.txt", + "file_type": "invalid_type", + "entity_type": "project", + "entity_id": "PROJ001", + } + + response = client.post("/api/v1/files", data=invalid_file_data) + assert response.status_code == 422 # Validation error + + # Test invalid entity type + invalid_entity_data = { + "filename": "test.txt", + "file_type": "document", + "entity_type": "invalid_entity", + "entity_id": "PROJ001", + } + + response = client.post("/api/v1/files", data=invalid_entity_data) + assert response.status_code == 422 # Validation error + + +class TestFileIntegration: + """Integration tests for file operations""" + + @pytest.fixture + def temp_storage(self): + """Create temporary storage directory""" + temp_dir = tempfile.mkdtemp() + yield temp_dir + shutil.rmtree(temp_dir) + + def test_complete_file_lifecycle(self, session: Session, temp_storage): + """Test complete file lifecycle: create, read, update, delete""" + content = b"Initial content" + + # Create file + file_create = FileCreate( + filename="lifecycle.txt", + description="Lifecycle test file", + file_type=FileType.DOCUMENT, + entity_type=EntityType.PROJECT, + entity_id="PROJ001", + created_by="testuser", + ) + + created_file = create_file( + session, file_create, content, storage_root=temp_storage + ) + assert created_file.filename == "lifecycle.txt" + assert created_file.file_size == len(content) + + # Read file + retrieved_file = get_file(session, created_file.file_id) + assert retrieved_file.id == created_file.id + + retrieved_content = get_file_content( + session, created_file.file_id, storage_root=temp_storage + ) + assert retrieved_content == content + + # Update file metadata + file_update = FileUpdate( + description="Updated lifecycle test file", is_public=True + ) + + updated_file = update_file(session, created_file.file_id, file_update) + assert updated_file.description == "Updated lifecycle test file" + assert updated_file.is_public is True + + # Delete file + result = delete_file(session, created_file.file_id, storage_root=temp_storage) + assert result is True + + # Verify deletion + with pytest.raises(Exception): + get_file(session, created_file.file_id) + + def test_multiple_entities_file_management(self, session: Session, temp_storage): + """Test file management across multiple entities""" + entities = [ + (EntityType.PROJECT, "PROJ001"), + (EntityType.PROJECT, "PROJ002"), + (EntityType.RUN, "190110_MACHINE123_0001_FLOWCELL123"), + ] + + created_files = [] + + # Create files for different entities + for entity_type, entity_id in entities: + for i in range(2): + file_create = FileCreate( + filename=f"file{i}.txt", + entity_type=entity_type, + entity_id=entity_id, + file_type=FileType.DOCUMENT, + ) + + file_record = create_file( + session, file_create, storage_root=temp_storage + ) + created_files.append(file_record) + + # Verify total count + all_files = list_files(session) + assert all_files.total_items == 6 + + # Verify entity-specific counts + for entity_type, entity_id 
in entities: + entity_files = list_files_for_entity(session, entity_type, entity_id) + assert entity_files.total_items == 2 + + count = get_file_count_for_entity(session, entity_type, entity_id) + assert count == 2 + + def test_file_type_filtering(self, session: Session, temp_storage): + """Test filtering files by type""" + file_types = [FileType.FASTQ, FileType.BAM, FileType.VCF, FileType.DOCUMENT] + + # Create files of different types + for file_type in file_types: + file_create = FileCreate( + filename=f"test.{file_type.value}", + entity_type=EntityType.PROJECT, + entity_id="PROJ001", + file_type=file_type, + ) + create_file(session, file_create, storage_root=temp_storage) + + # Test filtering by each type + for file_type in file_types: + from api.files.models import FileFilters + + filters = FileFilters(file_type=file_type) + result = list_files(session, filters=filters) + + assert result.total_items == 1 + assert result.data[0].file_type == file_type + + def test_samplesheet_dual_storage_success(self, session: Session, temp_storage): + """Test that samplesheets are saved to both database storage and run folder""" + from api.runs.models import SequencingRun, RunStatus + from datetime import date + + # Create a test run with run_folder_uri + run = SequencingRun( + run_date=date(2019, 1, 10), + machine_id="MACHINE123", + run_number=1, + flowcell_id="FLOWCELL123", + experiment_name="Test Experiment", + run_folder_uri=f"{temp_storage}/run_folder", + status=RunStatus.READY, + ) + session.add(run) + session.commit() + session.refresh(run) + + # Create run folder directory + run_folder = Path(temp_storage) / "run_folder" + run_folder.mkdir(parents=True, exist_ok=True) + + # Create samplesheet content + samplesheet_content = b"""[Header] +IEMFileVersion,4 +Investigator Name,Test User + +[Data] +Sample_ID,Sample_Name,index +Sample1,Sample1,ATCG +""" + + # Create samplesheet file + file_create = FileCreate( + filename="SampleSheet.csv", + description="Test samplesheet", + file_type=FileType.SAMPLESHEET, + entity_type=EntityType.RUN, + entity_id=run.barcode, + created_by="testuser", + ) + + created_file = create_file( + session, file_create, samplesheet_content, storage_root=temp_storage + ) + + # Verify file was created in database storage + db_file_path = Path(temp_storage) / created_file.file_path + assert db_file_path.exists() + assert db_file_path.read_bytes() == samplesheet_content + + # Verify file was also saved to run folder + run_folder_samplesheet = run_folder / "SampleSheet.csv" + assert run_folder_samplesheet.exists() + assert run_folder_samplesheet.read_bytes() == samplesheet_content + + # Verify description indicates dual storage success + assert "[Dual-stored to run folder]" in created_file.description + + def test_samplesheet_dual_storage_no_run_folder( + self, session: Session, temp_storage + ): + """Test samplesheet upload when run has no run_folder_uri""" + from api.runs.models import SequencingRun, RunStatus + from datetime import date + + # Create a test run WITHOUT run_folder_uri + run = SequencingRun( + run_date=date(2019, 1, 10), + machine_id="MACHINE123", + run_number=2, + flowcell_id="FLOWCELL456", + experiment_name="Test Experiment No URI", + run_folder_uri=None, # No URI + status=RunStatus.READY, + ) + session.add(run) + session.commit() + session.refresh(run) + + samplesheet_content = b"Sample samplesheet content" + + # Create samplesheet file + file_create = FileCreate( + filename="SampleSheet.csv", + description="Test samplesheet", + file_type=FileType.SAMPLESHEET, 
+ entity_type=EntityType.RUN, + entity_id=run.barcode, + created_by="testuser", + ) + + created_file = create_file( + session, file_create, samplesheet_content, storage_root=temp_storage + ) + + # Verify file was created in database storage + db_file_path = Path(temp_storage) / created_file.file_path + assert db_file_path.exists() + + # Verify description indicates database-only storage + assert ( + "[Database-only storage - run folder write failed]" + in created_file.description + ) + + def test_samplesheet_dual_storage_invalid_path( + self, session: Session, temp_storage + ): + """Test samplesheet upload when run_folder_uri has invalid format""" + from api.runs.models import SequencingRun, RunStatus + from datetime import date + + # Create a test run with invalid run_folder_uri + run = SequencingRun( + run_date=date(2019, 1, 10), + machine_id="MACHINE123", + run_number=3, + flowcell_id="FLOWCELL789", + experiment_name="Test Experiment Invalid Path", + run_folder_uri="invalid://path/format", # Invalid format + status=RunStatus.READY, + ) + session.add(run) + session.commit() + session.refresh(run) + + samplesheet_content = b"Sample samplesheet content" + + # Create samplesheet file + file_create = FileCreate( + filename="SampleSheet.csv", + description="Test samplesheet", + file_type=FileType.SAMPLESHEET, + entity_type=EntityType.RUN, + entity_id=run.barcode, + created_by="testuser", + ) + + created_file = create_file( + session, file_create, samplesheet_content, storage_root=temp_storage + ) + + # Verify file was created in database storage + db_file_path = Path(temp_storage) / created_file.file_path + assert db_file_path.exists() + + # Verify description indicates database-only storage + assert ( + "[Database-only storage - run folder write failed]" + in created_file.description + ) + + def test_non_samplesheet_files_not_dual_stored( + self, session: Session, temp_storage + ): + """Test that non-samplesheet files are not dual-stored""" + from api.runs.models import SequencingRun, RunStatus + from datetime import date + + # Create a test run with run_folder_uri + run = SequencingRun( + run_date=date(2019, 1, 10), + machine_id="MACHINE123", + run_number=4, + flowcell_id="FLOWCELL999", + experiment_name="Test Experiment", + run_folder_uri=f"{temp_storage}/run_folder_2", + status=RunStatus.READY, + ) + session.add(run) + session.commit() + session.refresh(run) + + # Create run folder directory + run_folder = Path(temp_storage) / "run_folder_2" + run_folder.mkdir(parents=True, exist_ok=True) + + # Create a NON-samplesheet file for the run + file_content = b"Some other file content" + file_create = FileCreate( + filename="metrics.json", + description="Test metrics file", + file_type=FileType.METRICS, # NOT a samplesheet + entity_type=EntityType.RUN, + entity_id=run.barcode, + created_by="testuser", + ) + + created_file = create_file( + session, file_create, file_content, storage_root=temp_storage + ) + + # Verify file was created in database storage + db_file_path = Path(temp_storage) / created_file.file_path + assert db_file_path.exists() + + # Verify file was NOT saved to run folder + run_folder_file = run_folder / "SampleSheet.csv" + assert not run_folder_file.exists() + + # Verify description does NOT contain dual storage note + assert "[Dual-stored to run folder]" not in (created_file.description or "") + assert "[Database-only storage" not in (created_file.description or "") + + def test_samplesheet_for_project_not_dual_stored( + self, session: Session, temp_storage + ): + """Test that 
samplesheets for projects are not dual-stored""" + # Create a samplesheet for a PROJECT (not a run) + samplesheet_content = b"Sample samplesheet content" + file_create = FileCreate( + filename="SampleSheet.csv", + description="Project samplesheet", + file_type=FileType.SAMPLESHEET, + entity_type=EntityType.PROJECT, # PROJECT, not RUN + entity_id="PROJ001", + created_by="testuser", + ) + + created_file = create_file( + session, file_create, samplesheet_content, storage_root=temp_storage + ) + + # Verify file was created in database storage + db_file_path = Path(temp_storage) / created_file.file_path + assert db_file_path.exists() + + # Verify description does NOT contain dual storage note + assert "[Dual-stored to run folder]" not in (created_file.description or "") + assert "[Database-only storage" not in (created_file.description or "") + + +class TestFileBrowserModels: + """Test file browser model functionality""" + + def test_file_browser_folder_model(self): + """Test FileBrowserFolder model""" + folder = FileBrowserFolder(name="test_folder", date="2023-01-01 12:00:00") + + assert folder.name == "test_folder" + assert folder.date == "2023-01-01 12:00:00" + + def test_file_browser_file_model(self): + """Test FileBrowserFile model""" + file = FileBrowserFile( + name="test_file.txt", date="2023-01-01 12:00:00", size=1024 + ) + + assert file.name == "test_file.txt" + assert file.date == "2023-01-01 12:00:00" + assert file.size == 1024 + + def test_file_browser_data_model(self): + """Test FileBrowserData model""" + folder = FileBrowserFolder(name="folder1", date="2023-01-01 12:00:00") + file = FileBrowserFile(name="file1.txt", date="2023-01-01 12:00:00", size=100) + + browser_data = FileBrowserData(folders=[folder], files=[file]) + + assert len(browser_data.folders) == 1 + assert len(browser_data.files) == 1 + assert browser_data.folders[0].name == "folder1" + assert browser_data.files[0].name == "file1.txt" + + +class TestFileBrowserServices: + """Test file browser service functions""" + + @pytest.fixture + def temp_storage(self): + """Create temporary storage with folder/file structure""" + temp_dir = tempfile.mkdtemp() + + # Create some folders + (Path(temp_dir) / "folder1").mkdir() + (Path(temp_dir) / "folder2").mkdir() + + # Create some files + (Path(temp_dir) / "file1.txt").write_text("content1") + (Path(temp_dir) / "file2.txt").write_text("content2") + + yield temp_dir + shutil.rmtree(temp_dir) + + def test_browse_filesystem(self, temp_storage): + """Test filesystem browsing""" + result = browse_filesystem("", temp_storage) + + assert isinstance(result, FileBrowserData) + assert len(result.folders) == 2 + assert len(result.files) == 2 + + # Check folder names + folder_names = [f.name for f in result.folders] + assert "folder1" in folder_names + assert "folder2" in folder_names + + # Check file names + file_names = [f.name for f in result.files] + assert "file1.txt" in file_names + assert "file2.txt" in file_names + + # Check file sizes + for file in result.files: + assert file.size > 0 + assert file.date is not None + + def test_browse_filesystem_nonexistent_directory(self): + """Test browsing non-existent directory""" + with pytest.raises(HTTPException) as exc_info: + browse_filesystem("nonexistent", "nonexistent_root") + + assert exc_info.value.status_code == 404 + assert "not found" in str(exc_info.value.detail) + + def test_list_files_as_browser_data(self, session: Session): + """Test converting database files to browser data format""" + # Create test files in database + for i in 
+    def test_list_files_as_browser_data(self, session: Session):
+        """Test converting database files to browser data format"""
+        # Create test files in database
+        for i in range(3):
+            file_create = FileCreate(
+                filename=f"db_file_{i}.txt",
+                description=f"Database file {i}",
+                entity_type=EntityType.PROJECT,
+                entity_id="PROJ001",
+                file_type=FileType.DOCUMENT,
+            )
+            create_file(session, file_create)
+
+        # Get files in browser format
+        result = list_files_as_browser_data(session)
+
+        assert isinstance(result, FileBrowserData)
+        assert len(result.folders) == 0  # No folders for database files
+        assert len(result.files) == 3
+
+        # Check file properties
+        for file in result.files:
+            assert file.name.startswith("db_file_")
+            assert file.date is not None
+            assert file.size >= 0
 from datetime import datetime
 from fastapi.testclient import TestClient
@@ -46,6 +1194,104 @@ def test_parse_s3_path(self):
 class TestFileBrowserAPI:
     """Test file browser API endpoints"""
 
+    def test_browse_filesystem_endpoint(self, client: TestClient):
+        """Test filesystem browsing endpoint"""
+        # Create a temporary directory structure for testing
+        temp_dir = tempfile.mkdtemp()
+        try:
+            # Create test structure
+            (Path(temp_dir) / "test_folder").mkdir()
+            (Path(temp_dir) / "test_file.txt").write_text("test content")
+
+            # Test the endpoint
+            response = client.get(f"/api/v1/files/browse?storage_root={temp_dir}")
+            assert response.status_code == 200
+
+            data = response.json()
+            assert "folders" in data
+            assert "files" in data
+            assert isinstance(data["folders"], list)
+            assert isinstance(data["files"], list)
+
+        finally:
+            shutil.rmtree(temp_dir)
+
+    def test_s3_path_detection(self):
+        """Test S3 path detection functionality"""
+        from api.files.services import _parse_s3_path
+
+        # Test S3 path parsing
+        bucket, key = _parse_s3_path("s3://my-bucket/path/to/folder/")
+        assert bucket == "my-bucket"
+        assert key == "path/to/folder/"
+
+        bucket, key = _parse_s3_path("s3://my-bucket")
+        assert bucket == "my-bucket"
+        assert key == ""
+
+        bucket, key = _parse_s3_path("s3://my-bucket/")
+        assert bucket == "my-bucket"
+        assert key == ""
+
+        # Test invalid S3 path
+        with pytest.raises(ValueError):
+            _parse_s3_path("invalid://path")
+
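# A minimal sketch of the _parse_s3_path helper exercised by
# test_s3_path_detection above, written only to satisfy those assertions;
# the real helper in api.files.services may be implemented differently.
def _parse_s3_path(path: str) -> tuple[str, str]:
    """Split an s3://bucket/key URI into (bucket, key); raise ValueError otherwise."""
    if not path.startswith("s3://"):
        raise ValueError(f"Invalid S3 path format: {path}")
    bucket, _, key = path[len("s3://"):].partition("/")
    if not bucket:
        raise ValueError(f"Invalid S3 path format: {path}")
    return bucket, key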
+    def test_browse_s3_without_boto3(self, client: TestClient, monkeypatch):
+        """Test S3 browsing when boto3 is not available"""
+        # Mock BOTO3_AVAILABLE to False
+        import api.files.services as services
+
+        monkeypatch.setattr(services, "BOTO3_AVAILABLE", False)
+
+        response = client.get("/api/v1/files/browse?directory_path=s3://test-bucket/")
+        assert response.status_code == 501
+        assert "S3 support not available" in response.json()["detail"]
+
+    def test_browse_s3_invalid_path(self, client: TestClient):
+        """Test S3 browsing with invalid S3 path"""
+        # Test with a path that looks like S3 but has invalid format
+        response = client.get("/api/v1/files/browse?directory_path=s3://")
+        assert response.status_code == 400
+        assert "Invalid S3 path format" in response.json()["detail"]
+
+    def test_browse_db_endpoint(self, client: TestClient, session: Session):
+        """Test database files browser endpoint"""
+        # Create test files
+        for i in range(2):
+            file_data = {
+                "filename": f"browser_test_{i}.txt",
+                "description": f"Browser test file {i}",
+                "file_type": "document",
+                "entity_type": "project",
+                "entity_id": "PROJ001",
+                "created_by": "browser_test_user",
+            }
+            client.post("/api/v1/files", data=file_data)
+
+        # Test the browser endpoint
+        response = client.get("/api/v1/files/browse-db")
+        assert response.status_code == 200
+
+        data = response.json()
+        assert "folders" in data
+        assert "files" in data
+        assert len(data["folders"]) == 0  # No folders for database files
+        assert len(data["files"]) >= 2
+
+        # Check file structure
+        for file in data["files"]:
+            assert "name" in file
+            assert "date" in file
+            assert "size" in file
+
+    def test_browse_filesystem_error_handling(self, client: TestClient):
+        """Test error handling for filesystem browsing"""
+        # Test non-existent directory
+        response = client.get(
+            "/api/v1/files/browse?directory_path=nonexistent&storage_root=nonexistent"
+        )
+        assert response.status_code == 404
 
     def test_list_s3(self, client: TestClient, mock_s3_client: MockS3Client):
         """Test S3 browsing with proper mocking"""
         # Setup mock S3 data