From 00f9e97aa60e48a049499141110f6ab6da756ba8 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Sat, 20 Sep 2025 09:43:45 -0400 Subject: [PATCH 01/35] Add files architecture documentation --- files_api_architecture.md | 292 +++++++++++++++++++++ files_api_design.md | 334 ++++++++++++++++++++++++ files_api_implementation_plan.md | 428 +++++++++++++++++++++++++++++++ 3 files changed, 1054 insertions(+) create mode 100644 files_api_architecture.md create mode 100644 files_api_design.md create mode 100644 files_api_implementation_plan.md diff --git a/files_api_architecture.md b/files_api_architecture.md new file mode 100644 index 0000000..d457350 --- /dev/null +++ b/files_api_architecture.md @@ -0,0 +1,292 @@ +# Files API Architecture Diagram + +## System Architecture Overview + +```mermaid +graph TB + subgraph "Client Layer" + WEB[Web Frontend] + API_CLIENT[API Clients] + CLI[CLI Tools] + end + + subgraph "API Layer" + ROUTER[FastAPI Router] + AUTH[Authentication] + VALID[Validation] + end + + subgraph "Service Layer" + FILE_SVC[File Service] + STORAGE_SVC[Storage Service] + METADATA_SVC[Metadata Service] + end + + subgraph "Storage Backends" + LOCAL[Local Storage] + S3[AWS S3] + AZURE[Azure Blob] + GCS[Google Cloud] + end + + subgraph "Database Layer" + FILE_TBL[(File Table)] + PROJECT_TBL[(Project Table)] + RUN_TBL[(Run Table)] + PERM_TBL[(Permissions Table)] + end + + subgraph "Search & Index" + OPENSEARCH[OpenSearch] + SEARCH_IDX[File Index] + end + + WEB --> ROUTER + API_CLIENT --> ROUTER + CLI --> ROUTER + + ROUTER --> AUTH + AUTH --> VALID + VALID --> FILE_SVC + + FILE_SVC --> STORAGE_SVC + FILE_SVC --> METADATA_SVC + FILE_SVC --> FILE_TBL + + STORAGE_SVC --> LOCAL + STORAGE_SVC --> S3 + STORAGE_SVC --> AZURE + STORAGE_SVC --> GCS + + FILE_TBL -.-> PROJECT_TBL + FILE_TBL -.-> RUN_TBL + FILE_TBL --> PERM_TBL + + METADATA_SVC --> OPENSEARCH + OPENSEARCH --> SEARCH_IDX +``` + +## Data Model Relationships + +```mermaid +erDiagram + PROJECT { + uuid id PK + string project_id UK + string name + } + + SEQUENCING_RUN { + uuid id PK + string barcode UK + date run_date + string machine_id + string status + } + + FILE { + uuid id PK + string file_id UK + string filename + string original_filename + string file_path + int file_size + string mime_type + string checksum + string description + enum file_type + datetime upload_date + string created_by + enum entity_type + string entity_id FK + enum storage_backend + boolean is_public + boolean is_archived + } + + FILE_PERMISSION { + uuid id PK + uuid file_id FK + string user_id + string group_id + enum permission_type + datetime granted_date + } + + FILE_VERSION { + uuid id PK + uuid file_id FK + int version_number + string file_path + int file_size + string checksum + datetime created_date + boolean is_current + } + + PROJECT ||--o{ FILE : "has files" + SEQUENCING_RUN ||--o{ FILE : "has files" + FILE ||--o{ FILE_PERMISSION : "has permissions" + FILE ||--o{ FILE_VERSION : "has versions" +``` + +## API Endpoint Structure + +```mermaid +graph LR + subgraph "Generic File Operations" + A[GET /files] --> A1[List all files] + B[POST /files] --> B1[Upload file] + C[GET /files/{id}] --> C1[Get file metadata] + D[PUT /files/{id}] --> D1[Update metadata] + E[DELETE /files/{id}] --> E1[Delete file] + F[GET /files/{id}/download] --> F1[Download file] + end + + subgraph "Project File Operations" + G[GET /projects/{id}/files] --> G1[List project files] + H[POST /projects/{id}/files] --> H1[Upload to project] + I[GET /projects/{id}/files/{file_id}] --> I1[Get project 
file] + end + + subgraph "Run File Operations" + J[GET /runs/{barcode}/files] --> J1[List run files] + K[POST /runs/{barcode}/files] --> K1[Upload to run] + L[GET /runs/{barcode}/files/{file_id}] --> L1[Get run file] + end + + subgraph "Bulk Operations" + M[POST /files/bulk-upload] --> M1[Upload multiple] + N[DELETE /files/bulk-delete] --> N1[Delete multiple] + end +``` + +## File Upload Flow + +```mermaid +sequenceDiagram + participant Client + participant API + participant FileService + participant StorageService + participant Database + participant Storage + + Client->>API: POST /projects/{id}/files + API->>API: Validate request + API->>FileService: upload_file() + FileService->>FileService: Generate file_id + FileService->>StorageService: store_file() + StorageService->>Storage: Upload file data + Storage-->>StorageService: Storage URI + StorageService-->>FileService: File path + FileService->>Database: Save file metadata + Database-->>FileService: File record + FileService-->>API: File metadata + API-->>Client: Upload response +``` + +## File Download Flow + +```mermaid +sequenceDiagram + participant Client + participant API + participant FileService + participant StorageService + participant Database + participant Storage + + Client->>API: GET /files/{id}/download + API->>API: Check permissions + API->>FileService: get_download_url() + FileService->>Database: Get file metadata + Database-->>FileService: File record + FileService->>StorageService: generate_download_url() + StorageService->>Storage: Create signed URL + Storage-->>StorageService: Signed URL + StorageService-->>FileService: Download URL + FileService-->>API: Download URL + API-->>Client: Redirect or URL response +``` + +## Storage Strategy + +```mermaid +graph TB + subgraph "File Organization" + ROOT[Storage Root] + ROOT --> PROJECTS[/projects/] + ROOT --> RUNS[/runs/] + + PROJECTS --> PROJ_ID[/{project_id}/] + RUNS --> RUN_ID[/{run_barcode}/] + + PROJ_ID --> PROJ_TYPE[/{file_type}/] + RUN_ID --> RUN_TYPE[/{file_type}/] + + PROJ_TYPE --> PROJ_DATE[/{year}/{month}/] + RUN_TYPE --> RUN_DATE[/{year}/{month}/] + + PROJ_DATE --> PROJ_FILE[/{file_id}_{filename}] + RUN_DATE --> RUN_FILE[/{file_id}_{filename}] + end + + subgraph "Storage Backends" + LOCAL_FS[Local Filesystem] + AWS_S3[AWS S3] + AZURE_BLOB[Azure Blob Storage] + GCP_STORAGE[Google Cloud Storage] + end + + PROJ_FILE -.-> LOCAL_FS + PROJ_FILE -.-> AWS_S3 + PROJ_FILE -.-> AZURE_BLOB + PROJ_FILE -.-> GCP_STORAGE + + RUN_FILE -.-> LOCAL_FS + RUN_FILE -.-> AWS_S3 + RUN_FILE -.-> AZURE_BLOB + RUN_FILE -.-> GCP_STORAGE +``` + +## Security and Access Control + +```mermaid +graph TB + subgraph "Authentication" + USER[User Request] + AUTH_CHECK[Authentication Check] + TOKEN[JWT Token Validation] + end + + subgraph "Authorization" + PERM_CHECK[Permission Check] + ENTITY_ACCESS[Entity Access Check] + FILE_ACCESS[File Access Check] + end + + subgraph "File Operations" + READ_OP[Read Operation] + WRITE_OP[Write Operation] + DELETE_OP[Delete Operation] + end + + USER --> AUTH_CHECK + AUTH_CHECK --> TOKEN + TOKEN --> PERM_CHECK + PERM_CHECK --> ENTITY_ACCESS + ENTITY_ACCESS --> FILE_ACCESS + FILE_ACCESS --> READ_OP + FILE_ACCESS --> WRITE_OP + FILE_ACCESS --> DELETE_OP +``` + +This architecture provides: + +1. **Scalable Design**: Supports multiple storage backends and can handle large file volumes +2. **Flexible Associations**: Files can be linked to any entity type (projects, runs, future entities) +3. **Rich Metadata**: Comprehensive file information and categorization +4. 
**Security**: Multi-layered permission system +5. **Performance**: Efficient querying and caching strategies +6. **Extensibility**: Easy to add new file types, storage backends, and features \ No newline at end of file diff --git a/files_api_design.md b/files_api_design.md new file mode 100644 index 0000000..c594eb1 --- /dev/null +++ b/files_api_design.md @@ -0,0 +1,334 @@ +# Files API Design for NGS360 + +## Overview +Design a unified Files API that supports file operations (list, fetch, upload, delete) for both sequencing runs and projects, with flexible storage backends and comprehensive metadata tracking. + +## Current System Analysis + +### Existing Models +- **Projects**: Have `project_id` (string), `name`, and `attributes` +- **Runs**: Have `id` (UUID), `barcode` (computed), `run_folder_uri`, and various metadata +- **Files API**: Currently has skeleton routes but missing models/services + +## Proposed Architecture + +### 1. Data Model Design + +#### Core File Model +```python +class File(SQLModel, table=True): + """Core file entity that can be associated with runs or projects""" + + id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True) + file_id: str = Field(unique=True) # Human-readable identifier + filename: str = Field(max_length=255) + original_filename: str = Field(max_length=255) + file_path: str = Field(max_length=1024) # Storage path/URI + file_size: int | None = None # Size in bytes + mime_type: str | None = Field(max_length=100) + checksum: str | None = Field(max_length=64) # SHA-256 hash + + # Metadata + description: str | None = Field(max_length=1024) + file_type: FileType = Field(default=FileType.OTHER) + upload_date: datetime = Field(default_factory=datetime.utcnow) + created_by: str | None = Field(max_length=100) # User identifier + + # Polymorphic associations + entity_type: EntityType # "project" or "run" + entity_id: str # project_id or run barcode + + # Storage metadata + storage_backend: StorageBackend = Field(default=StorageBackend.LOCAL) + is_public: bool = Field(default=False) + is_archived: bool = Field(default=False) +``` + +#### Enums +```python +class FileType(str, Enum): + """File type categories""" + FASTQ = "fastq" + BAM = "bam" + VCF = "vcf" + SAMPLESHEET = "samplesheet" + METRICS = "metrics" + REPORT = "report" + LOG = "log" + IMAGE = "image" + DOCUMENT = "document" + OTHER = "other" + +class EntityType(str, Enum): + """Entity types that can have files""" + PROJECT = "project" + RUN = "run" + +class StorageBackend(str, Enum): + """Storage backend types""" + LOCAL = "local" + S3 = "s3" + AZURE = "azure" + GCS = "gcs" +``` + +### 2. 
API Endpoint Design + +#### Unified Files Endpoints +``` +# Generic file operations +GET /api/v1/files # List all files (with filters) +POST /api/v1/files # Upload file (requires entity association) +GET /api/v1/files/{file_id} # Get file metadata +PUT /api/v1/files/{file_id} # Update file metadata +DELETE /api/v1/files/{file_id} # Delete file +GET /api/v1/files/{file_id}/download # Download file content + +# Entity-specific file operations +GET /api/v1/projects/{project_id}/files # List project files +POST /api/v1/projects/{project_id}/files # Upload file to project +GET /api/v1/projects/{project_id}/files/{file_id} # Get project file + +GET /api/v1/runs/{run_barcode}/files # List run files +POST /api/v1/runs/{run_barcode}/files # Upload file to run +GET /api/v1/runs/{run_barcode}/files/{file_id} # Get run file + +# Bulk operations +POST /api/v1/files/bulk-upload # Upload multiple files +DELETE /api/v1/files/bulk-delete # Delete multiple files +``` + +### 3. Request/Response Models + +#### File Upload Request +```python +class FileUploadRequest(SQLModel): + """Request model for file upload""" + filename: str + description: str | None = None + file_type: FileType = FileType.OTHER + is_public: bool = False + tags: List[str] | None = None + +class FileUploadResponse(SQLModel): + """Response model for file upload""" + file_id: str + filename: str + upload_url: str | None = None # For direct upload scenarios + file_size: int | None = None + checksum: str | None = None +``` + +#### File Listing +```python +class FilePublic(SQLModel): + """Public file representation""" + file_id: str + filename: str + original_filename: str + file_size: int | None + mime_type: str | None + description: str | None + file_type: FileType + upload_date: datetime + created_by: str | None + entity_type: EntityType + entity_id: str + is_public: bool + download_url: str | None = None + +class FilesPublic(SQLModel): + """Paginated file listing""" + data: List[FilePublic] + total_items: int + total_pages: int + current_page: int + per_page: int + has_next: bool + has_prev: bool + filters: Dict[str, Any] | None = None +``` + +### 4. Storage Strategy + +#### Multi-Backend Support +```python +class StorageService: + """Abstract storage service interface""" + + async def upload_file(self, file_data: bytes, file_path: str) -> str: + """Upload file and return storage URI""" + pass + + async def download_file(self, file_path: str) -> bytes: + """Download file content""" + pass + + async def delete_file(self, file_path: str) -> bool: + """Delete file from storage""" + pass + + async def get_download_url(self, file_path: str, expires_in: int = 3600) -> str: + """Generate temporary download URL""" + pass + +class LocalStorageService(StorageService): + """Local filesystem storage""" + pass + +class S3StorageService(StorageService): + """AWS S3 storage""" + pass +``` + +#### File Path Strategy +``` +Storage Structure: +/{storage_root}/ + /projects/ + /{project_id}/ + /{file_type}/ + /{year}/{month}/ + /{file_id}_{original_filename} + /runs/ + /{run_barcode}/ + /{file_type}/ + /{year}/{month}/ + /{file_id}_{original_filename} +``` + +### 5. 
Advanced Features + +#### File Filtering and Search +```python +class FileFilters(SQLModel): + """File filtering options""" + entity_type: EntityType | None = None + entity_id: str | None = None + file_type: FileType | None = None + mime_type: str | None = None + created_by: str | None = None + date_from: datetime | None = None + date_to: datetime | None = None + is_public: bool | None = None + is_archived: bool | None = None + tags: List[str] | None = None + search_query: str | None = None # Search in filename/description +``` + +#### File Versioning (Future Enhancement) +```python +class FileVersion(SQLModel, table=True): + """File version tracking""" + id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True) + file_id: uuid.UUID = Field(foreign_key="file.id") + version_number: int + file_path: str + file_size: int + checksum: str + created_date: datetime = Field(default_factory=datetime.utcnow) + is_current: bool = Field(default=True) +``` + +### 6. Security and Access Control + +#### File Access Permissions +```python +class FilePermission(SQLModel, table=True): + """File-level permissions""" + id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True) + file_id: uuid.UUID = Field(foreign_key="file.id") + user_id: str | None = None + group_id: str | None = None + permission_type: PermissionType + granted_date: datetime = Field(default_factory=datetime.utcnow) + +class PermissionType(str, Enum): + READ = "read" + WRITE = "write" + DELETE = "delete" + ADMIN = "admin" +``` + +### 7. Integration Points + +#### With Existing Models +- **Projects**: Add relationship to files via `entity_id` = `project_id` +- **Runs**: Add relationship to files via `entity_id` = `run.barcode` +- **Search**: Include files in OpenSearch indexing for full-text search + +#### Database Relationships +```python +# In Project model +files: List["File"] = Relationship( + sa_relationship_kwargs={ + "primaryjoin": "and_(Project.project_id == File.entity_id, File.entity_type == 'project')", + "foreign_keys": "[File.entity_id]" + } +) + +# In SequencingRun model +files: List["File"] = Relationship( + sa_relationship_kwargs={ + "primaryjoin": "and_(SequencingRun.barcode == File.entity_id, File.entity_type == 'run')", + "foreign_keys": "[File.entity_id]" + } +) +``` + +## Implementation Plan + +### Phase 1: Core Infrastructure +1. Create file models and enums +2. Implement basic storage service (local filesystem) +3. Create core CRUD operations +4. Add database migration + +### Phase 2: API Endpoints +1. Implement generic file endpoints +2. Add entity-specific endpoints +3. Create file upload/download functionality +4. Add comprehensive error handling + +### Phase 3: Advanced Features +1. Add file filtering and search +2. Implement multiple storage backends +3. Add file metadata extraction +4. Create bulk operations + +### Phase 4: Integration & Security +1. Integrate with existing project/run models +2. Add authentication and authorization +3. Implement file permissions +4. Add audit logging + +## Benefits + +1. **Unified Interface**: Single API for all file operations across projects and runs +2. **Flexible Storage**: Support for multiple storage backends (local, S3, etc.) +3. **Rich Metadata**: Comprehensive file metadata and categorization +4. **Scalable**: Designed to handle large numbers of files efficiently +5. **Secure**: Built-in permission system and access controls +6. **Searchable**: Integration with existing search infrastructure +7. 
**Extensible**: Easy to add new file types and storage backends + +## Example Usage + +```python +# Upload a file to a project +POST /api/v1/projects/PROJ001/files +{ + "filename": "analysis_results.pdf", + "description": "Final analysis report", + "file_type": "report", + "is_public": false +} + +# List all FASTQ files for a run +GET /api/v1/runs/190110_MACHINE123_0001_FLOWCELL123/files?file_type=fastq + +# Search for files across all entities +GET /api/v1/files?search_query=analysis&file_type=report&date_from=2024-01-01 +``` + +This design provides a robust, scalable, and flexible file management system that integrates seamlessly with the existing NGS360 architecture. \ No newline at end of file diff --git a/files_api_implementation_plan.md b/files_api_implementation_plan.md new file mode 100644 index 0000000..670cbcf --- /dev/null +++ b/files_api_implementation_plan.md @@ -0,0 +1,428 @@ +# Files API Implementation Roadmap + +## Overview +This document outlines the step-by-step implementation plan for the Files API, organized into phases with specific deliverables and acceptance criteria. + +## Phase 1: Foundation (Week 1-2) + +### 1.1 Database Models and Migrations +**Priority: Critical** + +#### Tasks: +- [ ] Create `api/files/models.py` with core file models +- [ ] Create database migration for file tables +- [ ] Add foreign key relationships to existing models +- [ ] Create enum definitions for file types and storage backends + +#### Deliverables: +```python +# File models with proper relationships +class File(SQLModel, table=True) +class FileType(str, Enum) +class EntityType(str, Enum) +class StorageBackend(str, Enum) + +# Migration script +# alembic/versions/xxx_create_file_tables.py +``` + +#### Acceptance Criteria: +- [ ] Database migration runs successfully +- [ ] All model relationships work correctly +- [ ] Foreign key constraints are properly defined +- [ ] Enum values are correctly stored and retrieved + +### 1.2 Basic Storage Service +**Priority: Critical** + +#### Tasks: +- [ ] Create `api/files/storage.py` with storage interface +- [ ] Implement `LocalStorageService` for filesystem storage +- [ ] Create file path generation utilities +- [ ] Add basic file operations (upload, download, delete) + +#### Deliverables: +```python +# Storage service interface and implementation +class StorageService(ABC) +class LocalStorageService(StorageService) + +# Utility functions +def generate_file_path(entity_type, entity_id, file_type, filename) +def ensure_directory_exists(path) +``` + +#### Acceptance Criteria: +- [ ] Files can be uploaded to local storage +- [ ] Files can be downloaded from local storage +- [ ] File paths are generated consistently +- [ ] Directory structure is created automatically + +### 1.3 Core Service Layer +**Priority: Critical** + +#### Tasks: +- [ ] Create `api/files/services.py` with CRUD operations +- [ ] Implement file metadata management +- [ ] Add file validation and security checks +- [ ] Create file ID generation logic + +#### Deliverables: +```python +# Core file operations +def create_file(session, file_data, entity_type, entity_id) +def get_file(session, file_id) +def update_file(session, file_id, updates) +def delete_file(session, file_id) +def list_files(session, filters, pagination) +``` + +#### Acceptance Criteria: +- [ ] All CRUD operations work correctly +- [ ] File metadata is properly validated +- [ ] Unique file IDs are generated +- [ ] Database transactions are handled properly + +## Phase 2: API Endpoints (Week 3-4) + +### 2.1 Generic 
File Endpoints +**Priority: High** + +#### Tasks: +- [ ] Update `api/files/routes.py` with complete endpoint set +- [ ] Implement file upload with multipart form data +- [ ] Add file download with proper headers +- [ ] Create file listing with filtering and pagination + +#### Deliverables: +```python +# Complete API endpoints +@router.post("/files") # Upload file +@router.get("/files") # List files +@router.get("/files/{file_id}") # Get file metadata +@router.put("/files/{file_id}") # Update metadata +@router.delete("/files/{file_id}") # Delete file +@router.get("/files/{file_id}/download") # Download file +``` + +#### Acceptance Criteria: +- [ ] File upload works with proper validation +- [ ] File download returns correct content and headers +- [ ] File listing supports filtering and pagination +- [ ] All endpoints return proper HTTP status codes +- [ ] Error handling is comprehensive + +### 2.2 Entity-Specific Endpoints +**Priority: High** + +#### Tasks: +- [ ] Add project-specific file endpoints +- [ ] Add run-specific file endpoints +- [ ] Implement entity validation (project/run exists) +- [ ] Add entity-based access control + +#### Deliverables: +```python +# Project file endpoints +@router.get("/projects/{project_id}/files") +@router.post("/projects/{project_id}/files") +@router.get("/projects/{project_id}/files/{file_id}") + +# Run file endpoints +@router.get("/runs/{run_barcode}/files") +@router.post("/runs/{run_barcode}/files") +@router.get("/runs/{run_barcode}/files/{file_id}") +``` + +#### Acceptance Criteria: +- [ ] Entity validation works correctly +- [ ] Files are properly associated with entities +- [ ] Entity-specific file listing works +- [ ] Access control prevents unauthorized access + +### 2.3 Request/Response Models +**Priority: High** + +#### Tasks: +- [ ] Create comprehensive Pydantic models +- [ ] Add proper validation rules +- [ ] Implement response serialization +- [ ] Add API documentation + +#### Deliverables: +```python +# Request/Response models +class FileUploadRequest(SQLModel) +class FileUploadResponse(SQLModel) +class FilePublic(SQLModel) +class FilesPublic(SQLModel) +class FileFilters(SQLModel) +``` + +#### Acceptance Criteria: +- [ ] All models have proper validation +- [ ] API documentation is auto-generated +- [ ] Response serialization works correctly +- [ ] Error responses are properly formatted + +## Phase 3: Advanced Features (Week 5-6) + +### 3.1 File Search and Filtering +**Priority: Medium** + +#### Tasks: +- [ ] Implement advanced file filtering +- [ ] Add full-text search capabilities +- [ ] Integrate with OpenSearch +- [ ] Add file indexing for search + +#### Deliverables: +```python +# Advanced filtering +class FileFilters(SQLModel) +def search_files(session, search_query, filters) +def index_file_for_search(file_metadata) +``` + +#### Acceptance Criteria: +- [ ] Complex filtering works correctly +- [ ] Full-text search returns relevant results +- [ ] Search performance is acceptable +- [ ] Search index stays synchronized + +### 3.2 Multiple Storage Backends +**Priority: Medium** + +#### Tasks: +- [ ] Implement S3StorageService +- [ ] Add storage backend configuration +- [ ] Create storage backend selection logic +- [ ] Add storage backend migration tools + +#### Deliverables: +```python +# Additional storage backends +class S3StorageService(StorageService) +class AzureStorageService(StorageService) +class StorageBackendFactory + +# Configuration +STORAGE_BACKENDS = { + "local": LocalStorageService, + "s3": S3StorageService, + "azure": 
AzureStorageService +} +``` + +#### Acceptance Criteria: +- [ ] Multiple storage backends work correctly +- [ ] Storage backend can be configured per file +- [ ] File migration between backends is possible +- [ ] All backends support the same interface + +### 3.3 Bulk Operations +**Priority: Medium** + +#### Tasks: +- [ ] Implement bulk file upload +- [ ] Add bulk file deletion +- [ ] Create batch processing utilities +- [ ] Add progress tracking for bulk operations + +#### Deliverables: +```python +# Bulk operations +@router.post("/files/bulk-upload") +@router.delete("/files/bulk-delete") +def process_bulk_upload(files, entity_type, entity_id) +def process_bulk_delete(file_ids) +``` + +#### Acceptance Criteria: +- [ ] Bulk upload handles multiple files efficiently +- [ ] Bulk delete is transactional +- [ ] Progress tracking works correctly +- [ ] Error handling for partial failures + +## Phase 4: Integration & Security (Week 7-8) + +### 4.1 Authentication and Authorization +**Priority: High** + +#### Tasks: +- [ ] Integrate with existing auth system +- [ ] Implement file-level permissions +- [ ] Add user/group access control +- [ ] Create permission management endpoints + +#### Deliverables: +```python +# Permission models and services +class FilePermission(SQLModel, table=True) +class PermissionType(str, Enum) +def check_file_permission(user, file_id, permission_type) +def grant_file_permission(file_id, user_id, permission_type) +``` + +#### Acceptance Criteria: +- [ ] Authentication is required for all operations +- [ ] File permissions are enforced correctly +- [ ] Permission inheritance works properly +- [ ] Admin users can manage all files + +### 4.2 Integration with Existing Models +**Priority: High** + +#### Tasks: +- [ ] Add file relationships to Project model +- [ ] Add file relationships to SequencingRun model +- [ ] Update existing API responses to include file counts +- [ ] Create file association utilities + +#### Deliverables: +```python +# Model updates +# In api/project/models.py +files: List["File"] = Relationship(...) + +# In api/runs/models.py +files: List["File"] = Relationship(...) 
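+# (the full join config is sketched in files_api_design.md, Integration Points, e.g.
+#  sa_relationship_kwargs={"primaryjoin": "and_(SequencingRun.barcode == File.entity_id, File.entity_type == 'run')",
+#                          "foreign_keys": "[File.entity_id]"})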
+ +# Updated response models +class ProjectPublic(SQLModel): + file_count: int | None = None + +class SequencingRunPublic(SQLModel): + file_count: int | None = None +``` + +#### Acceptance Criteria: +- [ ] File relationships work correctly +- [ ] Existing APIs show file information +- [ ] File counts are accurate +- [ ] No breaking changes to existing APIs + +### 4.3 Audit Logging and Monitoring +**Priority: Medium** + +#### Tasks: +- [ ] Add audit logging for file operations +- [ ] Create file access logs +- [ ] Add monitoring metrics +- [ ] Implement file usage analytics + +#### Deliverables: +```python +# Audit logging +class FileAuditLog(SQLModel, table=True) +def log_file_operation(user, file_id, operation, details) + +# Monitoring metrics +def track_file_upload(file_size, file_type) +def track_file_download(file_id, user_id) +``` + +#### Acceptance Criteria: +- [ ] All file operations are logged +- [ ] Audit logs are searchable +- [ ] Monitoring metrics are collected +- [ ] Usage analytics are available + +## Phase 5: Testing & Documentation (Week 9-10) + +### 5.1 Comprehensive Testing +**Priority: Critical** + +#### Tasks: +- [ ] Create unit tests for all services +- [ ] Add integration tests for API endpoints +- [ ] Create performance tests for file operations +- [ ] Add security tests for access control + +#### Deliverables: +```python +# Test files +tests/api/test_files.py +tests/services/test_file_service.py +tests/storage/test_storage_backends.py +tests/security/test_file_permissions.py +``` + +#### Acceptance Criteria: +- [ ] Test coverage > 90% +- [ ] All tests pass consistently +- [ ] Performance tests meet requirements +- [ ] Security tests validate access control + +### 5.2 API Documentation +**Priority: High** + +#### Tasks: +- [ ] Complete OpenAPI documentation +- [ ] Create usage examples +- [ ] Add integration guides +- [ ] Create troubleshooting documentation + +#### Deliverables: +- Complete API documentation +- Usage examples and tutorials +- Integration guides for different storage backends +- Troubleshooting and FAQ documentation + +#### Acceptance Criteria: +- [ ] API documentation is complete and accurate +- [ ] Examples work correctly +- [ ] Integration guides are clear +- [ ] Documentation is accessible to developers + +## Success Metrics + +### Performance Targets +- File upload: < 2 seconds for files up to 100MB +- File download: < 1 second for metadata, streaming for content +- File listing: < 500ms for paginated results +- Search: < 1 second for complex queries + +### Scalability Targets +- Support for 10,000+ files per project/run +- Handle 100+ concurrent file operations +- Storage backend agnostic (local, S3, Azure, GCS) +- Horizontal scaling capability + +### Security Requirements +- All file operations require authentication +- File-level access control +- Audit logging for all operations +- Secure file URLs with expiration + +## Risk Mitigation + +### Technical Risks +1. **Large file handling**: Implement streaming uploads/downloads +2. **Storage costs**: Add file lifecycle management +3. **Performance**: Implement caching and CDN integration +4. **Data consistency**: Use database transactions and validation + +### Operational Risks +1. **Storage migration**: Create migration tools and procedures +2. **Backup and recovery**: Implement automated backup strategies +3. **Monitoring**: Add comprehensive logging and alerting +4. 
**Documentation**: Maintain up-to-date documentation + +## Dependencies + +### External Dependencies +- FastAPI for API framework +- SQLModel for database models +- Alembic for database migrations +- Boto3 for AWS S3 integration +- Azure SDK for Azure Blob Storage +- Google Cloud SDK for GCS + +### Internal Dependencies +- Existing authentication system +- Database infrastructure +- OpenSearch for file indexing +- Monitoring and logging infrastructure + +This implementation plan provides a structured approach to building a comprehensive Files API that integrates seamlessly with the existing NGS360 system while providing robust file management capabilities for both projects and runs. \ No newline at end of file From 999747b31303dc3c38b7a843d7e469b80cde72ff Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Sat, 20 Sep 2025 13:56:32 -0400 Subject: [PATCH 02/35] Add files test and code --- api/files/models.py | 166 +++++++++++++ api/files/services.py | 336 ++++++++++++++++++++++++++ tests/api/test_files.py | 513 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1015 insertions(+) create mode 100644 api/files/models.py create mode 100644 api/files/services.py create mode 100644 tests/api/test_files.py diff --git a/api/files/models.py b/api/files/models.py new file mode 100644 index 0000000..f78d4f1 --- /dev/null +++ b/api/files/models.py @@ -0,0 +1,166 @@ +""" +Models for the Files API +""" + +import uuid +from datetime import datetime +from enum import Enum +from sqlmodel import SQLModel, Field +from pydantic import ConfigDict +from typing import List + + +class FileType(str, Enum): + """File type categories""" + FASTQ = "fastq" + BAM = "bam" + VCF = "vcf" + SAMPLESHEET = "samplesheet" + METRICS = "metrics" + REPORT = "report" + LOG = "log" + IMAGE = "image" + DOCUMENT = "document" + OTHER = "other" + + +class EntityType(str, Enum): + """Entity types that can have files""" + PROJECT = "project" + RUN = "run" + + +class StorageBackend(str, Enum): + """Storage backend types""" + LOCAL = "local" + S3 = "s3" + AZURE = "azure" + GCS = "gcs" + + +class File(SQLModel, table=True): + """Core file entity that can be associated with runs or projects""" + + __searchable__ = ["filename", "description", "file_id"] + + id: uuid.UUID | None = Field(default_factory=uuid.uuid4, primary_key=True) + file_id: str = Field(unique=True, max_length=100) # Human-readable identifier + filename: str = Field(max_length=255) + original_filename: str = Field(max_length=255) + file_path: str = Field(max_length=1024) # Storage path/URI + file_size: int | None = None # Size in bytes + mime_type: str | None = Field(default=None, max_length=100) + checksum: str | None = Field(default=None, max_length=64) # SHA-256 hash + + # Metadata + description: str | None = Field(default=None, max_length=1024) + file_type: FileType = Field(default=FileType.OTHER) + upload_date: datetime = Field(default_factory=datetime.utcnow) + created_by: str | None = Field(default=None, max_length=100) # User identifier + + # Polymorphic associations + entity_type: EntityType # "project" or "run" + entity_id: str = Field(max_length=100) # project_id or run barcode + + # Storage metadata + storage_backend: StorageBackend = Field(default=StorageBackend.LOCAL) + is_public: bool = Field(default=False) + is_archived: bool = Field(default=False) + + model_config = ConfigDict(from_attributes=True) + + def generate_file_id(self) -> str: + """Generate a unique file ID""" + import secrets + import string + alphabet = string.ascii_letters + string.digits + 
return ''.join(secrets.choice(alphabet) for _ in range(12)) + + +class FileCreate(SQLModel): + """Request model for creating a file""" + filename: str + original_filename: str | None = None + description: str | None = None + file_type: FileType = FileType.OTHER + entity_type: EntityType + entity_id: str + is_public: bool = False + created_by: str | None = None + + model_config = ConfigDict(extra="forbid") + + +class FileUpdate(SQLModel): + """Request model for updating file metadata""" + filename: str | None = None + description: str | None = None + file_type: FileType | None = None + is_public: bool | None = None + is_archived: bool | None = None + + model_config = ConfigDict(extra="forbid") + + +class FilePublic(SQLModel): + """Public file representation""" + file_id: str + filename: str + original_filename: str + file_size: int | None + mime_type: str | None + description: str | None + file_type: FileType + upload_date: datetime + created_by: str | None + entity_type: EntityType + entity_id: str + is_public: bool + is_archived: bool + storage_backend: StorageBackend + checksum: str | None = None + + +class FilesPublic(SQLModel): + """Paginated file listing""" + data: List[FilePublic] + total_items: int + total_pages: int + current_page: int + per_page: int + has_next: bool + has_prev: bool + + +class FileUploadRequest(SQLModel): + """Request model for file upload""" + filename: str + description: str | None = None + file_type: FileType = FileType.OTHER + is_public: bool = False + + model_config = ConfigDict(extra="forbid") + + +class FileUploadResponse(SQLModel): + """Response model for file upload""" + file_id: str + filename: str + file_size: int | None = None + checksum: str | None = None + upload_date: datetime + message: str = "File uploaded successfully" + + +class FileFilters(SQLModel): + """File filtering options""" + entity_type: EntityType | None = None + entity_id: str | None = None + file_type: FileType | None = None + mime_type: str | None = None + created_by: str | None = None + is_public: bool | None = None + is_archived: bool | None = None + search_query: str | None = None # Search in filename/description + + model_config = ConfigDict(extra="forbid") \ No newline at end of file diff --git a/api/files/services.py b/api/files/services.py new file mode 100644 index 0000000..e1acf19 --- /dev/null +++ b/api/files/services.py @@ -0,0 +1,336 @@ +""" +Services for managing files. 
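+
+Covers file ID and path generation, checksum and MIME-type helpers, CRUD
+operations, and filtered/paginated listing backed by local filesystem storage.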
+""" + +import hashlib +import secrets +import string +from pathlib import Path +from sqlmodel import select, Session, func +from pydantic import PositiveInt +from fastapi import HTTPException, status + +from api.files.models import ( + File, + FileCreate, + FileUpdate, + FilePublic, + FilesPublic, + FileFilters, + FileType, + EntityType, + StorageBackend, +) + + +def generate_file_id() -> str: + """Generate a unique file ID""" + alphabet = string.ascii_letters + string.digits + return ''.join(secrets.choice(alphabet) for _ in range(12)) + + +def generate_file_path( + entity_type: EntityType, entity_id: str, file_type: FileType, filename: str +) -> str: + """Generate a structured file path""" + from datetime import datetime + now = datetime.utcnow() + year = now.strftime("%Y") + month = now.strftime("%m") + + # Create path structure: /{entity_type}/{entity_id}/{file_type}/{year}/{month}/{filename} + path_parts = [ + entity_type.value, + entity_id, + file_type.value, + year, + month, + filename + ] + return "/".join(path_parts) + + +def calculate_file_checksum(file_content: bytes) -> str: + """Calculate SHA-256 checksum of file content""" + return hashlib.sha256(file_content).hexdigest() + + +def get_mime_type(filename: str) -> str: + """Get MIME type based on file extension""" + import mimetypes + mime_type, _ = mimetypes.guess_type(filename) + return mime_type or "application/octet-stream" + + +def create_file( + session: Session, + file_create: FileCreate, + file_content: bytes | None = None, + storage_root: str = "storage" +) -> File: + """Create a new file record and optionally store content""" + + # Generate unique file ID + file_id = generate_file_id() + + # Use original_filename if provided, otherwise use filename + original_filename = file_create.original_filename or file_create.filename + + # Generate file path + file_path = generate_file_path( + file_create.entity_type, + file_create.entity_id, + file_create.file_type, + f"{file_id}_{file_create.filename}" + ) + + # Calculate file metadata if content is provided + file_size = len(file_content) if file_content else None + checksum = calculate_file_checksum(file_content) if file_content else None + mime_type = get_mime_type(file_create.filename) + + # Create file record + file_record = File( + file_id=file_id, + filename=file_create.filename, + original_filename=original_filename, + file_path=file_path, + file_size=file_size, + mime_type=mime_type, + checksum=checksum, + description=file_create.description, + file_type=file_create.file_type, + created_by=file_create.created_by, + entity_type=file_create.entity_type, + entity_id=file_create.entity_id, + is_public=file_create.is_public, + storage_backend=StorageBackend.LOCAL + ) + + # Store file content if provided + if file_content: + full_path = Path(storage_root) / file_path + full_path.parent.mkdir(parents=True, exist_ok=True) + with open(full_path, "wb") as f: + f.write(file_content) + + # Save to database + session.add(file_record) + session.commit() + session.refresh(file_record) + + return file_record + + +def get_file(session: Session, file_id: str) -> File: + """Get a file by its file_id""" + file_record = session.exec( + select(File).where(File.file_id == file_id) + ).first() + + if not file_record: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"File with id {file_id} not found" + ) + + return file_record + + +def get_file_by_id(session: Session, id: str) -> File: + """Get a file by its internal UUID""" + file_record = session.exec( + 
select(File).where(File.id == id) + ).first() + + if not file_record: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"File with internal id {id} not found" + ) + + return file_record + + +def update_file(session: Session, file_id: str, file_update: FileUpdate) -> File: + """Update file metadata""" + file_record = get_file(session, file_id) + + # Update fields that are provided + update_data = file_update.model_dump(exclude_unset=True) + for field, value in update_data.items(): + setattr(file_record, field, value) + + session.add(file_record) + session.commit() + session.refresh(file_record) + + return file_record + + +def delete_file(session: Session, file_id: str, storage_root: str = "storage") -> bool: + """Delete a file record and its content""" + file_record = get_file(session, file_id) + + # Delete physical file if it exists + full_path = Path(storage_root) / file_record.file_path + if full_path.exists(): + full_path.unlink() + + # Try to remove empty directories + try: + full_path.parent.rmdir() + except OSError: + # Directory not empty, that's fine + pass + + # Delete from database + session.delete(file_record) + session.commit() + + return True + + +def list_files( + session: Session, + filters: FileFilters | None = None, + page: PositiveInt = 1, + per_page: PositiveInt = 20, + sort_by: str = "upload_date", + sort_order: str = "desc" +) -> FilesPublic: + """List files with filtering and pagination""" + + # Build query + query = select(File) + + # Apply filters + if filters: + if filters.entity_type: + query = query.where(File.entity_type == filters.entity_type) + if filters.entity_id: + query = query.where(File.entity_id == filters.entity_id) + if filters.file_type: + query = query.where(File.file_type == filters.file_type) + if filters.mime_type: + query = query.where(File.mime_type == filters.mime_type) + if filters.created_by: + query = query.where(File.created_by == filters.created_by) + if filters.is_public is not None: + query = query.where(File.is_public == filters.is_public) + if filters.is_archived is not None: + query = query.where(File.is_archived == filters.is_archived) + if filters.search_query: + search_term = f"%{filters.search_query}%" + query = query.where( + (File.filename.ilike(search_term)) | + (File.description.ilike(search_term)) + ) + + # Get total count + total_count = session.exec( + select(func.count()).select_from(query.subquery()) + ).one() + + # Calculate pagination + total_pages = (total_count + per_page - 1) // per_page + + # Apply sorting + sort_field = getattr(File, sort_by, File.upload_date) + if sort_order == "desc": + query = query.order_by(sort_field.desc()) + else: + query = query.order_by(sort_field.asc()) + + # Apply pagination + query = query.offset((page - 1) * per_page).limit(per_page) + + # Execute query + files = session.exec(query).all() + + # Convert to public models + public_files = [ + FilePublic( + file_id=file.file_id, + filename=file.filename, + original_filename=file.original_filename, + file_size=file.file_size, + mime_type=file.mime_type, + description=file.description, + file_type=file.file_type, + upload_date=file.upload_date, + created_by=file.created_by, + entity_type=file.entity_type, + entity_id=file.entity_id, + is_public=file.is_public, + is_archived=file.is_archived, + storage_backend=file.storage_backend, + checksum=file.checksum + ) + for file in files + ] + + return FilesPublic( + data=public_files, + total_items=total_count, + total_pages=total_pages, + current_page=page, + 
per_page=per_page, + has_next=page < total_pages, + has_prev=page > 1 + ) + + +def get_file_content(session: Session, file_id: str, storage_root: str = "storage") -> bytes: + """Get file content from storage""" + file_record = get_file(session, file_id) + + full_path = Path(storage_root) / file_record.file_path + if not full_path.exists(): + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"File content not found at {file_record.file_path}" + ) + + with open(full_path, "rb") as f: + return f.read() + + +def list_files_for_entity( + session: Session, + entity_type: EntityType, + entity_id: str, + page: PositiveInt = 1, + per_page: PositiveInt = 20, + file_type: FileType | None = None +) -> FilesPublic: + """List files for a specific entity (project or run)""" + filters = FileFilters( + entity_type=entity_type, + entity_id=entity_id, + file_type=file_type + ) + + return list_files( + session=session, + filters=filters, + page=page, + per_page=per_page + ) + + +def get_file_count_for_entity( + session: Session, + entity_type: EntityType, + entity_id: str +) -> int: + """Get the count of files for a specific entity""" + count = session.exec( + select(func.count(File.id)).where( + File.entity_type == entity_type, + File.entity_id == entity_id, + ~File.is_archived + ) + ).one() + + return count \ No newline at end of file diff --git a/tests/api/test_files.py b/tests/api/test_files.py new file mode 100644 index 0000000..d439772 --- /dev/null +++ b/tests/api/test_files.py @@ -0,0 +1,513 @@ +""" +Test /files endpoint +""" + +import tempfile +import shutil +from pathlib import Path +from datetime import datetime + +import pytest +from fastapi.testclient import TestClient +from sqlmodel import Session + +from api.files.models import ( + FileCreate, + FileUpdate, + FileType, + EntityType, + StorageBackend, +) +from api.files.services import ( + create_file, + get_file, + update_file, + delete_file, + list_files, + get_file_content, + list_files_for_entity, + get_file_count_for_entity, + generate_file_id, + generate_file_path, + calculate_file_checksum, + get_mime_type, +) + + +class TestFileModels: + """Test file model functionality""" + + def test_file_type_enum(self): + """Test FileType enum values""" + assert FileType.FASTQ == "fastq" + assert FileType.BAM == "bam" + assert FileType.VCF == "vcf" + assert FileType.SAMPLESHEET == "samplesheet" + assert FileType.METRICS == "metrics" + assert FileType.REPORT == "report" + assert FileType.LOG == "log" + assert FileType.IMAGE == "image" + assert FileType.DOCUMENT == "document" + assert FileType.OTHER == "other" + + def test_entity_type_enum(self): + """Test EntityType enum values""" + assert EntityType.PROJECT == "project" + assert EntityType.RUN == "run" + + def test_storage_backend_enum(self): + """Test StorageBackend enum values""" + assert StorageBackend.LOCAL == "local" + assert StorageBackend.S3 == "s3" + assert StorageBackend.AZURE == "azure" + assert StorageBackend.GCS == "gcs" + + def test_file_create_model(self): + """Test FileCreate model validation""" + file_create = FileCreate( + filename="test.txt", + description="Test file", + file_type=FileType.DOCUMENT, + entity_type=EntityType.PROJECT, + entity_id="PROJ001", + is_public=True, + created_by="testuser" + ) + + assert file_create.filename == "test.txt" + assert file_create.description == "Test file" + assert file_create.file_type == FileType.DOCUMENT + assert file_create.entity_type == EntityType.PROJECT + assert file_create.entity_id == "PROJ001" + assert 
file_create.is_public is True + assert file_create.created_by == "testuser" + + def test_file_update_model(self): + """Test FileUpdate model validation""" + file_update = FileUpdate( + filename="updated.txt", + description="Updated description", + is_public=False + ) + + assert file_update.filename == "updated.txt" + assert file_update.description == "Updated description" + assert file_update.is_public is False + + +class TestFileServices: + """Test file service functions""" + + @pytest.fixture + def temp_storage(self): + """Create temporary storage directory""" + temp_dir = tempfile.mkdtemp() + yield temp_dir + shutil.rmtree(temp_dir) + + def test_generate_file_id(self): + """Test file ID generation""" + file_id = generate_file_id() + assert len(file_id) == 12 + assert file_id.isalnum() + + # Test uniqueness + file_id2 = generate_file_id() + assert file_id != file_id2 + + def test_generate_file_path(self): + """Test file path generation""" + path = generate_file_path( + EntityType.PROJECT, + "PROJ001", + FileType.FASTQ, + "sample.fastq" + ) + + # Should contain entity type, entity id, file type, year, month, filename + path_parts = path.split("/") + assert len(path_parts) == 6 + assert path_parts[0] == "project" + assert path_parts[1] == "PROJ001" + assert path_parts[2] == "fastq" + assert path_parts[5] == "sample.fastq" + + # Year and month should be current + now = datetime.utcnow() + assert path_parts[3] == now.strftime("%Y") + assert path_parts[4] == now.strftime("%m") + + def test_calculate_file_checksum(self): + """Test file checksum calculation""" + content = b"Hello, World!" + checksum = calculate_file_checksum(content) + + # Should be SHA-256 hash + assert len(checksum) == 64 + assert checksum == "dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f" + + def test_get_mime_type(self): + """Test MIME type detection""" + assert get_mime_type("test.txt") == "text/plain" + assert get_mime_type("test.pdf") == "application/pdf" + assert get_mime_type("test.jpg") == "image/jpeg" + assert get_mime_type("test.fastq") == "application/octet-stream" # Unknown extension + + def test_create_file_without_content(self, session: Session, temp_storage): + """Test creating file record without content""" + file_create = FileCreate( + filename="test.txt", + description="Test file", + file_type=FileType.DOCUMENT, + entity_type=EntityType.PROJECT, + entity_id="PROJ001", + created_by="testuser" + ) + + file_record = create_file(session, file_create, storage_root=temp_storage) + + assert file_record.filename == "test.txt" + assert file_record.description == "Test file" + assert file_record.file_type == FileType.DOCUMENT + assert file_record.entity_type == EntityType.PROJECT + assert file_record.entity_id == "PROJ001" + assert file_record.created_by == "testuser" + assert file_record.file_size is None + assert file_record.checksum is None + assert file_record.mime_type == "text/plain" + assert len(file_record.file_id) == 12 + + def test_create_file_with_content(self, session: Session, temp_storage): + """Test creating file record with content""" + content = b"Hello, World!" 
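+        # file_size and checksum on the stored record should be derived from this payload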
+ file_create = FileCreate( + filename="test.txt", + description="Test file", + file_type=FileType.DOCUMENT, + entity_type=EntityType.PROJECT, + entity_id="PROJ001" + ) + + file_record = create_file(session, file_create, content, storage_root=temp_storage) + + assert file_record.file_size == len(content) + assert file_record.checksum == calculate_file_checksum(content) + + # Check that file was actually written + file_path = Path(temp_storage) / file_record.file_path + assert file_path.exists() + assert file_path.read_bytes() == content + + def test_get_file(self, session: Session, temp_storage): + """Test getting file by file_id""" + file_create = FileCreate( + filename="test.txt", + entity_type=EntityType.PROJECT, + entity_id="PROJ001" + ) + + created_file = create_file(session, file_create, storage_root=temp_storage) + retrieved_file = get_file(session, created_file.file_id) + + assert retrieved_file.id == created_file.id + assert retrieved_file.file_id == created_file.file_id + assert retrieved_file.filename == created_file.filename + + def test_get_file_not_found(self, session: Session): + """Test getting non-existent file""" + with pytest.raises(Exception) as exc_info: + get_file(session, "nonexistent") + + assert "not found" in str(exc_info.value) + + def test_update_file(self, session: Session, temp_storage): + """Test updating file metadata""" + file_create = FileCreate( + filename="test.txt", + description="Original description", + entity_type=EntityType.PROJECT, + entity_id="PROJ001" + ) + + created_file = create_file(session, file_create, storage_root=temp_storage) + + file_update = FileUpdate( + filename="updated.txt", + description="Updated description", + is_public=True + ) + + updated_file = update_file(session, created_file.file_id, file_update) + + assert updated_file.filename == "updated.txt" + assert updated_file.description == "Updated description" + assert updated_file.is_public is True + + def test_delete_file(self, session: Session, temp_storage): + """Test deleting file and content""" + content = b"Hello, World!" 
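+        # Use real content so both the database row and the on-disk file exist before deletion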
+ file_create = FileCreate( + filename="test.txt", + entity_type=EntityType.PROJECT, + entity_id="PROJ001" + ) + + created_file = create_file(session, file_create, content, storage_root=temp_storage) + file_path = Path(temp_storage) / created_file.file_path + + # Verify file exists + assert file_path.exists() + + # Delete file + result = delete_file(session, created_file.file_id, storage_root=temp_storage) + assert result is True + + # Verify file is deleted + assert not file_path.exists() + + # Verify database record is deleted + with pytest.raises(Exception): + get_file(session, created_file.file_id) + + def test_list_files_empty(self, session: Session): + """Test listing files when none exist""" + result = list_files(session) + + assert result.total_items == 0 + assert result.total_pages == 0 + assert result.current_page == 1 + assert result.per_page == 20 + assert result.has_next is False + assert result.has_prev is False + assert len(result.data) == 0 + + def test_list_files_with_data(self, session: Session, temp_storage): + """Test listing files with data""" + # Create test files + for i in range(3): + file_create = FileCreate( + filename=f"test{i}.txt", + description=f"Test file {i}", + entity_type=EntityType.PROJECT, + entity_id="PROJ001", + file_type=FileType.DOCUMENT + ) + create_file(session, file_create, storage_root=temp_storage) + + result = list_files(session) + + assert result.total_items == 3 + assert result.total_pages == 1 + assert result.current_page == 1 + assert result.per_page == 20 + assert result.has_next is False + assert result.has_prev is False + assert len(result.data) == 3 + + def test_list_files_for_entity(self, session: Session, temp_storage): + """Test listing files for specific entity""" + # Create files for different entities + for entity_id in ["PROJ001", "PROJ002"]: + for i in range(2): + file_create = FileCreate( + filename=f"test{i}.txt", + entity_type=EntityType.PROJECT, + entity_id=entity_id + ) + create_file(session, file_create, storage_root=temp_storage) + + # List files for PROJ001 + result = list_files_for_entity(session, EntityType.PROJECT, "PROJ001") + + assert result.total_items == 2 + assert all(file.entity_id == "PROJ001" for file in result.data) + + def test_get_file_count_for_entity(self, session: Session, temp_storage): + """Test getting file count for entity""" + # Create files for entity + for i in range(3): + file_create = FileCreate( + filename=f"test{i}.txt", + entity_type=EntityType.PROJECT, + entity_id="PROJ001" + ) + create_file(session, file_create, storage_root=temp_storage) + + count = get_file_count_for_entity(session, EntityType.PROJECT, "PROJ001") + assert count == 3 + + def test_get_file_content(self, session: Session, temp_storage): + """Test retrieving file content""" + content = b"Hello, World!" 
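+        # Round-trip: persist the payload via create_file, then read it back with get_file_content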
+ file_create = FileCreate( + filename="test.txt", + entity_type=EntityType.PROJECT, + entity_id="PROJ001" + ) + + created_file = create_file(session, file_create, content, storage_root=temp_storage) + retrieved_content = get_file_content( + session, created_file.file_id, storage_root=temp_storage + ) + + assert retrieved_content == content + + def test_get_file_content_not_found(self, session: Session, temp_storage): + """Test retrieving content for non-existent file""" + file_create = FileCreate( + filename="test.txt", + entity_type=EntityType.PROJECT, + entity_id="PROJ001" + ) + + # Create file record without content + created_file = create_file(session, file_create, storage_root=temp_storage) + + with pytest.raises(Exception) as exc_info: + get_file_content(session, created_file.file_id, storage_root=temp_storage) + + assert "not found" in str(exc_info.value) + + +class TestFileAPI: + """Test file API endpoints""" + + def test_create_file_endpoint(self, client: TestClient, session: Session): + """Test file creation endpoint""" + # This would test the actual API endpoint once routes are implemented + # For now, we'll skip this as routes.py needs to be updated + pass + + def test_get_files_endpoint(self, client: TestClient, session: Session): + """Test file listing endpoint""" + # This would test the actual API endpoint once routes are implemented + pass + + def test_get_file_endpoint(self, client: TestClient, session: Session): + """Test file retrieval endpoint""" + # This would test the actual API endpoint once routes are implemented + pass + + def test_update_file_endpoint(self, client: TestClient, session: Session): + """Test file update endpoint""" + # This would test the actual API endpoint once routes are implemented + pass + + def test_delete_file_endpoint(self, client: TestClient, session: Session): + """Test file deletion endpoint""" + # This would test the actual API endpoint once routes are implemented + pass + + +class TestFileIntegration: + """Integration tests for file operations""" + + @pytest.fixture + def temp_storage(self): + """Create temporary storage directory""" + temp_dir = tempfile.mkdtemp() + yield temp_dir + shutil.rmtree(temp_dir) + + def test_complete_file_lifecycle(self, session: Session, temp_storage): + """Test complete file lifecycle: create, read, update, delete""" + content = b"Initial content" + + # Create file + file_create = FileCreate( + filename="lifecycle.txt", + description="Lifecycle test file", + file_type=FileType.DOCUMENT, + entity_type=EntityType.PROJECT, + entity_id="PROJ001", + created_by="testuser" + ) + + created_file = create_file(session, file_create, content, storage_root=temp_storage) + assert created_file.filename == "lifecycle.txt" + assert created_file.file_size == len(content) + + # Read file + retrieved_file = get_file(session, created_file.file_id) + assert retrieved_file.id == created_file.id + + retrieved_content = get_file_content( + session, created_file.file_id, storage_root=temp_storage + ) + assert retrieved_content == content + + # Update file metadata + file_update = FileUpdate( + description="Updated lifecycle test file", + is_public=True + ) + + updated_file = update_file(session, created_file.file_id, file_update) + assert updated_file.description == "Updated lifecycle test file" + assert updated_file.is_public is True + + # Delete file + result = delete_file(session, created_file.file_id, storage_root=temp_storage) + assert result is True + + # Verify deletion + with pytest.raises(Exception): + get_file(session, 
created_file.file_id) + + def test_multiple_entities_file_management(self, session: Session, temp_storage): + """Test file management across multiple entities""" + entities = [ + (EntityType.PROJECT, "PROJ001"), + (EntityType.PROJECT, "PROJ002"), + (EntityType.RUN, "190110_MACHINE123_0001_FLOWCELL123") + ] + + created_files = [] + + # Create files for different entities + for entity_type, entity_id in entities: + for i in range(2): + file_create = FileCreate( + filename=f"file{i}.txt", + entity_type=entity_type, + entity_id=entity_id, + file_type=FileType.DOCUMENT + ) + + file_record = create_file(session, file_create, storage_root=temp_storage) + created_files.append(file_record) + + # Verify total count + all_files = list_files(session) + assert all_files.total_items == 6 + + # Verify entity-specific counts + for entity_type, entity_id in entities: + entity_files = list_files_for_entity(session, entity_type, entity_id) + assert entity_files.total_items == 2 + + count = get_file_count_for_entity(session, entity_type, entity_id) + assert count == 2 + + def test_file_type_filtering(self, session: Session, temp_storage): + """Test filtering files by type""" + file_types = [FileType.FASTQ, FileType.BAM, FileType.VCF, FileType.DOCUMENT] + + # Create files of different types + for file_type in file_types: + file_create = FileCreate( + filename=f"test.{file_type.value}", + entity_type=EntityType.PROJECT, + entity_id="PROJ001", + file_type=file_type + ) + create_file(session, file_create, storage_root=temp_storage) + + # Test filtering by each type + for file_type in file_types: + from api.files.models import FileFilters + filters = FileFilters(file_type=file_type) + result = list_files(session, filters=filters) + + assert result.total_items == 1 + assert result.data[0].file_type == file_type \ No newline at end of file From 7b624c4c06155a10f96d9598c2c88a7871777ab9 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Sat, 20 Sep 2025 14:01:58 -0400 Subject: [PATCH 03/35] lint --- api/files/models.py | 20 ++++---- api/files/services.py | 68 +++++++++++++------------- tests/api/test_files.py | 104 ++++++++++++++++++++-------------------- 3 files changed, 96 insertions(+), 96 deletions(-) diff --git a/api/files/models.py b/api/files/models.py index f78d4f1..0346e56 100644 --- a/api/files/models.py +++ b/api/files/models.py @@ -40,9 +40,9 @@ class StorageBackend(str, Enum): class File(SQLModel, table=True): """Core file entity that can be associated with runs or projects""" - + __searchable__ = ["filename", "description", "file_id"] - + id: uuid.UUID | None = Field(default_factory=uuid.uuid4, primary_key=True) file_id: str = Field(unique=True, max_length=100) # Human-readable identifier filename: str = Field(max_length=255) @@ -51,17 +51,17 @@ class File(SQLModel, table=True): file_size: int | None = None # Size in bytes mime_type: str | None = Field(default=None, max_length=100) checksum: str | None = Field(default=None, max_length=64) # SHA-256 hash - + # Metadata description: str | None = Field(default=None, max_length=1024) file_type: FileType = Field(default=FileType.OTHER) upload_date: datetime = Field(default_factory=datetime.utcnow) created_by: str | None = Field(default=None, max_length=100) # User identifier - + # Polymorphic associations entity_type: EntityType # "project" or "run" entity_id: str = Field(max_length=100) # project_id or run barcode - + # Storage metadata storage_backend: StorageBackend = Field(default=StorageBackend.LOCAL) is_public: bool = Field(default=False) @@ -87,7 +87,7 
@@ class FileCreate(SQLModel): entity_id: str is_public: bool = False created_by: str | None = None - + model_config = ConfigDict(extra="forbid") @@ -98,7 +98,7 @@ class FileUpdate(SQLModel): file_type: FileType | None = None is_public: bool | None = None is_archived: bool | None = None - + model_config = ConfigDict(extra="forbid") @@ -138,7 +138,7 @@ class FileUploadRequest(SQLModel): description: str | None = None file_type: FileType = FileType.OTHER is_public: bool = False - + model_config = ConfigDict(extra="forbid") @@ -162,5 +162,5 @@ class FileFilters(SQLModel): is_public: bool | None = None is_archived: bool | None = None search_query: str | None = None # Search in filename/description - - model_config = ConfigDict(extra="forbid") \ No newline at end of file + + model_config = ConfigDict(extra="forbid") diff --git a/api/files/services.py b/api/files/services.py index e1acf19..507dfa7 100644 --- a/api/files/services.py +++ b/api/files/services.py @@ -37,7 +37,7 @@ def generate_file_path( now = datetime.utcnow() year = now.strftime("%Y") month = now.strftime("%m") - + # Create path structure: /{entity_type}/{entity_id}/{file_type}/{year}/{month}/{filename} path_parts = [ entity_type.value, @@ -69,13 +69,13 @@ def create_file( storage_root: str = "storage" ) -> File: """Create a new file record and optionally store content""" - + # Generate unique file ID file_id = generate_file_id() - + # Use original_filename if provided, otherwise use filename original_filename = file_create.original_filename or file_create.filename - + # Generate file path file_path = generate_file_path( file_create.entity_type, @@ -83,12 +83,12 @@ def create_file( file_create.file_type, f"{file_id}_{file_create.filename}" ) - + # Calculate file metadata if content is provided file_size = len(file_content) if file_content else None checksum = calculate_file_checksum(file_content) if file_content else None mime_type = get_mime_type(file_create.filename) - + # Create file record file_record = File( file_id=file_id, @@ -106,19 +106,19 @@ def create_file( is_public=file_create.is_public, storage_backend=StorageBackend.LOCAL ) - + # Store file content if provided if file_content: full_path = Path(storage_root) / file_path full_path.parent.mkdir(parents=True, exist_ok=True) with open(full_path, "wb") as f: f.write(file_content) - + # Save to database session.add(file_record) session.commit() session.refresh(file_record) - + return file_record @@ -127,13 +127,13 @@ def get_file(session: Session, file_id: str) -> File: file_record = session.exec( select(File).where(File.file_id == file_id) ).first() - + if not file_record: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=f"File with id {file_id} not found" ) - + return file_record @@ -142,52 +142,52 @@ def get_file_by_id(session: Session, id: str) -> File: file_record = session.exec( select(File).where(File.id == id) ).first() - + if not file_record: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=f"File with internal id {id} not found" ) - + return file_record def update_file(session: Session, file_id: str, file_update: FileUpdate) -> File: """Update file metadata""" file_record = get_file(session, file_id) - + # Update fields that are provided update_data = file_update.model_dump(exclude_unset=True) for field, value in update_data.items(): setattr(file_record, field, value) - + session.add(file_record) session.commit() session.refresh(file_record) - + return file_record def delete_file(session: Session, file_id: str, storage_root: 
str = "storage") -> bool: """Delete a file record and its content""" file_record = get_file(session, file_id) - + # Delete physical file if it exists full_path = Path(storage_root) / file_record.file_path if full_path.exists(): full_path.unlink() - + # Try to remove empty directories try: full_path.parent.rmdir() except OSError: # Directory not empty, that's fine pass - + # Delete from database session.delete(file_record) session.commit() - + return True @@ -200,10 +200,10 @@ def list_files( sort_order: str = "desc" ) -> FilesPublic: """List files with filtering and pagination""" - + # Build query query = select(File) - + # Apply filters if filters: if filters.entity_type: @@ -226,28 +226,28 @@ def list_files( (File.filename.ilike(search_term)) | (File.description.ilike(search_term)) ) - + # Get total count total_count = session.exec( select(func.count()).select_from(query.subquery()) ).one() - + # Calculate pagination total_pages = (total_count + per_page - 1) // per_page - + # Apply sorting sort_field = getattr(File, sort_by, File.upload_date) if sort_order == "desc": query = query.order_by(sort_field.desc()) else: query = query.order_by(sort_field.asc()) - + # Apply pagination query = query.offset((page - 1) * per_page).limit(per_page) - + # Execute query files = session.exec(query).all() - + # Convert to public models public_files = [ FilePublic( @@ -269,7 +269,7 @@ def list_files( ) for file in files ] - + return FilesPublic( data=public_files, total_items=total_count, @@ -284,14 +284,14 @@ def list_files( def get_file_content(session: Session, file_id: str, storage_root: str = "storage") -> bytes: """Get file content from storage""" file_record = get_file(session, file_id) - + full_path = Path(storage_root) / file_record.file_path if not full_path.exists(): raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=f"File content not found at {file_record.file_path}" ) - + with open(full_path, "rb") as f: return f.read() @@ -310,7 +310,7 @@ def list_files_for_entity( entity_id=entity_id, file_type=file_type ) - + return list_files( session=session, filters=filters, @@ -332,5 +332,5 @@ def get_file_count_for_entity( ~File.is_archived ) ).one() - - return count \ No newline at end of file + + return count diff --git a/tests/api/test_files.py b/tests/api/test_files.py index d439772..4ad7256 100644 --- a/tests/api/test_files.py +++ b/tests/api/test_files.py @@ -73,7 +73,7 @@ def test_file_create_model(self): is_public=True, created_by="testuser" ) - + assert file_create.filename == "test.txt" assert file_create.description == "Test file" assert file_create.file_type == FileType.DOCUMENT @@ -89,7 +89,7 @@ def test_file_update_model(self): description="Updated description", is_public=False ) - + assert file_update.filename == "updated.txt" assert file_update.description == "Updated description" assert file_update.is_public is False @@ -110,7 +110,7 @@ def test_generate_file_id(self): file_id = generate_file_id() assert len(file_id) == 12 assert file_id.isalnum() - + # Test uniqueness file_id2 = generate_file_id() assert file_id != file_id2 @@ -123,7 +123,7 @@ def test_generate_file_path(self): FileType.FASTQ, "sample.fastq" ) - + # Should contain entity type, entity id, file type, year, month, filename path_parts = path.split("/") assert len(path_parts) == 6 @@ -131,7 +131,7 @@ def test_generate_file_path(self): assert path_parts[1] == "PROJ001" assert path_parts[2] == "fastq" assert path_parts[5] == "sample.fastq" - + # Year and month should be current now = datetime.utcnow() 
assert path_parts[3] == now.strftime("%Y") @@ -141,7 +141,7 @@ def test_calculate_file_checksum(self): """Test file checksum calculation""" content = b"Hello, World!" checksum = calculate_file_checksum(content) - + # Should be SHA-256 hash assert len(checksum) == 64 assert checksum == "dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f" @@ -163,9 +163,9 @@ def test_create_file_without_content(self, session: Session, temp_storage): entity_id="PROJ001", created_by="testuser" ) - + file_record = create_file(session, file_create, storage_root=temp_storage) - + assert file_record.filename == "test.txt" assert file_record.description == "Test file" assert file_record.file_type == FileType.DOCUMENT @@ -187,12 +187,12 @@ def test_create_file_with_content(self, session: Session, temp_storage): entity_type=EntityType.PROJECT, entity_id="PROJ001" ) - + file_record = create_file(session, file_create, content, storage_root=temp_storage) - + assert file_record.file_size == len(content) assert file_record.checksum == calculate_file_checksum(content) - + # Check that file was actually written file_path = Path(temp_storage) / file_record.file_path assert file_path.exists() @@ -205,10 +205,10 @@ def test_get_file(self, session: Session, temp_storage): entity_type=EntityType.PROJECT, entity_id="PROJ001" ) - + created_file = create_file(session, file_create, storage_root=temp_storage) retrieved_file = get_file(session, created_file.file_id) - + assert retrieved_file.id == created_file.id assert retrieved_file.file_id == created_file.file_id assert retrieved_file.filename == created_file.filename @@ -217,7 +217,7 @@ def test_get_file_not_found(self, session: Session): """Test getting non-existent file""" with pytest.raises(Exception) as exc_info: get_file(session, "nonexistent") - + assert "not found" in str(exc_info.value) def test_update_file(self, session: Session, temp_storage): @@ -228,17 +228,17 @@ def test_update_file(self, session: Session, temp_storage): entity_type=EntityType.PROJECT, entity_id="PROJ001" ) - + created_file = create_file(session, file_create, storage_root=temp_storage) - + file_update = FileUpdate( filename="updated.txt", description="Updated description", is_public=True ) - + updated_file = update_file(session, created_file.file_id, file_update) - + assert updated_file.filename == "updated.txt" assert updated_file.description == "Updated description" assert updated_file.is_public is True @@ -251,20 +251,20 @@ def test_delete_file(self, session: Session, temp_storage): entity_type=EntityType.PROJECT, entity_id="PROJ001" ) - + created_file = create_file(session, file_create, content, storage_root=temp_storage) file_path = Path(temp_storage) / created_file.file_path - + # Verify file exists assert file_path.exists() - + # Delete file result = delete_file(session, created_file.file_id, storage_root=temp_storage) assert result is True - + # Verify file is deleted assert not file_path.exists() - + # Verify database record is deleted with pytest.raises(Exception): get_file(session, created_file.file_id) @@ -272,7 +272,7 @@ def test_delete_file(self, session: Session, temp_storage): def test_list_files_empty(self, session: Session): """Test listing files when none exist""" result = list_files(session) - + assert result.total_items == 0 assert result.total_pages == 0 assert result.current_page == 1 @@ -293,9 +293,9 @@ def test_list_files_with_data(self, session: Session, temp_storage): file_type=FileType.DOCUMENT ) create_file(session, file_create, storage_root=temp_storage) - + 
result = list_files(session) - + assert result.total_items == 3 assert result.total_pages == 1 assert result.current_page == 1 @@ -315,10 +315,10 @@ def test_list_files_for_entity(self, session: Session, temp_storage): entity_id=entity_id ) create_file(session, file_create, storage_root=temp_storage) - + # List files for PROJ001 result = list_files_for_entity(session, EntityType.PROJECT, "PROJ001") - + assert result.total_items == 2 assert all(file.entity_id == "PROJ001" for file in result.data) @@ -332,7 +332,7 @@ def test_get_file_count_for_entity(self, session: Session, temp_storage): entity_id="PROJ001" ) create_file(session, file_create, storage_root=temp_storage) - + count = get_file_count_for_entity(session, EntityType.PROJECT, "PROJ001") assert count == 3 @@ -344,12 +344,12 @@ def test_get_file_content(self, session: Session, temp_storage): entity_type=EntityType.PROJECT, entity_id="PROJ001" ) - + created_file = create_file(session, file_create, content, storage_root=temp_storage) retrieved_content = get_file_content( session, created_file.file_id, storage_root=temp_storage ) - + assert retrieved_content == content def test_get_file_content_not_found(self, session: Session, temp_storage): @@ -359,13 +359,13 @@ def test_get_file_content_not_found(self, session: Session, temp_storage): entity_type=EntityType.PROJECT, entity_id="PROJ001" ) - + # Create file record without content created_file = create_file(session, file_create, storage_root=temp_storage) - + with pytest.raises(Exception) as exc_info: get_file_content(session, created_file.file_id, storage_root=temp_storage) - + assert "not found" in str(exc_info.value) @@ -412,7 +412,7 @@ def temp_storage(self): def test_complete_file_lifecycle(self, session: Session, temp_storage): """Test complete file lifecycle: create, read, update, delete""" content = b"Initial content" - + # Create file file_create = FileCreate( filename="lifecycle.txt", @@ -422,34 +422,34 @@ def test_complete_file_lifecycle(self, session: Session, temp_storage): entity_id="PROJ001", created_by="testuser" ) - + created_file = create_file(session, file_create, content, storage_root=temp_storage) assert created_file.filename == "lifecycle.txt" assert created_file.file_size == len(content) - + # Read file retrieved_file = get_file(session, created_file.file_id) assert retrieved_file.id == created_file.id - + retrieved_content = get_file_content( session, created_file.file_id, storage_root=temp_storage ) assert retrieved_content == content - + # Update file metadata file_update = FileUpdate( description="Updated lifecycle test file", is_public=True ) - + updated_file = update_file(session, created_file.file_id, file_update) assert updated_file.description == "Updated lifecycle test file" assert updated_file.is_public is True - + # Delete file result = delete_file(session, created_file.file_id, storage_root=temp_storage) assert result is True - + # Verify deletion with pytest.raises(Exception): get_file(session, created_file.file_id) @@ -461,9 +461,9 @@ def test_multiple_entities_file_management(self, session: Session, temp_storage) (EntityType.PROJECT, "PROJ002"), (EntityType.RUN, "190110_MACHINE123_0001_FLOWCELL123") ] - + created_files = [] - + # Create files for different entities for entity_type, entity_id in entities: for i in range(2): @@ -473,26 +473,26 @@ def test_multiple_entities_file_management(self, session: Session, temp_storage) entity_id=entity_id, file_type=FileType.DOCUMENT ) - + file_record = create_file(session, file_create, 
storage_root=temp_storage) created_files.append(file_record) - + # Verify total count all_files = list_files(session) assert all_files.total_items == 6 - + # Verify entity-specific counts for entity_type, entity_id in entities: entity_files = list_files_for_entity(session, entity_type, entity_id) assert entity_files.total_items == 2 - + count = get_file_count_for_entity(session, entity_type, entity_id) assert count == 2 def test_file_type_filtering(self, session: Session, temp_storage): """Test filtering files by type""" file_types = [FileType.FASTQ, FileType.BAM, FileType.VCF, FileType.DOCUMENT] - + # Create files of different types for file_type in file_types: file_create = FileCreate( @@ -502,12 +502,12 @@ def test_file_type_filtering(self, session: Session, temp_storage): file_type=file_type ) create_file(session, file_create, storage_root=temp_storage) - + # Test filtering by each type for file_type in file_types: from api.files.models import FileFilters filters = FileFilters(file_type=file_type) result = list_files(session, filters=filters) - + assert result.total_items == 1 - assert result.data[0].file_type == file_type \ No newline at end of file + assert result.data[0].file_type == file_type From 6d59ec46aa983d2806fe24fd85b4d3cc50a421a9 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Sat, 20 Sep 2025 14:09:27 -0400 Subject: [PATCH 04/35] Add file db support --- .../versions/f0aae41a019b_add_file_table.py | 33 +++ api/files/models.py | 13 + api/files/routes.py | 277 ++++++++++++++++-- api/files/services.py | 27 ++ 4 files changed, 321 insertions(+), 29 deletions(-) create mode 100644 alembic/versions/f0aae41a019b_add_file_table.py diff --git a/alembic/versions/f0aae41a019b_add_file_table.py b/alembic/versions/f0aae41a019b_add_file_table.py new file mode 100644 index 0000000..1b97162 --- /dev/null +++ b/alembic/versions/f0aae41a019b_add_file_table.py @@ -0,0 +1,33 @@ +"""add_file_table + +Revision ID: f0aae41a019b +Revises: 43c1f122cf7f +Create Date: 2025-09-20 14:07:19.102747 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +import sqlmodel + + +# revision identifiers, used by Alembic. +revision: str = 'f0aae41a019b' +down_revision: Union[str, Sequence[str], None] = '43c1f122cf7f' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + pass + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + pass + # ### end Alembic commands ### diff --git a/api/files/models.py b/api/files/models.py index 0346e56..468901e 100644 --- a/api/files/models.py +++ b/api/files/models.py @@ -164,3 +164,16 @@ class FileFilters(SQLModel): search_query: str | None = None # Search in filename/description model_config = ConfigDict(extra="forbid") + + +class PaginatedFileResponse(SQLModel): + """Paginated response for file listings""" + data: list[FilePublic] + total_items: int + total_pages: int + current_page: int + per_page: int + has_next: bool + has_prev: bool + + model_config = ConfigDict(from_attributes=True) diff --git a/api/files/routes.py b/api/files/routes.py index 3e88e67..65bf706 100644 --- a/api/files/routes.py +++ b/api/files/routes.py @@ -2,54 +2,273 @@ Routes/endpoints for the Files API """ -from typing import Literal -from fastapi import APIRouter, Query, status +from typing import Optional +from fastapi import APIRouter, Query, HTTPException, status, UploadFile, File as FastAPIFile +from fastapi.responses import StreamingResponse from core.deps import SessionDep -from api.files.models import File, FileCreate, FilePublic, FilesPublic +from api.files.models import ( + FileCreate, + FileUpdate, + FilePublic, + PaginatedFileResponse, + FileFilters, + FileType, + EntityType, +) import api.files.services as services -router = APIRouter(prefix="/files", tags=["File Endpoints"]) +router = APIRouter(prefix="/files", tags=["Files"]) @router.post( "", response_model=FilePublic, - tags=["File Endpoints"], status_code=status.HTTP_201_CREATED, + summary="Create a new file record" ) -def create_file(session: SessionDep, file_in: FileCreate) -> File: +def create_file( + session: SessionDep, + file_in: FileCreate, + content: Optional[UploadFile] = FastAPIFile(None) +) -> FilePublic: """ - Create a new file with optional attributes. + Create a new file record with optional file content upload. + + - **filename**: Name of the file + - **description**: Optional description of the file + - **file_type**: Type of file (fastq, bam, vcf, etc.) 
+ - **entity_type**: Whether this file belongs to a project or run + - **entity_id**: ID of the project or run this file belongs to + - **is_public**: Whether the file is publicly accessible + - **created_by**: User who created the file """ - return services.create_file(session=session, file_in=file_in) + file_content = None + if content and content.filename: + file_content = content.file.read() + + return services.create_file(session, file_in, file_content) -@router.get("", response_model=FilesPublic, tags=["File Endpoints"]) -def get_files( +@router.get( + "", + response_model=PaginatedFileResponse, + summary="List files with filtering and pagination" +) +def list_files( session: SessionDep, - page: int = Query(1, description="Page number (1-indexed)"), - per_page: int = Query(20, description="Number of items per page"), - sort_by: str = Query("file_id", description="Field to sort by"), - sort_order: Literal["asc", "desc"] = Query( - "asc", description="Sort order (asc or desc)" - ), -) -> FilesPublic: + page: int = Query(1, ge=1, description="Page number (1-indexed)"), + per_page: int = Query(20, ge=1, le=100, description="Number of items per page"), + entity_type: Optional[EntityType] = Query(None, description="Filter by entity type"), + entity_id: Optional[str] = Query(None, description="Filter by entity ID"), + file_type: Optional[FileType] = Query(None, description="Filter by file type"), + search: Optional[str] = Query(None, description="Search in filename and description"), + is_public: Optional[bool] = Query(None, description="Filter by public/private status"), + created_by: Optional[str] = Query(None, description="Filter by creator"), +) -> PaginatedFileResponse: """ - Returns a paginated list of files. + Get a paginated list of files with optional filtering. + + Supports filtering by: + - Entity type and ID (project or run) + - File type (fastq, bam, vcf, etc.) + - Public/private status + - Creator + - Text search in filename and description """ - return services.get_files( - session=session, - page=page, - per_page=per_page, - sort_by=sort_by, - sort_order=sort_order, + filters = FileFilters( + entity_type=entity_type, + entity_id=entity_id, + file_type=file_type, + search_query=search, + is_public=is_public, + created_by=created_by, ) + + return services.list_files(session, filters, page, per_page) + + +@router.get( + "/{file_id}", + response_model=FilePublic, + summary="Get file by ID" +) +def get_file( + session: SessionDep, + file_id: str +) -> FilePublic: + """ + Get a single file by its file_id. + + - **file_id**: The unique file identifier + """ + try: + return services.get_file(session, file_id) + except Exception: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"File with id {file_id} not found" + ) + + +@router.put( + "/{file_id}", + response_model=FilePublic, + summary="Update file metadata" +) +def update_file( + session: SessionDep, + file_id: str, + file_update: FileUpdate +) -> FilePublic: + """ + Update file metadata. + + - **file_id**: The unique file identifier + - **file_update**: Fields to update + """ + try: + return services.update_file(session, file_id, file_update) + except Exception: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"File with id {file_id} not found" + ) + + +@router.delete( + "/{file_id}", + status_code=status.HTTP_204_NO_CONTENT, + summary="Delete file" +) +def delete_file( + session: SessionDep, + file_id: str +) -> None: + """ + Delete a file and its content. 
+ + - **file_id**: The unique file identifier + """ + try: + success = services.delete_file(session, file_id) + if not success: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"File with id {file_id} not found" + ) + except Exception: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"File with id {file_id} not found" + ) -@router.get("/{file_id}", response_model=FilePublic, tags=["File Endpoints"]) -def get_file_by_file_id(session: SessionDep, file_id: str) -> File: +@router.get( + "/{file_id}/content", + summary="Download file content" +) +def download_file( + session: SessionDep, + file_id: str +): + """ + Download the content of a file. + + - **file_id**: The unique file identifier + """ + try: + # Get file metadata + file_record = services.get_file(session, file_id) + + # Get file content + content = services.get_file_content(session, file_id) + + # Create streaming response + def generate(): + yield content + + return StreamingResponse( + generate(), + media_type=file_record.mime_type or "application/octet-stream", + headers={ + "Content-Disposition": f"attachment; filename={file_record.filename}" + } + ) + except Exception: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"File with id {file_id} not found or content not available" + ) + + +@router.post( + "/{file_id}/content", + response_model=FilePublic, + summary="Upload file content" +) +def upload_file_content( + session: SessionDep, + file_id: str, + content: UploadFile = FastAPIFile(...) +) -> FilePublic: + """ + Upload content for an existing file record. + + - **file_id**: The unique file identifier + - **content**: The file content to upload + """ + try: + file_content = content.file.read() + return services.update_file_content(session, file_id, file_content) + except Exception: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"File with id {file_id} not found" + ) + + +@router.get( + "/entity/{entity_type}/{entity_id}", + response_model=PaginatedFileResponse, + summary="List files for a specific entity" +) +def list_files_for_entity( + session: SessionDep, + entity_type: EntityType, + entity_id: str, + page: int = Query(1, ge=1, description="Page number (1-indexed)"), + per_page: int = Query(20, ge=1, le=100, description="Number of items per page"), + file_type: Optional[FileType] = Query(None, description="Filter by file type"), +) -> PaginatedFileResponse: + """ + Get all files associated with a specific project or run. + + - **entity_type**: Either "project" or "run" + - **entity_id**: The project ID or run barcode + """ + filters = FileFilters( + entity_type=entity_type, + entity_id=entity_id, + file_type=file_type, + ) + + return services.list_files_for_entity(session, entity_type, entity_id, page, per_page, filters) + + +@router.get( + "/entity/{entity_type}/{entity_id}/count", + summary="Get file count for entity" +) +def get_file_count_for_entity( + session: SessionDep, + entity_type: EntityType, + entity_id: str +) -> dict: """ - Returns a single file by its file_id. - Note: This is different from its internal "id". + Get the total number of files for a specific project or run. 
+ + - **entity_type**: Either "project" or "run" + - **entity_id**: The project ID or run barcode """ - return services.get_file_by_file_id(session=session, file_id=file_id) + count = services.get_file_count_for_entity(session, entity_type, entity_id) + return {"entity_type": entity_type, "entity_id": entity_id, "file_count": count} diff --git a/api/files/services.py b/api/files/services.py index 507dfa7..8232577 100644 --- a/api/files/services.py +++ b/api/files/services.py @@ -334,3 +334,30 @@ def get_file_count_for_entity( ).one() return count + + +def update_file_content( + session: Session, file_id: str, content: bytes, storage_root: str = "storage" +) -> File: + """Update file content""" + # Get the file record + file_record = get_file(session, file_id) + + # Calculate new file metadata + file_size = len(content) + checksum = calculate_file_checksum(content) + + # Write content to storage + storage_path = Path(storage_root) / file_record.file_path + storage_path.parent.mkdir(parents=True, exist_ok=True) + storage_path.write_bytes(content) + + # Update file record + file_record.file_size = file_size + file_record.checksum = checksum + + session.add(file_record) + session.commit() + session.refresh(file_record) + + return file_record From 477dd3d291b31d8540a21a50f3edf0ffe6f2382c Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Sat, 20 Sep 2025 14:09:39 -0400 Subject: [PATCH 05/35] Add files rout --- main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.py b/main.py index cf1f56b..08c38e0 100644 --- a/main.py +++ b/main.py @@ -12,6 +12,7 @@ from api.runs.routes import router as runs_router from api.samples.routes import router as samples_router from api.search.routes import router as search_router +from api.files.routes import router as files_router # Customize route id's @@ -50,6 +51,7 @@ def root(): app.include_router(runs_router, prefix=API_PREFIX) app.include_router(samples_router, prefix=API_PREFIX) app.include_router(search_router, prefix=API_PREFIX) +app.include_router(files_router, prefix=API_PREFIX) if __name__ == "__main__": # For debugging purposes From 1998cb8c7004b77e278f74456f315e1690624cf7 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Sat, 20 Sep 2025 14:10:47 -0400 Subject: [PATCH 06/35] Remove empty alembic revision --- .../versions/f0aae41a019b_add_file_table.py | 33 ------------------- 1 file changed, 33 deletions(-) delete mode 100644 alembic/versions/f0aae41a019b_add_file_table.py diff --git a/alembic/versions/f0aae41a019b_add_file_table.py b/alembic/versions/f0aae41a019b_add_file_table.py deleted file mode 100644 index 1b97162..0000000 --- a/alembic/versions/f0aae41a019b_add_file_table.py +++ /dev/null @@ -1,33 +0,0 @@ -"""add_file_table - -Revision ID: f0aae41a019b -Revises: 43c1f122cf7f -Create Date: 2025-09-20 14:07:19.102747 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa -import sqlmodel - - -# revision identifiers, used by Alembic. -revision: str = 'f0aae41a019b' -down_revision: Union[str, Sequence[str], None] = '43c1f122cf7f' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - """Upgrade schema.""" - # ### commands auto generated by Alembic - please adjust! ### - pass - # ### end Alembic commands ### - - -def downgrade() -> None: - """Downgrade schema.""" - # ### commands auto generated by Alembic - please adjust! 
### - pass - # ### end Alembic commands ### From 0e54889947ce0f230150467b2e9b258b0327a407 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Sat, 20 Sep 2025 14:29:46 -0400 Subject: [PATCH 07/35] Properly generate the files table --- alembic/env.py | 3 ++ .../versions/95817dee746c_add_files_table.py | 53 +++++++++++++++++++ api/files/models.py | 2 +- 3 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 alembic/versions/95817dee746c_add_files_table.py diff --git a/alembic/env.py b/alembic/env.py index 0694a5b..f8206c9 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -11,6 +11,8 @@ from api.samples.models import Sample, SampleAttribute from api.project.models import Project from api.runs.models import SequencingRun +from api.files.models import File + # Import Base from your SQLModel setup #from core.db import Base @@ -31,6 +33,7 @@ # target_metadata = None target_metadata = SQLModel.metadata + # other values from the config, defined by the needs of env.py, # can be acquired: # my_important_option = config.get_main_option("my_important_option") diff --git a/alembic/versions/95817dee746c_add_files_table.py b/alembic/versions/95817dee746c_add_files_table.py new file mode 100644 index 0000000..a267f84 --- /dev/null +++ b/alembic/versions/95817dee746c_add_files_table.py @@ -0,0 +1,53 @@ +"""add_files_table + +Revision ID: 95817dee746c +Revises: c955364e5391 +Create Date: 2025-09-20 14:28:45.987919 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +import sqlmodel + + +# revision identifiers, used by Alembic. +revision: str = '95817dee746c' +down_revision: Union[str, Sequence[str], None] = 'c955364e5391' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table('file', + sa.Column('id', sa.Uuid(), nullable=False), + sa.Column('file_id', sqlmodel.sql.sqltypes.AutoString(length=100), nullable=False), + sa.Column('filename', sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False), + sa.Column('original_filename', sqlmodel.sql.sqltypes.AutoString(length=255), nullable=False), + sa.Column('file_path', sqlmodel.sql.sqltypes.AutoString(length=1024), nullable=False), + sa.Column('file_size', sa.Integer(), nullable=True), + sa.Column('mime_type', sqlmodel.sql.sqltypes.AutoString(length=100), nullable=True), + sa.Column('checksum', sqlmodel.sql.sqltypes.AutoString(length=64), nullable=True), + sa.Column('description', sqlmodel.sql.sqltypes.AutoString(length=1024), nullable=True), + sa.Column('file_type', sa.Enum('FASTQ', 'BAM', 'VCF', 'SAMPLESHEET', 'METRICS', 'REPORT', 'LOG', 'IMAGE', 'DOCUMENT', 'OTHER', name='filetype'), nullable=False), + sa.Column('upload_date', sa.DateTime(), nullable=False), + sa.Column('created_by', sqlmodel.sql.sqltypes.AutoString(length=100), nullable=True), + sa.Column('entity_type', sa.Enum('PROJECT', 'RUN', name='entitytype'), nullable=False), + sa.Column('entity_id', sqlmodel.sql.sqltypes.AutoString(length=100), nullable=False), + sa.Column('storage_backend', sa.Enum('LOCAL', 'S3', 'AZURE', 'GCS', name='storagebackend'), nullable=False), + sa.Column('is_public', sa.Boolean(), nullable=False), + sa.Column('is_archived', sa.Boolean(), nullable=False), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('file_id') + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_table('file') + # ### end Alembic commands ### diff --git a/api/files/models.py b/api/files/models.py index 468901e..b2f9646 100644 --- a/api/files/models.py +++ b/api/files/models.py @@ -2,12 +2,12 @@ Models for the Files API """ +from typing import List import uuid from datetime import datetime from enum import Enum from sqlmodel import SQLModel, Field from pydantic import ConfigDict -from typing import List class FileType(str, Enum): From f1c67f43e125f26598942de3255987d62f27238d Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Sat, 20 Sep 2025 14:32:12 -0400 Subject: [PATCH 08/35] Fix revision history --- alembic/versions/95817dee746c_add_files_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alembic/versions/95817dee746c_add_files_table.py b/alembic/versions/95817dee746c_add_files_table.py index a267f84..1c8fa83 100644 --- a/alembic/versions/95817dee746c_add_files_table.py +++ b/alembic/versions/95817dee746c_add_files_table.py @@ -14,7 +14,7 @@ # revision identifiers, used by Alembic. 
revision: str = '95817dee746c' -down_revision: Union[str, Sequence[str], None] = 'c955364e5391' +down_revision: Union[str, Sequence[str], None] = '43c1f122cf7f' branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None From 40311a60a9a02176a8f0b09936150cef653bd0ca Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Sun, 21 Sep 2025 12:35:02 -0400 Subject: [PATCH 09/35] Fix deprecation warning --- api/files/services.py | 4 ++-- tests/api/test_files.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/api/files/services.py b/api/files/services.py index 8232577..c7b2e19 100644 --- a/api/files/services.py +++ b/api/files/services.py @@ -33,8 +33,8 @@ def generate_file_path( entity_type: EntityType, entity_id: str, file_type: FileType, filename: str ) -> str: """Generate a structured file path""" - from datetime import datetime - now = datetime.utcnow() + from datetime import datetime, timezone + now = datetime.now(timezone.utc) year = now.strftime("%Y") month = now.strftime("%m") diff --git a/tests/api/test_files.py b/tests/api/test_files.py index 4ad7256..ab642a9 100644 --- a/tests/api/test_files.py +++ b/tests/api/test_files.py @@ -5,7 +5,7 @@ import tempfile import shutil from pathlib import Path -from datetime import datetime +from datetime import datetime, timezone import pytest from fastapi.testclient import TestClient @@ -133,7 +133,7 @@ def test_generate_file_path(self): assert path_parts[5] == "sample.fastq" # Year and month should be current - now = datetime.utcnow() + now = datetime.now(timezone.utc) assert path_parts[3] == now.strftime("%Y") assert path_parts[4] == now.strftime("%m") From 41d871351ec779b72906e2735e724284094d6369 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Sun, 21 Sep 2025 12:39:32 -0400 Subject: [PATCH 10/35] Lint --- api/files/routes.py | 30 +++++++++++++++--------------- api/files/services.py | 10 +++++----- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/api/files/routes.py b/api/files/routes.py index 65bf706..b77fab2 100644 --- a/api/files/routes.py +++ b/api/files/routes.py @@ -33,7 +33,7 @@ def create_file( ) -> FilePublic: """ Create a new file record with optional file content upload. - + - **filename**: Name of the file - **description**: Optional description of the file - **file_type**: Type of file (fastq, bam, vcf, etc.) @@ -45,7 +45,7 @@ def create_file( file_content = None if content and content.filename: file_content = content.file.read() - + return services.create_file(session, file_in, file_content) @@ -67,7 +67,7 @@ def list_files( ) -> PaginatedFileResponse: """ Get a paginated list of files with optional filtering. - + Supports filtering by: - Entity type and ID (project or run) - File type (fastq, bam, vcf, etc.) @@ -83,7 +83,7 @@ def list_files( is_public=is_public, created_by=created_by, ) - + return services.list_files(session, filters, page, per_page) @@ -98,7 +98,7 @@ def get_file( ) -> FilePublic: """ Get a single file by its file_id. - + - **file_id**: The unique file identifier """ try: @@ -122,7 +122,7 @@ def update_file( ) -> FilePublic: """ Update file metadata. - + - **file_id**: The unique file identifier - **file_update**: Fields to update """ @@ -146,7 +146,7 @@ def delete_file( ) -> None: """ Delete a file and its content. - + - **file_id**: The unique file identifier """ try: @@ -173,20 +173,20 @@ def download_file( ): """ Download the content of a file. 
- + - **file_id**: The unique file identifier """ try: # Get file metadata file_record = services.get_file(session, file_id) - + # Get file content content = services.get_file_content(session, file_id) - + # Create streaming response def generate(): yield content - + return StreamingResponse( generate(), media_type=file_record.mime_type or "application/octet-stream", @@ -213,7 +213,7 @@ def upload_file_content( ) -> FilePublic: """ Upload content for an existing file record. - + - **file_id**: The unique file identifier - **content**: The file content to upload """ @@ -242,7 +242,7 @@ def list_files_for_entity( ) -> PaginatedFileResponse: """ Get all files associated with a specific project or run. - + - **entity_type**: Either "project" or "run" - **entity_id**: The project ID or run barcode """ @@ -251,7 +251,7 @@ def list_files_for_entity( entity_id=entity_id, file_type=file_type, ) - + return services.list_files_for_entity(session, entity_type, entity_id, page, per_page, filters) @@ -266,7 +266,7 @@ def get_file_count_for_entity( ) -> dict: """ Get the total number of files for a specific project or run. - + - **entity_type**: Either "project" or "run" - **entity_id**: The project ID or run barcode """ diff --git a/api/files/services.py b/api/files/services.py index c7b2e19..9a4ccf6 100644 --- a/api/files/services.py +++ b/api/files/services.py @@ -342,22 +342,22 @@ def update_file_content( """Update file content""" # Get the file record file_record = get_file(session, file_id) - + # Calculate new file metadata file_size = len(content) checksum = calculate_file_checksum(content) - + # Write content to storage storage_path = Path(storage_root) / file_record.file_path storage_path.parent.mkdir(parents=True, exist_ok=True) storage_path.write_bytes(content) - + # Update file record file_record.file_size = file_size file_record.checksum = checksum - + session.add(file_record) session.commit() session.refresh(file_record) - + return file_record From 36de0e6cc389eb1448d6e581e539c58c7661347c Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Tue, 23 Sep 2025 10:16:39 -0400 Subject: [PATCH 11/35] Implement file api tests --- api/files/routes.py | 29 ++-- tests/api/test_files.py | 307 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 316 insertions(+), 20 deletions(-) diff --git a/api/files/routes.py b/api/files/routes.py index b77fab2..918a6cc 100644 --- a/api/files/routes.py +++ b/api/files/routes.py @@ -3,7 +3,7 @@ """ from typing import Optional -from fastapi import APIRouter, Query, HTTPException, status, UploadFile, File as FastAPIFile +from fastapi import APIRouter, Query, HTTPException, status, UploadFile, File as FastAPIFile, Form from fastapi.responses import StreamingResponse from core.deps import SessionDep from api.files.models import ( @@ -28,7 +28,13 @@ ) def create_file( session: SessionDep, - file_in: FileCreate, + filename: str = Form(...), + entity_type: EntityType = Form(...), + entity_id: str = Form(...), + description: Optional[str] = Form(None), + file_type: FileType = Form(FileType.OTHER), + is_public: bool = Form(False), + created_by: Optional[str] = Form(None), content: Optional[UploadFile] = FastAPIFile(None) ) -> FilePublic: """ @@ -42,6 +48,17 @@ def create_file( - **is_public**: Whether the file is publicly accessible - **created_by**: User who created the file """ + # Create FileCreate object from form data + file_in = FileCreate( + filename=filename, + description=description, + file_type=file_type, + entity_type=entity_type, + entity_id=entity_id, + 
is_public=is_public, + created_by=created_by + ) + file_content = None if content and content.filename: file_content = content.file.read() @@ -246,13 +263,7 @@ def list_files_for_entity( - **entity_type**: Either "project" or "run" - **entity_id**: The project ID or run barcode """ - filters = FileFilters( - entity_type=entity_type, - entity_id=entity_id, - file_type=file_type, - ) - - return services.list_files_for_entity(session, entity_type, entity_id, page, per_page, filters) + return services.list_files_for_entity(session, entity_type, entity_id, page, per_page, file_type) @router.get( diff --git a/tests/api/test_files.py b/tests/api/test_files.py index ab642a9..d2e6116 100644 --- a/tests/api/test_files.py +++ b/tests/api/test_files.py @@ -374,29 +374,314 @@ class TestFileAPI: def test_create_file_endpoint(self, client: TestClient, session: Session): """Test file creation endpoint""" - # This would test the actual API endpoint once routes are implemented - # For now, we'll skip this as routes.py needs to be updated - pass + file_data = { + "filename": "test_api.txt", + "description": "Test file via API", + "file_type": "document", + "entity_type": "project", + "entity_id": "PROJ001", + "is_public": "true", # Form data sends as string + "created_by": "api_test_user" + } + + response = client.post("/api/v1/files", data=file_data) + assert response.status_code == 201 + + data = response.json() + assert data["filename"] == "test_api.txt" + assert data["description"] == "Test file via API" + assert data["file_type"] == "document" + assert data["entity_type"] == "project" + assert data["entity_id"] == "PROJ001" + assert data["is_public"] is True + assert data["created_by"] == "api_test_user" + assert "file_id" in data + assert "upload_date" in data def test_get_files_endpoint(self, client: TestClient, session: Session): """Test file listing endpoint""" - # This would test the actual API endpoint once routes are implemented - pass + # First create some test files + for i in range(3): + file_data = { + "filename": f"test_list_{i}.txt", + "description": f"Test file {i}", + "file_type": "document", + "entity_type": "project", + "entity_id": "PROJ001", + "created_by": "list_test_user" + } + client.post("/api/v1/files", data=file_data) + + # Test basic listing + response = client.get("/api/v1/files") + assert response.status_code == 200 + + data = response.json() + assert "data" in data + assert "total_items" in data + assert "current_page" in data + assert "per_page" in data + assert data["total_items"] >= 3 + assert len(data["data"]) >= 3 + + # Test pagination + response = client.get("/api/v1/files?page=1&per_page=2") + assert response.status_code == 200 + data = response.json() + assert data["current_page"] == 1 + assert data["per_page"] == 2 + assert len(data["data"]) <= 2 + + # Test filtering by entity + response = client.get("/api/v1/files?entity_type=project&entity_id=PROJ001") + assert response.status_code == 200 + data = response.json() + for item in data["data"]: + assert item["entity_type"] == "project" + assert item["entity_id"] == "PROJ001" + + # Test filtering by file type + response = client.get("/api/v1/files?file_type=document") + assert response.status_code == 200 + data = response.json() + for item in data["data"]: + assert item["file_type"] == "document" + + # Test search functionality + response = client.get("/api/v1/files?search=test_list_1") + assert response.status_code == 200 + data = response.json() + assert data["total_items"] >= 1 def test_get_file_endpoint(self, 
client: TestClient, session: Session): """Test file retrieval endpoint""" - # This would test the actual API endpoint once routes are implemented - pass + # Create a test file + file_data = { + "filename": "test_get.txt", + "description": "Test file for GET", + "file_type": "document", + "entity_type": "project", + "entity_id": "PROJ001", + "created_by": "get_test_user" + } + + create_response = client.post("/api/v1/files", data=file_data) + assert create_response.status_code == 201 + created_file = create_response.json() + file_id = created_file["file_id"] + + # Test successful retrieval + response = client.get(f"/api/v1/files/{file_id}") + assert response.status_code == 200 + + data = response.json() + assert data["file_id"] == file_id + assert data["filename"] == "test_get.txt" + assert data["description"] == "Test file for GET" + + # Test non-existent file + response = client.get("/api/v1/files/non-existent-id") + assert response.status_code == 404 def test_update_file_endpoint(self, client: TestClient, session: Session): """Test file update endpoint""" - # This would test the actual API endpoint once routes are implemented - pass + # Create a test file + file_data = { + "filename": "test_update.txt", + "description": "Original description", + "file_type": "document", + "entity_type": "project", + "entity_id": "PROJ001", + "created_by": "update_test_user" + } + + create_response = client.post("/api/v1/files", data=file_data) + assert create_response.status_code == 201 + created_file = create_response.json() + file_id = created_file["file_id"] + + # Test successful update + update_data = { + "description": "Updated description", + "is_public": True + } + + response = client.put(f"/api/v1/files/{file_id}", json=update_data) + assert response.status_code == 200 + + data = response.json() + assert data["file_id"] == file_id + assert data["description"] == "Updated description" + assert data["is_public"] is True + assert data["filename"] == "test_update.txt" # Should remain unchanged + + # Test non-existent file update + response = client.put("/api/v1/files/non-existent-id", json=update_data) + assert response.status_code == 404 def test_delete_file_endpoint(self, client: TestClient, session: Session): """Test file deletion endpoint""" - # This would test the actual API endpoint once routes are implemented - pass + # Create a test file + file_data = { + "filename": "test_delete.txt", + "description": "Test file for deletion", + "file_type": "document", + "entity_type": "project", + "entity_id": "PROJ001", + "created_by": "delete_test_user" + } + + create_response = client.post("/api/v1/files", data=file_data) + assert create_response.status_code == 201 + created_file = create_response.json() + file_id = created_file["file_id"] + + # Test successful deletion + response = client.delete(f"/api/v1/files/{file_id}") + assert response.status_code == 204 + + # Verify file is deleted by trying to get it + get_response = client.get(f"/api/v1/files/{file_id}") + assert get_response.status_code == 404 + + # Test non-existent file deletion + response = client.delete("/api/v1/files/non-existent-id") + assert response.status_code == 404 + + def test_list_files_for_entity_endpoint(self, client: TestClient, session: Session): + """Test listing files for a specific entity""" + # Create files for different entities + entities = [ + ("project", "PROJ001"), + ("project", "PROJ002"), + ("run", "190110_MACHINE123_0001_FLOWCELL123") + ] + + for entity_type, entity_id in entities: + for i in range(2): + file_data 
= { + "filename": f"entity_test_{i}.txt", + "description": f"Test file {i} for {entity_type} {entity_id}", + "file_type": "document", + "entity_type": entity_type, + "entity_id": entity_id, + "created_by": "entity_test_user" + } + client.post("/api/v1/files", data=file_data) + + # Test listing files for specific project + response = client.get("/api/v1/files/entity/project/PROJ001") + assert response.status_code == 200 + + data = response.json() + assert data["total_items"] == 2 + for item in data["data"]: + assert item["entity_type"] == "project" + assert item["entity_id"] == "PROJ001" + + # Test listing files for specific run + response = client.get("/api/v1/files/entity/run/190110_MACHINE123_0001_FLOWCELL123") + assert response.status_code == 200 + + data = response.json() + assert data["total_items"] == 2 + for item in data["data"]: + assert item["entity_type"] == "run" + assert item["entity_id"] == "190110_MACHINE123_0001_FLOWCELL123" + + # Test pagination for entity files + response = client.get("/api/v1/files/entity/project/PROJ001?page=1&per_page=1") + assert response.status_code == 200 + data = response.json() + assert data["current_page"] == 1 + assert data["per_page"] == 1 + assert len(data["data"]) == 1 + + def test_get_file_count_for_entity_endpoint(self, client: TestClient, session: Session): + """Test getting file count for a specific entity""" + # Create files for a specific entity + entity_type = "project" + entity_id = "PROJ_COUNT_TEST" + + for i in range(5): + file_data = { + "filename": f"count_test_{i}.txt", + "description": f"Count test file {i}", + "file_type": "document", + "entity_type": entity_type, + "entity_id": entity_id, + "created_by": "count_test_user" + } + client.post("/api/v1/files", data=file_data) + + # Test file count endpoint + response = client.get(f"/api/v1/files/entity/{entity_type}/{entity_id}/count") + assert response.status_code == 200 + + data = response.json() + assert data["entity_type"] == entity_type + assert data["entity_id"] == entity_id + assert data["file_count"] == 5 + + # Test count for entity with no files + response = client.get("/api/v1/files/entity/project/EMPTY_PROJECT/count") + assert response.status_code == 200 + + data = response.json() + assert data["entity_type"] == "project" + assert data["entity_id"] == "EMPTY_PROJECT" + assert data["file_count"] == 0 + + def test_create_file_with_content_endpoint(self, client: TestClient, session: Session): + """Test file creation with content upload""" + import io + + # Create file data + file_data = { + "filename": "test_with_content.txt", + "description": "Test file with content", + "file_type": "document", + "entity_type": "project", + "entity_id": "PROJ001", + "created_by": "content_test_user" + } + + # Create file content + file_content = b"Hello, this is test content!" 
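+        # Package the bytes as a multipart file field named "content" so it maps onto
+        # the endpoint's UploadFile parameter alongside the regular form fields.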
+ files = {"content": ("test_content.txt", io.BytesIO(file_content), "text/plain")} + + # Send multipart form data + response = client.post("/api/v1/files", data=file_data, files=files) + assert response.status_code == 201 + + data = response.json() + assert data["filename"] == "test_with_content.txt" + assert data["file_size"] == len(file_content) + assert data["mime_type"] == "text/plain" + + def test_error_handling(self, client: TestClient, session: Session): + """Test API error handling""" + # Test invalid file type + invalid_file_data = { + "filename": "test.txt", + "file_type": "invalid_type", + "entity_type": "project", + "entity_id": "PROJ001" + } + + response = client.post("/api/v1/files", data=invalid_file_data) + assert response.status_code == 422 # Validation error + + # Test invalid entity type + invalid_entity_data = { + "filename": "test.txt", + "file_type": "document", + "entity_type": "invalid_entity", + "entity_id": "PROJ001" + } + + response = client.post("/api/v1/files", data=invalid_entity_data) + assert response.status_code == 422 # Validation error class TestFileIntegration: From c632dbc9b24381631f15f48e082a543e825e01d5 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Tue, 23 Sep 2025 10:36:26 -0400 Subject: [PATCH 12/35] Lint --- api/files/routes.py | 2 +- tests/api/test_files.py | 40 ++++++++++++++++++++-------------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/api/files/routes.py b/api/files/routes.py index 918a6cc..50c575f 100644 --- a/api/files/routes.py +++ b/api/files/routes.py @@ -58,7 +58,7 @@ def create_file( is_public=is_public, created_by=created_by ) - + file_content = None if content and content.filename: file_content = content.file.read() diff --git a/tests/api/test_files.py b/tests/api/test_files.py index d2e6116..321cb78 100644 --- a/tests/api/test_files.py +++ b/tests/api/test_files.py @@ -386,7 +386,7 @@ def test_create_file_endpoint(self, client: TestClient, session: Session): response = client.post("/api/v1/files", data=file_data) assert response.status_code == 201 - + data = response.json() assert data["filename"] == "test_api.txt" assert data["description"] == "Test file via API" @@ -415,7 +415,7 @@ def test_get_files_endpoint(self, client: TestClient, session: Session): # Test basic listing response = client.get("/api/v1/files") assert response.status_code == 200 - + data = response.json() assert "data" in data assert "total_items" in data @@ -464,7 +464,7 @@ def test_get_file_endpoint(self, client: TestClient, session: Session): "entity_id": "PROJ001", "created_by": "get_test_user" } - + create_response = client.post("/api/v1/files", data=file_data) assert create_response.status_code == 201 created_file = create_response.json() @@ -473,7 +473,7 @@ def test_get_file_endpoint(self, client: TestClient, session: Session): # Test successful retrieval response = client.get(f"/api/v1/files/{file_id}") assert response.status_code == 200 - + data = response.json() assert data["file_id"] == file_id assert data["filename"] == "test_get.txt" @@ -494,7 +494,7 @@ def test_update_file_endpoint(self, client: TestClient, session: Session): "entity_id": "PROJ001", "created_by": "update_test_user" } - + create_response = client.post("/api/v1/files", data=file_data) assert create_response.status_code == 201 created_file = create_response.json() @@ -505,10 +505,10 @@ def test_update_file_endpoint(self, client: TestClient, session: Session): "description": "Updated description", "is_public": True } - + response = 
client.put(f"/api/v1/files/{file_id}", json=update_data) assert response.status_code == 200 - + data = response.json() assert data["file_id"] == file_id assert data["description"] == "Updated description" @@ -530,7 +530,7 @@ def test_delete_file_endpoint(self, client: TestClient, session: Session): "entity_id": "PROJ001", "created_by": "delete_test_user" } - + create_response = client.post("/api/v1/files", data=file_data) assert create_response.status_code == 201 created_file = create_response.json() @@ -556,7 +556,7 @@ def test_list_files_for_entity_endpoint(self, client: TestClient, session: Sessi ("project", "PROJ002"), ("run", "190110_MACHINE123_0001_FLOWCELL123") ] - + for entity_type, entity_id in entities: for i in range(2): file_data = { @@ -572,7 +572,7 @@ def test_list_files_for_entity_endpoint(self, client: TestClient, session: Sessi # Test listing files for specific project response = client.get("/api/v1/files/entity/project/PROJ001") assert response.status_code == 200 - + data = response.json() assert data["total_items"] == 2 for item in data["data"]: @@ -582,7 +582,7 @@ def test_list_files_for_entity_endpoint(self, client: TestClient, session: Sessi # Test listing files for specific run response = client.get("/api/v1/files/entity/run/190110_MACHINE123_0001_FLOWCELL123") assert response.status_code == 200 - + data = response.json() assert data["total_items"] == 2 for item in data["data"]: @@ -602,7 +602,7 @@ def test_get_file_count_for_entity_endpoint(self, client: TestClient, session: S # Create files for a specific entity entity_type = "project" entity_id = "PROJ_COUNT_TEST" - + for i in range(5): file_data = { "filename": f"count_test_{i}.txt", @@ -617,7 +617,7 @@ def test_get_file_count_for_entity_endpoint(self, client: TestClient, session: S # Test file count endpoint response = client.get(f"/api/v1/files/entity/{entity_type}/{entity_id}/count") assert response.status_code == 200 - + data = response.json() assert data["entity_type"] == entity_type assert data["entity_id"] == entity_id @@ -626,7 +626,7 @@ def test_get_file_count_for_entity_endpoint(self, client: TestClient, session: S # Test count for entity with no files response = client.get("/api/v1/files/entity/project/EMPTY_PROJECT/count") assert response.status_code == 200 - + data = response.json() assert data["entity_type"] == "project" assert data["entity_id"] == "EMPTY_PROJECT" @@ -635,7 +635,7 @@ def test_get_file_count_for_entity_endpoint(self, client: TestClient, session: S def test_create_file_with_content_endpoint(self, client: TestClient, session: Session): """Test file creation with content upload""" import io - + # Create file data file_data = { "filename": "test_with_content.txt", @@ -645,15 +645,15 @@ def test_create_file_with_content_endpoint(self, client: TestClient, session: Se "entity_id": "PROJ001", "created_by": "content_test_user" } - + # Create file content file_content = b"Hello, this is test content!" 
files = {"content": ("test_content.txt", io.BytesIO(file_content), "text/plain")} - + # Send multipart form data response = client.post("/api/v1/files", data=file_data, files=files) assert response.status_code == 201 - + data = response.json() assert data["filename"] == "test_with_content.txt" assert data["file_size"] == len(file_content) @@ -668,7 +668,7 @@ def test_error_handling(self, client: TestClient, session: Session): "entity_type": "project", "entity_id": "PROJ001" } - + response = client.post("/api/v1/files", data=invalid_file_data) assert response.status_code == 422 # Validation error @@ -679,7 +679,7 @@ def test_error_handling(self, client: TestClient, session: Session): "entity_type": "invalid_entity", "entity_id": "PROJ001" } - + response = client.post("/api/v1/files", data=invalid_entity_data) assert response.status_code == 422 # Validation error From f9995a0564dbb5e2c9858c98621cc6c7bca3dcd5 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Tue, 23 Sep 2025 10:38:09 -0400 Subject: [PATCH 13/35] Add file registration scripts --- scripts/README_S3_Registration.md | 193 ++++++++++ scripts/register_s3_files.py | 402 +++++++++++++++++++++ scripts/s3_registration_config.sample.json | 20 + 3 files changed, 615 insertions(+) create mode 100644 scripts/README_S3_Registration.md create mode 100644 scripts/register_s3_files.py create mode 100644 scripts/s3_registration_config.sample.json diff --git a/scripts/README_S3_Registration.md b/scripts/README_S3_Registration.md new file mode 100644 index 0000000..0c25204 --- /dev/null +++ b/scripts/README_S3_Registration.md @@ -0,0 +1,193 @@ +# S3 Bulk File Registration + +This script allows you to register existing S3 files in your database without moving them. Files remain in S3 but become discoverable through the Files API. + +## Prerequisites + +1. **AWS Credentials**: Configure AWS credentials using one of these methods: + - AWS CLI: `aws configure` + - Environment variables: `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` + - IAM roles (if running on EC2) + - AWS credentials file + +2. **Required Permissions**: Your AWS credentials need the following S3 permissions: + - `s3:ListBucket` - to list objects in buckets + - `s3:GetObject` - to read object metadata + - `s3:HeadBucket` - to verify bucket access + +3. **Python Dependencies**: Install boto3 if not already installed: + ```bash + pip install boto3 + ``` + +## Usage Examples + +### 1. Single Bucket Registration + +Register all files from a specific bucket: +```bash +python scripts/register_s3_files.py --bucket my-ngs-data-bucket +``` + +Register files with a specific prefix: +```bash +python scripts/register_s3_files.py --bucket my-bucket --prefix projects/ +``` + +### 2. Dry Run Mode + +See what would be registered without making changes: +```bash +python scripts/register_s3_files.py --bucket my-bucket --dry-run +``` + +### 3. Configuration File Mode + +For multiple buckets or complex setups, use a configuration file: + +1. Create a configuration file (copy from `s3_registration_config.sample.json`): + ```bash + cp scripts/s3_registration_config.sample.json my_s3_config.json + ``` + +2. Edit the configuration file with your bucket details: + ```json + { + "buckets": [ + { + "name": "my-ngs-data-bucket", + "prefix": "projects/", + "entity_patterns": { + "project": "projects/([^/]+)", + "run": "runs/([^/]+)" + } + } + ], + "dry_run": false + } + ``` + +3. Run with the configuration: + ```bash + python scripts/register_s3_files.py --config my_s3_config.json + ``` + +### 4. 
Generate Sample Configuration + +Create a sample configuration file: +```bash +python scripts/register_s3_files.py --create-config +``` + +## Configuration File Format + +```json +{ + "buckets": [ + { + "name": "bucket-name", + "prefix": "optional/prefix/", + "entity_patterns": { + "project": "regex_pattern_to_extract_project_id", + "run": "regex_pattern_to_extract_run_id" + } + } + ], + "dry_run": true +} +``` + +### Entity Patterns + +Entity patterns are regex patterns used to extract project or run IDs from S3 object keys: + +- `"project": "projects/([^/]+)"` - Extracts project ID from paths like `projects/PROJ001/file.fastq` +- `"run": "runs/([^/]+)"` - Extracts run ID from paths like `runs/RUN123/data.bam` + +The captured group `([^/]+)` becomes the entity ID. + +## File Type Detection + +The script automatically detects file types based on extensions: + +| Extension | File Type | +|-----------|-----------| +| `.fastq`, `.fastq.gz`, `.fq`, `.fq.gz` | FASTQ | +| `.bam`, `.sam` | BAM | +| `.vcf`, `.vcf.gz` | VCF | +| `.csv`, `.xlsx` | SAMPLESHEET | +| `.json` | METRICS | +| `.html`, `.pdf` | REPORT | +| `.log`, `.txt` | LOG | +| `.png`, `.jpg`, `.jpeg`, `.svg` | IMAGE | +| `.doc`, `.docx`, `.md` | DOCUMENT | +| Others | OTHER | + +## What Gets Registered + +For each S3 object, the script creates a database record with: + +- **File metadata**: filename, size, MIME type, upload date +- **S3 location**: `file_path` set to `s3://bucket/key` +- **Storage backend**: Set to `StorageBackend.S3` +- **Entity association**: Project or run ID extracted from path +- **File type**: Auto-detected from extension +- **Default settings**: `is_public=False`, `created_by="s3_bulk_import"` + +## Error Handling + +The script handles common issues: + +- **Duplicate files**: Skips files already registered (based on S3 URI) +- **Access errors**: Reports permission or bucket access issues +- **Invalid patterns**: Falls back to default entity extraction +- **Database errors**: Rolls back failed registrations + +## Monitoring Progress + +The script provides progress updates every 100 files and a final summary: + +``` +Progress: 500 scanned, 450 registered, 30 skipped, 20 errors +========================================== +REGISTRATION SUMMARY +========================================== +Files scanned: 1000 +Files registered: 850 +Files skipped: 100 (already registered) +Errors: 50 (permission/validation issues) +``` + +## Best Practices + +1. **Start with dry run**: Always test with `--dry-run` first +2. **Use specific prefixes**: Limit scope with `--prefix` to avoid scanning entire buckets +3. **Monitor logs**: Check for permission or pattern matching issues +4. **Backup database**: Consider backing up before large imports +5. 
**Run incrementally**: Process buckets/prefixes separately for better control + +## Troubleshooting + +### Common Issues + +**"AWS credentials not found"** +- Configure AWS credentials using `aws configure` or environment variables + +**"Access denied to bucket"** +- Verify your AWS credentials have the required S3 permissions +- Check bucket policies and IAM roles + +**"No files registered"** +- Verify bucket name and prefix are correct +- Check entity patterns match your S3 structure +- Use `--dry-run` to see what would be processed + +**"Database connection failed"** +- Ensure your database is running and accessible +- Check `SQLALCHEMY_DATABASE_URI` environment variable + +### Getting Help + +Run the script with `--help` for detailed usage information: +```bash +python scripts/register_s3_files.py --help \ No newline at end of file diff --git a/scripts/register_s3_files.py b/scripts/register_s3_files.py new file mode 100644 index 0000000..d07db6f --- /dev/null +++ b/scripts/register_s3_files.py @@ -0,0 +1,402 @@ +#!/usr/bin/env python +""" +Bulk registration script for existing S3 files. + +This script scans S3 buckets and registers existing files in the database +without moving them. Files remain in S3 but become discoverable through the API. + +Usage: + python scripts/register_s3_files.py --bucket my-bucket --prefix data/ + python scripts/register_s3_files.py --config s3_config.json + python scripts/register_s3_files.py --bucket my-bucket --dry-run +""" + +import argparse +import json +import mimetypes +import re +import sys +from datetime import datetime, timezone +from pathlib import Path +from typing import Dict, Tuple + +import boto3 +from botocore.exceptions import ClientError, NoCredentialsError +from sqlmodel import Session, select + +# Add project root to path for imports +sys.path.append(str(Path(__file__).parent.parent)) + +from api.files.models import FileCreate, FileType, EntityType, StorageBackend +from api.files.services import create_file +from core.db import get_session +from core.logger import logger + + +class S3FileRegistrar: + """Handles bulk registration of S3 files to the database.""" + + def __init__(self, session: Session, dry_run: bool = False): + self.session = session + self.dry_run = dry_run + self.stats = { + 'scanned': 0, + 'registered': 0, + 'skipped': 0, + 'errors': 0 + } + + # File type mapping based on extensions + self.file_type_mapping = { + '.fastq': FileType.FASTQ, + '.fastq.gz': FileType.FASTQ, + '.fq': FileType.FASTQ, + '.fq.gz': FileType.FASTQ, + '.bam': FileType.BAM, + '.sam': FileType.BAM, + '.vcf': FileType.VCF, + '.vcf.gz': FileType.VCF, + '.csv': FileType.SAMPLESHEET, + '.xlsx': FileType.SAMPLESHEET, + '.json': FileType.METRICS, + '.html': FileType.REPORT, + '.pdf': FileType.REPORT, + '.log': FileType.LOG, + '.txt': FileType.LOG, + '.png': FileType.IMAGE, + '.jpg': FileType.IMAGE, + '.jpeg': FileType.IMAGE, + '.svg': FileType.IMAGE, + '.doc': FileType.DOCUMENT, + '.docx': FileType.DOCUMENT, + '.md': FileType.DOCUMENT, + } + + def get_file_type(self, filename: str) -> FileType: + """Determine file type based on filename extension.""" + filename_lower = filename.lower() + + # Check for compound extensions first (e.g., .fastq.gz) + for ext, file_type in self.file_type_mapping.items(): + if filename_lower.endswith(ext): + return file_type + + return FileType.OTHER + + def extract_entity_info(self, s3_key: str, patterns: Dict[str, str]) -> Tuple[EntityType, str]: + """ + Extract entity type and ID from S3 key using regex patterns. 
+ + Args: + s3_key: S3 object key + patterns: Dict of entity_type -> regex pattern + + Returns: + Tuple of (EntityType, entity_id) + """ + for entity_type_str, pattern in patterns.items(): + match = re.search(pattern, s3_key) + if match: + entity_id = match.group(1) + entity_type = EntityType.PROJECT if entity_type_str == 'project' else EntityType.RUN + return entity_type, entity_id + + # Default fallback - extract from path structure + path_parts = s3_key.split('/') + if len(path_parts) >= 2: + # Assume format like: projects/PROJ001/... or runs/RUN001/... + if path_parts[0].lower() in ['projects', 'project']: + return EntityType.PROJECT, path_parts[1] + elif path_parts[0].lower() in ['runs', 'run', 'sequencing_runs']: + return EntityType.RUN, path_parts[1] + + # Final fallback - use first directory as project + return EntityType.PROJECT, path_parts[0] if path_parts else 'unknown' + + def file_already_registered(self, s3_uri: str) -> bool: + """Check if file is already registered in database.""" + from api.files.models import File + + existing = self.session.exec( + select(File).where(File.file_path == s3_uri) + ).first() + + return existing is not None + + def register_s3_object(self, bucket: str, obj: dict, entity_patterns: Dict[str, str]) -> bool: + """ + Register a single S3 object in the database. + + Args: + bucket: S3 bucket name + obj: S3 object metadata from boto3 + entity_patterns: Regex patterns for extracting entity info + + Returns: + True if registered successfully, False otherwise + """ + s3_key = obj['Key'] + filename = Path(s3_key).name + s3_uri = f"s3://{bucket}/{s3_key}" + + # Skip directories + if s3_key.endswith('/'): + return False + + # Skip if already registered + if self.file_already_registered(s3_uri): + logger.debug(f"File already registered: {s3_uri}") + self.stats['skipped'] += 1 + return False + + # Extract entity information + entity_type, entity_id = self.extract_entity_info(s3_key, entity_patterns) + + # Determine file type + file_type = self.get_file_type(filename) + + # Get MIME type + mime_type, _ = mimetypes.guess_type(filename) + + # Create file record + file_create = FileCreate( + filename=filename, + original_filename=filename, + description=f"Imported from S3: {s3_key}", + file_type=file_type, + entity_type=entity_type, + entity_id=entity_id, + is_public=False, # Default to private + created_by="s3_bulk_import" + ) + + if self.dry_run: + logger.info(f"[DRY RUN] Would register: {s3_uri} -> {entity_type.value}/{entity_id}") + self.stats['registered'] += 1 + return True + + try: + # Register file in database + file_record = create_file( + session=self.session, + file_create=file_create, + file_content=None, # No content upload, file stays in S3 + storage_root="storage" # Use default storage settings + ) + + # Update the file record with S3-specific information + file_record.file_path = s3_uri + file_record.storage_backend = StorageBackend.S3 + file_record.file_size = obj.get('Size', 0) + file_record.mime_type = mime_type + file_record.upload_date = obj.get('LastModified', datetime.now(timezone.utc)) + + self.session.add(file_record) + self.session.commit() + + logger.info(f"Registered: {s3_uri} -> {file_record.file_id}") + self.stats['registered'] += 1 + return True + + except Exception as e: + logger.error(f"Failed to register {s3_uri}: {e}") + self.stats['errors'] += 1 + self.session.rollback() + return False + + def scan_bucket(self, bucket: str, prefix: str = "", + entity_patterns: Dict[str, str] = None) -> None: + """ + Scan S3 bucket and register 
files. + + Args: + bucket: S3 bucket name + prefix: S3 key prefix to filter objects + entity_patterns: Regex patterns for extracting entity info + """ + if entity_patterns is None: + entity_patterns = { + 'project': r'(?:projects?|proj)/([^/]+)', + 'run': r'(?:runs?|sequencing_runs?)/([^/]+)' + } + + try: + s3_client = boto3.client('s3') + + # Test bucket access + try: + s3_client.head_bucket(Bucket=bucket) + except ClientError as e: + error_code = e.response['Error']['Code'] + if error_code == '404': + logger.error(f"Bucket '{bucket}' does not exist") + elif error_code == '403': + logger.error(f"Access denied to bucket '{bucket}'") + else: + logger.error(f"Error accessing bucket '{bucket}': {e}") + return + + logger.info(f"Scanning S3 bucket: s3://{bucket}/{prefix}") + + # List objects with pagination + paginator = s3_client.get_paginator('list_objects_v2') + page_iterator = paginator.paginate( + Bucket=bucket, + Prefix=prefix + ) + + for page in page_iterator: + if 'Contents' not in page: + continue + + for obj in page['Contents']: + self.stats['scanned'] += 1 + self.register_s3_object(bucket, obj, entity_patterns) + + # Progress logging + if self.stats['scanned'] % 100 == 0: + logger.info( + f"Progress: {self.stats['scanned']} scanned, " + f"{self.stats['registered']} registered, " + f"{self.stats['skipped']} skipped, " + f"{self.stats['errors']} errors" + ) + + except NoCredentialsError: + logger.error("AWS credentials not found. Please configure AWS credentials.") + except Exception as e: + logger.error(f"Error scanning bucket: {e}") + + def print_summary(self): + """Print registration summary.""" + logger.info("=" * 50) + logger.info("REGISTRATION SUMMARY") + logger.info("=" * 50) + logger.info(f"Files scanned: {self.stats['scanned']}") + logger.info(f"Files registered: {self.stats['registered']}") + logger.info(f"Files skipped: {self.stats['skipped']}") + logger.info(f"Errors: {self.stats['errors']}") + + if self.dry_run: + logger.info("\n*** DRY RUN MODE - No files were actually registered ***") + + +def load_config(config_file: str) -> dict: + """Load configuration from JSON file.""" + try: + with open(config_file, 'r') as f: + return json.load(f) + except FileNotFoundError: + logger.error(f"Configuration file not found: {config_file}") + sys.exit(1) + except json.JSONDecodeError as e: + logger.error(f"Invalid JSON in configuration file: {e}") + sys.exit(1) + + +def create_sample_config(): + """Create a sample configuration file.""" + config = { + "buckets": [ + { + "name": "my-ngs-data-bucket", + "prefix": "projects/", + "entity_patterns": { + "project": r"projects/([^/]+)", + "run": r"runs/([^/]+)" + } + }, + { + "name": "my-runs-bucket", + "prefix": "sequencing_runs/", + "entity_patterns": { + "run": r"sequencing_runs/([^/]+)" + } + } + ], + "dry_run": True + } + + with open('s3_registration_config.json', 'w') as f: + json.dump(config, f, indent=2) + + logger.info("Sample configuration created: s3_registration_config.json") + logger.info("Edit this file with your S3 bucket details and run:") + logger.info("python scripts/register_s3_files.py --config s3_registration_config.json") + + +def main(): + parser = argparse.ArgumentParser( + description="Bulk register existing S3 files in the database", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Register all files from a bucket + python scripts/register_s3_files.py --bucket my-bucket + + # Register files with specific prefix + python scripts/register_s3_files.py --bucket my-bucket --prefix 
projects/ + + # Dry run to see what would be registered + python scripts/register_s3_files.py --bucket my-bucket --dry-run + + # Use configuration file for multiple buckets + python scripts/register_s3_files.py --config s3_config.json + + # Create sample configuration file + python scripts/register_s3_files.py --create-config + """ + ) + + parser.add_argument('--bucket', help='S3 bucket name') + parser.add_argument('--prefix', default='', help='S3 key prefix to filter objects') + parser.add_argument('--config', help='JSON configuration file for multiple buckets') + parser.add_argument('--dry-run', action='store_true', + help='Show what would be registered without making changes') + parser.add_argument('--create-config', action='store_true', + help='Create a sample configuration file') + + args = parser.parse_args() + + if args.create_config: + create_sample_config() + return + + if not args.bucket and not args.config: + parser.error("Either --bucket or --config must be specified") + + # Get database session + try: + session = next(get_session()) + except Exception as e: + logger.error(f"Failed to create database session: {e}") + sys.exit(1) + + registrar = S3FileRegistrar(session, dry_run=args.dry_run) + + try: + if args.config: + # Load configuration and process multiple buckets + config = load_config(args.config) + dry_run = config.get('dry_run', args.dry_run) + registrar.dry_run = dry_run + + for bucket_config in config['buckets']: + bucket = bucket_config['name'] + prefix = bucket_config.get('prefix', '') + entity_patterns = bucket_config.get('entity_patterns') + + logger.info(f"Processing bucket: {bucket}") + registrar.scan_bucket(bucket, prefix, entity_patterns) + + else: + # Single bucket mode + registrar.scan_bucket(args.bucket, args.prefix) + + finally: + registrar.print_summary() + session.close() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/s3_registration_config.sample.json b/scripts/s3_registration_config.sample.json new file mode 100644 index 0000000..afbab7f --- /dev/null +++ b/scripts/s3_registration_config.sample.json @@ -0,0 +1,20 @@ +{ + "buckets": [ + { + "name": "my-ngs-data-bucket", + "prefix": "projects/", + "entity_patterns": { + "project": "projects/([^/]+)", + "run": "runs/([^/]+)" + } + }, + { + "name": "my-sequencing-runs-bucket", + "prefix": "sequencing_runs/", + "entity_patterns": { + "run": "sequencing_runs/([^/]+)" + } + } + ], + "dry_run": true +} \ No newline at end of file From c4d4c7b6678606869bd982913c3822a805e0bdd9 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Tue, 23 Sep 2025 10:39:23 -0400 Subject: [PATCH 14/35] Format with black --- scripts/register_s3_files.py | 293 ++++++++++++++++++----------------- 1 file changed, 152 insertions(+), 141 deletions(-) diff --git a/scripts/register_s3_files.py b/scripts/register_s3_files.py index d07db6f..8134e2c 100644 --- a/scripts/register_s3_files.py +++ b/scripts/register_s3_files.py @@ -35,62 +35,59 @@ class S3FileRegistrar: """Handles bulk registration of S3 files to the database.""" - + def __init__(self, session: Session, dry_run: bool = False): self.session = session self.dry_run = dry_run - self.stats = { - 'scanned': 0, - 'registered': 0, - 'skipped': 0, - 'errors': 0 - } - + self.stats = {"scanned": 0, "registered": 0, "skipped": 0, "errors": 0} + # File type mapping based on extensions self.file_type_mapping = { - '.fastq': FileType.FASTQ, - '.fastq.gz': FileType.FASTQ, - '.fq': FileType.FASTQ, - '.fq.gz': FileType.FASTQ, - '.bam': FileType.BAM, - 
'.sam': FileType.BAM, - '.vcf': FileType.VCF, - '.vcf.gz': FileType.VCF, - '.csv': FileType.SAMPLESHEET, - '.xlsx': FileType.SAMPLESHEET, - '.json': FileType.METRICS, - '.html': FileType.REPORT, - '.pdf': FileType.REPORT, - '.log': FileType.LOG, - '.txt': FileType.LOG, - '.png': FileType.IMAGE, - '.jpg': FileType.IMAGE, - '.jpeg': FileType.IMAGE, - '.svg': FileType.IMAGE, - '.doc': FileType.DOCUMENT, - '.docx': FileType.DOCUMENT, - '.md': FileType.DOCUMENT, + ".fastq": FileType.FASTQ, + ".fastq.gz": FileType.FASTQ, + ".fq": FileType.FASTQ, + ".fq.gz": FileType.FASTQ, + ".bam": FileType.BAM, + ".sam": FileType.BAM, + ".vcf": FileType.VCF, + ".vcf.gz": FileType.VCF, + ".csv": FileType.SAMPLESHEET, + ".xlsx": FileType.SAMPLESHEET, + ".json": FileType.METRICS, + ".html": FileType.REPORT, + ".pdf": FileType.REPORT, + ".log": FileType.LOG, + ".txt": FileType.LOG, + ".png": FileType.IMAGE, + ".jpg": FileType.IMAGE, + ".jpeg": FileType.IMAGE, + ".svg": FileType.IMAGE, + ".doc": FileType.DOCUMENT, + ".docx": FileType.DOCUMENT, + ".md": FileType.DOCUMENT, } - + def get_file_type(self, filename: str) -> FileType: """Determine file type based on filename extension.""" filename_lower = filename.lower() - + # Check for compound extensions first (e.g., .fastq.gz) for ext, file_type in self.file_type_mapping.items(): if filename_lower.endswith(ext): return file_type - + return FileType.OTHER - - def extract_entity_info(self, s3_key: str, patterns: Dict[str, str]) -> Tuple[EntityType, str]: + + def extract_entity_info( + self, s3_key: str, patterns: Dict[str, str] + ) -> Tuple[EntityType, str]: """ Extract entity type and ID from S3 key using regex patterns. - + Args: s3_key: S3 object key patterns: Dict of entity_type -> regex pattern - + Returns: Tuple of (EntityType, entity_id) """ @@ -98,66 +95,72 @@ def extract_entity_info(self, s3_key: str, patterns: Dict[str, str]) -> Tuple[En match = re.search(pattern, s3_key) if match: entity_id = match.group(1) - entity_type = EntityType.PROJECT if entity_type_str == 'project' else EntityType.RUN + entity_type = ( + EntityType.PROJECT + if entity_type_str == "project" + else EntityType.RUN + ) return entity_type, entity_id - + # Default fallback - extract from path structure - path_parts = s3_key.split('/') + path_parts = s3_key.split("/") if len(path_parts) >= 2: # Assume format like: projects/PROJ001/... or runs/RUN001/... - if path_parts[0].lower() in ['projects', 'project']: + if path_parts[0].lower() in ["projects", "project"]: return EntityType.PROJECT, path_parts[1] - elif path_parts[0].lower() in ['runs', 'run', 'sequencing_runs']: + elif path_parts[0].lower() in ["runs", "run", "sequencing_runs"]: return EntityType.RUN, path_parts[1] - + # Final fallback - use first directory as project - return EntityType.PROJECT, path_parts[0] if path_parts else 'unknown' - + return EntityType.PROJECT, path_parts[0] if path_parts else "unknown" + def file_already_registered(self, s3_uri: str) -> bool: """Check if file is already registered in database.""" from api.files.models import File - + existing = self.session.exec( select(File).where(File.file_path == s3_uri) ).first() - + return existing is not None - - def register_s3_object(self, bucket: str, obj: dict, entity_patterns: Dict[str, str]) -> bool: + + def register_s3_object( + self, bucket: str, obj: dict, entity_patterns: Dict[str, str] + ) -> bool: """ Register a single S3 object in the database. 
- + Args: bucket: S3 bucket name obj: S3 object metadata from boto3 entity_patterns: Regex patterns for extracting entity info - + Returns: True if registered successfully, False otherwise """ - s3_key = obj['Key'] + s3_key = obj["Key"] filename = Path(s3_key).name s3_uri = f"s3://{bucket}/{s3_key}" - + # Skip directories - if s3_key.endswith('/'): + if s3_key.endswith("/"): return False - + # Skip if already registered if self.file_already_registered(s3_uri): logger.debug(f"File already registered: {s3_uri}") - self.stats['skipped'] += 1 + self.stats["skipped"] += 1 return False - + # Extract entity information entity_type, entity_id = self.extract_entity_info(s3_key, entity_patterns) - + # Determine file type file_type = self.get_file_type(filename) - + # Get MIME type mime_type, _ = mimetypes.guess_type(filename) - + # Create file record file_create = FileCreate( filename=filename, @@ -167,48 +170,53 @@ def register_s3_object(self, bucket: str, obj: dict, entity_patterns: Dict[str, entity_type=entity_type, entity_id=entity_id, is_public=False, # Default to private - created_by="s3_bulk_import" + created_by="s3_bulk_import", ) - + if self.dry_run: - logger.info(f"[DRY RUN] Would register: {s3_uri} -> {entity_type.value}/{entity_id}") - self.stats['registered'] += 1 + logger.info( + f"[DRY RUN] Would register: {s3_uri} -> {entity_type.value}/{entity_id}" + ) + self.stats["registered"] += 1 return True - + try: # Register file in database file_record = create_file( session=self.session, file_create=file_create, file_content=None, # No content upload, file stays in S3 - storage_root="storage" # Use default storage settings + storage_root="storage", # Use default storage settings ) - + # Update the file record with S3-specific information file_record.file_path = s3_uri file_record.storage_backend = StorageBackend.S3 - file_record.file_size = obj.get('Size', 0) + file_record.file_size = obj.get("Size", 0) file_record.mime_type = mime_type - file_record.upload_date = obj.get('LastModified', datetime.now(timezone.utc)) - + file_record.upload_date = obj.get( + "LastModified", datetime.now(timezone.utc) + ) + self.session.add(file_record) self.session.commit() - + logger.info(f"Registered: {s3_uri} -> {file_record.file_id}") - self.stats['registered'] += 1 + self.stats["registered"] += 1 return True - + except Exception as e: logger.error(f"Failed to register {s3_uri}: {e}") - self.stats['errors'] += 1 + self.stats["errors"] += 1 self.session.rollback() return False - - def scan_bucket(self, bucket: str, prefix: str = "", - entity_patterns: Dict[str, str] = None) -> None: + + def scan_bucket( + self, bucket: str, prefix: str = "", entity_patterns: Dict[str, str] = None + ) -> None: """ Scan S3 bucket and register files. 
- + Args: bucket: S3 bucket name prefix: S3 key prefix to filter objects @@ -216,57 +224,54 @@ def scan_bucket(self, bucket: str, prefix: str = "", """ if entity_patterns is None: entity_patterns = { - 'project': r'(?:projects?|proj)/([^/]+)', - 'run': r'(?:runs?|sequencing_runs?)/([^/]+)' + "project": r"(?:projects?|proj)/([^/]+)", + "run": r"(?:runs?|sequencing_runs?)/([^/]+)", } - + try: - s3_client = boto3.client('s3') - + s3_client = boto3.client("s3") + # Test bucket access try: s3_client.head_bucket(Bucket=bucket) except ClientError as e: - error_code = e.response['Error']['Code'] - if error_code == '404': + error_code = e.response["Error"]["Code"] + if error_code == "404": logger.error(f"Bucket '{bucket}' does not exist") - elif error_code == '403': + elif error_code == "403": logger.error(f"Access denied to bucket '{bucket}'") else: logger.error(f"Error accessing bucket '{bucket}': {e}") return - + logger.info(f"Scanning S3 bucket: s3://{bucket}/{prefix}") - + # List objects with pagination - paginator = s3_client.get_paginator('list_objects_v2') - page_iterator = paginator.paginate( - Bucket=bucket, - Prefix=prefix - ) - + paginator = s3_client.get_paginator("list_objects_v2") + page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix) + for page in page_iterator: - if 'Contents' not in page: + if "Contents" not in page: continue - - for obj in page['Contents']: - self.stats['scanned'] += 1 + + for obj in page["Contents"]: + self.stats["scanned"] += 1 self.register_s3_object(bucket, obj, entity_patterns) - + # Progress logging - if self.stats['scanned'] % 100 == 0: + if self.stats["scanned"] % 100 == 0: logger.info( f"Progress: {self.stats['scanned']} scanned, " f"{self.stats['registered']} registered, " f"{self.stats['skipped']} skipped, " f"{self.stats['errors']} errors" ) - + except NoCredentialsError: logger.error("AWS credentials not found. 
Please configure AWS credentials.") except Exception as e: logger.error(f"Error scanning bucket: {e}") - + def print_summary(self): """Print registration summary.""" logger.info("=" * 50) @@ -276,7 +281,7 @@ def print_summary(self): logger.info(f"Files registered: {self.stats['registered']}") logger.info(f"Files skipped: {self.stats['skipped']}") logger.info(f"Errors: {self.stats['errors']}") - + if self.dry_run: logger.info("\n*** DRY RUN MODE - No files were actually registered ***") @@ -284,7 +289,7 @@ def print_summary(self): def load_config(config_file: str) -> dict: """Load configuration from JSON file.""" try: - with open(config_file, 'r') as f: + with open(config_file, "r") as f: return json.load(f) except FileNotFoundError: logger.error(f"Configuration file not found: {config_file}") @@ -303,26 +308,26 @@ def create_sample_config(): "prefix": "projects/", "entity_patterns": { "project": r"projects/([^/]+)", - "run": r"runs/([^/]+)" - } + "run": r"runs/([^/]+)", + }, }, { - "name": "my-runs-bucket", + "name": "my-runs-bucket", "prefix": "sequencing_runs/", - "entity_patterns": { - "run": r"sequencing_runs/([^/]+)" - } - } + "entity_patterns": {"run": r"sequencing_runs/([^/]+)"}, + }, ], - "dry_run": True + "dry_run": True, } - - with open('s3_registration_config.json', 'w') as f: + + with open("s3_registration_config.json", "w") as f: json.dump(config, f, indent=2) - + logger.info("Sample configuration created: s3_registration_config.json") logger.info("Edit this file with your S3 bucket details and run:") - logger.info("python scripts/register_s3_files.py --config s3_registration_config.json") + logger.info( + "python scripts/register_s3_files.py --config s3_registration_config.json" + ) def main(): @@ -345,58 +350,64 @@ def main(): # Create sample configuration file python scripts/register_s3_files.py --create-config - """ + """, + ) + + parser.add_argument("--bucket", help="S3 bucket name") + parser.add_argument("--prefix", default="", help="S3 key prefix to filter objects") + parser.add_argument("--config", help="JSON configuration file for multiple buckets") + parser.add_argument( + "--dry-run", + action="store_true", + help="Show what would be registered without making changes", + ) + parser.add_argument( + "--create-config", + action="store_true", + help="Create a sample configuration file", ) - - parser.add_argument('--bucket', help='S3 bucket name') - parser.add_argument('--prefix', default='', help='S3 key prefix to filter objects') - parser.add_argument('--config', help='JSON configuration file for multiple buckets') - parser.add_argument('--dry-run', action='store_true', - help='Show what would be registered without making changes') - parser.add_argument('--create-config', action='store_true', - help='Create a sample configuration file') - + args = parser.parse_args() - + if args.create_config: create_sample_config() return - + if not args.bucket and not args.config: parser.error("Either --bucket or --config must be specified") - + # Get database session try: session = next(get_session()) except Exception as e: logger.error(f"Failed to create database session: {e}") sys.exit(1) - + registrar = S3FileRegistrar(session, dry_run=args.dry_run) - + try: if args.config: # Load configuration and process multiple buckets config = load_config(args.config) - dry_run = config.get('dry_run', args.dry_run) + dry_run = config.get("dry_run", args.dry_run) registrar.dry_run = dry_run - - for bucket_config in config['buckets']: - bucket = bucket_config['name'] - prefix = 
bucket_config.get('prefix', '') - entity_patterns = bucket_config.get('entity_patterns') - + + for bucket_config in config["buckets"]: + bucket = bucket_config["name"] + prefix = bucket_config.get("prefix", "") + entity_patterns = bucket_config.get("entity_patterns") + logger.info(f"Processing bucket: {bucket}") registrar.scan_bucket(bucket, prefix, entity_patterns) - + else: # Single bucket mode registrar.scan_bucket(args.bucket, args.prefix) - + finally: registrar.print_summary() session.close() if __name__ == "__main__": - main() \ No newline at end of file + main() From cd9208856aa1577eb7b5d5ca8761fda0b3d03146 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Tue, 23 Sep 2025 10:42:30 -0400 Subject: [PATCH 15/35] Lint --- scripts/register_s3_files.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/register_s3_files.py b/scripts/register_s3_files.py index 8134e2c..070579e 100644 --- a/scripts/register_s3_files.py +++ b/scripts/register_s3_files.py @@ -6,6 +6,7 @@ without moving them. Files remain in S3 but become discoverable through the API. Usage: + PYTHONPATH=. python scripts/register_s3_files.py --bucket my-bucket --prefix data/ python scripts/register_s3_files.py --config s3_config.json python scripts/register_s3_files.py --bucket my-bucket --dry-run @@ -25,7 +26,7 @@ from sqlmodel import Session, select # Add project root to path for imports -sys.path.append(str(Path(__file__).parent.parent)) +# sys.path.append(str(Path(__file__).parent.parent)) from api.files.models import FileCreate, FileType, EntityType, StorageBackend from api.files.services import create_file @@ -338,16 +339,16 @@ def main(): Examples: # Register all files from a bucket python scripts/register_s3_files.py --bucket my-bucket - + # Register files with specific prefix python scripts/register_s3_files.py --bucket my-bucket --prefix projects/ - + # Dry run to see what would be registered python scripts/register_s3_files.py --bucket my-bucket --dry-run - + # Use configuration file for multiple buckets python scripts/register_s3_files.py --config s3_config.json - + # Create sample configuration file python scripts/register_s3_files.py --create-config """, From f929f89eac865607b14264557572b48657e29567 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Tue, 23 Sep 2025 10:54:23 -0400 Subject: [PATCH 16/35] Update method/swagger doc --- api/files/routes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/api/files/routes.py b/api/files/routes.py index 50c575f..2f92b84 100644 --- a/api/files/routes.py +++ b/api/files/routes.py @@ -247,7 +247,7 @@ def upload_file_content( @router.get( "/entity/{entity_type}/{entity_id}", response_model=PaginatedFileResponse, - summary="List files for a specific entity" + summary="List files for a specific entity." ) def list_files_for_entity( session: SessionDep, @@ -259,6 +259,7 @@ def list_files_for_entity( ) -> PaginatedFileResponse: """ Get all files associated with a specific project or run. + This is the same as /api/v1/files, but scoped to a specific entity. 
- **entity_type**: Either "project" or "run" - **entity_id**: The project ID or run barcode From 17754f3dc2365dbf5ddf6e62459dc485f796fa8e Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Tue, 23 Sep 2025 16:21:16 -0400 Subject: [PATCH 17/35] Fix sort_by field on runs --- api/runs/routes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/runs/routes.py b/api/runs/routes.py index fa11ee8..a5726c4 100644 --- a/api/runs/routes.py +++ b/api/runs/routes.py @@ -75,7 +75,7 @@ def get_runs( session: SessionDep, page: int = Query(1, description="Page number (1-indexed)"), per_page: int = Query(20, description="Number of items per page"), - sort_by: str = Query("project_id", description="Field to sort by"), + sort_by: str = Query("barcode", description="Field to sort by"), sort_order: Literal["asc", "desc"] = Query( "asc", description="Sort order (asc or desc)" ), From 27374136173d3dd8e7f4fde8d53e9ae466e45e49 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Tue, 23 Sep 2025 16:23:51 -0400 Subject: [PATCH 18/35] Fix sort_by on barcode to support asc/desc --- api/runs/services.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/api/runs/services.py b/api/runs/services.py index 089344f..fc46c99 100644 --- a/api/runs/services.py +++ b/api/runs/services.py @@ -95,7 +95,17 @@ def get_runs( total_pages = (total_count + per_page - 1) // per_page # Ceiling division # Determine sort field and direction - sort_field = getattr(SequencingRun, sort_by, SequencingRun.id) + # Handle computed fields that can't be sorted directly + if sort_by == "barcode": + # For barcode sorting, use run_date as primary sort field since barcode starts with date + sort_field = SequencingRun.run_date + else: + # Get the actual database column, fallback to id if field doesn't exist + sort_field = getattr(SequencingRun, sort_by, SequencingRun.id) + # Ensure we got a column, not a property + if not hasattr(sort_field, 'asc'): + sort_field = SequencingRun.id + sort_direction = sort_field.asc() if sort_order == "asc" else sort_field.desc() # Get run selection From 73c4207eb3e9a5886146da259d427f30ead0faea Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Tue, 23 Sep 2025 16:24:16 -0400 Subject: [PATCH 19/35] Lint --- api/runs/services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/runs/services.py b/api/runs/services.py index fc46c99..957208d 100644 --- a/api/runs/services.py +++ b/api/runs/services.py @@ -105,7 +105,7 @@ def get_runs( # Ensure we got a column, not a property if not hasattr(sort_field, 'asc'): sort_field = SequencingRun.id - + sort_direction = sort_field.asc() if sort_order == "asc" else sort_field.desc() # Get run selection From 7d62f10037e92594d151c7925ab0b7c9bb00a21c Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Tue, 23 Sep 2025 19:31:03 -0400 Subject: [PATCH 20/35] Ensure run_time is 4 digits when provided --- api/runs/routes.py | 7 ++++++- tests/api/test_runs.py | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/api/runs/routes.py b/api/runs/routes.py index a5726c4..3cef2b1 100644 --- a/api/runs/routes.py +++ b/api/runs/routes.py @@ -14,7 +14,7 @@ """ from typing import Literal -from fastapi import APIRouter, Query, status +from fastapi import APIRouter, Query, status, HTTPException from core.deps import SessionDep, OpenSearchDep from api.runs.models import ( IlluminaMetricsResponseModel, @@ -46,6 +46,11 @@ def add_run( """ if sequencingrun_in.run_time == "": sequencingrun_in.run_time = None + elif 
sequencingrun_in.run_time and len(sequencingrun_in.run_time) != 4: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"run_time must be in HHMM format" + ) run = services.add_run( session=session, diff --git a/tests/api/test_runs.py b/tests/api/test_runs.py index a818be6..6ee5244 100644 --- a/tests/api/test_runs.py +++ b/tests/api/test_runs.py @@ -68,6 +68,22 @@ def test_add_run(client: TestClient): assert data["run_time"] is None assert data["barcode"] == "190110_MACHINE123_0002_FLOWCELL123" + # Try to add a run with an invalid run_time field + new_run = { + "run_date": "2019-01-10", + "machine_id": "MACHINE123", + "run_number": 3, + "flowcell_id": "FLOWCELL123", + "experiment_name": "Test Experiment", + "run_folder_uri": "s3://bucket/path/to/run", + "status": RunStatus.READY, + "run_time": "invalid_time_format", + } + response = client.post("/api/v1/runs", json=new_run) + assert response.status_code == 404 + data = response.json() + assert data["detail"] == "run_time must be in HHMM format" + def test_get_runs(client: TestClient, session: Session): """Test that we can get all runs""" From c2c30a8e7498033d0f2f41e880e0eb78e6756d43 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Tue, 23 Sep 2025 19:39:45 -0400 Subject: [PATCH 21/35] Ensure run_time is valid before adding to db and add test cases --- api/runs/models.py | 25 +++++++++++++++++++++++-- api/runs/routes.py | 8 -------- tests/api/test_runs.py | 32 +++++++++++++++++++++++++++++--- 3 files changed, 52 insertions(+), 13 deletions(-) diff --git a/api/runs/models.py b/api/runs/models.py index 54ac84d..ccaad75 100644 --- a/api/runs/models.py +++ b/api/runs/models.py @@ -1,12 +1,12 @@ """ Models for the Runs API """ - +from typing import Optional import uuid from datetime import datetime, date from enum import Enum from sqlmodel import SQLModel, Field -from pydantic import ConfigDict, computed_field +from pydantic import ConfigDict, computed_field, field_validator class RunStatus(str, Enum): @@ -137,6 +137,27 @@ class SequencingRunCreate(SQLModel): model_config = ConfigDict(extra="forbid") + @field_validator('run_time', mode='before') + @classmethod + def preprocess_run_time(cls, v): + """Convert empty string to None before validation""" + return None if v == "" else v + + @field_validator('run_time') + @classmethod + def validate_run_time_format(cls, v: Optional[str]) -> Optional[str]: + if v is None: + return None + + if not isinstance(v, str) or len(v) != 4 or not v.isdigit(): + raise ValueError('run_time must be exactly 4 digits (HHMM format)') + + hours, minutes = int(v[:2]), int(v[2:]) + if not (0 <= hours <= 23) or not (0 <= minutes <= 59): + raise ValueError('Hours must be 00-23, minutes must be 00-59') + + return v + class SequencingRunUpdateRequest(SQLModel): run_status: RunStatus diff --git a/api/runs/routes.py b/api/runs/routes.py index 3cef2b1..01e5978 100644 --- a/api/runs/routes.py +++ b/api/runs/routes.py @@ -44,14 +44,6 @@ def add_run( """ Create a new project with optional attributes. 
""" - if sequencingrun_in.run_time == "": - sequencingrun_in.run_time = None - elif sequencingrun_in.run_time and len(sequencingrun_in.run_time) != 4: - raise HTTPException( - status_code=status.HTTP_404_NOT_FOUND, - detail=f"run_time must be in HHMM format" - ) - run = services.add_run( session=session, sequencingrun_in=sequencingrun_in, diff --git a/tests/api/test_runs.py b/tests/api/test_runs.py index 6ee5244..a3227a1 100644 --- a/tests/api/test_runs.py +++ b/tests/api/test_runs.py @@ -80,9 +80,35 @@ def test_add_run(client: TestClient): "run_time": "invalid_time_format", } response = client.post("/api/v1/runs", json=new_run) - assert response.status_code == 404 - data = response.json() - assert data["detail"] == "run_time must be in HHMM format" + assert response.status_code == 422 + + # Add a run with valid run_time + new_run = { + "run_date": "2019-01-10", + "machine_id": "MACHINE123", + "run_number": 4, + "flowcell_id": "FLOWCELL123", + "experiment_name": "Test Experiment", + "run_folder_uri": "s3://bucket/path/to/run", + "status": RunStatus.READY, + "run_time": "1230", + } + response = client.post("/api/v1/runs", json=new_run) + assert response.status_code == 201 + + # Add a run with an invalid run_time + new_run = { + "run_date": "2019-01-10", + "machine_id": "MACHINE123", + "run_number": 4, + "flowcell_id": "FLOWCELL123", + "experiment_name": "Test Experiment", + "run_folder_uri": "s3://bucket/path/to/run", + "status": RunStatus.READY, + "run_time": "5678", + } + response = client.post("/api/v1/runs", json=new_run) + assert response.status_code == 422 def test_get_runs(client: TestClient, session: Session): From ba55626df720998298394de634e14643233d2f41 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Fri, 26 Sep 2025 19:58:53 -0400 Subject: [PATCH 22/35] Add file browse endpoints --- api/files/models.py | 27 ++++++ api/files/routes.py | 58 ++++++++++++ api/files/services.py | 93 ++++++++++++++++++++ tests/api/test_files.py | 190 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 368 insertions(+) diff --git a/api/files/models.py b/api/files/models.py index b2f9646..03b084d 100644 --- a/api/files/models.py +++ b/api/files/models.py @@ -177,3 +177,30 @@ class PaginatedFileResponse(SQLModel): has_prev: bool model_config = ConfigDict(from_attributes=True) + + +class FileBrowserColumns(SQLModel): + """Individual file/folder item for file browser""" + name: str + date: str + size: int | None = None # None for directories + dir: bool # True for directories, False for files + + +class FileBrowserFolder(SQLModel): + """Folder item for file browser""" + name: str + date: str + + +class FileBrowserFile(SQLModel): + """File item for file browser""" + name: str + date: str + size: int + + +class FileBrowserData(SQLModel): + """File browser data structure with separate folders and files""" + folders: list[FileBrowserFolder] + files: list[FileBrowserFile] diff --git a/api/files/routes.py b/api/files/routes.py index 2f92b84..5b07596 100644 --- a/api/files/routes.py +++ b/api/files/routes.py @@ -14,6 +14,7 @@ FileFilters, FileType, EntityType, + FileBrowserData, ) import api.files.services as services @@ -104,6 +105,61 @@ def list_files( return services.list_files(session, filters, page, per_page) +@router.get( + "/browse", + response_model=FileBrowserData, + summary="Browse filesystem directory" +) +def browse_filesystem( + directory_path: str = Query("", description="Directory path to browse"), + storage_root: str = Query("storage", description="Storage root directory") +) -> 
FileBrowserData: + """ + Browse a filesystem directory and return folders and files in structured format. + + - **directory_path**: Path relative to storage root (empty for root) + - **storage_root**: Base storage directory (defaults to 'storage') + + Returns separate arrays for folders and files with name, date, and size information. + """ + return services.browse_filesystem(directory_path, storage_root) + + +@router.get( + "/browse-db", + response_model=FileBrowserData, + summary="List database files in browser format" +) +def list_files_browser_format( + session: SessionDep, + page: int = Query(1, ge=1, description="Page number (1-indexed)"), + per_page: int = Query(20, ge=1, le=100, description="Number of items per page"), + entity_type: Optional[EntityType] = Query(None, description="Filter by entity type"), + entity_id: Optional[str] = Query(None, description="Filter by entity ID"), + file_type: Optional[FileType] = Query(None, description="Filter by file type"), + search: Optional[str] = Query(None, description="Search in filename and description"), + is_public: Optional[bool] = Query(None, description="Filter by public/private status"), + created_by: Optional[str] = Query(None, description="Filter by creator"), +) -> FileBrowserData: + """ + Get database files in FileBrowserData format (files only, no folders). + + This endpoint returns the same file data as the regular list_files endpoint, + but formatted to match the FileBrowserData structure with separate folders and files arrays. + Since database files don't have folder structure, the folders array will be empty. + """ + filters = FileFilters( + entity_type=entity_type, + entity_id=entity_id, + file_type=file_type, + search_query=search, + is_public=is_public, + created_by=created_by, + ) + + return services.list_files_as_browser_data(session, filters, page, per_page) + + @router.get( "/{file_id}", response_model=FilePublic, @@ -284,3 +340,5 @@ def get_file_count_for_entity( """ count = services.get_file_count_for_entity(session, entity_type, entity_id) return {"entity_type": entity_type, "entity_id": entity_id, "file_count": count} + + diff --git a/api/files/services.py b/api/files/services.py index 9a4ccf6..7ef7049 100644 --- a/api/files/services.py +++ b/api/files/services.py @@ -5,7 +5,9 @@ import hashlib import secrets import string +import os from pathlib import Path +from datetime import datetime from sqlmodel import select, Session, func from pydantic import PositiveInt from fastapi import HTTPException, status @@ -20,6 +22,10 @@ FileType, EntityType, StorageBackend, + FileBrowserData, + FileBrowserFolder, + FileBrowserFile, + FileBrowserColumns, ) @@ -361,3 +367,90 @@ def update_file_content( session.refresh(file_record) return file_record + + +def browse_filesystem(directory_path: str, storage_root: str = "storage") -> FileBrowserData: + """Browse filesystem directory and return structured data""" + + # Construct full path + if os.path.isabs(directory_path): + full_path = Path(directory_path) + else: + full_path = Path(storage_root) / directory_path + + # Check if directory exists and is accessible + if not full_path.exists(): + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"Directory not found: {directory_path}" + ) + + if not full_path.is_dir(): + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Path is not a directory: {directory_path}" + ) + + folders = [] + files = [] + + try: + # List directory contents + for item in full_path.iterdir(): + # Get modification 
time + stat = item.stat() + mod_time = datetime.fromtimestamp(stat.st_mtime) + date_str = mod_time.strftime("%Y-%m-%d %H:%M:%S") + + if item.is_dir(): + folders.append(FileBrowserFolder( + name=item.name, + date=date_str + )) + else: + files.append(FileBrowserFile( + name=item.name, + date=date_str, + size=stat.st_size + )) + + except PermissionError: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail=f"Permission denied accessing directory: {directory_path}" + ) + + # Sort folders and files by name + folders.sort(key=lambda x: x.name.lower()) + files.sort(key=lambda x: x.name.lower()) + + return FileBrowserData(folders=folders, files=files) + + +def list_files_as_browser_data( + session: Session, + filters: FileFilters | None = None, + page: PositiveInt = 1, + per_page: PositiveInt = 20, + sort_by: str = "upload_date", + sort_order: str = "desc" +) -> FileBrowserData: + """List database files in FileBrowserData format (files only, no folders)""" + + # Get files using existing list_files function + files_result = list_files(session, filters, page, per_page, sort_by, sort_order) + + # Convert to FileBrowserFile format + browser_files = [] + for file_record in files_result.data: + # Format date + date_str = file_record.upload_date.strftime("%Y-%m-%d %H:%M:%S") + + browser_files.append(FileBrowserFile( + name=file_record.filename, + date=date_str, + size=file_record.file_size or 0 + )) + + # No folders for database files + return FileBrowserData(folders=[], files=browser_files) diff --git a/tests/api/test_files.py b/tests/api/test_files.py index 321cb78..aa259c0 100644 --- a/tests/api/test_files.py +++ b/tests/api/test_files.py @@ -8,6 +8,7 @@ from datetime import datetime, timezone import pytest +from fastapi import HTTPException from fastapi.testclient import TestClient from sqlmodel import Session @@ -17,6 +18,9 @@ FileType, EntityType, StorageBackend, + FileBrowserData, + FileBrowserFolder, + FileBrowserFile, ) from api.files.services import ( create_file, @@ -31,6 +35,8 @@ generate_file_path, calculate_file_checksum, get_mime_type, + browse_filesystem, + list_files_as_browser_data, ) @@ -796,3 +802,187 @@ def test_file_type_filtering(self, session: Session, temp_storage): assert result.total_items == 1 assert result.data[0].file_type == file_type + + + +class TestFileBrowserModels: + """Test file browser model functionality""" + + def test_file_browser_folder_model(self): + """Test FileBrowserFolder model""" + folder = FileBrowserFolder( + name="test_folder", + date="2023-01-01 12:00:00" + ) + + assert folder.name == "test_folder" + assert folder.date == "2023-01-01 12:00:00" + + def test_file_browser_file_model(self): + """Test FileBrowserFile model""" + file = FileBrowserFile( + name="test_file.txt", + date="2023-01-01 12:00:00", + size=1024 + ) + + assert file.name == "test_file.txt" + assert file.date == "2023-01-01 12:00:00" + assert file.size == 1024 + + def test_file_browser_data_model(self): + """Test FileBrowserData model""" + folder = FileBrowserFolder(name="folder1", date="2023-01-01 12:00:00") + file = FileBrowserFile(name="file1.txt", date="2023-01-01 12:00:00", size=100) + + browser_data = FileBrowserData( + folders=[folder], + files=[file] + ) + + assert len(browser_data.folders) == 1 + assert len(browser_data.files) == 1 + assert browser_data.folders[0].name == "folder1" + assert browser_data.files[0].name == "file1.txt" + + +class TestFileBrowserServices: + """Test file browser service functions""" + + @pytest.fixture + def temp_storage(self): + 
"""Create temporary storage with folder/file structure""" + temp_dir = tempfile.mkdtemp() + + # Create some folders + (Path(temp_dir) / "folder1").mkdir() + (Path(temp_dir) / "folder2").mkdir() + + # Create some files + (Path(temp_dir) / "file1.txt").write_text("content1") + (Path(temp_dir) / "file2.txt").write_text("content2") + + yield temp_dir + shutil.rmtree(temp_dir) + + def test_browse_filesystem(self, temp_storage): + """Test filesystem browsing""" + result = browse_filesystem("", temp_storage) + + assert isinstance(result, FileBrowserData) + assert len(result.folders) == 2 + assert len(result.files) == 2 + + # Check folder names + folder_names = [f.name for f in result.folders] + assert "folder1" in folder_names + assert "folder2" in folder_names + + # Check file names + file_names = [f.name for f in result.files] + assert "file1.txt" in file_names + assert "file2.txt" in file_names + + # Check file sizes + for file in result.files: + assert file.size > 0 + assert file.date is not None + + def test_browse_filesystem_nonexistent_directory(self): + """Test browsing non-existent directory""" + with pytest.raises(HTTPException) as exc_info: + browse_filesystem("nonexistent", "nonexistent_root") + + assert exc_info.value.status_code == 404 + assert "not found" in str(exc_info.value.detail) + + def test_list_files_as_browser_data(self, session: Session): + """Test converting database files to browser data format""" + # Create test files in database + for i in range(3): + file_create = FileCreate( + filename=f"db_file_{i}.txt", + description=f"Database file {i}", + entity_type=EntityType.PROJECT, + entity_id="PROJ001", + file_type=FileType.DOCUMENT + ) + create_file(session, file_create) + + # Get files in browser format + result = list_files_as_browser_data(session) + + assert isinstance(result, FileBrowserData) + assert len(result.folders) == 0 # No folders for database files + assert len(result.files) == 3 + + # Check file properties + for file in result.files: + assert file.name.startswith("db_file_") + assert file.date is not None + assert file.size >= 0 + + +class TestFileBrowserAPI: + """Test file browser API endpoints""" + + def test_browse_filesystem_endpoint(self, client: TestClient, temp_storage): + """Test filesystem browsing endpoint""" + # Create a temporary directory structure for testing + temp_dir = tempfile.mkdtemp() + try: + # Create test structure + (Path(temp_dir) / "test_folder").mkdir() + (Path(temp_dir) / "test_file.txt").write_text("test content") + + # Test the endpoint + response = client.get(f"/api/v1/files/browse?storage_root={temp_dir}") + assert response.status_code == 200 + + data = response.json() + assert "folders" in data + assert "files" in data + assert isinstance(data["folders"], list) + assert isinstance(data["files"], list) + + finally: + shutil.rmtree(temp_dir) + + def test_browse_db_endpoint(self, client: TestClient, session: Session): + """Test database files browser endpoint""" + # Create test files + for i in range(2): + file_data = { + "filename": f"browser_test_{i}.txt", + "description": f"Browser test file {i}", + "file_type": "document", + "entity_type": "project", + "entity_id": "PROJ001", + "created_by": "browser_test_user" + } + client.post("/api/v1/files", data=file_data) + + # Test the browser endpoint + response = client.get("/api/v1/files/browse-db") + assert response.status_code == 200 + + data = response.json() + assert "folders" in data + assert "files" in data + assert len(data["folders"]) == 0 # No folders for database files + 
assert len(data["files"]) >= 2 + + # Check file structure + for file in data["files"]: + assert "name" in file + assert "date" in file + assert "size" in file + + def test_browse_filesystem_error_handling(self, client: TestClient): + """Test error handling for filesystem browsing""" + # Test non-existent directory + response = client.get( + "/api/v1/files/browse?directory_path=nonexistent&storage_root=nonexistent" + ) + assert response.status_code == 404 + From 7e647d02274f5278cac2586b5ad7472f9c66e287 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Fri, 26 Sep 2025 20:09:37 -0400 Subject: [PATCH 23/35] Fix test --- tests/api/test_files.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/api/test_files.py b/tests/api/test_files.py index aa259c0..971b20e 100644 --- a/tests/api/test_files.py +++ b/tests/api/test_files.py @@ -804,7 +804,6 @@ def test_file_type_filtering(self, session: Session, temp_storage): assert result.data[0].file_type == file_type - class TestFileBrowserModels: """Test file browser model functionality""" @@ -926,7 +925,7 @@ def test_list_files_as_browser_data(self, session: Session): class TestFileBrowserAPI: """Test file browser API endpoints""" - def test_browse_filesystem_endpoint(self, client: TestClient, temp_storage): + def test_browse_filesystem_endpoint(self, client: TestClient): """Test filesystem browsing endpoint""" # Create a temporary directory structure for testing temp_dir = tempfile.mkdtemp() From 708c59e500415599c8a073f051eec8246f26c8fc Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Fri, 26 Sep 2025 20:10:57 -0400 Subject: [PATCH 24/35] Format with black --- api/files/models.py | 18 +++++- api/files/routes.py | 133 +++++++++++++++++++----------------------- api/files/services.py | 133 ++++++++++++++++++------------------------ 3 files changed, 134 insertions(+), 150 deletions(-) diff --git a/api/files/models.py b/api/files/models.py index 03b084d..51763a2 100644 --- a/api/files/models.py +++ b/api/files/models.py @@ -12,6 +12,7 @@ class FileType(str, Enum): """File type categories""" + FASTQ = "fastq" BAM = "bam" VCF = "vcf" @@ -26,12 +27,14 @@ class FileType(str, Enum): class EntityType(str, Enum): """Entity types that can have files""" + PROJECT = "project" RUN = "run" class StorageBackend(str, Enum): """Storage backend types""" + LOCAL = "local" S3 = "s3" AZURE = "azure" @@ -73,12 +76,14 @@ def generate_file_id(self) -> str: """Generate a unique file ID""" import secrets import string + alphabet = string.ascii_letters + string.digits - return ''.join(secrets.choice(alphabet) for _ in range(12)) + return "".join(secrets.choice(alphabet) for _ in range(12)) class FileCreate(SQLModel): """Request model for creating a file""" + filename: str original_filename: str | None = None description: str | None = None @@ -93,6 +98,7 @@ class FileCreate(SQLModel): class FileUpdate(SQLModel): """Request model for updating file metadata""" + filename: str | None = None description: str | None = None file_type: FileType | None = None @@ -104,6 +110,7 @@ class FileUpdate(SQLModel): class FilePublic(SQLModel): """Public file representation""" + file_id: str filename: str original_filename: str @@ -123,6 +130,7 @@ class FilePublic(SQLModel): class FilesPublic(SQLModel): """Paginated file listing""" + data: List[FilePublic] total_items: int total_pages: int @@ -134,6 +142,7 @@ class FilesPublic(SQLModel): class FileUploadRequest(SQLModel): """Request model for file upload""" + filename: str description: str | None = None file_type: FileType = 
FileType.OTHER @@ -144,6 +153,7 @@ class FileUploadRequest(SQLModel): class FileUploadResponse(SQLModel): """Response model for file upload""" + file_id: str filename: str file_size: int | None = None @@ -154,6 +164,7 @@ class FileUploadResponse(SQLModel): class FileFilters(SQLModel): """File filtering options""" + entity_type: EntityType | None = None entity_id: str | None = None file_type: FileType | None = None @@ -168,6 +179,7 @@ class FileFilters(SQLModel): class PaginatedFileResponse(SQLModel): """Paginated response for file listings""" + data: list[FilePublic] total_items: int total_pages: int @@ -181,6 +193,7 @@ class PaginatedFileResponse(SQLModel): class FileBrowserColumns(SQLModel): """Individual file/folder item for file browser""" + name: str date: str size: int | None = None # None for directories @@ -189,12 +202,14 @@ class FileBrowserColumns(SQLModel): class FileBrowserFolder(SQLModel): """Folder item for file browser""" + name: str date: str class FileBrowserFile(SQLModel): """File item for file browser""" + name: str date: str size: int @@ -202,5 +217,6 @@ class FileBrowserFile(SQLModel): class FileBrowserData(SQLModel): """File browser data structure with separate folders and files""" + folders: list[FileBrowserFolder] files: list[FileBrowserFile] diff --git a/api/files/routes.py b/api/files/routes.py index 5b07596..8d06ff9 100644 --- a/api/files/routes.py +++ b/api/files/routes.py @@ -3,7 +3,15 @@ """ from typing import Optional -from fastapi import APIRouter, Query, HTTPException, status, UploadFile, File as FastAPIFile, Form +from fastapi import ( + APIRouter, + Query, + HTTPException, + status, + UploadFile, + File as FastAPIFile, + Form, +) from fastapi.responses import StreamingResponse from core.deps import SessionDep from api.files.models import ( @@ -25,7 +33,7 @@ "", response_model=FilePublic, status_code=status.HTTP_201_CREATED, - summary="Create a new file record" + summary="Create a new file record", ) def create_file( session: SessionDep, @@ -36,7 +44,7 @@ def create_file( file_type: FileType = Form(FileType.OTHER), is_public: bool = Form(False), created_by: Optional[str] = Form(None), - content: Optional[UploadFile] = FastAPIFile(None) + content: Optional[UploadFile] = FastAPIFile(None), ) -> FilePublic: """ Create a new file record with optional file content upload. 
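A minimal client-side sketch of the create-file call above; the host/port and the sample field values are assumptions, only the `/api/v1/files` route and the form-field names come from the code in this patch.

```python
# Hedged sketch: base URL and field values are illustrative only.
import io
import requests

resp = requests.post(
    "http://localhost:8000/api/v1/files",  # assumed host/port
    data={
        "filename": "sample.fastq",
        "description": "Raw reads for PROJ001",
        "file_type": "fastq",
        "entity_type": "project",
        "entity_id": "PROJ001",
        "is_public": "false",
        "created_by": "demo_user",
    },
    files={"content": ("sample.fastq", io.BytesIO(b"@r1\nACGT\n+\n!!!!\n"), "text/plain")},
)
resp.raise_for_status()
print(resp.json()["file_id"])
```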
@@ -57,7 +65,7 @@ def create_file( entity_type=entity_type, entity_id=entity_id, is_public=is_public, - created_by=created_by + created_by=created_by, ) file_content = None @@ -70,17 +78,23 @@ def create_file( @router.get( "", response_model=PaginatedFileResponse, - summary="List files with filtering and pagination" + summary="List files with filtering and pagination", ) def list_files( session: SessionDep, page: int = Query(1, ge=1, description="Page number (1-indexed)"), per_page: int = Query(20, ge=1, le=100, description="Number of items per page"), - entity_type: Optional[EntityType] = Query(None, description="Filter by entity type"), + entity_type: Optional[EntityType] = Query( + None, description="Filter by entity type" + ), entity_id: Optional[str] = Query(None, description="Filter by entity ID"), file_type: Optional[FileType] = Query(None, description="Filter by file type"), - search: Optional[str] = Query(None, description="Search in filename and description"), - is_public: Optional[bool] = Query(None, description="Filter by public/private status"), + search: Optional[str] = Query( + None, description="Search in filename and description" + ), + is_public: Optional[bool] = Query( + None, description="Filter by public/private status" + ), created_by: Optional[str] = Query(None, description="Filter by creator"), ) -> PaginatedFileResponse: """ @@ -106,20 +120,18 @@ def list_files( @router.get( - "/browse", - response_model=FileBrowserData, - summary="Browse filesystem directory" + "/browse", response_model=FileBrowserData, summary="Browse filesystem directory" ) def browse_filesystem( directory_path: str = Query("", description="Directory path to browse"), - storage_root: str = Query("storage", description="Storage root directory") + storage_root: str = Query("storage", description="Storage root directory"), ) -> FileBrowserData: """ Browse a filesystem directory and return folders and files in structured format. - + - **directory_path**: Path relative to storage root (empty for root) - **storage_root**: Base storage directory (defaults to 'storage') - + Returns separate arrays for folders and files with name, date, and size information. """ return services.browse_filesystem(directory_path, storage_root) @@ -128,22 +140,28 @@ def browse_filesystem( @router.get( "/browse-db", response_model=FileBrowserData, - summary="List database files in browser format" + summary="List database files in browser format", ) def list_files_browser_format( session: SessionDep, page: int = Query(1, ge=1, description="Page number (1-indexed)"), per_page: int = Query(20, ge=1, le=100, description="Number of items per page"), - entity_type: Optional[EntityType] = Query(None, description="Filter by entity type"), + entity_type: Optional[EntityType] = Query( + None, description="Filter by entity type" + ), entity_id: Optional[str] = Query(None, description="Filter by entity ID"), file_type: Optional[FileType] = Query(None, description="Filter by file type"), - search: Optional[str] = Query(None, description="Search in filename and description"), - is_public: Optional[bool] = Query(None, description="Filter by public/private status"), + search: Optional[str] = Query( + None, description="Search in filename and description" + ), + is_public: Optional[bool] = Query( + None, description="Filter by public/private status" + ), created_by: Optional[str] = Query(None, description="Filter by creator"), ) -> FileBrowserData: """ Get database files in FileBrowserData format (files only, no folders). 
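To make the response shape concrete, a minimal consumer sketch of the browse-db endpoint; the server address is an assumption, while the route, query parameters, and response keys come from the endpoint definition above.

```python
# Hedged sketch: the server address is an assumption.
import requests

resp = requests.get(
    "http://localhost:8000/api/v1/files/browse-db",
    params={"entity_type": "project", "entity_id": "PROJ001", "per_page": 50},
)
resp.raise_for_status()
payload = resp.json()
assert payload["folders"] == []  # database files carry no folder structure
for item in payload["files"]:
    print(item["name"], item["date"], item["size"])
```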
- + This endpoint returns the same file data as the regular list_files endpoint, but formatted to match the FileBrowserData structure with separate folders and files arrays. Since database files don't have folder structure, the folders array will be empty. @@ -160,15 +178,8 @@ def list_files_browser_format( return services.list_files_as_browser_data(session, filters, page, per_page) -@router.get( - "/{file_id}", - response_model=FilePublic, - summary="Get file by ID" -) -def get_file( - session: SessionDep, - file_id: str -) -> FilePublic: +@router.get("/{file_id}", response_model=FilePublic, summary="Get file by ID") +def get_file(session: SessionDep, file_id: str) -> FilePublic: """ Get a single file by its file_id. @@ -179,19 +190,13 @@ def get_file( except Exception: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"File with id {file_id} not found" + detail=f"File with id {file_id} not found", ) -@router.put( - "/{file_id}", - response_model=FilePublic, - summary="Update file metadata" -) +@router.put("/{file_id}", response_model=FilePublic, summary="Update file metadata") def update_file( - session: SessionDep, - file_id: str, - file_update: FileUpdate + session: SessionDep, file_id: str, file_update: FileUpdate ) -> FilePublic: """ Update file metadata. @@ -204,19 +209,14 @@ def update_file( except Exception: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"File with id {file_id} not found" + detail=f"File with id {file_id} not found", ) @router.delete( - "/{file_id}", - status_code=status.HTTP_204_NO_CONTENT, - summary="Delete file" + "/{file_id}", status_code=status.HTTP_204_NO_CONTENT, summary="Delete file" ) -def delete_file( - session: SessionDep, - file_id: str -) -> None: +def delete_file(session: SessionDep, file_id: str) -> None: """ Delete a file and its content. @@ -227,23 +227,17 @@ def delete_file( if not success: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"File with id {file_id} not found" + detail=f"File with id {file_id} not found", ) except Exception: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"File with id {file_id} not found" + detail=f"File with id {file_id} not found", ) -@router.get( - "/{file_id}/content", - summary="Download file content" -) -def download_file( - session: SessionDep, - file_id: str -): +@router.get("/{file_id}/content", summary="Download file content") +def download_file(session: SessionDep, file_id: str): """ Download the content of a file. @@ -265,24 +259,20 @@ def generate(): media_type=file_record.mime_type or "application/octet-stream", headers={ "Content-Disposition": f"attachment; filename={file_record.filename}" - } + }, ) except Exception: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"File with id {file_id} not found or content not available" + detail=f"File with id {file_id} not found or content not available", ) @router.post( - "/{file_id}/content", - response_model=FilePublic, - summary="Upload file content" + "/{file_id}/content", response_model=FilePublic, summary="Upload file content" ) def upload_file_content( - session: SessionDep, - file_id: str, - content: UploadFile = FastAPIFile(...) + session: SessionDep, file_id: str, content: UploadFile = FastAPIFile(...) ) -> FilePublic: """ Upload content for an existing file record. 
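A rough end-to-end sketch against the two content routes above; the base URL and `file_id` are placeholders, not values from the patch.

```python
# Hedged sketch: file_id and host are placeholders.
import requests

base = "http://localhost:8000/api/v1/files"
file_id = "abc123XYZ456"  # an existing record's file_id

# Upload content for the record
with open("report.pdf", "rb") as fh:
    r = requests.post(f"{base}/{file_id}/content", files={"content": ("report.pdf", fh)})
    r.raise_for_status()

# Stream the content back down
with requests.get(f"{base}/{file_id}/content", stream=True) as resp:
    resp.raise_for_status()
    with open("report_copy.pdf", "wb") as out:
        for chunk in resp.iter_content(chunk_size=8192):
            out.write(chunk)
```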
@@ -296,14 +286,14 @@ def upload_file_content( except Exception: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"File with id {file_id} not found" + detail=f"File with id {file_id} not found", ) @router.get( "/entity/{entity_type}/{entity_id}", response_model=PaginatedFileResponse, - summary="List files for a specific entity." + summary="List files for a specific entity.", ) def list_files_for_entity( session: SessionDep, @@ -320,17 +310,16 @@ def list_files_for_entity( - **entity_type**: Either "project" or "run" - **entity_id**: The project ID or run barcode """ - return services.list_files_for_entity(session, entity_type, entity_id, page, per_page, file_type) + return services.list_files_for_entity( + session, entity_type, entity_id, page, per_page, file_type + ) @router.get( - "/entity/{entity_type}/{entity_id}/count", - summary="Get file count for entity" + "/entity/{entity_type}/{entity_id}/count", summary="Get file count for entity" ) def get_file_count_for_entity( - session: SessionDep, - entity_type: EntityType, - entity_id: str + session: SessionDep, entity_type: EntityType, entity_id: str ) -> dict: """ Get the total number of files for a specific project or run. @@ -340,5 +329,3 @@ def get_file_count_for_entity( """ count = services.get_file_count_for_entity(session, entity_type, entity_id) return {"entity_type": entity_type, "entity_id": entity_id, "file_count": count} - - diff --git a/api/files/services.py b/api/files/services.py index 7ef7049..dbbd83e 100644 --- a/api/files/services.py +++ b/api/files/services.py @@ -32,7 +32,7 @@ def generate_file_id() -> str: """Generate a unique file ID""" alphabet = string.ascii_letters + string.digits - return ''.join(secrets.choice(alphabet) for _ in range(12)) + return "".join(secrets.choice(alphabet) for _ in range(12)) def generate_file_path( @@ -40,19 +40,13 @@ def generate_file_path( ) -> str: """Generate a structured file path""" from datetime import datetime, timezone + now = datetime.now(timezone.utc) year = now.strftime("%Y") month = now.strftime("%m") # Create path structure: /{entity_type}/{entity_id}/{file_type}/{year}/{month}/{filename} - path_parts = [ - entity_type.value, - entity_id, - file_type.value, - year, - month, - filename - ] + path_parts = [entity_type.value, entity_id, file_type.value, year, month, filename] return "/".join(path_parts) @@ -64,6 +58,7 @@ def calculate_file_checksum(file_content: bytes) -> str: def get_mime_type(filename: str) -> str: """Get MIME type based on file extension""" import mimetypes + mime_type, _ = mimetypes.guess_type(filename) return mime_type or "application/octet-stream" @@ -72,7 +67,7 @@ def create_file( session: Session, file_create: FileCreate, file_content: bytes | None = None, - storage_root: str = "storage" + storage_root: str = "storage", ) -> File: """Create a new file record and optionally store content""" @@ -87,7 +82,7 @@ def create_file( file_create.entity_type, file_create.entity_id, file_create.file_type, - f"{file_id}_{file_create.filename}" + f"{file_id}_{file_create.filename}", ) # Calculate file metadata if content is provided @@ -110,7 +105,7 @@ def create_file( entity_type=file_create.entity_type, entity_id=file_create.entity_id, is_public=file_create.is_public, - storage_backend=StorageBackend.LOCAL + storage_backend=StorageBackend.LOCAL, ) # Store file content if provided @@ -130,14 +125,12 @@ def create_file( def get_file(session: Session, file_id: str) -> File: """Get a file by its file_id""" - file_record = session.exec( - 
select(File).where(File.file_id == file_id) - ).first() + file_record = session.exec(select(File).where(File.file_id == file_id)).first() if not file_record: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"File with id {file_id} not found" + detail=f"File with id {file_id} not found", ) return file_record @@ -145,14 +138,12 @@ def get_file(session: Session, file_id: str) -> File: def get_file_by_id(session: Session, id: str) -> File: """Get a file by its internal UUID""" - file_record = session.exec( - select(File).where(File.id == id) - ).first() + file_record = session.exec(select(File).where(File.id == id)).first() if not file_record: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"File with internal id {id} not found" + detail=f"File with internal id {id} not found", ) return file_record @@ -203,7 +194,7 @@ def list_files( page: PositiveInt = 1, per_page: PositiveInt = 20, sort_by: str = "upload_date", - sort_order: str = "desc" + sort_order: str = "desc", ) -> FilesPublic: """List files with filtering and pagination""" @@ -229,14 +220,12 @@ def list_files( if filters.search_query: search_term = f"%{filters.search_query}%" query = query.where( - (File.filename.ilike(search_term)) | - (File.description.ilike(search_term)) + (File.filename.ilike(search_term)) + | (File.description.ilike(search_term)) ) # Get total count - total_count = session.exec( - select(func.count()).select_from(query.subquery()) - ).one() + total_count = session.exec(select(func.count()).select_from(query.subquery())).one() # Calculate pagination total_pages = (total_count + per_page - 1) // per_page @@ -271,7 +260,7 @@ def list_files( is_public=file.is_public, is_archived=file.is_archived, storage_backend=file.storage_backend, - checksum=file.checksum + checksum=file.checksum, ) for file in files ] @@ -283,11 +272,13 @@ def list_files( current_page=page, per_page=per_page, has_next=page < total_pages, - has_prev=page > 1 + has_prev=page > 1, ) -def get_file_content(session: Session, file_id: str, storage_root: str = "storage") -> bytes: +def get_file_content( + session: Session, file_id: str, storage_root: str = "storage" +) -> bytes: """Get file content from storage""" file_record = get_file(session, file_id) @@ -295,7 +286,7 @@ def get_file_content(session: Session, file_id: str, storage_root: str = "storag if not full_path.exists(): raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"File content not found at {file_record.file_path}" + detail=f"File content not found at {file_record.file_path}", ) with open(full_path, "rb") as f: @@ -308,34 +299,25 @@ def list_files_for_entity( entity_id: str, page: PositiveInt = 1, per_page: PositiveInt = 20, - file_type: FileType | None = None + file_type: FileType | None = None, ) -> FilesPublic: """List files for a specific entity (project or run)""" filters = FileFilters( - entity_type=entity_type, - entity_id=entity_id, - file_type=file_type + entity_type=entity_type, entity_id=entity_id, file_type=file_type ) - return list_files( - session=session, - filters=filters, - page=page, - per_page=per_page - ) + return list_files(session=session, filters=filters, page=page, per_page=per_page) def get_file_count_for_entity( - session: Session, - entity_type: EntityType, - entity_id: str + session: Session, entity_type: EntityType, entity_id: str ) -> int: """Get the count of files for a specific entity""" count = session.exec( select(func.count(File.id)).where( File.entity_type == entity_type, File.entity_id == 
entity_id, - ~File.is_archived + ~File.is_archived, ) ).one() @@ -369,31 +351,33 @@ def update_file_content( return file_record -def browse_filesystem(directory_path: str, storage_root: str = "storage") -> FileBrowserData: +def browse_filesystem( + directory_path: str, storage_root: str = "storage" +) -> FileBrowserData: """Browse filesystem directory and return structured data""" - + # Construct full path if os.path.isabs(directory_path): full_path = Path(directory_path) else: full_path = Path(storage_root) / directory_path - + # Check if directory exists and is accessible if not full_path.exists(): raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"Directory not found: {directory_path}" + detail=f"Directory not found: {directory_path}", ) - + if not full_path.is_dir(): raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, - detail=f"Path is not a directory: {directory_path}" + detail=f"Path is not a directory: {directory_path}", ) - + folders = [] files = [] - + try: # List directory contents for item in full_path.iterdir(): @@ -401,29 +385,24 @@ def browse_filesystem(directory_path: str, storage_root: str = "storage") -> Fil stat = item.stat() mod_time = datetime.fromtimestamp(stat.st_mtime) date_str = mod_time.strftime("%Y-%m-%d %H:%M:%S") - + if item.is_dir(): - folders.append(FileBrowserFolder( - name=item.name, - date=date_str - )) + folders.append(FileBrowserFolder(name=item.name, date=date_str)) else: - files.append(FileBrowserFile( - name=item.name, - date=date_str, - size=stat.st_size - )) - + files.append( + FileBrowserFile(name=item.name, date=date_str, size=stat.st_size) + ) + except PermissionError: raise HTTPException( status_code=status.HTTP_403_FORBIDDEN, - detail=f"Permission denied accessing directory: {directory_path}" + detail=f"Permission denied accessing directory: {directory_path}", ) - + # Sort folders and files by name folders.sort(key=lambda x: x.name.lower()) files.sort(key=lambda x: x.name.lower()) - + return FileBrowserData(folders=folders, files=files) @@ -433,24 +412,26 @@ def list_files_as_browser_data( page: PositiveInt = 1, per_page: PositiveInt = 20, sort_by: str = "upload_date", - sort_order: str = "desc" + sort_order: str = "desc", ) -> FileBrowserData: """List database files in FileBrowserData format (files only, no folders)""" - + # Get files using existing list_files function files_result = list_files(session, filters, page, per_page, sort_by, sort_order) - + # Convert to FileBrowserFile format browser_files = [] for file_record in files_result.data: # Format date date_str = file_record.upload_date.strftime("%Y-%m-%d %H:%M:%S") - - browser_files.append(FileBrowserFile( - name=file_record.filename, - date=date_str, - size=file_record.file_size or 0 - )) - + + browser_files.append( + FileBrowserFile( + name=file_record.filename, + date=date_str, + size=file_record.file_size or 0, + ) + ) + # No folders for database files return FileBrowserData(folders=[], files=browser_files) From 0197383421fbf2c78e3fd9b7c383575eb740bc25 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Fri, 26 Sep 2025 20:11:15 -0400 Subject: [PATCH 25/35] Format with black --- tests/api/test_files.py | 184 ++++++++++++++++++++-------------------- 1 file changed, 90 insertions(+), 94 deletions(-) diff --git a/tests/api/test_files.py b/tests/api/test_files.py index 971b20e..05398e6 100644 --- a/tests/api/test_files.py +++ b/tests/api/test_files.py @@ -77,7 +77,7 @@ def test_file_create_model(self): entity_type=EntityType.PROJECT, entity_id="PROJ001", 
is_public=True, - created_by="testuser" + created_by="testuser", ) assert file_create.filename == "test.txt" @@ -91,9 +91,7 @@ def test_file_create_model(self): def test_file_update_model(self): """Test FileUpdate model validation""" file_update = FileUpdate( - filename="updated.txt", - description="Updated description", - is_public=False + filename="updated.txt", description="Updated description", is_public=False ) assert file_update.filename == "updated.txt" @@ -124,10 +122,7 @@ def test_generate_file_id(self): def test_generate_file_path(self): """Test file path generation""" path = generate_file_path( - EntityType.PROJECT, - "PROJ001", - FileType.FASTQ, - "sample.fastq" + EntityType.PROJECT, "PROJ001", FileType.FASTQ, "sample.fastq" ) # Should contain entity type, entity id, file type, year, month, filename @@ -150,14 +145,19 @@ def test_calculate_file_checksum(self): # Should be SHA-256 hash assert len(checksum) == 64 - assert checksum == "dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f" + assert ( + checksum + == "dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f" + ) def test_get_mime_type(self): """Test MIME type detection""" assert get_mime_type("test.txt") == "text/plain" assert get_mime_type("test.pdf") == "application/pdf" assert get_mime_type("test.jpg") == "image/jpeg" - assert get_mime_type("test.fastq") == "application/octet-stream" # Unknown extension + assert ( + get_mime_type("test.fastq") == "application/octet-stream" + ) # Unknown extension def test_create_file_without_content(self, session: Session, temp_storage): """Test creating file record without content""" @@ -167,7 +167,7 @@ def test_create_file_without_content(self, session: Session, temp_storage): file_type=FileType.DOCUMENT, entity_type=EntityType.PROJECT, entity_id="PROJ001", - created_by="testuser" + created_by="testuser", ) file_record = create_file(session, file_create, storage_root=temp_storage) @@ -191,10 +191,12 @@ def test_create_file_with_content(self, session: Session, temp_storage): description="Test file", file_type=FileType.DOCUMENT, entity_type=EntityType.PROJECT, - entity_id="PROJ001" + entity_id="PROJ001", ) - file_record = create_file(session, file_create, content, storage_root=temp_storage) + file_record = create_file( + session, file_create, content, storage_root=temp_storage + ) assert file_record.file_size == len(content) assert file_record.checksum == calculate_file_checksum(content) @@ -207,9 +209,7 @@ def test_create_file_with_content(self, session: Session, temp_storage): def test_get_file(self, session: Session, temp_storage): """Test getting file by file_id""" file_create = FileCreate( - filename="test.txt", - entity_type=EntityType.PROJECT, - entity_id="PROJ001" + filename="test.txt", entity_type=EntityType.PROJECT, entity_id="PROJ001" ) created_file = create_file(session, file_create, storage_root=temp_storage) @@ -232,15 +232,13 @@ def test_update_file(self, session: Session, temp_storage): filename="test.txt", description="Original description", entity_type=EntityType.PROJECT, - entity_id="PROJ001" + entity_id="PROJ001", ) created_file = create_file(session, file_create, storage_root=temp_storage) file_update = FileUpdate( - filename="updated.txt", - description="Updated description", - is_public=True + filename="updated.txt", description="Updated description", is_public=True ) updated_file = update_file(session, created_file.file_id, file_update) @@ -253,12 +251,12 @@ def test_delete_file(self, session: Session, temp_storage): """Test deleting 
file and content""" content = b"Hello, World!" file_create = FileCreate( - filename="test.txt", - entity_type=EntityType.PROJECT, - entity_id="PROJ001" + filename="test.txt", entity_type=EntityType.PROJECT, entity_id="PROJ001" ) - created_file = create_file(session, file_create, content, storage_root=temp_storage) + created_file = create_file( + session, file_create, content, storage_root=temp_storage + ) file_path = Path(temp_storage) / created_file.file_path # Verify file exists @@ -296,7 +294,7 @@ def test_list_files_with_data(self, session: Session, temp_storage): description=f"Test file {i}", entity_type=EntityType.PROJECT, entity_id="PROJ001", - file_type=FileType.DOCUMENT + file_type=FileType.DOCUMENT, ) create_file(session, file_create, storage_root=temp_storage) @@ -318,7 +316,7 @@ def test_list_files_for_entity(self, session: Session, temp_storage): file_create = FileCreate( filename=f"test{i}.txt", entity_type=EntityType.PROJECT, - entity_id=entity_id + entity_id=entity_id, ) create_file(session, file_create, storage_root=temp_storage) @@ -335,7 +333,7 @@ def test_get_file_count_for_entity(self, session: Session, temp_storage): file_create = FileCreate( filename=f"test{i}.txt", entity_type=EntityType.PROJECT, - entity_id="PROJ001" + entity_id="PROJ001", ) create_file(session, file_create, storage_root=temp_storage) @@ -346,12 +344,12 @@ def test_get_file_content(self, session: Session, temp_storage): """Test retrieving file content""" content = b"Hello, World!" file_create = FileCreate( - filename="test.txt", - entity_type=EntityType.PROJECT, - entity_id="PROJ001" + filename="test.txt", entity_type=EntityType.PROJECT, entity_id="PROJ001" ) - created_file = create_file(session, file_create, content, storage_root=temp_storage) + created_file = create_file( + session, file_create, content, storage_root=temp_storage + ) retrieved_content = get_file_content( session, created_file.file_id, storage_root=temp_storage ) @@ -361,9 +359,7 @@ def test_get_file_content(self, session: Session, temp_storage): def test_get_file_content_not_found(self, session: Session, temp_storage): """Test retrieving content for non-existent file""" file_create = FileCreate( - filename="test.txt", - entity_type=EntityType.PROJECT, - entity_id="PROJ001" + filename="test.txt", entity_type=EntityType.PROJECT, entity_id="PROJ001" ) # Create file record without content @@ -387,7 +383,7 @@ def test_create_file_endpoint(self, client: TestClient, session: Session): "entity_type": "project", "entity_id": "PROJ001", "is_public": "true", # Form data sends as string - "created_by": "api_test_user" + "created_by": "api_test_user", } response = client.post("/api/v1/files", data=file_data) @@ -414,7 +410,7 @@ def test_get_files_endpoint(self, client: TestClient, session: Session): "file_type": "document", "entity_type": "project", "entity_id": "PROJ001", - "created_by": "list_test_user" + "created_by": "list_test_user", } client.post("/api/v1/files", data=file_data) @@ -468,7 +464,7 @@ def test_get_file_endpoint(self, client: TestClient, session: Session): "file_type": "document", "entity_type": "project", "entity_id": "PROJ001", - "created_by": "get_test_user" + "created_by": "get_test_user", } create_response = client.post("/api/v1/files", data=file_data) @@ -498,7 +494,7 @@ def test_update_file_endpoint(self, client: TestClient, session: Session): "file_type": "document", "entity_type": "project", "entity_id": "PROJ001", - "created_by": "update_test_user" + "created_by": "update_test_user", } create_response = 
client.post("/api/v1/files", data=file_data) @@ -507,10 +503,7 @@ def test_update_file_endpoint(self, client: TestClient, session: Session): file_id = created_file["file_id"] # Test successful update - update_data = { - "description": "Updated description", - "is_public": True - } + update_data = {"description": "Updated description", "is_public": True} response = client.put(f"/api/v1/files/{file_id}", json=update_data) assert response.status_code == 200 @@ -534,7 +527,7 @@ def test_delete_file_endpoint(self, client: TestClient, session: Session): "file_type": "document", "entity_type": "project", "entity_id": "PROJ001", - "created_by": "delete_test_user" + "created_by": "delete_test_user", } create_response = client.post("/api/v1/files", data=file_data) @@ -560,7 +553,7 @@ def test_list_files_for_entity_endpoint(self, client: TestClient, session: Sessi entities = [ ("project", "PROJ001"), ("project", "PROJ002"), - ("run", "190110_MACHINE123_0001_FLOWCELL123") + ("run", "190110_MACHINE123_0001_FLOWCELL123"), ] for entity_type, entity_id in entities: @@ -571,7 +564,7 @@ def test_list_files_for_entity_endpoint(self, client: TestClient, session: Sessi "file_type": "document", "entity_type": entity_type, "entity_id": entity_id, - "created_by": "entity_test_user" + "created_by": "entity_test_user", } client.post("/api/v1/files", data=file_data) @@ -586,7 +579,9 @@ def test_list_files_for_entity_endpoint(self, client: TestClient, session: Sessi assert item["entity_id"] == "PROJ001" # Test listing files for specific run - response = client.get("/api/v1/files/entity/run/190110_MACHINE123_0001_FLOWCELL123") + response = client.get( + "/api/v1/files/entity/run/190110_MACHINE123_0001_FLOWCELL123" + ) assert response.status_code == 200 data = response.json() @@ -603,7 +598,9 @@ def test_list_files_for_entity_endpoint(self, client: TestClient, session: Sessi assert data["per_page"] == 1 assert len(data["data"]) == 1 - def test_get_file_count_for_entity_endpoint(self, client: TestClient, session: Session): + def test_get_file_count_for_entity_endpoint( + self, client: TestClient, session: Session + ): """Test getting file count for a specific entity""" # Create files for a specific entity entity_type = "project" @@ -616,7 +613,7 @@ def test_get_file_count_for_entity_endpoint(self, client: TestClient, session: S "file_type": "document", "entity_type": entity_type, "entity_id": entity_id, - "created_by": "count_test_user" + "created_by": "count_test_user", } client.post("/api/v1/files", data=file_data) @@ -638,7 +635,9 @@ def test_get_file_count_for_entity_endpoint(self, client: TestClient, session: S assert data["entity_id"] == "EMPTY_PROJECT" assert data["file_count"] == 0 - def test_create_file_with_content_endpoint(self, client: TestClient, session: Session): + def test_create_file_with_content_endpoint( + self, client: TestClient, session: Session + ): """Test file creation with content upload""" import io @@ -649,12 +648,14 @@ def test_create_file_with_content_endpoint(self, client: TestClient, session: Se "file_type": "document", "entity_type": "project", "entity_id": "PROJ001", - "created_by": "content_test_user" + "created_by": "content_test_user", } # Create file content file_content = b"Hello, this is test content!" 
- files = {"content": ("test_content.txt", io.BytesIO(file_content), "text/plain")} + files = { + "content": ("test_content.txt", io.BytesIO(file_content), "text/plain") + } # Send multipart form data response = client.post("/api/v1/files", data=file_data, files=files) @@ -672,7 +673,7 @@ def test_error_handling(self, client: TestClient, session: Session): "filename": "test.txt", "file_type": "invalid_type", "entity_type": "project", - "entity_id": "PROJ001" + "entity_id": "PROJ001", } response = client.post("/api/v1/files", data=invalid_file_data) @@ -683,7 +684,7 @@ def test_error_handling(self, client: TestClient, session: Session): "filename": "test.txt", "file_type": "document", "entity_type": "invalid_entity", - "entity_id": "PROJ001" + "entity_id": "PROJ001", } response = client.post("/api/v1/files", data=invalid_entity_data) @@ -711,10 +712,12 @@ def test_complete_file_lifecycle(self, session: Session, temp_storage): file_type=FileType.DOCUMENT, entity_type=EntityType.PROJECT, entity_id="PROJ001", - created_by="testuser" + created_by="testuser", ) - created_file = create_file(session, file_create, content, storage_root=temp_storage) + created_file = create_file( + session, file_create, content, storage_root=temp_storage + ) assert created_file.filename == "lifecycle.txt" assert created_file.file_size == len(content) @@ -729,8 +732,7 @@ def test_complete_file_lifecycle(self, session: Session, temp_storage): # Update file metadata file_update = FileUpdate( - description="Updated lifecycle test file", - is_public=True + description="Updated lifecycle test file", is_public=True ) updated_file = update_file(session, created_file.file_id, file_update) @@ -750,7 +752,7 @@ def test_multiple_entities_file_management(self, session: Session, temp_storage) entities = [ (EntityType.PROJECT, "PROJ001"), (EntityType.PROJECT, "PROJ002"), - (EntityType.RUN, "190110_MACHINE123_0001_FLOWCELL123") + (EntityType.RUN, "190110_MACHINE123_0001_FLOWCELL123"), ] created_files = [] @@ -762,10 +764,12 @@ def test_multiple_entities_file_management(self, session: Session, temp_storage) filename=f"file{i}.txt", entity_type=entity_type, entity_id=entity_id, - file_type=FileType.DOCUMENT + file_type=FileType.DOCUMENT, ) - file_record = create_file(session, file_create, storage_root=temp_storage) + file_record = create_file( + session, file_create, storage_root=temp_storage + ) created_files.append(file_record) # Verify total count @@ -790,13 +794,14 @@ def test_file_type_filtering(self, session: Session, temp_storage): filename=f"test.{file_type.value}", entity_type=EntityType.PROJECT, entity_id="PROJ001", - file_type=file_type + file_type=file_type, ) create_file(session, file_create, storage_root=temp_storage) # Test filtering by each type for file_type in file_types: from api.files.models import FileFilters + filters = FileFilters(file_type=file_type) result = list_files(session, filters=filters) @@ -809,22 +814,17 @@ class TestFileBrowserModels: def test_file_browser_folder_model(self): """Test FileBrowserFolder model""" - folder = FileBrowserFolder( - name="test_folder", - date="2023-01-01 12:00:00" - ) - + folder = FileBrowserFolder(name="test_folder", date="2023-01-01 12:00:00") + assert folder.name == "test_folder" assert folder.date == "2023-01-01 12:00:00" def test_file_browser_file_model(self): """Test FileBrowserFile model""" file = FileBrowserFile( - name="test_file.txt", - date="2023-01-01 12:00:00", - size=1024 + name="test_file.txt", date="2023-01-01 12:00:00", size=1024 ) - + assert file.name == 
"test_file.txt" assert file.date == "2023-01-01 12:00:00" assert file.size == 1024 @@ -833,12 +833,9 @@ def test_file_browser_data_model(self): """Test FileBrowserData model""" folder = FileBrowserFolder(name="folder1", date="2023-01-01 12:00:00") file = FileBrowserFile(name="file1.txt", date="2023-01-01 12:00:00", size=100) - - browser_data = FileBrowserData( - folders=[folder], - files=[file] - ) - + + browser_data = FileBrowserData(folders=[folder], files=[file]) + assert len(browser_data.folders) == 1 assert len(browser_data.files) == 1 assert browser_data.folders[0].name == "folder1" @@ -852,36 +849,36 @@ class TestFileBrowserServices: def temp_storage(self): """Create temporary storage with folder/file structure""" temp_dir = tempfile.mkdtemp() - + # Create some folders (Path(temp_dir) / "folder1").mkdir() (Path(temp_dir) / "folder2").mkdir() - + # Create some files (Path(temp_dir) / "file1.txt").write_text("content1") (Path(temp_dir) / "file2.txt").write_text("content2") - + yield temp_dir shutil.rmtree(temp_dir) def test_browse_filesystem(self, temp_storage): """Test filesystem browsing""" result = browse_filesystem("", temp_storage) - + assert isinstance(result, FileBrowserData) assert len(result.folders) == 2 assert len(result.files) == 2 - + # Check folder names folder_names = [f.name for f in result.folders] assert "folder1" in folder_names assert "folder2" in folder_names - + # Check file names file_names = [f.name for f in result.files] assert "file1.txt" in file_names assert "file2.txt" in file_names - + # Check file sizes for file in result.files: assert file.size > 0 @@ -891,7 +888,7 @@ def test_browse_filesystem_nonexistent_directory(self): """Test browsing non-existent directory""" with pytest.raises(HTTPException) as exc_info: browse_filesystem("nonexistent", "nonexistent_root") - + assert exc_info.value.status_code == 404 assert "not found" in str(exc_info.value.detail) @@ -904,17 +901,17 @@ def test_list_files_as_browser_data(self, session: Session): description=f"Database file {i}", entity_type=EntityType.PROJECT, entity_id="PROJ001", - file_type=FileType.DOCUMENT + file_type=FileType.DOCUMENT, ) create_file(session, file_create) # Get files in browser format result = list_files_as_browser_data(session) - + assert isinstance(result, FileBrowserData) assert len(result.folders) == 0 # No folders for database files assert len(result.files) == 3 - + # Check file properties for file in result.files: assert file.name.startswith("db_file_") @@ -933,17 +930,17 @@ def test_browse_filesystem_endpoint(self, client: TestClient): # Create test structure (Path(temp_dir) / "test_folder").mkdir() (Path(temp_dir) / "test_file.txt").write_text("test content") - + # Test the endpoint response = client.get(f"/api/v1/files/browse?storage_root={temp_dir}") assert response.status_code == 200 - + data = response.json() assert "folders" in data assert "files" in data assert isinstance(data["folders"], list) assert isinstance(data["files"], list) - + finally: shutil.rmtree(temp_dir) @@ -957,20 +954,20 @@ def test_browse_db_endpoint(self, client: TestClient, session: Session): "file_type": "document", "entity_type": "project", "entity_id": "PROJ001", - "created_by": "browser_test_user" + "created_by": "browser_test_user", } client.post("/api/v1/files", data=file_data) # Test the browser endpoint response = client.get("/api/v1/files/browse-db") assert response.status_code == 200 - + data = response.json() assert "folders" in data assert "files" in data assert len(data["folders"]) == 0 # No 
folders for database files assert len(data["files"]) >= 2 - + # Check file structure for file in data["files"]: assert "name" in file @@ -984,4 +981,3 @@ def test_browse_filesystem_error_handling(self, client: TestClient): "/api/v1/files/browse?directory_path=nonexistent&storage_root=nonexistent" ) assert response.status_code == 404 - From 15ad92c2f3b627cebbfed7e36c9b0f6777e92c02 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Fri, 26 Sep 2025 21:39:16 -0400 Subject: [PATCH 26/35] Add s3 support --- api/files/routes.py | 25 +++++-- api/files/services.py | 160 +++++++++++++++++++++++++++++++++++++++- tests/api/test_files.py | 45 +++++++++++ 3 files changed, 224 insertions(+), 6 deletions(-) diff --git a/api/files/routes.py b/api/files/routes.py index 8d06ff9..ea886fc 100644 --- a/api/files/routes.py +++ b/api/files/routes.py @@ -123,16 +123,31 @@ def list_files( "/browse", response_model=FileBrowserData, summary="Browse filesystem directory" ) def browse_filesystem( - directory_path: str = Query("", description="Directory path to browse"), - storage_root: str = Query("storage", description="Storage root directory"), + directory_path: str = Query( + "", description="Directory path to browse (local path or s3://bucket/key)" + ), + storage_root: str = Query( + "storage", description="Storage root directory (ignored for S3 paths)" + ), ) -> FileBrowserData: """ - Browse a filesystem directory and return folders and files in structured format. + Browse a filesystem directory or S3 bucket and return folders and files in structured format. - - **directory_path**: Path relative to storage root (empty for root) - - **storage_root**: Base storage directory (defaults to 'storage') + Supports both local filesystem and AWS S3: + - **Local paths**: Relative to storage_root (empty for root) or absolute paths + - **S3 paths**: Use s3://bucket/key format (e.g., s3://my-bucket/path/to/folder/) + - **storage_root**: Base storage directory for local paths (ignored for S3) Returns separate arrays for folders and files with name, date, and size information. + + For S3 paths: + - Requires AWS credentials to be configured + - Folders represent S3 prefixes (common prefixes) + - Files show S3 object metadata (size, last modified) + + Examples: + - Local: `/browse?directory_path=project1/data` + - S3: `/browse?directory_path=s3://my-bucket/project1/data/` """ return services.browse_filesystem(directory_path, storage_root) diff --git a/api/files/services.py b/api/files/services.py index dbbd83e..a81969e 100644 --- a/api/files/services.py +++ b/api/files/services.py @@ -12,6 +12,13 @@ from pydantic import PositiveInt from fastapi import HTTPException, status +try: + import boto3 + from botocore.exceptions import ClientError, NoCredentialsError + BOTO3_AVAILABLE = True +except ImportError: + BOTO3_AVAILABLE = False + from api.files.models import ( File, FileCreate, @@ -354,7 +361,24 @@ def update_file_content( def browse_filesystem( directory_path: str, storage_root: str = "storage" ) -> FileBrowserData: - """Browse filesystem directory and return structured data""" + """ + Browse filesystem directory and return structured data. + + Automatically detects if the path is S3 (s3://bucket/key) or local filesystem + and routes to the appropriate handler. 
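As a quick usage sketch of this dispatch (bucket and directory names below are made up; the function and helper names are the ones added in this patch):

```python
# Hedged sketch: bucket/prefix values are illustrative.
from api.files.services import browse_filesystem, _is_s3_path, _parse_s3_path

assert _is_s3_path("project1/data") is False
assert _parse_s3_path("s3://my-bucket/runs/") == ("my-bucket", "runs/")

local_listing = browse_filesystem("project1/data", storage_root="storage")
s3_listing = browse_filesystem("s3://my-bucket/runs/")  # needs boto3 + AWS credentials
```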
+ """ + # Check if this is an S3 path + if _is_s3_path(directory_path): + return browse_s3(directory_path) + + # Handle local filesystem + return _browse_local_filesystem(directory_path, storage_root) + + +def _browse_local_filesystem( + directory_path: str, storage_root: str = "storage" +) -> FileBrowserData: + """Browse local filesystem directory and return structured data""" # Construct full path if os.path.isabs(directory_path): @@ -406,6 +430,140 @@ def browse_filesystem( return FileBrowserData(folders=folders, files=files) +def _is_s3_path(path: str) -> bool: + """Check if a path is an S3 URI (s3://bucket/key)""" + return path.startswith("s3://") + + +def _parse_s3_path(s3_path: str) -> tuple[str, str]: + """Parse S3 path into bucket and prefix""" + if not s3_path.startswith("s3://"): + raise ValueError("Invalid S3 path format. Must start with s3://") + + # Remove s3:// prefix + path_without_scheme = s3_path[5:] + + # Check for empty path after s3:// + if not path_without_scheme: + raise ValueError("Invalid S3 path format. Bucket name is required") + + # Split into bucket and key + if "/" in path_without_scheme: + bucket, key = path_without_scheme.split("/", 1) + else: + bucket = path_without_scheme + key = "" + + # Validate bucket name is not empty + if not bucket: + raise ValueError("Invalid S3 path format. Bucket name cannot be empty") + + return bucket, key + + +def browse_s3(s3_path: str) -> FileBrowserData: + """Browse S3 bucket/prefix and return structured data""" + if not BOTO3_AVAILABLE: + raise HTTPException( + status_code=status.HTTP_501_NOT_IMPLEMENTED, + detail="S3 support not available. Install boto3 to enable S3 browsing.", + ) + + try: + bucket, prefix = _parse_s3_path(s3_path) + except ValueError as e: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=str(e), + ) + + try: + # Initialize S3 client + s3_client = boto3.client('s3') + + # List objects with the given prefix + paginator = s3_client.get_paginator('list_objects_v2') + page_iterator = paginator.paginate( + Bucket=bucket, + Prefix=prefix, + Delimiter='/' # This helps us get "folders" + ) + + folders = [] + files = [] + + for page in page_iterator: + # Handle "folders" (common prefixes) + for common_prefix in page.get('CommonPrefixes', []): + folder_prefix = common_prefix['Prefix'] + # Remove the current prefix to get just the folder name + folder_name = folder_prefix[len(prefix):].rstrip('/') + if folder_name: # Skip empty names + folders.append(FileBrowserFolder( + name=folder_name, + date="" # S3 prefixes don't have modification dates + )) + + # Handle actual files + for obj in page.get('Contents', []): + key = obj['Key'] + # Skip if this is just the prefix itself (directory marker) + if key == prefix: + continue + + # Remove the current prefix to get just the file name + file_name = key[len(prefix):] + + # Skip files that are in subdirectories (contain '/') + if '/' in file_name: + continue + + if file_name: # Skip empty names + # Format the date + mod_time = obj['LastModified'] + date_str = mod_time.strftime("%Y-%m-%d %H:%M:%S") + + files.append(FileBrowserFile( + name=file_name, + date=date_str, + size=obj['Size'] + )) + + # Sort folders and files by name + folders.sort(key=lambda x: x.name.lower()) + files.sort(key=lambda x: x.name.lower()) + + return FileBrowserData(folders=folders, files=files) + + except NoCredentialsError: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="AWS credentials not found. 
Please configure AWS credentials.", + ) + except ClientError as e: + error_code = e.response['Error']['Code'] + if error_code == 'NoSuchBucket': + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"S3 bucket not found: {bucket}", + ) + elif error_code == 'AccessDenied': + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail=f"Access denied to S3 bucket: {bucket}", + ) + else: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"S3 error: {e.response['Error']['Message']}", + ) + except Exception as e: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Unexpected error browsing S3: {str(e)}", + ) + + def list_files_as_browser_data( session: Session, filters: FileFilters | None = None, diff --git a/tests/api/test_files.py b/tests/api/test_files.py index 05398e6..47898c6 100644 --- a/tests/api/test_files.py +++ b/tests/api/test_files.py @@ -944,6 +944,51 @@ def test_browse_filesystem_endpoint(self, client: TestClient): finally: shutil.rmtree(temp_dir) + def test_s3_path_detection(self): + """Test S3 path detection functionality""" + from api.files.services import _is_s3_path, _parse_s3_path + + # Test S3 path detection + assert _is_s3_path("s3://bucket/key") is True + assert _is_s3_path("s3://bucket") is True + assert _is_s3_path("/local/path") is False + assert _is_s3_path("local/path") is False + assert _is_s3_path("") is False + + # Test S3 path parsing + bucket, key = _parse_s3_path("s3://my-bucket/path/to/folder/") + assert bucket == "my-bucket" + assert key == "path/to/folder/" + + bucket, key = _parse_s3_path("s3://my-bucket") + assert bucket == "my-bucket" + assert key == "" + + bucket, key = _parse_s3_path("s3://my-bucket/") + assert bucket == "my-bucket" + assert key == "" + + # Test invalid S3 path + with pytest.raises(ValueError): + _parse_s3_path("invalid://path") + + def test_browse_s3_without_boto3(self, client: TestClient, monkeypatch): + """Test S3 browsing when boto3 is not available""" + # Mock BOTO3_AVAILABLE to False + import api.files.services as services + monkeypatch.setattr(services, "BOTO3_AVAILABLE", False) + + response = client.get("/api/v1/files/browse?directory_path=s3://test-bucket/") + assert response.status_code == 501 + assert "S3 support not available" in response.json()["detail"] + + def test_browse_s3_invalid_path(self, client: TestClient): + """Test S3 browsing with invalid S3 path""" + # Test with a path that looks like S3 but has invalid format + response = client.get("/api/v1/files/browse?directory_path=s3://") + assert response.status_code == 400 + assert "Invalid S3 path format" in response.json()["detail"] + def test_browse_db_endpoint(self, client: TestClient, session: Session): """Test database files browser endpoint""" # Create test files From a9bd8d5ea968e323da0971e0ad4babcc40ab55c9 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Fri, 26 Sep 2025 21:39:45 -0400 Subject: [PATCH 27/35] Format with black --- api/files/routes.py | 4 +- api/files/services.py | 89 ++++++++++++++++++++--------------------- tests/api/test_files.py | 13 +++--- 3 files changed, 53 insertions(+), 53 deletions(-) diff --git a/api/files/routes.py b/api/files/routes.py index ea886fc..3b2afce 100644 --- a/api/files/routes.py +++ b/api/files/routes.py @@ -139,12 +139,12 @@ def browse_filesystem( - **storage_root**: Base storage directory for local paths (ignored for S3) Returns separate arrays for folders and files with name, date, and size information. 
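For reference, an illustrative payload for an S3 prefix (names, dates, and sizes are invented); folder entries carry an empty date because S3 common prefixes have no modification timestamp, while file entries use the object's LastModified.

```python
# Hedged sketch: values below are invented for illustration.
example_s3_listing = {
    "folders": [
        {"name": "fastq", "date": ""},
        {"name": "reports", "date": ""},
    ],
    "files": [
        {"name": "SampleSheet.csv", "date": "2025-09-26 18:04:11", "size": 2048},
    ],
}
```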
- + For S3 paths: - Requires AWS credentials to be configured - Folders represent S3 prefixes (common prefixes) - Files show S3 object metadata (size, last modified) - + Examples: - Local: `/browse?directory_path=project1/data` - S3: `/browse?directory_path=s3://my-bucket/project1/data/` diff --git a/api/files/services.py b/api/files/services.py index a81969e..336d58b 100644 --- a/api/files/services.py +++ b/api/files/services.py @@ -15,6 +15,7 @@ try: import boto3 from botocore.exceptions import ClientError, NoCredentialsError + BOTO3_AVAILABLE = True except ImportError: BOTO3_AVAILABLE = False @@ -363,14 +364,14 @@ def browse_filesystem( ) -> FileBrowserData: """ Browse filesystem directory and return structured data. - + Automatically detects if the path is S3 (s3://bucket/key) or local filesystem and routes to the appropriate handler. """ # Check if this is an S3 path if _is_s3_path(directory_path): return browse_s3(directory_path) - + # Handle local filesystem return _browse_local_filesystem(directory_path, storage_root) @@ -439,25 +440,25 @@ def _parse_s3_path(s3_path: str) -> tuple[str, str]: """Parse S3 path into bucket and prefix""" if not s3_path.startswith("s3://"): raise ValueError("Invalid S3 path format. Must start with s3://") - + # Remove s3:// prefix path_without_scheme = s3_path[5:] - + # Check for empty path after s3:// if not path_without_scheme: raise ValueError("Invalid S3 path format. Bucket name is required") - + # Split into bucket and key if "/" in path_without_scheme: bucket, key = path_without_scheme.split("/", 1) else: bucket = path_without_scheme key = "" - + # Validate bucket name is not empty if not bucket: raise ValueError("Invalid S3 path format. Bucket name cannot be empty") - + return bucket, key @@ -468,7 +469,7 @@ def browse_s3(s3_path: str) -> FileBrowserData: status_code=status.HTTP_501_NOT_IMPLEMENTED, detail="S3 support not available. 
Install boto3 to enable S3 browsing.", ) - + try: bucket, prefix = _parse_s3_path(s3_path) except ValueError as e: @@ -476,78 +477,76 @@ def browse_s3(s3_path: str) -> FileBrowserData: status_code=status.HTTP_400_BAD_REQUEST, detail=str(e), ) - + try: # Initialize S3 client - s3_client = boto3.client('s3') - + s3_client = boto3.client("s3") + # List objects with the given prefix - paginator = s3_client.get_paginator('list_objects_v2') + paginator = s3_client.get_paginator("list_objects_v2") page_iterator = paginator.paginate( - Bucket=bucket, - Prefix=prefix, - Delimiter='/' # This helps us get "folders" + Bucket=bucket, Prefix=prefix, Delimiter="/" # This helps us get "folders" ) - + folders = [] files = [] - + for page in page_iterator: # Handle "folders" (common prefixes) - for common_prefix in page.get('CommonPrefixes', []): - folder_prefix = common_prefix['Prefix'] + for common_prefix in page.get("CommonPrefixes", []): + folder_prefix = common_prefix["Prefix"] # Remove the current prefix to get just the folder name - folder_name = folder_prefix[len(prefix):].rstrip('/') + folder_name = folder_prefix[len(prefix) :].rstrip("/") if folder_name: # Skip empty names - folders.append(FileBrowserFolder( - name=folder_name, - date="" # S3 prefixes don't have modification dates - )) - + folders.append( + FileBrowserFolder( + name=folder_name, + date="", # S3 prefixes don't have modification dates + ) + ) + # Handle actual files - for obj in page.get('Contents', []): - key = obj['Key'] + for obj in page.get("Contents", []): + key = obj["Key"] # Skip if this is just the prefix itself (directory marker) if key == prefix: continue - + # Remove the current prefix to get just the file name - file_name = key[len(prefix):] - + file_name = key[len(prefix) :] + # Skip files that are in subdirectories (contain '/') - if '/' in file_name: + if "/" in file_name: continue - + if file_name: # Skip empty names # Format the date - mod_time = obj['LastModified'] + mod_time = obj["LastModified"] date_str = mod_time.strftime("%Y-%m-%d %H:%M:%S") - - files.append(FileBrowserFile( - name=file_name, - date=date_str, - size=obj['Size'] - )) - + + files.append( + FileBrowserFile(name=file_name, date=date_str, size=obj["Size"]) + ) + # Sort folders and files by name folders.sort(key=lambda x: x.name.lower()) files.sort(key=lambda x: x.name.lower()) - + return FileBrowserData(folders=folders, files=files) - + except NoCredentialsError: raise HTTPException( status_code=status.HTTP_401_UNAUTHORIZED, detail="AWS credentials not found. 
Please configure AWS credentials.", ) except ClientError as e: - error_code = e.response['Error']['Code'] - if error_code == 'NoSuchBucket': + error_code = e.response["Error"]["Code"] + if error_code == "NoSuchBucket": raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=f"S3 bucket not found: {bucket}", ) - elif error_code == 'AccessDenied': + elif error_code == "AccessDenied": raise HTTPException( status_code=status.HTTP_403_FORBIDDEN, detail=f"Access denied to S3 bucket: {bucket}", diff --git a/tests/api/test_files.py b/tests/api/test_files.py index 47898c6..a04ad18 100644 --- a/tests/api/test_files.py +++ b/tests/api/test_files.py @@ -947,27 +947,27 @@ def test_browse_filesystem_endpoint(self, client: TestClient): def test_s3_path_detection(self): """Test S3 path detection functionality""" from api.files.services import _is_s3_path, _parse_s3_path - + # Test S3 path detection assert _is_s3_path("s3://bucket/key") is True assert _is_s3_path("s3://bucket") is True assert _is_s3_path("/local/path") is False assert _is_s3_path("local/path") is False assert _is_s3_path("") is False - + # Test S3 path parsing bucket, key = _parse_s3_path("s3://my-bucket/path/to/folder/") assert bucket == "my-bucket" assert key == "path/to/folder/" - + bucket, key = _parse_s3_path("s3://my-bucket") assert bucket == "my-bucket" assert key == "" - + bucket, key = _parse_s3_path("s3://my-bucket/") assert bucket == "my-bucket" assert key == "" - + # Test invalid S3 path with pytest.raises(ValueError): _parse_s3_path("invalid://path") @@ -976,8 +976,9 @@ def test_browse_s3_without_boto3(self, client: TestClient, monkeypatch): """Test S3 browsing when boto3 is not available""" # Mock BOTO3_AVAILABLE to False import api.files.services as services + monkeypatch.setattr(services, "BOTO3_AVAILABLE", False) - + response = client.get("/api/v1/files/browse?directory_path=s3://test-bucket/") assert response.status_code == 501 assert "S3 support not available" in response.json()["detail"] From 67882b117795f9291a8a94e6e2e7fc40ae11342a Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Fri, 26 Sep 2025 21:43:56 -0400 Subject: [PATCH 28/35] fix Lint --- api/files/services.py | 7 +++---- api/runs/routes.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/api/files/services.py b/api/files/services.py index 336d58b..b03c02d 100644 --- a/api/files/services.py +++ b/api/files/services.py @@ -33,7 +33,6 @@ FileBrowserData, FileBrowserFolder, FileBrowserFile, - FileBrowserColumns, ) @@ -476,7 +475,7 @@ def browse_s3(s3_path: str) -> FileBrowserData: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail=str(e), - ) + ) from e try: # Initialize S3 client @@ -496,7 +495,7 @@ def browse_s3(s3_path: str) -> FileBrowserData: for common_prefix in page.get("CommonPrefixes", []): folder_prefix = common_prefix["Prefix"] # Remove the current prefix to get just the folder name - folder_name = folder_prefix[len(prefix) :].rstrip("/") + folder_name = folder_prefix[len(prefix):].rstrip("/") if folder_name: # Skip empty names folders.append( FileBrowserFolder( @@ -513,7 +512,7 @@ def browse_s3(s3_path: str) -> FileBrowserData: continue # Remove the current prefix to get just the file name - file_name = key[len(prefix) :] + file_name = key[len(prefix):] # Skip files that are in subdirectories (contain '/') if "/" in file_name: diff --git a/api/runs/routes.py b/api/runs/routes.py index 01e5978..4b40549 100644 --- a/api/runs/routes.py +++ b/api/runs/routes.py @@ -14,7 +14,7 @@ """ from typing import 
Literal -from fastapi import APIRouter, Query, status, HTTPException +from fastapi import APIRouter, Query, status from core.deps import SessionDep, OpenSearchDep from api.runs.models import ( IlluminaMetricsResponseModel, From d87eaee64da69b1944868ffbc5185ab8dc5a818d Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Fri, 26 Sep 2025 21:45:37 -0400 Subject: [PATCH 29/35] Remove FileBrowserColumns model --- api/files/models.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/api/files/models.py b/api/files/models.py index 51763a2..13dd544 100644 --- a/api/files/models.py +++ b/api/files/models.py @@ -191,15 +191,6 @@ class PaginatedFileResponse(SQLModel): model_config = ConfigDict(from_attributes=True) -class FileBrowserColumns(SQLModel): - """Individual file/folder item for file browser""" - - name: str - date: str - size: int | None = None # None for directories - dir: bool # True for directories, False for files - - class FileBrowserFolder(SQLModel): """Folder item for file browser""" From 564b3949c16406da12b3655e8d0329d1dbd38891 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Fri, 26 Sep 2025 21:49:03 -0400 Subject: [PATCH 30/35] Update exception handling --- api/files/routes.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/api/files/routes.py b/api/files/routes.py index 3b2afce..8f6bd13 100644 --- a/api/files/routes.py +++ b/api/files/routes.py @@ -202,11 +202,11 @@ def get_file(session: SessionDep, file_id: str) -> FilePublic: """ try: return services.get_file(session, file_id) - except Exception: + except Exception as exc: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=f"File with id {file_id} not found", - ) + ) from exc @router.put("/{file_id}", response_model=FilePublic, summary="Update file metadata") @@ -221,11 +221,11 @@ def update_file( """ try: return services.update_file(session, file_id, file_update) - except Exception: + except Exception as exc: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=f"File with id {file_id} not found", - ) + ) from exc @router.delete( @@ -244,11 +244,11 @@ def delete_file(session: SessionDep, file_id: str) -> None: status_code=status.HTTP_404_NOT_FOUND, detail=f"File with id {file_id} not found", ) - except Exception: + except Exception as exc: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=f"File with id {file_id} not found", - ) + ) from exc @router.get("/{file_id}/content", summary="Download file content") @@ -276,11 +276,11 @@ def generate(): "Content-Disposition": f"attachment; filename={file_record.filename}" }, ) - except Exception: + except Exception as exc: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=f"File with id {file_id} not found or content not available", - ) + ) from exc @router.post( @@ -298,11 +298,11 @@ def upload_file_content( try: file_content = content.file.read() return services.update_file_content(session, file_id, file_content) - except Exception: + except Exception as exc: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=f"File with id {file_id} not found", - ) + ) from exc @router.get( From 6a7d668510697a49ebba8464b348ae2e3781e206 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Tue, 30 Sep 2025 11:43:03 -0400 Subject: [PATCH 31/35] Implement dual-storage support for Illumina samplesheets --- api/files/services.py | 66 ++++++++++++ tests/api/test_files.py | 218 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 284 insertions(+) diff --git a/api/files/services.py 
b/api/files/services.py index b03c02d..8e33a02 100644 --- a/api/files/services.py +++ b/api/files/services.py @@ -6,6 +6,7 @@ import secrets import string import os +import logging from pathlib import Path from datetime import datetime from sqlmodel import select, Session, func @@ -70,6 +71,53 @@ def get_mime_type(filename: str) -> str: return mime_type or "application/octet-stream" +def _is_valid_storage_path(path: str) -> bool: + """Validate storage path format""" + # Allow S3 paths, local paths, network paths + valid_prefixes = ['s3://', '/', 'file://', 'smb://', 'ftp://'] + return any(path.startswith(prefix) for prefix in valid_prefixes) + + +def _save_samplesheet_to_run_folder( + session: Session, run_barcode: str, file_content: bytes +) -> bool: + """ + Save samplesheet content to the run's folder URI location. + Returns True if successful, False otherwise. + """ + from smart_open import open as smart_open + + try: + # Import here to avoid circular imports + from api.runs.services import get_run + + # Get run information + run = get_run(session, run_barcode) + if not run or not run.run_folder_uri: + logging.warning(f"No run folder URI found for run {run_barcode}") + return False + + # Construct samplesheet path - always use SampleSheet.csv as the standard name + samplesheet_path = f"{run.run_folder_uri.rstrip('/')}/SampleSheet.csv" + + # Validate path format + if not _is_valid_storage_path(samplesheet_path): + logging.warning(f"Invalid storage path format: {samplesheet_path}") + return False + + # Save using smart_open (handles S3, local, network paths) + with smart_open(samplesheet_path, 'wb') as f: + f.write(file_content) + + logging.info(f"Successfully saved samplesheet to run folder: {samplesheet_path}") + return True + + except Exception as e: + # Log error but don't fail the upload + logging.error(f"Failed to save samplesheet to run folder {run_barcode}: {e}") + return False + + def create_file( session: Session, file_create: FileCreate, @@ -117,11 +165,29 @@ def create_file( # Store file content if provided if file_content: + # 1. Save to standard database storage location full_path = Path(storage_root) / file_path full_path.parent.mkdir(parents=True, exist_ok=True) with open(full_path, "wb") as f: f.write(file_content) + # 2. 
SPECIAL HANDLING: If samplesheet for run, also save to run folder + if (file_create.file_type == FileType.SAMPLESHEET and + file_create.entity_type == EntityType.RUN): + dual_storage_success = _save_samplesheet_to_run_folder( + session, file_create.entity_id, file_content + ) + # Add note to description about dual storage status + if dual_storage_success: + status_note = "[Dual-stored to run folder]" + else: + status_note = "[Database-only storage - run folder write failed]" + + if file_record.description: + file_record.description = f"{file_record.description} {status_note}" + else: + file_record.description = status_note + # Save to database session.add(file_record) session.commit() diff --git a/tests/api/test_files.py b/tests/api/test_files.py index a04ad18..5140f02 100644 --- a/tests/api/test_files.py +++ b/tests/api/test_files.py @@ -808,6 +808,224 @@ def test_file_type_filtering(self, session: Session, temp_storage): assert result.total_items == 1 assert result.data[0].file_type == file_type + def test_samplesheet_dual_storage_success(self, session: Session, temp_storage): + """Test that samplesheets are saved to both database storage and run folder""" + from api.runs.models import SequencingRun, RunStatus + from datetime import date + # Create a test run with run_folder_uri + run = SequencingRun( + run_date=date(2019, 1, 10), + machine_id="MACHINE123", + run_number=1, + flowcell_id="FLOWCELL123", + experiment_name="Test Experiment", + run_folder_uri=f"{temp_storage}/run_folder", + status=RunStatus.READY, + ) + session.add(run) + session.commit() + session.refresh(run) + + # Create run folder directory + run_folder = Path(temp_storage) / "run_folder" + run_folder.mkdir(parents=True, exist_ok=True) + + # Create samplesheet content + samplesheet_content = b"""[Header] +IEMFileVersion,4 +Investigator Name,Test User + +[Data] +Sample_ID,Sample_Name,index +Sample1,Sample1,ATCG +""" + + # Create samplesheet file + file_create = FileCreate( + filename="SampleSheet.csv", + description="Test samplesheet", + file_type=FileType.SAMPLESHEET, + entity_type=EntityType.RUN, + entity_id=run.barcode, + created_by="testuser", + ) + + created_file = create_file( + session, file_create, samplesheet_content, storage_root=temp_storage + ) + + # Verify file was created in database storage + db_file_path = Path(temp_storage) / created_file.file_path + assert db_file_path.exists() + assert db_file_path.read_bytes() == samplesheet_content + + # Verify file was also saved to run folder + run_folder_samplesheet = run_folder / "SampleSheet.csv" + assert run_folder_samplesheet.exists() + assert run_folder_samplesheet.read_bytes() == samplesheet_content + + # Verify description indicates dual storage success + assert "[Dual-stored to run folder]" in created_file.description + + def test_samplesheet_dual_storage_no_run_folder(self, session: Session, temp_storage): + """Test samplesheet upload when run has no run_folder_uri""" + from api.runs.models import SequencingRun, RunStatus + from datetime import date + + # Create a test run WITHOUT run_folder_uri + run = SequencingRun( + run_date=date(2019, 1, 10), + machine_id="MACHINE123", + run_number=2, + flowcell_id="FLOWCELL456", + experiment_name="Test Experiment No URI", + run_folder_uri=None, # No URI + status=RunStatus.READY, + ) + session.add(run) + session.commit() + session.refresh(run) + + samplesheet_content = b"Sample samplesheet content" + + # Create samplesheet file + file_create = FileCreate( + filename="SampleSheet.csv", + description="Test samplesheet", 
+ file_type=FileType.SAMPLESHEET, + entity_type=EntityType.RUN, + entity_id=run.barcode, + created_by="testuser", + ) + + created_file = create_file( + session, file_create, samplesheet_content, storage_root=temp_storage + ) + + # Verify file was created in database storage + db_file_path = Path(temp_storage) / created_file.file_path + assert db_file_path.exists() + + # Verify description indicates database-only storage + assert "[Database-only storage - run folder write failed]" in created_file.description + + def test_samplesheet_dual_storage_invalid_path(self, session: Session, temp_storage): + """Test samplesheet upload when run_folder_uri has invalid format""" + from api.runs.models import SequencingRun, RunStatus + from datetime import date + + # Create a test run with invalid run_folder_uri + run = SequencingRun( + run_date=date(2019, 1, 10), + machine_id="MACHINE123", + run_number=3, + flowcell_id="FLOWCELL789", + experiment_name="Test Experiment Invalid Path", + run_folder_uri="invalid://path/format", # Invalid format + status=RunStatus.READY, + ) + session.add(run) + session.commit() + session.refresh(run) + + samplesheet_content = b"Sample samplesheet content" + + # Create samplesheet file + file_create = FileCreate( + filename="SampleSheet.csv", + description="Test samplesheet", + file_type=FileType.SAMPLESHEET, + entity_type=EntityType.RUN, + entity_id=run.barcode, + created_by="testuser", + ) + + created_file = create_file( + session, file_create, samplesheet_content, storage_root=temp_storage + ) + + # Verify file was created in database storage + db_file_path = Path(temp_storage) / created_file.file_path + assert db_file_path.exists() + + # Verify description indicates database-only storage + assert "[Database-only storage - run folder write failed]" in created_file.description + + def test_non_samplesheet_files_not_dual_stored(self, session: Session, temp_storage): + """Test that non-samplesheet files are not dual-stored""" + from api.runs.models import SequencingRun, RunStatus + from datetime import date + + # Create a test run with run_folder_uri + run = SequencingRun( + run_date=date(2019, 1, 10), + machine_id="MACHINE123", + run_number=4, + flowcell_id="FLOWCELL999", + experiment_name="Test Experiment", + run_folder_uri=f"{temp_storage}/run_folder_2", + status=RunStatus.READY, + ) + session.add(run) + session.commit() + session.refresh(run) + + # Create run folder directory + run_folder = Path(temp_storage) / "run_folder_2" + run_folder.mkdir(parents=True, exist_ok=True) + + # Create a NON-samplesheet file for the run + file_content = b"Some other file content" + file_create = FileCreate( + filename="metrics.json", + description="Test metrics file", + file_type=FileType.METRICS, # NOT a samplesheet + entity_type=EntityType.RUN, + entity_id=run.barcode, + created_by="testuser", + ) + + created_file = create_file( + session, file_create, file_content, storage_root=temp_storage + ) + + # Verify file was created in database storage + db_file_path = Path(temp_storage) / created_file.file_path + assert db_file_path.exists() + + # Verify file was NOT saved to run folder + run_folder_file = run_folder / "SampleSheet.csv" + assert not run_folder_file.exists() + + # Verify description does NOT contain dual storage note + assert "[Dual-stored to run folder]" not in (created_file.description or "") + assert "[Database-only storage" not in (created_file.description or "") + + def test_samplesheet_for_project_not_dual_stored(self, session: Session, temp_storage): + """Test that 
samplesheets for projects are not dual-stored""" + # Create a samplesheet for a PROJECT (not a run) + samplesheet_content = b"Sample samplesheet content" + file_create = FileCreate( + filename="SampleSheet.csv", + description="Project samplesheet", + file_type=FileType.SAMPLESHEET, + entity_type=EntityType.PROJECT, # PROJECT, not RUN + entity_id="PROJ001", + created_by="testuser", + ) + + created_file = create_file( + session, file_create, samplesheet_content, storage_root=temp_storage + ) + + # Verify file was created in database storage + db_file_path = Path(temp_storage) / created_file.file_path + assert db_file_path.exists() + + # Verify description does NOT contain dual storage note + assert "[Dual-stored to run folder]" not in (created_file.description or "") + assert "[Database-only storage" not in (created_file.description or "") + class TestFileBrowserModels: """Test file browser model functionality""" From 139ddfe993d4451374723d1a1191c47204ccc89f Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Tue, 30 Sep 2025 11:47:04 -0400 Subject: [PATCH 32/35] Black --- api/files/services.py | 34 +++++++++-------- tests/api/test_files.py | 85 ++++++++++++++++++++++++----------------- 2 files changed, 69 insertions(+), 50 deletions(-) diff --git a/api/files/services.py b/api/files/services.py index 8e33a02..e4d015c 100644 --- a/api/files/services.py +++ b/api/files/services.py @@ -74,7 +74,7 @@ def get_mime_type(filename: str) -> str: def _is_valid_storage_path(path: str) -> bool: """Validate storage path format""" # Allow S3 paths, local paths, network paths - valid_prefixes = ['s3://', '/', 'file://', 'smb://', 'ftp://'] + valid_prefixes = ["s3://", "/", "file://", "smb://", "ftp://"] return any(path.startswith(prefix) for prefix in valid_prefixes) @@ -86,32 +86,34 @@ def _save_samplesheet_to_run_folder( Returns True if successful, False otherwise. """ from smart_open import open as smart_open - + try: # Import here to avoid circular imports from api.runs.services import get_run - + # Get run information run = get_run(session, run_barcode) if not run or not run.run_folder_uri: logging.warning(f"No run folder URI found for run {run_barcode}") return False - + # Construct samplesheet path - always use SampleSheet.csv as the standard name samplesheet_path = f"{run.run_folder_uri.rstrip('/')}/SampleSheet.csv" - + # Validate path format if not _is_valid_storage_path(samplesheet_path): logging.warning(f"Invalid storage path format: {samplesheet_path}") return False - + # Save using smart_open (handles S3, local, network paths) - with smart_open(samplesheet_path, 'wb') as f: + with smart_open(samplesheet_path, "wb") as f: f.write(file_content) - - logging.info(f"Successfully saved samplesheet to run folder: {samplesheet_path}") + + logging.info( + f"Successfully saved samplesheet to run folder: {samplesheet_path}" + ) return True - + except Exception as e: # Log error but don't fail the upload logging.error(f"Failed to save samplesheet to run folder {run_barcode}: {e}") @@ -172,8 +174,10 @@ def create_file( f.write(file_content) # 2. 
SPECIAL HANDLING: If samplesheet for run, also save to run folder - if (file_create.file_type == FileType.SAMPLESHEET and - file_create.entity_type == EntityType.RUN): + if ( + file_create.file_type == FileType.SAMPLESHEET + and file_create.entity_type == EntityType.RUN + ): dual_storage_success = _save_samplesheet_to_run_folder( session, file_create.entity_id, file_content ) @@ -182,7 +186,7 @@ def create_file( status_note = "[Dual-stored to run folder]" else: status_note = "[Database-only storage - run folder write failed]" - + if file_record.description: file_record.description = f"{file_record.description} {status_note}" else: @@ -561,7 +565,7 @@ def browse_s3(s3_path: str) -> FileBrowserData: for common_prefix in page.get("CommonPrefixes", []): folder_prefix = common_prefix["Prefix"] # Remove the current prefix to get just the folder name - folder_name = folder_prefix[len(prefix):].rstrip("/") + folder_name = folder_prefix[len(prefix) :].rstrip("/") if folder_name: # Skip empty names folders.append( FileBrowserFolder( @@ -578,7 +582,7 @@ def browse_s3(s3_path: str) -> FileBrowserData: continue # Remove the current prefix to get just the file name - file_name = key[len(prefix):] + file_name = key[len(prefix) :] # Skip files that are in subdirectories (contain '/') if "/" in file_name: diff --git a/tests/api/test_files.py b/tests/api/test_files.py index 5140f02..8bebbf9 100644 --- a/tests/api/test_files.py +++ b/tests/api/test_files.py @@ -812,6 +812,7 @@ def test_samplesheet_dual_storage_success(self, session: Session, temp_storage): """Test that samplesheets are saved to both database storage and run folder""" from api.runs.models import SequencingRun, RunStatus from datetime import date + # Create a test run with run_folder_uri run = SequencingRun( run_date=date(2019, 1, 10), @@ -825,11 +826,11 @@ def test_samplesheet_dual_storage_success(self, session: Session, temp_storage): session.add(run) session.commit() session.refresh(run) - + # Create run folder directory run_folder = Path(temp_storage) / "run_folder" run_folder.mkdir(parents=True, exist_ok=True) - + # Create samplesheet content samplesheet_content = b"""[Header] IEMFileVersion,4 @@ -839,7 +840,7 @@ def test_samplesheet_dual_storage_success(self, session: Session, temp_storage): Sample_ID,Sample_Name,index Sample1,Sample1,ATCG """ - + # Create samplesheet file file_create = FileCreate( filename="SampleSheet.csv", @@ -849,29 +850,31 @@ def test_samplesheet_dual_storage_success(self, session: Session, temp_storage): entity_id=run.barcode, created_by="testuser", ) - + created_file = create_file( session, file_create, samplesheet_content, storage_root=temp_storage ) - + # Verify file was created in database storage db_file_path = Path(temp_storage) / created_file.file_path assert db_file_path.exists() assert db_file_path.read_bytes() == samplesheet_content - + # Verify file was also saved to run folder run_folder_samplesheet = run_folder / "SampleSheet.csv" assert run_folder_samplesheet.exists() assert run_folder_samplesheet.read_bytes() == samplesheet_content - + # Verify description indicates dual storage success assert "[Dual-stored to run folder]" in created_file.description - def test_samplesheet_dual_storage_no_run_folder(self, session: Session, temp_storage): + def test_samplesheet_dual_storage_no_run_folder( + self, session: Session, temp_storage + ): """Test samplesheet upload when run has no run_folder_uri""" from api.runs.models import SequencingRun, RunStatus from datetime import date - + # Create a test run WITHOUT 
run_folder_uri run = SequencingRun( run_date=date(2019, 1, 10), @@ -885,9 +888,9 @@ def test_samplesheet_dual_storage_no_run_folder(self, session: Session, temp_sto session.add(run) session.commit() session.refresh(run) - + samplesheet_content = b"Sample samplesheet content" - + # Create samplesheet file file_create = FileCreate( filename="SampleSheet.csv", @@ -897,23 +900,28 @@ def test_samplesheet_dual_storage_no_run_folder(self, session: Session, temp_sto entity_id=run.barcode, created_by="testuser", ) - + created_file = create_file( session, file_create, samplesheet_content, storage_root=temp_storage ) - + # Verify file was created in database storage db_file_path = Path(temp_storage) / created_file.file_path assert db_file_path.exists() - + # Verify description indicates database-only storage - assert "[Database-only storage - run folder write failed]" in created_file.description + assert ( + "[Database-only storage - run folder write failed]" + in created_file.description + ) - def test_samplesheet_dual_storage_invalid_path(self, session: Session, temp_storage): + def test_samplesheet_dual_storage_invalid_path( + self, session: Session, temp_storage + ): """Test samplesheet upload when run_folder_uri has invalid format""" from api.runs.models import SequencingRun, RunStatus from datetime import date - + # Create a test run with invalid run_folder_uri run = SequencingRun( run_date=date(2019, 1, 10), @@ -927,9 +935,9 @@ def test_samplesheet_dual_storage_invalid_path(self, session: Session, temp_stor session.add(run) session.commit() session.refresh(run) - + samplesheet_content = b"Sample samplesheet content" - + # Create samplesheet file file_create = FileCreate( filename="SampleSheet.csv", @@ -939,23 +947,28 @@ def test_samplesheet_dual_storage_invalid_path(self, session: Session, temp_stor entity_id=run.barcode, created_by="testuser", ) - + created_file = create_file( session, file_create, samplesheet_content, storage_root=temp_storage ) - + # Verify file was created in database storage db_file_path = Path(temp_storage) / created_file.file_path assert db_file_path.exists() - + # Verify description indicates database-only storage - assert "[Database-only storage - run folder write failed]" in created_file.description + assert ( + "[Database-only storage - run folder write failed]" + in created_file.description + ) - def test_non_samplesheet_files_not_dual_stored(self, session: Session, temp_storage): + def test_non_samplesheet_files_not_dual_stored( + self, session: Session, temp_storage + ): """Test that non-samplesheet files are not dual-stored""" from api.runs.models import SequencingRun, RunStatus from datetime import date - + # Create a test run with run_folder_uri run = SequencingRun( run_date=date(2019, 1, 10), @@ -969,11 +982,11 @@ def test_non_samplesheet_files_not_dual_stored(self, session: Session, temp_stor session.add(run) session.commit() session.refresh(run) - + # Create run folder directory run_folder = Path(temp_storage) / "run_folder_2" run_folder.mkdir(parents=True, exist_ok=True) - + # Create a NON-samplesheet file for the run file_content = b"Some other file content" file_create = FileCreate( @@ -984,24 +997,26 @@ def test_non_samplesheet_files_not_dual_stored(self, session: Session, temp_stor entity_id=run.barcode, created_by="testuser", ) - + created_file = create_file( session, file_create, file_content, storage_root=temp_storage ) - + # Verify file was created in database storage db_file_path = Path(temp_storage) / created_file.file_path assert 
db_file_path.exists() - + # Verify file was NOT saved to run folder run_folder_file = run_folder / "SampleSheet.csv" assert not run_folder_file.exists() - + # Verify description does NOT contain dual storage note assert "[Dual-stored to run folder]" not in (created_file.description or "") assert "[Database-only storage" not in (created_file.description or "") - def test_samplesheet_for_project_not_dual_stored(self, session: Session, temp_storage): + def test_samplesheet_for_project_not_dual_stored( + self, session: Session, temp_storage + ): """Test that samplesheets for projects are not dual-stored""" # Create a samplesheet for a PROJECT (not a run) samplesheet_content = b"Sample samplesheet content" @@ -1013,15 +1028,15 @@ def test_samplesheet_for_project_not_dual_stored(self, session: Session, temp_st entity_id="PROJ001", created_by="testuser", ) - + created_file = create_file( session, file_create, samplesheet_content, storage_root=temp_storage ) - + # Verify file was created in database storage db_file_path = Path(temp_storage) / created_file.file_path assert db_file_path.exists() - + # Verify description does NOT contain dual storage note assert "[Dual-stored to run folder]" not in (created_file.description or "") assert "[Database-only storage" not in (created_file.description or "") From 9a25b90af4e40fe9d3f9961e0c2b779ab19fbe1f Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Tue, 30 Sep 2025 11:48:19 -0400 Subject: [PATCH 33/35] Remove whitespace --- api/files/services.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api/files/services.py b/api/files/services.py index e4d015c..825a138 100644 --- a/api/files/services.py +++ b/api/files/services.py @@ -565,7 +565,7 @@ def browse_s3(s3_path: str) -> FileBrowserData: for common_prefix in page.get("CommonPrefixes", []): folder_prefix = common_prefix["Prefix"] # Remove the current prefix to get just the folder name - folder_name = folder_prefix[len(prefix) :].rstrip("/") + folder_name = folder_prefix[len(prefix):].rstrip("/") if folder_name: # Skip empty names folders.append( FileBrowserFolder( @@ -582,7 +582,7 @@ def browse_s3(s3_path: str) -> FileBrowserData: continue # Remove the current prefix to get just the file name - file_name = key[len(prefix) :] + file_name = key[len(prefix):] # Skip files that are in subdirectories (contain '/') if "/" in file_name: From 6e5f361b555fa8df7cf9d28553bf5dc15c194521 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Thu, 2 Oct 2025 09:33:18 -0400 Subject: [PATCH 34/35] Remove extraneous function --- api/files/services.py | 7 +------ tests/api/test_files.py | 9 +-------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/api/files/services.py b/api/files/services.py index 825a138..9871a75 100644 --- a/api/files/services.py +++ b/api/files/services.py @@ -438,7 +438,7 @@ def browse_filesystem( and routes to the appropriate handler. 
""" # Check if this is an S3 path - if _is_s3_path(directory_path): + if directory_path.startswith("s3://"): return browse_s3(directory_path) # Handle local filesystem @@ -500,11 +500,6 @@ def _browse_local_filesystem( return FileBrowserData(folders=folders, files=files) -def _is_s3_path(path: str) -> bool: - """Check if a path is an S3 URI (s3://bucket/key)""" - return path.startswith("s3://") - - def _parse_s3_path(s3_path: str) -> tuple[str, str]: """Parse S3 path into bucket and prefix""" if not s3_path.startswith("s3://"): diff --git a/tests/api/test_files.py b/tests/api/test_files.py index 8bebbf9..5eb5926 100644 --- a/tests/api/test_files.py +++ b/tests/api/test_files.py @@ -1179,14 +1179,7 @@ def test_browse_filesystem_endpoint(self, client: TestClient): def test_s3_path_detection(self): """Test S3 path detection functionality""" - from api.files.services import _is_s3_path, _parse_s3_path - - # Test S3 path detection - assert _is_s3_path("s3://bucket/key") is True - assert _is_s3_path("s3://bucket") is True - assert _is_s3_path("/local/path") is False - assert _is_s3_path("local/path") is False - assert _is_s3_path("") is False + from api.files.services import _parse_s3_path # Test S3 path parsing bucket, key = _parse_s3_path("s3://my-bucket/path/to/folder/") From 581e260414170ee71f5c40762b8a3b76f4acf996 Mon Sep 17 00:00:00 2001 From: Ryan Golhar Date: Fri, 9 Jan 2026 10:55:58 -0500 Subject: [PATCH 35/35] Add S3 support to create_file --- api/files/routes.py | 25 +++- api/files/services.py | 279 +++++++++++++++++++++++++++++++++++------- 2 files changed, 253 insertions(+), 51 deletions(-) diff --git a/api/files/routes.py b/api/files/routes.py index 52d4e80..de7d702 100644 --- a/api/files/routes.py +++ b/api/files/routes.py @@ -48,10 +48,14 @@ def create_file( is_public: bool = Form(False), created_by: Optional[str] = Form(None), content: Optional[UploadFile] = FastAPIFile(None), + s3_client=Depends(get_s3_client), ) -> FilePublic: """ Create a new file record with optional file content upload. + Storage backend (S3 vs Local) is automatically determined based on + configuration and entity type. + - **filename**: Name of the file - **description**: Optional description of the file - **file_type**: Type of file (fastq, bam, vcf, etc.) @@ -75,7 +79,7 @@ def create_file( if content and content.filename: file_content = content.file.read() - return services.create_file(session, file_in, file_content) + return services.create_file(session, file_in, file_content, s3_client=s3_client) @router.get( @@ -190,14 +194,16 @@ def update_file( @router.delete( "/{file_id}", status_code=status.HTTP_204_NO_CONTENT, summary="Delete file" ) -def delete_file(session: SessionDep, file_id: str) -> None: +def delete_file( + session: SessionDep, file_id: str, s3_client=Depends(get_s3_client) +) -> None: """ - Delete a file and its content. + Delete a file and its content from storage (local or S3). - **file_id**: The unique file identifier """ try: - success = services.delete_file(session, file_id) + success = services.delete_file(session, file_id, s3_client=s3_client) if not success: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, @@ -214,17 +220,24 @@ def delete_file(session: SessionDep, file_id: str) -> None: "/{file_id}/content", response_model=FilePublic, summary="Upload file content" ) def upload_file_content( - session: SessionDep, file_id: str, content: UploadFile = FastAPIFile(...) 
+ session: SessionDep, + file_id: str, + content: UploadFile = FastAPIFile(...), + s3_client=Depends(get_s3_client), ) -> FilePublic: """ Upload content for an existing file record. + + Updates file in appropriate storage backend (S3 or local). - **file_id**: The unique file identifier - **content**: The file content to upload """ try: file_content = content.file.read() - return services.update_file_content(session, file_id, file_content) + return services.update_file_content( + session, file_id, file_content, s3_client=s3_client + ) except Exception as exc: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, diff --git a/api/files/services.py b/api/files/services.py index eb04e6a..6f9c265 100644 --- a/api/files/services.py +++ b/api/files/services.py @@ -70,6 +70,110 @@ def get_mime_type(filename: str) -> str: return mime_type or "application/octet-stream" +def determine_storage_backend( + entity_type: EntityType, entity_id: str +) -> tuple[StorageBackend, str]: + """ + Determine storage backend and base URI from configuration and entity type. + + Args: + entity_type: The entity type (PROJECT or RUN) + entity_id: The entity identifier + + Returns: + Tuple of (StorageBackend, base_uri) + """ + from core.config import get_settings + + settings = get_settings() + + # Check if S3 is configured based on entity type + if entity_type == EntityType.PROJECT: + if settings.DATA_BUCKET_URI.startswith("s3://"): + return StorageBackend.S3, settings.DATA_BUCKET_URI + elif entity_type == EntityType.RUN: + if settings.RESULTS_BUCKET_URI.startswith("s3://"): + return StorageBackend.S3, settings.RESULTS_BUCKET_URI + + # Fallback to local storage + return StorageBackend.LOCAL, "storage" + + +def _upload_to_s3( + file_content: bytes, s3_uri: str, mime_type: str, s3_client=None +) -> bool: + """ + Upload file content to S3. + + Args: + file_content: File bytes to upload + s3_uri: Full S3 URI (s3://bucket/key) + mime_type: Content type for S3 metadata + s3_client: Optional boto3 S3 client + + Returns: + True if successful + + Raises: + HTTPException: If upload fails + """ + if not BOTO3_AVAILABLE: + raise HTTPException( + status_code=status.HTTP_501_NOT_IMPLEMENTED, + detail="boto3 not available for S3 uploads. Install boto3 to enable S3 storage.", + ) + + try: + bucket, key = _parse_s3_path(s3_uri) + + if s3_client is None: + s3_client = boto3.client("s3") + + s3_client.put_object( + Bucket=bucket, + Key=key, + Body=file_content, + ContentType=mime_type, + Metadata={ + "uploaded-by": "ngs360-api", + "upload-timestamp": datetime.utcnow().isoformat(), + }, + ) + + logging.info(f"Successfully uploaded file to S3: {s3_uri}") + return True + + except NoCredentialsError as exc: + logging.error(f"AWS credentials not configured: {exc}") + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="AWS credentials not found. 
Please configure AWS credentials.", + ) from exc + except ClientError as exc: + error_code = exc.response["Error"]["Code"] + if error_code == "NoSuchBucket": + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"S3 bucket not found in URI: {s3_uri}", + ) from exc + elif error_code == "AccessDenied": + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail=f"Access denied to S3 bucket: {s3_uri}", + ) from exc + else: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"S3 error: {exc.response['Error']['Message']}", + ) from exc + except Exception as exc: + logging.error(f"Failed to upload to S3: {exc}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Unexpected error uploading to S3: {str(exc)}", + ) from exc + + def _is_valid_storage_path(path: str) -> bool: """Validate storage path format""" # Allow S3 paths, local paths, network paths @@ -124,8 +228,14 @@ def create_file( file_create: FileCreate, file_content: bytes | None = None, storage_root: str = "storage", + s3_client=None, ) -> File: - """Create a new file record and optionally store content""" + """ + Create a new file record and store content to appropriate backend. + + Automatically determines storage backend (S3 vs Local) based on + configuration and entity type. + """ # Generate unique file ID file_id = generate_file_id() @@ -133,14 +243,27 @@ def create_file( # Use original_filename if provided, otherwise use filename original_filename = file_create.original_filename or file_create.filename - # Generate file path - file_path = generate_file_path( + # Determine storage backend automatically from configuration + storage_backend, base_uri = determine_storage_backend( + file_create.entity_type, file_create.entity_id + ) + + # Generate file path structure + relative_file_path = generate_file_path( file_create.entity_type, file_create.entity_id, file_create.file_type, f"{file_id}_{file_create.filename}", ) + # Construct full storage path based on backend + if storage_backend == StorageBackend.S3: + # S3 URI: s3://bucket/entity_type/entity_id/file_type/year/month/filename + storage_path = f"{base_uri.rstrip('/')}/{relative_file_path}" + else: + # Local filesystem: relative path + storage_path = relative_file_path + # Calculate file metadata if content is provided file_size = len(file_content) if file_content else None checksum = calculate_file_checksum(file_content) if file_content else None @@ -151,7 +274,7 @@ def create_file( file_id=file_id, filename=file_create.filename, original_filename=original_filename, - file_path=file_path, + file_path=storage_path, file_size=file_size, mime_type=mime_type, checksum=checksum, @@ -161,18 +284,28 @@ def create_file( entity_type=file_create.entity_type, entity_id=file_create.entity_id, is_public=file_create.is_public, - storage_backend=StorageBackend.LOCAL, + storage_backend=storage_backend, ) - # Store file content if provided + # Store file content based on backend if file_content: - # 1. 
Save to standard database storage location - full_path = Path(storage_root) / file_path - full_path.parent.mkdir(parents=True, exist_ok=True) - with open(full_path, "wb") as f: - f.write(file_content) + if storage_backend == StorageBackend.S3: + # Upload to S3 + _upload_to_s3(file_content, storage_path, mime_type, s3_client) + logging.info( + f"File {file_id} uploaded to S3: {storage_path}" + ) + else: + # Write to local filesystem + full_path = Path(storage_root) / relative_file_path + full_path.parent.mkdir(parents=True, exist_ok=True) + with open(full_path, "wb") as f: + f.write(file_content) + logging.info( + f"File {file_id} saved to local storage: {full_path}" + ) - # 2. SPECIAL HANDLING: If samplesheet for run, also save to run folder + # SPECIAL HANDLING: If samplesheet for run, also save to run folder if ( file_create.file_type == FileType.SAMPLESHEET and file_create.entity_type == EntityType.RUN @@ -184,10 +317,12 @@ def create_file( if dual_storage_success: status_note = "[Dual-stored to run folder]" else: - status_note = "[Database-only storage - run folder write failed]" + status_note = "[Database-only - run folder write failed]" if file_record.description: - file_record.description = f"{file_record.description} {status_note}" + file_record.description = ( + f"{file_record.description} {status_note}" + ) else: file_record.description = status_note @@ -241,21 +376,41 @@ def update_file(session: Session, file_id: str, file_update: FileUpdate) -> File return file_record -def delete_file(session: Session, file_id: str, storage_root: str = "storage") -> bool: - """Delete a file record and its content""" +def delete_file( + session: Session, + file_id: str, + storage_root: str = "storage", + s3_client=None, +) -> bool: + """Delete a file record and its content from storage (local or S3)""" file_record = get_file(session, file_id) - # Delete physical file if it exists - full_path = Path(storage_root) / file_record.file_path - if full_path.exists(): - full_path.unlink() - - # Try to remove empty directories + # Delete physical file based on storage backend + if file_record.storage_backend == StorageBackend.S3: + # Delete from S3 try: - full_path.parent.rmdir() - except OSError: - # Directory not empty, that's fine - pass + bucket, key = _parse_s3_path(file_record.file_path) + if s3_client is None: + s3_client = boto3.client("s3") + s3_client.delete_object(Bucket=bucket, Key=key) + logging.info(f"Deleted file from S3: {file_record.file_path}") + except Exception as e: + logging.warning( + f"Failed to delete S3 object {file_record.file_path}: {e}" + ) + # Continue with database deletion even if S3 delete fails + else: + # Delete from local filesystem + full_path = Path(storage_root) / file_record.file_path + if full_path.exists(): + full_path.unlink() + + # Try to remove empty directories + try: + full_path.parent.rmdir() + except OSError: + # Directory not empty, that's fine + pass # Delete from database session.delete(file_record) @@ -265,20 +420,40 @@ def delete_file(session: Session, file_id: str, storage_root: str = "storage") - def get_file_content( - session: Session, file_id: str, storage_root: str = "storage" + session: Session, + file_id: str, + storage_root: str = "storage", + s3_client=None, ) -> bytes: - """Get file content from storage""" + """Get file content from storage (local or S3)""" file_record = get_file(session, file_id) - full_path = Path(storage_root) / file_record.file_path - if not full_path.exists(): - raise HTTPException( - 
status_code=status.HTTP_404_NOT_FOUND, - detail=f"File content not found at {file_record.file_path}", - ) + if file_record.storage_backend == StorageBackend.S3: + # Download from S3 + try: + file_content, _, _ = download_file( + file_record.file_path, s3_client + ) + return file_content + except HTTPException: + raise + except Exception as e: + logging.error(f"Error downloading from S3: {e}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Error retrieving file from S3: {str(e)}", + ) from e + else: + # Read from local filesystem + full_path = Path(storage_root) / file_record.file_path + if not full_path.exists(): + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"File content not found at {file_record.file_path}", + ) - with open(full_path, "rb") as f: - return f.read() + with open(full_path, "rb") as f: + return f.read() def list_files_for_entity( @@ -313,9 +488,13 @@ def get_file_count_for_entity( def update_file_content( - session: Session, file_id: str, content: bytes, storage_root: str = "storage" + session: Session, + file_id: str, + content: bytes, + storage_root: str = "storage", + s3_client=None, ) -> File: - """Update file content""" + """Update file content (local or S3)""" # Get the file record file_record = get_file(session, file_id) @@ -323,10 +502,20 @@ def update_file_content( file_size = len(content) checksum = calculate_file_checksum(content) - # Write content to storage - storage_path = Path(storage_root) / file_record.file_path - storage_path.parent.mkdir(parents=True, exist_ok=True) - storage_path.write_bytes(content) + # Write content to appropriate storage backend + if file_record.storage_backend == StorageBackend.S3: + # Upload to S3 + mime_type = get_mime_type(file_record.filename) + _upload_to_s3(content, file_record.file_path, mime_type, s3_client) + logging.info(f"Updated file content in S3: {file_record.file_path}") + else: + # Write to local filesystem + storage_path = Path(storage_root) / file_record.file_path + storage_path.parent.mkdir(parents=True, exist_ok=True) + storage_path.write_bytes(content) + logging.info( + f"Updated file content in local storage: {storage_path}" + ) # Update file record file_record.file_size = file_size @@ -345,12 +534,12 @@ def browse_filesystem( """ Browse filesystem directory and return structured data. - Automatically detects if the path is S3 (s3://bucket/key) or local filesystem - and routes to the appropriate handler. + Automatically detects if the path is S3 (s3://bucket/key) or local + filesystem and routes to the appropriate handler. """ # Check if this is an S3 path if directory_path.startswith("s3://"): - return browse_s3(directory_path) + return _list_s3(directory_path) # Handle local filesystem return _browse_local_filesystem(directory_path, storage_root)