Merged

59 commits
011d489
first stab at STT evals
AkhileshNegi Jan 30, 2026
7777290
Merge branch 'main' of github.com:ProjectTech4DevAI/kaapi-backend int…
AkhileshNegi Jan 30, 2026
d8df80c
Merge branch 'main' of github.com:ProjectTech4DevAI/kaapi-backend int…
AkhileshNegi Jan 31, 2026
f1df7f9
fix migration naming
AkhileshNegi Jan 31, 2026
cda0611
fixing endpoints
AkhileshNegi Jan 31, 2026
ad5779f
update dataset endpoint
AkhileshNegi Jan 31, 2026
01e2beb
update types
AkhileshNegi Jan 31, 2026
1637007
updated dataset with URL
AkhileshNegi Jan 31, 2026
36af7e9
added few more testcases
AkhileshNegi Jan 31, 2026
78fd206
added storage to core for easy reuse
AkhileshNegi Jan 31, 2026
4ac2ca6
cleanup for audio duration
AkhileshNegi Jan 31, 2026
d8b531c
first stab at fixing celery task to cron
AkhileshNegi Jan 31, 2026
2295da5
added gemini as provider
AkhileshNegi Feb 2, 2026
25e6002
moving to batch job in gemini
AkhileshNegi Feb 2, 2026
db2512e
code refactoring, using batch requests and files similar to OpenAI
AkhileshNegi Feb 2, 2026
ff29ddd
few cleanups
AkhileshNegi Feb 2, 2026
cd979fd
updated migration
AkhileshNegi Feb 3, 2026
b6c633a
cleanup config for batch
AkhileshNegi Feb 3, 2026
b6e6649
moved documentation to separate folder
AkhileshNegi Feb 3, 2026
719584d
updated score format in stt result
AkhileshNegi Feb 3, 2026
bf0b4c2
cleaner dataset sample count
AkhileshNegi Feb 3, 2026
68e6821
got rid of redundant sample count
AkhileshNegi Feb 3, 2026
2247faa
removed deadcode
AkhileshNegi Feb 3, 2026
056612c
removing more redundant code
AkhileshNegi Feb 3, 2026
13bb9cc
clean few more cruds
AkhileshNegi Feb 3, 2026
7bbf811
more free from dead code
AkhileshNegi Feb 3, 2026
04e419c
cleanup batch request code
AkhileshNegi Feb 3, 2026
09deab2
cleanup batch
AkhileshNegi Feb 3, 2026
f6bf0c2
got rid of processed_samples as well
AkhileshNegi Feb 3, 2026
d20084b
cleanup provider_metadata from results
AkhileshNegi Feb 3, 2026
4afdd2d
cleanup optimize results
AkhileshNegi Feb 4, 2026
3e62a98
cleanup queries
AkhileshNegi Feb 4, 2026
63de270
cleanup leftovers
AkhileshNegi Feb 4, 2026
c95c044
added validation for provider
AkhileshNegi Feb 4, 2026
9aa6858
updated test suite
AkhileshNegi Feb 4, 2026
4a92416
coderabbit suggestions
AkhileshNegi Feb 4, 2026
e204416
added few more testcases
AkhileshNegi Feb 4, 2026
0210dab
added more testcases for coverage
AkhileshNegi Feb 4, 2026
cce5f11
moving to file table
AkhileshNegi Feb 5, 2026
497427e
Merge branch 'main' into feature/stt-evaluation
AkhileshNegi Feb 6, 2026
0d5a0f8
Merge branch 'feature/stt-evaluation' of github.com:ProjectTech4DevAI…
AkhileshNegi Feb 6, 2026
066f645
update migration
AkhileshNegi Feb 6, 2026
a3428df
updating with language id
AkhileshNegi Feb 6, 2026
5dcf743
updated testcases
AkhileshNegi Feb 6, 2026
d07f6fa
cleanup code
AkhileshNegi Feb 6, 2026
7f8cfaa
removed language_id from evaluation run
AkhileshNegi Feb 6, 2026
e357949
updated provider as gemini
AkhileshNegi Feb 6, 2026
0dabf82
added support for multiple provider
AkhileshNegi Feb 8, 2026
3cabb49
updated doc
AkhileshNegi Feb 8, 2026
5455139
merging with master
AkhileshNegi Feb 9, 2026
9587d44
updating migration
AkhileshNegi Feb 9, 2026
ee9fbc8
updated testcase
AkhileshNegi Feb 9, 2026
b9a92d0
updated testcase
AkhileshNegi Feb 9, 2026
d02b702
cleanup few things
AkhileshNegi Feb 9, 2026
94c4000
cleaned up unnecessary wrapper
AkhileshNegi Feb 9, 2026
ececf9a
reusing same status for stt results
AkhileshNegi Feb 9, 2026
2588bdf
updated routes
AkhileshNegi Feb 9, 2026
e3f4fec
update routs
AkhileshNegi Feb 9, 2026
f11bfea
coderabbit cleanups
AkhileshNegi Feb 9, 2026
472 changes: 472 additions & 0 deletions backend/app/alembic/versions/045_add_stt_evaluation_tables.py

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions backend/app/api/docs/stt_evaluation/create_dataset.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Create a new STT evaluation dataset with audio samples.

Each sample requires:
- **object_store_url**: S3 URL of the audio file (from /evaluations/stt/files endpoint)
- **ground_truth**: Reference transcription (optional, for WER/CER metrics)
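
A dataset-creation body matching the fields above might look like the following sketch. The endpoint path and the `name`/`description`/`language_id` fields are assumptions drawn from the routes and `STTDatasetCreate` usage elsewhere in this PR; only `object_store_url` and `ground_truth` are stated in this doc:

```python
# Hypothetical body for POST /evaluations/stt/datasets (path assumed).
payload = {
    "name": "hindi-call-recordings-v1",
    "description": "Field recordings for WER benchmarking",
    "language_id": 1,  # optional; validated against the languages table
    "samples": [
        {
            # S3 URL returned by the /evaluations/stt/files endpoint
            "object_store_url": "s3://bucket/audio/sample-001.wav",
            # Optional reference transcription, used for WER/CER metrics
            "ground_truth": "namaste, aap kaise hain?",
        },
        {
            "object_store_url": "s3://bucket/audio/sample-002.wav",
            # ground_truth omitted: sample is review-only, no WER/CER
        },
    ],
}
```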
1 change: 1 addition & 0 deletions backend/app/api/docs/stt_evaluation/get_dataset.md
@@ -0,0 +1 @@
Get an STT dataset with its samples.
1 change: 1 addition & 0 deletions backend/app/api/docs/stt_evaluation/get_result.md
@@ -0,0 +1 @@
Get a single STT transcription result.
1 change: 1 addition & 0 deletions backend/app/api/docs/stt_evaluation/get_run.md
@@ -0,0 +1 @@
Get an STT evaluation run with its results.
1 change: 1 addition & 0 deletions backend/app/api/docs/stt_evaluation/list_datasets.md
@@ -0,0 +1 @@
List all STT evaluation datasets for the current project.
1 change: 1 addition & 0 deletions backend/app/api/docs/stt_evaluation/list_runs.md
@@ -0,0 +1 @@
List all STT evaluation runs for the current project.
8 changes: 8 additions & 0 deletions backend/app/api/docs/stt_evaluation/start_evaluation.md
@@ -0,0 +1,8 @@
Start an STT evaluation run on a dataset.

The evaluation will:
1. Process each audio sample through the specified providers
2. Generate transcriptions using Gemini Batch API
3. Store results for human review

**Supported providers:** gemini-2.5-pro
Comment on lines +1 to +8
⚠️ Potential issue | 🟡 Minor

Documentation uses "providers" but the API model uses "models".

STTEvaluationRunCreate defines the field as models: list[str], but this doc references "providers" (lines 4 and 8). Update the terminology to match the API contract to avoid confusing consumers.

Proposed fix
 Start an STT evaluation run on a dataset.
 
 The evaluation will:
-1. Process each audio sample through the specified providers
+1. Process each audio sample through the specified models
 2. Generate transcriptions using Gemini Batch API
 3. Store results for human review
 
-**Supported providers:** gemini-2.5-pro
+**Supported models:** gemini-2.5-pro
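
Putting the review's point into practice, a run-creation body would use the `models` field that `STTEvaluationRunCreate` defines as `models: list[str]`. The endpoint path and `dataset_id` field name below are illustrative assumptions; `gemini-2.5-pro` is the only model this PR lists as supported:

```python
# Hypothetical body for POST /evaluations/stt/runs against dataset 42.
run_request = {
    "dataset_id": 42,
    # STTEvaluationRunCreate field: models: list[str]
    "models": ["gemini-2.5-pro"],
}
```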

5 changes: 5 additions & 0 deletions backend/app/api/docs/stt_evaluation/update_feedback.md
@@ -0,0 +1,5 @@
Update human feedback on an STT transcription result.

**Fields:**
- **is_correct**: Boolean indicating if the transcription is correct
- **comment**: Optional feedback comment explaining issues or observations
7 changes: 7 additions & 0 deletions backend/app/api/docs/stt_evaluation/upload_audio.md
@@ -0,0 +1,7 @@
Upload a single audio file to S3 for STT evaluation.

**Supported formats:** mp3, wav, flac, m4a, ogg, webm

**Maximum file size:** 200 MB

Returns the S3 URL which can be used when creating an STT dataset.
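
A client-side pre-check mirroring the documented limits can save a round trip; this is only a convenience sketch, since the server enforces its own validation:

```python
# Formats and size limit taken from the doc above.
ALLOWED_FORMATS = {"mp3", "wav", "flac", "m4a", "ogg", "webm"}
MAX_FILE_BYTES = 200 * 1024 * 1024  # 200 MB


def can_upload(filename: str, size_bytes: int) -> bool:
    """Return True if the file passes the documented format and size limits."""
    ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else ""
    return ext in ALLOWED_FORMATS and 0 < size_bytes <= MAX_FILE_BYTES
```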
5 changes: 2 additions & 3 deletions backend/app/api/main.py
Expand Up @@ -25,7 +25,7 @@
model_evaluation,
collection_job,
)
from app.api.routes.evaluations import dataset as evaluation_dataset, evaluation
from app.api.routes import evaluations
from app.core.config import settings

api_router = APIRouter()
Expand All @@ -38,8 +38,7 @@
api_router.include_router(cron.router)
api_router.include_router(documents.router)
api_router.include_router(doc_transformation_job.router)
api_router.include_router(evaluation_dataset.router)
api_router.include_router(evaluation.router)
api_router.include_router(evaluations.router)
api_router.include_router(languages.router)
api_router.include_router(llm.router)
api_router.include_router(login.router)
Expand Down
12 changes: 12 additions & 0 deletions backend/app/api/routes/evaluations/__init__.py
@@ -0,0 +1,12 @@
"""Main router for evaluation API routes."""

from fastapi import APIRouter

from app.api.routes.evaluations import dataset, evaluation
from app.api.routes.stt_evaluations.router import router as stt_router

router = APIRouter()

router.include_router(evaluation.router)
router.include_router(dataset.router)
router.include_router(stt_router)
5 changes: 5 additions & 0 deletions backend/app/api/routes/stt_evaluations/__init__.py
@@ -0,0 +1,5 @@
"""STT Evaluation API routes."""

from .router import router

__all__ = ["router"]
193 changes: 193 additions & 0 deletions backend/app/api/routes/stt_evaluations/dataset.py
@@ -0,0 +1,193 @@
"""STT dataset API routes."""

import logging

from fastapi import APIRouter, Body, Depends, HTTPException, Query

from app.api.deps import AuthContextDep, SessionDep
from app.api.permissions import Permission, require_permission
from app.crud.file import get_files_by_ids
from app.crud.language import get_language_by_id
from app.crud.stt_evaluations import (
get_stt_dataset_by_id,
list_stt_datasets,
get_samples_by_dataset_id,
)
from app.models.stt_evaluation import (
STTDatasetCreate,
STTDatasetPublic,
STTDatasetWithSamples,
STTSamplePublic,
)
from app.services.stt_evaluations.dataset import upload_stt_dataset
from app.utils import APIResponse, load_description

logger = logging.getLogger(__name__)

router = APIRouter()


@router.post(
"/datasets",
response_model=APIResponse[STTDatasetPublic],
dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
summary="Create STT dataset",
description=load_description("stt_evaluation/create_dataset.md"),
)
def create_dataset(
_session: SessionDep,
auth_context: AuthContextDep,
dataset_create: STTDatasetCreate = Body(...),
) -> APIResponse[STTDatasetPublic]:
"""Create an STT evaluation dataset."""
# Validate language_id if provided
if dataset_create.language_id is not None:
language = get_language_by_id(
session=_session, language_id=dataset_create.language_id
)
if not language:
raise HTTPException(
status_code=400, detail="Invalid language_id: language not found"
)

dataset, samples = upload_stt_dataset(
session=_session,
name=dataset_create.name,
samples=dataset_create.samples,
organization_id=auth_context.organization_.id,
project_id=auth_context.project_.id,
description=dataset_create.description,
language_id=dataset_create.language_id,
)

return APIResponse.success_response(
data=STTDatasetPublic(
id=dataset.id,
name=dataset.name,
description=dataset.description,
type=dataset.type,
language_id=dataset.language_id,
object_store_url=dataset.object_store_url,
dataset_metadata=dataset.dataset_metadata,
sample_count=len(samples),
organization_id=dataset.organization_id,
project_id=dataset.project_id,
inserted_at=dataset.inserted_at,
updated_at=dataset.updated_at,
)
)


@router.get(
"/datasets",
response_model=APIResponse[list[STTDatasetPublic]],
dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
summary="List STT datasets",
description=load_description("stt_evaluation/list_datasets.md"),
)
def list_datasets(
_session: SessionDep,
auth_context: AuthContextDep,
limit: int = Query(50, ge=1, le=100, description="Maximum results to return"),
offset: int = Query(0, ge=0, description="Number of results to skip"),
) -> APIResponse[list[STTDatasetPublic]]:
"""List STT evaluation datasets."""
datasets, total = list_stt_datasets(
session=_session,
org_id=auth_context.organization_.id,
project_id=auth_context.project_.id,
limit=limit,
offset=offset,
)

return APIResponse.success_response(
data=datasets,
metadata={"total": total, "limit": limit, "offset": offset},
)


@router.get(
"/datasets/{dataset_id}",
response_model=APIResponse[STTDatasetWithSamples],
dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
summary="Get STT dataset",
description=load_description("stt_evaluation/get_dataset.md"),
)
def get_dataset(
_session: SessionDep,
auth_context: AuthContextDep,
dataset_id: int,
include_samples: bool = Query(True, description="Include samples in response"),
sample_limit: int = Query(100, ge=1, le=1000, description="Max samples to return"),
sample_offset: int = Query(0, ge=0, description="Sample offset"),
) -> APIResponse[STTDatasetWithSamples]:
"""Get an STT evaluation dataset."""
dataset = get_stt_dataset_by_id(
session=_session,
dataset_id=dataset_id,
org_id=auth_context.organization_.id,
project_id=auth_context.project_.id,
)

if not dataset:
raise HTTPException(status_code=404, detail="Dataset not found")

samples = []
samples_total = (dataset.dataset_metadata or {}).get("sample_count", 0)

if include_samples:
sample_records = get_samples_by_dataset_id(
session=_session,
dataset_id=dataset_id,
org_id=auth_context.organization_.id,
project_id=auth_context.project_.id,
limit=sample_limit,
offset=sample_offset,
)

# Fetch file records to get object_store_url
file_ids = [s.file_id for s in sample_records]
file_records = get_files_by_ids(
session=_session,
file_ids=file_ids,
organization_id=auth_context.organization_.id,
project_id=auth_context.project_.id,
)
file_map = {f.id: f for f in file_records}

samples = [
STTSamplePublic(
id=s.id,
file_id=s.file_id,
object_store_url=file_map.get(s.file_id).object_store_url
if s.file_id in file_map
else None,
language_id=s.language_id,
ground_truth=s.ground_truth,
sample_metadata=s.sample_metadata,
dataset_id=s.dataset_id,
organization_id=s.organization_id,
project_id=s.project_id,
inserted_at=s.inserted_at,
updated_at=s.updated_at,
)
for s in sample_records
]

return APIResponse.success_response(
data=STTDatasetWithSamples(
id=dataset.id,
name=dataset.name,
description=dataset.description,
type=dataset.type,
language_id=dataset.language_id,
object_store_url=dataset.object_store_url,
dataset_metadata=dataset.dataset_metadata,
organization_id=dataset.organization_id,
project_id=dataset.project_id,
inserted_at=dataset.inserted_at,
updated_at=dataset.updated_at,
samples=samples,
),
metadata={"samples_total": samples_total},
)
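
The `limit`/`offset` contract in `list_datasets` above implies a simple client-side pagination loop. This sketch simulates it against an in-memory list, with the HTTP call replaced by a slice and the `{"total": ..., "limit": ..., "offset": ...}` metadata reduced to a returned total; names here are illustrative, not part of the PR:

```python
def fetch_page(all_rows: list, limit: int, offset: int) -> tuple[list, int]:
    """Stand-in for GET /datasets?limit=&offset= — returns (data, total)."""
    return all_rows[offset : offset + limit], len(all_rows)


def fetch_all(all_rows: list, limit: int = 50) -> list:
    """Walk pages until the offset passes the reported total."""
    out: list = []
    offset = 0
    while True:
        page, total = fetch_page(all_rows, limit, offset)
        out.extend(page)
        offset += limit
        if offset >= total or not page:
            return out
```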