Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions backend/configs/development.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ llm_assistant:
sentence_annotation:
few_shot_threshold: 4
model_training_threshold: 100
span_annotation:
few_shot_threshold: 4
model_training_threshold: 100

cota:
model: "paraphrase-multilingual-mpnet-base-v2"
Expand Down
3 changes: 3 additions & 0 deletions backend/configs/production.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ llm_assistant:
sentence_annotation:
few_shot_threshold: 4
model_training_threshold: 100
span_annotation:
few_shot_threshold: 4
model_training_threshold: 100

cota:
model: "paraphrase-multilingual-mpnet-base-v2"
Expand Down
26 changes: 26 additions & 0 deletions backend/src/core/annotation/span_annotation_crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,32 @@ def read_by_code_and_user(

return query.all()

def read_by_codes(
    self,
    db: Session,
    *,
    code_ids: list[int],
) -> list[SpanAnnotationORM]:
    """Return every span annotation whose code is one of ``code_ids``.

    An empty ``code_ids`` short-circuits to an empty list without
    touching the database.
    """
    if code_ids:
        has_requested_code = self.model.code_id.in_(code_ids)
        return db.query(self.model).filter(has_requested_code).all()

    return []

def read_by_user_sdocs_codes(
    self, db: Session, *, user_id: int, sdoc_ids: list[int], code_ids: list[int]
) -> list[SpanAnnotationORM]:
    """Return one user's span annotations limited to given documents and codes.

    The query joins each span annotation to its annotation document and
    keeps rows where the annotation document belongs to ``user_id``, its
    source document is one of ``sdoc_ids`` and the annotation's code is
    one of ``code_ids``.

    Args:
        db: Session used to run the query.
        user_id: Owner of the annotation documents to search.
        sdoc_ids: Source document IDs to restrict the search to.
        code_ids: Code IDs to restrict the search to.

    Returns:
        The matching span annotations; an empty list when ``sdoc_ids`` or
        ``code_ids`` is empty.
    """
    # An empty IN-list can never match a row, so skip the database round
    # trip entirely (same guard as read_by_codes).
    if not sdoc_ids or not code_ids:
        return []

    query = (
        db.query(self.model)
        .join(self.model.annotation_document)
        .filter(
            AnnotationDocumentORM.user_id == user_id,
            AnnotationDocumentORM.source_document_id.in_(sdoc_ids),
            self.model.code_id.in_(code_ids),
        )
    )
    return query.all()

def update(
self, db: Session, *, id: int, update_dto: SpanAnnotationUpdate
) -> SpanAnnotationORM:
Expand Down
24 changes: 24 additions & 0 deletions backend/src/core/metadata/project_metadata_crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from common.meta_type import MetaType
from config import conf
from core.metadata.project_metadata_dto import (
ProjectMetadataBulkUpdate,
ProjectMetadataCreate,
ProjectMetadataUpdate,
)
Expand Down Expand Up @@ -170,6 +171,29 @@ def update(

return metadata_orm

def update_bulk(
    self, db: Session, *, update_dtos: list[ProjectMetadataBulkUpdate]
) -> list[ProjectMetadataORM]:
    """Apply every bulk-update DTO through ``self.update``.

    Each DTO's ``id`` selects the record to change; its remaining fields
    are re-packed into a ``ProjectMetadataUpdate``. Updates run one at a
    time, and the results are returned in input order.
    """
    return [
        self.update(
            db=db,
            metadata_id=dto.id,
            update_dto=ProjectMetadataUpdate(**dto.model_dump(exclude={"id"})),
        )
        for dto in update_dtos
    ]

### DELETE OPERATIONS ###
def delete_bulk(self, db: Session, *, ids: list[int]) -> list[ProjectMetadataORM]:
    """Delete each record in ``ids`` through ``self.delete``.

    Deletions run one at a time; the values returned by ``self.delete``
    are collected in the same order as ``ids``.
    """
    return [self.delete(db=db, id=metadata_id) for metadata_id in ids]

### OTHER OPERATIONS ###

def exists_by_project_and_key_and_metatype_and_doctype(
Expand Down
4 changes: 4 additions & 0 deletions backend/src/core/metadata/project_metadata_dto.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ class ProjectMetadataUpdate(BaseModel, UpdateDTOBase):
)


# One entry of a bulk update request: the fields inherited from
# ProjectMetadataBaseDTO plus the ID of the record they apply to.
class ProjectMetadataBulkUpdate(ProjectMetadataBaseDTO, UpdateDTOBase):
    id: int = Field(
        description="ID of the ProjectMetadata",
    )


# Properties for reading (as in ORM)
class ProjectMetadataRead(ProjectMetadataBaseDTO):
id: int = Field(description="ID of the ProjectMetadata")
Expand Down
87 changes: 87 additions & 0 deletions backend/src/core/metadata/project_metadata_endpoint.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from collections import Counter

from fastapi import APIRouter, Depends
from sqlalchemy.orm import Session

Expand All @@ -6,10 +8,13 @@
from core.auth.authz_user import AuthzUser
from core.metadata.project_metadata_crud import crud_project_meta
from core.metadata.project_metadata_dto import (
ProjectMetadataBulkUpdate,
ProjectMetadataCreate,
ProjectMetadataRead,
ProjectMetadataUpdate,
)
from core.metadata.source_document_metadata_crud import crud_sdoc_meta
from core.metadata.source_document_metadata_dto import MetadataFrequencyRead

router = APIRouter(
prefix="/projmeta",
Expand Down Expand Up @@ -90,6 +95,24 @@ def update_by_id(
return ProjectMetadataRead.model_validate(db_obj)


@router.patch(
    "/bulk/update",
    response_model=list[ProjectMetadataRead],
    summary="Updates multiple project metadata at once.",
)
def update_bulk(
    *,
    db: Session = Depends(get_db_session),
    metadatas: list[ProjectMetadataBulkUpdate],
    authz_user: AuthzUser = Depends(),
) -> list[ProjectMetadataRead]:
    # Authorize against every targeted metadata ID before anything is written.
    target_ids = [entry.id for entry in metadatas]
    authz_user.assert_in_same_project_as_many(Crud.PROJECT_METADATA, target_ids)

    # Delegate the updates to the CRUD layer, then convert ORM rows to read DTOs.
    updated_rows = crud_project_meta.update_bulk(db=db, update_dtos=metadatas)
    return list(map(ProjectMetadataRead.model_validate, updated_rows))


@router.delete(
"/{metadata_id}",
response_model=ProjectMetadataRead,
Expand All @@ -105,3 +128,67 @@ def delete_by_id(

db_obj = crud_project_meta.delete(db=db, id=metadata_id)
return ProjectMetadataRead.model_validate(db_obj)


@router.delete(
    "/bulk/delete",
    response_model=list[ProjectMetadataRead],
    summary="Deletes all ProjectMetadata with the given IDs.",
)
def delete_bulk_by_id(
    *,
    db: Session = Depends(get_db_session),
    metadata_ids: list[int],
    authz_user: AuthzUser = Depends(),
) -> list[ProjectMetadataRead]:
    # The authorization check covers all requested IDs and runs before any delete.
    authz_user.assert_in_same_project_as_many(Crud.PROJECT_METADATA, metadata_ids)

    # The CRUD layer returns the deleted rows; convert each to its read DTO.
    deleted_rows = crud_project_meta.delete_bulk(db=db, ids=metadata_ids)
    return list(map(ProjectMetadataRead.model_validate, deleted_rows))


@router.get(
    "/{proj_metadata_id}/frequencies",
    response_model=list[MetadataFrequencyRead],
    summary="Returns a frequency count of all values for a specific ProjectMetadata definition.",
)
def get_metadata_frequencies(
    *,
    db: Session = Depends(get_db_session),
    proj_metadata_id: int,
    authz_user: AuthzUser = Depends(),
) -> list[MetadataFrequencyRead]:
    authz_user.assert_in_same_project_as(Crud.PROJECT_METADATA, proj_metadata_id)

    sdoc_metas = crud_sdoc_meta.read_by_project_metadata(
        db=db, proj_metadata_id=proj_metadata_id
    )

    def stored_value(row):
        # Pick the first populated value column, checked in a fixed order.
        # Scalar columns are returned exactly as stored.
        for column in ("int_value", "str_value", "boolean_value"):
            candidate = getattr(row, column)
            if candidate is not None:
                return candidate
        if row.date_value is not None:
            return row.date_value.isoformat()
        if row.list_value is not None:
            # Lists are unhashable; stringify them so they can be counted.
            return str(row.list_value)
        # Rows with no populated column are tallied under None.
        return None

    value_counts = Counter(map(stored_value, sdoc_metas))
    if not value_counts:
        # No rows at all: nothing to report (and no division by zero below).
        return []

    total_count = sum(value_counts.values())
    frequencies = []
    for val, occurrences in value_counts.items():
        # The share is a fraction of all rows, rounded to two decimals.
        share = round((occurrences / total_count), 2)
        frequencies.append(
            MetadataFrequencyRead(value=val, count=occurrences, percentage=share)
        )
    return frequencies
35 changes: 35 additions & 0 deletions backend/src/core/metadata/source_document_metadata_crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,41 @@ def read_by_project(
db_objs = query.all()
return db_objs

def read_by_project_metadata(
    self,
    db: Session,
    *,
    proj_metadata_id: int,
    skip: int | None = None,
    limit: int | None = None,
) -> list[SourceDocumentMetadataORM]:
    """Return the SourceDocumentMetadata rows of one ProjectMetadata definition.

    ``skip`` and ``limit`` are applied as OFFSET / LIMIT only when they are
    not ``None``.
    """
    # NOTE(review): the query has no ORDER BY, so pages selected through
    # skip/limit are not guaranteed to be stable between calls - confirm
    # whether callers rely on a particular order.
    of_definition = SourceDocumentMetadataORM.project_metadata_id == proj_metadata_id
    query = db.query(self.model).filter(of_definition)

    if skip is not None:
        query = query.offset(skip)

    if limit is not None:
        query = query.limit(limit)

    return query.all()

def read_by_key(
    self,
    db: Session,
    *,
    key: str,
) -> list[SourceDocumentMetadataORM]:
    """Return all SourceDocumentMetadata rows whose ProjectMetadata has ``key``.

    The only filter is the key itself; the query is not restricted to a
    particular project.
    """
    joined = db.query(self.model).join(SourceDocumentMetadataORM.project_metadata)
    return joined.filter(ProjectMetadataORM.key == key).all()

def read_by_sdoc_and_key(
self,
db: Session,
Expand Down
10 changes: 10 additions & 0 deletions backend/src/core/metadata/source_document_metadata_dto.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,3 +281,13 @@ def with_value(
source_document_id=source_document_id,
project_metadata=project_metadata,
)


# One row of a value-frequency table: a distinct metadata value, how often
# it occurs, and its share of all counted entries.
class MetadataFrequencyRead(BaseModel):
    # The distinct value being counted (may be None).
    value: str | int | float | bool | None = Field(
        description="The unique metadata value",
    )
    # Absolute number of occurrences.
    count: int = Field(
        description="Number of documents that have this value",
    )
    # Relative share expressed as a fraction, not as 0-100.
    percentage: float = Field(
        description="Percentage of documents that have this value (between 0 and 1)",
    )
34 changes: 34 additions & 0 deletions backend/src/core/metadata/source_document_metadata_endpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,3 +155,37 @@ def delete_by_id(
authz_user.assert_in_same_project_as(Crud.SOURCE_DOCUMENT_METADATA, metadata_id)
db_obj = crud_sdoc_meta.delete(db=db, id=metadata_id)
return SourceDocumentMetadataRead.model_validate(db_obj)


@router.get(
    "/project-metadata/{proj_metadata_id}",
    response_model=list[SourceDocumentMetadataRead],
    summary="Returns all SourceDocumentMetadata values for a specific ProjectMetadata definition.",
)
def get_all_values_by_project_metadata(
    *,
    db: Session = Depends(get_db_session),
    proj_metadata_id: int,
    authz_user: AuthzUser = Depends(),
) -> list[SourceDocumentMetadataRead]:
    # Access is checked against the ProjectMetadata definition, not the
    # individual SourceDocumentMetadata rows.
    authz_user.assert_in_same_project_as(Crud.PROJECT_METADATA, proj_metadata_id)

    # No skip/limit is passed, so every row of the definition is returned.
    rows = crud_sdoc_meta.read_by_project_metadata(
        db=db,
        proj_metadata_id=proj_metadata_id,
    )
    return list(map(SourceDocumentMetadataRead.model_validate, rows))


@router.get(
    "/project/{project_id}/bulk",
    response_model=list[SourceDocumentMetadataRead],
    summary="Returns ALL SourceDocumentMetadata for ALL documents within a specific project.",
)
def get_all_sdoc_metadata_for_project(
    *,
    db: Session = Depends(get_db_session),
    project_id: int,
    authz_user: AuthzUser = Depends(),
) -> list[SourceDocumentMetadataRead]:
    # The caller must pass the project authorization check before any read.
    authz_user.assert_in_project(project_id)

    # Fetch the project's metadata rows and convert each to its read DTO.
    rows = crud_sdoc_meta.read_by_project(db=db, proj_id=project_id)
    return list(map(SourceDocumentMetadataRead.model_validate, rows))
3 changes: 3 additions & 0 deletions backend/src/modules/llm_assistant/llm_job_dto.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ class AnnotationParams(DocumentBasedTaskParams):
code_ids: list[int] = Field(
description="IDs of the codes to use for the annotation"
)
delete_existing_annotations: bool = Field(
description="Delete existing annotations before creating new ones", default=True
)


class SentenceAnnotationParams(DocumentBasedTaskParams):
Expand Down
Loading