diff --git a/backend/app/routers/translation_memory.py b/backend/app/routers/translation_memory.py index 5ac66f5..128cb8c 100644 --- a/backend/app/routers/translation_memory.py +++ b/backend/app/routers/translation_memory.py @@ -20,7 +20,7 @@ def get_memory_by_id(db: Session, memory_id: int): doc = TranslationMemoryQuery(db).get_memory(memory_id) if not doc: raise HTTPException( - status_code=status.HTTP_404_NOT_FOUND, detail="Document not found" + status_code=status.HTTP_404_NOT_FOUND, detail="Memory not found" ) return doc @@ -53,20 +53,41 @@ def get_memory_records( tm_id: int, db: Annotated[Session, Depends(get_db)], page: Annotated[int | None, Query(ge=0)] = None, -) -> list[schema.TranslationMemoryRecord]: + query: Annotated[str | None, Query()] = None, +) -> schema.TranslationMemoryListResponse: page_records: Final = 100 if not page: page = 0 get_memory_by_id(db, tm_id) - return [ - schema.TranslationMemoryRecord( - id=record.id, source=record.source, target=record.target - ) - for record in TranslationMemoryQuery(db).get_memory_records_paged( - tm_id, page, page_records - ) - ] + records, count = TranslationMemoryQuery(db).get_memory_records_paged( + tm_id, page, page_records, query + ) + return schema.TranslationMemoryListResponse( + records=records, + page=page, + total_records=count, + ) + + +@router.get("/{tm_id}/records/similar") +def get_memory_records_similar( + tm_id: int, + db: Annotated[Session, Depends(get_db)], + query: Annotated[str, Query()], +) -> schema.TranslationMemoryListSimilarResponse: + page_records: Final = 20 + + get_memory_by_id(db, tm_id) + records = TranslationMemoryQuery(db).get_memory_records_paged_similar( + tm_id, page_records, query + ) + return schema.TranslationMemoryListSimilarResponse( + records=records, + page=0, + # this is incorrect in general case, but for 20 records is fine + total_records=len(records), + ) @router.post("/upload") diff --git a/backend/app/translation_memory/query.py b/backend/app/translation_memory/query.py index 5a0ff5f..16f2c54 100644 --- a/backend/app/translation_memory/query.py +++ b/backend/app/translation_memory/query.py @@ -4,8 +4,9 @@ from sqlalchemy import func, select, text from sqlalchemy.orm import Session +from app.translation_memory import schema + from .models import TranslationMemory, TranslationMemoryRecord -from .schema import MemorySubstitution class TranslationMemoryQuery: @@ -35,15 +36,72 @@ def get_memory_records_count(self, memory_id: int) -> int: ).scalar_one() def get_memory_records_paged( - self, memory_id: int, page: int, page_records: int - ) -> Iterable[TranslationMemoryRecord]: - return self.__db.execute( - select(TranslationMemoryRecord) - .filter(TranslationMemoryRecord.document_id == memory_id) - .order_by(TranslationMemoryRecord.id) - .offset(page_records * page) - .limit(page_records) - ).scalars() + self, + memory_id: int, + page: int, + page_records: int, + query: str | None, + ) -> tuple[list[schema.TranslationMemoryRecord], int]: + filters = [TranslationMemoryRecord.document_id == memory_id] + if query: + filters.append(TranslationMemoryRecord.source.ilike(f"%{query}%")) + + count = self.__db.execute( + select( + func.count(TranslationMemoryRecord.id), + ).filter(*filters) + ).scalar_one() + + return [ + schema.TranslationMemoryRecord( + id=scalar.id, source=scalar.source, target=scalar.target + ) + for scalar in self.__db.execute( + select(TranslationMemoryRecord) + .filter(*filters) + .order_by(TranslationMemoryRecord.id) + .offset(page_records * page) + .limit(page_records) + ).scalars() + ], count + + def get_memory_records_paged_similar( + self, + memory_id: int, + page_records: int, + query: str, + ) -> list[schema.TranslationMemoryRecordWithSimilarity]: + # Use the same approach as get_substitutions but with different parameters + similarity_func = func.similarity(TranslationMemoryRecord.source, query) + + # Set similarity threshold to 0.25 (25%) as required + self.__db.execute( + text("SET pg_trgm.similarity_threshold TO :threshold"), + {"threshold": 0.25}, + ) + + return [ + schema.TranslationMemoryRecordWithSimilarity( + id=scalar.id, + source=scalar.source, + target=scalar.target, + similarity=scalar.similarity, + ) + for scalar in self.__db.execute( + select( + TranslationMemoryRecord.id, + TranslationMemoryRecord.source, + TranslationMemoryRecord.target, + similarity_func, + ) + .filter( + TranslationMemoryRecord.document_id == memory_id, + TranslationMemoryRecord.source.op("%")(query), + ) + .order_by(similarity_func.desc()) + .limit(page_records) + ).all() + ] def get_substitutions( self, @@ -51,7 +109,7 @@ def get_substitutions( tm_ids: list[int], threshold: float = 0.75, count: int = 10, - ) -> list[MemorySubstitution]: + ) -> list[schema.MemorySubstitution]: similarity_func = func.similarity(TranslationMemoryRecord.source, source) self.__db.execute( text("SET pg_trgm.similarity_threshold TO :threshold"), @@ -72,7 +130,7 @@ def get_substitutions( ).all() return [ - MemorySubstitution( + schema.MemorySubstitution( source=record.source, target=record.target, similarity=record.similarity ) for record in records diff --git a/backend/app/translation_memory/schema.py b/backend/app/translation_memory/schema.py index 3c15d01..3ba36b7 100644 --- a/backend/app/translation_memory/schema.py +++ b/backend/app/translation_memory/schema.py @@ -30,5 +30,21 @@ class TranslationMemoryRecord(Identified): target: str +class TranslationMemoryListResponse(BaseModel): + records: list[TranslationMemoryRecord] + page: int + total_records: int + + +class TranslationMemoryRecordWithSimilarity(TranslationMemoryRecord): + similarity: float + + +class TranslationMemoryListSimilarResponse(BaseModel): + records: list[TranslationMemoryRecordWithSimilarity] + page: int + total_records: int + + class TranslationMemoryCreationSettings(BaseModel): name: str = Field(min_length=1) diff --git a/backend/app/translators/llm.py b/backend/app/translators/llm.py index 8ed72fe..f76c8e7 100644 --- a/backend/app/translators/llm.py +++ b/backend/app/translators/llm.py @@ -14,7 +14,7 @@ def generate_prompt_prologue() -> str: if not settings.llm_prompt: - logging.error('No LLM prompt configured') + logging.error("No LLM prompt configured") return settings.llm_prompt @@ -55,22 +55,28 @@ def generate_prompt( return "\n\n".join(parts), len(task_lines) +SEG_MATCHER = re.compile(r"(.*)") + + def parse_lines(network_out: str, expected_size: int) -> tuple[list[str], bool]: - output = [] + output: list[str] = [] split = network_out.strip().splitlines() if len(split) != expected_size: logging.warning("Unexpected LLM output, not enough lines returned %s", split) return [], False + failed = False for line in split: - m = re.match(r"(.*)", line) + m = re.match(SEG_MATCHER, line) if not m: - logging.warning("Unexpected LLM output, not match found in %s", line) - return [], False + logging.warning("Unexpected LLM output, no match found in %s", line) + output.append("") + failed = True + continue output.append(m.group(1)) - return output, True + return output, not failed def translate_lines( diff --git a/backend/tests/routers/test_routes_tms.py b/backend/tests/routers/test_routes_tms.py index 6ab0a7d..f8f6626 100644 --- a/backend/tests/routers/test_routes_tms.py +++ b/backend/tests/routers/test_routes_tms.py @@ -63,10 +63,14 @@ def test_can_get_tm_records(user_logged_client: TestClient, session: Session): response = user_logged_client.get("/translation_memory/1/records") assert response.status_code == 200 - assert response.json() == [ - {"id": 1, "source": "Regional Effects", "target": "Translation"}, - {"id": 2, "source": "User Interface", "target": "UI"}, - ] + assert response.json() == { + "records": [ + {"id": 1, "source": "Regional Effects", "target": "Translation"}, + {"id": 2, "source": "User Interface", "target": "UI"}, + ], + "page": 0, + "total_records": 2, + } def test_can_get_tm_records_with_page(user_logged_client: TestClient, session: Session): @@ -86,8 +90,9 @@ def test_can_get_tm_records_with_page(user_logged_client: TestClient, session: S "/translation_memory/1/records", params={"page": "1"} ) assert response.status_code == 200 - assert len(response.json()) == 50 - assert response.json()[0] == {"id": 101, "source": "line100", "target": "line100"} + json = response.json() + assert len(json["records"]) == 50 + assert json["records"][0] == {"id": 101, "source": "line100", "target": "line100"} def test_tm_records_are_empty_for_too_large_page( @@ -109,7 +114,63 @@ def test_tm_records_are_empty_for_too_large_page( "/translation_memory/1/records", params={"page": "20"} ) assert response.status_code == 200 - assert response.json() == [] + assert response.json()["records"] == [] + + +def test_tm_records_exact_match(user_logged_client: TestClient, session: Session): + tm_records = [ + TranslationMemoryRecord(source="Hello world", target="Hola mundo"), + TranslationMemoryRecord(source="Goodbye world", target="Adiós mundo"), + TranslationMemoryRecord(source="Welcome home", target="Bienvenido a casa"), + ] + with session as s: + s.add(TranslationMemory(name="test_doc.tmx", records=tm_records, created_by=1)) + s.commit() + + # Test exact search for "world" + response = user_logged_client.get( + "/translation_memory/1/records", + params={"query": "world", "query_mode": "exact"}, + ) + assert response.status_code == 200 + + json = response.json() + assert len(json["records"]) == 2 + assert json["total_records"] == 2 + # Should return records containing "world" in source + sources = [result["source"] for result in json["records"]] + assert "Hello world" in sources + assert "Goodbye world" in sources + # Similarity should be None for exact search + assert all("similarity" not in result for result in json["records"]) + + +def test_tm_records_exact_match_in_nonexistent_tm(user_logged_client: TestClient): + response = user_logged_client.get( + "/translation_memory/999/records", + params={"query": "test", "query_mode": "exact"}, + ) + assert response.status_code == 404 + assert "Memory not found" in response.json()["detail"] + + +def test_search_no_results(user_logged_client: TestClient, session: Session): + tm_records = [ + TranslationMemoryRecord(source="Hello world", target="Hola mundo"), + ] + with session as s: + s.add(TranslationMemory(name="test_doc.tmx", records=tm_records, created_by=1)) + s.commit() + + # Search for something that doesn't exist + response = user_logged_client.get( + "/translation_memory/1/records", + params={"query": "nonexistent", "query_mode": "exact"}, + ) + assert response.status_code == 200 + + results = response.json()["records"] + assert len(results) == 0 def test_tm_records_returns_404_for_nonexistent_document( diff --git a/backend/tests/test_llm.py b/backend/tests/test_llm.py index 4f515a9..b54d9d5 100644 --- a/backend/tests/test_llm.py +++ b/backend/tests/test_llm.py @@ -194,7 +194,7 @@ def test_parse_lines_invalid_format(): result, success = llm.parse_lines(network_out, expected_size) assert not success - assert result == [] + assert result == ["", "translation2"] def test_parse_lines_empty_content(): diff --git a/frontend/mocks/tmMocks.ts b/frontend/mocks/tmMocks.ts index e39736f..974c9be 100644 --- a/frontend/mocks/tmMocks.ts +++ b/frontend/mocks/tmMocks.ts @@ -1,15 +1,31 @@ import {http, HttpResponse} from 'msw' import {AwaitedReturnType} from './utils' -import {getMemories} from '../src/client/services/TmsService' +import {getMemories, getMemory} from '../src/client/services/TmsService' +import {TranslationMemoryWithRecordsCount} from '../src/client/schemas/TranslationMemoryWithRecordsCount' + +const tms: TranslationMemoryWithRecordsCount[] = [ + { + id: 42, + created_by: 12, + name: 'Some TM', + records_count: 5, + }, +] export const tmMocks = [ http.get('http://localhost:8000/translation_memory/', () => - HttpResponse.json>([ - { - id: 42, - created_by: 12, - name: 'Some TM', - }, - ]) + HttpResponse.json>(tms) + ), + http.get<{id: string}>( + 'http://localhost:8000/translation_memory/:id', + ({params}) => { + const id = Number(params.id) + const tm = tms.find((t) => t.id == id) + if (tm) { + return HttpResponse.json>(tm) + } else { + return new HttpResponse(null, {status: 404}) + } + } ), ] diff --git a/frontend/src/client/schemas/TranslationMemoryListResponse.ts b/frontend/src/client/schemas/TranslationMemoryListResponse.ts new file mode 100644 index 0000000..23b13a0 --- /dev/null +++ b/frontend/src/client/schemas/TranslationMemoryListResponse.ts @@ -0,0 +1,9 @@ +// This file is autogenerated, do not edit directly. + +import {TranslationMemoryRecord} from './TranslationMemoryRecord' + +export interface TranslationMemoryListResponse { + records: TranslationMemoryRecord[] + page: number + total_records: number +} diff --git a/frontend/src/client/schemas/TranslationMemoryListSimilarResponse.ts b/frontend/src/client/schemas/TranslationMemoryListSimilarResponse.ts new file mode 100644 index 0000000..afcbe68 --- /dev/null +++ b/frontend/src/client/schemas/TranslationMemoryListSimilarResponse.ts @@ -0,0 +1,9 @@ +// This file is autogenerated, do not edit directly. + +import {TranslationMemoryRecordWithSimilarity} from './TranslationMemoryRecordWithSimilarity' + +export interface TranslationMemoryListSimilarResponse { + records: TranslationMemoryRecordWithSimilarity[] + page: number + total_records: number +} diff --git a/frontend/src/client/schemas/TranslationMemoryRecordWithSimilarity.ts b/frontend/src/client/schemas/TranslationMemoryRecordWithSimilarity.ts new file mode 100644 index 0000000..6909c50 --- /dev/null +++ b/frontend/src/client/schemas/TranslationMemoryRecordWithSimilarity.ts @@ -0,0 +1,8 @@ +// This file is autogenerated, do not edit directly. + +export interface TranslationMemoryRecordWithSimilarity { + id: number + source: string + target: string + similarity: number +} diff --git a/frontend/src/client/services/TmsService.ts b/frontend/src/client/services/TmsService.ts index cda8bb8..bfb6a3c 100644 --- a/frontend/src/client/services/TmsService.ts +++ b/frontend/src/client/services/TmsService.ts @@ -6,7 +6,8 @@ import {TranslationMemory} from '../schemas/TranslationMemory' import {TranslationMemoryCreationSettings} from '../schemas/TranslationMemoryCreationSettings' import {TranslationMemoryWithRecordsCount} from '../schemas/TranslationMemoryWithRecordsCount' import {StatusMessage} from '../schemas/StatusMessage' -import {TranslationMemoryRecord} from '../schemas/TranslationMemoryRecord' +import {TranslationMemoryListResponse} from '../schemas/TranslationMemoryListResponse' +import {TranslationMemoryListSimilarResponse} from '../schemas/TranslationMemoryListSimilarResponse' import {Body_create_memory_from_file_translation_memory_upload_post} from '../schemas/Body_create_memory_from_file_translation_memory_upload_post' export const getMemories = async (): Promise => { @@ -21,8 +22,11 @@ export const getMemory = async (tm_id: number): Promise => { return await api.delete(`/translation_memory/${tm_id}`) } -export const getMemoryRecords = async (tm_id: number, page?: number | null): Promise => { - return await api.get(`/translation_memory/${tm_id}/records`, {query: {page}}) +export const getMemoryRecords = async (tm_id: number, page?: number | null, query?: string | null): Promise => { + return await api.get(`/translation_memory/${tm_id}/records`, {query: {page, query}}) +} +export const getMemoryRecordsSimilar = async (tm_id: number, query: string): Promise => { + return await api.get(`/translation_memory/${tm_id}/records/similar`, {query: {query}}) } export const createMemoryFromFile = async (data: Body_create_memory_from_file_translation_memory_upload_post): Promise => { const formData = new FormData() diff --git a/frontend/src/views/TmView.vue b/frontend/src/views/TmView.vue index bbfca2f..ffe77d0 100644 --- a/frontend/src/views/TmView.vue +++ b/frontend/src/views/TmView.vue @@ -1,98 +1,152 @@ - + - File ID: {{ document?.id }} - File name: {{ document?.name }} - Number of records: {{ document?.records_count }} + + + + + + + updatePage(event)" + @page="(event) => updatePage(event.page)" /> - updatePage(event)" - />
File ID: {{ document?.id }}
File name: {{ document?.name }}
Number of records: {{ document?.records_count }}