From 7f847d0c7f84ec79a21d491a5eb8b2a5cd522d4a Mon Sep 17 00:00:00 2001 From: Samuel Rutunda Date: Mon, 3 Apr 2023 12:50:26 +0200 Subject: [PATCH 01/20] deploy customized to kinyarwanda --- config.json | 12 ++---------- docker-compose.yml | 4 ++-- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/config.json b/config.json index a970171..a043f97 100644 --- a/config.json +++ b/config.json @@ -2,15 +2,7 @@ "languages": { "en": "English", "fr": "French", - "de": "German", - "ha": "Hausa", - "apc":"Levantine Arabic", - "sw_cd":"Congolese Swahili", - "ti":"Tigrinya", - "uk":"Ukranian", - "kr": "Kanuri", - "ff": "Fulfulde", - "tr": "Turkish" + "rw": "Kinyarwanda" }, "models": [ { @@ -19,7 +11,7 @@ "multilingual": true, "load": true, "sentence_split": "nltk", - "supported_pairs": ["en-kr", "en-fr", "en-ff", "en-ha"], + "supported_pairs": ["en-rw", "en-fr", "fr-rw", "fr-en","rw-en","rw-fr"], "pipeline": { "translate": true } diff --git a/docker-compose.yml b/docker-compose.yml index f364550..31bb840 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,7 +6,7 @@ services: build: . command: uvicorn main:app --reload --host 0.0.0.0 --port 8000 --log-config logging.yml restart: unless-stopped - runtime: nvidia # Comment out in local + #runtime: nvidia # Comment out in local ports: - 8001:8000 volumes: @@ -14,7 +14,7 @@ services: - ../translation-models:/app/models environment: - MT_API_CONFIG=/app/config.json - - MT_API_DEVICE=gpu #or gpu, if so make runtime:nvidia + - MT_API_DEVICE=cpu #or gpu, if so make runtime:nvidia - MT_API_THREADS=16 - MODELS_ROOT=/app/models - NVIDIA_VISIBLE_DEVICES=all From 444fabddeaffb8b1eca34effbf7bdadf127768c0 Mon Sep 17 00:00:00 2001 From: Samuel R Date: Thu, 10 Aug 2023 16:28:10 +0200 Subject: [PATCH 02/20] Update docker-compose.yml --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 6338416..f588e75 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -16,7 +16,7 @@ services: environment: - MT_API_CONFIG=/app/config.json - MT_API_DEVICE=cpu #or gpu, if so make runtime:nvidia - - MT_API_THREADS=16 + - MT_API_THREADS=4 - MODELS_ROOT=/app/models - NVIDIA_VISIBLE_DEVICES=all - NVIDIA_DRIVER_CAPABILITIES=all From fad8481309efb0f2b0af449d4daee0abb30ab17a Mon Sep 17 00:00:00 2001 From: Samuel R Date: Thu, 10 Aug 2023 16:28:42 +0200 Subject: [PATCH 03/20] Update config.json --- config.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config.json b/config.json index 13c9e2f..e59542b 100644 --- a/config.json +++ b/config.json @@ -9,7 +9,7 @@ "model_type": "nllb", "checkpoint_id": "nllb-200-distilled-600M", "multilingual": true, - "load": false, + "load": true, "sentence_split": "nltk", "supported_pairs": ["en-rw", "en-fr", "fr-rw", "fr-en","rw-en","rw-fr"], "pipeline": { @@ -22,7 +22,7 @@ "model_type": "m2m100", "checkpoint_id": "m2m100_418M", "multilingual": true, - "load": true, + "load": false, "sentence_split": "nltk", "supported_pairs": ["en-tr"], "pipeline": { @@ -33,7 +33,7 @@ "model_type": "nllb", "checkpoint_id": "DigitalUmuganda/m2m100_en_kin_from_en_fr", "multilingual": true, - "load": true, + "load": false, "alt": "du_en_kin", "sentence_split": "nltk", "supported_pairs": ["en-fr"], From cbbe4ceba079ebe4003e0ebfbd04426a13222903 Mon Sep 17 00:00:00 2001 From: kurt0cougar Date: Tue, 12 Dec 2023 00:01:20 +0200 Subject: [PATCH 04/20] Ignoring the unknown token --- app/utils/translators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/utils/translators.py b/app/utils/translators.py index 3eedfa3..3833126 100644 --- a/app/utils/translators.py +++ b/app/utils/translators.py @@ -39,7 +39,7 @@ def get_ctranslator(ctranslator_model_path: str) -> Callable: # ] def translator(text, src=None, tgt=None): - return ctranslator.translate_batch([text])[0][0]['tokens'] + return ctranslator.translate_batch([text],disable_unk=True)[0][0]['tokens'] return translator From 762c59240d6b6e137555b0671ed893cb02e0fa1f Mon Sep 17 00:00:00 2001 From: kurt0cougar Date: Fri, 23 Feb 2024 10:36:41 +0200 Subject: [PATCH 05/20] Update translators.py Ignoring unknown characters on batch predictions --- app/utils/translators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/utils/translators.py b/app/utils/translators.py index 3833126..4522d3d 100644 --- a/app/utils/translators.py +++ b/app/utils/translators.py @@ -62,7 +62,7 @@ def translator(src_texts, src=None, tgt=None): target_prefix = [[tgt]] * len(src_texts) src_texts = [sent + ["", src] for sent in src_texts] - translations = ctranslator.translate_batch(src_texts, target_prefix=target_prefix) + translations = ctranslator.translate_batch([src_texts],disable_unk=True, target_prefix=target_prefix) translations = [translation.hypotheses[0][1:] for translation in translations] else: translations = [s.hypotheses[0] for s in ctranslator.translate_batch(src_texts)] From fbab844758c22bba49bb3d37b8a99e3ebc913feb Mon Sep 17 00:00:00 2001 From: kurt0cougar Date: Mon, 25 Mar 2024 12:28:45 +0200 Subject: [PATCH 06/20] adding the html edit --- app/views/v1/translate_html.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 app/views/v1/translate_html.py diff --git a/app/views/v1/translate_html.py b/app/views/v1/translate_html.py new file mode 100644 index 0000000..3f941ac --- /dev/null +++ b/app/views/v1/translate_html.py @@ -0,0 +1,18 @@ +from app.views.v1.translate import * +from bs4 import BeautifulSoup +import random +import os + +translate_html = APIRouter(prefix='/api/v1/translate_html') + +@translate_html.post('/translate_page', status_code=status.HTTP_200_OK) +async def modify_html_content(request: TranslationRequest): + + model_id, src, tgt = fetch_model_data_from_request(request) + # Parse the HTML content + soup = BeautifulSoup(request.text, 'html.parser') + # Modify paragraphs + for p in soup.find_all('p'): + p.string = f"{translate_text(model_id, p.get_text(), src, tgt)}" + + return TranslationResponse(translation=soup) \ No newline at end of file From bf61b8d3868554660b4c8027a1ae8d001aa26ef8 Mon Sep 17 00:00:00 2001 From: kurt0cougar Date: Fri, 29 Mar 2024 23:26:38 +0200 Subject: [PATCH 07/20] Update to deal with text from other tags in addition to paragraphs translate_html.py --- app/views/v1/translate_html.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/app/views/v1/translate_html.py b/app/views/v1/translate_html.py index 3f941ac..b1286fd 100644 --- a/app/views/v1/translate_html.py +++ b/app/views/v1/translate_html.py @@ -1,5 +1,5 @@ from app.views.v1.translate import * -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, NavigableString import random import os @@ -11,8 +11,24 @@ async def modify_html_content(request: TranslationRequest): model_id, src, tgt = fetch_model_data_from_request(request) # Parse the HTML content soup = BeautifulSoup(request.text, 'html.parser') - # Modify paragraphs - for p in soup.find_all('p'): - p.string = f"{translate_text(model_id, p.get_text(), src, tgt)}" - return TranslationResponse(translation=soup) \ No newline at end of file + + def edit_text(element,model_id,src,tgt): + """ + Recursively edits the text of the given BeautifulSoup element and its children. + """ + if isinstance(element, NavigableString): + return + if element.name in ['script', 'style']: + return + + for child in element.children: + if isinstance(child, NavigableString): + edited_text = translate_text(model_id,child,src,tgt) + child.replace_with(edited_text) + else: + edit_text(element=child,model_id=model_id,src=src,tgt=tgt) + + edit_text(element=soup,model_id=model_id,src=src,tgt=tgt) + + return TranslationResponse(translation=soup) From 7563ea5447f37a5c3653f889ca3c8e25c44a8654 Mon Sep 17 00:00:00 2001 From: Samuel Rutunda Date: Wed, 3 Apr 2024 14:22:48 +0200 Subject: [PATCH 08/20] update translate html --- Dockerfile.gpu | 32 ++++++++++++++++++++++++++++++++ app/__init__.py | 4 ++++ config.json | 4 ++-- 3 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 Dockerfile.gpu diff --git a/Dockerfile.gpu b/Dockerfile.gpu new file mode 100644 index 0000000..78d5b7e --- /dev/null +++ b/Dockerfile.gpu @@ -0,0 +1,32 @@ +# If you intend to run your MT API on a GPU server it is advisable to using this Dockerfile +FROM nvidia/cuda:11.5.2-devel-ubuntu20.04 +# Project setup + +ENV VIRTUAL_ENV=/opt/venv + +RUN apt-get update && apt-get clean +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + python3-dev \ + python3-pip \ + wget \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN apt-get update && apt-get install -y --no-install-recommends\ + python3.8-venv + +RUN python3 -m venv "$VIRTUAL_ENV" +ENV PATH="$VIRTUAL_ENV/bin:$PATH" +RUN pip install --quiet --upgrade pip && \ + pip install --quiet pip-tools +COPY ./requirements.txt /app/requirements.txt +RUN pip install -r /app/requirements.txt \ + && rm -rf /root/.cache/pip + +COPY . /app +WORKDIR /app + +COPY ./app/nltk_pkg.py /app/nltk_pkg.py +RUN python3 /app/nltk_pkg.py \ No newline at end of file diff --git a/app/__init__.py b/app/__init__.py index a7daa86..cbac8f8 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -15,8 +15,12 @@ def create_app() -> FastAPI: ) from app.views.v1.translate import translate_v1 + from app.views.v1.translate_html import translate_html + app.include_router(translate_v1) + app.include_router(translate_html) + @app.on_event('startup') async def startup_event() -> None: diff --git a/config.json b/config.json index db3ab39..0841709 100644 --- a/config.json +++ b/config.json @@ -31,8 +31,8 @@ }, { "model_type": "ctranslator2", - "model_path": "nllb-200-distilled-600M-int8", - "alt": "education", + "model_path": "DigitalUmuganda/Quantized_Mbaza_MT_v1", + "alt": "", "src_sentencepiece_model": "flores200_sacrebleu_tokenizer_spm.model", "tgt_sentencepiece_model": "flores200_sacrebleu_tokenizer_spm.model", "multilingual": true, From 1849eecf283cba77fc573a9d80a7402717d9887b Mon Sep 17 00:00:00 2001 From: Irakoze <31562814+Cedric0852@users.noreply.github.com> Date: Wed, 18 Sep 2024 01:47:04 -0700 Subject: [PATCH 09/20] Update main.py added prometheus function --- main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index 0a23b5a..6d3ff21 100644 --- a/main.py +++ b/main.py @@ -1,3 +1,5 @@ from app import create_app - +from prometheus_fastapi_instrumentator import Instrumentator app = create_app() +#Monitoring +Instrumentator().instrument(app).expose(app) From 36f9b04366929e63fc5713fd6cc9627e63bcaaed Mon Sep 17 00:00:00 2001 From: Irakoze <31562814+Cedric0852@users.noreply.github.com> Date: Wed, 18 Sep 2024 02:03:25 -0700 Subject: [PATCH 10/20] Update requirements.txt added prometheus library --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index ec71fab..af7827d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,5 @@ nltk==3.8.1 sentencepiece==0.1.99 torch==2.1.0 transformers==4.34.1 +tenacity==8.2.2 +prometheus-fastapi-instrumentator==5.9.1 From e9c63fcdaf8514280c8563a1cfb4d2b4bfec6e72 Mon Sep 17 00:00:00 2001 From: Cedric0852 <31562814+Cedric0852@users.noreply.github.com> Date: Thu, 30 Jan 2025 04:30:47 -0400 Subject: [PATCH 11/20] Added markdown translation --- app/views/v1/translate.py | 47 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/app/views/v1/translate.py b/app/views/v1/translate.py index 5addf77..1789b58 100644 --- a/app/views/v1/translate.py +++ b/app/views/v1/translate.py @@ -12,7 +12,11 @@ ) from app.utils.translate import translate_text from app.constants import MULTIMODALCODE +import re +from transformers import pipeline +# Initialize the translation pipeline +translator = pipeline("translation", model="facebook/nllb-200-3.3B", src_lang='eng_Latn', tgt_lang='kin_Latn') translate_v1 = APIRouter(prefix='/api/v1/translate') DEVDEBUG = True @@ -108,3 +112,46 @@ async def languages() -> LanguagesResponse: return LanguagesResponse( languages=config.language_codes, models=config.languages_list ) + +def remove_markdown(text): + """Remove markdown formatting from text.""" + text = re.sub(r'(\*\*|__)(.*?)\1', r'\2', text) # Bold + text = re.sub(r'(\*|_)(.*?)\1', r'\2', text) # Italics + text = re.sub(r'\[(.*?)\]\((.*?)\)', r'\1', text) # Links + text = re.sub(r'`(.*?)`', r'\1', text) # Inline code + text = re.sub(r'~~(.*?)~~', r'\1', text) # Strikethrough + return text + +def reapply_markdown(original, modified): + """Reapply markdown formatting after translation.""" + pattern = r'(\*\*.*?\*\*|\*.*?\*|__.*?__|_.*?_|\[.*?\]\(.*?\)|`.*?`|~~.*?~~)' + matches = re.finditer(pattern, original) + + result = "" + cursor = 0 + + for match in matches: + start, end = match.span() + token = match.group() + if cursor < start: + result += modified[cursor:start] + stripped = remove_markdown(token).strip() + translated_stripped = translator(stripped)[0]['translation_text'] + result += token.replace(stripped, translated_stripped) + cursor = end + + if cursor < len(original): + result += modified[cursor:] + + return result + +def process_text(text): + """Remove markdown, translate, and reapply markdown formatting.""" + plain_text = remove_markdown(text) + translated_text = translator(plain_text)[0]['translation_text'] + return reapply_markdown(text, translated_text) + +@translate_v1.post('/markdown', status_code=status.HTTP_200_OK) +async def translate_markdown(request: TranslationRequest) -> TranslationResponse: + translated_markdown = process_text(request.text) + return TranslationResponse(translation=translated_markdown) \ No newline at end of file From 2a6b1d24a19da868e107fda69482e5d3446d4b97 Mon Sep 17 00:00:00 2001 From: Samuel Rutunda Date: Fri, 31 Jan 2025 15:28:50 +0200 Subject: [PATCH 12/20] update --- Dockerfile | 21 +++++++++++++++++---- config.json | 9 ++++----- docker-compose.yml | 12 +++++++++--- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/Dockerfile b/Dockerfile index 40c1ec0..31c6b10 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,12 +1,25 @@ -FROM python:3.8-slim - +#FROM python:3.8-slim +FROM nvidia/cuda:11.5.2-devel-ubuntu20.04 # Project setup ENV VIRTUAL_ENV=/opt/venv RUN apt-get update && apt-get clean +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + python3-dev \ + python3-pip \ + wget \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* +#RUN apt install apt install nvidia-cuda-toolkit + +#RUN apt-get install cuda-cudart-11-8 +RUN apt-get update && apt-get install -y --no-install-recommends\ + python3.8-venv -RUN python -m venv "$VIRTUAL_ENV" +RUN python3 -m venv "$VIRTUAL_ENV" ENV PATH="$VIRTUAL_ENV/bin:$PATH" RUN pip install --quiet --upgrade pip && \ pip install --quiet pip-tools @@ -23,4 +36,4 @@ COPY . /app WORKDIR /app COPY ./app/nltk_pkg.py /app/nltk_pkg.py -RUN python /app/nltk_pkg.py +RUN python3 /app/nltk_pkg.py \ No newline at end of file diff --git a/config.json b/config.json index 0841709..51ec2f0 100644 --- a/config.json +++ b/config.json @@ -9,7 +9,7 @@ "model_type": "nllb", "checkpoint_id": "nllb-200-distilled-600M", "multilingual": true, - "load": true, + "load": false, "sentence_split": "nltk", "supported_pairs": ["en-rw", "en-fr", "fr-rw", "fr-en","rw-en","rw-fr"], "pipeline": { @@ -36,8 +36,7 @@ "src_sentencepiece_model": "flores200_sacrebleu_tokenizer_spm.model", "tgt_sentencepiece_model": "flores200_sacrebleu_tokenizer_spm.model", "multilingual": true, - "load": false, - "alt": "du_en_kin", + "load": true, "sentence_split": "nltk", "supported_pairs": ["en-rw", "rw-en"], "pipeline": { @@ -47,7 +46,7 @@ "translate": true, "recase": true }, - "lang_code_map": {"en": "eng_Latn", "tr": "tur_Latn", + "lang_code_map": {"en": "eng_Latn", "tr": "tur_Latn", "fr": "fra_Latn", "rw": "kin_Latn"} }, { @@ -57,7 +56,7 @@ "src_sentencepiece_model": "flores200_sacrebleu_tokenizer_spm.model", "tgt_sentencepiece_model": "flores200_sacrebleu_tokenizer_spm.model", "multilingual": true, - "load": true, + "load": false, "sentence_split": "nltk", "supported_pairs": ["en-rw", "rw-en"], "pipeline": { diff --git a/docker-compose.yml b/docker-compose.yml index 9669b03..9e13487 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,7 +6,7 @@ services: build: . command: uvicorn main:app --reload --host 0.0.0.0 --port 8000 --log-config logging.yml restart: unless-stopped - #runtime: nvidia # Comment out in local + runtime: nvidia # Comment out in local ports: - 8001:8000 volumes: @@ -17,9 +17,15 @@ services: - seccomp:unconfined environment: - MT_API_CONFIG=/app/config.json - - MT_API_DEVICE=cpu #or gpu, if so make runtime:nvidia - - MT_API_THREADS=4 + - MT_API_DEVICE=gpu #or gpu, if so make runtime:nvidia + - MT_API_THREADS=16 - MODELS_ROOT=/app/models - NVIDIA_VISIBLE_DEVICES=all - NVIDIA_DRIVER_CAPABILITIES=all + networks: + - infrastructure_default + +networks: + infrastructure_default: # external network + external: true From a5e5818e0c9512c9adde5ba0326d7ed3593557bf Mon Sep 17 00:00:00 2001 From: Samuel Rutunda Date: Fri, 31 Jan 2025 15:55:50 +0200 Subject: [PATCH 13/20] deploy --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 9e13487..accb0bb 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -11,7 +11,7 @@ services: - 8001:8000 volumes: - .:/app - - ../translation-models:/app/models + - $HOME/translation-models:/app/models - ./config.json:/app/config.json security_opt: - seccomp:unconfined From a90510031f4e22c89423e9ee34061cc24e2394e6 Mon Sep 17 00:00:00 2001 From: Samuel Rutunda Date: Thu, 20 Mar 2025 12:35:58 +0200 Subject: [PATCH 14/20] add changes to the code --- app/views/v1/translate.py | 46 ++------------------------- app/views/v1/translate_markdown.py | 50 ++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 44 deletions(-) create mode 100644 app/views/v1/translate_markdown.py diff --git a/app/views/v1/translate.py b/app/views/v1/translate.py index 1789b58..462d6e7 100644 --- a/app/views/v1/translate.py +++ b/app/views/v1/translate.py @@ -13,10 +13,10 @@ from app.utils.translate import translate_text from app.constants import MULTIMODALCODE import re +from app.views.v1.translate import * from transformers import pipeline -# Initialize the translation pipeline -translator = pipeline("translation", model="facebook/nllb-200-3.3B", src_lang='eng_Latn', tgt_lang='kin_Latn') + translate_v1 = APIRouter(prefix='/api/v1/translate') DEVDEBUG = True @@ -113,45 +113,3 @@ async def languages() -> LanguagesResponse: languages=config.language_codes, models=config.languages_list ) -def remove_markdown(text): - """Remove markdown formatting from text.""" - text = re.sub(r'(\*\*|__)(.*?)\1', r'\2', text) # Bold - text = re.sub(r'(\*|_)(.*?)\1', r'\2', text) # Italics - text = re.sub(r'\[(.*?)\]\((.*?)\)', r'\1', text) # Links - text = re.sub(r'`(.*?)`', r'\1', text) # Inline code - text = re.sub(r'~~(.*?)~~', r'\1', text) # Strikethrough - return text - -def reapply_markdown(original, modified): - """Reapply markdown formatting after translation.""" - pattern = r'(\*\*.*?\*\*|\*.*?\*|__.*?__|_.*?_|\[.*?\]\(.*?\)|`.*?`|~~.*?~~)' - matches = re.finditer(pattern, original) - - result = "" - cursor = 0 - - for match in matches: - start, end = match.span() - token = match.group() - if cursor < start: - result += modified[cursor:start] - stripped = remove_markdown(token).strip() - translated_stripped = translator(stripped)[0]['translation_text'] - result += token.replace(stripped, translated_stripped) - cursor = end - - if cursor < len(original): - result += modified[cursor:] - - return result - -def process_text(text): - """Remove markdown, translate, and reapply markdown formatting.""" - plain_text = remove_markdown(text) - translated_text = translator(plain_text)[0]['translation_text'] - return reapply_markdown(text, translated_text) - -@translate_v1.post('/markdown', status_code=status.HTTP_200_OK) -async def translate_markdown(request: TranslationRequest) -> TranslationResponse: - translated_markdown = process_text(request.text) - return TranslationResponse(translation=translated_markdown) \ No newline at end of file diff --git a/app/views/v1/translate_markdown.py b/app/views/v1/translate_markdown.py new file mode 100644 index 0000000..9960666 --- /dev/null +++ b/app/views/v1/translate_markdown.py @@ -0,0 +1,50 @@ +from app.views.v1.translate import * + +translate_markdown = APIRouter(prefix='/api/v1/translate_markdown') +def remove_markdown(text): + """Remove markdown formatting from text.""" + text = re.sub(r'(\*\*|__)(.*?)\1', r'\2', text) # Bold + text = re.sub(r'(\*|_)(.*?)\1', r'\2', text) # Italics + text = re.sub(r'\[(.*?)\]\((.*?)\)', r'\1', text) # Links + text = re.sub(r'`(.*?)`', r'\1', text) # Inline code + text = re.sub(r'~~(.*?)~~', r'\1', text) # Strikethrough + return text + +def reapply_markdown(original, modified): + """Reapply markdown formatting after translation.""" + pattern = r'(\*\*.*?\*\*|\*.*?\*|__.*?__|_.*?_|\[.*?\]\(.*?\)|`.*?`|~~.*?~~)' + matches = re.finditer(pattern, original) + + result = "" + cursor = 0 + + for match in matches: + start, end = match.span() + token = match.group() + if cursor < start: + result += modified[cursor:start] + stripped = remove_markdown(token).strip() + translated_stripped = translator(stripped)[0]['translation_text'] + result += token.replace(stripped, translated_stripped) + cursor = end + + if cursor < len(original): + result += modified[cursor:] + + return result + + + +@translate_markdown.post('/markdown', status_code=status.HTTP_200_OK) +async def translate_markdown(request: TranslationRequest) -> TranslationResponse: + model_id, src, tgt = fetch_model_data_from_request(request) + + def process_text(text,model_id,src,tgt): + """Remove markdown, translate, and reapply markdown formatting.""" + plain_text = remove_markdown(text) + translated_text = translate_text(model_id,plain_text,src,tgt) + + return reapply_markdown(text, translated_text) + + translated_markdown = process_text(request.text,model_id=model_id,src=src,tgt=tgt) + return TranslationResponse(translation=translated_markdown) \ No newline at end of file From 4b214ab8820506f4ae460fca845ef8693e17b466 Mon Sep 17 00:00:00 2001 From: Samuel Rutunda Date: Thu, 20 Mar 2025 12:38:55 +0200 Subject: [PATCH 15/20] add beautiful soup 4 --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index af7827d..646ca12 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ torch==2.1.0 transformers==4.34.1 tenacity==8.2.2 prometheus-fastapi-instrumentator==5.9.1 +beautifulsoup4 From 1fb23a29217a32741d5e9244e1ca0900e52315e0 Mon Sep 17 00:00:00 2001 From: Samuel Rutunda Date: Thu, 20 Mar 2025 12:49:54 +0200 Subject: [PATCH 16/20] disable_unk --- app/utils/translators.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/utils/translators.py b/app/utils/translators.py index 4522d3d..3f0c157 100644 --- a/app/utils/translators.py +++ b/app/utils/translators.py @@ -62,7 +62,8 @@ def translator(src_texts, src=None, tgt=None): target_prefix = [[tgt]] * len(src_texts) src_texts = [sent + ["", src] for sent in src_texts] - translations = ctranslator.translate_batch([src_texts],disable_unk=True, target_prefix=target_prefix) + #translations = ctranslator.translate_batch([src_texts],disable_unk=True, target_prefix=target_prefix) + translations = ctranslator.translate_batch([src_texts], target_prefix=target_prefix) translations = [translation.hypotheses[0][1:] for translation in translations] else: translations = [s.hypotheses[0] for s in ctranslator.translate_batch(src_texts)] From a88fe9cfe7c2793ac206d72668d92143c3e85536 Mon Sep 17 00:00:00 2001 From: Irakoze Date: Tue, 27 May 2025 11:47:27 +0200 Subject: [PATCH 17/20] Add Markdown endpoints --- app/__init__.py | 2 ++ app/views/v1/translate_markdown.py | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/app/__init__.py b/app/__init__.py index cbac8f8..5ccdd05 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -16,10 +16,12 @@ def create_app() -> FastAPI: from app.views.v1.translate import translate_v1 from app.views.v1.translate_html import translate_html + from app.views.v1.translate_markdown import translate_markdown app.include_router(translate_v1) app.include_router(translate_html) + app.include_router(translate_markdown) @app.on_event('startup') diff --git a/app/views/v1/translate_markdown.py b/app/views/v1/translate_markdown.py index 9960666..7f805af 100644 --- a/app/views/v1/translate_markdown.py +++ b/app/views/v1/translate_markdown.py @@ -1,4 +1,6 @@ -from app.views.v1.translate import * +from fastapi import APIRouter, status +from app.views.v1.translate import translator, TranslationRequest, TranslationResponse, fetch_model_data_from_request, translate_text +import re translate_markdown = APIRouter(prefix='/api/v1/translate_markdown') def remove_markdown(text): From a2170125965d85191cef115e60a3f928318fb311 Mon Sep 17 00:00:00 2001 From: "Samuel R." Date: Tue, 3 Jun 2025 09:32:29 +0200 Subject: [PATCH 18/20] Update translators.py --- app/utils/translators.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/utils/translators.py b/app/utils/translators.py index 3f0c157..425387b 100644 --- a/app/utils/translators.py +++ b/app/utils/translators.py @@ -63,7 +63,8 @@ def translator(src_texts, src=None, tgt=None): src_texts = [sent + ["", src] for sent in src_texts] #translations = ctranslator.translate_batch([src_texts],disable_unk=True, target_prefix=target_prefix) - translations = ctranslator.translate_batch([src_texts], target_prefix=target_prefix) + #translations = ctranslator.translate_batch([src_texts], target_prefix=target_prefix) + translations = ctranslator.translate_batch(src_texts,disable_unk=True, target_prefix=target_prefix) translations = [translation.hypotheses[0][1:] for translation in translations] else: translations = [s.hypotheses[0] for s in ctranslator.translate_batch(src_texts)] From 821d78943d159626905487e1278e7e5d1cb0e5f5 Mon Sep 17 00:00:00 2001 From: "Samuel R." Date: Tue, 3 Jun 2025 09:33:01 +0200 Subject: [PATCH 19/20] Update docker-compose.yml --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index accb0bb..7cd2b63 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -11,7 +11,7 @@ services: - 8001:8000 volumes: - .:/app - - $HOME/translation-models:/app/models + - ../../translation-models:/app/models - ./config.json:/app/config.json security_opt: - seccomp:unconfined From d42fbcfef36f232b3367f306003269f492df8241 Mon Sep 17 00:00:00 2001 From: Samuel Rutunda Date: Tue, 3 Jun 2025 12:01:19 +0200 Subject: [PATCH 20/20] improve markdown algorithm --- app/views/v1/translate_markdown.py | 137 +++++++++++++++++++++-------- 1 file changed, 99 insertions(+), 38 deletions(-) diff --git a/app/views/v1/translate_markdown.py b/app/views/v1/translate_markdown.py index 7f805af..d2ff042 100644 --- a/app/views/v1/translate_markdown.py +++ b/app/views/v1/translate_markdown.py @@ -1,52 +1,113 @@ from fastapi import APIRouter, status -from app.views.v1.translate import translator, TranslationRequest, TranslationResponse, fetch_model_data_from_request, translate_text +from app.views.v1.translate import TranslationRequest, TranslationResponse, fetch_model_data_from_request, translate_text import re translate_markdown = APIRouter(prefix='/api/v1/translate_markdown') -def remove_markdown(text): - """Remove markdown formatting from text.""" - text = re.sub(r'(\*\*|__)(.*?)\1', r'\2', text) # Bold - text = re.sub(r'(\*|_)(.*?)\1', r'\2', text) # Italics - text = re.sub(r'\[(.*?)\]\((.*?)\)', r'\1', text) # Links - text = re.sub(r'`(.*?)`', r'\1', text) # Inline code - text = re.sub(r'~~(.*?)~~', r'\1', text) # Strikethrough - return text -def reapply_markdown(original, modified): - """Reapply markdown formatting after translation.""" - pattern = r'(\*\*.*?\*\*|\*.*?\*|__.*?__|_.*?_|\[.*?\]\(.*?\)|`.*?`|~~.*?~~)' - matches = re.finditer(pattern, original) +def remove_markdown(text): + """Remove markdown formatting from text while preserving positions.""" + segments = [] + current_pos = 0 + + # Define markdown patterns + patterns = [ + (r'(\*\*|__)(.*?)\1', 'bold'), # Bold + (r'(\*|_)(.*?)\1', 'italic'), # Italics + (r'\[(.*?)\]\((.*?)\)', 'link'), # Links + (r'`(.*?)`', 'code'), # Inline code + (r'~~(.*?)~~', 'strikethrough') # Strikethrough + ] + + # Find all markdown segments + all_matches = [] + for pattern, mark_type in patterns: + for match in re.finditer(pattern, text): + content = match.group(2) if mark_type != 'link' else match.group(1) + all_matches.append({ + 'start': match.start(), + 'end': match.end(), + 'content': content, + 'original': match.group(0), + 'type': mark_type, + 'url': match.group(2) if mark_type == 'link' else None + }) + + # Sort matches by position + all_matches.sort(key=lambda x: x['start']) + + # Split text into markdown and non-markdown segments + for match in all_matches: + if current_pos < match['start']: + # Add non-markdown text segment + segments.append({ + 'type': 'text', + 'content': text[current_pos:match['start']], + 'original': text[current_pos:match['start']] + }) + segments.append(match) + current_pos = match['end'] + + # Add remaining text if any + if current_pos < len(text): + segments.append({ + 'type': 'text', + 'content': text[current_pos:], + 'original': text[current_pos:] + }) + + # Extract plain text for translation + plain_text = ''.join([seg['content'] for seg in segments]) + + return plain_text, segments +def reapply_markdown(segments, translated_text): + """Reapply markdown formatting to translated text while handling different language lengths.""" result = "" - cursor = 0 - - for match in matches: - start, end = match.span() - token = match.group() - if cursor < start: - result += modified[cursor:start] - stripped = remove_markdown(token).strip() - translated_stripped = translator(stripped)[0]['translation_text'] - result += token.replace(stripped, translated_stripped) - cursor = end - - if cursor < len(original): - result += modified[cursor:] + translated_segments = [] + current_pos = 0 + + # Split translated text into segments based on original segment lengths and positions + for segment in segments: + # Get the next chunk of translated text + if segment['type'] == 'text': + # For non-markdown text, take the next portion of translated text + translated_content = translated_text[current_pos:current_pos + len(segment['content'])] + current_pos += len(segment['content']) + translated_segments.append(translated_content) + else: + # For markdown text, translate the content separately to maintain formatting + translated_content = translate_text(model_id, segment['content'], src, tgt) + translated_segments.append(translated_content) + + # Reconstruct the markdown text + for i, segment in enumerate(segments): + if segment['type'] == 'text': + result += translated_segments[i] + elif segment['type'] == 'link': + result += f"[{translated_segments[i]}]({segment['url']})" + elif segment['type'] == 'bold': + result += f"**{translated_segments[i]}**" + elif segment['type'] == 'italic': + result += f"*{translated_segments[i]}*" + elif segment['type'] == 'code': + result += f"`{translated_segments[i]}`" + elif segment['type'] == 'strikethrough': + result += f"~~{translated_segments[i]}~~" return result - - @translate_markdown.post('/markdown', status_code=status.HTTP_200_OK) -async def translate_markdown(request: TranslationRequest) -> TranslationResponse: +async def translate_markdown_text(request: TranslationRequest) -> TranslationResponse: + global model_id, src, tgt model_id, src, tgt = fetch_model_data_from_request(request) - def process_text(text,model_id,src,tgt): - """Remove markdown, translate, and reapply markdown formatting.""" - plain_text = remove_markdown(text) - translated_text = translate_text(model_id,plain_text,src,tgt) - - return reapply_markdown(text, translated_text) - - translated_markdown = process_text(request.text,model_id=model_id,src=src,tgt=tgt) + # Split text into segments and get plain text + plain_text, segments = remove_markdown(request.text) + + # Translate the main text + translated_text = translate_text(model_id, plain_text, src, tgt) + + # Reapply markdown with proper translations + translated_markdown = reapply_markdown(segments, translated_text) + return TranslationResponse(translation=translated_markdown) \ No newline at end of file