Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
7f847d0
deploy customized to kinyarwanda
Apr 3, 2023
53c7b27
Merge pull request #1 from translatorswb/main
rutsam Apr 6, 2023
e51eb4a
Merge branch 'main' of https://github.com/translatorswb/TWB-MT-fastap…
Apr 25, 2023
2aad099
Merge branch 'translatorswb-main'
Apr 25, 2023
688d42f
Merge pull request #3 from translatorswb/main
rutsam Apr 26, 2023
444fabd
Update docker-compose.yml
rutsam Aug 10, 2023
fad8481
Update config.json
rutsam Aug 10, 2023
123d536
Merge branch 'main' of https://github.com/translatorswb/TWB-MT-fastap…
Aug 26, 2023
7548e30
Merge branch 'translatorswb-main'
Aug 26, 2023
609949c
Merge pull request #5 from translatorswb/main
rutsam Nov 7, 2023
cbbe4ce
Ignoring the unknown token
kurt0cougar Dec 11, 2023
e588d39
Merge pull request #6 from kurt0cougar/unknown_token_display
rutsam Feb 22, 2024
762c592
Update translators.py
kurt0cougar Feb 23, 2024
fbab844
adding the html edit
kurt0cougar Mar 25, 2024
bf61b8d
Update to deal with text from other tags in addition to paragraphs tr…
kurt0cougar Mar 29, 2024
63ad1af
Merge pull request #7 from Digital-Umuganda/TWB-MT-fastapi-html_edit
rutsam Apr 3, 2024
7563ea5
update translate html
Apr 3, 2024
1849eec
Update main.py
Cedric0852 Sep 18, 2024
f0a9bbc
Merge pull request #8 from Digital-Umuganda/Cedric0852-api-monitoring…
Cedric0852 Sep 18, 2024
36f9b04
Update requirements.txt
Cedric0852 Sep 18, 2024
8b07227
Merge pull request #9 from Digital-Umuganda/Cedric0852-add-monitoring…
Cedric0852 Sep 18, 2024
e9c63fc
Added markdown translation
Cedric0852 Jan 30, 2025
1683e86
Merge pull request #10 from Digital-Umuganda/feature/markdown-transla…
Cedric0852 Jan 30, 2025
2a6b1d2
update
Jan 31, 2025
a5e5818
deploy
Jan 31, 2025
a905100
add changes to the code
Mar 20, 2025
4b214ab
add beautiful soup 4
Mar 20, 2025
1fb23a2
disable_unk
Mar 20, 2025
a88fe9c
Add Markdown endpoints
Cedric0852 May 27, 2025
a217012
Update translators.py
rutsam Jun 3, 2025
821d789
Update docker-compose.yml
rutsam Jun 3, 2025
d42fbcf
improve markdown algorithm
Jun 3, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 17 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,25 @@
FROM python:3.8-slim

#FROM python:3.8-slim
FROM nvidia/cuda:11.5.2-devel-ubuntu20.04
# Project setup

ENV VIRTUAL_ENV=/opt/venv

RUN apt-get update && apt-get clean
RUN apt-get update && \
apt-get install -y --no-install-recommends \
python3-dev \
python3-pip \
wget \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
#RUN apt install nvidia-cuda-toolkit

#RUN apt-get install cuda-cudart-11-8
RUN apt-get update && apt-get install -y --no-install-recommends\
python3.8-venv

RUN python -m venv "$VIRTUAL_ENV"
RUN python3 -m venv "$VIRTUAL_ENV"
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
RUN pip install --quiet --upgrade pip && \
pip install --quiet pip-tools
Expand All @@ -23,4 +36,4 @@ COPY . /app
WORKDIR /app

COPY ./app/nltk_pkg.py /app/nltk_pkg.py
RUN python /app/nltk_pkg.py
RUN python3 /app/nltk_pkg.py
32 changes: 32 additions & 0 deletions Dockerfile.gpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# If you intend to run your MT API on a GPU server, it is advisable to use this Dockerfile.
FROM nvidia/cuda:11.5.2-devel-ubuntu20.04
# Project setup

ENV VIRTUAL_ENV=/opt/venv

# Single apt layer: install the Python toolchain, then purge the package
# lists to keep the image small. (Previously this was three separate apt-get
# invocations — one of them a no-op update/clean pair, and one that ran
# `apt-get update` again after the lists had already been removed.)
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    python3-dev \
    python3-pip \
    python3.8-venv \
    wget \
    && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

RUN python3 -m venv "$VIRTUAL_ENV"
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
RUN pip install --quiet --upgrade pip && \
    pip install --quiet pip-tools

# Install Python dependencies before copying the application so Docker layer
# caching reuses this layer when only application code changes.
COPY ./requirements.txt /app/requirements.txt
RUN pip install -r /app/requirements.txt \
    && rm -rf /root/.cache/pip

COPY . /app
WORKDIR /app

# nltk_pkg.py presumably fetches the NLTK data needed for sentence splitting
# at build time — confirm against app/nltk_pkg.py.
COPY ./app/nltk_pkg.py /app/nltk_pkg.py
RUN python3 /app/nltk_pkg.py
6 changes: 6 additions & 0 deletions app/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,14 @@ def create_app() -> FastAPI:
)

from app.views.v1.translate import translate_v1
from app.views.v1.translate_html import translate_html
from app.views.v1.translate_markdown import translate_markdown


app.include_router(translate_v1)
app.include_router(translate_html)
app.include_router(translate_markdown)


@app.on_event('startup')
async def startup_event() -> None:
Expand Down
6 changes: 4 additions & 2 deletions app/utils/translators.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def get_ctranslator(ctranslator_model_path: str) -> Callable:
# ]

def translator(text, src=None, tgt=None):
return ctranslator.translate_batch([text])[0][0]['tokens']
return ctranslator.translate_batch([text],disable_unk=True)[0][0]['tokens']

return translator

Expand All @@ -62,7 +62,9 @@ def translator(src_texts, src=None, tgt=None):
target_prefix = [[tgt]] * len(src_texts)
src_texts = [sent + ["</s>", src] for sent in src_texts]

translations = ctranslator.translate_batch(src_texts, target_prefix=target_prefix)
#translations = ctranslator.translate_batch([src_texts],disable_unk=True, target_prefix=target_prefix)
#translations = ctranslator.translate_batch([src_texts], target_prefix=target_prefix)
translations = ctranslator.translate_batch(src_texts,disable_unk=True, target_prefix=target_prefix)
translations = [translation.hypotheses[0][1:] for translation in translations]
else:
translations = [s.hypotheses[0] for s in ctranslator.translate_batch(src_texts)]
Expand Down
5 changes: 5 additions & 0 deletions app/views/v1/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
)
from app.utils.translate import translate_text
from app.constants import MULTIMODALCODE
import re
from app.views.v1.translate import *
from transformers import pipeline


translate_v1 = APIRouter(prefix='/api/v1/translate')

Expand Down Expand Up @@ -108,3 +112,4 @@ async def languages() -> LanguagesResponse:
return LanguagesResponse(
languages=config.language_codes, models=config.languages_list
)

34 changes: 34 additions & 0 deletions app/views/v1/translate_html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from app.views.v1.translate import *
from bs4 import BeautifulSoup, NavigableString
import random
import os

translate_html = APIRouter(prefix='/api/v1/translate_html')

@translate_html.post('/translate_page', status_code=status.HTTP_200_OK)
async def modify_html_content(request: TranslationRequest):
    """Translate the text content of an HTML document.

    Walks the parsed DOM and replaces every non-empty text node with its
    translation, leaving tags, attributes and document structure untouched.
    The contents of <script> and <style> elements are skipped so that code
    is never sent to the translator.
    """
    model_id, src, tgt = fetch_model_data_from_request(request)
    # Parse the HTML content
    soup = BeautifulSoup(request.text, 'html.parser')

    def edit_text(element, model_id, src, tgt):
        """Recursively translate the text nodes under `element` in place."""
        if isinstance(element, NavigableString):
            return
        if element.name in ['script', 'style']:
            return

        # Snapshot the children: replace_with() mutates the tree, and
        # iterating the live `children` generator while mutating can skip
        # or revisit nodes.
        for child in list(element.children):
            if isinstance(child, NavigableString):
                # Skip whitespace-only nodes (inter-tag formatting):
                # translating them is wasted work and can corrupt layout.
                if not child.strip():
                    continue
                edited_text = translate_text(model_id, child, src, tgt)
                child.replace_with(edited_text)
            else:
                edit_text(element=child, model_id=model_id, src=src, tgt=tgt)

    edit_text(element=soup, model_id=model_id, src=src, tgt=tgt)

    # Serialise explicitly: TranslationResponse expects a string, not a
    # BeautifulSoup object.
    return TranslationResponse(translation=str(soup))
113 changes: 113 additions & 0 deletions app/views/v1/translate_markdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
from fastapi import APIRouter, status
from app.views.v1.translate import TranslationRequest, TranslationResponse, fetch_model_data_from_request, translate_text
import re

translate_markdown = APIRouter(prefix='/api/v1/translate_markdown')

def remove_markdown(text):
    """Split `text` into markdown and plain-text segments.

    Returns a tuple ``(plain_text, segments)`` where ``plain_text`` is the
    text with all markdown markers stripped, and ``segments`` is an ordered
    list of dicts (``type``, ``content``, ``original``, and for matched
    markdown also ``start``/``end``/``url``) covering the whole input.
    """
    segments = []
    current_pos = 0

    # Markdown patterns. Order matters on ties: two-character markers
    # (** and __) are listed before their one-character counterparts.
    patterns = [
        (r'(\*\*|__)(.*?)\1', 'bold'),           # Bold
        (r'(\*|_)(.*?)\1', 'italic'),            # Italics
        (r'\[(.*?)\]\((.*?)\)', 'link'),         # Links
        (r'`(.*?)`', 'code'),                    # Inline code
        (r'~~(.*?)~~', 'strikethrough')          # Strikethrough
    ]

    # Find all markdown segments
    all_matches = []
    for pattern, mark_type in patterns:
        for match in re.finditer(pattern, text):
            # For links the visible text is group 1; elsewhere it is group 2.
            content = match.group(2) if mark_type != 'link' else match.group(1)
            all_matches.append({
                'start': match.start(),
                'end': match.end(),
                'content': content,
                'original': match.group(0),
                'type': mark_type,
                'url': match.group(2) if mark_type == 'link' else None
            })

    # Sort by position; on equal starts prefer the longest match so that
    # e.g. the bold match for "**x**" beats the spurious italic match "**".
    all_matches.sort(key=lambda m: (m['start'], -(m['end'] - m['start'])))

    # Split text into markdown and non-markdown segments
    for match in all_matches:
        # BUG FIX: skip matches that overlap a span already consumed
        # (e.g. the italic pattern matching the "**" of a bold marker).
        # Previously these were appended anyway, rewinding current_pos
        # and duplicating text in plain_text.
        if match['start'] < current_pos:
            continue
        if current_pos < match['start']:
            # Add the non-markdown text segment preceding this match.
            segments.append({
                'type': 'text',
                'content': text[current_pos:match['start']],
                'original': text[current_pos:match['start']]
            })
        segments.append(match)
        current_pos = match['end']

    # Add remaining text if any
    if current_pos < len(text):
        segments.append({
            'type': 'text',
            'content': text[current_pos:],
            'original': text[current_pos:]
        })

    # Extract plain text for translation
    plain_text = ''.join(seg['content'] for seg in segments)

    return plain_text, segments

def reapply_markdown(segments, translated_text, model_id=None, src=None, tgt=None):
    """Reapply markdown formatting to translated text.

    Plain-text segments are filled from `translated_text` by position;
    markdown segments are translated individually so their markers can be
    wrapped around the translated content.

    `model_id`, `src` and `tgt` may now be passed explicitly. When omitted
    they fall back to the module globals installed by the endpoint's
    `global` statement — NOTE(review): that global handshake is not safe
    under concurrent requests; prefer passing them explicitly.
    """
    if model_id is None:
        # Backward-compatible fallback to the globals set by
        # translate_markdown_text().
        g = globals()
        model_id = g.get('model_id')
        src = g.get('src')
        tgt = g.get('tgt')

    translated_segments = []
    current_pos = 0

    # NOTE(review): for plain-text segments this assumes the translation has
    # the same character length as the source segment, which generally does
    # not hold across languages — confirm intended behaviour.
    for segment in segments:
        if segment['type'] == 'text':
            # For non-markdown text, take the next portion of translated text.
            translated_content = translated_text[current_pos:current_pos + len(segment['content'])]
            current_pos += len(segment['content'])
            translated_segments.append(translated_content)
        else:
            # For markdown text, translate the content separately so the
            # formatting markers can be restored around it below.
            translated_content = translate_text(model_id, segment['content'], src, tgt)
            translated_segments.append(translated_content)

    # Reconstruct the markdown text.
    result = ""
    for i, segment in enumerate(segments):
        if segment['type'] == 'text':
            result += translated_segments[i]
        elif segment['type'] == 'link':
            result += f"[{translated_segments[i]}]({segment['url']})"
        elif segment['type'] == 'bold':
            result += f"**{translated_segments[i]}**"
        elif segment['type'] == 'italic':
            result += f"*{translated_segments[i]}*"
        elif segment['type'] == 'code':
            result += f"`{translated_segments[i]}`"
        elif segment['type'] == 'strikethrough':
            result += f"~~{translated_segments[i]}~~"

    return result

@translate_markdown.post('/markdown', status_code=status.HTTP_200_OK)
async def translate_markdown_text(request: TranslationRequest) -> TranslationResponse:
    """Translate a markdown document while preserving its formatting."""
    # NOTE(review): publishing model_id/src/tgt as module globals is how
    # reapply_markdown() receives them; this is not safe if two requests run
    # concurrently with different models/languages — confirm and refactor.
    global model_id, src, tgt
    model_id, src, tgt = fetch_model_data_from_request(request)

    # Split the document into markdown/plain segments and get the stripped text.
    plain_text, segments = remove_markdown(request.text)

    # Translate the stripped text in one call.
    translated_text = translate_text(model_id, plain_text, src, tgt)

    # Re-wrap the markdown markers around the translated segments.
    translated_markdown = reapply_markdown(segments, translated_text)

    return TranslationResponse(translation=translated_markdown)
22 changes: 6 additions & 16 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,16 @@
"languages": {
"en": "English",
"fr": "French",
"de": "German",
"ha": "Hausa",
"apc":"Levantine Arabic",
"sw_cd":"Congolese Swahili",
"ti":"Tigrinya",
"uk":"Ukranian",
"kr": "Kanuri",
"ff": "Fulfulde",
"tr": "Turkish"
"rw": "Kinyarwanda"
},
"models": [
{
"model_type": "nllb",
"checkpoint_id": "nllb-200-distilled-600M",
"multilingual": true,
"alt": "nllb",
"load": false,
"sentence_split": "nltk",
"supported_pairs": ["en-kr", "en-fr", "en-ff", "en-ha"],
"supported_pairs": ["en-rw", "en-fr", "fr-rw", "fr-en","rw-en","rw-fr"],
"pipeline": {
"translate": true
},
Expand All @@ -31,7 +22,6 @@
"model_type": "m2m100",
"checkpoint_id": "m2m100_418M",
"multilingual": true,
"alt":"m2m",
"load": false,
"sentence_split": "nltk",
"supported_pairs": ["en-tr"],
Expand All @@ -41,8 +31,8 @@
},
{
"model_type": "ctranslator2",
"model_path": "nllb-200-distilled-600M-int8",
"alt": "education",
"model_path": "DigitalUmuganda/Quantized_Mbaza_MT_v1",
"alt": "",
"src_sentencepiece_model": "flores200_sacrebleu_tokenizer_spm.model",
"tgt_sentencepiece_model": "flores200_sacrebleu_tokenizer_spm.model",
"multilingual": true,
Expand All @@ -56,7 +46,7 @@
"translate": true,
"recase": true
},
"lang_code_map": {"en": "eng_Latn", "tr": "tur_Latn",
"lang_code_map": {"en": "eng_Latn", "tr": "tur_Latn",
"fr": "fra_Latn", "rw": "kin_Latn"}
},
{
Expand All @@ -66,7 +56,7 @@
"src_sentencepiece_model": "flores200_sacrebleu_tokenizer_spm.model",
"tgt_sentencepiece_model": "flores200_sacrebleu_tokenizer_spm.model",
"multilingual": true,
"load": true,
"load": false,
"sentence_split": "nltk",
"supported_pairs": ["en-rw", "rw-en"],
"pipeline": {
Expand Down
8 changes: 7 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ services:
- 8001:8000
volumes:
- .:/app
- ../translation-models:/app/models
- ../../translation-models:/app/models
- ./config.json:/app/config.json
security_opt:
- seccomp:unconfined
Expand All @@ -23,3 +23,9 @@ services:
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=all

networks:
- infrastructure_default

networks:
infrastructure_default: # external network
external: true
4 changes: 3 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Application entry point: builds the FastAPI app and wires up monitoring."""
from app import create_app

from prometheus_fastapi_instrumentator import Instrumentator
app = create_app()
# Monitoring: instrument the app and expose Prometheus metrics
# (default instrumentator endpoint — confirm the configured path).
Instrumentator().instrument(app).expose(app)
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@ nltk==3.8.1
sentencepiece==0.1.99
torch==2.1.0
transformers==4.34.1
tenacity==8.2.2
prometheus-fastapi-instrumentator==5.9.1
beautifulsoup4