+# If you intend to run your MT API on a GPU server, it is advisable to use this Dockerfile
/app +WORKDIR /app + +COPY ./app/nltk_pkg.py /app/nltk_pkg.py +RUN python3 /app/nltk_pkg.py \ No newline at end of file diff --git a/app/__init__.py b/app/__init__.py index a7daa86..5ccdd05 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -15,8 +15,14 @@ def create_app() -> FastAPI: ) from app.views.v1.translate import translate_v1 + from app.views.v1.translate_html import translate_html + from app.views.v1.translate_markdown import translate_markdown + app.include_router(translate_v1) + app.include_router(translate_html) + app.include_router(translate_markdown) + @app.on_event('startup') async def startup_event() -> None: diff --git a/app/utils/translators.py b/app/utils/translators.py index 3eedfa3..425387b 100644 --- a/app/utils/translators.py +++ b/app/utils/translators.py @@ -39,7 +39,7 @@ def get_ctranslator(ctranslator_model_path: str) -> Callable: # ] def translator(text, src=None, tgt=None): - return ctranslator.translate_batch([text])[0][0]['tokens'] + return ctranslator.translate_batch([text],disable_unk=True)[0][0]['tokens'] return translator @@ -62,7 +62,9 @@ def translator(src_texts, src=None, tgt=None): target_prefix = [[tgt]] * len(src_texts) src_texts = [sent + ["", src] for sent in src_texts] - translations = ctranslator.translate_batch(src_texts, target_prefix=target_prefix) + #translations = ctranslator.translate_batch([src_texts],disable_unk=True, target_prefix=target_prefix) + #translations = ctranslator.translate_batch([src_texts], target_prefix=target_prefix) + translations = ctranslator.translate_batch(src_texts,disable_unk=True, target_prefix=target_prefix) translations = [translation.hypotheses[0][1:] for translation in translations] else: translations = [s.hypotheses[0] for s in ctranslator.translate_batch(src_texts)] diff --git a/app/views/v1/translate.py b/app/views/v1/translate.py index 5addf77..462d6e7 100644 --- a/app/views/v1/translate.py +++ b/app/views/v1/translate.py @@ -12,6 +12,10 @@ ) from app.utils.translate import 
translate_text from app.constants import MULTIMODALCODE +import re +from app.views.v1.translate import * +from transformers import pipeline + translate_v1 = APIRouter(prefix='/api/v1/translate') @@ -108,3 +112,4 @@ async def languages() -> LanguagesResponse: return LanguagesResponse( languages=config.language_codes, models=config.languages_list ) + diff --git a/app/views/v1/translate_html.py b/app/views/v1/translate_html.py new file mode 100644 index 0000000..b1286fd --- /dev/null +++ b/app/views/v1/translate_html.py @@ -0,0 +1,34 @@ +from app.views.v1.translate import * +from bs4 import BeautifulSoup, NavigableString +import random +import os + +translate_html = APIRouter(prefix='/api/v1/translate_html') + +@translate_html.post('/translate_page', status_code=status.HTTP_200_OK) +async def modify_html_content(request: TranslationRequest): + + model_id, src, tgt = fetch_model_data_from_request(request) + # Parse the HTML content + soup = BeautifulSoup(request.text, 'html.parser') + + + def edit_text(element,model_id,src,tgt): + """ + Recursively edits the text of the given BeautifulSoup element and its children. 
+ """ + if isinstance(element, NavigableString): + return + if element.name in ['script', 'style']: + return + + for child in element.children: + if isinstance(child, NavigableString): + edited_text = translate_text(model_id,child,src,tgt) + child.replace_with(edited_text) + else: + edit_text(element=child,model_id=model_id,src=src,tgt=tgt) + + edit_text(element=soup,model_id=model_id,src=src,tgt=tgt) + + return TranslationResponse(translation=soup) diff --git a/app/views/v1/translate_markdown.py b/app/views/v1/translate_markdown.py new file mode 100644 index 0000000..d2ff042 --- /dev/null +++ b/app/views/v1/translate_markdown.py @@ -0,0 +1,113 @@ +from fastapi import APIRouter, status +from app.views.v1.translate import TranslationRequest, TranslationResponse, fetch_model_data_from_request, translate_text +import re + +translate_markdown = APIRouter(prefix='/api/v1/translate_markdown') + +def remove_markdown(text): + """Remove markdown formatting from text while preserving positions.""" + segments = [] + current_pos = 0 + + # Define markdown patterns + patterns = [ + (r'(\*\*|__)(.*?)\1', 'bold'), # Bold + (r'(\*|_)(.*?)\1', 'italic'), # Italics + (r'\[(.*?)\]\((.*?)\)', 'link'), # Links + (r'`(.*?)`', 'code'), # Inline code + (r'~~(.*?)~~', 'strikethrough') # Strikethrough + ] + + # Find all markdown segments + all_matches = [] + for pattern, mark_type in patterns: + for match in re.finditer(pattern, text): + content = match.group(2) if mark_type != 'link' else match.group(1) + all_matches.append({ + 'start': match.start(), + 'end': match.end(), + 'content': content, + 'original': match.group(0), + 'type': mark_type, + 'url': match.group(2) if mark_type == 'link' else None + }) + + # Sort matches by position + all_matches.sort(key=lambda x: x['start']) + + # Split text into markdown and non-markdown segments + for match in all_matches: + if current_pos < match['start']: + # Add non-markdown text segment + segments.append({ + 'type': 'text', + 'content': 
text[current_pos:match['start']], + 'original': text[current_pos:match['start']] + }) + segments.append(match) + current_pos = match['end'] + + # Add remaining text if any + if current_pos < len(text): + segments.append({ + 'type': 'text', + 'content': text[current_pos:], + 'original': text[current_pos:] + }) + + # Extract plain text for translation + plain_text = ''.join([seg['content'] for seg in segments]) + + return plain_text, segments + +def reapply_markdown(segments, translated_text): + """Reapply markdown formatting to translated text while handling different language lengths.""" + result = "" + translated_segments = [] + current_pos = 0 + + # Split translated text into segments based on original segment lengths and positions + for segment in segments: + # Get the next chunk of translated text + if segment['type'] == 'text': + # For non-markdown text, take the next portion of translated text + translated_content = translated_text[current_pos:current_pos + len(segment['content'])] + current_pos += len(segment['content']) + translated_segments.append(translated_content) + else: + # For markdown text, translate the content separately to maintain formatting + translated_content = translate_text(model_id, segment['content'], src, tgt) + translated_segments.append(translated_content) + + # Reconstruct the markdown text + for i, segment in enumerate(segments): + if segment['type'] == 'text': + result += translated_segments[i] + elif segment['type'] == 'link': + result += f"[{translated_segments[i]}]({segment['url']})" + elif segment['type'] == 'bold': + result += f"**{translated_segments[i]}**" + elif segment['type'] == 'italic': + result += f"*{translated_segments[i]}*" + elif segment['type'] == 'code': + result += f"`{translated_segments[i]}`" + elif segment['type'] == 'strikethrough': + result += f"~~{translated_segments[i]}~~" + + return result + +@translate_markdown.post('/markdown', status_code=status.HTTP_200_OK) +async def translate_markdown_text(request: 
TranslationRequest) -> TranslationResponse: + global model_id, src, tgt + model_id, src, tgt = fetch_model_data_from_request(request) + + # Split text into segments and get plain text + plain_text, segments = remove_markdown(request.text) + + # Translate the main text + translated_text = translate_text(model_id, plain_text, src, tgt) + + # Reapply markdown with proper translations + translated_markdown = reapply_markdown(segments, translated_text) + + return TranslationResponse(translation=translated_markdown) \ No newline at end of file diff --git a/config.json b/config.json index 51e1262..51ec2f0 100644 --- a/config.json +++ b/config.json @@ -2,25 +2,16 @@ "languages": { "en": "English", "fr": "French", - "de": "German", - "ha": "Hausa", - "apc":"Levantine Arabic", - "sw_cd":"Congolese Swahili", - "ti":"Tigrinya", - "uk":"Ukranian", - "kr": "Kanuri", - "ff": "Fulfulde", - "tr": "Turkish" + "rw": "Kinyarwanda" }, "models": [ { "model_type": "nllb", "checkpoint_id": "nllb-200-distilled-600M", "multilingual": true, - "alt": "nllb", "load": false, "sentence_split": "nltk", - "supported_pairs": ["en-kr", "en-fr", "en-ff", "en-ha"], + "supported_pairs": ["en-rw", "en-fr", "fr-rw", "fr-en","rw-en","rw-fr"], "pipeline": { "translate": true }, @@ -31,7 +22,6 @@ "model_type": "m2m100", "checkpoint_id": "m2m100_418M", "multilingual": true, - "alt":"m2m", "load": false, "sentence_split": "nltk", "supported_pairs": ["en-tr"], @@ -41,8 +31,8 @@ }, { "model_type": "ctranslator2", - "model_path": "nllb-200-distilled-600M-int8", - "alt": "education", + "model_path": "DigitalUmuganda/Quantized_Mbaza_MT_v1", + "alt": "", "src_sentencepiece_model": "flores200_sacrebleu_tokenizer_spm.model", "tgt_sentencepiece_model": "flores200_sacrebleu_tokenizer_spm.model", "multilingual": true, @@ -56,7 +46,7 @@ "translate": true, "recase": true }, - "lang_code_map": {"en": "eng_Latn", "tr": "tur_Latn", + "lang_code_map": {"en": "eng_Latn", "tr": "tur_Latn", "fr": "fra_Latn", "rw": "kin_Latn"} }, 
+# Monitoring: instrument the app and expose Prometheus metrics at /metrics