15 changes: 10 additions & 5 deletions libs/infinity_emb/Docker.template.yaml
@@ -12,10 +12,14 @@ cpu:
# RUN sed -i 's|torch = "2.4.1"|torch = "2.5.0"|' pyproject.toml
# RUN sed -i 's|"pypi"|"pytorch_cpu"|' pyproject.toml
# RUN poetry lock --no-update
poetry_extras: "all openvino"
main_install: |
# "RUN poetry install --no-interaction --no-ansi --no-root --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all"
COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/cpu"
extra_env_variables: |
# Sets default to onnx
ENV INFINITY_ENGINE="optimum"

amd:
# 2 . command: jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s amd > Dockerfile.amd_auto
@@ -29,19 +33,20 @@ amd:
# "RUN poetry install --no-interaction --no-ansi --no-root --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all"
COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/rocm6.2"

poetry_extras: "all onnxruntime-gpu"
python_version: python3.10

trt:
base_image: nvidia/cuda:12.1.1-devel-ubuntu22.04
base_image: nvidia/cuda:12.3.2-cudnn9-devel-ubuntu22.04
poetry_extras: "all onnxruntime-gpu"
extra_installs_main: |
# Install utils for tensorrt
RUN apt-get install -y --no-install-recommends openmpi-bin libopenmpi-dev git git-lfs python3-pip
RUN poetry run $PYTHON -m pip install --no-cache-dir flash-attn --no-build-isolation
RUN poetry run $PYTHON -m pip install --no-cache-dir "tensorrt==10.0.1" "tensorrt_lean==10.0.1" "tensorrt_dispatch==10.0.1"
ENV LD_LIBRARY_PATH /app/.venv/lib/${PYTHON}/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/${PYTHON}/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
ENV PATH /app/.venv/lib/${PYTHON}/site-packages/tensorrt/bin:${PATH}
RUN poetry run $PYTHON -m pip install --no-cache-dir "tensorrt==10.3.0" "tensorrt_lean==10.3.0" "tensorrt_dispatch==10.3.0"
extra_env_variables: |
# Set default to tensorrt
ENV LD_LIBRARY_PATH=/app/.venv/lib/${PYTHON}/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/${PYTHON}/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
ENV PATH=/app/.venv/lib/${PYTHON}/site-packages/tensorrt/bin:${PATH}
python_version: python3.10
main_install: "RUN poetry install --no-interaction --no-ansi --no-root --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all"
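
For context, a rough Python equivalent of the generation command quoted in the template comments above (jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s <section>); a minimal sketch assuming the jinja2 and PyYAML packages are installed:

import yaml
from jinja2 import Template

# Pick one section of the template config; output names follow the repo's
# convention (e.g. "cpu" -> Dockerfile.cpu_auto, "trt" -> Dockerfile.trt_onnx_auto).
section, out_path = "cpu", "Dockerfile.cpu_auto"
with open("Docker.template.yaml") as f:
    context = yaml.safe_load(f)[section]
with open("Dockerfile.jinja2") as f:
    rendered = Template(f.read()).render(**context)
with open(out_path, "w") as f:
    f.write(rendered)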
21 changes: 1 addition & 20 deletions libs/infinity_emb/Dockerfile.amd_auto
@@ -91,26 +91,7 @@ COPY --from=testing /app/test_results.txt /app/test_results.txt
ENV HF_HOME=/app/.cache/huggingface
ENV PATH=/app/.venv/bin:$PATH
# do nothing
RUN echo "copied all files"


# Export with tensorrt, not recommended.
# docker buildx build --target=production-tensorrt -f Dockerfile .
# FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS production-tensorrt
# ENV PYTHONUNBUFFERED=1 \
# PIP_NO_CACHE_DIR=off \
# PYTHON="python3.11"
# RUN apt-get update && apt-get install python3-dev python3-pip $PYTHON build-essential curl -y
# COPY --from=builder /app /app
# # force testing stage to run
# COPY --from=testing /app/test_results.txt /app/test_results.txt
# ENV HF_HOME=/app/.cache/torch
# ENV PATH=/app/.venv/bin:$PATH
# RUN pip install --no-cache-dir "onnxruntime-gpu==1.17.0" "tensorrt==8.6.*"
# ENV LD_LIBRARY_PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/$(PYTHON)/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
# ENV PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt/bin:${PATH}
# ENTRYPOINT ["infinity_emb"]

#

# Use a multi-stage build -> production version, with download
# docker buildx build --target=production-with-download \
23 changes: 3 additions & 20 deletions libs/infinity_emb/Dockerfile.cpu_auto
@@ -17,7 +17,7 @@ ENV PYTHONUNBUFFERED=1 \
POETRY_VIRTUALENVS_IN_PROJECT="true" \
# do not ask any interactive question
POETRY_NO_INTERACTION=1 \
EXTRAS="all" \
EXTRAS="all openvino" \
PYTHON="python3.11"
RUN apt-get update && apt-get install --no-install-recommends -y build-essential python3-dev libsndfile1 $PYTHON-venv $PYTHON curl
WORKDIR /app
@@ -91,25 +91,8 @@ COPY --from=testing /app/test_results.txt /app/test_results.txt
ENV HF_HOME=/app/.cache/huggingface
ENV PATH=/app/.venv/bin:$PATH
# do nothing
RUN echo "copied all files"


# Export with tensorrt, not recommended.
# docker buildx build --target=production-tensorrt -f Dockerfile .
# FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS production-tensorrt
# ENV PYTHONUNBUFFERED=1 \
# PIP_NO_CACHE_DIR=off \
# PYTHON="python3.11"
# RUN apt-get update && apt-get install python3-dev python3-pip $PYTHON build-essential curl -y
# COPY --from=builder /app /app
# # force testing stage to run
# COPY --from=testing /app/test_results.txt /app/test_results.txt
# ENV HF_HOME=/app/.cache/torch
# ENV PATH=/app/.venv/bin:$PATH
# RUN pip install --no-cache-dir "onnxruntime-gpu==1.17.0" "tensorrt==8.6.*"
# ENV LD_LIBRARY_PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/$(PYTHON)/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
# ENV PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt/bin:${PATH}
# ENTRYPOINT ["infinity_emb"]
# Sets default to onnx
ENV INFINITY_ENGINE="optimum"


# Use a multi-stage build -> production version, with download
21 changes: 1 addition & 20 deletions libs/infinity_emb/Dockerfile.jinja2
@@ -82,26 +82,7 @@ COPY --from=testing /app/test_results.txt /app/test_results.txt
ENV HF_HOME=/app/.cache/huggingface
ENV PATH=/app/.venv/bin:$PATH
# do nothing
RUN echo "copied all files"


# Export with tensorrt, not recommended.
# docker buildx build --target=production-tensorrt -f Dockerfile .
# FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS production-tensorrt
# ENV PYTHONUNBUFFERED=1 \
# PIP_NO_CACHE_DIR=off \
# PYTHON="python3.11"
# RUN apt-get update && apt-get install python3-dev python3-pip $PYTHON build-essential curl -y
# COPY --from=builder /app /app
# # force testing stage to run
# COPY --from=testing /app/test_results.txt /app/test_results.txt
# ENV HF_HOME=/app/.cache/torch
# ENV PATH=/app/.venv/bin:$PATH
# RUN pip install --no-cache-dir "onnxruntime-gpu==1.17.0" "tensorrt==8.6.*"
# ENV LD_LIBRARY_PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/$(PYTHON)/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
# ENV PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt/bin:${PATH}
# ENTRYPOINT ["infinity_emb"]

{{extra_env_variables | default('#')}}

# Use a multi-stage build -> production version, with download
# docker buildx build --target=production-with-download \
21 changes: 1 addition & 20 deletions libs/infinity_emb/Dockerfile.nvidia_auto
@@ -82,26 +82,7 @@ COPY --from=testing /app/test_results.txt /app/test_results.txt
ENV HF_HOME=/app/.cache/huggingface
ENV PATH=/app/.venv/bin:$PATH
# do nothing
RUN echo "copied all files"


# Export with tensorrt, not recommended.
# docker buildx build --target=production-tensorrt -f Dockerfile .
# FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS production-tensorrt
# ENV PYTHONUNBUFFERED=1 \
# PIP_NO_CACHE_DIR=off \
# PYTHON="python3.11"
# RUN apt-get update && apt-get install python3-dev python3-pip $PYTHON build-essential curl -y
# COPY --from=builder /app /app
# # force testing stage to run
# COPY --from=testing /app/test_results.txt /app/test_results.txt
# ENV HF_HOME=/app/.cache/torch
# ENV PATH=/app/.venv/bin:$PATH
# RUN pip install --no-cache-dir "onnxruntime-gpu==1.17.0" "tensorrt==8.6.*"
# ENV LD_LIBRARY_PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/$(PYTHON)/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
# ENV PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt/bin:${PATH}
# ENTRYPOINT ["infinity_emb"]

#

# Use a multi-stage build -> production version, with download
# docker buildx build --target=production-with-download \
28 changes: 5 additions & 23 deletions libs/infinity_emb/Dockerfile.trt_onnx_auto
@@ -2,7 +2,7 @@
# This file is generated from Dockerfile.jinja2. Do not edit the Dockerfile.cuda|cpu|amd file directly.
# Only contribute to the Dockerfile.jinja2 and dockerfile_template.yaml and regenerate the Dockerfile.cuda|cpu|amd

FROM nvidia/cuda:12.1.1-devel-ubuntu22.04 AS base
FROM nvidia/cuda:12.3.2-cudnn9-devel-ubuntu22.04 AS base

ENV PYTHONUNBUFFERED=1 \
\
@@ -44,9 +44,7 @@ RUN poetry install --no-interaction --no-ansi --extras "${EXTRAS}" --without li
# Install utils for tensorrt
RUN apt-get install -y --no-install-recommends openmpi-bin libopenmpi-dev git git-lfs python3-pip
RUN poetry run $PYTHON -m pip install --no-cache-dir flash-attn --no-build-isolation
RUN poetry run $PYTHON -m pip install --no-cache-dir "tensorrt==10.0.1" "tensorrt_lean==10.0.1" "tensorrt_dispatch==10.0.1"
ENV LD_LIBRARY_PATH /app/.venv/lib/${PYTHON}/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/${PYTHON}/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
ENV PATH /app/.venv/lib/${PYTHON}/site-packages/tensorrt/bin:${PATH}
RUN poetry run $PYTHON -m pip install --no-cache-dir "tensorrt==10.3.0" "tensorrt_lean==10.3.0" "tensorrt_dispatch==10.3.0"



@@ -88,25 +86,9 @@ COPY --from=testing /app/test_results.txt /app/test_results.txt
ENV HF_HOME=/app/.cache/huggingface
ENV PATH=/app/.venv/bin:$PATH
# do nothing
RUN echo "copied all files"


# Export with tensorrt, not recommended.
# docker buildx build --target=production-tensorrt -f Dockerfile .
# FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS production-tensorrt
# ENV PYTHONUNBUFFERED=1 \
# PIP_NO_CACHE_DIR=off \
# PYTHON="python3.11"
# RUN apt-get update && apt-get install python3-dev python3-pip $PYTHON build-essential curl -y
# COPY --from=builder /app /app
# # force testing stage to run
# COPY --from=testing /app/test_results.txt /app/test_results.txt
# ENV HF_HOME=/app/.cache/torch
# ENV PATH=/app/.venv/bin:$PATH
# RUN pip install --no-cache-dir "onnxruntime-gpu==1.17.0" "tensorrt==8.6.*"
# ENV LD_LIBRARY_PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/$(PYTHON)/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
# ENV PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt/bin:${PATH}
# ENTRYPOINT ["infinity_emb"]
# Set default to tensorrt
ENV LD_LIBRARY_PATH=/app/.venv/lib/${PYTHON}/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/${PYTHON}/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
ENV PATH=/app/.venv/lib/${PYTHON}/site-packages/tensorrt/bin:${PATH}


# Use a multi-stage build -> production version, with download
2 changes: 1 addition & 1 deletion libs/infinity_emb/infinity_emb/inference/select_model.py
@@ -82,7 +82,7 @@ def select_model(

if engine_args.model_warmup:
# size one, warm up warm start timings.
loaded_engine.warmup(batch_size=engine_args.batch_size, n_tokens=1)
# loaded_engine.warmup(batch_size=engine_args.batch_size, n_tokens=1)
# size one token
min_inference_t = min(
min(loaded_engine.warmup(batch_size=1, n_tokens=1)[1] for _ in range(10)),
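
A minimal sketch of the timing logic that remains; per the [1] indexing above, warmup is assumed to return a tuple whose second element is the measured latency, and the helper name is hypothetical:

def _min_warmup_latency(engine, runs: int = 10) -> float:
    # Take the best of several single-token runs as a warm-start latency floor.
    return min(engine.warmup(batch_size=1, n_tokens=1)[1] for _ in range(runs))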
@@ -34,7 +34,7 @@ def __init__(self, *, engine_args: EngineArgs):
model_name_or_path=engine_args.model_name_or_path,
revision=engine_args.revision,
use_auth_token=True,
prefer_quantized="cpu" in provider.lower(),
prefer_quantized=("cpu" in provider.lower() or "openvino" in provider.lower()),
)

self.model = optimize_model(
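
The same prefer_quantized predicate change appears in the next file below; a minimal sketch of the shared logic, with a hypothetical helper name:

def _prefers_quantized(provider: str) -> bool:
    # Quantized ONNX weights are preferred on CPU-like providers only.
    p = provider.lower()
    return "cpu" in p or "openvino" in p

assert _prefers_quantized("OpenVINOExecutionProvider")
assert _prefers_quantized("CPUExecutionProvider")
assert not _prefers_quantized("CUDAExecutionProvider")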
@@ -42,7 +42,7 @@ def __init__(self, *, engine_args: EngineArgs):
model_name_or_path=engine_args.model_name_or_path,
revision=engine_args.revision,
use_auth_token=True,
prefer_quantized="cpu" in provider.lower(),
prefer_quantized=("cpu" in provider.lower() or "openvino" in provider.lower()),
)

self.pooling = (
27 changes: 21 additions & 6 deletions libs/infinity_emb/infinity_emb/transformer/utils_optimum.py
@@ -8,12 +8,13 @@
from huggingface_hub import HfApi, HfFolder # type: ignore
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE # type: ignore

from infinity_emb._optional_imports import CHECK_ONNXRUNTIME, CHECK_TORCH
from infinity_emb._optional_imports import CHECK_ONNXRUNTIME
from infinity_emb.log_handler import logger
from infinity_emb.primitives import Device

if CHECK_ONNXRUNTIME.is_available:
try:
import onnxruntime as ort # type: ignore
from optimum.modeling_base import OptimizedModel # type: ignore
from optimum.onnxruntime import ( # type: ignore
ORTModel,
@@ -23,9 +24,6 @@
except (ImportError, RuntimeError, Exception) as ex:
CHECK_ONNXRUNTIME.mark_dirty(ex)

if CHECK_TORCH.is_available:
import torch


def mean_pooling(last_hidden_states: np.ndarray, attention_mask: np.ndarray):
input_mask_expanded = (np.expand_dims(attention_mask, axis=-1)).astype(float)
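
An illustrative call, assuming the conventional masked-mean implementation whose first line is shown above (only unmasked positions contribute to the average):

import numpy as np
from infinity_emb.transformer.utils_optimum import mean_pooling

hidden = np.array([[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]])  # (batch, seq, hidden)
mask = np.array([[1, 1, 0]])  # third position is padding
print(mean_pooling(hidden, mask))  # expected roughly [[2.0, 3.0]]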
@@ -49,17 +47,32 @@ def normalize(input_array, p=2, dim=1, eps=1e-12):


def device_to_onnx(device: Device) -> str:
CHECK_ONNXRUNTIME.mark_required()
available = ort.get_available_providers()

if device == Device.cpu:
if "OpenVINOExecutionProvider" in available:
return "OpenVINOExecutionProvider"
return "CPUExecutionProvider"
elif device == Device.cuda:
if "ROCMExecutionProvider" in available:
return "ROCMExecutionProvider"
return "CUDAExecutionProvider"
elif device == Device.mps:
return "CoreMLExecutionProvider"
elif device == Device.tensorrt:
return "TensorrtExecutionProvider"
elif device is None or device == Device.auto:
if CHECK_TORCH.is_available and torch.cuda.is_available():
if "TensorrtExecutionProvider" in available:
return "TensorrtExecutionProvider"
elif "CUDAExecutionProvider" in available:
return "CUDAExecutionProvider"
elif "ROCMExecutionProvider" in available:
return "ROCMExecutionProvider"
elif "CoreMLExecutionProvider" in available:
return "CoreMLExecutionProvider"
elif "OpenVINOExecutionProvider" in available:
return "OpenVINOExecutionProvider"
else:
return "CPUExecutionProvider"
else:
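
A short usage sketch of the new provider-driven selection; the printed values depend entirely on which execution providers the installed onnxruntime build ships:

import onnxruntime as ort
from infinity_emb.primitives import Device
from infinity_emb.transformer.utils_optimum import device_to_onnx

print(ort.get_available_providers())
# e.g. ['CUDAExecutionProvider', 'CPUExecutionProvider'] on a CUDA wheel
print(device_to_onnx(Device.auto))
# 'CUDAExecutionProvider' on that build; a CPU build that ships OpenVINO
# would resolve to 'OpenVINOExecutionProvider' instead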
@@ -135,7 +148,9 @@ def optimize_model(

optimizer = ORTOptimizer.from_pretrained(unoptimized_model)

is_gpu = "cpu" not in execution_provider.lower()
is_gpu = not (
"cpu" in execution_provider.lower() or "openvino" in execution_provider.lower()
)
optimization_config = OptimizationConfig(
optimization_level=99,
optimize_with_onnxruntime_only=False,
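
A hedged sketch of how is_gpu plausibly feeds the rest of the optimization config; only optimization_level and optimize_with_onnxruntime_only are visible in this diff, so the fp16 kwarg is an assumption about the truncated lines:

from optimum.onnxruntime.configuration import OptimizationConfig

ep = "CUDAExecutionProvider"  # hypothetical provider name for illustration
is_gpu = not ("cpu" in ep.lower() or "openvino" in ep.lower())
optimization_config = OptimizationConfig(
    optimization_level=99,
    optimize_with_onnxruntime_only=False,
    fp16=is_gpu,  # assumption: half precision only on non-CPU providers
)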