diff --git a/.github/workflows/app-release.yml b/.github/workflows/app-release.yml
index aa9a43e..e89aeb0 100644
--- a/.github/workflows/app-release.yml
+++ b/.github/workflows/app-release.yml
@@ -31,19 +31,21 @@ permissions:
 
 jobs:
   Update-App-Config:
-    runs-on: ubuntu-latest
+    runs-on: gha-runner-supervisely
     outputs:
       RELEASE_TYPE: ${{ steps.update_app_config.outputs.RELEASE_TYPE }}
       COMMIT_HASH: ${{ steps.update_app_config.outputs.COMMIT_HASH }}
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
         with:
           token: ${{ secrets.GH_ACCESS_TOKEN }}
           ref: master
+
       - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
         with:
-          python-version: 3.8
+          python-version: "3.12"
+
       - name: Update App Config
         env:
           GITHUB_TOKEN: ${{ secrets.GH_ACCESS_TOKEN }}
@@ -54,7 +56,7 @@ jobs:
             VERSION="${VERSION:1}"
           fi
           echo "DOCKER_IMAGE=$VERSION" >> $GITHUB_ENV
-          
+
           SEMVER_REGEX="^[0-9]+\.[0-9]+\.[0-9]+$"
           if [[ ! $VERSION =~ $SEMVER_REGEX ]]; then
             RELEASE_TYPE="release-branch"
@@ -63,7 +65,7 @@ jobs:
           fi
           echo "RELEASE_TYPE=$RELEASE_TYPE" >> $GITHUB_OUTPUT
           echo "RELEASE_TYPE=$RELEASE_TYPE"
-          
+
           python app/update_config.py $VERSION
 
           git add app/config.json
diff --git a/.github/workflows/build-push-dev.yml b/.github/workflows/build-push-dev.yml
index 8f7885d..5677baf 100644
--- a/.github/workflows/build-push-dev.yml
+++ b/.github/workflows/build-push-dev.yml
@@ -5,41 +5,30 @@ on:
 
 jobs:
   build:
-    runs-on: ubuntu-latest
+    runs-on: gha-runner-supervisely
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
 
       - name: Write Tag to ENV variable
         run: echo "BRANCH_NAME=${{ github.event.release.tag_name }}" >> $GITHUB_ENV
-      - name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKER_USERNAME_COMMUNITY }}
-          password: ${{ secrets.DOCKER_TOKEN_COMMUNITY }}
+
       - name: Get Docker Labels from python script
         run: python .github/workflows/docker_labels.py
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-        with:
-          version: v0.9.1
+
       - name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          context: ./
-          file: ./Dockerfile
-          provenance: false
-          builder: ${{ steps.buildx.outputs.name }}
-          push: true
-          tags: |
-            supervisely/agent:dev
-          build-args: |
-            LABEL_VERSION=agent:6.999.0
-            LABEL_INFO=${{ env.LABEL_INFO }}
-            LABEL_MODES=${{ env.LABEL_MODES }}
-            LABEL_README=${{ env.LABEL_README }}
-            LABEL_BUILT_AT=${{ env.LABEL_BUILT_AT }}
-          cache-from: type=registry,ref=supervisely/agent:cache
-          cache-to: type=registry,ref=supervisely/agent:cache,mode=max
-          # cache-from: type=gha
-          # cache-to: type=gha,mode=max
+        run: |
+          # Quote the LABEL_* env values so spaces/quotes in them survive shell word-splitting.
+          buildctl build \
+            --frontend dockerfile.v0 \
+            --local context=. \
+            --local dockerfile=. \
+            --output type=image,name=supervisely/agent:dev,push=true \
+            --opt provenance=false \
+            --opt build-arg=LABEL_VERSION=agent:6.999.0 \
+            --opt "build-arg=LABEL_INFO=${LABEL_INFO}" \
+            --opt "build-arg=LABEL_MODES=${LABEL_MODES}" \
+            --opt "build-arg=LABEL_README=${LABEL_README}" \
+            --opt "build-arg=LABEL_BUILT_AT=${LABEL_BUILT_AT}" \
+            --import-cache type=registry,ref=supervisely/agent:dev-cache \
+            --export-cache type=registry,ref=supervisely/agent:dev-cache,mode=max
diff --git a/.github/workflows/build-push-release.yml b/.github/workflows/build-push-release.yml
index 0dfe4a6..c2b93aa 100644
--- a/.github/workflows/build-push-release.yml
+++ b/.github/workflows/build-push-release.yml
@@ -6,73 +6,47 @@ on:
 
 jobs:
   build:
-    runs-on: ubuntu-latest
+    runs-on: gha-runner-supervisely
     steps:
-      - name: Free Disk Space (Ubuntu)
-        uses: jlumbroso/free-disk-space@main
-        with:
-          tool-cache: false
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: true
-          docker-images: true
-          swap-storage: true
-
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
 
       - name: Echo ${{ github.event.release.tag_name }}
         run: echo ${{ github.event.release.tag_name }}
 
       - name: Write version to ENV variable
         run: |
-          TAG_NAME=${{ github.event.release.tag_name }} 
-          echo "LABEL_VERSION=${TAG_NAME:1}" >> $GITHUB_ENV 
-
-      - name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKER_USERNAME_COMMUNITY }}
-          password: ${{ secrets.DOCKER_TOKEN_COMMUNITY }}
+          TAG_NAME=${{ github.event.release.tag_name }}
+          echo "LABEL_VERSION=${TAG_NAME:1}" >> $GITHUB_ENV
 
       - name: Get Docker Labels from python script
         run: python .github/workflows/docker_labels.py
 
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-        with:
-          version: v0.9.1
-
       - name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          context: ./
-          file: ./Dockerfile
-          provenance: false
-          builder: ${{ steps.buildx.outputs.name }}
-          push: true
-          tags: |
-            supervisely/agent:${{ env.LABEL_VERSION }}
-          build-args: |
-            LABEL_VERSION=agent:${{ env.LABEL_VERSION }}
-            LABEL_INFO=${{ env.LABEL_INFO }}
-            LABEL_MODES=${{ env.LABEL_MODES }}
-            LABEL_README=${{ env.LABEL_README }}
-            LABEL_BUILT_AT=${{ env.LABEL_BUILT_AT }}
-          cache-from: type=registry,ref=supervisely/agent:cache
-          cache-to: type=registry,ref=supervisely/agent:cache,mode=max
-          # cache-from: type=gha
-          # cache-to: type=gha,mode=max
-      
+        run: |
+          # Quote the LABEL_* env values so spaces/quotes in them survive shell word-splitting.
+          buildctl build \
+            --frontend dockerfile.v0 \
+            --local context=. \
+            --local dockerfile=. \
+            --output "type=image,name=supervisely/agent:${LABEL_VERSION},push=true" \
+            --opt provenance=false \
+            --opt "build-arg=LABEL_VERSION=agent:${LABEL_VERSION}" \
+            --opt "build-arg=LABEL_INFO=${LABEL_INFO}" \
+            --opt "build-arg=LABEL_MODES=${LABEL_MODES}" \
+            --opt "build-arg=LABEL_README=${LABEL_README}" \
+            --opt "build-arg=LABEL_BUILT_AT=${LABEL_BUILT_AT}" \
+            --import-cache type=registry,ref=supervisely/agent:cache \
+            --export-cache type=registry,ref=supervisely/agent:cache,mode=max
+
 
   app-release:
     needs: build
     permissions:
       contents: write
       actions: write
     uses: supervisely/agent/.github/workflows/app-release.yml@app-release
-    secrets: 
+    secrets:
       SUPERVISELY_DEV_API_TOKEN: "${{ secrets.SUPERVISELY_DEV_API_TOKEN }}"
       SUPERVISELY_PRIVATE_DEV_API_TOKEN: "${{ secrets.SUPERVISELY_PRIVATE_DEV_API_TOKEN }}"
       SUPERVISELY_PROD_API_TOKEN: "${{ secrets.SUPERVISELY_PROD_API_TOKEN }}"
@@ -82,4 +56,3 @@ jobs:
       SUPERVISELY_PROD_SERVER_ADDRESS: "${{ vars.SUPERVISELY_PROD_SERVER_ADDRESS }}"
       RELEASE_VERSION: "${{ github.event.release.tag_name }}"
       RELEASE_DESCRIPTION: "${{ github.event.release.name }}"
-
diff --git a/.github/workflows/manual-build-push-release.yml b/.github/workflows/manual-build-push-release.yml
index 311b5e1..d98dcb3 100644
--- a/.github/workflows/manual-build-push-release.yml
+++ b/.github/workflows/manual-build-push-release.yml
@@ -14,55 +14,37 @@ on:
 
 jobs:
   build:
-    runs-on: ubuntu-latest
+    runs-on: gha-runner-supervisely
     steps:
-      - name: Free Disk Space (Ubuntu)
-        uses: jlumbroso/free-disk-space@main
-        with:
-          tool-cache: false
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: true
-          docker-images: true
-          swap-storage: true
 
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
+
       - name: Tag Version
         run: echo ${{ github.event.inputs.tag_version }}
+
       - name: Write Tag to ENV variable
         run: echo "LABEL_VERSION=${{ github.event.inputs.tag_version }}" >> $GITHUB_ENV
-      - name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKER_USERNAME_COMMUNITY }}
-          password: ${{ secrets.DOCKER_TOKEN_COMMUNITY }}
+
       - name: Get Docker Labels from python script
         run: python .github/workflows/docker_labels.py
 
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-        with:
-          version: v0.9.1
-      - name: Build and push 
-        uses: docker/build-push-action@v5
-        with:
-          context: ./
-          file: ./Dockerfile
-          provenance: false
-          builder: ${{ steps.buildx.outputs.name }}
-          push: true
-          tags: |
-            supervisely/agent:${{ env.LABEL_VERSION }}
-          build-args: |
-            LABEL_VERSION=agent:${{ env.LABEL_VERSION }}
-            LABEL_INFO=${{ env.LABEL_INFO }}
-            LABEL_MODES=${{ env.LABEL_MODES }}
-            LABEL_README=${{ env.LABEL_README }}
-            LABEL_BUILT_AT=${{ env.LABEL_BUILT_AT }}
-          cache-from: type=registry,ref=supervisely/agent:cache
-          cache-to: type=registry,ref=supervisely/agent:cache,mode=max
+      - name: Build and push
+        run: |
+          # Quote the LABEL_* env values so spaces/quotes in them survive shell word-splitting.
+          buildctl build \
+            --frontend dockerfile.v0 \
+            --local context=. \
+            --local dockerfile=. \
+            --output "type=image,name=supervisely/agent:${LABEL_VERSION},push=true" \
+            --opt provenance=false \
+            --opt "build-arg=LABEL_VERSION=agent:${LABEL_VERSION}" \
+            --opt "build-arg=LABEL_INFO=${LABEL_INFO}" \
+            --opt "build-arg=LABEL_MODES=${LABEL_MODES}" \
+            --opt "build-arg=LABEL_README=${LABEL_README}" \
+            --opt "build-arg=LABEL_BUILT_AT=${LABEL_BUILT_AT}" \
+            --import-cache type=registry,ref=supervisely/agent:cache \
+            --export-cache type=registry,ref=supervisely/agent:cache,mode=max
 
   app-release:
     needs: build
@@ -80,4 +62,3 @@ jobs:
       SUPERVISELY_PROD_SERVER_ADDRESS: "${{ vars.SUPERVISELY_PROD_SERVER_ADDRESS }}"
       RELEASE_VERSION: "${{ inputs.tag_version }}"
       RELEASE_DESCRIPTION: "${{ inputs.release_description }}"
-
diff --git a/.github/workflows/pr-check-with-pylint.yml b/.github/workflows/pr-check-with-pylint.yml
index 57e6c08..dd03544 100644
--- a/.github/workflows/pr-check-with-pylint.yml
+++ b/.github/workflows/pr-check-with-pylint.yml
@@ -7,67 +7,81 @@ on:
 
 jobs:
   pylint:
-    runs-on: ubuntu-22.04
+    runs-on: gha-runner-supervisely
     env:
-      ISSUES_URL: "https://api.github.com/repos/supervisely/issues/issues"
-      PROJECT_NUMBER: "2"
-      TODO_NAME: "🚀 Todo (now!)"
-      STATUS_FIELD_ID: ""
-      STATUS_ID: ""
-      ISSUE_NODE_ID: ""
-      ITEM_TO_MOVE: ""
-      ORG_PROJECT_ID: ""
-      ORG_LOGIN: "supervisely"
+      # ISSUES_URL: "https://api.github.com/repos/supervisely/issues/issues"
+      # PROJECT_NUMBER: "2"
+      # TODO_NAME: "🚀 Todo (now!)"
+      # STATUS_FIELD_ID: ""
+      # STATUS_ID: ""
+      # ISSUE_NODE_ID: ""
+      # ITEM_TO_MOVE: ""
+      # ORG_PROJECT_ID: ""
+      # ORG_LOGIN: "supervisely"
       ERRORS_DETECTED: false
     steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-      with:
-        repository: ${{ github.repository }}
-        token: ${{ secrets.PYLINT_TOKEN }}
-
-    - name: Set up Python
-      uses: actions/setup-python@v5
-      with:
-        python-version: 3.8
-
-    - name: Install dependencies
-      run: |
-        sudo apt-get update
-        sudo apt-get install python3-all-dev libboost-python-dev libexiv2-dev
-        sudo ln -s /usr/lib/x86_64-linux-gnu/libboost_python310.so /usr/lib/x86_64-linux-gnu/libboost_python38.so
-        pip install --upgrade pip
-        pip install pylint
-        pip install -r requirements.txt
-
-
-    - name: Run check with pylint
-      run: |
-        export PYTHONPATH=$PYTHONPATH:$PWD/agent/
-        pylint_output=$(pylint --ignore-patterns=".*\.json$|.*\.gitignore$" "agent") || true
-        if [[ $pylint_output == *"E"* ]] || [[ $pylint_output == *"F"* ]]; then
-          # Save pylint output to a file
-          echo "$pylint_output" > pylint_errors.txt
-          echo "ERRORS_DETECTED=true" >> $GITHUB_ENV
-        else
-          echo "ERRORS_DETECTED=false" >> $GITHUB_ENV
-        fi
-      id: pylint
-
-    - name: Fail if pylint errors detected
-      if: ${{ env.ERRORS_DETECTED == 'true' }}
-      run: |
-        issue_body=$(cat pylint_errors.txt)
-        echo "Pylint Errors: $issue_body"
-        echo "RESULT=failure" >> $GITHUB_ENV
-        exit 1
+      - name: Checkout repository
+        uses: actions/checkout@v6
+        with:
+          repository: ${{ github.repository }}
+          token: ${{ secrets.PYLINT_TOKEN }}
+
+      - name: Get system python version
+        run: |
+          PYVER=$(python3 -c "import sys; print(f'{sys.version_info.major}{sys.version_info.minor}')")
+          echo "Detected Python version: $PYVER"
+          echo "PYVER=$PYVER" >> $GITHUB_ENV
+
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.12"
+
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y python3-all-dev libboost-python-dev libexiv2-dev
+
+      - name: Create Boost Python symlink
+        run: |
+          # Skip when PYVER is already 312 (the link would point at itself); -f keeps re-runs idempotent.
+          [ "${PYVER}" = "312" ] || sudo ln -sf \
+            "/usr/lib/x86_64-linux-gnu/libboost_python${PYVER}.so" \
+            "/usr/lib/x86_64-linux-gnu/libboost_python312.so"
+
+      - name: Install pylint and requirements
+        run: |
+          pip install --upgrade pip
+          pip install pylint
+          pip install -r requirements.txt
+
+      - name: Run check with pylint
+        run: |
+          export PYTHONPATH=$PYTHONPATH:$PWD/agent/
+          pylint_output=$(pylint --ignore-patterns=".*\.json$|.*\.gitignore$" "agent") || true
+          if [[ $pylint_output == *"E"* ]] || [[ $pylint_output == *"F"* ]]; then
+            # Save pylint output to a file
+            echo "$pylint_output" > pylint_errors.txt
+            echo "ERRORS_DETECTED=true" >> $GITHUB_ENV
+          else
+            echo "ERRORS_DETECTED=false" >> $GITHUB_ENV
+          fi
+        id: pylint
+
+      - name: Fail if pylint errors detected
+        if: ${{ env.ERRORS_DETECTED == 'true' }}
+        run: |
+          issue_body=$(cat pylint_errors.txt)
+          echo "Pylint Errors: $issue_body"
+          echo "RESULT=failure" >> $GITHUB_ENV
+          exit 1
 
     # - name: Create GitHub issue
     #   run: |
     #     if [[ "${{ env.ERRORS_DETECTED }}" == "true" ]]; then
     #       issue_body=$(cat pylint_errors.txt)
     #       echo "Issue body: $issue_body"
-    
+
     #       json=$(jq -n \
     #         --arg title "Pylint Errors for ${{ github.event_name }} #${{ github.event.release.tag_name }} at $(date -u +'%Y-%m-%d %H:%M') UTC+0" \
     #         --arg body "$issue_body" \
@@ -87,13 +101,12 @@ jobs:
     #       echo "issue_url=$issue_url" >> $GITHUB_ENV
     #       issue_node_id=$(echo "$issue_response" | jq -r '.node_id')
     #       echo "ISSUE_NODE_ID=$issue_node_id" >> $GITHUB_ENV
-    
+
     #     else
     #       echo "No pylint errors detected."
     #     fi
-
-
-    # - name: Get Project ID 
+
+    # - name: Get Project ID
     #   if: ${{ env.ERRORS_DETECTED == 'true' }}
     #   run: |
     #     org_login=${{ env.ORG_LOGIN }}
@@ -112,10 +125,10 @@ jobs:
     #       https://api.github.com/graphql)
     #     echo "Response from GitHub API: $response"
     #     project_id=$(echo "$response" | jq -r '.data.organization.projectV2.id')
-    
+
     #     echo "Organization Project ID: $project_id"
     #     echo "ORG_PROJECT_ID=$project_id" >> $GITHUB_ENV
-    
+
     # - name: Get Status Field ID and Status ID
     #   if: ${{ env.ERRORS_DETECTED == 'true' }}
     #   run: |
@@ -130,10 +143,10 @@ jobs:
     #       }' \
     #       https://api.github.com/graphql)
     #     echo "Response from GitHub API: $response"
-    #     status_field_id=$(echo "$response" | jq -r '.data.node.fields.nodes[] | select(.name == "Status") | .id') 
+    #     status_field_id=$(echo "$response" | jq -r '.data.node.fields.nodes[] | select(.name == "Status") | .id')
     #     status_id=$(echo "$response" | jq -r '.data.node.fields.nodes[] | select(.name == "Status") | .options[] | select(.name == "${{ env.TODO_NAME }}") | .id')
     #     echo "STATUS_FIELD_ID=$status_field_id" >> $GITHUB_ENV
-    #     echo "STATUS_ID=$status_id" >> $GITHUB_ENV 
+    #     echo "STATUS_ID=$status_id" >> $GITHUB_ENV
     #     echo "Status Field ID: $status_id"
     #     echo "Todo ID: $todo_id"
 
@@ -142,7 +155,7 @@ jobs:
     #   run: |
     #     issue_id=${{ env.ISSUE_NODE_ID }}
     #     project_id=${{ env.ORG_PROJECT_ID }}
-    
+
     #     item_id=$(curl -X POST -H "Authorization: Bearer ${{ secrets.PYLINT_TOKEN }}" \
     #       -H "Accept: application/vnd.github.v3+json" \
     #       https://api.github.com/graphql \
@@ -164,10 +177,10 @@ jobs:
     #       }
     #     EOF
     #     )
-    
+
     #     echo "Item ID: $item_id"
     #     echo "ITEM_TO_MOVE=$item_id" >> $GITHUB_ENV
-    
+
     # - name: Move issue to Todo column
     #   if: ${{ env.ERRORS_DETECTED == 'true' }}
     #   run: |
diff --git a/Dockerfile b/Dockerfile
index 0ef2e80..ed10378 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,3 @@
-# FROM ubuntu:24.04 # No GPU support
 FROM nvidia/cuda:12.8.1-cudnn-runtime-ubuntu24.04
 
 ARG LABEL_VERSION
@@ -21,8 +20,11 @@ ENV \
     PIP_BREAK_SYSTEM_PACKAGES=1 \
     PATH=/root/.local/bin:$PATH
 
-RUN apt-get update \
-    && apt-get install -y --no-install-recommends \
+COPY requirements.txt /workdir/requirements.txt
+
+RUN set -eux; \
+    apt-get update; \
+    apt-get install -y --no-install-recommends \
         build-essential \
         python3.12 \
         python3.12-venv \
@@ -31,7 +33,7 @@ RUN apt-get update \
         python3-grpcio \
         libexiv2-27 \
         libexiv2-dev \
-        libboost-all-dev \
+        libboost-python-dev \
         libgeos-dev \
         libsm6 \
         libxext6 \
@@ -59,35 +61,22 @@ RUN apt-get update \
         html2text \
         htop \
         tree \
-    && ln -sf /usr/bin/python3.12 /usr/bin/python \
-    && ln -sf /usr/bin/pip3 /usr/bin/pip \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/* /var/log/dpkg.log
-
-RUN python -m pip install --ignore-installed --upgrade pip setuptools wheel
-
-RUN python -m pip install torch==2.9.1 torchvision==0.24.1 --index-url https://download.pytorch.org/whl/cu128
-
-# Install runtime dependencies that must stay
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
         libmagic-dev \
+        libmagic1 \
         openssh-server \
         ffmpeg \
         fonts-noto \
-    && mkdir -p /var/run/sshd \
-    && apt-get -qq -y autoremove \
-    && apt-get autoclean \
-    && rm -rf /var/lib/apt/lists/*
-
-COPY requirements.txt /workdir/requirements.txt
-RUN python -m pip install --no-cache-dir -r /workdir/requirements.txt
-
-RUN apt-get purge -y --auto-remove \
+    ; \
+    mkdir -p /var/run/sshd; \
+    ln -sf /usr/bin/python3.12 /usr/bin/python; \
+    ln -sf /usr/bin/pip3 /usr/bin/pip; \
+    python -m pip install --ignore-installed --upgrade pip setuptools wheel; \
+    python -m pip install --no-cache-dir -r /workdir/requirements.txt; \
+    apt-get purge -y --auto-remove \
         build-essential \
         python3.12-dev \
         libexiv2-dev \
-        libboost-all-dev \
+        libboost-python-dev \
         libgeos-dev \
         libxrender-dev \
         libgl1-mesa-dev \
@@ -103,12 +92,24 @@ RUN apt-get purge -y --auto-remove \
         libtiff-dev \
         libatlas-base-dev \
         gfortran \
+        libmagic-dev \
         pkg-config \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/* /var/log/dpkg.log
+    ; \
+    apt-get -qq -y autoremove; \
+    apt-get autoclean && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* \
+        /var/cache/apt/* \
+        /var/log/dpkg.log \
+        /var/log/apt/* \
+        /root/.cache/pip \
+        /tmp/* \
+        /var/tmp/*
 
 COPY agent /workdir/agent
 
 WORKDIR /workdir/agent
 
 ENTRYPOINT ["python", "-u", "/workdir/agent/main.py"]
+
+
diff --git a/agent/worker/agent.py b/agent/worker/agent.py
index dc6598f..6c0beb8 100644
--- a/agent/worker/agent.py
+++ b/agent/worker/agent.py
@@ -28,8 +28,6 @@
 
 warnings.filterwarnings(action="ignore", category=UserWarning)
 
-import torch  # pylint: disable=import-error
-
 from worker import constants
 from worker import agent_utils
 from worker import docker_utils
diff --git a/agent/worker/system_info.py b/agent/worker/system_info.py
index 4f1e627..d2e7fb3 100644
--- a/agent/worker/system_info.py
+++ b/agent/worker/system_info.py
@@ -13,8 +13,6 @@
 
 warnings.filterwarnings(action="ignore", category=UserWarning)
 
-import torch  # pylint: disable=import-error
-
 import supervisely_lib as sly
 
 from worker import constants
@@ -228,78 +226,80 @@ def get_self_docker_image_digest():
     return sly.catch_silently(_get_self_docker_image_digest)
 
 
-def get_gpu_info_with_torch(logger):
-    torch.cuda.init()
-    gpu_info = None
-    try:
-        gpu_info = {}
-        gpu_info["is_available"] = torch.cuda.is_available()
-        if gpu_info["is_available"]:
-            gpu_info["device_count"] = torch.cuda.device_count()
-            gpu_info["device_names"] = []
-            gpu_info["device_memory"] = []
-            for idx in range(gpu_info["device_count"]):
-                gpu_info["device_names"].append(torch.cuda.get_device_name(idx))
-                mem = {}
-                try:
-                    device_props = torch.cuda.get_device_properties(idx)
-                    t = device_props.total_memory
-                    r = torch.cuda.memory_reserved(idx)
-                    a = torch.cuda.memory_allocated(idx)
-                    mem = {
-                        "total": t,
-                        "reserved": r,
-                        "allocated": a,
-                        "free": t - r,
-                    }
-                except Exception as e:
-                    logger.debug(repr(e))
-                finally:
-                    gpu_info["device_memory"].append(mem)
-
-    except Exception as e:
-        logger.warning(repr(e))
-    return gpu_info
+def get_gpu_info(logger):
+    """
+    Collect GPU information using NVML.
+    """
+
+    gpu_info = {
+        "is_available": False,
+        "device_count": 0,
+        "device_names": [],
+        "device_memory": [],
+        "device_capability": [],
+    }
+    try:
+        smi.nvmlInit()
+    except Exception as e:  # pylint: disable=broad-except
+        logger.warning("Failed to initialize NVML: %s", repr(e))
+        return gpu_info
 
 
-def get_gpu_info(logger):
-    gpu_info = None
     try:
-        gpu_info = {}
-        gpu_info["is_available"] = torch.cuda.is_available()
-        if gpu_info["is_available"]:
-            smi.nvmlInit()
-            gpu_info["device_count"] = smi.nvmlDeviceGetCount()
-            gpu_info["device_names"] = []
-            gpu_info["device_memory"] = []
-            gpu_info["device_capability"] = []
-            for idx in range(gpu_info["device_count"]):
-                handle = smi.nvmlDeviceGetHandleByIndex(idx)
-                capability = smi.nvmlDeviceGetCudaComputeCapability(handle)
-                capability = "{major}.{minor}".format(major=capability[0], minor=capability[1])
-                gpu_info["device_names"].append(smi.nvmlDeviceGetName(handle))
-                gpu_info["device_capability"].append(
-                    {
-                        "device": f"GPU {idx}",
-                        "compute_capability": capability,
-                    }
-                )
-                mem = {}
-                try:
-                    device_props = smi.nvmlDeviceGetMemoryInfo(handle)
-                    mem = {
-                        "total": device_props.total,
-                        "reserved": device_props.used,
-                        "available": device_props.free,
-                    }
-                except Exception as e:
-                    logger.debug(repr(e))
-                finally:
-                    gpu_info["device_memory"].append(mem)
+        try:
+            device_count = smi.nvmlDeviceGetCount()
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning("Failed to get GPU count via NVML: %s", repr(e))
+            return gpu_info
+
+        gpu_info["device_count"] = device_count
+        gpu_info["is_available"] = device_count > 0
+
+        if not gpu_info["is_available"]:
+            return gpu_info
+
+        for idx in range(device_count):
+            handle = smi.nvmlDeviceGetHandleByIndex(idx)
+            capability = smi.nvmlDeviceGetCudaComputeCapability(handle)
+            capability_str = "{major}.{minor}".format(major=capability[0], minor=capability[1])
+            gpu_info["device_names"].append(smi.nvmlDeviceGetName(handle))
+            gpu_info["device_capability"].append(
+                {
+                    "device": f"GPU {idx}",
+                    "compute_capability": capability_str,
+                }
+            )
+
+            mem = {}
+            try:
+                device_props = smi.nvmlDeviceGetMemoryInfo(handle)
+                mem = {
+                    "total": device_props.total,
+                    "reserved": device_props.used,
+                    "available": device_props.free,
+                }
+            except Exception as e:  # pylint: disable=broad-except
+                logger.debug("Failed to collect GPU memory info: %s", repr(e))
+            finally:
+                gpu_info["device_memory"].append(mem)
+
+        try:
             gpu_info["driver_version"] = smi.nvmlSystemGetDriverVersion()
+        except Exception as e:  # pylint: disable=broad-except
+            logger.debug("Failed to get NVIDIA driver version: %s", repr(e))
+
+        try:
             gpu_info["cuda_version"] = smi.nvmlSystemGetCudaDriverVersion()
+        except Exception as e:  # pylint: disable=broad-except
+            logger.debug("Failed to get CUDA driver version: %s", repr(e))
+
+    except Exception as e:  # pylint: disable=broad-except
+        logger.warning("Failed to collect GPU info via NVML: %s", repr(e))
+    finally:
+        try:
             smi.nvmlShutdown()
+        except Exception:  # pylint: disable=broad-except
+            # Ignore shutdown errors
+            pass
 
-    except Exception as e:
-        logger.warning(repr(e))
     return gpu_info
diff --git a/build_dev.sh b/build_dev.sh
new file mode 100755
index 0000000..f0043b8
--- /dev/null
+++ b/build_dev.sh
@@ -0,0 +1,4 @@
+docker build -t supervisely/agent:dev \
+  --build-arg LABEL_VERSION=agent:6.999.0 \
+  . && \
+docker push supervisely/agent:dev
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 0d6a023..c7e0778 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,13 +1,9 @@
 docker==6.0.1
-psutil==5.9.0
 urllib3==1.26.15
 requests==2.28.1
 requests-toolbelt>=1.0.0
 hurry.filesize==0.9
 scandir==1.10.0
-# grpcio installed from system packages (python3-grpcio)
-# grpcio-tools removed due to protobuf version conflict with supervisely[agent]
-py3exiv2==0.9.3
 packaging==21.2
 version-parser==1.0.1
 python-slugify==6.1.2
@@ -15,9 +11,10 @@ nvidia-ml-py==12.535.77
 httpx>=0.26.0
 filelock==3.13.1
 
-# Installed inside Dockerfile
-# torch==2.9.1+cu128
-# torchvision==0.24.1+cu128
+# grpcio installed from system packages (python3-grpcio)
+# grpcio-tools removed due to protobuf version conflict with supervisely[agent]
+pyexiv2==2.15.5  # Replaces py3exiv2==0.9.3 for Python 3.12 (separately named distribution; NOTE(review): verify API usage)
+psutil==5.9.8  # Upgraded from psutil==5.9.0 to support Python 3.12
 
 supervisely==6.73.474
 supervisely[agent]==6.73.474