|
1 | 1 | # Copyright (c) ModelScope Contributors. All rights reserved. |
2 | 2 | import os |
3 | 3 | import platform |
| 4 | +import hashlib |
| 5 | +import re |
4 | 6 | import shutil |
| 7 | +import socket |
5 | 8 | import subprocess |
6 | 9 | from abc import ABC |
7 | 10 | from dataclasses import dataclass, field |
@@ -641,5 +644,72 @@ def is_last_rank(): |
641 | 644 | return True |
642 | 645 | return dist.get_rank() == dist.get_world_size() - 1 |
643 | 646 |
|
| 647 | + |
def _resolve_ascend_physical_device_id(device_id: int) -> int:
    """Map a local NPU device index to a physical device id.

    The mapping is read from the ``ASCEND_RT_VISIBLE_DEVICES`` environment
    variable, treated as a comma-separated list: the ``device_id``-th
    non-empty entry is taken as the physical id.

    Args:
        device_id: Local (process-visible) device index.

    Returns:
        The integer listed at position ``device_id``. ``device_id`` itself is
        returned unchanged when the variable is unset or blank, when the index
        falls outside the list, or when the selected entry is not an integer.
    """
    visible = os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "").strip()
    if not visible:
        return device_id

    # Drop empty fields so stray commas ("0,,1") do not shift the indices.
    parts = [p.strip() for p in visible.split(",") if p.strip()]
    if not 0 <= device_id < len(parts):
        return device_id

    try:
        return int(parts[device_id])
    except ValueError:
        # A malformed entry cannot be mapped; behave like every other
        # "no mapping available" case instead of raising to the caller.
        return device_id
| 657 | + |
| 658 | + |
def _get_npu_bus_id_from_npu_smi(device_id: int) -> Optional[str]:
    """Look up an NPU's Bus-Id by parsing the output of ``npu-smi info``.

    Args:
        device_id: Local device index. It is first translated through
            ``_resolve_ascend_physical_device_id``; if that call raises, the
            index is used as-is.

    Returns:
        The lower-cased Bus-Id (e.g. ``"0000:9d:00.0"``) of the first table
        row whose second integer column equals the resolved id, or ``None``
        when ``npu-smi`` cannot be run (missing, failing, or slower than
        5 seconds) or no row matches.
    """
    try:
        target_id = _resolve_ascend_physical_device_id(device_id)
    except Exception:
        target_id = device_id

    # Best-effort probe: any failure to run the tool simply means "no Bus-Id",
    # leaving the caller free to use another identifier.
    try:
        smi_text = subprocess.check_output(
            ["npu-smi", "info"],
            text=True,
            stderr=subprocess.STDOUT,
            timeout=5,
        )
    except Exception:
        return None

    # The Bus-Id is read from the tool's table so that separate processes
    # derive the same identifier for the same device.
    # Typical line:
    # | 0 0 | 0000:9D:00.0 | ...
    # Group 1 is the second integer of the first cell (presumably the physical
    # id - confirm against the npu-smi version in use); group 2 is the Bus-Id.
    row_re = re.compile(
        r"^\|\s*\d+\s+(\d+)\s*\|\s*"
        r"([0-9A-Fa-f]{4}:[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}\.[0-9A-Fa-f])\s*\|",
        re.MULTILINE,
    )
    return next(
        (
            row.group(2).lower()
            for row in row_re.finditer(smi_text)
            if int(row.group(1)) == target_id
        ),
        None,
    )
| 690 | + |
| 691 | + |
def get_vllm_device_uuid(device_id: int = 0) -> str:
    """Return an identifier for ``device_id`` that is reproducible across processes.

    Resolution order:

    1. ``current_platform.get_device_uuid(device_id)`` from vLLM.
    2. If that raises ``NotImplementedError``, the Bus-Id reported by
       ``_get_npu_bus_id_from_npu_smi``.
    3. If no Bus-Id is available, the first 16 hex characters of the SHA-1 of
       ``"<hostname>:<visible devices>:<device_id>"``, where the visible
       devices string is ``ASCEND_RT_VISIBLE_DEVICES`` or, when that is unset
       or empty, ``CUDA_VISIBLE_DEVICES`` (default ``""``).

    Args:
        device_id: Local device index. Defaults to ``0``.

    Returns:
        The device identifier string.
    """
    from vllm.platforms import current_platform

    try:
        return current_platform.get_device_uuid(device_id)
    except NotImplementedError:
        # The platform does not provide a uuid; use the fallbacks below so
        # that sender and receiver still derive the same IPC endpoint name.
        pass

    # Preferred fallback: a Bus-Id is stable for a given physical device.
    npu_bus_id = _get_npu_bus_id_from_npu_smi(device_id)
    if npu_bus_id:
        return npu_bus_id

    # Last resort: a deterministic hash, so the lookup never fails hard and
    # processes started with the same environment still agree on the value.
    visible_devices = os.environ.get("ASCEND_RT_VISIBLE_DEVICES")
    if not visible_devices:
        visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
    host = socket.gethostname()
    seed = f"{host}:{visible_devices}:{device_id}"
    digest = hashlib.sha1(seed.encode("utf-8")).hexdigest()
    return digest[:16]
| 712 | + |
| 713 | + |
def is_master():
    """Return the result of ``Platform.is_master()``.

    Module-level convenience wrapper; the decision itself is made entirely by
    ``Platform.is_master``.
    """
    master_flag = Platform.is_master()
    return master_flag
0 commit comments