Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
98 commits
Select commit Hold shift + click to select a range
b5bdad3
initial commit
Jun 4, 2024
b67d712
cvxpy bug for resharding cost term in intra operator pass
pgimenes Jun 6, 2024
82b706e
update resharding cost model and ILP constraints
pgimenes Jun 11, 2024
e1cd29a
some refactoring
pgimenes Jun 11, 2024
b3ecaac
attach to runtime
pgimenes Jun 11, 2024
df4a02b
handle resharding between nodes, improve logging
pgimenes Jun 12, 2024
74c2712
make sharding decision dict based instead of positional
pgimenes Jun 12, 2024
357e8c3
run autosharding on huggingface bert
pgimenes Jun 13, 2024
bc8fef1
refactoring to fix circular imports
pgimenes Jun 13, 2024
dbc6591
need to fix batch dimension sharding and sharding along multiple mesh…
pgimenes Jun 13, 2024
04ebd34
autosharding works on patched bert without batch dimension sharding
pgimenes Jun 14, 2024
b1baaf3
insert inference timing and lower logging level
pgimenes Jun 14, 2024
6e14c21
pipeline for distributed inference + report parallelization pass
pgimenes Jun 19, 2024
ad299bc
layer norm and baddbmm ops
pgimenes Jun 25, 2024
4566af9
[REFACTOR] Export strategies for each node using torch distributed fo…
pgimenes Jun 25, 2024
4b62b88
[REFACTOR] enumerate sharding strategies for reshape nodes: view, exp…
pgimenes Jun 27, 2024
41a0eb8
[REFACTOR] unfinished: pointwise ops (add, gelu)
pgimenes Jun 27, 2024
f76a6d0
[REFACTOR] enumerate strategies for pointwise add
pgimenes Jun 27, 2024
b15bf87
[REFACTOR] unfinished: pointwise truediv
pgimenes Jun 27, 2024
de857ae
[REFACTOR] fix for truediv strategy enumeration
pgimenes Jun 28, 2024
a444aad
[REFACTOR] finished enumerating strategies for all BERT ops. To do: c…
pgimenes Jun 28, 2024
f4f4ce7
slight refactoring
pgimenes Jun 28, 2024
68146a4
add tensor meta for placeholder and transpose ops
pgimenes Jul 2, 2024
c9afa87
include tensormeta for all ops
pgimenes Jul 2, 2024
d9ecb19
include resharding cost, ILP now too complex
pgimenes Jul 2, 2024
4f37d01
ILP is solvable after replacing inf values in resharding matrix
pgimenes Jul 2, 2024
b0b8631
refactoring
pgimenes Jul 2, 2024
1d3fe4c
skip fully replicated strategies for placeholder ops
pgimenes Jul 2, 2024
4a46f47
start docs
pgimenes Jul 2, 2024
3c4b4a5
unnecessary imports
pgimenes Jul 3, 2024
80f50fb
export solution and optimizer profiling
pgimenes Jul 3, 2024
3e1fd15
vectorize constraint for linearized resharding cost variable and enab…
pgimenes Jul 9, 2024
a50e3cc
mark sharding and run checks for linearised variable constraints
pgimenes Jul 9, 2024
4e384d2
[ATTACH]: distribute get_attr nodes, bug in forward pass
pgimenes Jul 9, 2024
f36a790
fix
pgimenes Jul 9, 2024
65abdb9
enabling import/export autosharding solutions
pgimenes Jul 10, 2024
8a9b043
common metadata for OPT at call_function granularity
pgimenes Jul 10, 2024
276c9c4
handle embedding op in autosharding
pgimenes Jul 11, 2024
6fc504f
tensormeta for embedding op
pgimenes Jul 11, 2024
c111fa1
support autosharding for OPT
pgimenes Jul 15, 2024
17374f6
support activation modules, remove legacy stuff, some refactoring
pgimenes Jul 16, 2024
8e738b7
extrapolate sharding from single layer solution
pgimenes Jul 17, 2024
1eed07e
layout for extended docs
pgimenes Jul 17, 2024
6f740a2
make dist barrier asynchronous for distributed timing, and account fo…
pgimenes Jul 18, 2024
663049e
Merge branch 'merging-fixes' into research/alpa-light
pgimenes Jul 18, 2024
014c6db
some docs
pgimenes Jul 18, 2024
9379b32
Merge branch 'research/alpa-light' of https://github.com/DeepWok/mase…
pgimenes Jul 18, 2024
5cfeb5d
fixes to extrapolate single layer solution, improved reporting for ex…
pgimenes Jul 19, 2024
c5dd6b8
Merge branch 'research/alpa-light' of https://github.com/DeepWok/mase…
pgimenes Jul 19, 2024
2e3440d
get solution extrapolation working for GPT2
pgimenes Jul 23, 2024
166fbcf
migrate DTensor API to chop/distributed/tensor
pgimenes Jul 23, 2024
bf626bf
patch for redistribute
pgimenes Jul 23, 2024
3fa6744
remove logging
pgimenes Jul 23, 2024
aeee73e
Merge branch 'main' into research/alpa-light
pgimenes Jul 23, 2024
519d09c
fix circular import and remove "setting verbosity to debug" message a…
pgimenes Jul 23, 2024
e8849a8
remove breakpoints
pgimenes Jul 23, 2024
550da9c
formatting
pgimenes Jul 23, 2024
64acb52
fix circular import
pgimenes Jul 23, 2024
5c8f36a
remove unfinished emit verilog tests for llama/mistral
pgimenes Jul 23, 2024
5eee310
remove deprecated stuff
pgimenes Jul 24, 2024
264683d
reduce ILP complexity by skipping placeholder/get_attr candidate shar…
pgimenes Jul 24, 2024
b594c76
refactor add_common_metadata such that args/kwargs ordering is preserved
pgimenes Jul 24, 2024
419dc5f
[UNFINISHED] profile ops with local tensor shapes to formulate comput…
pgimenes Jul 24, 2024
bc5ff83
[UNFINISHED]: simplify DTensor OpDispatcher
pgimenes Jul 24, 2024
9d95ace
find and replace call_method nodes with call_functional for arg order…
pgimenes Jul 26, 2024
25f4c30
insert resharding nodes
pgimenes Jul 26, 2024
996d2fa
simplify mase launcher and get resharding nodes working with non full…
pgimenes Jul 29, 2024
47eb41b
simplify op dispatcher
pgimenes Jul 30, 2024
c1e3a7c
add torch.mm as an op
pgimenes Jul 30, 2024
ee517df
get refactored op dispatcher working on single layer gpt2
pgimenes Jul 30, 2024
0747681
remove logging which was slowing down runtime and remove redistribute…
pgimenes Jul 31, 2024
45f9c71
DTensor: remove duplicated op call for out tensor meta propagation
pgimenes Aug 5, 2024
7ccd4eb
finish OpDispatcher refactoring + fix bug in pointwise_strategy + ins…
pgimenes Aug 8, 2024
9fcb595
include fully replicated backend for autosharding
pgimenes Aug 8, 2024
574689a
include DTensorCache to bypass DTensor construction + remove high ove…
pgimenes Aug 9, 2024
ea6bbe5
remove deprecated files in src/chop/distributed/tensor
pgimenes Aug 15, 2024
13efcf2
include compute cost in ILP
pgimenes Aug 15, 2024
c3a43a6
support sdpa strategy
pgimenes Aug 15, 2024
98ed095
directory refactoring
pgimenes Aug 15, 2024
b34e692
Merge branch 'main' into research/alpa-light
pgimenes Aug 15, 2024
935e876
remove breakpoint
pgimenes Aug 15, 2024
082e6d0
remove deprecated pass
pgimenes Aug 15, 2024
6cc1256
remove deprecated tests
pgimenes Aug 15, 2024
0de6731
set benchmarking device for compute cost estimation in intra operator…
pgimenes Aug 16, 2024
f323066
remove MLIR CI
pgimenes Aug 16, 2024
7ab2d94
update torch.distributed.tensor imports since _tensor has been added …
pgimenes Aug 19, 2024
d3f43f9
remove breakpoint
pgimenes Aug 19, 2024
97aa774
revert changes to support SDPA which had been incorrectly merged
pgimenes Aug 19, 2024
b5759a8
fix torch.distributed.tensor imports
pgimenes Aug 29, 2024
6e43341
remove deprecated files
pgimenes Aug 29, 2024
611c835
module level autosharding for vllm
pgimenes Aug 29, 2024
d31ce94
module level autosharding: update cost modelling for allgather/allred…
pgimenes Aug 30, 2024
fcabe9a
fix cost modelling time units
pgimenes Aug 30, 2024
dc6e03e
allgather/allreduce: replace cost db with regression model
pgimenes Sep 2, 2024
459d790
move all op benchmarking to use real vLLM classes
pgimenes Sep 4, 2024
b24a519
remove pynvml
pgimenes Sep 5, 2024
a7c3bf7
include layer norm and residual in cost modelling
pgimenes Sep 5, 2024
4ccbbbc
include residual resharding cost
pgimenes Sep 5, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 0 additions & 40 deletions .github/workflows/testTorchMLIR.yml

This file was deleted.

3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ def get_system():
"sphinx-glpi-theme",
"prettytable",
"pyyaml",
"pynvml",
"bitstring>=4.2",
"myst_parser",
"cvxpy",
Expand All @@ -98,7 +97,7 @@ def get_system():
author="Aaron Zhao, Jianyi Cheng, Cheng Zhang, Pedro Gimenes",
author_email="a.zhao@imperial.ac.uk, jianyi.cheng17@imperial.ac.uk, chengzhang98@outlook.com, pedro.gimenes19@imperial.ac.uk",
license_files=("LICENSE",),
python_requires=">=3.11.9",
python_requires=">=3.11.4",
package_dir={
"": "src",
},
Expand Down
1 change: 0 additions & 1 deletion src/chop/distributed/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
from .launcher import MaseLauncher
183 changes: 26 additions & 157 deletions src/chop/distributed/launcher.py
Original file line number Diff line number Diff line change
@@ -1,186 +1,55 @@
import os
from functools import partial
from time import time

import torch
import torch.nn as nn
import torch.distributed as dist
import torch.multiprocessing as mp

from torch.distributed._tensor import (
DeviceMesh,
Replicate,
Shard,
)

from chop.distributed.tensor import distribute_module, distribute_tensor

from chop.distributed.utils import rlog
from chop.distributed.utils import _get_mesh_from_world_size
from ..tools import get_logger

logger = get_logger(__name__)
logger.setLevel("DEBUG")


def distributed_timing(fn, *args, **kwargs):
    """Execute ``fn(*args, **kwargs)`` once and measure its wall-clock time.

    Args:
        fn (callable): function to time.
        *args: positional arguments forwarded to ``fn``.
        **kwargs: keyword arguments forwarded to ``fn``.

    Returns:
        tuple: ``(result, seconds)`` — the return value of ``fn`` and the
        elapsed wall-clock time in seconds.
    """
    # NOTE(review): with async_op=True the barrier returns immediately and the
    # returned work handle is not waited on, so ranks are not actually
    # synchronized at the timestamps — presumably intentional, confirm.
    dist.barrier(async_op=True)
    t0 = time()
    output = fn(*args, **kwargs)
    dist.barrier(async_op=True)
    elapsed = time() - t0
    return output, elapsed


def distributed_average_timing(fn, repeat, args):
    """Run ``fn(*args)`` ``repeat`` times and return the average wall-clock time.

    The first two iterations are treated as warmup and excluded from the
    average when more than two samples are available.

    Args:
        fn (callable): function to benchmark.
        repeat (int): number of timed iterations; must be >= 1.
        args (sequence): positional arguments forwarded to ``fn``.

    Returns:
        tuple: ``(result, avg_seconds)`` — the return value of the final call
        and the average elapsed time in seconds over the non-warmup runs.
    """
    times = []
    result = None
    for itr in range(repeat):
        rlog(
            logger,
            dist.get_rank(),
            # Fixed typo: was "Running teration"
            f"Running iteration {itr}",
            "debug",
        )
        # NOTE(review): async_op=True means the barrier does not block, so the
        # timestamps are not synchronized across ranks — confirm intended.
        dist.barrier(async_op=True)
        start = time()
        result = fn(*args)
        dist.barrier(async_op=True)
        end = time()
        times.append(end - start)
        rlog(
            logger,
            dist.get_rank(),
            f"Time taken: {end - start}s",
            "debug",
        )

    # Drop the two warmup iterations only when enough samples exist; the
    # previous unconditional times[2:] raised ZeroDivisionError for repeat <= 2.
    samples = times[2:] if len(times) > 2 else times
    return result, sum(samples) / len(samples)


def dist_model_fn(
    name: str,
    module: nn.Module,
    device_mesh: DeviceMesh,
    rank: int,
    tensor_sharding_map={},
) -> None:
    """
    This function gets called by torch.distributed._tensor.distribute_module on each module in the model.
    Each tensor in each module is distributed according to the sharding configuration in tensor_sharding_map.

    Args:
        name: module name supplied by distribute_module (unused here).
        module: the submodule currently being visited.
        device_mesh: mesh describing the participating devices.
        rank: this process's rank, used only for rank-aware logging.
        tensor_sharding_map: maps a module to its node name and a per-parameter
            sharding config exposing a ``placements`` attribute.
            NOTE(review): mutable default argument — safe only because the
            function never mutates it; confirm.
    """
    # Modules absent from the map are left fully replicated (no-op).
    if module in tensor_sharding_map:
        node_name = tensor_sharding_map[module]["node"]
        for parameter, sharding_config in tensor_sharding_map[module][
            "sharding"
        ].items():
            # Activation entries describe dataflow, not module parameters.
            if parameter in ["data_in_0", "output", "data_out_0"]:
                continue
            if not hasattr(module, parameter):
                rlog(
                    logger,
                    rank,
                    f"Module {module} does not have parameter {parameter}",
                    level="warning",
                )
                continue

            placement = sharding_config.placements

            # Errors are logged but not re-raised, so a failed parameter
            # leaves that tensor replicated instead of aborting distribution.
            try:
                rlog(
                    logger,
                    rank,
                    f"Distributing parameter {parameter} of module {node_name} to {placement}",
                    level="debug",
                )
                distributed_tensor = distribute_tensor(
                    getattr(module, parameter), device_mesh, placement
                )
                # Re-wrap as Parameter so the module still registers it.
                setattr(module, parameter, torch.nn.Parameter(distributed_tensor))
            except Exception as e:
                rlog(
                    logger,
                    rank,
                    f"Error distributing parameter {parameter} of module {node_name} to {placement}: {e}",
                    level="error",
                )


def device_fn(
    rank, world_size, model=None, device_mesh=None, tensor_sharding_map=None, inputs=None
):
    """
    This function gets called on each GPU device to set up the distributed environment and distribute the model,
    following the SPMD model.

    Args:
        rank (int): rank of this process, also used as the CUDA device index.
        world_size (int): total number of processes in the group.
        model (nn.Module, optional): model to distribute. Defaults to None.
        device_mesh (list, optional): 2D mesh layout of device ids. Defaults to None.
        tensor_sharding_map (dict, optional): per-module sharding configs,
            forwarded to dist_model_fn. Defaults to an empty dict.
        inputs (list, optional): input tensors for the timed forward pass.
            Defaults to an empty list.
    """
    # Avoid mutable default arguments (previously ={} and =[]).
    tensor_sharding_map = {} if tensor_sharding_map is None else tensor_sharding_map
    inputs = [] if inputs is None else inputs

    # NOTE(review): address/port are hard-coded, so only one launch per host
    # can run at a time — confirm acceptable.
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    os.environ["RANK"] = str(rank)

    # Initialize
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    device = torch.device("cuda", rank)
    torch.cuda.set_device(device)

    # Distribute model parameters according to sharding configuration
    mesh = DeviceMesh("cuda", mesh=device_mesh)
    rlog(logger, rank, f"Distributing module parameters...", level="info")
    model, dist_time = distributed_timing(
        distribute_module,
        model,
        mesh,
        partial(dist_model_fn, rank=rank, tensor_sharding_map=tensor_sharding_map),
        input_fn=None,
        output_fn=None,
    )
    rlog(logger, rank, f"Module distribution done. Time taken: {dist_time} seconds.")

    # Run forward pass with fully replicated inputs; the model's resharding
    # nodes are expected to move activations to the right placements.
    rlog(logger, rank, f"Starting forward pass.", level="info")
    inputs = [
        distribute_tensor(in_tensor, mesh, [Replicate(), Replicate()])
        for in_tensor in inputs
    ]
    _, time_taken = distributed_average_timing(
        fn=model,
        repeat=10,
        args=inputs,
    )
    rlog(logger, rank, f"Forward pass finished. Time taken: {time_taken}", level="info")

    dist.destroy_process_group()


class MaseLauncher:
"""
MaseLauncher launches an optimized model on multiple GPUs using torch.distributed.
"""

def __init__(self, mase_graph, world_size=None, device_mesh=None):
def __init__(
self,
mg=None,
world_size=None,
device_mesh=None,
device_fn=None,
):
"""Initialize the MaseLauncher.

Args:
mase_graph (MaseGraph): The MaseGraph object containing the model.
world_size (int, optional): Number of GPUs to use. Defaults to None.
device_mesh (list, optional): List of GPUs to use. Defaults to None.
"""
self.mg = mase_graph
self.model = mase_graph.model
self.mg = mg
self.world_size = world_size
self.device_mesh = device_mesh
self.device_fn = device_fn

if device_mesh is None:
self.device_mesh, _ = _get_mesh_from_world_size(world_size)

def run(self, tensor_sharding_map={}, inputs=[]):
def run(
self,
model_class=None,
model_config=None,
cli_args=None,
):
logger.info(f"Launching model with world size {self.world_size}.")

mp.spawn(
partial(
device_fn,
model=self.model,
device_mesh=self.device_mesh,
tensor_sharding_map=tensor_sharding_map,
inputs=inputs,
self.device_fn,
args=(
self.world_size,
self.device_mesh,
model_class,
model_config,
cli_args,
),
args=(self.world_size,),
nprocs=self.world_size,
join=True,
)
21 changes: 19 additions & 2 deletions src/chop/distributed/tensor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,8 @@
)
from torch.distributed.device_mesh import _mesh_resources, DeviceMesh, init_device_mesh

import chop.distributed.tensor.ops
from chop.distributed.tensor._utils import compute_local_shape
from chop.distributed.tensor.api import distribute_module, distribute_tensor, DTensor
from chop.distributed.tensor.ops.utils import normalize_to_torch_size


# All public APIs from dtensor package
Expand All @@ -33,6 +31,25 @@
]


def normalize_to_torch_size(size) -> torch.Size:
    """
    Unify variable types of size argument to torch.Size
    Acceptable types include:
    int, Sequence[int], Tuple[int], Tuple[Sequence[int]],
    or torch.Size
    """
    # Already normalized — return as-is.
    if isinstance(size, torch.Size):
        return size

    # A bare int becomes a 1-element size.
    if isinstance(size, int):
        return torch.Size([size])

    # A single nested sequence (e.g. ([2, 3],)) is unwrapped one level.
    if len(size) == 1 and isinstance(size[0], Sequence):
        return torch.Size(size[0])

    # Otherwise treat the argument itself as the sequence of dimensions.
    return torch.Size(size)


def _dtensor_init_helper(
init_op,
size: torch.Size,
Expand Down
Loading