I am trying to get my two machines to run a simple DNN with CrypTen. However, I always fail to establish communication at crypten.init(), both across two different machines and between two sessions on a single machine using 127.0.0.1.
I'm using conda:
crypten 0.4.1
python 3.12.7
pytorch 2.5.1
I then try to check my settings using a simple testing script. Here is the code:
import os
import argparse
import crypten
import crypten.communicator as comm
from crypten.config import cfg
import torch.distributed as dist
def run(rank, world_size, master_addr, master_port):
    """Initialize CrypTen for one party and print the communicator state.

    Exports the rendezvous settings as environment variables, calls
    ``crypten.init()``, and then prints the rank and world size reported
    by the CrypTen communicator.

    Args:
        rank: Rank of the current process.
        world_size: Total number of processes.
        master_addr: Address exported as ``MASTER_ADDR``.
        master_port: Port exported as ``MASTER_PORT``; an int or a str.
    """
    # os.environ only accepts str values, so the numeric settings
    # (including the port, which callers may pass as an int) are coerced.
    os.environ["RANK"] = str(rank)
    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["MASTER_ADDR"] = master_addr
    os.environ["MASTER_PORT"] = str(master_port)
    print(f"Process {rank}: Initializing CrypTen with MASTER_ADDR={master_addr}, MASTER_PORT={master_port}")

    # dist.init_process_group("gloo", init_method='env://')
    # NOTE(review): presumably this call blocks until all WORLD_SIZE
    # parties have joined the rendezvous -- confirm against CrypTen docs.
    crypten.init()
    print(f"Process {rank}: CrypTen initialized.")

    # Report what the communicator believes about this party.
    print(f"Process {rank}: Communicator rank={comm.get().get_rank()}, world_size={comm.get().get_world_size()}")
if __name__ == "__main__":
    # Command-line entry point: one invocation per party.
    cli = argparse.ArgumentParser(
        description="CrypTen Initialization Debug Test",
    )
    cli.add_argument(
        "--rank",
        type=int,
        required=True,
        help="Rank of the current process",
    )
    cli.add_argument(
        "--world_size",
        type=int,
        default=2,
        help="Total number of processes",
    )
    cli.add_argument(
        "--master_addr",
        type=str,
        default="127.0.0.1",
        help="Master address",
    )
    cli.add_argument(
        "--master_port",
        type=str,
        default="29500",
        help="Master port",
    )
    # Accepted but not used by this script.
    cli.add_argument("--local_rank", type=int)

    opts = cli.parse_args()
    run(opts.rank, opts.world_size, opts.master_addr, opts.master_port)
After running python script.py --rank=0 --world_size=2 --master_addr=127.0.0.1 --master_port=29500
and python script.py --rank=1 --world_size=2 --master_addr=127.0.0.1 --master_port=29500
in two separate sessions, both sessions hang at crypten.init().
I'm using a Mac, and I have already verified that torch.distributed is available.