From f528cee285bb7733c67c0f067767f30952abb68c Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 21 Oct 2024 06:36:09 -0700 Subject: [PATCH 001/107] initial flwr integration commit Signed-off-by: kta-intel --- .../flower-app-pytorch/app-pytorch/.gitignore | 160 ++++++++++++++++++ .../flower-app-pytorch/app-pytorch/README.md | 20 +++ .../app-pytorch/app_pytorch/__init__.py | 1 + .../app-pytorch/app_pytorch/client_app.py | 55 ++++++ .../app-pytorch/app_pytorch/server_app.py | 31 ++++ .../app-pytorch/app_pytorch/task.py | 111 ++++++++++++ .../app-pytorch/pyproject.toml | 36 ++++ .../flower-app-pytorch/deserialize_message.py | 41 +++++ .../flower-app-pytorch/message_conversion.py | 88 ++++++++++ .../flower-app-pytorch/message_logger.py | 109 ++++++++++++ .../openfl_client_with_local_grpc_server.py | 53 ++++++ .../openfl_server_with_local_grpc_client.py | 63 +++++++ openfl/protocols/aggregator.proto | 7 + 13 files changed, 775 insertions(+) create mode 100644 openfl-workspace/flower-app-pytorch/app-pytorch/.gitignore create mode 100644 openfl-workspace/flower-app-pytorch/app-pytorch/README.md create mode 100644 openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/__init__.py create mode 100644 openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/client_app.py create mode 100644 openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/server_app.py create mode 100644 openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/task.py create mode 100644 openfl-workspace/flower-app-pytorch/app-pytorch/pyproject.toml create mode 100644 openfl-workspace/flower-app-pytorch/deserialize_message.py create mode 100644 openfl-workspace/flower-app-pytorch/message_conversion.py create mode 100644 openfl-workspace/flower-app-pytorch/message_logger.py create mode 100644 openfl-workspace/flower-app-pytorch/openfl_client_with_local_grpc_server.py create mode 100644 openfl-workspace/flower-app-pytorch/openfl_server_with_local_grpc_client.py diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/.gitignore b/openfl-workspace/flower-app-pytorch/app-pytorch/.gitignore new file mode 100644 index 0000000000..68bc17f9ff --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/app-pytorch/.gitignore @@ -0,0 +1,160 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/README.md b/openfl-workspace/flower-app-pytorch/app-pytorch/README.md new file mode 100644 index 0000000000..9564565f97 --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/app-pytorch/README.md @@ -0,0 +1,20 @@ +# app-pytorch: A Flower / PyTorch app + +## Install dependencies and project + +```bash +pip install -e . +``` + +## Run with the Simulation Engine + +In the `app-pytorch` directory, use `flwr run` to run a local simulation: + +```bash +flwr run . +``` + +## Run with the Deployment Engine + +> \[!NOTE\] +> An update to this example will show how to run this Flower application with the Deployment Engine and TLS certificates, or with Docker. 
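+
+## Run with OpenFL (framework interoperability sketch)
+
+A minimal manual sketch of the OpenFL bridging targeted by the helper scripts in the parent directory. The ports and flags below are assumptions taken from the defaults hard-coded in those scripts and in this patch, and may change as the integration evolves:
+
+```bash
+# Terminal 1: Flower superlink with the fleet API in GrpcAdapter mode
+flower-superlink --insecure --fleet-api-type grpc-adapter \
+    --fleet-api-address 127.0.0.1:9093 --driver-api-address 127.0.0.1:9091
+
+# Terminal 2: Flower server app against the driver API
+flower-server-app ./app-pytorch --insecure --superlink 127.0.0.1:9091
+
+# Terminal 3: OpenFL server bridging to the superlink (listens on 9095)
+python ../openfl_server_with_local_grpc_client.py
+
+# Terminal 4: OpenFL client exposing a local GrpcAdapter endpoint on 9092
+python ../openfl_client_with_local_grpc_server.py
+
+# Terminal 5: Flower supernode pointed at the local adapter
+flower-supernode ./app-pytorch --insecure --grpc-adapter --superlink 127.0.0.1:9092
+```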
diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/__init__.py b/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/__init__.py new file mode 100644 index 0000000000..bb8f979717 --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/__init__.py @@ -0,0 +1 @@ +"""app-pytorch: A Flower / PyTorch app.""" diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/client_app.py b/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/client_app.py new file mode 100644 index 0000000000..d802d900c2 --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/client_app.py @@ -0,0 +1,55 @@ +"""app-pytorch: A Flower / PyTorch app.""" + +import torch + +from flwr.client import ClientApp, NumPyClient +from flwr.common import Context +from app_pytorch.task import Net, get_weights, load_data, set_weights, test, train + + +# Define Flower Client and client_fn +class FlowerClient(NumPyClient): + def __init__(self, net, trainloader, valloader, local_epochs): + self.net = net + self.trainloader = trainloader + self.valloader = valloader + self.local_epochs = local_epochs + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + self.net.to(self.device) + + def fit(self, parameters, config): + set_weights(self.net, parameters) + train_loss = train( + self.net, + self.trainloader, + self.local_epochs, + self.device, + ) + return ( + get_weights(self.net), + len(self.trainloader.dataset), + {"train_loss": train_loss}, + ) + + def evaluate(self, parameters, config): + set_weights(self.net, parameters) + loss, accuracy = test(self.net, self.valloader, self.device) + return loss, len(self.valloader.dataset), {"accuracy": accuracy} + + +def client_fn(context: Context): + # Load model and data + net = Net() + partition_id = context.node_config["partition-id"] + num_partitions = context.node_config["num-partitions"] + trainloader, valloader = load_data(partition_id, num_partitions) + local_epochs = 1 + + # Return Client instance + return FlowerClient(net, trainloader, valloader, local_epochs).to_client() + + +# Flower ClientApp +app = ClientApp( + client_fn, +) diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/server_app.py b/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/server_app.py new file mode 100644 index 0000000000..abc3ec6d78 --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/server_app.py @@ -0,0 +1,31 @@ +"""app-pytorch: A Flower / PyTorch app.""" + +from flwr.common import Context, ndarrays_to_parameters +from flwr.server import ServerApp, ServerAppComponents, ServerConfig +from flwr.server.strategy import FedAvg +from app_pytorch.task import Net, get_weights + + +def server_fn(context: Context): + # Read from config + num_rounds = 3 #context.run_config["num-server-rounds"] + fraction_fit = 0.5 #context.run_config["fraction-fit"] + + # Initialize model parameters + ndarrays = get_weights(Net()) + parameters = ndarrays_to_parameters(ndarrays) + + # Define strategy + strategy = FedAvg( + fraction_fit=fraction_fit, + fraction_evaluate=1.0, + min_available_clients=2, + initial_parameters=parameters, + ) + config = ServerConfig(num_rounds=num_rounds) + + return ServerAppComponents(strategy=strategy, config=config) + + +# Create ServerApp +app = ServerApp(server_fn=server_fn) diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/task.py b/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/task.py new 
file mode 100644 index 0000000000..4a42b7009e --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/task.py @@ -0,0 +1,111 @@ +"""app-pytorch: A Flower / PyTorch app.""" + +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F +from flwr_datasets import FederatedDataset +from flwr_datasets.partitioner import IidPartitioner +from torch.utils.data import DataLoader +from torchvision.transforms import Compose, Normalize, ToTensor + + +class Net(nn.Module): + """Model (simple CNN adapted from 'PyTorch: A 60 Minute Blitz')""" + + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + return self.fc3(x) + + +fds = None # Cache FederatedDataset + + +def load_data(partition_id: int, num_partitions: int): + """Load partition CIFAR10 data.""" + # Only initialize `FederatedDataset` once + global fds + if fds is None: + partitioner = IidPartitioner(num_partitions=num_partitions) + fds = FederatedDataset( + dataset="uoft-cs/cifar10", + partitioners={"train": partitioner}, + ) + partition = fds.load_partition(partition_id) + # Divide data on each node: 80% train, 20% test + partition_train_test = partition.train_test_split(test_size=0.2, seed=42) + pytorch_transforms = Compose( + [ToTensor(), Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] + ) + + def apply_transforms(batch): + """Apply transforms to the partition from FederatedDataset.""" + batch["img"] = [pytorch_transforms(img) for img in batch["img"]] + return batch + + partition_train_test = partition_train_test.with_transform(apply_transforms) + trainloader = DataLoader(partition_train_test["train"], batch_size=32, shuffle=True) + testloader = DataLoader(partition_train_test["test"], batch_size=32) + return trainloader, testloader + + +def train(net, trainloader, epochs, device): + """Train the model on the training set.""" + net.to(device) # move model to GPU if available + criterion = torch.nn.CrossEntropyLoss().to(device) + optimizer = torch.optim.Adam(net.parameters(), lr=0.01) + net.train() + running_loss = 0.0 + for _ in range(epochs): + for batch in trainloader: + images = batch["img"] + labels = batch["label"] + optimizer.zero_grad() + loss = criterion(net(images.to(device)), labels.to(device)) + loss.backward() + optimizer.step() + running_loss += loss.item() + + avg_trainloss = running_loss / len(trainloader) + return avg_trainloss + + +def test(net, testloader, device): + """Validate the model on the test set.""" + net.to(device) + criterion = torch.nn.CrossEntropyLoss() + correct, loss = 0, 0.0 + with torch.no_grad(): + for batch in testloader: + images = batch["img"].to(device) + labels = batch["label"].to(device) + outputs = net(images) + loss += criterion(outputs, labels).item() + correct += (torch.max(outputs.data, 1)[1] == labels).sum().item() + accuracy = correct / len(testloader.dataset) + loss = loss / len(testloader) + return loss, accuracy + + +def get_weights(net): + return [val.cpu().numpy() for _, val in net.state_dict().items()] + + +def set_weights(net, parameters): + params_dict = zip(net.state_dict().keys(), parameters) + state_dict = OrderedDict({k: 
torch.tensor(v) for k, v in params_dict}) + net.load_state_dict(state_dict, strict=True) diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/pyproject.toml b/openfl-workspace/flower-app-pytorch/app-pytorch/pyproject.toml new file mode 100644 index 0000000000..0ca8900b90 --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/app-pytorch/pyproject.toml @@ -0,0 +1,36 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "app-pytorch" +version = "1.0.0" +description = "" +license = "Apache-2.0" +dependencies = [ + "flwr-nightly==1.13.0.dev20241016", + "flwr-datasets[vision]>=0.3.0", + "torch==2.2.1", + "torchvision==0.17.1", +] + +[tool.hatch.build.targets.wheel] +packages = ["."] + +[tool.flwr.app] +publisher = "openfl-dev" + +[tool.flwr.app.components] +serverapp = "app_pytorch.server_app:app" +clientapp = "app_pytorch.client_app:app" + +[tool.flwr.app.config] +num-server-rounds = 3 +fraction-fit = 0.5 +local-epochs = 1 + +[tool.flwr.federations] +default = "local-simulation" + +[tool.flwr.federations.local-simulation] +options.num-supernodes = 2 diff --git a/openfl-workspace/flower-app-pytorch/deserialize_message.py b/openfl-workspace/flower-app-pytorch/deserialize_message.py new file mode 100644 index 0000000000..46f37e8acc --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/deserialize_message.py @@ -0,0 +1,41 @@ +import importlib +from google.protobuf.message import DecodeError + +def deserialize_flower_message(flower_message): + """ + Deserialize the grpc_message_content of a Flower message using the module and class name + specified in the metadata. + + Args: + flower_message: The Flower message containing the metadata and binary content. + + Returns: + The deserialized message object, or None if deserialization fails. + """ + # Access metadata directly + metadata = flower_message.metadata + module_name = metadata.get('grpc-message-module') + qualname = metadata.get('grpc-message-qualname') + + # Import the module + try: + module = importlib.import_module(module_name) + except ImportError as e: + print(f"Failed to import module: {module_name}. Error: {e}") + return None + + # Get the message class + try: + message_class = getattr(module, qualname) + except AttributeError as e: + print(f"Failed to get message class '{qualname}' from module '{module_name}'. Error: {e}") + return None + + # Deserialize the content + try: + message = message_class.FromString(flower_message.grpc_message_content) + except DecodeError as e: + print(f"Failed to deserialize message content. 
Error: {e}") + return None + + return message \ No newline at end of file diff --git a/openfl-workspace/flower-app-pytorch/message_conversion.py b/openfl-workspace/flower-app-pytorch/message_conversion.py new file mode 100644 index 0000000000..b91ac8c154 --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/message_conversion.py @@ -0,0 +1,88 @@ +from flwr.proto import grpcadapter_pb2 +from openfl.protocols import aggregator_pb2 +# from deserialize_message import deserialize_flower_message + +# def flower_to_openfl_message(flower_message, sender, receiver): +# """Convert a Flower MessageContainer to an OpenFL OpenFLMessage.""" +# if isinstance(flower_message, aggregator_pb2.DropPod()): +# # If the input is already an OpenFL message, return it as-is +# return flower_message +# else: +# """Convert a Flower MessageContainer to an OpenFL message.""" +# # Create the OpenFL message +# openfl_message = aggregator_pb2.DropPod() + +# # Set the MessageHeader fields based on the provided sender and receiver +# openfl_message.header.sender = sender +# openfl_message.header.receiver = receiver +# # openfl_message.message_type = flower_message.metadata['grpc-message-qualname'] +# # import pdb; pdb.set_trace() +# # serialized_flower_message = flower_message.SerializeToString() +# # openfl_message.message.npbytes = serialized_flower_message +# # openfl_message.message.size = len(serialized_flower_message) + +# # Set the DataStream message content to the serialized Flower message content +# openfl_message.message.npbytes = flower_message.grpc_message_content +# openfl_message.message.size = len(flower_message.grpc_message_content) + +# metadata = dict(flower_message.metadata) +# for key, value in metadata.items(): +# openfl_message.metadata[key] = value + +# return openfl_message + +# def openfl_to_flower_message(openfl_message): +# """Convert an OpenFL OpenFLMessage to a Flower MessageContainer.""" +# if isinstance(openfl_message, grpcadapter_pb2.MessageContainer): +# # If the input is already a Flower message, return it as-is +# return openfl_message +# else: +# # Deserialize the Flower message from the DataStream npbytes field +# flower_message = grpcadapter_pb2.MessageContainer() +# # import pdb; pdb.set_trace() +# # flower_message.ParseFromString(openfl_message.message.npbytes) +# # bytes_parsed = flower_message.grpc_message_content = openfl_message.message.npbytes + +# flower_message.grpc_message_content = openfl_message.message.npbytes + +# metadata = dict(openfl_message.metadata) +# for key, value in metadata.items(): +# flower_message.metadata[key] = value + +# # Will be depracated by Flower # +# flower_message.grpc_message_name = metadata.get('grpc-message-qualname') +# ################################ +# return flower_message + +def flower_to_openfl_message(flower_message, sender, receiver): + """Convert a Flower MessageContainer to an OpenFL OpenFLMessage.""" + if isinstance(flower_message, aggregator_pb2.DropPod()): + # If the input is already an OpenFL message, return it as-is + return flower_message + else: + """Convert a Flower MessageContainer to an OpenFL message.""" + # Create the OpenFL message + openfl_message = aggregator_pb2.DropPod() + + # Set the MessageHeader fields based on the provided sender and receiver + openfl_message.header.sender = sender + openfl_message.header.receiver = receiver + # openfl_message.message_type = flower_message.metadata['grpc-message-qualname'] + serialized_flower_message = flower_message.SerializeToString() + openfl_message.message.npbytes = 
serialized_flower_message + openfl_message.message.size = len(serialized_flower_message) + + return openfl_message + +def openfl_to_flower_message(openfl_message): + """Convert an OpenFL OpenFLMessage to a Flower MessageContainer.""" + if isinstance(openfl_message, grpcadapter_pb2.MessageContainer): + # If the input is already a Flower message, return it as-is + return openfl_message + else: + # Deserialize the Flower message from the DataStream npbytes field + flower_message = grpcadapter_pb2.MessageContainer() + flower_message.ParseFromString(openfl_message.message.npbytes) + return flower_message \ No newline at end of file diff --git a/openfl-workspace/flower-app-pytorch/message_logger.py b/openfl-workspace/flower-app-pytorch/message_logger.py new file mode 100644 index 0000000000..5f5b5ca152 --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/message_logger.py @@ -0,0 +1,109 @@ +# import logging +# from deserialize_message import deserialize_flower_message + +# # Configure logging +# logging.basicConfig(filename='flower_messages.log', level=logging.INFO, format='%(asctime)s - %(message)s') + +# def log_flower_message(flower_message, message_type): +# """ +# Log a Flower message or response with deserialized content. + +# Args: +# flower_message: The Flower message or response to be logged. +# message_type: A string indicating the type of message ('sent' or 'received'). +# """ +# # Deserialize the grpc_message_content +# deserialized_content = deserialize_flower_message(flower_message) + +# # Prepare the log entry +# message_str = f"Flower message {message_type}:\n{flower_message}" +# if deserialized_content is not None: +# message_str += f"\nDeserialized content:\n{deserialized_content}" +# else: +# message_str += "\nDeserialization failed" + +# # Add separator +# message_str += f"\n{'=' * 40}\n" + +# # Log the message with deserialized content and separator +# logging.info(message_str) + +# # This function can be used to log messages from other parts of your application +# def log_message(flower_message, message_type): +# """ +# Public function to log a Flower message or response. + +# Args: +# flower_message: The Flower message or response to be logged. +# message_type: A string indicating the type of message ('sent' or 'received'). +# """ +# log_flower_message(flower_message, message_type) + +import logging +import os +from deserialize_message import deserialize_flower_message + +def get_logger(client_id): + """ + Get a logger for a specific client ID. + + Args: + client_id: A unique identifier for the client. + + Returns: + A logging.Logger instance for the client. + """ + # Create a directory for client logs if it doesn't exist + log_dir = 'client_logs' + if not os.path.exists(log_dir): + os.makedirs(log_dir) + + # Configure logging for the client + log_filename = os.path.join(log_dir, f'flower_messages_{client_id}.log') + logger = logging.getLogger(f'client_{client_id}') + logger.setLevel(logging.INFO) + if not logger.handlers: + # Add a file handler if it doesn't already exist + file_handler = logging.FileHandler(log_filename) + formatter = logging.Formatter('%(asctime)s - %(message)s') + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + return logger + +def log_flower_message(flower_message, message_type, client_id): + """ + Log a Flower message or response with deserialized content for a specific client.
+ + Args: + flower_message: The Flower message or response to be logged. + message_type: A string indicating the type of message ('sent' or 'received'). + client_id: A unique identifier for the client. + """ + # Deserialize the grpc_message_content + deserialized_content = deserialize_flower_message(flower_message) + + # Prepare the log entry + message_str = f"Flower message {message_type}:\n{flower_message}" + if deserialized_content is not None: + message_str += f"\nDeserialized content:\n{deserialized_content}" + else: + message_str += "\nDeserialization failed" + + # Add separator + message_str += f"\n{'=' * 40}\n" + + # Get the logger for the client and log the message with deserialized content and separator + logger = get_logger(client_id) + logger.info(message_str) + +# This function can be used to log messages from other parts of your application +def log_message(flower_message, message_type, client_id): + """ + Public function to log a Flower message or response for a specific client. + + Args: + flower_message: The Flower message or response to be logged. + message_type: A string indicating the type of message ('sent' or 'received'). + client_id: A unique identifier for the client. + """ + log_flower_message(flower_message, message_type, client_id) \ No newline at end of file diff --git a/openfl-workspace/flower-app-pytorch/openfl_client_with_local_grpc_server.py b/openfl-workspace/flower-app-pytorch/openfl_client_with_local_grpc_server.py new file mode 100644 index 0000000000..bfb8931c04 --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/openfl_client_with_local_grpc_server.py @@ -0,0 +1,53 @@ +import grpc +from concurrent import futures +from flwr.proto import grpcadapter_pb2_grpc +from openfl.protocols import aggregator_pb2, aggregator_pb2_grpc +# from openfl.proto import openfl_pb2_grpc, openfl_pb2 +from message_conversion import flower_to_openfl_message, openfl_to_flower_message + +class OpenFLClient: + def __init__(self, openfl_server_address): + self.channel = grpc.insecure_channel(openfl_server_address) + self.stub = aggregator_pb2_grpc.AggregatorStub(self.channel) + + def send_message_to_server(self, flower_message): + # Convert Flower message to OpenFL message + openfl_message = flower_to_openfl_message(flower_message, + sender="Flower Client", + receiver="OpenFL Client") + # Send the OpenFL message to the OpenFL server and get a response + openfl_response = self.stub.PelicanDrop(openfl_message) + # Convert the OpenFL response back to a Flower message + flower_response = openfl_to_flower_message(openfl_response) + return flower_response + + +class OpenFLLocalGRPCServer(grpcadapter_pb2_grpc.GrpcAdapterServicer): + def __init__(self, openfl_client): + self.openfl_client = openfl_client + + def SendReceive(self, request, context): + # Received a message from the Flower client + print(f"Received message from Flower client, sending to OpenFL server: {request.grpc_message_name}") + # Forward the incoming message to the OpenFL client + flower_response = self.openfl_client.send_message_to_server(request) + # Sending the response back to the Flower client + print(f"Received message from OpenFL server, sending response back to Flower client: {flower_response.grpc_message_name}") + return flower_response + +def serve(openfl_server_address, local_server_port): + # Start the OpenFL client + openfl_client = OpenFLClient(openfl_server_address) + + # Start the local gRPC server + server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) + 
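# A note on the bridging flow (an assumption inferred from the classes in this file): the
+    # servicer registered below speaks Flower's GrpcAdapter protocol, so the Flower client can
+    # connect to this local endpoint unchanged; each SendReceive request is converted into an
+    # OpenFL DropPod and tunneled through the aggregator's PelicanDrop RPC by OpenFLClient above.
+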
grpcadapter_pb2_grpc.add_GrpcAdapterServicer_to_server(OpenFLLocalGRPCServer(openfl_client), server) + server.add_insecure_port(f'[::]:{local_server_port}') + server.start() + print(f"OpenFL local gRPC server started, listening on port {local_server_port}.") + server.wait_for_termination() + +if __name__ == '__main__': + openfl_server_address = '127.0.0.1:9095' # The OpenFL server's IP address and port + local_server_port = '9092' # The port the local gRPC server will listen on + serve(openfl_server_address, local_server_port) \ No newline at end of file diff --git a/openfl-workspace/flower-app-pytorch/openfl_server_with_local_grpc_client.py b/openfl-workspace/flower-app-pytorch/openfl_server_with_local_grpc_client.py new file mode 100644 index 0000000000..4ec3e95e7a --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/openfl_server_with_local_grpc_client.py @@ -0,0 +1,63 @@ +import grpc +from concurrent import futures +from flwr.proto import grpcadapter_pb2_grpc +from openfl.protocols import aggregator_pb2, aggregator_pb2_grpc +from message_conversion import flower_to_openfl_message, openfl_to_flower_message + +# from message_logger import log_message + +class LocalGRPCClient: + def __init__(self, superlink_address): + self.superlink_channel = grpc.insecure_channel(superlink_address) + self.superlink_stub = grpcadapter_pb2_grpc.GrpcAdapterStub(self.superlink_channel) + + def send_receive(self, openfl_message, client_id): + # Convert OpenFL message to Flower message + flower_message = openfl_to_flower_message(openfl_message) + # log_message(flower_message, 'sent', client_id) + # Send the Flower message to the Flower server and get a response + flower_response = self.superlink_stub.SendReceive(flower_message) + # log_message(flower_response, 'received', client_id) + # Convert Flower response to OpenFL response + print(f"Received message from Flower server, sending response back to OpenFL client: {flower_response.grpc_message_name}") + openfl_response = flower_to_openfl_message(flower_response, sender='Flower Server', receiver='OpenFL Server') + return openfl_response + +class OpenFLServer(aggregator_pb2_grpc.AggregatorServicer): + def __init__(self, local_grpc_client): + self.local_grpc_client = local_grpc_client + + def PelicanDrop(self, request, context): + """ + Args: + request (aggregator_pb2.DropPod): The request + from the collaborator. + context (grpc.ServicerContext): The context of the request. + + Returns: + aggregator_pb2.DropPod: The response to the + request. + """ + client_id = context.peer() + # Forward the incoming OpenFL message to the local gRPC client + print("Received message from OpenFL client, sending message to Flower server") + openfl_response = self.local_grpc_client.send_receive(request, client_id) + # print(f"Received message from Flower server, sending response back to OpenFL client: {openfl_response.message_type}") + return openfl_response + +def serve(superlink_address, openfl_server_port): + # Start the local gRPC client + local_grpc_client = LocalGRPCClient(superlink_address) + + # Start the OpenFL server + server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) + aggregator_pb2_grpc.add_AggregatorServicer_to_server(OpenFLServer(local_grpc_client), server) + server.add_insecure_port(f'[::]:{openfl_server_port}') + server.start() + print(f"OpenFL server started with local gRPC client.
Listening on port {openfl_server_port}.") + server.wait_for_termination() + +if __name__ == '__main__': + superlink_address = '127.0.0.1:9093' # The Flower superlink address + openfl_server_port = '9095' # The port the OpenFL server will listen on + serve(superlink_address, openfl_server_port) \ No newline at end of file diff --git a/openfl/protocols/aggregator.proto b/openfl/protocols/aggregator.proto index 09ff2f271d..b249536abb 100644 --- a/openfl/protocols/aggregator.proto +++ b/openfl/protocols/aggregator.proto @@ -16,6 +16,7 @@ service Aggregator { rpc GetTrainedModel(GetTrainedModelRequest) returns (TrainedModelResponse) {} rpc GetExperimentDescription(GetExperimentDescriptionRequest) returns (GetExperimentDescriptionResponse) {} + rpc PelicanDrop(DropPod) returns (DropPod) {} } message MessageHeader { @@ -112,3 +113,9 @@ message GetExperimentDescriptionRequest { message GetExperimentDescriptionResponse { ExperimentDescription experiment = 1; } + +message DropPod { + MessageHeader header = 1; + DataStream message = 2; + map<string, string> metadata = 3; +} From f0c41b8991072409fe03a6d0dec96066bd681691 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 21 Oct 2024 15:35:29 -0700 Subject: [PATCH 002/107] further enabling work Signed-off-by: kta-intel --- .../flower-app-pytorch/message_conversion.py | 52 ----- .../flower-app-pytorch/plan/cols.yaml | 5 + .../flower-app-pytorch/plan/plan.yaml | 44 ++++ .../workspace/plan/defaults/tasks_flower.yaml | 4 + openfl/component/aggregator/aggregator.py | 7 +- openfl/component/collaborator/collaborator.py | 14 ++ openfl/federated/plan/plan.py | 18 +- openfl/federated/task/runner.py | 5 +- openfl/federated/task/runner_flower.py | 47 ++++ openfl/interface/collaborator.py | 47 ++-- openfl/interface/plan.py | 207 +++++++++++------- openfl/transport/grpc/aggregator_client.py | 16 ++ openfl/transport/grpc/aggregator_server.py | 73 ++++++ openfl/transport/grpc/fim/__init__.py | 0 openfl/transport/grpc/fim/flower/__init__.py | 0 .../grpc/fim/flower/deserialize_message.py | 41 ++++ .../grpc/fim/flower/local_grpc_client.py | 18 ++ .../grpc/fim/flower/local_grpc_server.py | 20 ++ .../grpc/fim/flower/message_conversion.py | 35 +++ 19 files changed, 493 insertions(+), 160 deletions(-) create mode 100644 openfl-workspace/flower-app-pytorch/plan/cols.yaml create mode 100644 openfl-workspace/flower-app-pytorch/plan/plan.yaml create mode 100644 openfl-workspace/workspace/plan/defaults/tasks_flower.yaml create mode 100644 openfl/federated/task/runner_flower.py create mode 100644 openfl/transport/grpc/fim/__init__.py create mode 100644 openfl/transport/grpc/fim/flower/__init__.py create mode 100644 openfl/transport/grpc/fim/flower/deserialize_message.py create mode 100644 openfl/transport/grpc/fim/flower/local_grpc_client.py create mode 100644 openfl/transport/grpc/fim/flower/local_grpc_server.py create mode 100644 openfl/transport/grpc/fim/flower/message_conversion.py diff --git a/openfl-workspace/flower-app-pytorch/message_conversion.py b/openfl-workspace/flower-app-pytorch/message_conversion.py index b91ac8c154..ec15ac09b6 100644 --- a/openfl-workspace/flower-app-pytorch/message_conversion.py +++ b/openfl-workspace/flower-app-pytorch/message_conversion.py @@ -2,58 +2,6 @@ from openfl.protocols import aggregator_pb2 # from deserialize_message import deserialize_flower_message -# def flower_to_openfl_message(flower_message, sender, receiver): -# """Convert a Flower MessageContainer to an OpenFL OpenFLMessage.""" -# if isinstance(flower_message, aggregator_pb2.DropPod()): -# # If
the input is already an OpenFL message, return it as-is -# return flower_message -# else: -# """Convert a Flower MessageContainer to an OpenFL message.""" -# # Create the OpenFL message -# openfl_message = aggregator_pb2.DropPod() - -# # Set the MessageHeader fields based on the provided sender and receiver -# openfl_message.header.sender = sender -# openfl_message.header.receiver = receiver -# # openfl_message.message_type = flower_message.metadata['grpc-message-qualname'] -# # import pdb; pdb.set_trace() -# # serialized_flower_message = flower_message.SerializeToString() -# # openfl_message.message.npbytes = serialized_flower_message -# # openfl_message.message.size = len(serialized_flower_message) - -# # Set the DataStream message content to the serialized Flower message content -# openfl_message.message.npbytes = flower_message.grpc_message_content -# openfl_message.message.size = len(flower_message.grpc_message_content) - -# metadata = dict(flower_message.metadata) -# for key, value in metadata.items(): -# openfl_message.metadata[key] = value - -# return openfl_message - -# def openfl_to_flower_message(openfl_message): -# """Convert an OpenFL OpenFLMessage to a Flower MessageContainer.""" -# if isinstance(openfl_message, grpcadapter_pb2.MessageContainer): -# # If the input is already a Flower message, return it as-is -# return openfl_message -# else: -# # Deserialize the Flower message from the DataStream npbytes field -# flower_message = grpcadapter_pb2.MessageContainer() -# # import pdb; pdb.set_trace() -# # flower_message.ParseFromString(openfl_message.message.npbytes) -# # bytes_parsed = flower_message.grpc_message_content = openfl_message.message.npbytes - -# flower_message.grpc_message_content = openfl_message.message.npbytes - -# metadata = dict(openfl_message.metadata) -# for key, value in metadata.items(): -# flower_message.metadata[key] = value - -# # Will be depracated by Flower # -# flower_message.grpc_message_name = metadata.get('grpc-message-qualname') -# ################################ -# return flower_message - def flower_to_openfl_message(flower_message, sender, receiver): """Convert a Flower MessageContainer to an OpenFL OpenFLMessage.""" if isinstance(flower_message, aggregator_pb2.DropPod): diff --git a/openfl-workspace/flower-app-pytorch/plan/cols.yaml b/openfl-workspace/flower-app-pytorch/plan/cols.yaml new file mode 100644 index 0000000000..2d91469ca4 --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/plan/cols.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2020-2024 Intel Corporation +# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you. + +collaborators: + \ No newline at end of file diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml new file mode 100644 index 0000000000..f71a4e219b --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml @@ -0,0 +1,44 @@ +# Copyright (C) 2020-2024 Intel Corporation +# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you.
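+#
+# NOTE: this plan schedules a single start_client_adapter task and keeps
+# rounds_to_train at 1; the actual federated rounds are assumed to be driven
+# by the Flower ServerApp rather than by the OpenFL assigner.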
+ +aggregator : + defaults : plan/defaults/aggregator.yaml + template : openfl.component.Aggregator + settings : + init_state_path : null + best_state_path : null + last_state_path : null + rounds_to_train : 1 + +collaborator : + defaults : plan/defaults/collaborator.yaml + template : openfl.component.Collaborator + settings : + {} + +network : + defaults : plan/defaults/network.yaml + settings : + fim : True + +assigner : + defaults : plan/defaults/assigner.yaml + template : openfl.component.RandomGroupedAssigner + settings : + task_groups : + - name : flower_adapter + percentage : 1.0 + tasks : + - start_client_adapter + +task_runner : + defaults : plan/defaults/task_runner.yaml + template : openfl.federated.task.runner_flower.FlowerTaskRunner + settings : + {} + +tasks : + defaults : plan/defaults/tasks_flower.yaml + +compression_pipeline : + defaults : plan/defaults/compression_pipeline.yaml \ No newline at end of file diff --git a/openfl-workspace/workspace/plan/defaults/tasks_flower.yaml b/openfl-workspace/workspace/plan/defaults/tasks_flower.yaml new file mode 100644 index 0000000000..af64d0899c --- /dev/null +++ b/openfl-workspace/workspace/plan/defaults/tasks_flower.yaml @@ -0,0 +1,4 @@ +start_client_adapter: + function : start_client_adapter + kwargs : + local_server_port : 9092 diff --git a/openfl/component/aggregator/aggregator.py b/openfl/component/aggregator/aggregator.py index 81d3e7411a..c787685504 100644 --- a/openfl/component/aggregator/aggregator.py +++ b/openfl/component/aggregator/aggregator.py @@ -169,8 +169,11 @@ def __init__( tensor_pipe=self.compression_pipeline, ) else: - self.model: base_pb2.ModelProto = utils.load_proto(self.init_state_path) - self._load_initial_tensors() # keys are TensorKeys + if self.init_state_path: + self.model: base_pb2.ModelProto = utils.load_proto(self.init_state_path) + self._load_initial_tensors() # keys are TensorKeys + else: + self.model = {} self.collaborator_tensor_results = {} # {TensorKey: nparray}} diff --git a/openfl/component/collaborator/collaborator.py b/openfl/component/collaborator/collaborator.py index ce44966a71..463a14c527 100644 --- a/openfl/component/collaborator/collaborator.py +++ b/openfl/component/collaborator/collaborator.py @@ -159,6 +159,9 @@ def set_available_devices(self, cuda: Tuple[str] = ()): def run(self): """Run the collaborator.""" while True: + # if self.fim: + # self.start_flower_client_connection() + # else: tasks, round_number, sleep_time, time_to_quit = self.get_tasks() if time_to_quit: break @@ -239,6 +242,17 @@ def do_task(self, task, round_number): func_name = self.task_config[task_name]["function"] kwargs = self.task_config[task_name]["kwargs"] + if func_name == "start_client_adapter": + if hasattr(self.task_runner, func_name): + method = getattr(self.task_runner, func_name) + if callable(method): + method(self.client, **kwargs) + return + else: + raise AttributeError(f"{func_name} is not callable on {self.task_runner}") + else: + raise AttributeError(f"{func_name} does not exist on {self.task_runner}") + # this would return a list of what tensors we require as TensorKeys required_tensorkeys_relative = self.task_runner.get_required_tensorkeys_for_function( func_name, **kwargs diff --git a/openfl/federated/plan/plan.py b/openfl/federated/plan/plan.py index 5f0575837d..3bb38d79c8 100644 --- a/openfl/federated/plan/plan.py +++ b/openfl/federated/plan/plan.py @@ -486,9 +486,11 @@ def get_task_runner(self, data_loader): self.runner_ = Plan.build(**defaults) # Define task dependencies after taskrunner
has been initialized - self.runner_.initialize_tensorkeys_for_functions() - - return self.runner_ + if 'Flower' in defaults['template']: + return self.runner_ + else: + self.runner_.initialize_tensorkeys_for_functions() + return self.runner_ # Python interactive api def get_core_task_runner(self, data_loader=None, model_provider=None, task_keeper=None): @@ -595,8 +597,11 @@ def get_collaborator( ) else: # TaskRunner subclassing API - data_loader = self.get_data_loader(collaborator_name) - defaults[SETTINGS]["task_runner"] = self.get_task_runner(data_loader) + if 'Flower' in self.config["task_runner"]["template"]: + defaults[SETTINGS]["task_runner"] = self.get_task_runner(None) + else: + data_loader = self.get_data_loader(collaborator_name) + defaults[SETTINGS]["task_runner"] = self.get_task_runner(data_loader) defaults[SETTINGS]["compression_pipeline"] = self.get_tensor_pipe() defaults[SETTINGS]["task_config"] = self.config.get("tasks", {}) @@ -703,6 +708,9 @@ def get_server( server_args["aggregator"] = self.get_aggregator() + # TODO: have this set in self.config["network"] + # server_args["fim"] = True + if self.server_ is None: self.server_ = AggregatorGRPCServer(**server_args) diff --git a/openfl/federated/task/runner.py b/openfl/federated/task/runner.py index 007f74163d..804f754768 100644 --- a/openfl/federated/task/runner.py +++ b/openfl/federated/task/runner.py @@ -37,7 +37,10 @@ def __init__(self, data_loader, tensor_dict_split_fn_kwargs: dict = None, **kwar **kwargs: Additional parameters to pass to the function. """ self.data_loader = data_loader + if self.data_loader: + self.feature_shape = self.data_loader.get_feature_shape() + else: + self.feature_shape = None - self.feature_shape = self.data_loader.get_feature_shape() # TODO: Should this comment a path of the doc string? # key word arguments for determining which parameters to hold out from # aggregation. diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py new file mode 100644 index 0000000000..5e1f70df38 --- /dev/null +++ b/openfl/federated/task/runner_flower.py @@ -0,0 +1,47 @@ +import grpc +from concurrent.futures import ThreadPoolExecutor +from flwr.proto import grpcadapter_pb2_grpc +from multiprocessing import cpu_count +from openfl.federated.task.runner import TaskRunner +from openfl.transport.grpc.fim.flower.local_grpc_server import LocalGRPCServer +import subprocess + + +class FlowerTaskRunner(TaskRunner): + def __init__(self, **kwargs): + """Initializes the FlowerTaskRunner object. + + Args: + **kwargs: Additional parameters to pass to the functions.
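+
+        Note: this runner is constructed without an OpenFL data loader; data loading
+        and partitioning are assumed to be handled by the Flower app itself via the
+        supernode's node-config (see start_client_adapter below).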
+ """ + super().__init__(**kwargs) + + def start_client_adapter(self, openfl_client, **kwargs): + local_server_port = kwargs['local_server_port'] + + # Start the local gRPC server + server = grpc.server(ThreadPoolExecutor(max_workers=cpu_count())) + grpcadapter_pb2_grpc.add_GrpcAdapterServicer_to_server(LocalGRPCServer(openfl_client), server) + + # TODO: add restrictions + server.add_insecure_port(f'[::]:{local_server_port}') + server.start() + print(f"OpenFL local gRPC server started, listening on port {local_server_port}.") + + # Start the Flower supernode in a subprocess + # import pdb; pdb.set_trace() + supernode_process = subprocess.Popen([ + "flower-supernode", + kwargs.get('app_path', './app-pytorch'), + "--insecure", + "--grpc-adapter", + "--superlink", f"127.0.0.1:{local_server_port}", + "--node-config", f"num-partitions={kwargs.get('num_partitions', 1)}", + "--node-config", f"partition-id={kwargs.get('partition_id', 0)}" + ], shell=False) + + server.wait_for_termination() + + supernode_process.terminate() + supernode_process.wait() \ No newline at end of file diff --git a/openfl/interface/collaborator.py b/openfl/interface/collaborator.py index 9bad1e9716..5edb371008 100644 --- a/openfl/interface/collaborator.py +++ b/openfl/interface/collaborator.py @@ -55,7 +55,7 @@ def collaborator(context): required=False, help="The data set/shard configuration file [plan/data.yaml]", default="plan/data.yaml", - type=ClickPath(exists=True), + type=ClickPath(exists=False), ) @option( "-n", @@ -63,27 +63,42 @@ def collaborator(context): required=True, help="The certified common name of the collaborator", ) -def start_(plan, collaborator_name, data_config): +@option( + "-fim", + "--framework_interoperability_mode", + required=False, + help="For interoperability with other FL frameworks. 
True/False [Default: True]", + default=False, +) +def start_(plan, collaborator_name, data_config, framework_interoperability_mode): """Start a collaborator service.""" - if plan and is_directory_traversal(plan): - echo("Federated learning plan path is out of the openfl workspace scope.") - sys.exit(1) - if data_config and is_directory_traversal(data_config): - echo("The data set/shard configuration file path is out of the openfl workspace scope.") - sys.exit(1) + if framework_interoperability_mode: + plan = Plan.parse( + plan_config_path=Path(plan).absolute(), + ) + logger.info("🧿 Starting a Collaborator Service.") + plan.get_collaborator(collaborator_name).run() - plan = Plan.parse( - plan_config_path=Path(plan).absolute(), - data_config_path=Path(data_config).absolute(), - ) + else: + if plan and is_directory_traversal(plan): + echo("Federated learning plan path is out of the openfl workspace scope.") + sys.exit(1) + if data_config and is_directory_traversal(data_config): + echo("The data set/shard configuration file path is out of the openfl workspace scope.") + sys.exit(1) + + plan = Plan.parse( + plan_config_path=Path(plan).absolute(), + data_config_path=Path(data_config).absolute(), + ) - # TODO: Need to restructure data loader config file loader + # TODO: Need to restructure data loader config file loader - echo(f"Data = {plan.cols_data_paths}") - logger.info("🧿 Starting a Collaborator Service.") + echo(f"Data = {plan.cols_data_paths}") + logger.info("🧿 Starting a Collaborator Service.") - plan.get_collaborator(collaborator_name).run() + plan.get_collaborator(collaborator_name).run() @collaborator.command(name="create") diff --git a/openfl/interface/plan.py b/openfl/interface/plan.py index f4c91faed0..e39e9065e3 100644 --- a/openfl/interface/plan.py +++ b/openfl/interface/plan.py @@ -55,7 +55,7 @@ def plan(context): required=False, help="Authorized collaborator list [plan/cols.yaml]", default="plan/cols.yaml", - type=ClickPath(exists=True), + type=ClickPath(exists=False), ) @option( "-d", @@ -63,7 +63,7 @@ def plan(context): required=False, help="The data set/shard configuration file [plan/data.yaml]", default="plan/data.yaml", - type=ClickPath(exists=True), + type=ClickPath(exists=False), ) @option( "-a", @@ -94,6 +94,13 @@ def plan(context): help="Install packages listed under 'requirements.txt'. True/False [Default: True]", default=True, ) +@option( + "-fim", + "--framework_interoperability_mode", + required=False, + help="For interoperability with other FL frameworks. True/False [Default: True]", + default=False, +) def initialize( context, plan_config, @@ -103,6 +110,7 @@ def initialize( input_shape, gandlf_config, install_reqs, + framework_interoperability_mode ): """Initialize Data Science plan. @@ -119,107 +127,138 @@ def initialize( gandlf_config (str): GaNDLF Configuration File Path. install_reqs (bool): Whether to install packages listed under 'requirements.txt'. 
""" + if framework_interoperability_mode: + plan_config = Path(plan_config).absolute() + cols_config = Path(cols_config).absolute() - for p in [plan_config, cols_config, data_config]: - if is_directory_traversal(p): - echo(f"{p} is out of the openfl workspace scope.") - sys.exit(1) - - plan_config = Path(plan_config).absolute() - cols_config = Path(cols_config).absolute() - data_config = Path(data_config).absolute() - if gandlf_config is not None: - gandlf_config = Path(gandlf_config).absolute() - - if install_reqs: - requirements_filename = "requirements.txt" - requirements_path = Path(requirements_filename).absolute() - - if isfile(f"{str(requirements_path)}"): - check_call( - [ - sys.executable, - "-m", - "pip", - "install", - "-r", - f"{str(requirements_path)}", - ], - shell=False, + plan = Plan.parse( + plan_config_path=plan_config, + cols_config_path=cols_config, + ) + + plan_origin = Plan.parse( + plan_config_path=plan_config, + resolve=False, + ) + + if plan_origin.config["network"]["settings"]["agg_addr"] == "auto" or aggregator_address: + plan_origin.config["network"]["settings"]["agg_addr"] = aggregator_address or getfqdn_env() + + logger.warn( + f"Patching Aggregator Addr in Plan" + f" 🠆 {plan_origin.config['network']['settings']['agg_addr']}" ) - echo(f"Successfully installed packages from {requirements_path}.") - - # Required to restart the process for newly installed packages to be recognized - args_restart = [arg for arg in sys.argv if not arg.startswith("--install_reqs")] - args_restart.append("--install_reqs=False") - os.execv(args_restart[0], args_restart) - else: - echo("No additional requirements for workspace defined. Skipping...") - - plan = Plan.parse( - plan_config_path=plan_config, - cols_config_path=cols_config, - data_config_path=data_config, - gandlf_config_path=gandlf_config, - ) - init_state_path = plan.config["aggregator"]["settings"]["init_state_path"] + Plan.dump(plan_config, plan_origin.config) - # This is needed to bypass data being locally available - if input_shape is not None: - logger.info( - "Attempting to generate initial model weights with" f" custom input shape {input_shape}" + # Record that plan with this hash has been initialized + if "plans" not in context.obj: + context.obj["plans"] = [] + context.obj["plans"].append(f"{plan_config.stem}_{plan_origin.hash[:8]}") + logger.info(f"{context.obj['plans']}") + + else: + + for p in [plan_config, cols_config, data_config]: + if is_directory_traversal(p): + echo(f"{p} is out of the openfl workspace scope.") + sys.exit(1) + + plan_config = Path(plan_config).absolute() + cols_config = Path(cols_config).absolute() + data_config = Path(data_config).absolute() + if gandlf_config is not None: + gandlf_config = Path(gandlf_config).absolute() + + if install_reqs: + requirements_filename = "requirements.txt" + requirements_path = Path(requirements_filename).absolute() + + if isfile(f"{str(requirements_path)}"): + check_call( + [ + sys.executable, + "-m", + "pip", + "install", + "-r", + f"{str(requirements_path)}", + ], + shell=False, + ) + echo(f"Successfully installed packages from {requirements_path}.") + + # Required to restart the process for newly installed packages to be recognized + args_restart = [arg for arg in sys.argv if not arg.startswith("--install_reqs")] + args_restart.append("--install_reqs=False") + os.execv(args_restart[0], args_restart) + else: + echo("No additional requirements for workspace defined. 
Skipping...") + + plan = Plan.parse( + plan_config_path=plan_config, + cols_config_path=cols_config, + data_config_path=data_config, + gandlf_config_path=gandlf_config, ) - data_loader = get_dataloader(plan, prefer_minimal=True, input_shape=input_shape) + init_state_path = plan.config["aggregator"]["settings"]["init_state_path"] - task_runner = plan.get_task_runner(data_loader) - tensor_pipe = plan.get_tensor_pipe() + # This is needed to bypass data being locally available + if input_shape is not None: + logger.info( + "Attempting to generate initial model weights with" f" custom input shape {input_shape}" + ) - tensor_dict, holdout_params = split_tensor_dict_for_holdouts( - logger, - task_runner.get_tensor_dict(False), - **task_runner.tensor_dict_split_fn_kwargs, - ) + data_loader = get_dataloader(plan, prefer_minimal=True, input_shape=input_shape) - logger.warn( - f"Following parameters omitted from global initial model, " - f"local initialization will determine" - f" values: {list(holdout_params.keys())}" - ) + task_runner = plan.get_task_runner(data_loader) + tensor_pipe = plan.get_tensor_pipe() - model_snap = utils.construct_model_proto( - tensor_dict=tensor_dict, round_number=0, tensor_pipe=tensor_pipe - ) + tensor_dict, holdout_params = split_tensor_dict_for_holdouts( + logger, + task_runner.get_tensor_dict(False), + **task_runner.tensor_dict_split_fn_kwargs, + ) - logger.info("Creating Initial Weights File 🠆 %s", init_state_path) + logger.warn( + f"Following parameters omitted from global initial model, " + f"local initialization will determine" + f" values: {list(holdout_params.keys())}" + ) - utils.dump_proto(model_proto=model_snap, fpath=init_state_path) + model_snap = utils.construct_model_proto( + tensor_dict=tensor_dict, round_number=0, tensor_pipe=tensor_pipe + ) - plan_origin = Plan.parse( - plan_config_path=plan_config, - gandlf_config_path=gandlf_config, - resolve=False, - ) + logger.info("Creating Initial Weights File 🠆 %s", init_state_path) - if plan_origin.config["network"]["settings"]["agg_addr"] == "auto" or aggregator_address: - plan_origin.config["network"]["settings"]["agg_addr"] = aggregator_address or getfqdn_env() + utils.dump_proto(model_proto=model_snap, fpath=init_state_path) - logger.warn( - f"Patching Aggregator Addr in Plan" - f" 🠆 {plan_origin.config['network']['settings']['agg_addr']}" + plan_origin = Plan.parse( + plan_config_path=plan_config, + gandlf_config_path=gandlf_config, + resolve=False, ) - Plan.dump(plan_config, plan_origin.config) + if plan_origin.config["network"]["settings"]["agg_addr"] == "auto" or aggregator_address: + plan_origin.config["network"]["settings"]["agg_addr"] = aggregator_address or getfqdn_env() + + logger.warn( + f"Patching Aggregator Addr in Plan" + f" 🠆 {plan_origin.config['network']['settings']['agg_addr']}" + ) + + Plan.dump(plan_config, plan_origin.config) - if gandlf_config is not None: - Plan.dump(plan_config, plan_origin.config) + if gandlf_config is not None: + Plan.dump(plan_config, plan_origin.config) - # Record that plan with this hash has been initialized - if "plans" not in context.obj: - context.obj["plans"] = [] - context.obj["plans"].append(f"{plan_config.stem}_{plan_origin.hash[:8]}") - logger.info(f"{context.obj['plans']}") + # Record that plan with this hash has been initialized + if "plans" not in context.obj: + context.obj["plans"] = [] + context.obj["plans"].append(f"{plan_config.stem}_{plan_origin.hash[:8]}") + logger.info(f"{context.obj['plans']}") # TODO: looks like Plan.method diff --git 
a/openfl/transport/grpc/aggregator_client.py b/openfl/transport/grpc/aggregator_client.py index 14c324e8a3..ff2f6bc53a 100644 --- a/openfl/transport/grpc/aggregator_client.py +++ b/openfl/transport/grpc/aggregator_client.py @@ -13,9 +13,11 @@ from openfl.pipelines import NoCompressionPipeline from openfl.protocols import aggregator_pb2, aggregator_pb2_grpc, utils from openfl.transport.grpc.grpc_channel_options import channel_options +from openfl.transport.grpc.fim.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message from openfl.utilities import check_equal + class ConstantBackoff: """Constant Backoff policy. @@ -485,6 +487,20 @@ def send_local_task_results( # also do other validation, like on the round_number self.validate_response(response, collaborator_name) + @_atomic_connection + @_resend_data_on_reconnection + def send_message_to_server(self, flower_message, collaborator_name): + # TODO: how to add header information to the message? + self._set_header(collaborator_name) + openfl_message = flower_to_openfl_message(flower_message, + header=self.header) + openfl_response = self.stub.PelicanDrop(openfl_message) + # Validate openFL response + self.validate_response(openfl_response, collaborator_name) + flower_response = openfl_to_flower_message(openfl_response) + # Validate flower response (deserialize message?) + return flower_response + def _get_trained_model(self, experiment_name, model_type): """Get trained model RPC. diff --git a/openfl/transport/grpc/aggregator_server.py b/openfl/transport/grpc/aggregator_server.py index b7c54813af..5c95d18053 100644 --- a/openfl/transport/grpc/aggregator_server.py +++ b/openfl/transport/grpc/aggregator_server.py @@ -14,8 +14,11 @@ from openfl.protocols import aggregator_pb2, aggregator_pb2_grpc, utils from openfl.transport.grpc.grpc_channel_options import channel_options +from openfl.transport.grpc.fim.flower.local_grpc_client import LocalGRPCClient from openfl.utilities import check_equal, check_is_in +import subprocess + logger = logging.getLogger(__name__) @@ -50,6 +53,7 @@ def __init__( root_certificate=None, certificate=None, private_key=None, + fim=False, # Add a flag for Flower transport mode **kwargs, ): """ @@ -68,6 +72,7 @@ TLS connection. private_key (str): The path to the server's private key for the TLS connection. + fim (bool): whether to use framework interoperability mode **kwargs: Additional keyword arguments. """ self.aggregator = aggregator @@ -80,6 +85,13 @@ self.server = None self.server_credentials = None + self.fim = fim + if self.fim: + superlink_address = '127.0.0.1:9093' # kwargs.get("superlink_address") + self.local_grpc_client = LocalGRPCClient(superlink_address) # Initialize the local gRPC client for Flower + else: + self.local_grpc_client = None + self.logger = logging.getLogger(__name__) def validate_collaborator(self, request, context): @@ -177,6 +189,9 @@ def GetTasks(self, request, context): # NOQA:N802 Returns: aggregator_pb2.GetTasksResponse: The response to the request. """ + # if self.fim: + # context.abort(StatusCode.UNIMPLEMENTED, "This method is not available in framework interoperability mode.") + self.validate_collaborator(request, context) self.check_request(request) collaborator_name = request.header.sender @@ -228,6 +243,9 @@ def GetAggregatedTensor(self, request, context): # NOQA:N802 aggregator_pb2.GetAggregatedTensorResponse: The response to the request.
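+
+        Note: in framework interoperability mode (fim) this RPC aborts with
+        UNIMPLEMENTED, since tensor exchange is bypassed in that mode.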
""" + if self.fim: + context.abort(StatusCode.UNIMPLEMENTED, "This method is not available in framework interopability mode.") + self.validate_collaborator(request, context) self.check_request(request) collaborator_name = request.header.sender @@ -267,6 +285,9 @@ def SendLocalTaskResults(self, request, context): # NOQA:N802 aggregator_pb2.SendLocalTaskResultsResponse: The response to the request. """ + if self.fim: + context.abort(StatusCode.UNIMPLEMENTED, "This method is not available in framework interopability mode.") + try: proto = aggregator_pb2.TaskResults() proto = utils.datastream_to_proto(proto, request) @@ -292,6 +313,31 @@ def SendLocalTaskResults(self, request, context): # NOQA:N802 header=self.get_header(collaborator_name) ) + def PelicanDrop(self, request, context): + """ + Args: + request (aggregator_pb2.PelicanDrop): The request + from the collaborator. + context (grpc.ServicerContext): The context of the request. + + Returns: + aggregator_pb2.PelicanDrop: The response to the + request. + """ + if not self.fim: + context.abort(StatusCode.UNIMPLEMENTED, "PelicanDrop is only available in framework interopability mode.") + + #TODO: This needs to be done when receiving from OpenFL client (not from local gRPC client, local gRPC should have it's own verification) + #This seems like the right step for this verification since we don't send to local gRPC yet + self.validate_collaborator(request, context) + self.check_request(request) + collaborator_name = request.header.sender + + #TODO: Extract header information from client request (i.e. collaborator_name = request.header.sender) + # Forward the incoming OpenFL message to the local gRPC client + print(f"OpenFL Server: Received message from OpenFL client, sending message to Flower server") + return self.local_grpc_client.send_receive(request, header=self.get_header(collaborator_name)) + def get_server(self): """ Return gRPC server. @@ -302,6 +348,7 @@ def get_server(self): Returns: grpc.Server: The gRPC server. """ + # TODO: Need to launch superlink and flower server app somewhere self.server = server(ThreadPoolExecutor(max_workers=cpu_count()), options=channel_options) aggregator_pb2_grpc.add_AggregatorServicer_to_server(self, self.server) @@ -337,7 +384,26 @@ def serve(self): This method starts the gRPC server and handles requests until all quit jobs havebeen sent. 
+ """ + if getattr(self, 'fim', False): + # Start the Flower superlink in a subprocess + superlink_process = subprocess.Popen([ + "flower-superlink", + "--insecure", + "--fleet-api-type", "grpc-adapter", + "--fleet-api-address", "127.0.0.1:9093", + "--driver-api-address", "127.0.0.1:9091" + ], shell=False) + + # Start the Flower server app in a subprocess + server_app_process = subprocess.Popen([ + "flower-server-app", + "./app-pytorch", + "--insecure", + "--superlink", "127.0.0.1:9091" + ], shell=False) + self.get_server() self.logger.info("Starting Aggregator gRPC Server") @@ -350,3 +416,10 @@ def serve(self): pass self.server.stop(0) + + if getattr(self, 'fim', False): + superlink_process.terminate() + server_app_process.terminate() + + superlink_process.wait() + server_app_process.wait() diff --git a/openfl/transport/grpc/fim/__init__.py b/openfl/transport/grpc/fim/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/openfl/transport/grpc/fim/flower/__init__.py b/openfl/transport/grpc/fim/flower/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/openfl/transport/grpc/fim/flower/deserialize_message.py b/openfl/transport/grpc/fim/flower/deserialize_message.py new file mode 100644 index 0000000000..46f37e8acc --- /dev/null +++ b/openfl/transport/grpc/fim/flower/deserialize_message.py @@ -0,0 +1,41 @@ +import importlib +from google.protobuf.message import DecodeError + +def deserialize_flower_message(flower_message): + """ + Deserialize the grpc_message_content of a Flower message using the module and class name + specified in the metadata. + + Args: + flower_message: The Flower message containing the metadata and binary content. + + Returns: + The deserialized message object, or None if deserialization fails. + """ + # Access metadata directly + metadata = flower_message.metadata + module_name = metadata.get('grpc-message-module') + qualname = metadata.get('grpc-message-qualname') + + # Import the module + try: + module = importlib.import_module(module_name) + except ImportError as e: + print(f"Failed to import module: {module_name}. Error: {e}") + return None + + # Get the message class + try: + message_class = getattr(module, qualname) + except AttributeError as e: + print(f"Failed to get message class '{qualname}' from module '{module_name}'. Error: {e}") + return None + + # Deserialize the content + try: + message = message_class.FromString(flower_message.grpc_message_content) + except DecodeError as e: + print(f"Failed to deserialize message content. 
Error: {e}") + return None + + return message \ No newline at end of file diff --git a/openfl/transport/grpc/fim/flower/local_grpc_client.py b/openfl/transport/grpc/fim/flower/local_grpc_client.py new file mode 100644 index 0000000000..530be37746 --- /dev/null +++ b/openfl/transport/grpc/fim/flower/local_grpc_client.py @@ -0,0 +1,18 @@ +import grpc +from flwr.proto import grpcadapter_pb2_grpc +from openfl.transport.grpc.fim.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message + +class LocalGRPCClient: + def __init__(self, superlink_address): + self.superlink_channel = grpc.insecure_channel(superlink_address) + self.superlink_stub = grpcadapter_pb2_grpc.GrpcAdapterStub(self.superlink_channel) + + def send_receive(self, openfl_message, header): + # TODO: verification step for messages coming from Flower server + collaborator_name = openfl_message.header.sender + + flower_message = openfl_to_flower_message(openfl_message) + flower_response = self.superlink_stub.SendReceive(flower_message) + # print(f"Received message from Flower server, sending response through OpenFL server back to OpenFL client: {flower_response.grpc_message_name}") + openfl_response = flower_to_openfl_message(flower_response, header=header) + return openfl_response diff --git a/openfl/transport/grpc/fim/flower/local_grpc_server.py b/openfl/transport/grpc/fim/flower/local_grpc_server.py new file mode 100644 index 0000000000..332467f203 --- /dev/null +++ b/openfl/transport/grpc/fim/flower/local_grpc_server.py @@ -0,0 +1,20 @@ +import grpc +from concurrent.futures import ThreadPoolExecutor +from flwr.proto import grpcadapter_pb2_grpc +from multiprocessing import cpu_count +from openfl.protocols import aggregator_pb2, aggregator_pb2_grpc +from openfl.transport import AggregatorGRPCClient +from openfl.transport.grpc.fim.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message + + +class LocalGRPCServer(grpcadapter_pb2_grpc.GrpcAdapterServicer): + def __init__(self, openfl_client): + self.openfl_client = openfl_client + + def SendReceive(self, request, context): + # print(f"Received message from Flower client, sending through OpenFL client to OpenFL server: {request.grpc_message_name}") + # Forward the incoming message to the OpenFL client + flower_response = self.openfl_client.send_message_to_server(request) + # Sending the response back to the Flower client + # print(f"Received message from OpenFL server, sending response through OpenFL client back to Flower client: {flower_response.grpc_message_name}") + return flower_response \ No newline at end of file diff --git a/openfl/transport/grpc/fim/flower/message_conversion.py b/openfl/transport/grpc/fim/flower/message_conversion.py new file mode 100644 index 0000000000..cc1344d18b --- /dev/null +++ b/openfl/transport/grpc/fim/flower/message_conversion.py @@ -0,0 +1,35 @@ +from flwr.proto import grpcadapter_pb2 +from openfl.protocols import aggregator_pb2 +# from deserialize_message import deserialize_flower_message + +def flower_to_openfl_message(flower_message, header): + """Convert a Flower MessageContainer to an OpenFL OpenFLMessage.""" + if isinstance(flower_message, aggregator_pb2.DropPod()): + # If the input is already an OpenFL message, return it as-is + return flower_message + else: + """Convert a Flower MessageContainer to an OpenFL message.""" + # Create the OpenFL message + openfl_message = aggregator_pb2.DropPod() + + # Set the MessageHeader fields based on the provided sender and receiver + 
openfl_message.header = header + # openfl_message.message_type = flower_message.metadata['grpc-message-qualname'] + serialized_flower_message = flower_message.SerializeToString() + openfl_message.message.npbytes = serialized_flower_message + openfl_message.message.size = len(serialized_flower_message) + + return openfl_message + +def openfl_to_flower_message(openfl_message): + """Convert an OpenFL OpenFLMessage to a Flower MessageContainer.""" + if isinstance(openfl_message, grpcadapter_pb2.MessageContainer): + # If the input is already a Flower message, return it as-is + return openfl_message + else: + # Deserialize the Flower message from the DataStream npbytes field + flower_message = grpcadapter_pb2.MessageContainer() + flower_message.ParseFromString(openfl_message.message.npbytes) + bytes_parsed = openfl_message.message.npbytes + # import pdb; pdb.set_trace() + return flower_message \ No newline at end of file From aa08b5e61ad0c23bb6023556480078b761754417 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Tue, 22 Oct 2024 07:45:36 -0700 Subject: [PATCH 003/107] additional updates Signed-off-by: kta-intel --- openfl/component/collaborator/collaborator.py | 3 +-- openfl/federated/task/runner_flower.py | 13 +++++++------ openfl/transport/grpc/aggregator_client.py | 2 +- .../transport/grpc/fim/flower/local_grpc_server.py | 5 +++-- .../transport/grpc/fim/flower/message_conversion.py | 3 +-- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/openfl/component/collaborator/collaborator.py b/openfl/component/collaborator/collaborator.py index 463a14c527..d4dc11542e 100644 --- a/openfl/component/collaborator/collaborator.py +++ b/openfl/component/collaborator/collaborator.py @@ -241,12 +241,11 @@ def do_task(self, task, round_number): task_name = task.name func_name = self.task_config[task_name]["function"] kwargs = self.task_config[task_name]["kwargs"] - if func_name=="start_client_adapter": if hasattr(self.task_runner, func_name): method = getattr(self.task_runner, func_name) if callable(method): - method(self.client, **kwargs) + method(self.client, self.collaborator_name, **kwargs) return else: raise AttributeError(f"{func_name} is not callable on {self.task_runner}") diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 5e1f70df38..f0a91567b7 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -17,12 +17,12 @@ def __init__(self, **kwargs): """ super().__init__(**kwargs) - def start_client_adapter(self, openfl_client, **kwargs): + def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): local_server_port = kwargs['local_server_port'] # Start the local gRPC server server = grpc.server(ThreadPoolExecutor(max_workers=cpu_count())) - grpcadapter_pb2_grpc.add_GrpcAdapterServicer_to_server(LocalGRPCServer(openfl_client), server) + grpcadapter_pb2_grpc.add_GrpcAdapterServicer_to_server(LocalGRPCServer(openfl_client, collaborator_name), server) # TODO: add restrictions server.add_insecure_port(f'[::]:{local_server_port}') @@ -31,15 +31,16 @@ def start_client_adapter(self, openfl_client, **kwargs): # Start the Flower supernode in a subprocess # import pdb; pdb.set_trace() - supernode_process = subprocess.Popen([ + command = [ "flower-supernode", kwargs.get('app_path', './app-pytorch'), "--insecure", "--grpc-adapter", "--superlink", f"127.0.0.1:{local_server_port}", - "--node-config", f"num-partitions={kwargs.get('num_partitions', 1)}", - "--node-config", 
f"partition-id={kwargs.get('partition_id', 0)}" - ], shell=False) + "--node-config", f"num-partitions={kwargs.get('num_partitions', 1)} partition-id={kwargs.get('partition_id', 0)}" + ] + # Start the subprocess + supernode_process = subprocess.Popen(command, shell=False) server.wait_for_termination() diff --git a/openfl/transport/grpc/aggregator_client.py b/openfl/transport/grpc/aggregator_client.py index ff2f6bc53a..7c189f06f7 100644 --- a/openfl/transport/grpc/aggregator_client.py +++ b/openfl/transport/grpc/aggregator_client.py @@ -496,7 +496,7 @@ def send_message_to_server(self, flower_message, collaborator_name): header=self.header) openfl_response = self.stub.PelicanDrop(openfl_message) # Validate openFL response - self.validate_response(response, collaborator_name) + self.validate_response(openfl_response, collaborator_name) flower_response = openfl_to_flower_message(openfl_response) # Validate flower response (deserialize message?) return flower_response diff --git a/openfl/transport/grpc/fim/flower/local_grpc_server.py b/openfl/transport/grpc/fim/flower/local_grpc_server.py index 332467f203..cbb883898f 100644 --- a/openfl/transport/grpc/fim/flower/local_grpc_server.py +++ b/openfl/transport/grpc/fim/flower/local_grpc_server.py @@ -8,13 +8,14 @@ class LocalGRPCServer(grpcadapter_pb2_grpc.GrpcAdapterServicer): - def __init__(self, openfl_client): + def __init__(self, openfl_client, collaborator_name): self.openfl_client = openfl_client + self.collaborator_name = collaborator_name def SendReceive(self, request, context): # print(f"Received message from Flower client, sending through OpenFL client to OpenFL server: {request.grpc_message_name}") # Forward the incoming message to the OpenFL client - flower_response = self.openfl_client.send_message_to_server(request) + flower_response = self.openfl_client.send_message_to_server(request, self.collaborator_name) # Sending the response back to the Flower client # print(f"Received message from OpenFL server, sending response through OpenFL client back to Flower client: {flower_response.grpc_message_name}") return flower_response \ No newline at end of file diff --git a/openfl/transport/grpc/fim/flower/message_conversion.py b/openfl/transport/grpc/fim/flower/message_conversion.py index cc1344d18b..81f05f8094 100644 --- a/openfl/transport/grpc/fim/flower/message_conversion.py +++ b/openfl/transport/grpc/fim/flower/message_conversion.py @@ -11,9 +11,8 @@ def flower_to_openfl_message(flower_message, header): """Convert a Flower MessageContainer to an OpenFL message.""" # Create the OpenFL message openfl_message = aggregator_pb2.DropPod() - # Set the MessageHeader fields based on the provided sender and receiver - openfl_message.header = header + openfl_message.header.CopyFrom(header) # openfl_message.message_type = flower_message.metadata['grpc-message-qualname'] serialized_flower_message = flower_message.SerializeToString() openfl_message.message.npbytes = serialized_flower_message From e0b11030b0c360d886d3836650fd5d8a83d77613 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Wed, 23 Oct 2024 12:30:09 -0700 Subject: [PATCH 004/107] updates Signed-off-by: kta-intel --- .../flower-app-pytorch/app-pytorch/.gitignore | 160 ------------------ .../flower-app-pytorch/deserialize_message.py | 41 ----- .../flower-app-pytorch/message_conversion.py | 36 ---- .../flower-app-pytorch/message_logger.py | 109 ------------ .../openfl_client_with_local_grpc_server.py | 53 ------ .../openfl_server_with_local_grpc_client.py | 63 ------- 
openfl/transport/grpc/aggregator_client.py | 1 - openfl/transport/grpc/aggregator_server.py | 4 +- .../grpc/fim/flower/local_grpc_client.py | 2 - 9 files changed, 1 insertion(+), 468 deletions(-) delete mode 100644 openfl-workspace/flower-app-pytorch/app-pytorch/.gitignore delete mode 100644 openfl-workspace/flower-app-pytorch/deserialize_message.py delete mode 100644 openfl-workspace/flower-app-pytorch/message_conversion.py delete mode 100644 openfl-workspace/flower-app-pytorch/message_logger.py delete mode 100644 openfl-workspace/flower-app-pytorch/openfl_client_with_local_grpc_server.py delete mode 100644 openfl-workspace/flower-app-pytorch/openfl_server_with_local_grpc_client.py diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/.gitignore b/openfl-workspace/flower-app-pytorch/app-pytorch/.gitignore deleted file mode 100644 index 68bc17f9ff..0000000000 --- a/openfl-workspace/flower-app-pytorch/app-pytorch/.gitignore +++ /dev/null @@ -1,160 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/#use-with-ide -.pdm.toml - -# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ diff --git a/openfl-workspace/flower-app-pytorch/deserialize_message.py b/openfl-workspace/flower-app-pytorch/deserialize_message.py deleted file mode 100644 index 46f37e8acc..0000000000 --- a/openfl-workspace/flower-app-pytorch/deserialize_message.py +++ /dev/null @@ -1,41 +0,0 @@ -import importlib -from google.protobuf.message import DecodeError - -def deserialize_flower_message(flower_message): - """ - Deserialize the grpc_message_content of a Flower message using the module and class name - specified in the metadata. - - Args: - flower_message: The Flower message containing the metadata and binary content. - - Returns: - The deserialized message object, or None if deserialization fails. - """ - # Access metadata directly - metadata = flower_message.metadata - module_name = metadata.get('grpc-message-module') - qualname = metadata.get('grpc-message-qualname') - - # Import the module - try: - module = importlib.import_module(module_name) - except ImportError as e: - print(f"Failed to import module: {module_name}. Error: {e}") - return None - - # Get the message class - try: - message_class = getattr(module, qualname) - except AttributeError as e: - print(f"Failed to get message class '{qualname}' from module '{module_name}'. Error: {e}") - return None - - # Deserialize the content - try: - message = message_class.FromString(flower_message.grpc_message_content) - except DecodeError as e: - print(f"Failed to deserialize message content. 
Error: {e}") - return None - - return message \ No newline at end of file diff --git a/openfl-workspace/flower-app-pytorch/message_conversion.py b/openfl-workspace/flower-app-pytorch/message_conversion.py deleted file mode 100644 index ec15ac09b6..0000000000 --- a/openfl-workspace/flower-app-pytorch/message_conversion.py +++ /dev/null @@ -1,36 +0,0 @@ -from flwr.proto import grpcadapter_pb2 -from openfl.protocols import aggregator_pb2 -# from deserialize_message import deserialize_flower_message - -def flower_to_openfl_message(flower_message, sender, receiver): - """Convert a Flower MessageContainer to an OpenFL OpenFLMessage.""" - if isinstance(flower_message, aggregator_pb2.DropPod()): - # If the input is already an OpenFL message, return it as-is - return flower_message - else: - """Convert a Flower MessageContainer to an OpenFL message.""" - # Create the OpenFL message - openfl_message = aggregator_pb2.DropPod() - - # Set the MessageHeader fields based on the provided sender and receiver - openfl_message.header.sender = sender - openfl_message.header.receiver = receiver - # openfl_message.message_type = flower_message.metadata['grpc-message-qualname'] - serialized_flower_message = flower_message.SerializeToString() - openfl_message.message.npbytes = serialized_flower_message - openfl_message.message.size = len(serialized_flower_message) - - return openfl_message - -def openfl_to_flower_message(openfl_message): - """Convert an OpenFL OpenFLMessage to a Flower MessageContainer.""" - if isinstance(openfl_message, grpcadapter_pb2.MessageContainer): - # If the input is already a Flower message, return it as-is - return openfl_message - else: - # Deserialize the Flower message from the DataStream npbytes field - flower_message = grpcadapter_pb2.MessageContainer() - flower_message.ParseFromString(openfl_message.message.npbytes) - bytes_parsed = openfl_message.message.npbytes - # import pdb; pdb.set_trace() - return flower_message \ No newline at end of file diff --git a/openfl-workspace/flower-app-pytorch/message_logger.py b/openfl-workspace/flower-app-pytorch/message_logger.py deleted file mode 100644 index 5f5b5ca152..0000000000 --- a/openfl-workspace/flower-app-pytorch/message_logger.py +++ /dev/null @@ -1,109 +0,0 @@ -# import logging -# from deserialize_message import deserialize_flower_message - -# # Configure logging -# logging.basicConfig(filename='flower_messages.log', level=logging.INFO, format='%(asctime)s - %(message)s') - -# def log_flower_message(flower_message, message_type): -# """ -# Log a Flower message or response with deserialized content. - -# Args: -# flower_message: The Flower message or response to be logged. -# message_type: A string indicating the type of message ('sent' or 'received'). -# """ -# # Deserialize the grpc_message_content -# deserialized_content = deserialize_flower_message(flower_message) - -# # Prepare the log entry -# message_str = f"Flower message {message_type}:\n{flower_message}" -# if deserialized_content is not None: -# message_str += f"\nDeserialized content:\n{deserialized_content}" -# else: -# message_str += "\nDeserialization failed" - -# # Add separator -# message_str += f"\n{'=' * 40}\n" - -# # Log the message with deserialized content and separator -# logging.info(message_str) - -# # This function can be used to log messages from other parts of your application -# def log_message(flower_message, message_type): -# """ -# Public function to log a Flower message or response. 
- -# Args: -# flower_message: The Flower message or response to be logged. -# message_type: A string indicating the type of message ('sent' or 'received'). -# """ -# log_flower_message(flower_message, message_type) - -import logging -import os -from deserialize_message import deserialize_flower_message - -def get_logger(client_id): - """ - Get a logger for a specific client ID. - - Args: - client_id: A unique identifier for the client. - - Returns: - A logging.Logger instance for the client. - """ - # Create a directory for client logs if it doesn't exist - log_dir = 'client_logs' - if not os.path.exists(log_dir): - os.makedirs(log_dir) - - # Configure logging for the client - log_filename = os.path.join(log_dir, f'flower_messages_{client_id}.log') - logger = logging.getLogger(f'client_{client_id}') - logger.setLevel(logging.INFO) - if not logger.handlers: - # Add a file handler if it doesn't already exist - file_handler = logging.FileHandler(log_filename) - formatter = logging.Formatter('%(asctime)s - %(message)s') - file_handler.setFormatter(formatter) - logger.addHandler(file_handler) - return logger - -def log_flower_message(flower_message, message_type, client_id): - """ - Log a Flower message or response with deserialized content for a specific client. - - Args: - flower_message: The Flower message or response to be logged. - message_type: A string indicating the type of message ('sent' or 'received'). - client_id: A unique identifier for the client. - """ - # Deserialize the grpc_message_content - deserialized_content = deserialize_flower_message(flower_message) - - # Prepare the log entry - message_str = f"Flower message {message_type}:\n{flower_message}" - if deserialized_content is not None: - message_str += f"\nDeserialized content:\n{deserialized_content}" - else: - message_str += "\nDeserialization failed" - - # Add separator - message_str += f"\n{'=' * 40}\n" - - # Get the logger for the client and log the message with deserialized content and separator - logger = get_logger(client_id) - logger.info(message_str) - -# This function can be used to log messages from other parts of your application -def log_message(flower_message, message_type, client_id): - """ - Public function to log a Flower message or response for a specific client. - - Args: - flower_message: The Flower message or response to be logged. - message_type: A string indicating the type of message ('sent' or 'received'). - client_id: A unique identifier for the client. 
- """ - log_flower_message(flower_message, message_type, client_id) \ No newline at end of file diff --git a/openfl-workspace/flower-app-pytorch/openfl_client_with_local_grpc_server.py b/openfl-workspace/flower-app-pytorch/openfl_client_with_local_grpc_server.py deleted file mode 100644 index bfb8931c04..0000000000 --- a/openfl-workspace/flower-app-pytorch/openfl_client_with_local_grpc_server.py +++ /dev/null @@ -1,53 +0,0 @@ -import grpc -from concurrent import futures -from flwr.proto import grpcadapter_pb2_grpc -from openfl.protocols import aggregator_pb2, aggregator_pb2_grpc -# from openfl.proto import openfl_pb2_grpc, openfl_pb2 -from message_conversion import flower_to_openfl_message, openfl_to_flower_message - -class OpenFLClient: - def __init__(self, openfl_server_address): - self.channel = grpc.insecure_channel(openfl_server_address) - self.stub = aggregator_pb2_grpc.AggregatorStub(self.channel) - - def send_message_to_server(self, flower_message): - # Convert Flower message to OpenFL message - openfl_message = flower_to_openfl_message(flower_message, - sender="Flower Client", - receiver="OpenFL Client") - # Send the OpenFL message to the OpenFL server and get a response - openfl_response = self.stub.PelicanDrop(openfl_message) - # Convert the OpenFL response back to a Flower message - flower_response = openfl_to_flower_message(openfl_response) - return flower_response - - -class OpenFLLocalGRPCServer(grpcadapter_pb2_grpc.GrpcAdapterServicer): - def __init__(self, openfl_client): - self.openfl_client = openfl_client - - def SendReceive(self, request, context): - # Received a message from the Flower client - print(f"Received message from Flower client, sending to OpenFL server: {request.grpc_message_name}") - # Forward the incoming message to the OpenFL client - flower_response = self.openfl_client.send_message_to_server(request) - # Sending the response back to the Flower client - print(f"Received message from OpenFL server, sending response back to Flower client: {flower_response.grpc_message_name}") - return flower_response - -def serve(openfl_server_address, local_server_port): - # Start the OpenFL client - openfl_client = OpenFLClient(openfl_server_address) - - # Start the local gRPC server - server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) - grpcadapter_pb2_grpc.add_GrpcAdapterServicer_to_server(OpenFLLocalGRPCServer(openfl_client), server) - server.add_insecure_port(f'[::]:{local_server_port}') - server.start() - print(f"OpenFL local gRPC server started, listening on port {local_server_port}.") - server.wait_for_termination() - -if __name__ == '__main__': - openfl_server_address = '127.0.0.1:9095' # The OpenFL server's IP address and port - local_server_port = '9092' # The port the local gRPC server will listen on - serve(openfl_server_address, local_server_port) \ No newline at end of file diff --git a/openfl-workspace/flower-app-pytorch/openfl_server_with_local_grpc_client.py b/openfl-workspace/flower-app-pytorch/openfl_server_with_local_grpc_client.py deleted file mode 100644 index 4ec3e95e7a..0000000000 --- a/openfl-workspace/flower-app-pytorch/openfl_server_with_local_grpc_client.py +++ /dev/null @@ -1,63 +0,0 @@ -import grpc -from concurrent import futures -from flwr.proto import grpcadapter_pb2_grpc -from openfl.protocols import aggregator_pb2, aggregator_pb2_grpc -from message_conversion import flower_to_openfl_message, openfl_to_flower_message - -# from message_logger import log_message - -class LocalGRPCClient: - def __init__(self, 
superlink_address): - self.superlink_channel = grpc.insecure_channel(superlink_address) - self.superlink_stub = grpcadapter_pb2_grpc.GrpcAdapterStub(self.superlink_channel) - - def send_receive(self, openfl_message, client_id): - # Convert OpenFL message to Flower message - flower_message = openfl_to_flower_message(openfl_message) - # log_message(flower_message, 'sent', client_id) - # Send the Flower message to the Flower server and get a response - flower_response = self.superlink_stub.SendReceive(flower_message) - # log_message(flower_message, 'received', client_id) - # Convert Flower response to OpenFL response - print(f"Received message from Flower server, sending response back to OpenFL client: {flower_response.grpc_message_name}") - openfl_response = flower_to_openfl_message(flower_response, sender='Flower Server', receiver='OpenFL Server') - return openfl_response - -class OpenFLServer(aggregator_pb2_grpc.AggregatorServicer): - def __init__(self, local_grpc_client): - self.local_grpc_client = local_grpc_client - - def PelicanDrop(self, request, context): - """ - Args: - request (aggregator_pb2.DropPod): The request - from the collaborator. - context (grpc.ServicerContext): The context of the request. - - Returns: - aggregator_pb2.DropPod: The response to the - request. - """ - client_id = context.peer() - # Forward the incoming OpenFL message to the local gRPC client - print(f"Received message from OpenFL client, sending message to Flower server")#: {request.message_type}") - openfl_response = self.local_grpc_client.send_receive(request, client_id) - # print(f"Received message from Flower server, sending response back to OpenFL client: {openfl_response.message_type}") - return openfl_response - -def serve(superlink_address, openfl_server_port): - # Start the local gRPC client - local_grpc_client = LocalGRPCClient(superlink_address) - - # Start the OpenFL server - server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) - aggregator_pb2_grpc.add_AggregatorServicer_to_server(OpenFLServer(local_grpc_client), server) - server.add_insecure_port(f'[::]:{openfl_server_port}') - server.start() - print(f"OpenFL server started with local gRPC client. Listening on port {openfl_server_port}.") - server.wait_for_termination() - -if __name__ == '__main__': - superlink_address = '127.0.0.1:9093' # The Flower superlink address - openfl_server_port = '9095' # The port the OpenFL server will listen on - serve(superlink_address, openfl_server_port) \ No newline at end of file diff --git a/openfl/transport/grpc/aggregator_client.py b/openfl/transport/grpc/aggregator_client.py index 7c189f06f7..474647fd8b 100644 --- a/openfl/transport/grpc/aggregator_client.py +++ b/openfl/transport/grpc/aggregator_client.py @@ -490,7 +490,6 @@ def send_local_task_results( @_atomic_connection @_resend_data_on_reconnection def send_message_to_server(self, flower_message, collaborator_name): - # TODO, how to add header information to the message? 
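
The hunk below removes the header TODO because `_set_header` already stamps the sender before conversion. For orientation, the full relay these patches build is: Flower SuperNode → collaborator-side `LocalGRPCServer` → `send_message_to_server` (the `PelicanDrop` RPC) → aggregator-side `LocalGRPCClient` → Flower superlink, with the Flower `MessageContainer` riding inside `DropPod.message.npbytes` the whole way. A self-contained sketch of that envelope round trip, assuming the `DropPod` shape these diffs introduce (one caveat worth flagging: the `isinstance(..., aggregator_pb2.DropPod())` checks in the conversion code pass an instance where `isinstance` expects the class itself, `aggregator_pb2.DropPod`):

```python
from flwr.proto import grpcadapter_pb2
from openfl.protocols import aggregator_pb2

def wrap(flower_msg, header):
    """Embed a serialized Flower MessageContainer in an OpenFL DropPod."""
    pod = aggregator_pb2.DropPod()
    pod.header.CopyFrom(header)  # singular protobuf message fields need CopyFrom
    payload = flower_msg.SerializeToString()
    pod.message.npbytes = payload
    pod.message.size = len(payload)
    return pod

def unwrap(pod):
    """Recover the Flower MessageContainer from a DropPod envelope."""
    flower_msg = grpcadapter_pb2.MessageContainer()
    flower_msg.ParseFromString(pod.message.npbytes)
    return flower_msg

# The round trip is lossless:
# unwrap(wrap(msg, header)).SerializeToString() == msg.SerializeToString()
```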
self._set_header(collaborator_name) openfl_message = flower_to_openfl_message(flower_message, header=self.header) diff --git a/openfl/transport/grpc/aggregator_server.py b/openfl/transport/grpc/aggregator_server.py index 5c95d18053..72912b562a 100644 --- a/openfl/transport/grpc/aggregator_server.py +++ b/openfl/transport/grpc/aggregator_server.py @@ -327,13 +327,11 @@ def PelicanDrop(self, request, context): if not self.fim: context.abort(StatusCode.UNIMPLEMENTED, "PelicanDrop is only available in framework interoperability mode.") - #TODO: This needs to be done when receiving from OpenFL client (not from local gRPC client, local gRPC should have its own verification) - #This seems like the right step for this verification since we don't send to local gRPC yet + #TODO: local gRPC should have its own verification when receiving and converting flower messages self.validate_collaborator(request, context) self.check_request(request) collaborator_name = request.header.sender - #TODO: Extract header information from client request (i.e. collaborator_name = request.header.sender) # Forward the incoming OpenFL message to the local gRPC client print(f"OpenFL Server: Received message from OpenFL client, sending message to Flower server") return self.local_grpc_client.send_receive(request, header=self.get_header(collaborator_name)) diff --git a/openfl/transport/grpc/fim/flower/local_grpc_client.py b/openfl/transport/grpc/fim/flower/local_grpc_client.py index 530be37746..4593158ddc 100644 --- a/openfl/transport/grpc/fim/flower/local_grpc_client.py +++ b/openfl/transport/grpc/fim/flower/local_grpc_client.py @@ -9,8 +9,6 @@ def __init__(self, superlink_address): def send_receive(self, openfl_message, header): # TODO: verification step for messages coming from Flower server - collaborator_name = openfl_message.header.sender - flower_message = openfl_to_flower_message(openfl_message) flower_response = self.superlink_stub.SendReceive(flower_message) # print(f"Received message from Flower server, sending response through OpenFL server back to OpenFL client: {flower_response.grpc_message_name}") From a40dc48f4f6c66fd94e247acb4be31f6793837da Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 1 Nov 2024 11:33:24 -0700 Subject: [PATCH 005/107] enable supernode process configuration to pull number of partitions and partition id from data.yaml Signed-off-by: kta-intel --- .../flower-app-pytorch/plan/cols.yaml | 2 +- .../flower-app-pytorch/plan/data.yaml | 2 + .../flower-app-pytorch/plan/plan.yaml | 20 ++- openfl/federated/data/loader_flower.py | 64 +++++++ openfl/federated/plan/plan.py | 7 +- openfl/federated/task/runner_flower.py | 6 +- openfl/interface/collaborator.py | 45 ++--- openfl/interface/plan.py | 156 +++++++----------- .../grpc/fim/flower/local_grpc_server.py | 7 - 9 files changed, 164 insertions(+), 145 deletions(-) create mode 100644 openfl-workspace/flower-app-pytorch/plan/data.yaml create mode 100644 openfl/federated/data/loader_flower.py diff --git a/openfl-workspace/flower-app-pytorch/plan/cols.yaml b/openfl-workspace/flower-app-pytorch/plan/cols.yaml index 2d91469ca4..024e743dcd 100644 --- a/openfl-workspace/flower-app-pytorch/plan/cols.yaml +++ b/openfl-workspace/flower-app-pytorch/plan/cols.yaml @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2024 Intel Corporation +# Copyright (C) 2024 Intel Corporation # Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you.
collaborators: diff --git a/openfl-workspace/flower-app-pytorch/plan/data.yaml b/openfl-workspace/flower-app-pytorch/plan/data.yaml new file mode 100644 index 0000000000..6bca42b213 --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/plan/data.yaml @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you. diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml index f71a4e219b..2ec011c5e6 100644 --- a/openfl-workspace/flower-app-pytorch/plan/plan.yaml +++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2024 Intel Corporation +# Copyright (C) 2024 Intel Corporation # Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you. aggregator : @@ -16,6 +16,18 @@ collaborator : settings : {} +data_loader : + defaults : plan/defaults/data_loader.yaml + template : openfl.federated.data.loader_flower.FlowerDataLoader + settings : + collaborator_count : 2 + +task_runner : + defaults : plan/defaults/task_runner.yaml + template : openfl.federated.task.runner_flower.FlowerTaskRunner + settings : + {} + network : defaults : plan/defaults/network.yaml settings : @@ -31,12 +43,6 @@ assigner : tasks : - start_client_adapter -task_runner : - defaults : plan/defaults/task_runner.yaml - template : openfl.federated.task.runner_flower.FlowerTaskRunner - settings : - {} - tasks : defaults : plan/defaults/tasks_flower.yaml diff --git a/openfl/federated/data/loader_flower.py b/openfl/federated/data/loader_flower.py new file mode 100644 index 0000000000..6cfb7764d7 --- /dev/null +++ b/openfl/federated/data/loader_flower.py @@ -0,0 +1,64 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""FlowerDataLoader module.""" + +from openfl.federated.data.loader import DataLoader + + +class FlowerDataLoader(DataLoader): + """Flower Dataloader + + This class extends the OpenFL DataLoader to provide functionality for + loading and partitioning data for a Flower workload. + + Attributes: + data_shard (int): The shard number of the dataset. + num_partitions (int): The number of partitions to divide the dataset into. + """ + + def __init__(self, data_path, collaborator_count, **kwargs): + """ + Initialize the FlowerDataLoader. + + Args: + data_path (str or int): The shard number of the dataset. + collaborator_count (int): The number of partitions to divide the dataset into. + **kwargs: Additional keyword arguments to pass to the parent DataLoader class. + + Raises: + ValueError: If collaborator_count is not provided or if data_path is not a number. + """ + if collaborator_count is None: + raise ValueError("collaborator_count must be set and cannot be None.") + + try: + partition_id = int(data_path) + except ValueError: + raise ValueError("data_path must be a number corresponding to the shard.") + + if partition_id >= collaborator_count: + raise ValueError("data_path is used as the partition_id and therefore cannot be greater than or equal to the collaborator count.") + + super().__init__(**kwargs) + self.partition_id = partition_id + self.num_partitions = collaborator_count + + def get_node_configs(self): + """ + Get the configuration for each node. + + This method returns the number of partitions and the data shard, + which can be used by each node to access the dataset. 
+ + Returns: + tuple: A tuple containing the number of partitions and the data shard. + """ + return self.num_partitions, self.partition_id + + def get_feature_shape(self): + """ + Override the parent method to return None. + Flower's own infrastructure will handle the feature shape. + """ + return None \ No newline at end of file diff --git a/openfl/federated/plan/plan.py b/openfl/federated/plan/plan.py index 3bb38d79c8..58d9f1d0e4 100644 --- a/openfl/federated/plan/plan.py +++ b/openfl/federated/plan/plan.py @@ -597,11 +597,8 @@ def get_collaborator( ) else: # TaskRunner subclassing API - if 'Flower' in self.config["task_runner"]["template"]: - defaults[SETTINGS]["task_runner"] = self.get_task_runner(None) - else: - data_loader = self.get_data_loader(collaborator_name) - defaults[SETTINGS]["task_runner"] = self.get_task_runner(data_loader) + data_loader = self.get_data_loader(collaborator_name) + defaults[SETTINGS]["task_runner"] = self.get_task_runner(data_loader) defaults[SETTINGS]["compression_pipeline"] = self.get_tensor_pipe() defaults[SETTINGS]["task_config"] = self.config.get("tasks", {}) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index f0a91567b7..d7fe79a566 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -16,6 +16,8 @@ def __init__(self, **kwargs): **kwargs: Additional parameters to pass to the functions. """ super().__init__(**kwargs) + self.num_partitions = self.data_loader.get_node_configs()[0] + self.partition_id = self.data_loader.get_node_configs()[1] def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): local_server_port = kwargs['local_server_port'] @@ -33,11 +35,11 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): # import pdb; pdb.set_trace() command = [ "flower-supernode", - kwargs.get('app_path', './app-pytorch'), + "./app-pytorch", "--insecure", "--grpc-adapter", "--superlink", f"127.0.0.1:{local_server_port}", - "--node-config", f"num-partitions={kwargs.get('num_partitions', 1)} partition-id={kwargs.get('partition_id', 0)}" + "--node-config", f"num-partitions={self.num_partitions} partition-id={self.partition_id}" ] # Start the subprocess supernode_process = subprocess.Popen(command, shell=False) diff --git a/openfl/interface/collaborator.py b/openfl/interface/collaborator.py index 5edb371008..d483f5b9fe 100644 --- a/openfl/interface/collaborator.py +++ b/openfl/interface/collaborator.py @@ -63,42 +63,27 @@ def collaborator(context): required=True, help="The certified common name of the collaborator", ) -@option( - "-fim", - "--framework_interoperability_mode", - required=False, - help="For interoperability with other FL frameworks. 
True/False [Default: True]", - default=False, -) -def start_(plan, collaborator_name, data_config, framework_interoperability_mode): +def start_(plan, collaborator_name, data_config): """Start a collaborator service.""" - if framework_interoperability_mode: - plan = Plan.parse( - plan_config_path=Path(plan).absolute(), - ) - logger.info("🧿 Starting a Collaborator Service.") - plan.get_collaborator(collaborator_name).run() + if plan and is_directory_traversal(plan): + echo("Federated learning plan path is out of the openfl workspace scope.") + sys.exit(1) + if data_config and is_directory_traversal(data_config): + echo("The data set/shard configuration file path is out of the openfl workspace scope.") + sys.exit(1) - else: - if plan and is_directory_traversal(plan): - echo("Federated learning plan path is out of the openfl workspace scope.") - sys.exit(1) - if data_config and is_directory_traversal(data_config): - echo("The data set/shard configuration file path is out of the openfl workspace scope.") - sys.exit(1) - - plan = Plan.parse( - plan_config_path=Path(plan).absolute(), - data_config_path=Path(data_config).absolute(), - ) + plan = Plan.parse( + plan_config_path=Path(plan).absolute(), + data_config_path=Path(data_config).absolute(), + ) - # TODO: Need to restructure data loader config file loader + # TODO: Need to restructure data loader config file loader - echo(f"Data = {plan.cols_data_paths}") - logger.info("🧿 Starting a Collaborator Service.") + echo(f"Data = {plan.cols_data_paths}") + logger.info("🧿 Starting a Collaborator Service.") - plan.get_collaborator(collaborator_name).run() + plan.get_collaborator(collaborator_name).run() @collaborator.command(name="create") diff --git a/openfl/interface/plan.py b/openfl/interface/plan.py index e39e9065e3..783f3f7933 100644 --- a/openfl/interface/plan.py +++ b/openfl/interface/plan.py @@ -127,81 +127,51 @@ def initialize( gandlf_config (str): GaNDLF Configuration File Path. install_reqs (bool): Whether to install packages listed under 'requirements.txt'. 
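
Tying the pieces of this patch together: the `FlowerDataLoader` added above reuses each collaborator's `data.yaml` entry as the Flower partition id and `collaborator_count` as the partition total, and those two values are exactly what end up in the SuperNode's `--node-config`. A hedged sketch of that mapping (the helper and the example values are illustrative, not shipped defaults):

```python
def node_config_from_plan(data_path: str, collaborator_count: int) -> str:
    """Map plan settings onto Flower's node-config string."""
    partition_id = int(data_path)  # data_path doubles as the shard index
    if not 0 <= partition_id < collaborator_count:
        raise ValueError("partition id must lie in [0, collaborator_count)")
    return f"num-partitions={collaborator_count} partition-id={partition_id}"

print(node_config_from_plan("1", 2))  # -> "num-partitions=2 partition-id=1"
```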
""" - if framework_interoperability_mode: - plan_config = Path(plan_config).absolute() - cols_config = Path(cols_config).absolute() - - plan = Plan.parse( - plan_config_path=plan_config, - cols_config_path=cols_config, - ) - - plan_origin = Plan.parse( - plan_config_path=plan_config, - resolve=False, - ) - - if plan_origin.config["network"]["settings"]["agg_addr"] == "auto" or aggregator_address: - plan_origin.config["network"]["settings"]["agg_addr"] = aggregator_address or getfqdn_env() - - logger.warn( - f"Patching Aggregator Addr in Plan" - f" 🠆 {plan_origin.config['network']['settings']['agg_addr']}" - ) - - Plan.dump(plan_config, plan_origin.config) - - # Record that plan with this hash has been initialized - if "plans" not in context.obj: - context.obj["plans"] = [] - context.obj["plans"].append(f"{plan_config.stem}_{plan_origin.hash[:8]}") - logger.info(f"{context.obj['plans']}") - - else: - for p in [plan_config, cols_config, data_config]: - if is_directory_traversal(p): - echo(f"{p} is out of the openfl workspace scope.") - sys.exit(1) - - plan_config = Path(plan_config).absolute() - cols_config = Path(cols_config).absolute() - data_config = Path(data_config).absolute() - if gandlf_config is not None: - gandlf_config = Path(gandlf_config).absolute() - - if install_reqs: - requirements_filename = "requirements.txt" - requirements_path = Path(requirements_filename).absolute() - - if isfile(f"{str(requirements_path)}"): - check_call( - [ - sys.executable, - "-m", - "pip", - "install", - "-r", - f"{str(requirements_path)}", - ], - shell=False, - ) - echo(f"Successfully installed packages from {requirements_path}.") - - # Required to restart the process for newly installed packages to be recognized - args_restart = [arg for arg in sys.argv if not arg.startswith("--install_reqs")] - args_restart.append("--install_reqs=False") - os.execv(args_restart[0], args_restart) - else: - echo("No additional requirements for workspace defined. Skipping...") - - plan = Plan.parse( - plan_config_path=plan_config, - cols_config_path=cols_config, - data_config_path=data_config, - gandlf_config_path=gandlf_config, - ) - + for p in [plan_config, cols_config, data_config]: + if is_directory_traversal(p): + echo(f"{p} is out of the openfl workspace scope.") + sys.exit(1) + + plan_config = Path(plan_config).absolute() + cols_config = Path(cols_config).absolute() + data_config = Path(data_config).absolute() + if gandlf_config is not None: + gandlf_config = Path(gandlf_config).absolute() + + if install_reqs: + requirements_filename = "requirements.txt" + requirements_path = Path(requirements_filename).absolute() + + if isfile(f"{str(requirements_path)}"): + check_call( + [ + sys.executable, + "-m", + "pip", + "install", + "-r", + f"{str(requirements_path)}", + ], + shell=False, + ) + echo(f"Successfully installed packages from {requirements_path}.") + + # Required to restart the process for newly installed packages to be recognized + args_restart = [arg for arg in sys.argv if not arg.startswith("--install_reqs")] + args_restart.append("--install_reqs=False") + os.execv(args_restart[0], args_restart) + else: + echo("No additional requirements for workspace defined. 
Skipping...") + + plan = Plan.parse( + plan_config_path=plan_config, + cols_config_path=cols_config, + data_config_path=data_config, + gandlf_config_path=gandlf_config, + ) + + if not framework_interoperability_mode: init_state_path = plan.config["aggregator"]["settings"]["init_state_path"] # This is needed to bypass data being locally available @@ -235,30 +205,30 @@ def initialize( utils.dump_proto(model_proto=model_snap, fpath=init_state_path) - plan_origin = Plan.parse( - plan_config_path=plan_config, - gandlf_config_path=gandlf_config, - resolve=False, - ) + plan_origin = Plan.parse( + plan_config_path=plan_config, + gandlf_config_path=gandlf_config, + resolve=False, + ) - if plan_origin.config["network"]["settings"]["agg_addr"] == "auto" or aggregator_address: - plan_origin.config["network"]["settings"]["agg_addr"] = aggregator_address or getfqdn_env() + if plan_origin.config["network"]["settings"]["agg_addr"] == "auto" or aggregator_address: + plan_origin.config["network"]["settings"]["agg_addr"] = aggregator_address or getfqdn_env() - logger.warn( - f"Patching Aggregator Addr in Plan" - f" 🠆 {plan_origin.config['network']['settings']['agg_addr']}" - ) + logger.warn( + f"Patching Aggregator Addr in Plan" + f" 🠆 {plan_origin.config['network']['settings']['agg_addr']}" + ) - Plan.dump(plan_config, plan_origin.config) + Plan.dump(plan_config, plan_origin.config) - if gandlf_config is not None: - Plan.dump(plan_config, plan_origin.config) + if gandlf_config is not None: + Plan.dump(plan_config, plan_origin.config) - # Record that plan with this hash has been initialized - if "plans" not in context.obj: - context.obj["plans"] = [] - context.obj["plans"].append(f"{plan_config.stem}_{plan_origin.hash[:8]}") - logger.info(f"{context.obj['plans']}") + # Record that plan with this hash has been initialized + if "plans" not in context.obj: + context.obj["plans"] = [] + context.obj["plans"].append(f"{plan_config.stem}_{plan_origin.hash[:8]}") + logger.info(f"{context.obj['plans']}") # TODO: looks like Plan.method diff --git a/openfl/transport/grpc/fim/flower/local_grpc_server.py b/openfl/transport/grpc/fim/flower/local_grpc_server.py index cbb883898f..3d091197bc 100644 --- a/openfl/transport/grpc/fim/flower/local_grpc_server.py +++ b/openfl/transport/grpc/fim/flower/local_grpc_server.py @@ -1,11 +1,4 @@ -import grpc -from concurrent.futures import ThreadPoolExecutor from flwr.proto import grpcadapter_pb2_grpc -from multiprocessing import cpu_count -from openfl.protocols import aggregator_pb2, aggregator_pb2_grpc -from openfl.transport import AggregatorGRPCClient -from openfl.transport.grpc.fim.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message - class LocalGRPCServer(grpcadapter_pb2_grpc.GrpcAdapterServicer): def __init__(self, openfl_client, collaborator_name): From a40dc48f4f6c66fd94e247acb4be31f6793837da Mon Sep 17 00:00:00 2001 From: kta-intel Date: Sat, 7 Dec 2024 07:09:24 -0800 Subject: [PATCH 006/107] update for flwr-nightly Signed-off-by: kta-intel --- .../flower-app-pytorch/app-pytorch/README.md | 11 +++++++++++ .../app-pytorch/app_pytorch/client_app.py | 2 +- .../app-pytorch/app_pytorch/server_app.py | 4 ++-- .../flower-app-pytorch/app-pytorch/pyproject.toml | 9 +++++---- 4 files changed, 19 insertions(+), 7 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/README.md b/openfl-workspace/flower-app-pytorch/app-pytorch/README.md index 9564565f97..998f39e69d 100644 --- a/openfl-workspace/flower-app-pytorch/app-pytorch/README.md +++ 
b/openfl-workspace/flower-app-pytorch/app-pytorch/README.md @@ -14,7 +14,18 @@ In the `app-pytorch` directory, use `flwr run` to run a local simulation: flwr run . ``` +Refer to the [How to Run Simulations](https://flower.ai/docs/framework/how-to-run-simulations.html) guide in the documentation for advice on how to optimize your simulations. + ## Run with the Deployment Engine > \[!NOTE\] > An update to this example will show how to run this Flower application with the Deployment Engine and TLS certificates, or with Docker. + +## Resources + +- Flower website: [flower.ai](https://flower.ai/) +- Check the documentation: [flower.ai/docs](https://flower.ai/docs/) +- Give Flower a ⭐️ on GitHub: [GitHub](https://github.com/adap/flower) +- Join the Flower community! + - [Flower Slack](https://flower.ai/join-slack/) + - [Flower Discuss](https://discuss.flower.ai/) diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/client_app.py b/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/client_app.py index d802d900c2..ea0412c948 100644 --- a/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/client_app.py +++ b/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/client_app.py @@ -43,7 +43,7 @@ def client_fn(context: Context): partition_id = context.node_config["partition-id"] num_partitions = context.node_config["num-partitions"] trainloader, valloader = load_data(partition_id, num_partitions) - local_epochs = 1 + local_epochs = context.run_config["local-epochs"] # Return Client instance return FlowerClient(net, trainloader, valloader, local_epochs).to_client() diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/server_app.py b/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/server_app.py index abc3ec6d78..6e42b57beb 100644 --- a/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/server_app.py +++ b/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/server_app.py @@ -8,8 +8,8 @@ def server_fn(context: Context): # Read from config - num_rounds = 3 #context.run_config["num-server-rounds"] - fraction_fit = 0.5 #context.run_config["fraction-fit"] + num_rounds = context.run_config["num-server-rounds"] + fraction_fit = context.run_config["fraction-fit"] # Initialize model parameters ndarrays = get_weights(Net()) diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/pyproject.toml b/openfl-workspace/flower-app-pytorch/app-pytorch/pyproject.toml index 0ca8900b90..0327a055fc 100644 --- a/openfl-workspace/flower-app-pytorch/app-pytorch/pyproject.toml +++ b/openfl-workspace/flower-app-pytorch/app-pytorch/pyproject.toml @@ -8,7 +8,7 @@ version = "1.0.0" description = "" license = "Apache-2.0" dependencies = [ - "flwr-nightly==1.13.0.dev20241016", + "flwr-nightly==1.14.0.dev20241205", "flwr-datasets[vision]>=0.3.0", "torch==2.2.1", "torchvision==0.17.1", @@ -30,7 +30,8 @@ fraction-fit = 0.5 local-epochs = 1 [tool.flwr.federations] -default = "local-simulation" +default = "local-poc" -[tool.flwr.federations.local-simulation] -options.num-supernodes = 2 +[tool.flwr.federations.local-poc] +address = "127.0.0.1:9093" +insecure = true From 3a0e01eb67fff4e4bf4e95a05a89b81a5266d8fb Mon Sep 17 00:00:00 2001 From: kta-intel Date: Sun, 8 Dec 2024 07:11:32 -0800 Subject: [PATCH 007/107] update to flwr 1.14 Signed-off-by: kta-intel --- .../app-pytorch/pyproject.toml | 2 +- .../workspace/plan/defaults/tasks_flower.yaml | 2 +- openfl/federated/task/runner_flower.py | 11 ++++++--- openfl/interface/plan.py | 10 ++++---- 
openfl/transport/grpc/aggregator_server.py | 24 ++++++++++++------- 5 files changed, 30 insertions(+), 19 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/pyproject.toml b/openfl-workspace/flower-app-pytorch/app-pytorch/pyproject.toml index 0327a055fc..f8e4c7c708 100644 --- a/openfl-workspace/flower-app-pytorch/app-pytorch/pyproject.toml +++ b/openfl-workspace/flower-app-pytorch/app-pytorch/pyproject.toml @@ -33,5 +33,5 @@ local-epochs = 1 default = "local-poc" [tool.flwr.federations.local-poc] -address = "127.0.0.1:9093" +address = "127.0.0.1:9093" # this connects to flower --exec-api-address insecure = true diff --git a/openfl-workspace/workspace/plan/defaults/tasks_flower.yaml b/openfl-workspace/workspace/plan/defaults/tasks_flower.yaml index af64d0899c..3bc37b3060 100644 --- a/openfl-workspace/workspace/plan/defaults/tasks_flower.yaml +++ b/openfl-workspace/workspace/plan/defaults/tasks_flower.yaml @@ -1,4 +1,4 @@ start_client_adapter: function : start_client_adapter kwargs : - local_server_port : 9092 + local_server_port : 9090 # local grpc server diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index d7fe79a566..96bd5dc49d 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -32,13 +32,18 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): print(f"OpenFL local gRPC server started, listening on port {local_server_port}.") # Start the Flower supernode in a subprocess - # import pdb; pdb.set_trace() command = [ "flower-supernode", - "./app-pytorch", "--insecure", "--grpc-adapter", - "--superlink", f"127.0.0.1:{local_server_port}", + "--superlink", f"127.0.0.1:{local_server_port}", # This should connect to local gRPC server + # TODO: you must specify separate client ports when running multiple super nodes + # on a single machine (i.e. a local poc). We need to add ability to automatically + # set separate ports for each client if it is set as a local poc, otherwise it can be + # whatever is automatically set by the system. Or we can add option to set port manually + # or let it be automatically set + # TODO: temporarilty add client port to a collaborator unique yaml (i.e. data) + "--clientappio-api-address", f"127.0.0.1:{self.client_port}", "--node-config", f"num-partitions={self.num_partitions} partition-id={self.partition_id}" ] # Start the subprocess diff --git a/openfl/interface/plan.py b/openfl/interface/plan.py index e2a3503ea7..43ac31f86e 100644 --- a/openfl/interface/plan.py +++ b/openfl/interface/plan.py @@ -192,11 +192,11 @@ def initialize( **task_runner.tensor_dict_split_fn_kwargs, ) - logger.warning( - f"Following parameters omitted from global initial model, " - f"local initialization will determine" - f" values: {list(holdout_params.keys())}" - ) + logger.warning( + f"Following parameters omitted from global initial model, " + f"local initialization will determine" + f" values: {list(holdout_params.keys())}" + ) model_snap = utils.construct_model_proto( tensor_dict=tensor_dict, round_number=0, tensor_pipe=tensor_pipe diff --git a/openfl/transport/grpc/aggregator_server.py b/openfl/transport/grpc/aggregator_server.py index 21df684b07..7ac68158ac 100644 --- a/openfl/transport/grpc/aggregator_server.py +++ b/openfl/transport/grpc/aggregator_server.py @@ -88,7 +88,8 @@ def __init__( self.fim = fim if self.fim: - superlink_address = '127.0.0.1:9093' #kwargs.get("superlink_address") + # TODO Let user specify? 
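
To keep the loopback addresses in the hunks above and below straight: with these patches, 9091 is the superlink's ServerAppIO API, 9092 its Fleet API in `grpc-adapter` mode (the address the aggregator's `LocalGRPCClient` dials), and 9093 its Exec API (the `address` in `pyproject.toml`'s `local-poc` federation, where `flwr run` submits). A sketch of the subprocess lifecycle `serve()` takes on, treating these ports as the patch's assumptions rather than fixed OpenFL defaults:

```python
import subprocess

def start_superlink():
    """Launch the Flower superlink and submit the app, as serve() does."""
    superlink = subprocess.Popen([
        "flower-superlink", "--insecure",
        "--fleet-api-type", "grpc-adapter",
        "--serverappio-api-address", "127.0.0.1:9091",
        "--fleet-api-address", "127.0.0.1:9092",   # LocalGRPCClient connects here
        "--exec-api-address", "127.0.0.1:9093",    # `flwr run` submits here
    ])
    run = subprocess.Popen(["flwr", "run", "./app-pytorch", "local-poc"])
    return superlink, run

def stop(procs):
    for p in procs:
        p.terminate()
    for p in procs:
        p.wait()  # reap the children before the aggregator process exits
```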
+ superlink_address = '127.0.0.1:9092' # NTS: This is the address that the Flower server will be listening on self.local_grpc_client = LocalGRPCClient(superlink_address) # Initialize the local gRPC client for Flower else: self.local_grpc_client = None @@ -395,16 +396,21 @@ def serve(self): "flower-superlink", "--insecure", "--fleet-api-type", "grpc-adapter", - "--fleet-api-address", "127.0.0.1:9093", - "--driver-api-address", "127.0.0.1:9091" + # "--fleet-api-address", "127.0.0.1:9093", + # "--driver-api-address", "127.0.0.1:9091", + # TODO, double check the addresses to make sure they are + # interacting properly + "--serverappio-api-address", "127.0.0.1:9091", # NTS: ? + "--fleet-api-address", "127.0.0.1:9092", # NTS: local gRPC client will connect here + "--exec-api-address", "127.0.0.1:9093", # NTS: port for server-app toml ], shell=False) # Start the Flower server app in a subprocess - server_app_process = subprocess.Popen([ - "flower-server-app", + flwr_run_process = subprocess.Popen([ + "flwr", + "run", "./app-pytorch", - "--insecure", - "--superlink", "127.0.0.1:9091" + "local-poc", #TODO: let model owner specify this ], shell=False) self.get_server() @@ -422,7 +428,7 @@ def serve(self): if getattr(self, 'fim', False): superlink_process.terminate() - server_app_process.terminate() + flwr_run_process.terminate() superlink_process.wait() - server_app_process.wait() + flwr_run_process.wait() From 49ef2eb3dc978f1af8f2c523325a881754b9753a Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 16 Dec 2024 07:26:06 -0800 Subject: [PATCH 008/107] enable runner to automatically set different client ports Signed-off-by: kta-intel --- openfl/federated/task/runner_flower.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 96bd5dc49d..502b3a2db6 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -18,9 +18,16 @@ def __init__(self, **kwargs): super().__init__(**kwargs) self.num_partitions = self.data_loader.get_node_configs()[0] self.partition_id = self.data_loader.get_node_configs()[1] + + # Define a base port number + base_port = 5000 + + # Calculate the client port by adding the partition ID to the base port + self.client_port = base_port + self.partition_id def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): local_server_port = kwargs['local_server_port'] + # local_server_port = 9092 # Start the local gRPC server server = grpc.server(ThreadPoolExecutor(max_workers=cpu_count())) @@ -42,7 +49,7 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): # set separate ports for each client if it is set as a local poc, otherwise it can be # whatever is automatically set by the system. Or we can add option to set port manually # or let it be automatically set - # TODO: temporarilty add client port to a collaborator unique yaml (i.e. data) + # TODO: temporarily add client port to a collaborator unique yaml (i.e. 
data) "--clientappio-api-address", f"127.0.0.1:{self.client_port}", "--node-config", f"num-partitions={self.num_partitions} partition-id={self.partition_id}" ] From f529fe2e29bbf11842b8b7507d989b455ca72dbc Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 16 Dec 2024 14:29:55 -0800 Subject: [PATCH 009/107] add queue-based processing to avoid communication cancellation through atomic connection wrapper Signed-off-by: kta-intel --- openfl/federated/task/runner_flower.py | 25 +++++++-- openfl/transport/grpc/aggregator_client.py | 14 ++++- openfl/transport/grpc/aggregator_server.py | 21 ++----- .../grpc/fim/flower/local_grpc_client.py | 25 ++++++++- .../grpc/fim/flower/local_grpc_server.py | 56 +++++++++++++++++-- 5 files changed, 110 insertions(+), 31 deletions(-) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 502b3a2db6..5990844c15 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -9,8 +9,14 @@ class FlowerTaskRunner(TaskRunner): + """ + FlowerTaskRunner is a task runner that executes Flower SuperNode + to initialize the experiment from the client side + """ + def __init__(self, **kwargs): - """Initializes the FlowerTaskRunner object. + """ + Initializes. Args: **kwargs: Additional parameters to pass to the functions. @@ -24,10 +30,18 @@ def __init__(self, **kwargs): # Calculate the client port by adding the partition ID to the base port self.client_port = base_port + self.partition_id - + def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): + """ + Starts the local gRPC server and the Flower SuperNode. + + Args: + openfl_client: The OpenFL client instance used to communicate with the OpenFL server. + collaborator_name: The name of the collaborator. + **kwargs: Additional parameters, including 'local_server_port'. + """ local_server_port = kwargs['local_server_port'] - # local_server_port = 9092 + # local_server_port = 9092 # note [kta-intel]: a direct connection to flower superlink # Start the local gRPC server server = grpc.server(ThreadPoolExecutor(max_workers=cpu_count())) @@ -38,18 +52,17 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): server.start() print(f"OpenFL local gRPC server started, listening on port {local_server_port}.") - # Start the Flower supernode in a subprocess + # Start the Flower SuperNode in a subprocess command = [ "flower-supernode", "--insecure", "--grpc-adapter", - "--superlink", f"127.0.0.1:{local_server_port}", # This should connect to local gRPC server + "--superlink", f"127.0.0.1:{local_server_port}", # note [kta-intel]: this connects to local gRPC server # TODO: you must specify separate client ports when running multiple super nodes # on a single machine (i.e. a local poc). We need to add ability to automatically # set separate ports for each client if it is set as a local poc, otherwise it can be # whatever is automatically set by the system. Or we can add option to set port manually # or let it be automatically set - # TODO: temporarily add client port to a collaborator unique yaml (i.e. 
data)
             "--clientappio-api-address", f"127.0.0.1:{self.client_port}",
             "--node-config", f"num-partitions={self.num_partitions} partition-id={self.partition_id}"
         ]
         # Start the subprocess
diff --git a/openfl/transport/grpc/aggregator_client.py b/openfl/transport/grpc/aggregator_client.py
index 8b30924ffe..7c753c971a 100644
--- a/openfl/transport/grpc/aggregator_client.py
+++ b/openfl/transport/grpc/aggregator_client.py
@@ -136,6 +136,8 @@ def intercept_stream_unary(self, continuation, client_call_details, request_iter
 
 
 def _atomic_connection(func):
+    # TODO: Need to investigate how to handle atomic connection when
+    # two requests are sent in very quick succession
     def wrapper(self, *args, **kwargs):
         self.reconnect()
         response = func(self, *args, **kwargs)
@@ -490,14 +492,22 @@
 
     @_atomic_connection
     @_resend_data_on_reconnection
     def send_message_to_server(self, flower_message, collaborator_name):
+        """
+        Sends a message from the Flower SuperNode to the OpenFL server and returns the response.
+
+        Args:
+            flower_message: The message from the Flower client to be sent to the OpenFL server.
+            collaborator_name: The name of the collaborator.
+
+        Returns:
+            The response from the OpenFL server, converted back to a Flower message.
+        """
         self._set_header(collaborator_name)
         openfl_message = flower_to_openfl_message(flower_message, header=self.header)
         openfl_response = self.stub.PelicanDrop(openfl_message)
-        # Validate openFL response
         self.validate_response(openfl_response, collaborator_name)
         flower_response = openfl_to_flower_message(openfl_response)
-        # Validate flower response (deserialize message?)
         return flower_response
 
     def _get_trained_model(self, experiment_name, model_type):
diff --git a/openfl/transport/grpc/aggregator_server.py b/openfl/transport/grpc/aggregator_server.py
index 7ac68158ac..fc8231c862 100644
--- a/openfl/transport/grpc/aggregator_server.py
+++ b/openfl/transport/grpc/aggregator_server.py
@@ -88,8 +88,9 @@ def __init__(
 
         self.fim = fim
         if self.fim:
-            # TODO Let user specify?
-            superlink_address = '127.0.0.1:9092' # NTS: This is the address that the Flower server will be listening on
+            # TODO: Users should have the option to specify this address or have it default
+            # note [kta-intel]: This is the address that the Flower server will be listening on
+            superlink_address = '127.0.0.1:9092'
             self.local_grpc_client = LocalGRPCClient(superlink_address) # Initialize the local gRPC client for Flower
         else:
             self.local_grpc_client = None
@@ -195,9 +196,6 @@ def GetTasks(self, request, context):  # NOQA:N802
 
         Returns:
             aggregator_pb2.GetTasksResponse: The response to the request.
         """
-        # if self.fim:
-        #     context.abort(StatusCode.UNIMPLEMENTED, "This method is not available in framework interopability mode.")
-
         self.validate_collaborator(request, context)
         self.check_request(request)
         collaborator_name = request.header.sender
@@ -333,13 +331,11 @@ def PelicanDrop(self, request, context):
         if not self.fim:
             context.abort(StatusCode.UNIMPLEMENTED, "PelicanDrop is only available in framework interopability mode.")
 
-        #TODO: local gRPC should have it's own verification when receiving and converting flower messages
         self.validate_collaborator(request, context)
         self.check_request(request)
         collaborator_name = request.header.sender
 
         # Forward the incoming OpenFL message to the local gRPC client
-        print(f"OpenFL Server: Received message from OpenFL client, sending message to Flower server")
         return self.local_grpc_client.send_receive(request, header=self.get_header(collaborator_name))
 
     def get_server(self):
@@ -352,7 +348,6 @@ def get_server(self):
         Returns:
             grpc.Server: The gRPC server.
         """
-        # TODO: Need to launch superlink and flower server app somewhere
         self.server = server(ThreadPoolExecutor(max_workers=cpu_count()), options=channel_options)
 
         aggregator_pb2_grpc.add_AggregatorServicer_to_server(self, self.server)
@@ -396,13 +391,9 @@ def serve(self):
                 "flower-superlink",
                 "--insecure",
                 "--fleet-api-type", "grpc-adapter",
-                # "--fleet-api-address", "127.0.0.1:9093",
-                # "--driver-api-address", "127.0.0.1:9091",
-                # TODO, double check the addresses to make sure they are
-                # interacting properly
-                "--serverappio-api-address", "127.0.0.1:9091", # NTS: ?
-                "--fleet-api-address", "127.0.0.1:9092", # NTS: local gRPC client will connect here
-                "--exec-api-address", "127.0.0.1:9093", # NTS: port for server-app toml
+                "--serverappio-api-address", "127.0.0.1:9091",
+                "--fleet-api-address", "127.0.0.1:9092", # note [kta-intel]: local gRPC client will connect here
+                "--exec-api-address", "127.0.0.1:9093", # note [kta-intel]: port for server-app toml
             ], shell=False)
 
         # Start the Flower server app in a subprocess
diff --git a/openfl/transport/grpc/fim/flower/local_grpc_client.py b/openfl/transport/grpc/fim/flower/local_grpc_client.py
index 4593158ddc..60c71ff584 100644
--- a/openfl/transport/grpc/fim/flower/local_grpc_client.py
+++ b/openfl/transport/grpc/fim/flower/local_grpc_client.py
@@ -3,14 +3,35 @@
 from openfl.transport.grpc.fim.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message
 
 class LocalGRPCClient:
+    """
+    LocalGRPCClient facilitates communication between the Flower SuperLink
+    and the OpenFL Server. It converts messages between OpenFL and Flower formats
+    and handles the send-receive communication with the Flower SuperLink using gRPC.
+    """
     def __init__(self, superlink_address):
+        """
+        Initialize.
+
+        Args:
+            superlink_address: The address the Flower SuperLink will listen on.
+        """
         self.superlink_channel = grpc.insecure_channel(superlink_address)
         self.superlink_stub = grpcadapter_pb2_grpc.GrpcAdapterStub(self.superlink_channel)
 
     def send_receive(self, openfl_message, header):
-        # TODO: verification step for messages coming from Flower server
+        """
+        Sends a message to the Flower SuperLink and receives the response.
+
+        Args:
+            openfl_message: the converted Flower SuperNode request sent by the OpenFL server.
+            header: OpenFL header information to be included in the message.
+
+        Returns:
+            The response from the Flower SuperLink, converted back to OpenFL format.
+ """ + # TODO: Add verification steps for messages coming from OpenFL transport flower_message = openfl_to_flower_message(openfl_message) flower_response = self.superlink_stub.SendReceive(flower_message) - # print(f"Received message from Flower server, sending response through OpenFL server back to OpenFL client: {flower_response.grpc_message_name}") openfl_response = flower_to_openfl_message(flower_response, header=header) + # TODO: Add verification steps for messages coming from Flower server return openfl_response diff --git a/openfl/transport/grpc/fim/flower/local_grpc_server.py b/openfl/transport/grpc/fim/flower/local_grpc_server.py index 3d091197bc..bb52ec17ad 100644 --- a/openfl/transport/grpc/fim/flower/local_grpc_server.py +++ b/openfl/transport/grpc/fim/flower/local_grpc_server.py @@ -1,14 +1,58 @@ +import threading +import queue from flwr.proto import grpcadapter_pb2_grpc class LocalGRPCServer(grpcadapter_pb2_grpc.GrpcAdapterServicer): + """ + LocalGRPCServer is a gRPC server that handles requests from the Flower SuperNode + and forwards them to the OpenFL Client. It uses a queue-based system to + ensure that requests are processed sequentially, preventing concurrent + request handling issues. + """ + def __init__(self, openfl_client, collaborator_name): + """ + Initialize. + + Args: + openfl_client: An instance of the OpenFL Client. + collaborator_name: The name of the collaborator. + """ self.openfl_client = openfl_client self.collaborator_name = collaborator_name + self.request_queue = queue.Queue() + self.processing_thread = threading.Thread(target=self.process_queue) + self.processing_thread.daemon = True + self.processing_thread.start() def SendReceive(self, request, context): - # print(f"Received message from Flower client, sending through OpenFL client to OpenFL server: {request.grpc_message_name}") - # Forward the incoming message to the OpenFL client - flower_response = self.openfl_client.send_message_to_server(request, self.collaborator_name) - # Sending the response back to the Flower client - # print(f"Received message from OpenFL server, sending response through OpenFL client back to Flower client: {flower_response.grpc_message_name}") - return flower_response \ No newline at end of file + """ + Handles incoming gRPC requests by putting them into the request queue + and waiting for the response. + + Args: + request: The incoming gRPC request. + context: The gRPC context. + + Returns: + The response from the OpenFL server. + """ + # TODO: Add verification steps for messages coming from Flower clients + response_queue = queue.Queue() + self.request_queue.put((request, response_queue)) + # TODO: Add verification steps for messages coming from OpenFL transport + return response_queue.get() + + def process_queue(self): + """ + Continuously processes requests from the request queue. Each request is + sent to the OpenFL server, and the response is put into the corresponding + response queue. 
+ """ + while True: + request, response_queue = self.request_queue.get() + # Send request to the OpenFL server + flower_response = self.openfl_client.send_message_to_server(request, self.collaborator_name) + # Send response to Flower client + response_queue.put(flower_response) + self.request_queue.task_done() \ No newline at end of file From ffe7b47d60c2c4cd9781b0db5067397e7fd5544c Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 16 Dec 2024 14:31:57 -0800 Subject: [PATCH 010/107] add todo Signed-off-by: kta-intel --- openfl/component/collaborator/collaborator.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/openfl/component/collaborator/collaborator.py b/openfl/component/collaborator/collaborator.py index 16814e0237..ca64b7b349 100644 --- a/openfl/component/collaborator/collaborator.py +++ b/openfl/component/collaborator/collaborator.py @@ -163,9 +163,6 @@ def run(self): """Run the collaborator.""" memory_details = [] while True: - # if self.fim: - # self.start_flower_client_connection() - # else: tasks, round_number, sleep_time, time_to_quit = self.get_tasks() if time_to_quit: break @@ -254,6 +251,7 @@ def do_task(self, task, round_number): func_name = self.task_config[task_name]["function"] kwargs = self.task_config[task_name]["kwargs"] if func_name=="start_client_adapter": + # TODO Need a major elegant and robust way to implement this if hasattr(self.task_runner, func_name): method = getattr(self.task_runner, func_name) if callable(method): From a7ebd91d7c96eb1af6b635b3b9ba7b6941ff0f52 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Tue, 17 Dec 2024 13:48:35 -0800 Subject: [PATCH 011/107] add FLEX component Signed-off-by: kta-intel --- .../flower-app-pytorch/plan/plan.yaml | 11 ++++ .../workspace/plan/defaults/fl_exchange.yaml | 2 + openfl/component/__init__.py | 3 + openfl/component/aggregator/aggregator.py | 22 +++++++ openfl/component/interoperability/__init__.py | 5 ++ openfl/component/interoperability/flex.py | 42 ++++++++++++++ .../component/interoperability/flex_flower.py | 54 +++++++++++++++++ openfl/federated/plan/plan.py | 17 ++++++ openfl/transport/grpc/aggregator_server.py | 58 +++++++++---------- 9 files changed, 182 insertions(+), 32 deletions(-) create mode 100644 openfl-workspace/workspace/plan/defaults/fl_exchange.yaml create mode 100644 openfl/component/interoperability/__init__.py create mode 100644 openfl/component/interoperability/flex.py create mode 100644 openfl/component/interoperability/flex_flower.py diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml index 2ec011c5e6..520214dee9 100644 --- a/openfl-workspace/flower-app-pytorch/plan/plan.yaml +++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml @@ -10,6 +10,17 @@ aggregator : last_state_path : null rounds_to_train : 1 +flex : + defaults : plan/defaults/flex.yaml + template : openfl.component.FLEXFlower + enable : True + settings : + superlink_params: + insecure : True + serverappio-api-address : 127.0.0.1:9091 + fleet-api-address : 127.0.0.1:9092 # note [kta-intel]: local gRPC client will connect here + exec-api-address : 127.0.0.1:9093 # note [kta-intel]: port for server-app toml + collaborator : defaults : plan/defaults/collaborator.yaml template : openfl.component.Collaborator diff --git a/openfl-workspace/workspace/plan/defaults/fl_exchange.yaml b/openfl-workspace/workspace/plan/defaults/fl_exchange.yaml new file mode 100644 index 0000000000..40f1385f43 --- /dev/null +++ 
b/openfl-workspace/workspace/plan/defaults/fl_exchange.yaml @@ -0,0 +1,2 @@ +template : openfl.component.FederatedLearningExchange +enable : True \ No newline at end of file diff --git a/openfl/component/__init__.py b/openfl/component/__init__.py index 3b787f87d0..233d795ca0 100644 --- a/openfl/component/__init__.py +++ b/openfl/component/__init__.py @@ -16,3 +16,6 @@ from openfl.component.straggler_handling_functions.straggler_handling_function import ( StragglerHandlingPolicy, ) +from openfl.component.interoperability.flex import FederatedLearningExchange +from openfl.component.interoperability.flex_flower import FLEXFlower + diff --git a/openfl/component/aggregator/aggregator.py b/openfl/component/aggregator/aggregator.py index 8f4a2a774a..87995762d9 100644 --- a/openfl/component/aggregator/aggregator.py +++ b/openfl/component/aggregator/aggregator.py @@ -17,6 +17,7 @@ from openfl.utilities import TaskResultKey, TensorKey, change_tags from openfl.utilities.logs import get_memory_usage, write_metric +import subprocess class Aggregator: """An Aggregator is the central node in federated learning. @@ -70,6 +71,7 @@ def __init__( best_state_path, last_state_path, assigner, + flex, use_delta_updates=True, straggler_handling_policy=None, rounds_to_train=256, @@ -196,6 +198,8 @@ def __init__( self.use_delta_updates = use_delta_updates + self.flex = flex + def _load_initial_tensors(self): """Load all of the tensors required to begin federated learning. @@ -685,6 +689,24 @@ def send_local_task_results( self._end_of_round_with_stragglers_check() + def is_flex_available(self): + return self.flex is not None + + def start_flex(self): + if not self.is_flex_available(): + raise RuntimeError("Federated Learning exchange as not been enabled.") + return self.flex.start() + + def stop_flex(self): + if not self.is_flex_available(): + raise RuntimeError("Federated Learning exchange as not been enabled.") + return self.flex.stop() + + def get_flex_address(self): + if not self.is_flex_available(): + raise RuntimeError("Federated Learning exchange as not been enabled.") + return self.flex.address + def _end_of_round_with_stragglers_check(self): """ Checks if the minimum required collaborators have reported their results, diff --git a/openfl/component/interoperability/__init__.py b/openfl/component/interoperability/__init__.py new file mode 100644 index 0000000000..0ab76ed6cf --- /dev/null +++ b/openfl/component/interoperability/__init__.py @@ -0,0 +1,5 @@ +# Copyright 2020-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openfl.component.interoperability.flex import FederatedLearningExchange +from openfl.component.interoperability.flex_flower import FLEXFlower diff --git a/openfl/component/interoperability/flex.py b/openfl/component/interoperability/flex.py new file mode 100644 index 0000000000..7c7ca768d0 --- /dev/null +++ b/openfl/component/interoperability/flex.py @@ -0,0 +1,42 @@ +import subprocess +import logging + +logger = logging.getLogger(__name__) + +class FederatedLearningExchange: + """ + A skeletal base class for managing a subprocess. + """ + + def __init__(self, command: list[str], **kwargs): + """ + Initialize FLEX with a command to run as a subprocess. + Args: + command (list[str]): The command to run the server as a subprocess. + """ + self._command = command + self._process = None + + def start(self): + """ + Start the subprocess with the provided command. 
+ """ + if self._process is None: + logger.info(f"[FLEX] Starting subprocess: {' '.join(self._command)}") + self._process = subprocess.Popen(self._command) + logger.info(f"[FLEX] Subprocess started with PID: {self._process.pid}") + else: + logger.info("[FLEX] Subprocess is already running.") + + def stop(self): + """ + Stop the subprocess if it is running. + """ + if self._process: + logger.info(f"[FLEX] Stopping subprocess with PID: {self._process.pid}...") + self._process.terminate() + self._process.wait() + self._process = None + logger.info("[FLEX] Subprocess stopped.") + else: + logger.info("[FLEX] No subprocess is currently running.") diff --git a/openfl/component/interoperability/flex_flower.py b/openfl/component/interoperability/flex_flower.py new file mode 100644 index 0000000000..def59b82a1 --- /dev/null +++ b/openfl/component/interoperability/flex_flower.py @@ -0,0 +1,54 @@ +from openfl.component.interoperability.flex import FederatedLearningExchange + +class FLEXFlower(FederatedLearningExchange): + """ + FLEX subclass for the Flower framework. + Responsible for generating the Flower server command. + """ + + def __init__(self, superlink_params: dict, **kwargs): + """ + Initialize FLEXFlower by building the server command from settings. + Args: + settings (dict): A dictionary of Flower server settings. + """ + self._settings = superlink_params + command = self._build_command(superlink_params) + super().__init__(command) + + def _build_command(self, superlink_params: dict) -> list[str]: + """ + Build the Flower server command based on settings. + Args: + settings (dict): Settings to configure the Flower server. + Returns: + list[str]: A list representing the Flower server start command. + """ + command = ["flower-superlink", "--fleet-api-type", "grpc-adapter"] + + if "insecure" in superlink_params: + if superlink_params["insecure"]: + command += ["--insecure"] + + if "serverappio-api-address" in superlink_params: + command += ["--serverappio-api-address", str(superlink_params["serverappio-api-address"])] + # flwr default: 0.0.0.0:9091 + + if "fleet-api-address" in superlink_params: + command += ["--fleet-api-address", str(superlink_params["fleet-api-address"])] + # flwr default: 0.0.0.0:9092 + + if "exec-api-address" in superlink_params: + command += ["--exec-api-address", str(superlink_params["exec-api-address"])] + # flwr default: 0.0.0.0:9093 + + return command + + @property + def address(self) -> str: + """ + Get the fleet API address from the settings. + Returns: + str: The fleet API address. 
+        """
+        return self._settings.get("fleet-api-address", "0.0.0.0:9092")
\ No newline at end of file
diff --git a/openfl/federated/plan/plan.py b/openfl/federated/plan/plan.py
index 41d7afc12f..793f41b7cc 100644
--- a/openfl/federated/plan/plan.py
+++ b/openfl/federated/plan/plan.py
@@ -266,6 +266,7 @@ def __init__(self):
         self.collaborator_ = None  # collaborator object
         self.aggregator_ = None  # aggregator object
         self.assigner_ = None  # assigner object
+        self.flex_ = None  # federated learning exchange object
 
         self.loader_ = None  # data loader object
         self.runner_ = None  # task runner object
@@ -339,6 +340,21 @@ def get_assigner(self):
             self.assigner_ = Plan.build(**defaults)
 
         return self.assigner_
+
+    def get_flex(self):
+        """Get federated learning exchange object."""
+        defaults = self.config.get(
+            "flex",
+            {TEMPLATE: "openfl.component.FederatedLearningExchange", SETTINGS: {}},
+        )
+
+        if self.flex_ is None and defaults.get('enable', False):
+            defaults = {k: v for k, v in defaults.items() if k != 'enable'}
+            self.flex_ = Plan.build(**defaults)
+        else:
+            self.flex_ = None
+
+        return self.flex_
 
     def get_tasks(self):
         """Get federation tasks."""
@@ -391,6 +407,7 @@ def get_aggregator(self, tensor_dict=None):
         defaults[SETTINGS]["assigner"] = self.get_assigner()
         defaults[SETTINGS]["compression_pipeline"] = self.get_tensor_pipe()
         defaults[SETTINGS]["straggler_handling_policy"] = self.get_straggler_handling_policy()
+        defaults[SETTINGS]["flex"] = self.get_flex()
 
         log_metric_callback = defaults[SETTINGS].get("log_metric_callback")
         if log_metric_callback:
diff --git a/openfl/transport/grpc/aggregator_server.py b/openfl/transport/grpc/aggregator_server.py
index fc8231c862..7d496b610c 100644
--- a/openfl/transport/grpc/aggregator_server.py
+++ b/openfl/transport/grpc/aggregator_server.py
@@ -53,7 +53,6 @@ def __init__(
         root_certificate=None,
         certificate=None,
         private_key=None,
-        fim=False, # Add a flag for Flower transport mode
         **kwargs,
     ):
         """
@@ -72,7 +71,7 @@ def __init__(
                 TLS connection.
             private_key (str): The path to the server's private key for the
                 TLS connection.
-            fim (bool): whether to use framework interopability mode
+            use_flex (bool): whether to use framework interoperability mode
             **kwargs: Additional keyword arguments.
         """
         print(f"{use_tls=}")
@@ -86,11 +85,12 @@ def __init__(
         self.server = None
         self.server_credentials = None
 
-        self.fim = fim
-        if self.fim:
+        self.use_flex = self.aggregator.is_flex_available()
+        if self.use_flex:
             # TODO: Users should have the option to specify this address or have it default
             # note [kta-intel]: This is the address that the Flower server will be listening on
-            superlink_address = '127.0.0.1:9092'
+            # superlink_address = '127.0.0.1:9092'
+            superlink_address = self.aggregator.get_flex_address()
             self.local_grpc_client = LocalGRPCClient(superlink_address) # Initialize the local gRPC client for Flower
         else:
             self.local_grpc_client = None
@@ -247,7 +247,7 @@ def GetAggregatedTensor(self, request, context):  # NOQA:N802
             aggregator_pb2.GetAggregatedTensorResponse: The response to the
                 request.
""" - if self.fim: + if self.use_flex: context.abort(StatusCode.UNIMPLEMENTED, "This method is not available in framework interopability mode.") try: @@ -328,8 +328,8 @@ def PelicanDrop(self, request, context): aggregator_pb2.PelicanDrop: The response to the request. """ - if not self.fim: - context.abort(StatusCode.UNIMPLEMENTED, "PelicanDrop is only available in framework interopability mode.") + if not self.use_flex: + context.abort(StatusCode.UNIMPLEMENTED, "PelicanDrop is only available in federated interopability mode.") self.validate_collaborator(request, context) self.check_request(request) @@ -385,24 +385,18 @@ def serve(self): jobs havebeen sent. """ - if getattr(self, 'fim', False): - # Start the Flower superlink in a subprocess - superlink_process = subprocess.Popen([ - "flower-superlink", - "--insecure", - "--fleet-api-type", "grpc-adapter", - "--serverappio-api-address", "127.0.0.1:9091", - "--fleet-api-address", "127.0.0.1:9092", # note [kta-intel]: local gRPC client will connect here - "--exec-api-address", "127.0.0.1:9093", # note [kta-intel]: port for server-app toml - ], shell=False) - - # Start the Flower server app in a subprocess - flwr_run_process = subprocess.Popen([ - "flwr", - "run", - "./app-pytorch", - "local-poc", #TODO: let model owner specify this - ], shell=False) + # if getattr(self, 'use_flex', False): + + # # Start the Flower server app in a subprocess + # flwr_run_process = subprocess.Popen([ + # "flwr", + # "run", + # "./app-pytorch", + # "local-poc", #TODO: let model owner specify this + # ], shell=False) + + if self.use_flex: + self.aggregator.start_flex() self.get_server() @@ -417,9 +411,9 @@ def serve(self): self.server.stop(0) - if getattr(self, 'fim', False): - superlink_process.terminate() - flwr_run_process.terminate() + if self.use_flex: + self.aggregator.stop_flex() - superlink_process.wait() - flwr_run_process.wait() + # if getattr(self, 'use_flex', False): + # flwr_run_process.terminate() + # flwr_run_process.wait() From ebf9552f3d4d9bd144127cd94cb92f8fcb0c7c70 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Tue, 17 Dec 2024 14:03:04 -0800 Subject: [PATCH 012/107] move local grpc server to FLEX Signed-off-by: kta-intel --- openfl/component/aggregator/aggregator.py | 4 +- .../component/interoperability/flex_flower.py | 43 ++++++++----------- openfl/transport/grpc/aggregator_server.py | 9 +--- 3 files changed, 23 insertions(+), 33 deletions(-) diff --git a/openfl/component/aggregator/aggregator.py b/openfl/component/aggregator/aggregator.py index 87995762d9..4876e55ac0 100644 --- a/openfl/component/aggregator/aggregator.py +++ b/openfl/component/aggregator/aggregator.py @@ -702,10 +702,10 @@ def stop_flex(self): raise RuntimeError("Federated Learning exchange as not been enabled.") return self.flex.stop() - def get_flex_address(self): + def get_local_grpc_client(self): if not self.is_flex_available(): raise RuntimeError("Federated Learning exchange as not been enabled.") - return self.flex.address + return self.flex.local_grpc_client def _end_of_round_with_stragglers_check(self): """ diff --git a/openfl/component/interoperability/flex_flower.py b/openfl/component/interoperability/flex_flower.py index def59b82a1..9ad1360ced 100644 --- a/openfl/component/interoperability/flex_flower.py +++ b/openfl/component/interoperability/flex_flower.py @@ -1,4 +1,5 @@ from openfl.component.interoperability.flex import FederatedLearningExchange +from openfl.transport.grpc.fim.flower.local_grpc_client import LocalGRPCClient class 
FLEXFlower(FederatedLearningExchange): """ @@ -8,47 +9,41 @@ class FLEXFlower(FederatedLearningExchange): def __init__(self, superlink_params: dict, **kwargs): """ - Initialize FLEXFlower by building the server command from settings. + Initialize FLEXFlower by building the server command from the superlink_params. Args: - settings (dict): A dictionary of Flower server settings. + superlink_params (dict): A dictionary of Flower server settings. """ - self._settings = superlink_params - command = self._build_command(superlink_params) + self.superlink_params = superlink_params + command = self._build_command() super().__init__(command) + + flex_address = self.superlink_params.get("fleet-api-address", "0.0.0.0:9092") + self.local_grpc_client = LocalGRPCClient(flex_address) - def _build_command(self, superlink_params: dict) -> list[str]: + def _build_command(self) -> list[str]: """ Build the Flower server command based on settings. Args: - settings (dict): Settings to configure the Flower server. + superlink_params (dict): Settings to configure the Flower server. Returns: list[str]: A list representing the Flower server start command. """ command = ["flower-superlink", "--fleet-api-type", "grpc-adapter"] - if "insecure" in superlink_params: - if superlink_params["insecure"]: + if "insecure" in self.superlink_params: + if self.superlink_params["insecure"]: command += ["--insecure"] - if "serverappio-api-address" in superlink_params: - command += ["--serverappio-api-address", str(superlink_params["serverappio-api-address"])] + if "serverappio-api-address" in self.superlink_params: + command += ["--serverappio-api-address", str(self.superlink_params["serverappio-api-address"])] # flwr default: 0.0.0.0:9091 - if "fleet-api-address" in superlink_params: - command += ["--fleet-api-address", str(superlink_params["fleet-api-address"])] + if "fleet-api-address" in self.superlink_params: + command += ["--fleet-api-address", str(self.superlink_params["fleet-api-address"])] # flwr default: 0.0.0.0:9092 - if "exec-api-address" in superlink_params: - command += ["--exec-api-address", str(superlink_params["exec-api-address"])] + if "exec-api-address" in self.superlink_params: + command += ["--exec-api-address", str(self.superlink_params["exec-api-address"])] # flwr default: 0.0.0.0:9093 - return command - - @property - def address(self) -> str: - """ - Get the fleet API address from the settings. - Returns: - str: The fleet API address. 
-        """
-        return self._settings.get("fleet-api-address", "0.0.0.0:9092") \ No newline at end of file
+        return command \ No newline at end of file
diff --git a/openfl/transport/grpc/aggregator_server.py b/openfl/transport/grpc/aggregator_server.py
index 7d496b610c..6130fb793b 100644
--- a/openfl/transport/grpc/aggregator_server.py
+++ b/openfl/transport/grpc/aggregator_server.py
@@ -14,7 +14,6 @@
 
 from openfl.protocols import aggregator_pb2, aggregator_pb2_grpc, utils
 from openfl.transport.grpc.grpc_channel_options import channel_options
-from openfl.transport.grpc.fim.flower.local_grpc_client import LocalGRPCClient
 from openfl.utilities import check_equal, check_is_in
 
 import subprocess
@@ -84,14 +83,10 @@ def __init__(
         self.private_key = private_key
         self.server = None
         self.server_credentials = None
-        self.use_flex = self.aggregator.is_flex_available()
+
+        self.use_flex = self.aggregator.is_flex_available()
         if self.use_flex:
-            # TODO: Users should have the option to specify this address or have it default
-            # note [kta-intel]: This is the address that the Flower server will be listening on
-            # superlink_address = '127.0.0.1:9092'
-            superlink_address = self.aggregator.get_flex_address()
-            self.local_grpc_client = LocalGRPCClient(superlink_address) # Initialize the local gRPC client for Flower
+            self.local_grpc_client = self.aggregator.get_local_grpc_client() # Initialize the local gRPC client
         else:
             self.local_grpc_client = None

From 8020a2fa1afffe3230b991974572baa20fb9f22d Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Tue, 17 Dec 2024 14:08:44 -0800
Subject: [PATCH 013/107] change fim to flex

Signed-off-by: kta-intel

---
 openfl/component/interoperability/flex_flower.py                 | 2 +-
 openfl/federated/task/runner_flower.py                           | 2 +-
 openfl/transport/grpc/aggregator_client.py                       | 2 +-
 openfl/transport/grpc/{fim => flex}/__init__.py                  | 0
 openfl/transport/grpc/{fim => flex}/flower/__init__.py           | 0
 openfl/transport/grpc/{fim => flex}/flower/deserialize_message.py | 0
 openfl/transport/grpc/{fim => flex}/flower/local_grpc_client.py  | 2 +-
 openfl/transport/grpc/{fim => flex}/flower/local_grpc_server.py  | 0
 openfl/transport/grpc/{fim => flex}/flower/message_conversion.py | 0
 9 files changed, 4 insertions(+), 4 deletions(-)
 rename openfl/transport/grpc/{fim => flex}/__init__.py (100%)
 rename openfl/transport/grpc/{fim => flex}/flower/__init__.py (100%)
 rename openfl/transport/grpc/{fim => flex}/flower/deserialize_message.py (100%)
 rename openfl/transport/grpc/{fim => flex}/flower/local_grpc_client.py (92%)
 rename openfl/transport/grpc/{fim => flex}/flower/local_grpc_server.py (100%)
 rename openfl/transport/grpc/{fim => flex}/flower/message_conversion.py (100%)

diff --git a/openfl/component/interoperability/flex_flower.py b/openfl/component/interoperability/flex_flower.py
index 9ad1360ced..f3745d1ad6 100644
--- a/openfl/component/interoperability/flex_flower.py
+++ b/openfl/component/interoperability/flex_flower.py
@@ -1,5 +1,5 @@
 from openfl.component.interoperability.flex import FederatedLearningExchange
-from openfl.transport.grpc.fim.flower.local_grpc_client import LocalGRPCClient
+from openfl.transport.grpc.flex.flower.local_grpc_client import LocalGRPCClient
 
 class FLEXFlower(FederatedLearningExchange):
     """
diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py
index 5990844c15..9826b09a4d 100644
--- a/openfl/federated/task/runner_flower.py
+++ b/openfl/federated/task/runner_flower.py
@@ -4,7 +4,7 @@
 from multiprocessing import cpu_count
 from openfl.federated.task.runner import TaskRunner
 from openfl.transport import 
AggregatorGRPCClient -from openfl.transport.grpc.fim.flower.local_grpc_server import LocalGRPCServer +from openfl.transport.grpc.flex.flower.local_grpc_server import LocalGRPCServer import subprocess diff --git a/openfl/transport/grpc/aggregator_client.py b/openfl/transport/grpc/aggregator_client.py index 7c753c971a..40bd6915da 100644 --- a/openfl/transport/grpc/aggregator_client.py +++ b/openfl/transport/grpc/aggregator_client.py @@ -13,7 +13,7 @@ from openfl.pipelines import NoCompressionPipeline from openfl.protocols import aggregator_pb2, aggregator_pb2_grpc, utils from openfl.transport.grpc.grpc_channel_options import channel_options -from openfl.transport.grpc.fim.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message +from openfl.transport.grpc.flex.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message from openfl.utilities import check_equal diff --git a/openfl/transport/grpc/fim/__init__.py b/openfl/transport/grpc/flex/__init__.py similarity index 100% rename from openfl/transport/grpc/fim/__init__.py rename to openfl/transport/grpc/flex/__init__.py diff --git a/openfl/transport/grpc/fim/flower/__init__.py b/openfl/transport/grpc/flex/flower/__init__.py similarity index 100% rename from openfl/transport/grpc/fim/flower/__init__.py rename to openfl/transport/grpc/flex/flower/__init__.py diff --git a/openfl/transport/grpc/fim/flower/deserialize_message.py b/openfl/transport/grpc/flex/flower/deserialize_message.py similarity index 100% rename from openfl/transport/grpc/fim/flower/deserialize_message.py rename to openfl/transport/grpc/flex/flower/deserialize_message.py diff --git a/openfl/transport/grpc/fim/flower/local_grpc_client.py b/openfl/transport/grpc/flex/flower/local_grpc_client.py similarity index 92% rename from openfl/transport/grpc/fim/flower/local_grpc_client.py rename to openfl/transport/grpc/flex/flower/local_grpc_client.py index 60c71ff584..13d082bf5c 100644 --- a/openfl/transport/grpc/fim/flower/local_grpc_client.py +++ b/openfl/transport/grpc/flex/flower/local_grpc_client.py @@ -1,6 +1,6 @@ import grpc from flwr.proto import grpcadapter_pb2_grpc -from openfl.transport.grpc.fim.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message +from openfl.transport.grpc.flex.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message class LocalGRPCClient: """ diff --git a/openfl/transport/grpc/fim/flower/local_grpc_server.py b/openfl/transport/grpc/flex/flower/local_grpc_server.py similarity index 100% rename from openfl/transport/grpc/fim/flower/local_grpc_server.py rename to openfl/transport/grpc/flex/flower/local_grpc_server.py diff --git a/openfl/transport/grpc/fim/flower/message_conversion.py b/openfl/transport/grpc/flex/flower/message_conversion.py similarity index 100% rename from openfl/transport/grpc/fim/flower/message_conversion.py rename to openfl/transport/grpc/flex/flower/message_conversion.py From 700b21b84652ecefd349867432d564e1935d8afe Mon Sep 17 00:00:00 2001 From: kta-intel Date: Tue, 17 Dec 2024 14:27:32 -0800 Subject: [PATCH 014/107] add method to flex base class for acquiring local grpc client Signed-off-by: kta-intel --- openfl/component/aggregator/aggregator.py | 2 +- openfl/component/interoperability/flex.py | 8 ++++++++ openfl/federated/task/runner_flower.py | 1 - openfl/transport/grpc/aggregator_client.py | 1 + openfl/transport/grpc/flex/flower/local_grpc_server.py | 1 + 5 files changed, 11 insertions(+), 2 deletions(-) diff --git 
a/openfl/component/aggregator/aggregator.py b/openfl/component/aggregator/aggregator.py index 4876e55ac0..2ebe10cd93 100644 --- a/openfl/component/aggregator/aggregator.py +++ b/openfl/component/aggregator/aggregator.py @@ -705,7 +705,7 @@ def stop_flex(self): def get_local_grpc_client(self): if not self.is_flex_available(): raise RuntimeError("Federated Learning exchange as not been enabled.") - return self.flex.local_grpc_client + return self.flex.get_local_grpc_client() def _end_of_round_with_stragglers_check(self): """ diff --git a/openfl/component/interoperability/flex.py b/openfl/component/interoperability/flex.py index 7c7ca768d0..d3471d884f 100644 --- a/openfl/component/interoperability/flex.py +++ b/openfl/component/interoperability/flex.py @@ -14,6 +14,7 @@ def __init__(self, command: list[str], **kwargs): Args: command (list[str]): The command to run the server as a subprocess. """ + self.local_grpc_client = None self._command = command self._process = None @@ -40,3 +41,10 @@ def stop(self): logger.info("[FLEX] Subprocess stopped.") else: logger.info("[FLEX] No subprocess is currently running.") + + + def get_local_grpc_client(self): + """ + Get the local gRPC client. + """ + return self.local_grpc_client \ No newline at end of file diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 9826b09a4d..923a170f01 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -3,7 +3,6 @@ from flwr.proto import grpcadapter_pb2_grpc from multiprocessing import cpu_count from openfl.federated.task.runner import TaskRunner -from openfl.transport import AggregatorGRPCClient from openfl.transport.grpc.flex.flower.local_grpc_server import LocalGRPCServer import subprocess diff --git a/openfl/transport/grpc/aggregator_client.py b/openfl/transport/grpc/aggregator_client.py index 40bd6915da..379c669484 100644 --- a/openfl/transport/grpc/aggregator_client.py +++ b/openfl/transport/grpc/aggregator_client.py @@ -503,6 +503,7 @@ def send_message_to_server(self, flower_message, collaborator_name): The response from the OpenFL server, converted back to a Flower message. 
""" self._set_header(collaborator_name) + #TODO: use a general to/from openfl_message function with "add_header" option, do the message conversion before calling send_message_to_server openfl_message = flower_to_openfl_message(flower_message, header=self.header) openfl_response = self.stub.PelicanDrop(openfl_message) diff --git a/openfl/transport/grpc/flex/flower/local_grpc_server.py b/openfl/transport/grpc/flex/flower/local_grpc_server.py index bb52ec17ad..9906e44544 100644 --- a/openfl/transport/grpc/flex/flower/local_grpc_server.py +++ b/openfl/transport/grpc/flex/flower/local_grpc_server.py @@ -52,6 +52,7 @@ def process_queue(self): while True: request, response_queue = self.request_queue.get() # Send request to the OpenFL server + # TODO: do message conversions here flower_response = self.openfl_client.send_message_to_server(request, self.collaborator_name) # Send response to Flower client response_queue.put(flower_response) From 5995fcf6d54507366bfa5f8c81f3f35d0ed75f0c Mon Sep 17 00:00:00 2001 From: kta-intel Date: Wed, 18 Dec 2024 10:21:53 -0800 Subject: [PATCH 015/107] move message conversion out of openfl client methods Signed-off-by: kta-intel --- openfl/transport/grpc/aggregator_client.py | 16 ++++++---------- .../grpc/flex/flower/local_grpc_server.py | 8 ++++++-- .../grpc/flex/flower/message_conversion.py | 5 +++-- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/openfl/transport/grpc/aggregator_client.py b/openfl/transport/grpc/aggregator_client.py index 379c669484..f148c953f3 100644 --- a/openfl/transport/grpc/aggregator_client.py +++ b/openfl/transport/grpc/aggregator_client.py @@ -13,7 +13,6 @@ from openfl.pipelines import NoCompressionPipeline from openfl.protocols import aggregator_pb2, aggregator_pb2_grpc, utils from openfl.transport.grpc.grpc_channel_options import channel_options -from openfl.transport.grpc.flex.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message from openfl.utilities import check_equal @@ -491,25 +490,22 @@ def send_local_task_results( @_atomic_connection @_resend_data_on_reconnection - def send_message_to_server(self, flower_message, collaborator_name): + def send_message_to_server(self, openfl_message, collaborator_name): """ - Sends a message from the Flower SuperNode to the OpenFL server and returns the response. + Forwards a converted message from the local GRPC server (LGS) to the OpenFL server and returns the response. Args: - flower_message: The message from the Flower client to be sent to the OpenFL server. + openfl_message: The converted message from the LGS to be sent to the OpenFL server. collaborator_name: The name of the collaborator. Returns: - The response from the OpenFL server, converted back to a Flower message. + The response from the OpenFL server """ self._set_header(collaborator_name) - #TODO: use a general to/from openfl_message function with "add_header" option, do the message conversion before calling send_message_to_server - openfl_message = flower_to_openfl_message(flower_message, - header=self.header) + openfl_message.header.CopyFrom(self.header) openfl_response = self.stub.PelicanDrop(openfl_message) self.validate_response(openfl_response, collaborator_name) - flower_response = openfl_to_flower_message(openfl_response) - return flower_response + return openfl_response def _get_trained_model(self, experiment_name, model_type): """Get trained model RPC. 
diff --git a/openfl/transport/grpc/flex/flower/local_grpc_server.py b/openfl/transport/grpc/flex/flower/local_grpc_server.py index 9906e44544..2fa9177eb6 100644 --- a/openfl/transport/grpc/flex/flower/local_grpc_server.py +++ b/openfl/transport/grpc/flex/flower/local_grpc_server.py @@ -1,6 +1,7 @@ import threading import queue from flwr.proto import grpcadapter_pb2_grpc +from openfl.transport.grpc.flex.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message class LocalGRPCServer(grpcadapter_pb2_grpc.GrpcAdapterServicer): """ @@ -51,9 +52,12 @@ def process_queue(self): """ while True: request, response_queue = self.request_queue.get() + request = flower_to_openfl_message(request, header=None) + # Send request to the OpenFL server - # TODO: do message conversions here - flower_response = self.openfl_client.send_message_to_server(request, self.collaborator_name) + openfl_response = self.openfl_client.send_message_to_server(request, self.collaborator_name) + # Send response to Flower client + flower_response = openfl_to_flower_message(openfl_response) response_queue.put(flower_response) self.request_queue.task_done() \ No newline at end of file diff --git a/openfl/transport/grpc/flex/flower/message_conversion.py b/openfl/transport/grpc/flex/flower/message_conversion.py index 81f05f8094..c1494d93b0 100644 --- a/openfl/transport/grpc/flex/flower/message_conversion.py +++ b/openfl/transport/grpc/flex/flower/message_conversion.py @@ -2,7 +2,7 @@ from openfl.protocols import aggregator_pb2 # from deserialize_message import deserialize_flower_message -def flower_to_openfl_message(flower_message, header): +def flower_to_openfl_message(flower_message, header=None): """Convert a Flower MessageContainer to an OpenFL OpenFLMessage.""" if isinstance(flower_message, aggregator_pb2.DropPod()): # If the input is already an OpenFL message, return it as-is @@ -12,7 +12,8 @@ def flower_to_openfl_message(flower_message, header): # Create the OpenFL message openfl_message = aggregator_pb2.DropPod() # Set the MessageHeader fields based on the provided sender and receiver - openfl_message.header.CopyFrom(header) + if header: + openfl_message.header.CopyFrom(header) # openfl_message.message_type = flower_message.metadata['grpc-message-qualname'] serialized_flower_message = flower_message.SerializeToString() openfl_message.message.npbytes = serialized_flower_message From 37893a7e19d4846bc1b771155a55c2a22bd65573 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Wed, 18 Dec 2024 15:19:27 -0800 Subject: [PATCH 016/107] add flwr run to FLEX, add FLEXAssigner Signed-off-by: kta-intel --- .../flower-app-pytorch/plan/plan.yaml | 26 ++--- .../workspace/plan/defaults/aggregator.yaml | 3 + .../workspace/plan/defaults/fl_exchange.yaml | 2 - .../workspace/plan/defaults/flex.yaml | 1 + .../{tasks_flower.yaml => tasks_flex.yaml} | 0 openfl/component/__init__.py | 1 + openfl/component/aggregator/aggregator.py | 9 +- openfl/component/assigner/__init__.py | 1 + openfl/component/assigner/flex_assigner.py | 97 +++++++++++++++++++ openfl/component/collaborator/collaborator.py | 3 +- openfl/component/interoperability/flex.py | 30 +++--- .../component/interoperability/flex_flower.py | 56 ++++++++++- openfl/federated/plan/plan.py | 11 +-- openfl/federated/task/runner_flower.py | 10 +- openfl/interface/plan.py | 16 +-- 15 files changed, 199 insertions(+), 67 deletions(-) delete mode 100644 openfl-workspace/workspace/plan/defaults/fl_exchange.yaml create mode 100644 openfl-workspace/workspace/plan/defaults/flex.yaml 
rename openfl-workspace/workspace/plan/defaults/{tasks_flower.yaml => tasks_flex.yaml} (100%) create mode 100644 openfl/component/assigner/flex_assigner.py diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml index 520214dee9..2fa75e0b34 100644 --- a/openfl-workspace/flower-app-pytorch/plan/plan.yaml +++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml @@ -5,27 +5,24 @@ aggregator : defaults : plan/defaults/aggregator.yaml template : openfl.component.Aggregator settings : - init_state_path : null - best_state_path : null - last_state_path : null rounds_to_train : 1 flex : defaults : plan/defaults/flex.yaml template : openfl.component.FLEXFlower - enable : True settings : - superlink_params: + superlink_params : insecure : True serverappio-api-address : 127.0.0.1:9091 fleet-api-address : 127.0.0.1:9092 # note [kta-intel]: local gRPC client will connect here exec-api-address : 127.0.0.1:9093 # note [kta-intel]: port for server-app toml + flwr_run_params : # remove this to run flwr as a separate process + flwr_app_name : "app-pytorch" + federation_name : "local-poc" collaborator : defaults : plan/defaults/collaborator.yaml template : openfl.component.Collaborator - settings : - {} data_loader : defaults : plan/defaults/data_loader.yaml @@ -36,26 +33,21 @@ data_loader : task_runner : defaults : plan/defaults/task_runner.yaml template : openfl.federated.task.runner_flower.FlowerTaskRunner - settings : - {} network : defaults : plan/defaults/network.yaml - settings : - fim : True assigner : defaults : plan/defaults/assigner.yaml - template : openfl.component.RandomGroupedAssigner + template : openfl.component.FLEXAssigner settings : - task_groups : - - name : flower_adapter - percentage : 1.0 - tasks : + task_groups : + - name : FLEX_Flower + tasks : - start_client_adapter tasks : - defaults : plan/defaults/tasks_flower.yaml + defaults : plan/defaults/tasks_flex.yaml compression_pipeline : defaults : plan/defaults/compression_pipeline.yaml \ No newline at end of file diff --git a/openfl-workspace/workspace/plan/defaults/aggregator.yaml b/openfl-workspace/workspace/plan/defaults/aggregator.yaml index 0bb76e099d..b98a9b892e 100644 --- a/openfl-workspace/workspace/plan/defaults/aggregator.yaml +++ b/openfl-workspace/workspace/plan/defaults/aggregator.yaml @@ -2,3 +2,6 @@ template : openfl.component.Aggregator settings : db_store_rounds : 2 write_logs : true + init_state_path : init.pbuf + best_state_path : best.pbuf + last_state_path : last.pbuf diff --git a/openfl-workspace/workspace/plan/defaults/fl_exchange.yaml b/openfl-workspace/workspace/plan/defaults/fl_exchange.yaml deleted file mode 100644 index 40f1385f43..0000000000 --- a/openfl-workspace/workspace/plan/defaults/fl_exchange.yaml +++ /dev/null @@ -1,2 +0,0 @@ -template : openfl.component.FederatedLearningExchange -enable : True \ No newline at end of file diff --git a/openfl-workspace/workspace/plan/defaults/flex.yaml b/openfl-workspace/workspace/plan/defaults/flex.yaml new file mode 100644 index 0000000000..0841d205d0 --- /dev/null +++ b/openfl-workspace/workspace/plan/defaults/flex.yaml @@ -0,0 +1 @@ +template : openfl.component.FederatedLearningExchange \ No newline at end of file diff --git a/openfl-workspace/workspace/plan/defaults/tasks_flower.yaml b/openfl-workspace/workspace/plan/defaults/tasks_flex.yaml similarity index 100% rename from openfl-workspace/workspace/plan/defaults/tasks_flower.yaml rename to openfl-workspace/workspace/plan/defaults/tasks_flex.yaml 
diff --git a/openfl/component/__init__.py b/openfl/component/__init__.py
index 233d795ca0..6630f601ad 100644
--- a/openfl/component/__init__.py
+++ b/openfl/component/__init__.py
@@ -6,6 +6,7 @@
 from openfl.component.assigner.assigner import Assigner
 from openfl.component.assigner.random_grouped_assigner import RandomGroupedAssigner
 from openfl.component.assigner.static_grouped_assigner import StaticGroupedAssigner
+from openfl.component.assigner.flex_assigner import FLEXAssigner
 from openfl.component.collaborator.collaborator import Collaborator
 from openfl.component.straggler_handling_functions.cutoff_time_based_straggler_handling import (
     CutoffTimeBasedStragglerHandling,
diff --git a/openfl/component/aggregator/aggregator.py b/openfl/component/aggregator/aggregator.py
index 2ebe10cd93..702d7810ff 100644
--- a/openfl/component/aggregator/aggregator.py
+++ b/openfl/component/aggregator/aggregator.py
@@ -137,6 +137,7 @@ def __init__(
         self.uuid = aggregator_uuid
         self.federation_uuid = federation_uuid
         self.assigner = assigner
+        self.flex = flex
         self.quit_job_sent_to = []
 
         self.tensor_db = TensorDB()
@@ -176,11 +177,12 @@ def __init__(
                 tensor_pipe=self.compression_pipeline,
             )
         else:
-            if self.init_state_path:
+            if self.flex:
+                # The model definition will be handled by the respective framework
+                self.model = {}
+            else:
                 self.model: base_pb2.ModelProto = utils.load_proto(self.init_state_path)
                 self._load_initial_tensors()  # keys are TensorKeys
-            else:
-                self.model = {}
 
         self.collaborator_tensor_results = {}  # {TensorKey: nparray}}
 
@@ -198,7 +200,6 @@ def __init__(
 
         self.use_delta_updates = use_delta_updates
 
-        self.flex = flex
 
     def _load_initial_tensors(self):
        """Load all of the tensors required to begin federated learning.
diff --git a/openfl/component/assigner/__init__.py b/openfl/component/assigner/__init__.py
index 980a524b7f..81f03f6cb8 100644
--- a/openfl/component/assigner/__init__.py
+++ b/openfl/component/assigner/__init__.py
@@ -5,3 +5,4 @@
 from openfl.component.assigner.assigner import Assigner
 from openfl.component.assigner.random_grouped_assigner import RandomGroupedAssigner
 from openfl.component.assigner.static_grouped_assigner import StaticGroupedAssigner
+from openfl.component.assigner.flex_assigner import FLEXAssigner
diff --git a/openfl/component/assigner/flex_assigner.py b/openfl/component/assigner/flex_assigner.py
new file mode 100644
index 0000000000..8052dc5af0
--- /dev/null
+++ b/openfl/component/assigner/flex_assigner.py
@@ -0,0 +1,97 @@
+# Copyright 2020-2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+
+"""FLEX assigner module."""
+
+from openfl.component.assigner.assigner import Assigner
+
+
+class FLEXAssigner(Assigner):
+    """The task assigner maintains a list of tasks.
+
+    This assigner is designed to facilitate interoperability between federated learning frameworks.
+    The expectation is that the OpenFL collaborator is tasked with running the external framework's API.
+    By default, all collaborators will run the same single task, `start_client_adapter`, which
+    starts the external framework's client and begins relaying gRPC messages.
+
+    Attributes:
+        task_groups (list of object): Task groups to assign.
+    """
+
+    def __init__(self, task_groups=None, **kwargs):
+        """Initializes the FLEXAssigner.
+
+        Args:
+            task_groups (list of object): Task groups to assign.
+            **kwargs: Additional keyword arguments.
+        """
+        self.task_groups = task_groups
+        super().__init__(**kwargs)
+
+    def define_task_assignments(self):
+        """Define task assignments for each round and collaborator.
+
+        This method uses the assigner function to assign tasks to
+        collaborators for each OpenFL round.
+        """
+        if self.task_groups is None:
+            self.task_groups = [{"name": "default", "tasks": ['start_client_adapter'], "collaborators": self.authorized_cols}]
+
+        for group in self.task_groups:
+            if "tasks" not in group or not group["tasks"]:
+                group["tasks"] = ['start_client_adapter']
+            if "collaborators" not in group or not group["collaborators"]:
+                group["collaborators"] = self.authorized_cols
+
+            # Check if any task other than 'start_client_adapter' is present
+            for task in group["tasks"]:
+                if task != 'start_client_adapter':
+                    raise ValueError(f"Unsupported task '{task}' found. FLEXAssigner only supports 'start_client_adapter'.")
+
+        # Start by finding all of the tasks in all specified groups
+        self.all_tasks_in_groups = list(
+            {task for group in self.task_groups for task in group["tasks"]}
+        )
+
+        # Initialize the map of collaborators for a given task on a given round
+        for task in self.all_tasks_in_groups:
+            self.collaborators_for_task[task] = {i: [] for i in range(self.rounds)}
+
+        for group in self.task_groups:
+            group_col_list = group["collaborators"]
+            self.task_group_collaborators[group["name"]] = group_col_list
+            for col in group_col_list:
+                # For now, we assume that collaborators have the same tasks for
+                # every round
+                self.collaborator_tasks[col] = {i: group["tasks"] for i in range(self.rounds)}
+            # Now populate reverse lookup of tasks->group
+            for task in group["tasks"]:
+                for round_ in range(self.rounds):
+                    # This should append the list of collaborators performing
+                    # that task
+                    self.collaborators_for_task[task][round_] += group_col_list
+
+    def get_tasks_for_collaborator(self, collaborator_name, round_number):
+        """Get tasks for a specific collaborator in a specific round.
+
+        Args:
+            collaborator_name (str): Name of the collaborator.
+            round_number (int): Round number.
+
+        Returns:
+            list: List of tasks for the collaborator in the specified round.
+        """
+        return self.collaborator_tasks[collaborator_name][round_number]
+
+    def get_collaborators_for_task(self, task_name, round_number):
+        """Get collaborators for a specific task in a specific round.
+
+        Args:
+            task_name (str): Name of the task.
+            round_number (int): Round number.
+
+        Returns:
+            list: List of collaborators for the task in the specified round.
+        """
+        return self.collaborators_for_task[task_name][round_number]
\ No newline at end of file
diff --git a/openfl/component/collaborator/collaborator.py b/openfl/component/collaborator/collaborator.py
index ca64b7b349..6a0bba08a1 100644
--- a/openfl/component/collaborator/collaborator.py
+++ b/openfl/component/collaborator/collaborator.py
@@ -251,7 +251,8 @@ def do_task(self, task, round_number):
         func_name = self.task_config[task_name]["function"]
         kwargs = self.task_config[task_name]["kwargs"]
         if func_name=="start_client_adapter":
-            # TODO Need a major elegant and robust way to implement this
+            # TODO: Need to determine a more general way to handle this in order to enable
+            # additional tasks to be added to FLEX
             if hasattr(self.task_runner, func_name):
                 method = getattr(self.task_runner, func_name)
                 if callable(method):
diff --git a/openfl/component/interoperability/flex.py b/openfl/component/interoperability/flex.py
index d3471d884f..a61a1f94da 100644
--- a/openfl/component/interoperability/flex.py
+++ b/openfl/component/interoperability/flex.py
@@ -1,50 +1,56 @@
 import subprocess
-import logging
-
-logger = logging.getLogger(__name__)
+from logging import getLogger
 
 class FederatedLearningExchange:
     """
     A skeletal base class for managing a subprocess.
     """
 
-    def __init__(self, command: list[str], **kwargs):
+    def __init__(self, command: list[str], component_name: str = "Base", **kwargs):
         """
         Initialize FLEX with a command to run as a subprocess.
         Args:
             command (list[str]): The command to run the server as a subprocess.
+            component_name (str): The name of the specific FLEX component being used.
         """
         self.local_grpc_client = None
         self._command = command
         self._process = None
+        self.logger = getLogger(__name__)
+        self.component_name = component_name
 
     def start(self):
         """
         Start the subprocess with the provided command.
         """
         if self._process is None:
-            logger.info(f"[FLEX] Starting subprocess: {' '.join(self._command)}")
+            self.logger.info(f"[FLEX] Starting subprocess: {' '.join(self._command)}")
             self._process = subprocess.Popen(self._command)
-            logger.info(f"[FLEX] Subprocess started with PID: {self._process.pid}")
+            self.logger.info(f"[FLEX] Subprocess started with PID: {self._process.pid}")
         else:
-            logger.info("[FLEX] Subprocess is already running.")
+            self.logger.info("[FLEX] Subprocess is already running.")
 
     def stop(self):
         """
         Stop the subprocess if it is running.
         """
         if self._process:
-            logger.info(f"[FLEX] Stopping subprocess with PID: {self._process.pid}...")
+            self.logger.info(f"[FLEX] Stopping subprocess with PID: {self._process.pid}...")
             self._process.terminate()
             self._process.wait()
             self._process = None
-            logger.info("[FLEX] Subprocess stopped.")
+            self.logger.info("[FLEX] Subprocess stopped.")
         else:
-            logger.info("[FLEX] No subprocess is currently running.")
-
+            self.logger.info("[FLEX] No subprocess is currently running.")
 
     def get_local_grpc_client(self):
         """
         Get the local gRPC client.
         """
-        return self.local_grpc_client \ No newline at end of file
+        return self.local_grpc_client
+
+    def print_flex_info(self):
+        """
+        Print information indicating which FLEX component is being used. 
+ """ + self.logger.info(f"FLEX Enabled: {self.component_name}") \ No newline at end of file diff --git a/openfl/component/interoperability/flex_flower.py b/openfl/component/interoperability/flex_flower.py index f3745d1ad6..d71c3ad998 100644 --- a/openfl/component/interoperability/flex_flower.py +++ b/openfl/component/interoperability/flex_flower.py @@ -1,3 +1,4 @@ +import subprocess from openfl.component.interoperability.flex import FederatedLearningExchange from openfl.transport.grpc.flex.flower.local_grpc_client import LocalGRPCClient @@ -7,22 +8,27 @@ class FLEXFlower(FederatedLearningExchange): Responsible for generating the Flower server command. """ - def __init__(self, superlink_params: dict, **kwargs): + def __init__(self, superlink_params: dict, flwr_run_params: dict = None, **kwargs): """ Initialize FLEXFlower by building the server command from the superlink_params. Args: superlink_params (dict): A dictionary of Flower server settings. + flwr_run_params (dict, optional): A dictionary containing the Flower run parameters. Defaults to None. """ self.superlink_params = superlink_params + self.flwr_run_params = flwr_run_params command = self._build_command() - super().__init__(command) + super().__init__(command, component_name="Flower") flex_address = self.superlink_params.get("fleet-api-address", "0.0.0.0:9092") self.local_grpc_client = LocalGRPCClient(flex_address) + + self.flwr_run_command = self._build_flwr_run_command() if flwr_run_params else None + self.flwr_run_process = None def _build_command(self) -> list[str]: """ - Build the Flower server command based on settings. + Start the Flower SuperLink based on settings. Args: superlink_params (dict): Settings to configure the Flower server. Returns: @@ -46,4 +52,46 @@ def _build_command(self) -> list[str]: command += ["--exec-api-address", str(self.superlink_params["exec-api-address"])] # flwr default: 0.0.0.0:9093 - return command \ No newline at end of file + return command + + def _build_flwr_run_command(self) -> list[str]: + """ + Build the `flwr run` command to run the Flower application. + Returns: + list[str]: A list representing the flwr_run command. + """ + flwr_app_name = self.flwr_run_params.get("flwr_app_name") + federation_name = self.flwr_run_params.get("federation_name") + + command = ["flwr", "run", f"./{flwr_app_name}"] + if federation_name: + command.append(federation_name) + return command + + def start(self): + """ + Start the `flower-superlink` and `flwr run` subprocesses with the provided commands. + """ + super().start() + + if self.flwr_run_command and self.flwr_run_process is None: + self.logger.info(f"[FLEX] Starting `flwr run` subprocess: {' '.join(self.flwr_run_command)}") + self.flwr_run_process = subprocess.Popen(self.flwr_run_command) + self.logger.info(f"[FLEX] `flwr run` subprocess started with PID: {self.flwr_run_process.pid}") + elif self.flwr_run_process: + self.logger.info("[FLEX] `flwr run` subprocess is already running.") + + def stop(self): + """ + Stop the `flower-superlink` and `flwr run` subprocesses if they are running. 
+ """ + super().stop() + + if self.flwr_run_process: + self.logger.info(f"[FLEX] Stopping `flwr run` subprocess with PID: {self.flwr_run_process.pid}...") + self.flwr_run_process.terminate() + self.flwr_run_process.wait() + self.flwr_run_process = None + self.logger.info("[FLEX] `flwr run` subprocess stopped.") + else: + self.logger.info("[FLEX] No `flwr run` subprocess is currently running.") \ No newline at end of file diff --git a/openfl/federated/plan/plan.py b/openfl/federated/plan/plan.py index 793f41b7cc..a266b70476 100644 --- a/openfl/federated/plan/plan.py +++ b/openfl/federated/plan/plan.py @@ -343,13 +343,9 @@ def get_assigner(self): def get_flex(self): """Get federated learning exchange object.""" - defaults = self.config.get( - "flex", - {TEMPLATE: "openfl.components.FederatedLearningExchange", SETTINGS: {}}, - ) + defaults = self.config.get("flex") - if self.flex_ is None and defaults.get('enable', False): - defaults = {k: v for k, v in defaults.items() if k != 'enable'} + if self.flex_ is None and defaults: self.flex_ = Plan.build(**defaults) else: self.flex_ = None @@ -723,9 +719,6 @@ def get_server( server_args["aggregator"] = self.get_aggregator() - #TODO have this set in self.config["network"] - # server_args["fim"] = True - if self.server_ is None: self.server_ = AggregatorGRPCServer(**server_args) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 923a170f01..5e02c6dfa0 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -5,6 +5,7 @@ from openfl.federated.task.runner import TaskRunner from openfl.transport.grpc.flex.flower.local_grpc_server import LocalGRPCServer import subprocess +from logging import getLogger class FlowerTaskRunner(TaskRunner): @@ -21,6 +22,7 @@ def __init__(self, **kwargs): **kwargs: Additional parameters to pass to the functions. """ super().__init__(**kwargs) + self.logger = getLogger(__name__) self.num_partitions = self.data_loader.get_node_configs()[0] self.partition_id = self.data_loader.get_node_configs()[1] @@ -40,7 +42,6 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): **kwargs: Additional parameters, including 'local_server_port'. """ local_server_port = kwargs['local_server_port'] - # local_server_port = 9092 # note [kta-intel]: a direct connection to flower superlink # Start the local gRPC server server = grpc.server(ThreadPoolExecutor(max_workers=cpu_count())) @@ -49,7 +50,7 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): # TODO: add restrictions server.add_insecure_port(f'[::]:{local_server_port}') server.start() - print(f"OpenFL local gRPC server started, listening on port {local_server_port}.") + self.logger.info(f"OpenFL local gRPC server started, listening on port {local_server_port}.") # Start the Flower SuperNode in a subprocess command = [ @@ -57,11 +58,6 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): "--insecure", "--grpc-adapter", "--superlink", f"127.0.0.1:{local_server_port}", # note [kta-intel]: this connects to local gRPC server - # TODO: you must specify separate client ports when running multiple super nodes - # on a single machine (i.e. a local poc). We need to add ability to automatically - # set separate ports for each client if it is set as a local poc, otherwise it can be - # whatever is automatically set by the system. 
Or we can add option to set port manually - # or let it be automatically set "--clientappio-api-address", f"127.0.0.1:{self.client_port}", "--node-config", f"num-partitions={self.num_partitions} partition-id={self.partition_id}" ] diff --git a/openfl/interface/plan.py b/openfl/interface/plan.py index 43ac31f86e..f0ec068de9 100644 --- a/openfl/interface/plan.py +++ b/openfl/interface/plan.py @@ -56,7 +56,7 @@ def plan(context): required=False, help="Authorized collaborator list [plan/cols.yaml]", default="plan/cols.yaml", - type=ClickPath(exists=False), + type=ClickPath(exists=True), ) @option( "-d", @@ -64,7 +64,7 @@ def plan(context): required=False, help="The data set/shard configuration file [plan/data.yaml]", default="plan/data.yaml", - type=ClickPath(exists=False), + type=ClickPath(exists=True), ) @option( "-a", @@ -95,13 +95,6 @@ def plan(context): help="Install packages listed under 'requirements.txt'. True/False [Default: True]", default=True, ) -@option( - "-fim", - "--framework_interoperability_mode", - required=False, - help="For interoperability with other FL frameworks. True/False [Default: True]", - default=False, -) def initialize( context, plan_config, @@ -111,7 +104,6 @@ def initialize( input_shape, gandlf_config, install_reqs, - framework_interoperability_mode ): """Initialize Data Science plan. @@ -172,7 +164,9 @@ def initialize( gandlf_config_path=gandlf_config, ) - if not framework_interoperability_mode: + if 'flex' in plan.config: + logger.info("FLEX enabled: %s", plan.config['flex']) + else: init_state_path = plan.config["aggregator"]["settings"]["init_state_path"] # This is needed to bypass data being locally available From 48475fdd19ee689b79aa9efd935f285bfc3b4350 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Wed, 18 Dec 2024 15:28:22 -0800 Subject: [PATCH 017/107] remove some TODOs and commented out code Signed-off-by: kta-intel --- openfl-workspace/flower-app-pytorch/plan/plan.yaml | 6 +++--- openfl/transport/grpc/aggregator_server.py | 13 ------------- .../transport/grpc/flex/flower/local_grpc_client.py | 2 -- .../transport/grpc/flex/flower/local_grpc_server.py | 4 +--- .../grpc/flex/flower/message_conversion.py | 3 ++- 5 files changed, 6 insertions(+), 22 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml index 2fa75e0b34..e91f7d5e9f 100644 --- a/openfl-workspace/flower-app-pytorch/plan/plan.yaml +++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml @@ -16,9 +16,9 @@ flex : serverappio-api-address : 127.0.0.1:9091 fleet-api-address : 127.0.0.1:9092 # note [kta-intel]: local gRPC client will connect here exec-api-address : 127.0.0.1:9093 # note [kta-intel]: port for server-app toml - flwr_run_params : # remove this to run flwr as a separate process - flwr_app_name : "app-pytorch" - federation_name : "local-poc" + # flwr_run_params : + # flwr_app_name : "app-pytorch" + # federation_name : "local-poc" collaborator : defaults : plan/defaults/collaborator.yaml diff --git a/openfl/transport/grpc/aggregator_server.py b/openfl/transport/grpc/aggregator_server.py index 6130fb793b..e2bd3042fd 100644 --- a/openfl/transport/grpc/aggregator_server.py +++ b/openfl/transport/grpc/aggregator_server.py @@ -380,15 +380,6 @@ def serve(self): jobs havebeen sent. 
""" - # if getattr(self, 'use_flex', False): - - # # Start the Flower server app in a subprocess - # flwr_run_process = subprocess.Popen([ - # "flwr", - # "run", - # "./app-pytorch", - # "local-poc", #TODO: let model owner specify this - # ], shell=False) if self.use_flex: self.aggregator.start_flex() @@ -408,7 +399,3 @@ def serve(self): if self.use_flex: self.aggregator.stop_flex() - - # if getattr(self, 'use_flex', False): - # flwr_run_process.terminate() - # flwr_run_process.wait() diff --git a/openfl/transport/grpc/flex/flower/local_grpc_client.py b/openfl/transport/grpc/flex/flower/local_grpc_client.py index 13d082bf5c..ef7c4a8a6d 100644 --- a/openfl/transport/grpc/flex/flower/local_grpc_client.py +++ b/openfl/transport/grpc/flex/flower/local_grpc_client.py @@ -29,9 +29,7 @@ def send_receive(self, openfl_message, header): Returns: The response from the Flower SuperLink, converted back to OpenFL format. """ - # TODO: Add verification steps for messages coming from OpenFL transport flower_message = openfl_to_flower_message(openfl_message) flower_response = self.superlink_stub.SendReceive(flower_message) openfl_response = flower_to_openfl_message(flower_response, header=header) - # TODO: Add verification steps for messages coming from Flower server return openfl_response diff --git a/openfl/transport/grpc/flex/flower/local_grpc_server.py b/openfl/transport/grpc/flex/flower/local_grpc_server.py index 2fa9177eb6..0c3c03bdec 100644 --- a/openfl/transport/grpc/flex/flower/local_grpc_server.py +++ b/openfl/transport/grpc/flex/flower/local_grpc_server.py @@ -38,10 +38,8 @@ def SendReceive(self, request, context): Returns: The response from the OpenFL server. """ - # TODO: Add verification steps for messages coming from Flower clients response_queue = queue.Queue() self.request_queue.put((request, response_queue)) - # TODO: Add verification steps for messages coming from OpenFL transport return response_queue.get() def process_queue(self): @@ -53,7 +51,7 @@ def process_queue(self): while True: request, response_queue = self.request_queue.get() request = flower_to_openfl_message(request, header=None) - + # Send request to the OpenFL server openfl_response = self.openfl_client.send_message_to_server(request, self.collaborator_name) diff --git a/openfl/transport/grpc/flex/flower/message_conversion.py b/openfl/transport/grpc/flex/flower/message_conversion.py index c1494d93b0..81bc8e3df8 100644 --- a/openfl/transport/grpc/flex/flower/message_conversion.py +++ b/openfl/transport/grpc/flex/flower/message_conversion.py @@ -1,6 +1,6 @@ from flwr.proto import grpcadapter_pb2 from openfl.protocols import aggregator_pb2 -# from deserialize_message import deserialize_flower_message +# from openfl.transport.grpc.flex.flower.deserialize_message import deserialize_flower_message def flower_to_openfl_message(flower_message, header=None): """Convert a Flower MessageContainer to an OpenFL OpenFLMessage.""" @@ -8,6 +8,7 @@ def flower_to_openfl_message(flower_message, header=None): # If the input is already an OpenFL message, return it as-is return flower_message else: + # TODO: Add verification steps for messages coming from Flower entities """Convert a Flower MessageContainer to an OpenFL message.""" # Create the OpenFL message openfl_message = aggregator_pb2.DropPod() From 2c72feb01e644da0098142f7a10afe10e9b37f25 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Wed, 18 Dec 2024 15:33:55 -0800 Subject: [PATCH 018/107] click path for data.yaml set to true Signed-off-by: kta-intel --- openfl/interface/collaborator.py 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openfl/interface/collaborator.py b/openfl/interface/collaborator.py index 71437039e3..862ae4db84 100644 --- a/openfl/interface/collaborator.py +++ b/openfl/interface/collaborator.py @@ -56,7 +56,7 @@ def collaborator(context): required=False, help="The data set/shard configuration file [plan/data.yaml]", default="plan/data.yaml", - type=ClickPath(exists=False), + type=ClickPath(exists=True), ) @option( "-n", From 83e6c804105f7a5923af5712fcd07d899bbcc17f Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 19 Dec 2024 11:12:58 -0800 Subject: [PATCH 019/107] add requirements Signed-off-by: kta-intel --- openfl-workspace/flower-app-pytorch/requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 openfl-workspace/flower-app-pytorch/requirements.txt diff --git a/openfl-workspace/flower-app-pytorch/requirements.txt b/openfl-workspace/flower-app-pytorch/requirements.txt new file mode 100644 index 0000000000..27257ff61b --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/requirements.txt @@ -0,0 +1 @@ +./app-pytorch \ No newline at end of file From 07f2db79b0e9e2f6fe4a8a7a8242415f7492c8e9 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 19 Dec 2024 13:49:33 -0800 Subject: [PATCH 020/107] make importing flwr components conditioned on existing lib Signed-off-by: kta-intel --- openfl/component/__init__.py | 5 ++++- openfl/component/interoperability/__init__.py | 5 ++++- openfl/interface/workspace.py | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/openfl/component/__init__.py b/openfl/component/__init__.py index 6630f601ad..315236c847 100644 --- a/openfl/component/__init__.py +++ b/openfl/component/__init__.py @@ -1,6 +1,7 @@ # Copyright 2020-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +from importlib import util from openfl.component.aggregator.aggregator import Aggregator from openfl.component.assigner.assigner import Assigner @@ -18,5 +19,7 @@ StragglerHandlingPolicy, ) from openfl.component.interoperability.flex import FederatedLearningExchange -from openfl.component.interoperability.flex_flower import FLEXFlower + +if util.find_spec("flwr") is not None: + from openfl.component.interoperability.flex_flower import FLEXFlower diff --git a/openfl/component/interoperability/__init__.py b/openfl/component/interoperability/__init__.py index 0ab76ed6cf..29c4b14c9f 100644 --- a/openfl/component/interoperability/__init__.py +++ b/openfl/component/interoperability/__init__.py @@ -1,5 +1,8 @@ # Copyright 2020-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +from importlib import util from openfl.component.interoperability.flex import FederatedLearningExchange -from openfl.component.interoperability.flex_flower import FLEXFlower + +if util.find_spec("flwr") is not None: + from openfl.component.interoperability.flex_flower import FLEXFlower diff --git a/openfl/interface/workspace.py b/openfl/interface/workspace.py index 52129a967e..d5cd323ebb 100644 --- a/openfl/interface/workspace.py +++ b/openfl/interface/workspace.py @@ -138,6 +138,7 @@ def create(prefix, template): requirements_filename = "requirements.txt" if os.path.isfile(f"{str(prefix)}/{requirements_filename}"): + os.chdir(prefix) check_call( [ executable, From da975e5a8dc2f6a59f925aa01c42ec1107191e95 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 20 Dec 2024 08:02:12 -0800 Subject: [PATCH 021/107] enable send local task results in order to gracefully terminate Signed-off-by: kta-intel --- 
openfl/component/aggregator/aggregator.py | 23 ++- openfl/component/collaborator/collaborator.py | 3 + openfl/component/interoperability/flex.py | 1 + openfl/federated/task/runner_flower.py | 137 ++++++++++++++++-- openfl/transport/grpc/aggregator_client.py | 4 +- openfl/transport/grpc/aggregator_server.py | 8 +- .../grpc/flex/flower/message_conversion.py | 5 +- 7 files changed, 150 insertions(+), 31 deletions(-) diff --git a/openfl/component/aggregator/aggregator.py b/openfl/component/aggregator/aggregator.py index 702d7810ff..f12dd9ae94 100644 --- a/openfl/component/aggregator/aggregator.py +++ b/openfl/component/aggregator/aggregator.py @@ -598,8 +598,8 @@ def send_local_task_results( collaborator_name, round_number, task_name, - data_size, - named_tensors, + data_size=None, + named_tensors=None, ): """ RPC called by collaborator. @@ -635,6 +635,13 @@ def send_local_task_results( f"for {task_name}, round {round_number}" ) + if self.is_flex_available(): + # Skip to end of round check + with self.lock: + self._is_collaborator_done(collaborator_name, round_number) + self._end_of_round_with_stragglers_check() + + task_key = TaskResultKey(task_name, collaborator_name, round_number) # we mustn't have results already @@ -1012,8 +1019,10 @@ def _end_of_round_check(self): # Compute all validation related metrics all_tasks = self.assigner.get_all_tasks_for_round(self.round_number) - for task_name in all_tasks: - self._compute_validation_related_task_metrics(task_name) + + if not self.is_flex_available(): + for task_name in all_tasks: + self._compute_validation_related_task_metrics(task_name) if self.log_memory_usage: # This is the place to check the memory usage of the aggregator @@ -1026,8 +1035,10 @@ def _end_of_round_check(self): self._end_of_round_check_done[self.round_number] = True # Save the latest model - self.logger.info("Saving round %s model...", self.round_number) - self._save_model(self.round_number, self.last_state_path) + if not self.is_flex_available(): + # external FL framework will handle the model saving if FLEX is enabled + self.logger.info("Saving round %s model...", self.round_number) + self._save_model(self.round_number, self.last_state_path) self.round_number += 1 # resetting stragglers for task for a new round diff --git a/openfl/component/collaborator/collaborator.py b/openfl/component/collaborator/collaborator.py index 6a0bba08a1..3d13149639 100644 --- a/openfl/component/collaborator/collaborator.py +++ b/openfl/component/collaborator/collaborator.py @@ -257,6 +257,9 @@ def do_task(self, task, round_number): method = getattr(self.task_runner, func_name) if callable(method): method(self.client, self.collaborator_name, **kwargs) + # TODO: better to use self.send_task_results(global_output_tensor_dict, round_number, task_name) + # maybe set global_output_tensor to empty + self.client.send_local_task_results(self.collaborator_name, round_number, task_name) return else: raise AttributeError(f"{func_name} is not callable on {self.task_runner}") diff --git a/openfl/component/interoperability/flex.py b/openfl/component/interoperability/flex.py index a61a1f94da..cc0ef889d7 100644 --- a/openfl/component/interoperability/flex.py +++ b/openfl/component/interoperability/flex.py @@ -36,6 +36,7 @@ def stop(self): """ if self._process: self.logger.info(f"[FLEX] Stopping subprocess with PID: {self._process.pid}...") + import pdb; pdb.set_trace() self._process.terminate() self._process.wait() self._process = None diff --git a/openfl/federated/task/runner_flower.py 
b/openfl/federated/task/runner_flower.py index 5e02c6dfa0..c60d988f63 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -1,3 +1,105 @@ +# import grpc +# from concurrent.futures import ThreadPoolExecutor +# from flwr.proto import grpcadapter_pb2_grpc +# from multiprocessing import cpu_count +# from openfl.federated.task.runner import TaskRunner +# from openfl.transport.grpc.flex.flower.local_grpc_server import LocalGRPCServer +# import subprocess +# from logging import getLogger +# import threading + + +# class FlowerTaskRunner(TaskRunner): +# """ +# FlowerTaskRunner is a task runner that executes Flower SuperNode +# to initialize the experiment from the client side +# """ + +# def __init__(self, **kwargs): +# """ +# Initializes. + +# Args: +# **kwargs: Additional parameters to pass to the functions. +# """ +# super().__init__(**kwargs) +# self.logger = getLogger(__name__) +# self.num_partitions = self.data_loader.get_node_configs()[0] +# self.partition_id = self.data_loader.get_node_configs()[1] + +# # Define a base port number +# base_port = 5000 + +# # Calculate the client port by adding the partition ID to the base port +# self.client_port = base_port + self.partition_id + +# def start_client_adapter(self, openfl_client, collaborator_name, timeout=600, **kwargs): +# """ +# Starts the local gRPC server and the Flower SuperNode. + +# Args: +# openfl_client: The OpenFL client instance used to communicate with the OpenFL server. +# collaborator_name: The name of the collaborator. +# timeout: The timeout period in seconds after which the process will be terminated. +# **kwargs: Additional parameters, including 'local_server_port'. +# """ +# local_server_port = kwargs['local_server_port'] + +# # Start the local gRPC server +# server = grpc.server(ThreadPoolExecutor(max_workers=cpu_count())) +# grpcadapter_pb2_grpc.add_GrpcAdapterServicer_to_server(LocalGRPCServer(openfl_client, collaborator_name), server) + +# # TODO: add restrictions +# server.add_insecure_port(f'[::]:{local_server_port}') +# server.start() +# self.logger.info(f"OpenFL local gRPC server started, listening on port {local_server_port}.") + +# # Start the Flower SuperNode in a subprocess +# command = [ +# "flower-supernode", +# "--insecure", +# "--grpc-adapter", +# "--superlink", f"127.0.0.1:{local_server_port}", # note [kta-intel]: this connects to local gRPC server +# "--clientappio-api-address", f"127.0.0.1:{self.client_port}", +# "--node-config", f"num-partitions={self.num_partitions} partition-id={self.partition_id}" +# ] + +# # Create an event to signal when the subprocess is complete +# subprocess_complete_event = threading.Event() + +# def run_supernode(): +# # Start the subprocess +# supernode_process = subprocess.Popen(command, shell=False) +# supernode_process.communicate() +# subprocess_complete_event.set() + +# # Start the subprocess in a separate thread +# supernode_thread = threading.Thread(target=run_supernode) +# supernode_thread.start() + +# # Define a function to terminate the process after the timeout +# def terminate_process(): +# if not subprocess_complete_event.is_set(): +# self.logger.warning("Timeout reached. 
Terminating the Flower SuperNode process.") +# subprocess_complete_event.set() +# supernode_process.terminate() + +# # Start the timer +# timer = threading.Timer(timeout, terminate_process) +# timer.start() + +# try: +# # Wait for the subprocess to complete +# while not subprocess_complete_event.is_set(): +# subprocess_complete_event.wait(timeout=10) +# finally: +# # Shut down the server gracefully +# server.stop(0) +# self.logger.info("OpenFL local gRPC server shut down.") + +# supernode_thread.join() +# timer.cancel() + import grpc from concurrent.futures import ThreadPoolExecutor from flwr.proto import grpcadapter_pb2_grpc @@ -52,19 +154,22 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): server.start() self.logger.info(f"OpenFL local gRPC server started, listening on port {local_server_port}.") - # Start the Flower SuperNode in a subprocess - command = [ - "flower-supernode", - "--insecure", - "--grpc-adapter", - "--superlink", f"127.0.0.1:{local_server_port}", # note [kta-intel]: this connects to local gRPC server - "--clientappio-api-address", f"127.0.0.1:{self.client_port}", - "--node-config", f"num-partitions={self.num_partitions} partition-id={self.partition_id}" - ] - # Start the subprocess - supernode_process = subprocess.Popen(command, shell=False) - - server.wait_for_termination() - - supernode_process.terminate() - supernode_process.wait() \ No newline at end of file + server.stop(0) + self.logger.info(f"OpenFL local gRPC server stopped.") + + # # Start the Flower SuperNode in a subprocess + # command = [ + # "flower-supernode", + # "--insecure", + # "--grpc-adapter", + # "--superlink", f"127.0.0.1:{local_server_port}", # note [kta-intel]: this connects to local gRPC server + # "--clientappio-api-address", f"127.0.0.1:{self.client_port}", + # "--node-config", f"num-partitions={self.num_partitions} partition-id={self.partition_id}" + # ] + # # Start the subprocess + # supernode_process = subprocess.Popen(command, shell=False) + + # server.wait_for_termination() + + # supernode_process.terminate() + # supernode_process.wait() \ No newline at end of file diff --git a/openfl/transport/grpc/aggregator_client.py b/openfl/transport/grpc/aggregator_client.py index f148c953f3..fbef80f99d 100644 --- a/openfl/transport/grpc/aggregator_client.py +++ b/openfl/transport/grpc/aggregator_client.py @@ -457,8 +457,8 @@ def send_local_task_results( collaborator_name, round_number, task_name, - data_size, - named_tensors, + data_size=None, + named_tensors=None, ): """ Send task results to the aggregator. diff --git a/openfl/transport/grpc/aggregator_server.py b/openfl/transport/grpc/aggregator_server.py index e2bd3042fd..26dbbf0e7b 100644 --- a/openfl/transport/grpc/aggregator_server.py +++ b/openfl/transport/grpc/aggregator_server.py @@ -284,8 +284,8 @@ def SendLocalTaskResults(self, request, context): # NOQA:N802 aggregator_pb2.SendLocalTaskResultsResponse: The response to the request. 
""" - if self.use_flex: - context.abort(StatusCode.UNIMPLEMENTED, "This method is not available in framework interopability mode.") + # if self.use_flex: + # context.abort(StatusCode.UNIMPLEMENTED, "This method is not available in framework interopability mode.") try: proto = aggregator_pb2.TaskResults() @@ -395,7 +395,7 @@ def serve(self): except KeyboardInterrupt: pass - self.server.stop(0) - if self.use_flex: self.aggregator.stop_flex() + + self.server.stop(0) diff --git a/openfl/transport/grpc/flex/flower/message_conversion.py b/openfl/transport/grpc/flex/flower/message_conversion.py index 81bc8e3df8..bb29781afa 100644 --- a/openfl/transport/grpc/flex/flower/message_conversion.py +++ b/openfl/transport/grpc/flex/flower/message_conversion.py @@ -8,7 +8,6 @@ def flower_to_openfl_message(flower_message, header=None): # If the input is already an OpenFL message, return it as-is return flower_message else: - # TODO: Add verification steps for messages coming from Flower entities """Convert a Flower MessageContainer to an OpenFL message.""" # Create the OpenFL message openfl_message = aggregator_pb2.DropPod() @@ -16,6 +15,8 @@ def flower_to_openfl_message(flower_message, header=None): if header: openfl_message.header.CopyFrom(header) # openfl_message.message_type = flower_message.metadata['grpc-message-qualname'] + + # TODO: Add verification steps for messages coming from Flower entities serialized_flower_message = flower_message.SerializeToString() openfl_message.message.npbytes = serialized_flower_message openfl_message.message.size = len(serialized_flower_message) @@ -31,6 +32,4 @@ def openfl_to_flower_message(openfl_message): # Deserialize the Flower message from the DataStream npbytes field flower_message = grpcadapter_pb2.MessageContainer() flower_message.ParseFromString(openfl_message.message.npbytes) - bytes_parsed = openfl_message.message.npbytes - # import pdb; pdb.set_trace() return flower_message \ No newline at end of file From d2d6907b2521bd7de3313f04c54d31d03d1b7461 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 20 Dec 2024 08:20:26 -0800 Subject: [PATCH 022/107] fix superlink shutdown Signed-off-by: kta-intel --- openfl/component/interoperability/flex.py | 6 ++++-- .../component/interoperability/flex_flower.py | 17 +---------------- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/openfl/component/interoperability/flex.py b/openfl/component/interoperability/flex.py index cc0ef889d7..83719c7537 100644 --- a/openfl/component/interoperability/flex.py +++ b/openfl/component/interoperability/flex.py @@ -36,9 +36,11 @@ def stop(self): """ if self._process: self.logger.info(f"[FLEX] Stopping subprocess with PID: {self._process.pid}...") - import pdb; pdb.set_trace() self._process.terminate() - self._process.wait() + try: + self._process.wait(timeout=5) + except subprocess.TimeoutExpired: + self._process.kill() self._process = None self.logger.info("[FLEX] Subprocess stopped.") else: diff --git a/openfl/component/interoperability/flex_flower.py b/openfl/component/interoperability/flex_flower.py index d71c3ad998..541ab61d57 100644 --- a/openfl/component/interoperability/flex_flower.py +++ b/openfl/component/interoperability/flex_flower.py @@ -79,19 +79,4 @@ def start(self): self.flwr_run_process = subprocess.Popen(self.flwr_run_command) self.logger.info(f"[FLEX] `flwr run` subprocess started with PID: {self.flwr_run_process.pid}") elif self.flwr_run_process: - self.logger.info("[FLEX] `flwr run` subprocess is already running.") - - def stop(self): - """ - Stop the 
`flower-superlink` and `flwr run` subprocesses if they are running. - """ - super().stop() - - if self.flwr_run_process: - self.logger.info(f"[FLEX] Stopping `flwr run` subprocess with PID: {self.flwr_run_process.pid}...") - self.flwr_run_process.terminate() - self.flwr_run_process.wait() - self.flwr_run_process = None - self.logger.info("[FLEX] `flwr run` subprocess stopped.") - else: - self.logger.info("[FLEX] No `flwr run` subprocess is currently running.") \ No newline at end of file + self.logger.info("[FLEX] `flwr run` subprocess is already running.") \ No newline at end of file From 18aed1fd141d27c8694cff18499dda1e33c23e76 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 20 Dec 2024 09:10:55 -0800 Subject: [PATCH 023/107] gracefully terminate agg processes Signed-off-by: kta-intel --- openfl/component/interoperability/flex.py | 11 ++ .../component/interoperability/flex_flower.py | 12 +- openfl/federated/task/runner_flower.py | 109 +----------------- 3 files changed, 25 insertions(+), 107 deletions(-) diff --git a/openfl/component/interoperability/flex.py b/openfl/component/interoperability/flex.py index 83719c7537..a9758c89fc 100644 --- a/openfl/component/interoperability/flex.py +++ b/openfl/component/interoperability/flex.py @@ -1,4 +1,5 @@ import subprocess +import psutil from logging import getLogger class FederatedLearningExchange: @@ -36,6 +37,16 @@ def stop(self): """ if self._process: self.logger.info(f"[FLEX] Stopping subprocess with PID: {self._process.pid}...") + # Use psutil to find and terminate child processes + parent = psutil.Process(self._process.pid) + children = parent.children(recursive=True) + for child in children: + self.logger.info(f"[FLEX] Stopping child process with PID: {child.pid}...") + child.terminate() + _, still_alive = psutil.wait_procs(children, timeout=1) + for p in still_alive: + p.kill() + # Terminate the main process self._process.terminate() try: self._process.wait(timeout=5) diff --git a/openfl/component/interoperability/flex_flower.py b/openfl/component/interoperability/flex_flower.py index 541ab61d57..8132fa114c 100644 --- a/openfl/component/interoperability/flex_flower.py +++ b/openfl/component/interoperability/flex_flower.py @@ -74,9 +74,13 @@ def start(self): """ super().start() - if self.flwr_run_command and self.flwr_run_process is None: + if self.flwr_run_command: self.logger.info(f"[FLEX] Starting `flwr run` subprocess: {' '.join(self.flwr_run_command)}") self.flwr_run_process = subprocess.Popen(self.flwr_run_command) - self.logger.info(f"[FLEX] `flwr run` subprocess started with PID: {self.flwr_run_process.pid}") - elif self.flwr_run_process: - self.logger.info("[FLEX] `flwr run` subprocess is already running.") \ No newline at end of file + + def stop(self): + """ + Stop the `flower-superlink` subprocess. 
+ """ + # TODO : Add logic to maintain a long-lived federation -> might be better from the taskrunner + super().stop() \ No newline at end of file diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index c60d988f63..2fd1ed3c9e 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -1,105 +1,3 @@ -# import grpc -# from concurrent.futures import ThreadPoolExecutor -# from flwr.proto import grpcadapter_pb2_grpc -# from multiprocessing import cpu_count -# from openfl.federated.task.runner import TaskRunner -# from openfl.transport.grpc.flex.flower.local_grpc_server import LocalGRPCServer -# import subprocess -# from logging import getLogger -# import threading - - -# class FlowerTaskRunner(TaskRunner): -# """ -# FlowerTaskRunner is a task runner that executes Flower SuperNode -# to initialize the experiment from the client side -# """ - -# def __init__(self, **kwargs): -# """ -# Initializes. - -# Args: -# **kwargs: Additional parameters to pass to the functions. -# """ -# super().__init__(**kwargs) -# self.logger = getLogger(__name__) -# self.num_partitions = self.data_loader.get_node_configs()[0] -# self.partition_id = self.data_loader.get_node_configs()[1] - -# # Define a base port number -# base_port = 5000 - -# # Calculate the client port by adding the partition ID to the base port -# self.client_port = base_port + self.partition_id - -# def start_client_adapter(self, openfl_client, collaborator_name, timeout=600, **kwargs): -# """ -# Starts the local gRPC server and the Flower SuperNode. - -# Args: -# openfl_client: The OpenFL client instance used to communicate with the OpenFL server. -# collaborator_name: The name of the collaborator. -# timeout: The timeout period in seconds after which the process will be terminated. -# **kwargs: Additional parameters, including 'local_server_port'. -# """ -# local_server_port = kwargs['local_server_port'] - -# # Start the local gRPC server -# server = grpc.server(ThreadPoolExecutor(max_workers=cpu_count())) -# grpcadapter_pb2_grpc.add_GrpcAdapterServicer_to_server(LocalGRPCServer(openfl_client, collaborator_name), server) - -# # TODO: add restrictions -# server.add_insecure_port(f'[::]:{local_server_port}') -# server.start() -# self.logger.info(f"OpenFL local gRPC server started, listening on port {local_server_port}.") - -# # Start the Flower SuperNode in a subprocess -# command = [ -# "flower-supernode", -# "--insecure", -# "--grpc-adapter", -# "--superlink", f"127.0.0.1:{local_server_port}", # note [kta-intel]: this connects to local gRPC server -# "--clientappio-api-address", f"127.0.0.1:{self.client_port}", -# "--node-config", f"num-partitions={self.num_partitions} partition-id={self.partition_id}" -# ] - -# # Create an event to signal when the subprocess is complete -# subprocess_complete_event = threading.Event() - -# def run_supernode(): -# # Start the subprocess -# supernode_process = subprocess.Popen(command, shell=False) -# supernode_process.communicate() -# subprocess_complete_event.set() - -# # Start the subprocess in a separate thread -# supernode_thread = threading.Thread(target=run_supernode) -# supernode_thread.start() - -# # Define a function to terminate the process after the timeout -# def terminate_process(): -# if not subprocess_complete_event.is_set(): -# self.logger.warning("Timeout reached. 
Terminating the Flower SuperNode process.") -# subprocess_complete_event.set() -# supernode_process.terminate() - -# # Start the timer -# timer = threading.Timer(timeout, terminate_process) -# timer.start() - -# try: -# # Wait for the subprocess to complete -# while not subprocess_complete_event.is_set(): -# subprocess_complete_event.wait(timeout=10) -# finally: -# # Shut down the server gracefully -# server.stop(0) -# self.logger.info("OpenFL local gRPC server shut down.") - -# supernode_thread.join() -# timer.cancel() - import grpc from concurrent.futures import ThreadPoolExecutor from flwr.proto import grpcadapter_pb2_grpc @@ -169,7 +67,12 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): # # Start the subprocess # supernode_process = subprocess.Popen(command, shell=False) + # import pdb; pdb.set_trace() # server.wait_for_termination() # supernode_process.terminate() - # supernode_process.wait() \ No newline at end of file + # supernode_process.wait() + # try: + # supernode_process.wait(timeout=5) + # except subprocess.TimeoutExpired: + # supernode_process.kill() \ No newline at end of file From e290e310423a303597518dbf5b809c695bbf9561 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 20 Dec 2024 10:20:33 -0800 Subject: [PATCH 024/107] graceful shutdown at collaborators Signed-off-by: kta-intel --- openfl/component/interoperability/flex.py | 4 +- .../component/interoperability/flex_flower.py | 1 - openfl/federated/task/runner_flower.py | 72 ++++++++++++------- 3 files changed, 50 insertions(+), 27 deletions(-) diff --git a/openfl/component/interoperability/flex.py b/openfl/component/interoperability/flex.py index a9758c89fc..eca6e3f9de 100644 --- a/openfl/component/interoperability/flex.py +++ b/openfl/component/interoperability/flex.py @@ -37,7 +37,7 @@ def stop(self): """ if self._process: self.logger.info(f"[FLEX] Stopping subprocess with PID: {self._process.pid}...") - # Use psutil to find and terminate child processes + # find and terminate child processes parent = psutil.Process(self._process.pid) children = parent.children(recursive=True) for child in children: @@ -49,7 +49,7 @@ def stop(self): # Terminate the main process self._process.terminate() try: - self._process.wait(timeout=5) + self._process.wait(timeout=1) except subprocess.TimeoutExpired: self._process.kill() self._process = None diff --git a/openfl/component/interoperability/flex_flower.py b/openfl/component/interoperability/flex_flower.py index 8132fa114c..467a072736 100644 --- a/openfl/component/interoperability/flex_flower.py +++ b/openfl/component/interoperability/flex_flower.py @@ -82,5 +82,4 @@ def stop(self): """ Stop the `flower-superlink` subprocess. 
""" - # TODO : Add logic to maintain a long-lived federation -> might be better from the taskrunner super().stop() \ No newline at end of file diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 2fd1ed3c9e..d6b3bb33d8 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -6,6 +6,9 @@ from openfl.transport.grpc.flex.flower.local_grpc_server import LocalGRPCServer import subprocess from logging import getLogger +import signal +import threading +import psutil class FlowerTaskRunner(TaskRunner): @@ -52,27 +55,48 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): server.start() self.logger.info(f"OpenFL local gRPC server started, listening on port {local_server_port}.") - server.stop(0) - self.logger.info(f"OpenFL local gRPC server stopped.") - - # # Start the Flower SuperNode in a subprocess - # command = [ - # "flower-supernode", - # "--insecure", - # "--grpc-adapter", - # "--superlink", f"127.0.0.1:{local_server_port}", # note [kta-intel]: this connects to local gRPC server - # "--clientappio-api-address", f"127.0.0.1:{self.client_port}", - # "--node-config", f"num-partitions={self.num_partitions} partition-id={self.partition_id}" - # ] - # # Start the subprocess - # supernode_process = subprocess.Popen(command, shell=False) - - # import pdb; pdb.set_trace() - # server.wait_for_termination() - - # supernode_process.terminate() - # supernode_process.wait() - # try: - # supernode_process.wait(timeout=5) - # except subprocess.TimeoutExpired: - # supernode_process.kill() \ No newline at end of file + # Start the Flower SuperNode in a subprocess + command = [ + "flower-supernode", + "--insecure", + "--grpc-adapter", + "--superlink", f"127.0.0.1:{local_server_port}", # note [kta-intel]: this connects to local gRPC server + "--clientappio-api-address", f"127.0.0.1:{self.client_port}", + "--node-config", f"num-partitions={self.num_partitions} partition-id={self.partition_id}" + ] + # Start the subprocess + supernode_process = subprocess.Popen(command, shell=False) + + # Create an event to wait for the termination signal + termination_event = threading.Event() + + def signal_handler(_sig, _frame): + self.logger.info("Received shutdown signal. Terminating supernode process...") + + # find and terminate child processes + parent = psutil.Process(supernode_process.pid) + children = parent.children(recursive=True) + for child in children: + self.logger.info(f"[FLEX] Stopping child process with PID: {child.pid}...") + child.terminate() + _, still_alive = psutil.wait_procs(children, timeout=1) + for p in still_alive: + p.kill() + # Terminate the main process + + supernode_process.terminate() + try: + supernode_process.wait(timeout=1) + except subprocess.TimeoutExpired: + supernode_process.kill() + self.logger.info("Supernode process terminated. 
Shutting down gRPC server...") + server.stop(0) + self.logger.info("gRPC server stopped.") + termination_event.set() + + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + self.logger.info("Press CTRL+C to stop the server and supernode process.") + + termination_event.wait() \ No newline at end of file From bb000f4116db86ef834677d16d24d0c51e6f4c44 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 20 Dec 2024 10:56:46 -0800 Subject: [PATCH 025/107] add automatic shutdown to taskrunner Signed-off-by: kta-intel --- .../flower-app-pytorch/plan/plan.yaml | 2 + openfl/component/interoperability/flex.py | 36 ++++++------ openfl/federated/task/runner_flower.py | 55 ++++++++++++++++--- 3 files changed, 67 insertions(+), 26 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml index e91f7d5e9f..5fabf4e630 100644 --- a/openfl-workspace/flower-app-pytorch/plan/plan.yaml +++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml @@ -33,6 +33,8 @@ data_loader : task_runner : defaults : plan/defaults/task_runner.yaml template : openfl.federated.task.runner_flower.FlowerTaskRunner + settings : + auto_shutdown : True network : defaults : plan/defaults/network.yaml diff --git a/openfl/component/interoperability/flex.py b/openfl/component/interoperability/flex.py index eca6e3f9de..5c888c1ea9 100644 --- a/openfl/component/interoperability/flex.py +++ b/openfl/component/interoperability/flex.py @@ -4,14 +4,14 @@ class FederatedLearningExchange: """ - A skeletal base class for managing a subprocess. + A skeletal base class for managing a server process. """ def __init__(self, command: list[str], component_name: str = "Base", **kwargs): """ - Initialize FLEX with a command to run as a subprocess. + Initialize FLEX to run a server process. Args: - command (list[str]): The command to run the server as a subprocess. + command (list[str]): The command to run the server process. component_name (str): The name of the specific FLEX component being used. """ self.local_grpc_client = None @@ -22,28 +22,28 @@ def __init__(self, command: list[str], component_name: str = "Base", **kwargs): def start(self): """ - Start the subprocess with the provided command. + Start the server process with the provided command. """ if self._process is None: - self.logger.info(f"[FLEX] Starting subprocess: {' '.join(self._command)}") + self.logger.info(f"[FLEX] Starting server process: {' '.join(self._command)}") self._process = subprocess.Popen(self._command) - self.logger.info(f"[FLEX] Subprocess started with PID: {self._process.pid}") + self.logger.info(f"[FLEX] server process started with PID: {self._process.pid}") else: - self.logger.info("[FLEX] Subprocess is already running.") + self.logger.info("[FLEX] server process is already running.") def stop(self): """ - Stop the subprocess if it is running. + Stop the server process if it is running. 
""" if self._process: - self.logger.info(f"[FLEX] Stopping subprocess with PID: {self._process.pid}...") - # find and terminate child processes - parent = psutil.Process(self._process.pid) - children = parent.children(recursive=True) - for child in children: - self.logger.info(f"[FLEX] Stopping child process with PID: {child.pid}...") - child.terminate() - _, still_alive = psutil.wait_procs(children, timeout=1) + self.logger.info(f"[FLEX] Stopping server process with PID: {self._process.pid}...") + # find and terminate sub_process processes + main_process = psutil.Process(self._process.pid) + sub_processes = main_process.children(recursive=True) + for sub_process in sub_processes: + self.logger.info(f"[FLEX] Stopping server subprocess with PID: {sub_process.pid}...") + sub_process.terminate() + _, still_alive = psutil.wait_procs(sub_processes, timeout=1) for p in still_alive: p.kill() # Terminate the main process @@ -53,9 +53,9 @@ def stop(self): except subprocess.TimeoutExpired: self._process.kill() self._process = None - self.logger.info("[FLEX] Subprocess stopped.") + self.logger.info("[FLEX] Server process stopped.") else: - self.logger.info("[FLEX] No subprocess is currently running.") + self.logger.info("[FLEX] No server process is currently running.") def get_local_grpc_client(self): """ diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index d6b3bb33d8..35f201a0b3 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -9,6 +9,7 @@ import signal import threading import psutil +import time class FlowerTaskRunner(TaskRunner): @@ -17,11 +18,13 @@ class FlowerTaskRunner(TaskRunner): to initialize the experiment from the client side """ - def __init__(self, **kwargs): + def __init__(self, auto_shutdown=True, **kwargs): """ Initializes. Args: + auto_shutdown (bool): Whether to enable automatic shutdown based on subprocess activity. + Default to True. Set to False for long-lived component **kwargs: Additional parameters to pass to the functions. """ super().__init__(**kwargs) @@ -34,6 +37,7 @@ def __init__(self, **kwargs): # Calculate the client port by adding the partition ID to the base port self.client_port = base_port + self.partition_id + self.auto_shutdown = auto_shutdown def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): """ @@ -42,6 +46,7 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): Args: openfl_client: The OpenFL client instance used to communicate with the OpenFL server. collaborator_name: The name of the collaborator. + auto_shutdown: Whether to enable automatic shutdown based on subprocess activity. **kwargs: Additional parameters, including 'local_server_port'. """ local_server_port = kwargs['local_server_port'] @@ -73,13 +78,12 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): def signal_handler(_sig, _frame): self.logger.info("Received shutdown signal. 
Terminating supernode process...") - # find and terminate child processes - parent = psutil.Process(supernode_process.pid) - children = parent.children(recursive=True) - for child in children: - self.logger.info(f"[FLEX] Stopping child process with PID: {child.pid}...") - child.terminate() - _, still_alive = psutil.wait_procs(children, timeout=1) + # find and terminate client_app_process processes + main_subprocess = psutil.Process(supernode_process.pid) + client_app_processes = main_subprocess.children(recursive=True) + for client_app_process in client_app_processes: + client_app_process.terminate() + _, still_alive = psutil.wait_procs(client_app_processes, timeout=1) for p in still_alive: p.kill() # Terminate the main process @@ -97,6 +101,41 @@ def signal_handler(_sig, _frame): signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) + if self.auto_shutdown: + self.logger.info("Automatic shutdown enabled. Monitoring subprocess activity...") + + def monitor_subprocesses(): + main_subprocess = psutil.Process(supernode_process.pid) + previous_end_time = None + intervals = [] + + while True: + client_app_processes = main_subprocess.children(recursive=True) + if client_app_processes: + for client_app_process in client_app_processes: + client_app_process.wait() + end_time = time.time() + if previous_end_time is not None: + interval = end_time - previous_end_time + intervals.append(interval) + # self.logger.info(f"Subprocess ended. Interval: {interval:.2f} seconds.") + previous_end_time = end_time + + if previous_end_time is not None: + running_timer = time.time() - previous_end_time + if intervals: + average_interval = sum(intervals) / len(intervals) + # self.logger.info(f"Running timer: {running_timer:.2f} seconds. Average interval: {average_interval:.2f} seconds.") + if running_timer > 2 * average_interval: + self.logger.info("No new subprocess started within the expected time. Initiating shutdown...") + signal_handler(signal.SIGTERM, None) + return + + time.sleep(1) + + monitor_thread = threading.Thread(target=monitor_subprocesses) + monitor_thread.start() + self.logger.info("Press CTRL+C to stop the server and supernode process.") termination_event.wait() \ No newline at end of file From 81dfdda49a1ecfd800bbef16a619689255db47df Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 20 Dec 2024 12:23:10 -0800 Subject: [PATCH 026/107] remove flower readme, add --insecure as default Signed-off-by: kta-intel --- .../flower-app-pytorch/app-pytorch/README.md | 31 ------------------- .../component/interoperability/flex_flower.py | 2 ++ 2 files changed, 2 insertions(+), 31 deletions(-) delete mode 100644 openfl-workspace/flower-app-pytorch/app-pytorch/README.md diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/README.md b/openfl-workspace/flower-app-pytorch/app-pytorch/README.md deleted file mode 100644 index 998f39e69d..0000000000 --- a/openfl-workspace/flower-app-pytorch/app-pytorch/README.md +++ /dev/null @@ -1,31 +0,0 @@ -# app-pytorch: A Flower / PyTorch app - -## Install dependencies and project - -```bash -pip install -e . -``` - -## Run with the Simulation Engine - -In the `app-pytorch` directory, use `flwr run` to run a local simulation: - -```bash -flwr run . -``` - -Refer to the [How to Run Simulations](https://flower.ai/docs/framework/how-to-run-simulations.html) guide in the documentation for advice on how to optimize your simulations. 
- 
-## Run with the Deployment Engine
-
-> \[!NOTE\]
-> An update to this example will show how to run this Flower application with the Deployment Engine and TLS certificates, or with Docker.
-
-## Resources
-
-- Flower website: [flower.ai](https://flower.ai/)
-- Check the documentation: [flower.ai/docs](https://flower.ai/docs/)
-- Give Flower a ⭐️ on GitHub: [GitHub](https://github.com/adap/flower)
-- Join the Flower community!
-  - [Flower Slack](https://flower.ai/join-slack/)
-  - [Flower Discuss](https://discuss.flower.ai/)
diff --git a/openfl/component/interoperability/flex_flower.py b/openfl/component/interoperability/flex_flower.py
index 467a072736..d0a82104d2 100644
--- a/openfl/component/interoperability/flex_flower.py
+++ b/openfl/component/interoperability/flex_flower.py
@@ -39,6 +39,8 @@ def _build_command(self) -> list[str]:
         if "insecure" in self.superlink_params:
             if self.superlink_params["insecure"]:
                 command += ["--insecure"]
+        else:
+            command += ["--insecure"]
 
         if "serverappio-api-address" in self.superlink_params:
             command += ["--serverappio-api-address", str(self.superlink_params["serverappio-api-address"])]

From 961982e6bb23215a6a892e2d202a5a6728cf21df Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Fri, 20 Dec 2024 13:15:49 -0800
Subject: [PATCH 027/107] add readme

Signed-off-by: kta-intel
---
 openfl-workspace/flower-app-pytorch/README.md | 243 ++++++++++++++++++
 1 file changed, 243 insertions(+)
 create mode 100644 openfl-workspace/flower-app-pytorch/README.md

diff --git a/openfl-workspace/flower-app-pytorch/README.md b/openfl-workspace/flower-app-pytorch/README.md
new file mode 100644
index 0000000000..4509c36117
--- /dev/null
+++ b/openfl-workspace/flower-app-pytorch/README.md
@@ -0,0 +1,243 @@
+# Open(FL)ower
+
+This workspace demonstrates a new functionality in OpenFL to interoperate with [Flower](https://flower.ai/). In particular, a user can now use the Flower API to run an experiment on OpenFL infrastructure. OpenFL acts as an intermediary between the Flower SuperLink and Flower SuperNode, relaying messages across the network using OpenFL's transport mechanisms while Flower manages the experiment.
+
+## Overview
+
+In this repository, you'll notice a directory called `./app-pytorch`. This is effectively a Flower PyTorch app created using Flower's `flwr new` command that has been modified to run a local federation. The client and server apps dictate what will be run by the client and server respectively. `Task.py` defines the logic that will be executed by each app, such as the model definition, train/test tasks, etc.
+
+## Execution Methods
+
+There are two ways to run this workspace:
+
+1. Run `flwr run` as a subprocess of the aggregator alongside the SuperLink (default).
+2. Run `flwr run` as a [separate process](#invoke-flower-experiment-as-a-separate-command) after initializing the `SuperLink` and `SuperNode` at the aggregator and collaborators respectively.
+
+In addition, there are options to run the `SuperLink` and `SuperNode` as [long-lived components](#long-lived-superlink-and-supernode) that wait indefinitely for new runs or, by default, as short-lived components (similar to OpenFL's task runner) that terminate at the end of the experiment.
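+
+Conceptually, the relay treats each Flower message as an opaque payload: the message is serialized to bytes, carried inside an OpenFL message across OpenFL's gRPC channel, and parsed back into a Flower message on the other side. The snippet below is a simplified sketch of that wrapping, for intuition only; the real conversion helpers live in OpenFL's transport layer and wrap the payload in OpenFL's own protobuf messages:
+
+```python
+# Simplified sketch of the message relay; not the exact OpenFL implementation.
+from flwr.proto import grpcadapter_pb2
+
+def pack(flower_message: grpcadapter_pb2.MessageContainer) -> bytes:
+    # Serialize the Flower message so it can travel as an opaque payload
+    return flower_message.SerializeToString()
+
+def unpack(payload: bytes) -> grpcadapter_pb2.MessageContainer:
+    # Rebuild the Flower message from the relayed bytes
+    message = grpcadapter_pb2.MessageContainer()
+    message.ParseFromString(payload)
+    return message
+```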
+ 
+## Getting Started
+
+### Create a Workspace
+
+Start by creating a workspace:
+
+```sh
+fx workspace create --template flower-app-pytorch --prefix my_workspace
+cd my_workspace
+```
+
+This will create a workspace in your current working directory called `./my_workspace` as well as install the Flower app defined in `./app-pytorch`. This is where the experiment will take place.
+
+### Configure the Experiment
+Notice under `./plan`, you will find the familiar OpenFL YAML files to configure the experiment. `col.yaml` and `data.yaml` will be populated by the collaborators that will run the Flower client app and the respective data shard or directory they will perform their training and testing on.
+plan.yaml configures the experiment itself. The Open-Flower integration makes a few key changes to the `plan.yaml`:
+
+1. Introduction of a new top-level key (`flex`) to configure a newly introduced component called "FLEX (Federated Learning EXchange)". Specifically, the Flower integration uses a `FLEX` subclass called `FLEXFlower`. This component is run by the aggregator and is responsible for initializing the Flower SuperLink and connecting to the OpenFL server. The superlink parameters can be configured using `flex.settings.superlink_params`. If nothing is supplied, it will simply run `flower-superlink --insecure` with the command's default settings as dictated by Flower. It also includes the option to run the flwr run command via `flex.settings.flwr_run_params`. Without setting these commands, the aggregator will not invoke `flwr run` and it will be up to the user to run this process separately to start a Flower experiment.
+
+```yaml
+flex:
+  defaults: plan/defaults/flex.yaml
+  template: openfl.component.FLEXFlower
+  settings:
+    superlink_params:
+      insecure: True
+      serverappio-api-address: 127.0.0.1:9091
+      fleet-api-address: 127.0.0.1:9092
+      exec-api-address: 127.0.0.1:9093
+    flwr_run_params:
+      flwr_app_name: "app-pytorch"
+      federation_name: "local-poc"
+```
+
+2. `FLEXAssigner` and tasks designed to explicitly run `start_client_adapter` task for every authorized collaborator, which is defined by the Task Runner.
+
+```yaml
+assigner:
+  defaults: plan/defaults/assigner.yaml
+  template: openfl.component.FLEXAssigner
+  settings:
+    task_groups:
+      - name: FLEX_Flower
+        tasks:
+          - start_client_adapter
+```
+
+3. `FlowerTaskRunner`, which will execute the `start_client_adapter` task. This task starts the Flower SuperNode and connects it to the OpenFL client. The `FlowerTaskRunner` also has a setting, `FlowerTaskRunner.settings.auto_shutdown`, which defaults to `True`. When set to `True`, the task runner shuts down the SuperNode at the completion of an experiment; otherwise, it runs continuously.
+
+```yaml
+task_runner:
+  defaults: plan/defaults/task_runner.yaml
+  template: openfl.federated.task.runner_flower.FlowerTaskRunner
+  settings:
+    auto_shutdown: True
+```
+4. `FlowerDataLoader` with similar high-level functionality to other dataloaders.
+
+**IMPORTANT NOTE**: `aggregator.settings.rounds_to_train` is set to 1. __Do not edit this__. The actual number of rounds for the experiment is controlled by Flower logic inside of `./app-pytorch/pyproject.toml`. The entirety of the Flower experiment will run in a single OpenFL round. The aggregator round is there to stop the OpenFL components at the completion of the experiment.
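+
+For reference, the Flower round count mentioned in the note above lives in the app's run config inside `pyproject.toml`. The snippet below is a sketch of what that section typically looks like in a `flwr new` PyTorch app; check `./app-pytorch/pyproject.toml` for the exact keys and values used by this workspace:
+
+```toml
+[tool.flwr.app.config]
+num-server-rounds = 3
+fraction-fit = 0.5
+local-epochs = 1
+```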
+ +## Running the Workspace +Run the workspace as normal (certify the workspace, initialize the plan, register the collaborators, etc.): + +```SH +# Generate a Certificate Signing Request (CSR) for the Aggregator +fx aggregator generate-cert-request + +# The CA signs the aggregator's request, which is now available in the workspace +fx aggregator certify --silent + +# Initialize FL Plan and Model Weights for the Federation +fx plan initialize + +################################ +# Setup Collaborator 1 +################################ + +# Create a collaborator named "collaborator1" that will use shard "0" +fx collaborator create -n collaborator1 -d 0 + +# Generate a CSR for collaborator1 +fx collaborator generate-cert-request -n collaborator1 + +# The CA signs collaborator1's certificate +fx collaborator certify -n collaborator1 --silent + +################################ +# Setup Collaborator 2 +################################ + +# Create a collaborator named "collaborator2" that will use shard "1" +fx collaborator create -n collaborator2 -d 1 + +# Generate a CSR for collaborator2 +fx collaborator generate-cert-request -n collaborator2 + +# The CA signs collaborator2's certificate +fx collaborator certify -n collaborator2 --silent + +############################## +# Start to Run the Federation +############################## + +# Run the Aggregator +fx aggregator start +``` + +This will prepare the workspace and start the OpenFL aggregator, Flower superlink, and Flower serverapp. You should see something like: + +```SH +INFO 🧿 Starting the Aggregator Service. aggregator.py:70 +INFO Building `openfl.component.FLEXAssigner` Module. plan.py:226 +INFO Building `openfl.pipelines.NoCompressionPipeline` Module. plan.py:226 +INFO Building `openfl.component.straggler_handling_functions.CutoffTimeBasedStragglerHandling` Module. plan.py:226 +WARNING CutoffTimeBasedStragglerHandling is disabled as straggler_cutoff_time is set to np.inf. cutoff_time_based_straggler_handling.py:46 +INFO Building `openfl.component.FLEXFlower` Module. plan.py:226 +INFO Building `openfl.component.Aggregator` Module. plan.py:226 +use_tls=True +INFO [FLEX] Starting server process: flower-superlink --fleet-api-type grpc-adapter --insecure --serverappio-api-address flex.py:28 + 127.0.0.1:9091 --fleet-api-address 127.0.0.1:9092 --exec-api-address 127.0.0.1:9093 +INFO [FLEX] server process started with PID: 1972825 flex.py:30 +INFO Starting Aggregator gRPC Server aggregator_server.py:389 +INFO : Starting Flower SuperLink +WARNING : Option `--insecure` was set. Starting insecure HTTP server. +INFO : Flower Deployment Engine: Starting Exec API on 127.0.0.1:9093 +INFO : Flower ECE: Starting ServerAppIo API (gRPC-rere) on 127.0.0.1:9091 +INFO : Flower ECE: Starting Fleet API (GrpcAdapter) on 127.0.0.1:9092 +``` + +### Start Collaborators +Open 2 additional terminals for collaborators. +For collaborator 1's terminal, run: +```SH +fx collaborator start -n collaborator1 +``` +For collaborator 2's terminal, run: +```SH +fx collaborator start -n collaborator2 +``` +This will start the collaborator nodes, the Flower `SuperNode`, and Flower `ClientApp`, and begin running the Flower experiment. You should see something like: + +```SH +INFO 🧿 Starting a Collaborator Service. collaborator.py:85 +INFO Building `openfl.federated.data.loader_flower.FlowerDataLoader` Module. plan.py:226 +INFO Building `openfl.federated.task.runner_flower.FlowerTaskRunner` Module. plan.py:226 +INFO Building `openfl.pipelines.NoCompressionPipeline` Module. 
plan.py:226
+INFO     Building `openfl.component.Collaborator` Module.    plan.py:226
+INFO     Waiting for tasks...    collaborator.py:222
+INFO     Received the following tasks: [name: "start_client_adapter"    collaborator.py:172
+         ]
+INFO     OpenFL local gRPC server started, listening on port 9090.    runner_flower.py:61
+INFO     Automatic shutdown enabled. Monitoring subprocess activity...    runner_flower.py:105
+INFO     Press CTRL+C to stop the server and supernode process.    runner_flower.py:139
+INFO :      Starting Flower SuperNode
+WARNING :   Option `--insecure` was set. Starting insecure HTTP channel to 127.0.0.1:9090.
+INFO :      Starting Flower ClientAppIo gRPC server on 127.0.0.1:5000
+```
+### Completion of the Experiment
+Upon the completion of the experiment, on the `aggregator` terminal, the Flower components should send an experiment summary as the `SuperLink` continues to receive requests from the SuperNode:
+```SH
+INFO :      [SUMMARY]
+INFO :      Run finished 3 round(s) in 93.29s
+INFO :          History (loss, distributed):
+INFO :                  round 1: 2.0937052175497555
+INFO :                  round 2: 1.8027011854633406
+INFO :                  round 3: 1.6812996898487116
+INFO :      GrpcAdapter.PullTaskIns
+INFO :      GrpcAdapter.PullTaskIns
+INFO :      GrpcAdapter.PullTaskIns
+```
+If `auto_shutdown` is enabled, this will be shortly followed by the OpenFL `aggregator` receiving "results" from the `collaborator` and subsequently shutting down:
+
+```SH
+INFO     Collaborator collaborator1 is sending task results for start_client_adapter, round 0    aggregator.py:633
+INFO     Round: 0, Collaborators that have completed all tasks: ['collaborator1']    aggregator.py:1095
+INFO :      GrpcAdapter.DeleteNode
+INFO     Collaborator collaborator2 is sending task results for start_client_adapter, round 0    aggregator.py:633
+INFO     Round: 0, Collaborators that have completed all tasks: ['collaborator1', 'collaborator2']    aggregator.py:1095
+INFO     Experiment Completed. Cleaning up...    aggregator.py:1053
+INFO     Sending signal to collaborator collaborator2 to shutdown...    aggregator.py:360
+INFO     Sending signal to collaborator collaborator1 to shutdown...    aggregator.py:360
+INFO     [FLEX] Stopping server process with PID: 1963348...    flex.py:39
+INFO     [FLEX] Stopping server subprocess with PID: 1964099...    flex.py:44
+INFO     [FLEX] Server process stopped.
+```
+Upon the completion of the experiment, on the `collaborator` terminals, the Flower components should output information about the run:
+
+```SH
+INFO :      [RUN ..., ROUND 3]
+INFO :      Received: evaluate message 53e1ad1c-ffeb-41cc-9857-3d1b83273bd9
+INFO :      Starting Flower ClientApp
+INFO :      Pulling ClientAppInputs for token ...
+INFO :      Pushing ClientAppOutputs for token ...
+```
+
+If `auto_shutdown` is enabled, this will be shortly followed by the OpenFL `collaborator` shutting down:
+
+```SH
+INFO :      Disconnect and shut down
+INFO     Supernode process terminated. Shutting down gRPC server...    runner_flower.py:96
+INFO     gRPC server stopped.    runner_flower.py:98
+INFO     Waiting for tasks...    collaborator.py:222
+INFO     End of Federation reached. Exiting...
+```
+Congratulations, you have run a Flower experiment through OpenFL's task runner!
+
+## Advanced Usage
+### Long-lived SuperLink and SuperNode
+If `auto_shutdown` is not enabled, Flower's `ServerApp` and `ClientApp` will shut down at the completion of the Flower experiment, but the `SuperLink` and `SuperNode` will continue to run. 
As a result, on the `aggregator` terminal, you will see constant requests coming from the `SuperNode`:
+```SH
+INFO :      GrpcAdapter.PullTaskIns
+INFO :      GrpcAdapter.PullTaskIns
+INFO :      GrpcAdapter.PullTaskIns
+```
+You can run another experiment by opening another terminal, navigating to this workspace, and running:
+```SH
+flwr run ./app-pytorch
+```
+It will run another experiment. Once you are done, you can manually shut down OpenFL's `collaborator` and Flower's `SuperNode` with `CTRL+C`. This will trigger a task-completion by the task runner that'll subsequently begin the graceful shutdown process of the OpenFL and Flower components.
+
+### Invoke Flower experiment as a separate command
+If you did not set `flwr_run_params` in the `plan.yaml`, the OpenFL `FLEX` will not automatically start a Flower experiment. Instead, you should open a terminal, navigate to this workspace, and run
+```SH
+flwr run ./app-pytorch
+```
+separately to begin the experiment.
\ No newline at end of file

From ba02e9c56d3af9da6823c2bdf2e721961f626719 Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Fri, 20 Dec 2024 13:19:45 -0800
Subject: [PATCH 028/107] modify plan and update readme

Signed-off-by: kta-intel
---
 openfl-workspace/flower-app-pytorch/README.md      | 10 ++++++++++
 openfl-workspace/flower-app-pytorch/plan/plan.yaml |  8 ++++----
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/openfl-workspace/flower-app-pytorch/README.md b/openfl-workspace/flower-app-pytorch/README.md
index 4509c36117..3a348757c6 100644
--- a/openfl-workspace/flower-app-pytorch/README.md
+++ b/openfl-workspace/flower-app-pytorch/README.md
@@ -17,6 +17,16 @@
 
 ## Getting Started
 
+### Install OpenFL
+
+Ensure that OpenFL is installed
+```sh
+pip install virtualenv
+virtualenv ./venv
+source ./venv/bin/activate
+pip install openfl
+```
+
 ### Create a Workspace
 
 Start by creating a workspace:
diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml
index 5fabf4e630..29c8fbae39 100644
--- a/openfl-workspace/flower-app-pytorch/plan/plan.yaml
+++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml
@@ -5,7 +5,7 @@ aggregator :
   defaults : plan/defaults/aggregator.yaml
   template : openfl.component.Aggregator
   settings :
-    rounds_to_train : 1
+    rounds_to_train : 1 #DO NOT EDIT
 
 flex :
   defaults : plan/defaults/flex.yaml
@@ -16,9 +16,9 @@ flex :
       serverappio-api-address : 127.0.0.1:9091
       fleet-api-address : 127.0.0.1:9092 # note [kta-intel]: local gRPC client will connect here
       exec-api-address : 127.0.0.1:9093 # note [kta-intel]: port for server-app toml
-    # flwr_run_params :
-    #   flwr_app_name : "app-pytorch"
-    #   federation_name : "local-poc"
+    flwr_run_params :
+      flwr_app_name : "app-pytorch"
+      federation_name : "local-poc"

From 7db176d33ac6ff82380f4dc812ec5142bd06297c Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Fri, 20 Dec 2024 13:23:28 -0800
Subject: [PATCH 029/107] install openfl instructions

Signed-off-by: kta-intel
---
 openfl-workspace/flower-app-pytorch/README.md | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/openfl-workspace/flower-app-pytorch/README.md b/openfl-workspace/flower-app-pytorch/README.md
index 3a348757c6..fa302ed677 100644
--- a/openfl-workspace/flower-app-pytorch/README.md
+++ b/openfl-workspace/flower-app-pytorch/README.md
@@ -19,12 +19,18 @@ In addition, there are options to run the `SuperLink` 
and `SuperNode` as [long-l ### Install OpenFL -Ensure that OpenFL is installed +Create virtual env ```sh pip install virtualenv virtualenv ./venv source ./venv/bin/activate -pip install openfl +``` + +Install OpenFL from source +```sh +git clone https://github.com/securefederatedai/openfl.git +cd openfl +pip install -e . ``` ### Create a Workspace From bf9f955114c70e345248e4611fba57cb53193a30 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 20 Dec 2024 14:15:23 -0800 Subject: [PATCH 030/107] update torch and torchvision Signed-off-by: kta-intel --- .../flower-app-pytorch/app-pytorch/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/pyproject.toml b/openfl-workspace/flower-app-pytorch/app-pytorch/pyproject.toml index f8e4c7c708..24cf96c916 100644 --- a/openfl-workspace/flower-app-pytorch/app-pytorch/pyproject.toml +++ b/openfl-workspace/flower-app-pytorch/app-pytorch/pyproject.toml @@ -10,8 +10,8 @@ license = "Apache-2.0" dependencies = [ "flwr-nightly==1.14.0.dev20241205", "flwr-datasets[vision]>=0.3.0", - "torch==2.2.1", - "torchvision==0.17.1", + "torch==2.3.1", + "torchvision==0.18.1", ] [tool.hatch.build.targets.wheel] From 4a3b37dd7819711b2f484d61a2c170e7a5435be6 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 2 Jan 2025 10:38:31 -0800 Subject: [PATCH 031/107] expand conditional Signed-off-by: kta-intel --- openfl/component/aggregator/aggregator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openfl/component/aggregator/aggregator.py b/openfl/component/aggregator/aggregator.py index f12dd9ae94..12bee168b5 100644 --- a/openfl/component/aggregator/aggregator.py +++ b/openfl/component/aggregator/aggregator.py @@ -1017,10 +1017,10 @@ def _end_of_round_check(self): if self._end_of_round_check_done[self.round_number]: return + if not self.is_flex_available(): # Compute all validation related metrics - all_tasks = self.assigner.get_all_tasks_for_round(self.round_number) + all_tasks = self.assigner.get_all_tasks_for_round(self.round_number) - if not self.is_flex_available(): for task_name in all_tasks: self._compute_validation_related_task_metrics(task_name) From 76b7936185c2c3240dfa39722699e0e204f4812e Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 2 Jan 2025 11:21:16 -0800 Subject: [PATCH 032/107] edit taskrunner docstrings Signed-off-by: kta-intel --- openfl/federated/task/runner_flower.py | 29 ++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 35f201a0b3..64f52e4e68 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -15,16 +15,25 @@ class FlowerTaskRunner(TaskRunner): """ FlowerTaskRunner is a task runner that executes Flower SuperNode - to initialize the experiment from the client side + to initialize the experiment from the client side. + + This class is responsible for starting a local gRPC server and a Flower SuperNode + in a subprocess. It also provides options for automatic shutdown based on subprocess + activity. + + Shutdown Options: + - Manual Shutdown: The server and supernode process can be manually stopped by pressing CTRL+C. + - Automatic Shutdown: If enabled, the system will monitor the activity of subprocesses and + automatically shut down if no new subprocess starts within a certain time frame. """ def __init__(self, auto_shutdown=True, **kwargs): """ - Initializes. 
+ Initializes the FlowerTaskRunner. Args: auto_shutdown (bool): Whether to enable automatic shutdown based on subprocess activity. - Default to True. Set to False for long-lived component + Default is True. Set to False for long-lived components. **kwargs: Additional parameters to pass to the functions. """ super().__init__(**kwargs) @@ -46,8 +55,20 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): Args: openfl_client: The OpenFL client instance used to communicate with the OpenFL server. collaborator_name: The name of the collaborator. - auto_shutdown: Whether to enable automatic shutdown based on subprocess activity. **kwargs: Additional parameters, including 'local_server_port'. + + The method performs the following steps: + 1. Starts a local gRPC server to handle communication between the OpenFL client and the Flower SuperNode. + 2. Launches the Flower SuperNode in a subprocess. + 3. Sets up signal handlers for manual shutdown (via CTRL+C). + 4. If auto_shutdown is enabled, monitors subprocess activity and initiates shutdown if no new subprocess starts within the expected time frame. + + Shutdown Process: + - When a shutdown signal (SIGINT or SIGTERM) is received, the method will: + 1. Terminate all child processes of the supernode subprocess. + 2. Terminate the main supernode subprocess. + 3. Stop the gRPC server. + 4. Log the shutdown process and set the termination event to stop the server. """ local_server_port = kwargs['local_server_port'] From ab3093eaf0868febd1e938e852aa7c24609214e7 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 2 Jan 2025 14:00:21 -0800 Subject: [PATCH 033/107] update name FLEX to Connector Signed-off-by: kta-intel --- openfl-workspace/flower-app-pytorch/README.md | 28 ++++---- .../flower-app-pytorch/plan/plan.yaml | 12 ++-- .../defaults/{flex.yaml => connector.yaml} | 0 .../{tasks_flex.yaml => tasks_connector.yaml} | 0 openfl/component/__init__.py | 6 +- openfl/component/aggregator/aggregator.py | 64 +++++++++++++------ openfl/component/assigner/__init__.py | 2 +- ...flex_assigner.py => connector_assigner.py} | 6 +- openfl/component/collaborator/collaborator.py | 2 +- openfl/component/interoperability/__init__.py | 4 +- .../{flex.py => connector.py} | 29 +++++---- .../{flex_flower.py => connector_flower.py} | 16 ++--- openfl/federated/plan/plan.py | 16 ++--- openfl/federated/task/runner_flower.py | 2 +- openfl/interface/plan.py | 4 +- openfl/transport/grpc/aggregator_server.py | 22 +++---- .../grpc/{flex => connector}/__init__.py | 0 .../{flex => connector}/flower/__init__.py | 0 .../flower/deserialize_message.py | 0 .../flower/local_grpc_client.py | 2 +- .../flower/local_grpc_server.py | 2 +- .../flower/message_conversion.py | 2 +- 22 files changed, 121 insertions(+), 98 deletions(-) rename openfl-workspace/workspace/plan/defaults/{flex.yaml => connector.yaml} (100%) rename openfl-workspace/workspace/plan/defaults/{tasks_flex.yaml => tasks_connector.yaml} (100%) rename openfl/component/assigner/{flex_assigner.py => connector_assigner.py} (96%) rename openfl/component/interoperability/{flex.py => connector.py} (58%) rename openfl/component/interoperability/{flex_flower.py => connector_flower.py} (82%) rename openfl/transport/grpc/{flex => connector}/__init__.py (100%) rename openfl/transport/grpc/{flex => connector}/flower/__init__.py (100%) rename openfl/transport/grpc/{flex => connector}/flower/deserialize_message.py (100%) rename openfl/transport/grpc/{flex => connector}/flower/local_grpc_client.py (91%) rename 
openfl/transport/grpc/{flex => connector}/flower/local_grpc_server.py (94%) rename openfl/transport/grpc/{flex => connector}/flower/message_conversion.py (94%) diff --git a/openfl-workspace/flower-app-pytorch/README.md b/openfl-workspace/flower-app-pytorch/README.md index fa302ed677..09c904a000 100644 --- a/openfl-workspace/flower-app-pytorch/README.md +++ b/openfl-workspace/flower-app-pytorch/README.md @@ -48,12 +48,12 @@ This will create a workspace in your current working directory called `./my_work Notice under `./plan`, you will find the familiar OpenFL YAML files to configure the experiment. `col.yaml` and `data.yaml` will be populated by the collaborators that will run the Flower client app and the respective data shard or directory they will perform their training and testing on. plan.yaml configures the experiment itself. The Open-Flower integration makes a few key changes to the `plan.yaml`: -1. Introduction of a new top-level key (`flex`) to configure a newly introduced component called "FLEX (Federated Learning EXchange)". Specifically, the Flower integration uses a `FLEX` subclass called `FLEXFlower`. This component is run by the aggregator and is responsible for initializing the Flower SuperLink and connecting to the OpenFL server. The superlink parameters can be configured using `flex.settings.superlink_params`. If nothing is supplied, it will simply run `flower-superlink --insecure` with the command's default settings as dictated by Flower. It also includes the option to run the flwr run command via `flex.settings.flwr_run_params`. Without setting these commands, the aggregator will not invoke `flwr run` and it will be up to the user to run this process separately to start a Flower experiment. +1. Introduction of a new top-level key (`connector`) to configure a newly introduced component called `Connector`. Specifically, the Flower integration uses a `Connector` subclass called `ConnectorFlower`. This component is run by the aggregator and is responsible for initializing the Flower SuperLink and connecting to the OpenFL server. The superlink parameters can be configured using `connector.settings.superlink_params`. If nothing is supplied, it will simply run `flower-superlink --insecure` with the command's default settings as dictated by Flower. It also includes the option to run the flwr run command via `connector.settings.flwr_run_params`. Without setting these commands, the aggregator will not invoke `flwr run` and it will be up to the user to run this process separately to start a Flower experiment. ```yaml -flex: - defaults: plan/defaults/flex.yaml - template: openfl.component.FLEXFlower +connector: + defaults: plan/defaults/connector.yaml + template: openfl.component.ConnectorFlower settings: superlink_params: insecure: True @@ -65,15 +65,15 @@ flex: federation_name: "local-poc" ``` -2. `FLEXAssigner` and tasks designed to explicitly run `start_client_adapter` task for every authorized collaborator, which is defined by the Task Runner. +2. `ConnectorAssigner` and tasks designed to explicitly run `start_client_adapter` task for every authorized collaborator, which is defined by the Task Runner. 
 ```yaml
 assigner:
   defaults: plan/defaults/assigner.yaml
-  template: openfl.component.FLEXAssigner
+  template: openfl.component.ConnectorAssigner
   settings:
     task_groups:
-      - name: FLEX_Flower
+      - name: Connector_Flower
         tasks:
           - start_client_adapter
 ```
@@ -149,9 +149,9 @@ WARNING  CutoffTimeBasedStragglerHandling is disabled as straggler_cutoff_time i
 INFO     Building `openfl.component.FLEXFlower` Module.    plan.py:226
 INFO     Building `openfl.component.Aggregator` Module.    plan.py:226
 use_tls=True
-INFO     [FLEX] Starting server process: flower-superlink --fleet-api-type grpc-adapter --insecure --serverappio-api-address    flex.py:28
-         127.0.0.1:9091 --fleet-api-address 127.0.0.1:9092 --exec-api-address 127.0.0.1:9093
-INFO     [FLEX] server process started with PID: 1972825    flex.py:30
+INFO     [OpenFL Connector] Starting server process: flower-superlink --fleet-api-type grpc-adapter --insecure    connector.py:28
+         --serverappio-api-address 127.0.0.1:9091 --fleet-api-address 127.0.0.1:9092 --exec-api-address 127.0.0.1:9093
+INFO     [OpenFL Connector] server process started with PID: 1972825    connector.py:30
 INFO     Starting Aggregator gRPC Server    aggregator_server.py:389
 INFO :      Starting Flower SuperLink
 WARNING :   Option `--insecure` was set. Starting insecure HTTP server.
@@ -212,9 +212,9 @@ INFO     Round: 0, Collaborators that have completed all tasks: ['collaborator1'
 INFO     Experiment Completed. Cleaning up...    aggregator.py:1053
 INFO     Sending signal to collaborator collaborator2 to shutdown...    aggregator.py:360
 INFO     Sending signal to collaborator collaborator1 to shutdown...    aggregator.py:360
-INFO     [FLEX] Stopping server process with PID: 1963348...    flex.py:39
-INFO     [FLEX] Stopping server subprocess with PID: 1964099...    flex.py:44
-INFO     [FLEX] Server process stopped.
+INFO     [OpenFL Connector] Stopping server process with PID: 1963348...    connector.py:39
+INFO     [OpenFL Connector] Stopping server subprocess with PID: 1964099...    connector.py:44
+INFO     [OpenFL Connector] Server process stopped.
 ```
 Upon the completion of the experiment, on the `collaborator` terminals, the Flower components should output information about the run:
@@ -252,7 +252,7 @@ flwr run ./app-pytorch
 It will run another experiment. Once you are done, you can manually shut down OpenFL's `collaborator` and Flower's `SuperNode` with `CTRL+C`. This will trigger a task-completion by the task runner that'll subsequently begin the graceful shutdown process of the OpenFL and Flower components.
 
 ### Invoke Flower experiment as a separate command
-If you did not set `flwr_run_params` in the `plan.yaml`, the OpenFL `FLEX` will not automatically start a Flower experiment. Instead, you should open a terminal, navigate to this workspace, and run
+If you did not set `flwr_run_params` in the `plan.yaml`, the OpenFL `Connector` will not automatically start a Flower experiment. 
Instead, you should open a terminal, navigate to this workspace, and run ```SH flwr run ./app-pytorch ``` diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml index 29c8fbae39..3da6e96906 100644 --- a/openfl-workspace/flower-app-pytorch/plan/plan.yaml +++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml @@ -7,9 +7,9 @@ aggregator : settings : rounds_to_train : 1 #DO NOT EDIT -flex : - defaults : plan/defaults/flex.yaml - template : openfl.component.FLEXFlower +connector : + defaults : plan/defaults/connector.yaml + template : openfl.component.ConnectorFlower settings : superlink_params : insecure : True @@ -41,15 +41,15 @@ network : assigner : defaults : plan/defaults/assigner.yaml - template : openfl.component.FLEXAssigner + template : openfl.component.ConnectorAssigner settings : task_groups : - - name : FLEX_Flower + - name : Connector_Flower tasks : - start_client_adapter tasks : - defaults : plan/defaults/tasks_flex.yaml + defaults : plan/defaults/tasks_connector.yaml compression_pipeline : defaults : plan/defaults/compression_pipeline.yaml \ No newline at end of file diff --git a/openfl-workspace/workspace/plan/defaults/flex.yaml b/openfl-workspace/workspace/plan/defaults/connector.yaml similarity index 100% rename from openfl-workspace/workspace/plan/defaults/flex.yaml rename to openfl-workspace/workspace/plan/defaults/connector.yaml diff --git a/openfl-workspace/workspace/plan/defaults/tasks_flex.yaml b/openfl-workspace/workspace/plan/defaults/tasks_connector.yaml similarity index 100% rename from openfl-workspace/workspace/plan/defaults/tasks_flex.yaml rename to openfl-workspace/workspace/plan/defaults/tasks_connector.yaml diff --git a/openfl/component/__init__.py b/openfl/component/__init__.py index 315236c847..d90c4206ae 100644 --- a/openfl/component/__init__.py +++ b/openfl/component/__init__.py @@ -7,7 +7,7 @@ from openfl.component.assigner.assigner import Assigner from openfl.component.assigner.random_grouped_assigner import RandomGroupedAssigner from openfl.component.assigner.static_grouped_assigner import StaticGroupedAssigner -from openfl.component.assigner.flex_assigner import FLEXAssigner +from openfl.component.assigner.connector_assigner import ConnectorAssigner from openfl.component.collaborator.collaborator import Collaborator from openfl.component.straggler_handling_functions.cutoff_time_based_straggler_handling import ( CutoffTimeBasedStragglerHandling, @@ -18,8 +18,8 @@ from openfl.component.straggler_handling_functions.straggler_handling_function import ( StragglerHandlingPolicy, ) -from openfl.component.interoperability.flex import FederatedLearningExchange +from openfl.component.interoperability.connector import Connector if util.find_spec("flwr") is not None: - from openfl.component.interoperability.flex_flower import FLEXFlower + from openfl.component.interoperability.connector_flower import ConnectorFlower diff --git a/openfl/component/aggregator/aggregator.py b/openfl/component/aggregator/aggregator.py index 12bee168b5..fe633f9351 100644 --- a/openfl/component/aggregator/aggregator.py +++ b/openfl/component/aggregator/aggregator.py @@ -71,7 +71,7 @@ def __init__( best_state_path, last_state_path, assigner, - flex, + connector, use_delta_updates=True, straggler_handling_policy=None, rounds_to_train=256, @@ -137,7 +137,7 @@ def __init__( self.uuid = aggregator_uuid self.federation_uuid = federation_uuid self.assigner = assigner - self.flex = flex + self.connector = connector 
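+        # The Connector (if set) bridges OpenFL to an external FL framework such as Flower;
+        # when it is None, the aggregator runs a standard OpenFL federation.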
self.quit_job_sent_to = [] self.tensor_db = TensorDB() @@ -177,7 +177,7 @@ def __init__( tensor_pipe=self.compression_pipeline, ) else: - if self.flex: + if self.connector: # The model definition will be handled by the respective framework self.model = {} else: @@ -635,7 +635,7 @@ def send_local_task_results( f"for {task_name}, round {round_number}" ) - if self.is_flex_available(): + if self.is_connector_available(): # Skip to end of round check with self.lock: self._is_collaborator_done(collaborator_name, round_number) @@ -697,23 +697,47 @@ def send_local_task_results( self._end_of_round_with_stragglers_check() - def is_flex_available(self): - return self.flex is not None + def is_connector_available(self): + """ + Check if the OpenFL Connector is available. + + Returns: + bool: True if connector is available, False otherwise. + """ + return self.connector is not None + + def start_connector(self): + """ + Start the OpenFL Connector. + + Raises: + RuntimeError: If OpenFL Connector has not been enabled. + """ + if not self.is_connector_available(): + raise RuntimeError("OpenFL Connector has not been enabled.") + return self.connector.start() - def start_flex(self): - if not self.is_flex_available(): - raise RuntimeError("Federated Learning exchange as not been enabled.") - return self.flex.start() + def stop_connector(self): + """ + Stop the OpenFL Connector. - def stop_flex(self): - if not self.is_flex_available(): - raise RuntimeError("Federated Learning exchange as not been enabled.") - return self.flex.stop() + Raises: + RuntimeError: If OpenFL Connector has not been enabled. + """ + if not self.is_connector_available(): + raise RuntimeError("OpenFL Connector has not been enabled.") + return self.connector.stop() def get_local_grpc_client(self): - if not self.is_flex_available(): - raise RuntimeError("Federated Learning exchange as not been enabled.") - return self.flex.get_local_grpc_client() + """ + Get the local gRPC client for the OpenFL Connector. + + Raises: + RuntimeError: If OpenFL Connector has not been enabled. 
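+
+        Returns:
+            The local gRPC client used to relay messages between OpenFL and the external framework.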
+ """ + if not self.is_connector_available(): + raise RuntimeError("OpenFL Connector has not been enabled.") + return self.connector.get_local_grpc_client() def _end_of_round_with_stragglers_check(self): """ @@ -1017,7 +1041,7 @@ def _end_of_round_check(self): if self._end_of_round_check_done[self.round_number]: return - if not self.is_flex_available(): + if not self.is_connector_available(): # Compute all validation related metrics all_tasks = self.assigner.get_all_tasks_for_round(self.round_number) @@ -1035,8 +1059,8 @@ def _end_of_round_check(self): self._end_of_round_check_done[self.round_number] = True # Save the latest model - if not self.is_flex_available(): - # external FL framework will handle the model saving if FLEX is enabled + if not self.is_connector_available(): + # external FL framework will handle the model saving if connector is enabled self.logger.info("Saving round %s model...", self.round_number) self._save_model(self.round_number, self.last_state_path) diff --git a/openfl/component/assigner/__init__.py b/openfl/component/assigner/__init__.py index 81f03f6cb8..73e3e1a477 100644 --- a/openfl/component/assigner/__init__.py +++ b/openfl/component/assigner/__init__.py @@ -5,4 +5,4 @@ from openfl.component.assigner.assigner import Assigner from openfl.component.assigner.random_grouped_assigner import RandomGroupedAssigner from openfl.component.assigner.static_grouped_assigner import StaticGroupedAssigner -from openfl.component.assigner.flex_assigner import FLEXAssigner +from openfl.component.assigner.connector_assigner import ConnectorAssigner diff --git a/openfl/component/assigner/flex_assigner.py b/openfl/component/assigner/connector_assigner.py similarity index 96% rename from openfl/component/assigner/flex_assigner.py rename to openfl/component/assigner/connector_assigner.py index 8052dc5af0..75d1d7d771 100644 --- a/openfl/component/assigner/flex_assigner.py +++ b/openfl/component/assigner/connector_assigner.py @@ -7,7 +7,7 @@ from openfl.component.assigner.assigner import Assigner -class FLEXAssigner(Assigner): +class ConnectorAssigner(Assigner): """The task assigner maintains a list of tasks. This assigner is designed to facilitate interoperability between federated learning frameworks. @@ -20,7 +20,7 @@ class FLEXAssigner(Assigner): """ def __init__(self, task_groups=None, **kwargs): - """Initializes the FLEXAssigner. + """Initializes the ConnectorAssigner. Args: task_groups (list of object): Task groups to assign. @@ -47,7 +47,7 @@ def define_task_assignments(self): # Check if any task other than 'start_client_adapter' is present for task in group["tasks"]: if task != 'start_client_adapter': - raise ValueError(f"Unsupported task '{task}' found. FLEXAssigner only supports 'start_client_adapter'.") + raise ValueError(f"Unsupported task '{task}' found. 
ConnectorAssigner only supports 'start_client_adapter'.")
 
         # Start by finding all of the tasks in all specified groups
         self.all_tasks_in_groups = list(
diff --git a/openfl/component/collaborator/collaborator.py b/openfl/component/collaborator/collaborator.py
index 3d13149639..39ba6ed0fb 100644
--- a/openfl/component/collaborator/collaborator.py
+++ b/openfl/component/collaborator/collaborator.py
@@ -252,7 +252,7 @@ def do_task(self, task, round_number):
         kwargs = self.task_config[task_name]["kwargs"]
         if func_name=="start_client_adapter":
             # TODO: Need to determine a more general way to handle this in order to enable
-            # additional tasks to be added to be added to FLEX
+            # additional tasks to be added to the Connector
             if hasattr(self.task_runner, func_name):
                 method = getattr(self.task_runner, func_name)
                 if callable(method):
diff --git a/openfl/component/interoperability/__init__.py b/openfl/component/interoperability/__init__.py
index 29c4b14c9f..0ea8b1f9df 100644
--- a/openfl/component/interoperability/__init__.py
+++ b/openfl/component/interoperability/__init__.py
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 from importlib import util
 
-from openfl.component.interoperability.flex import FederatedLearningExchange
+from openfl.component.interoperability.connector import Connector
 
 if util.find_spec("flwr") is not None:
-    from openfl.component.interoperability.flex_flower import FLEXFlower
+    from openfl.component.interoperability.connector_flower import ConnectorFlower
diff --git a/openfl/component/interoperability/flex.py b/openfl/component/interoperability/connector.py
similarity index 58%
rename from openfl/component/interoperability/flex.py
rename to openfl/component/interoperability/connector.py
index 5c888c1ea9..e1b14923a2 100644
--- a/openfl/component/interoperability/flex.py
+++ b/openfl/component/interoperability/connector.py
@@ -2,17 +2,18 @@
 import psutil
 from logging import getLogger
 
-class FederatedLearningExchange:
+class Connector:
     """
-    A skeletal base class for managing a server process.
+    A skeletal base class for managing a server process of an external federated learning framework and
+    the connection with OpenFL's server.
     """
     def __init__(self, command: list[str], component_name: str = "Base", **kwargs):
         """
-        Initialize FLEX to run a server process.
+        Initialize the OpenFL Connector.
         Args:
             command (list[str]): The command to run the server process.
-            component_name (str): The name of the specific FLEX component being used.
+            component_name (str): The name of the specific Connector component being used.
         """
         self.local_grpc_client = None
         self._command = command
@@ -25,23 +26,23 @@ def start(self):
         """
         Start the server process with the provided command.
         """
         if self._process is None:
-            self.logger.info(f"[FLEX] Starting server process: {' '.join(self._command)}")
+            self.logger.info(f"[OpenFL Connector] Starting server process: {' '.join(self._command)}")
             self._process = subprocess.Popen(self._command)
-            self.logger.info(f"[FLEX] server process started with PID: {self._process.pid}")
+            self.logger.info(f"[OpenFL Connector] server process started with PID: {self._process.pid}")
         else:
-            self.logger.info("[FLEX] server process is already running.")
+            self.logger.info("[OpenFL Connector] server process is already running.")
 
     def stop(self):
         """
         Stop the server process if it is running. 
""" if self._process: - self.logger.info(f"[FLEX] Stopping server process with PID: {self._process.pid}...") + self.logger.info(f"[OpenFL Connector] Stopping server process with PID: {self._process.pid}...") # find and terminate sub_process processes main_process = psutil.Process(self._process.pid) sub_processes = main_process.children(recursive=True) for sub_process in sub_processes: - self.logger.info(f"[FLEX] Stopping server subprocess with PID: {sub_process.pid}...") + self.logger.info(f"[OpenFL Connector] Stopping server subprocess with PID: {sub_process.pid}...") sub_process.terminate() _, still_alive = psutil.wait_procs(sub_processes, timeout=1) for p in still_alive: @@ -53,9 +54,9 @@ def stop(self): except subprocess.TimeoutExpired: self._process.kill() self._process = None - self.logger.info("[FLEX] Server process stopped.") + self.logger.info("[OpenFL Connector] Server process stopped.") else: - self.logger.info("[FLEX] No server process is currently running.") + self.logger.info("[OpenFL Connector] No server process is currently running.") def get_local_grpc_client(self): """ @@ -63,8 +64,8 @@ def get_local_grpc_client(self): """ return self.local_grpc_client - def print_flex_info(self): + def print_Connector_info(self): """ - Print information indicating which FLEX component is being used. + Print information indicating which Connector component is being used. """ - self.logger.info(f"FLEX Enabled: {self.component_name}") \ No newline at end of file + self.logger.info(f"OpenFL Connector Enabled: {self.component_name}") \ No newline at end of file diff --git a/openfl/component/interoperability/flex_flower.py b/openfl/component/interoperability/connector_flower.py similarity index 82% rename from openfl/component/interoperability/flex_flower.py rename to openfl/component/interoperability/connector_flower.py index d0a82104d2..48b3e7446d 100644 --- a/openfl/component/interoperability/flex_flower.py +++ b/openfl/component/interoperability/connector_flower.py @@ -1,16 +1,16 @@ import subprocess -from openfl.component.interoperability.flex import FederatedLearningExchange -from openfl.transport.grpc.flex.flower.local_grpc_client import LocalGRPCClient +from openfl.component.interoperability.connector import Connector +from openfl.transport.grpc.connector.flower.local_grpc_client import LocalGRPCClient -class FLEXFlower(FederatedLearningExchange): +class ConnectorFlower(Connector): """ - FLEX subclass for the Flower framework. + Connector subclass for the Flower framework. Responsible for generating the Flower server command. """ def __init__(self, superlink_params: dict, flwr_run_params: dict = None, **kwargs): """ - Initialize FLEXFlower by building the server command from the superlink_params. + Initialize ConnectorFlower by building the server command from the superlink_params. Args: superlink_params (dict): A dictionary of Flower server settings. flwr_run_params (dict, optional): A dictionary containing the Flower run parameters. Defaults to None. 
@@ -20,8 +20,8 @@ def __init__(self, superlink_params: dict, flwr_run_params: dict = None, **kwarg command = self._build_command() super().__init__(command, component_name="Flower") - flex_address = self.superlink_params.get("fleet-api-address", "0.0.0.0:9092") - self.local_grpc_client = LocalGRPCClient(flex_address) + connector_address = self.superlink_params.get("fleet-api-address", "0.0.0.0:9092") + self.local_grpc_client = LocalGRPCClient(connector_address) self.flwr_run_command = self._build_flwr_run_command() if flwr_run_params else None self.flwr_run_process = None @@ -77,7 +77,7 @@ def start(self): super().start() if self.flwr_run_command: - self.logger.info(f"[FLEX] Starting `flwr run` subprocess: {' '.join(self.flwr_run_command)}") + self.logger.info(f"[OpenFL Connector] Starting `flwr run` subprocess: {' '.join(self.flwr_run_command)}") self.flwr_run_process = subprocess.Popen(self.flwr_run_command) def stop(self): diff --git a/openfl/federated/plan/plan.py b/openfl/federated/plan/plan.py index a266b70476..a5f08b0d03 100644 --- a/openfl/federated/plan/plan.py +++ b/openfl/federated/plan/plan.py @@ -266,7 +266,7 @@ def __init__(self): self.collaborator_ = None # collaborator object self.aggregator_ = None # aggregator object self.assigner_ = None # assigner object - self.flex_ = None # federated learning exchange object + self.connector_ = None # OpenFL Connector object self.loader_ = None # data loader object self.runner_ = None # task runner object @@ -341,16 +341,16 @@ def get_assigner(self): return self.assigner_ - def get_flex(self): + def get_connector(self): """Get federated learning exchange object.""" - defaults = self.config.get("flex") + defaults = self.config.get("connector") - if self.flex_ is None and defaults: - self.flex_ = Plan.build(**defaults) + if self.connector_ is None and defaults: + self.connector_ = Plan.build(**defaults) else: - self.flex_ = None + self.connector_ = None - return self.flex_ + return self.connector_ def get_tasks(self): """Get federation tasks.""" @@ -403,7 +403,7 @@ def get_aggregator(self, tensor_dict=None): defaults[SETTINGS]["assigner"] = self.get_assigner() defaults[SETTINGS]["compression_pipeline"] = self.get_tensor_pipe() defaults[SETTINGS]["straggler_handling_policy"] = self.get_straggler_handling_policy() - defaults[SETTINGS]["flex"] = self.get_flex() + defaults[SETTINGS]["connector"] = self.get_connector() log_metric_callback = defaults[SETTINGS].get("log_metric_callback") if log_metric_callback: diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 64f52e4e68..42a261c485 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -3,7 +3,7 @@ from flwr.proto import grpcadapter_pb2_grpc from multiprocessing import cpu_count from openfl.federated.task.runner import TaskRunner -from openfl.transport.grpc.flex.flower.local_grpc_server import LocalGRPCServer +from openfl.transport.grpc.connector.flower.local_grpc_server import LocalGRPCServer import subprocess from logging import getLogger import signal diff --git a/openfl/interface/plan.py b/openfl/interface/plan.py index f0ec068de9..97007e5e2d 100644 --- a/openfl/interface/plan.py +++ b/openfl/interface/plan.py @@ -164,8 +164,8 @@ def initialize( gandlf_config_path=gandlf_config, ) - if 'flex' in plan.config: - logger.info("FLEX enabled: %s", plan.config['flex']) + if 'connector' in plan.config: + logger.info("OpenFL Connector enabled: %s", plan.config['connector']) else: init_state_path = 
plan.config["aggregator"]["settings"]["init_state_path"] diff --git a/openfl/transport/grpc/aggregator_server.py b/openfl/transport/grpc/aggregator_server.py index 26dbbf0e7b..f1a004401d 100644 --- a/openfl/transport/grpc/aggregator_server.py +++ b/openfl/transport/grpc/aggregator_server.py @@ -16,8 +16,6 @@ from openfl.transport.grpc.grpc_channel_options import channel_options from openfl.utilities import check_equal, check_is_in -import subprocess - logger = logging.getLogger(__name__) @@ -70,7 +68,7 @@ def __init__( TLS connection. private_key (str): The path to the server's private key for the TLS connection. - use_flex (bool): whether to use framework interopability mode + use_connector (bool): whether to use framework interopability mode **kwargs: Additional keyword arguments. """ print(f"{use_tls=}") @@ -83,9 +81,9 @@ def __init__( self.private_key = private_key self.server = None self.server_credentials = None - self.use_flex = self.aggregator.is_flex_available() + self.use_connector = self.aggregator.is_connector_available() - if self.use_flex: + if self.use_connector: self.local_grpc_client = self.aggregator.get_local_grpc_client() # Initialize the local gRPC client else: self.local_grpc_client = None @@ -242,7 +240,7 @@ def GetAggregatedTensor(self, request, context): # NOQA:N802 aggregator_pb2.GetAggregatedTensorResponse: The response to the request. """ - if self.use_flex: + if self.use_connector: context.abort(StatusCode.UNIMPLEMENTED, "This method is not available in framework interopability mode.") self.validate_collaborator(request, context) @@ -284,7 +282,7 @@ def SendLocalTaskResults(self, request, context): # NOQA:N802 aggregator_pb2.SendLocalTaskResultsResponse: The response to the request. """ - # if self.use_flex: + # if self.use_connector: # context.abort(StatusCode.UNIMPLEMENTED, "This method is not available in framework interopability mode.") try: @@ -323,7 +321,7 @@ def PelicanDrop(self, request, context): aggregator_pb2.PelicanDrop: The response to the request. 
""" - if not self.use_flex: + if not self.use_connector: context.abort(StatusCode.UNIMPLEMENTED, "PelicanDrop is only available in federated interopability mode.") self.validate_collaborator(request, context) @@ -381,8 +379,8 @@ def serve(self): """ - if self.use_flex: - self.aggregator.start_flex() + if self.use_connector: + self.aggregator.start_connector() self.get_server() @@ -395,7 +393,7 @@ def serve(self): except KeyboardInterrupt: pass - if self.use_flex: - self.aggregator.stop_flex() + if self.use_connector: + self.aggregator.stop_connector() self.server.stop(0) diff --git a/openfl/transport/grpc/flex/__init__.py b/openfl/transport/grpc/connector/__init__.py similarity index 100% rename from openfl/transport/grpc/flex/__init__.py rename to openfl/transport/grpc/connector/__init__.py diff --git a/openfl/transport/grpc/flex/flower/__init__.py b/openfl/transport/grpc/connector/flower/__init__.py similarity index 100% rename from openfl/transport/grpc/flex/flower/__init__.py rename to openfl/transport/grpc/connector/flower/__init__.py diff --git a/openfl/transport/grpc/flex/flower/deserialize_message.py b/openfl/transport/grpc/connector/flower/deserialize_message.py similarity index 100% rename from openfl/transport/grpc/flex/flower/deserialize_message.py rename to openfl/transport/grpc/connector/flower/deserialize_message.py diff --git a/openfl/transport/grpc/flex/flower/local_grpc_client.py b/openfl/transport/grpc/connector/flower/local_grpc_client.py similarity index 91% rename from openfl/transport/grpc/flex/flower/local_grpc_client.py rename to openfl/transport/grpc/connector/flower/local_grpc_client.py index ef7c4a8a6d..b56355b63f 100644 --- a/openfl/transport/grpc/flex/flower/local_grpc_client.py +++ b/openfl/transport/grpc/connector/flower/local_grpc_client.py @@ -1,6 +1,6 @@ import grpc from flwr.proto import grpcadapter_pb2_grpc -from openfl.transport.grpc.flex.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message +from openfl.transport.grpc.connector.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message class LocalGRPCClient: """ diff --git a/openfl/transport/grpc/flex/flower/local_grpc_server.py b/openfl/transport/grpc/connector/flower/local_grpc_server.py similarity index 94% rename from openfl/transport/grpc/flex/flower/local_grpc_server.py rename to openfl/transport/grpc/connector/flower/local_grpc_server.py index 0c3c03bdec..ad93883627 100644 --- a/openfl/transport/grpc/flex/flower/local_grpc_server.py +++ b/openfl/transport/grpc/connector/flower/local_grpc_server.py @@ -1,7 +1,7 @@ import threading import queue from flwr.proto import grpcadapter_pb2_grpc -from openfl.transport.grpc.flex.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message +from openfl.transport.grpc.connector.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message class LocalGRPCServer(grpcadapter_pb2_grpc.GrpcAdapterServicer): """ diff --git a/openfl/transport/grpc/flex/flower/message_conversion.py b/openfl/transport/grpc/connector/flower/message_conversion.py similarity index 94% rename from openfl/transport/grpc/flex/flower/message_conversion.py rename to openfl/transport/grpc/connector/flower/message_conversion.py index bb29781afa..122e386a28 100644 --- a/openfl/transport/grpc/flex/flower/message_conversion.py +++ b/openfl/transport/grpc/connector/flower/message_conversion.py @@ -1,6 +1,6 @@ from flwr.proto import grpcadapter_pb2 from openfl.protocols import aggregator_pb2 -# from 
openfl.transport.grpc.flex.flower.deserialize_message import deserialize_flower_message +# from openfl.transport.grpc.connector.flower.deserialize_message import deserialize_flower_message def flower_to_openfl_message(flower_message, header=None): """Convert a Flower MessageContainer to an OpenFL OpenFLMessage.""" From 21c13b5ca10ab480af617480e1fbde3ee58d9047 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 3 Jan 2025 14:03:00 -0800 Subject: [PATCH 034/107] more docstring Signed-off-by: kta-intel --- .../connector/flower/message_conversion.py | 45 +++++++++++++++---- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/openfl/transport/grpc/connector/flower/message_conversion.py b/openfl/transport/grpc/connector/flower/message_conversion.py index 122e386a28..448c991af9 100644 --- a/openfl/transport/grpc/connector/flower/message_conversion.py +++ b/openfl/transport/grpc/connector/flower/message_conversion.py @@ -1,22 +1,36 @@ from flwr.proto import grpcadapter_pb2 -from openfl.protocols import aggregator_pb2 +from openfl.protocols import aggregator_pb2 # from openfl.transport.grpc.connector.flower.deserialize_message import deserialize_flower_message def flower_to_openfl_message(flower_message, header=None): - """Convert a Flower MessageContainer to an OpenFL OpenFLMessage.""" - if isinstance(flower_message, aggregator_pb2.DropPod()): + """ + Convert a Flower MessageContainer to an OpenFL DropPod. + + This function takes a Flower MessageContainer and converts it into an OpenFL DropPod. + If the input is already an OpenFL DropPod, it returns the input as-is. + + Args: + flower_message (grpcadapter_pb2.MessageContainer or aggregator_pb2.DropPod): + The Flower message to be converted. It can either be a Flower MessageContainer + or an OpenFL DropPod. + header (aggregator_pb2.MessageHeader, optional): + An optional header to be included in the OpenFL DropPod. If provided, + it will be copied to the DropPod's header field. + + Returns: + aggregator_pb2.DropPod: The converted OpenFL DropPod message. + """ + if isinstance(flower_message, aggregator_pb2.DropPod): # If the input is already an OpenFL message, return it as-is return flower_message else: - """Convert a Flower MessageContainer to an OpenFL message.""" # Create the OpenFL message openfl_message = aggregator_pb2.DropPod() # Set the MessageHeader fields based on the provided sender and receiver if header: openfl_message.header.CopyFrom(header) - # openfl_message.message_type = flower_message.metadata['grpc-message-qualname'] - - # TODO: Add verification steps for messages coming from Flower entities + + # Serialize the Flower message and set it in the OpenFL message serialized_flower_message = flower_message.SerializeToString() openfl_message.message.npbytes = serialized_flower_message openfl_message.message.size = len(serialized_flower_message) @@ -24,12 +38,25 @@ def flower_to_openfl_message(flower_message, header=None): return openfl_message def openfl_to_flower_message(openfl_message): - """Convert an OpenFL OpenFLMessage to a Flower MessageContainer.""" + """ + Convert an OpenFL DropPod to a Flower MessageContainer. + + This function takes an OpenFL DropPod and converts it into a Flower MessageContainer. + If the input is already a Flower MessageContainer, it returns the input as-is. + + Args: + openfl_message (aggregator_pb2.DropPod or grpcadapter_pb2.MessageContainer): + The OpenFL message to be converted. It can either be an OpenFL DropPod + or a Flower MessageContainer. 
+ + Returns: + grpcadapter_pb2.MessageContainer: The converted Flower MessageContainer. + """ if isinstance(openfl_message, grpcadapter_pb2.MessageContainer): # If the input is already a Flower message, return it as-is return openfl_message else: - # Deserialize the Flower message from the DataStream npbytes field + # Deserialize the Flower message from the DataStream npbytes field flower_message = grpcadapter_pb2.MessageContainer() flower_message.ParseFromString(openfl_message.message.npbytes) return flower_message \ No newline at end of file From 12ee6df405b3c3784136865c04c36d9696c7e970 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Tue, 7 Jan 2025 14:14:52 -0800 Subject: [PATCH 035/107] move app-pytorch to src so that workspace can be properly exported and imported Signed-off-by: kta-intel --- openfl-workspace/flower-app-pytorch/README.md | 6 +++--- openfl-workspace/flower-app-pytorch/requirements.txt | 2 +- .../{ => src}/app-pytorch/app_pytorch/__init__.py | 0 .../{ => src}/app-pytorch/app_pytorch/client_app.py | 0 .../{ => src}/app-pytorch/app_pytorch/server_app.py | 0 .../{ => src}/app-pytorch/app_pytorch/task.py | 0 .../flower-app-pytorch/{ => src}/app-pytorch/pyproject.toml | 0 openfl/component/interoperability/connector_flower.py | 2 +- 8 files changed, 5 insertions(+), 5 deletions(-) rename openfl-workspace/flower-app-pytorch/{ => src}/app-pytorch/app_pytorch/__init__.py (100%) rename openfl-workspace/flower-app-pytorch/{ => src}/app-pytorch/app_pytorch/client_app.py (100%) rename openfl-workspace/flower-app-pytorch/{ => src}/app-pytorch/app_pytorch/server_app.py (100%) rename openfl-workspace/flower-app-pytorch/{ => src}/app-pytorch/app_pytorch/task.py (100%) rename openfl-workspace/flower-app-pytorch/{ => src}/app-pytorch/pyproject.toml (100%) diff --git a/openfl-workspace/flower-app-pytorch/README.md b/openfl-workspace/flower-app-pytorch/README.md index 09c904a000..b0fa089cee 100644 --- a/openfl-workspace/flower-app-pytorch/README.md +++ b/openfl-workspace/flower-app-pytorch/README.md @@ -4,7 +4,7 @@ This workspace demonstrates a new functionality in OpenFL to interoperate with [ ## Overview -In this repository, you'll notice a directory called `./app-pytorch`. This is effectively a Flower PyTorch app created using Flower's `flwr new` command that has been modified to run a local federation. The client and server apps dictate what will be run by the client and server respectively. `Task.py` defines the logic that will be executed by each app, such as the model definition, train/test tasks, etc. +In this repository, you'll notice a directory under `src` called `app-pytorch`. This is effectively a Flower PyTorch app created using Flower's `flwr new` command that has been modified to run a local federation. The client and server apps dictate what will be run by the client and server respectively. `Task.py` defines the logic that will be executed by each app, such as the model definition, train/test tasks, etc. ## Execution Methods @@ -247,13 +247,13 @@ INFO : GrpcAdapter.PullTaskIns ``` You can run another experiment by opening another terminal, navigating to this workspace, and running: ```SH -flwr run ./app-pytorch +flwr run ./src/app-pytorch ``` It will run another experiment. Once you are done, you can manually shut down OpenFL's `collaborator` and Flower's `SuperNode` with `CTRL+C`. This will trigger a task-completion by the task runner that'll subsequently begin the graceful shutdown process of the OpenFL and Flower components. 
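Each `GrpcAdapter.PullTaskIns` entry in the log above is a Flower `MessageContainer` that crosses the OpenFL channel wrapped in a `DropPod`, using the conversion helpers documented earlier in this series. A minimal round-trip sketch, not part of the workspace itself; the metadata key comes from the commented-out line in `message_conversion.py`, and the header fields assume the `sender`/`receiver` pair described in its docstrings:

```python
from flwr.proto import grpcadapter_pb2
from openfl.protocols import aggregator_pb2
from openfl.transport.grpc.connector.flower.message_conversion import (
    flower_to_openfl_message,
    openfl_to_flower_message,
)

# Build a Flower message and an OpenFL header (field values illustrative).
flower_msg = grpcadapter_pb2.MessageContainer()
flower_msg.metadata["grpc-message-qualname"] = "PullTaskIns"
header = aggregator_pb2.MessageHeader(sender="collaborator1", receiver="aggregator")

# Wrap: the DropPod payload is the serialized Flower message plus its size.
pod = flower_to_openfl_message(flower_msg, header=header)
assert pod.message.size == len(pod.message.npbytes)

# Unwrap: the original MessageContainer is restored byte-for-byte.
assert openfl_to_flower_message(pod) == flower_msg
```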
### Invoke Flower experiment as a separate command If you did not set `flwr_run_params` in the `plan.yaml`, the OpenFL `Connector` will not automatically start a Flower experiment. Instead, you should open a terminal, navigate to this workspace, and run ```SH -flwr run ./app-pytorch +flwr run ./src/app-pytorch ``` separately to begin the experiment. \ No newline at end of file diff --git a/openfl-workspace/flower-app-pytorch/requirements.txt b/openfl-workspace/flower-app-pytorch/requirements.txt index 27257ff61b..aa2724e793 100644 --- a/openfl-workspace/flower-app-pytorch/requirements.txt +++ b/openfl-workspace/flower-app-pytorch/requirements.txt @@ -1 +1 @@ -./app-pytorch \ No newline at end of file +./src/app-pytorch diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/__init__.py b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/__init__.py similarity index 100% rename from openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/__init__.py rename to openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/__init__.py diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/client_app.py b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/client_app.py similarity index 100% rename from openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/client_app.py rename to openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/client_app.py diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/server_app.py b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/server_app.py similarity index 100% rename from openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/server_app.py rename to openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/server_app.py diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/task.py b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py similarity index 100% rename from openfl-workspace/flower-app-pytorch/app-pytorch/app_pytorch/task.py rename to openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py diff --git a/openfl-workspace/flower-app-pytorch/app-pytorch/pyproject.toml b/openfl-workspace/flower-app-pytorch/src/app-pytorch/pyproject.toml similarity index 100% rename from openfl-workspace/flower-app-pytorch/app-pytorch/pyproject.toml rename to openfl-workspace/flower-app-pytorch/src/app-pytorch/pyproject.toml diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py index 48b3e7446d..eed2731f4b 100644 --- a/openfl/component/interoperability/connector_flower.py +++ b/openfl/component/interoperability/connector_flower.py @@ -65,7 +65,7 @@ def _build_flwr_run_command(self) -> list[str]: flwr_app_name = self.flwr_run_params.get("flwr_app_name") federation_name = self.flwr_run_params.get("federation_name") - command = ["flwr", "run", f"./{flwr_app_name}"] + command = ["flwr", "run", f"./src/{flwr_app_name}"] if federation_name: command.append(federation_name) return command From 27ea8d14bdcc01fba2584c8a2fa31b6b6934aede Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 9 Jan 2025 10:17:34 -0800 Subject: [PATCH 036/107] testing gramine Signed-off-by: kta-intel --- openfl-workspace/flower-app-pytorch/plan/plan.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml index 3da6e96906..3808dfdaaa 
100644 --- a/openfl-workspace/flower-app-pytorch/plan/plan.yaml +++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml @@ -16,9 +16,9 @@ connector : serverappio-api-address : 127.0.0.1:9091 fleet-api-address : 127.0.0.1:9092 # note [kta-intel]: local gRPC client will connect here exec-api-address : 127.0.0.1:9093 # note [kta-intel]: port for server-app toml - flwr_run_params : - flwr_app_name : "app-pytorch" - federation_name : "local-poc" + # flwr_run_params : + # flwr_app_name : "app-pytorch" + # federation_name : "local-poc" collaborator : defaults : plan/defaults/collaborator.yaml From caa0ff0f64714391cabab1697f378fea698fdbb4 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 13 Jan 2025 14:14:08 -0800 Subject: [PATCH 037/107] fix monitor subprocess Signed-off-by: kta-intel --- openfl/federated/task/runner_flower.py | 45 +++++++++++++++----------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 42a261c485..3c86d752b6 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -99,22 +99,26 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): def signal_handler(_sig, _frame): self.logger.info("Received shutdown signal. Terminating supernode process...") - # find and terminate client_app_process processes - main_subprocess = psutil.Process(supernode_process.pid) - client_app_processes = main_subprocess.children(recursive=True) - for client_app_process in client_app_processes: - client_app_process.terminate() - _, still_alive = psutil.wait_procs(client_app_processes, timeout=1) - for p in still_alive: - p.kill() - # Terminate the main process - - supernode_process.terminate() - try: - supernode_process.wait(timeout=1) - except subprocess.TimeoutExpired: - supernode_process.kill() - self.logger.info("Supernode process terminated. Shutting down gRPC server...") + if supernode_process.poll() is None: + # find and terminate client_app_process processes + main_subprocess = psutil.Process(supernode_process.pid) + client_app_processes = main_subprocess.children(recursive=True) + for client_app_process in client_app_processes: + client_app_process.terminate() + _, still_alive = psutil.wait_procs(client_app_processes, timeout=1) + for p in still_alive: + p.kill() + # Terminate the main process + supernode_process.terminate() + try: + supernode_process.wait(timeout=1) + except subprocess.TimeoutExpired: + supernode_process.kill() + self.logger.info("Supernode process terminated.") + else: + self.logger.info("Supernode process already terminated.") + + self.logger.info("Shutting down gRPC server...") server.stop(0) self.logger.info("gRPC server stopped.") termination_event.set() @@ -122,6 +126,8 @@ def signal_handler(_sig, _frame): signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) + monitor_thread = None + if self.auto_shutdown: self.logger.info("Automatic shutdown enabled. 
Monitoring subprocess activity...") @@ -130,7 +136,7 @@ def monitor_subprocesses(): previous_end_time = None intervals = [] - while True: + while not termination_event.is_set(): client_app_processes = main_subprocess.children(recursive=True) if client_app_processes: for client_app_process in client_app_processes: @@ -159,4 +165,7 @@ def monitor_subprocesses(): self.logger.info("Press CTRL+C to stop the server and supernode process.") - termination_event.wait() \ No newline at end of file + termination_event.wait() + + if monitor_thread is not None: # Ensure the monitor thread is properly terminated + monitor_thread.join() \ No newline at end of file From b91e8871aabd47d731396a35b1902183afd6d13c Mon Sep 17 00:00:00 2001 From: kta-intel Date: Tue, 14 Jan 2025 08:53:50 -0800 Subject: [PATCH 038/107] adding try excepts for subprocess shutdown Signed-off-by: kta-intel --- openfl/federated/task/runner_flower.py | 93 ++++++-------------------- 1 file changed, 22 insertions(+), 71 deletions(-) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 3c86d752b6..0f6e7283f3 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -13,108 +13,57 @@ class FlowerTaskRunner(TaskRunner): - """ - FlowerTaskRunner is a task runner that executes Flower SuperNode - to initialize the experiment from the client side. - - This class is responsible for starting a local gRPC server and a Flower SuperNode - in a subprocess. It also provides options for automatic shutdown based on subprocess - activity. - - Shutdown Options: - - Manual Shutdown: The server and supernode process can be manually stopped by pressing CTRL+C. - - Automatic Shutdown: If enabled, the system will monitor the activity of subprocesses and - automatically shut down if no new subprocess starts within a certain time frame. - """ - def __init__(self, auto_shutdown=True, **kwargs): - """ - Initializes the FlowerTaskRunner. - - Args: - auto_shutdown (bool): Whether to enable automatic shutdown based on subprocess activity. - Default is True. Set to False for long-lived components. - **kwargs: Additional parameters to pass to the functions. - """ super().__init__(**kwargs) self.logger = getLogger(__name__) self.num_partitions = self.data_loader.get_node_configs()[0] self.partition_id = self.data_loader.get_node_configs()[1] - # Define a base port number base_port = 5000 - - # Calculate the client port by adding the partition ID to the base port self.client_port = base_port + self.partition_id self.auto_shutdown = auto_shutdown def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): - """ - Starts the local gRPC server and the Flower SuperNode. - - Args: - openfl_client: The OpenFL client instance used to communicate with the OpenFL server. - collaborator_name: The name of the collaborator. - **kwargs: Additional parameters, including 'local_server_port'. - - The method performs the following steps: - 1. Starts a local gRPC server to handle communication between the OpenFL client and the Flower SuperNode. - 2. Launches the Flower SuperNode in a subprocess. - 3. Sets up signal handlers for manual shutdown (via CTRL+C). - 4. If auto_shutdown is enabled, monitors subprocess activity and initiates shutdown if no new subprocess starts within the expected time frame. - - Shutdown Process: - - When a shutdown signal (SIGINT or SIGTERM) is received, the method will: - 1. Terminate all child processes of the supernode subprocess. - 2. 
Terminate the main supernode subprocess. - 3. Stop the gRPC server. - 4. Log the shutdown process and set the termination event to stop the server. - """ local_server_port = kwargs['local_server_port'] - # Start the local gRPC server server = grpc.server(ThreadPoolExecutor(max_workers=cpu_count())) grpcadapter_pb2_grpc.add_GrpcAdapterServicer_to_server(LocalGRPCServer(openfl_client, collaborator_name), server) - - # TODO: add restrictions server.add_insecure_port(f'[::]:{local_server_port}') server.start() self.logger.info(f"OpenFL local gRPC server started, listening on port {local_server_port}.") - # Start the Flower SuperNode in a subprocess command = [ "flower-supernode", "--insecure", "--grpc-adapter", - "--superlink", f"127.0.0.1:{local_server_port}", # note [kta-intel]: this connects to local gRPC server + "--superlink", f"127.0.0.1:{local_server_port}", "--clientappio-api-address", f"127.0.0.1:{self.client_port}", "--node-config", f"num-partitions={self.num_partitions} partition-id={self.partition_id}" ] - # Start the subprocess supernode_process = subprocess.Popen(command, shell=False) - # Create an event to wait for the termination signal termination_event = threading.Event() def signal_handler(_sig, _frame): self.logger.info("Received shutdown signal. Terminating supernode process...") if supernode_process.poll() is None: - # find and terminate client_app_process processes - main_subprocess = psutil.Process(supernode_process.pid) - client_app_processes = main_subprocess.children(recursive=True) - for client_app_process in client_app_processes: - client_app_process.terminate() - _, still_alive = psutil.wait_procs(client_app_processes, timeout=1) - for p in still_alive: - p.kill() - # Terminate the main process - supernode_process.terminate() try: - supernode_process.wait(timeout=1) - except subprocess.TimeoutExpired: - supernode_process.kill() - self.logger.info("Supernode process terminated.") + main_subprocess = psutil.Process(supernode_process.pid) + client_app_processes = main_subprocess.children(recursive=True) + for client_app_process in client_app_processes: + client_app_process.terminate() + _, still_alive = psutil.wait_procs(client_app_processes, timeout=1) + for p in still_alive: + p.kill() + supernode_process.terminate() + try: + supernode_process.wait(timeout=1) + except subprocess.TimeoutExpired: + supernode_process.kill() + self.logger.info("Supernode process terminated.") + except psutil.NoSuchProcess: + self.logger.info("Supernode process already terminated.") else: self.logger.info("Supernode process already terminated.") @@ -132,7 +81,11 @@ def signal_handler(_sig, _frame): self.logger.info("Automatic shutdown enabled. Monitoring subprocess activity...") def monitor_subprocesses(): - main_subprocess = psutil.Process(supernode_process.pid) + try: + main_subprocess = psutil.Process(supernode_process.pid) + except psutil.NoSuchProcess: + return + previous_end_time = None intervals = [] @@ -145,14 +98,12 @@ def monitor_subprocesses(): if previous_end_time is not None: interval = end_time - previous_end_time intervals.append(interval) - # self.logger.info(f"Subprocess ended. Interval: {interval:.2f} seconds.") previous_end_time = end_time if previous_end_time is not None: running_timer = time.time() - previous_end_time if intervals: average_interval = sum(intervals) / len(intervals) - # self.logger.info(f"Running timer: {running_timer:.2f} seconds. 
Average interval: {average_interval:.2f} seconds.") if running_timer > 2 * average_interval: self.logger.info("No new subprocess started within the expected time. Initiating shutdown...") signal_handler(signal.SIGTERM, None) @@ -167,5 +118,5 @@ def monitor_subprocesses(): termination_event.wait() - if monitor_thread is not None: # Ensure the monitor thread is properly terminated + if monitor_thread is not None: monitor_thread.join() \ No newline at end of file From 1a0149c3bbe2596a4455302e1063246009e5733a Mon Sep 17 00:00:00 2001 From: kta-intel Date: Tue, 14 Jan 2025 10:51:02 -0800 Subject: [PATCH 039/107] termination event fix Signed-off-by: kta-intel --- openfl/federated/task/runner_flower.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 0f6e7283f3..83a4df50c3 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -116,7 +116,11 @@ def monitor_subprocesses(): self.logger.info("Press CTRL+C to stop the server and supernode process.") - termination_event.wait() + try: + while not termination_event.is_set(): + time.sleep(0.1) + except KeyboardInterrupt: + signal_handler(signal.SIGINT, None) if monitor_thread is not None: monitor_thread.join() \ No newline at end of file From d1e85d34e2cff137986e35c39741c3c8c48b5b0c Mon Sep 17 00:00:00 2001 From: kta-intel Date: Tue, 14 Jan 2025 11:10:18 -0800 Subject: [PATCH 040/107] more signal handling Signed-off-by: kta-intel --- openfl/federated/task/runner_flower.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 83a4df50c3..dddcc41b89 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -22,6 +22,7 @@ def __init__(self, auto_shutdown=True, **kwargs): base_port = 5000 self.client_port = base_port + self.partition_id self.auto_shutdown = auto_shutdown + self.shutdown_initiated = False # Flag to ensure signal handler runs only once def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): local_server_port = kwargs['local_server_port'] @@ -45,6 +46,10 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): termination_event = threading.Event() def signal_handler(_sig, _frame): + if self.shutdown_initiated: + return + self.shutdown_initiated = True + self.logger.info("Received shutdown signal. Terminating supernode process...") if supernode_process.poll() is None: @@ -123,4 +128,6 @@ def monitor_subprocesses(): signal_handler(signal.SIGINT, None) if monitor_thread is not None: - monitor_thread.join() \ No newline at end of file + monitor_thread.join() + + self.logger.info("Exiting Task Runner") \ No newline at end of file From 0f487420486d8d8b7a53fc6cf5a400bc31be927d Mon Sep 17 00:00:00 2001 From: kta-intel Date: Tue, 14 Jan 2025 12:02:43 -0800 Subject: [PATCH 041/107] docstrings Signed-off-by: kta-intel --- openfl/federated/task/runner_flower.py | 52 ++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index dddcc41b89..741b3a31bc 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -13,7 +13,28 @@ class FlowerTaskRunner(TaskRunner): + """ + FlowerTaskRunner is a task runner that executes Flower SuperNode + to initialize the experiment from the client side. 
+ + This class is responsible for starting a local gRPC server and a Flower SuperNode + in a subprocess. It also provides options for automatic shutdown based on subprocess + activity. + + Shutdown Options: + - Manual Shutdown: The server and supernode process can be manually stopped by pressing CTRL+C. + - Automatic Shutdown: If enabled, the system will monitor the activity of subprocesses and + automatically shut down if no new subprocess starts within a certain time frame. + """ def __init__(self, auto_shutdown=True, **kwargs): + """ + Initializes the FlowerTaskRunner. + + Args: + auto_shutdown (bool): Whether to enable automatic shutdown based on subprocess activity. + Default is True. Set to False for long-lived components. + **kwargs: Additional parameters to pass to the functions. + """ super().__init__(**kwargs) self.logger = getLogger(__name__) self.num_partitions = self.data_loader.get_node_configs()[0] @@ -25,6 +46,27 @@ def __init__(self, auto_shutdown=True, **kwargs): self.shutdown_initiated = False # Flag to ensure signal handler runs only once def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): + """ + Starts the local gRPC server and the Flower SuperNode. + + Args: + openfl_client: The OpenFL client instance used to communicate with the OpenFL server. + collaborator_name: The name of the collaborator. + **kwargs: Additional parameters, including 'local_server_port'. + + The method performs the following steps: + 1. Starts a local gRPC server to handle communication between the OpenFL client and the Flower SuperNode. + 2. Launches the Flower SuperNode in a subprocess. + 3. Sets up signal handlers for manual shutdown (via CTRL+C). + 4. If auto_shutdown is enabled, monitors subprocess activity and initiates shutdown if no new subprocess starts within the expected time frame. + + Shutdown Process: + - When a shutdown signal (SIGINT or SIGTERM) is received, the method will: + 1. Terminate all child processes of the supernode subprocess. + 2. Terminate the main supernode subprocess. + 3. Stop the gRPC server. + 4. Log the shutdown process and set the termination event to stop the server. + """ local_server_port = kwargs['local_server_port'] server = grpc.server(ThreadPoolExecutor(max_workers=cpu_count())) @@ -46,6 +88,13 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): termination_event = threading.Event() def signal_handler(_sig, _frame): + """ + Handles shutdown signals (SIGINT or SIGTERM) to terminate the supernode process and stop the gRPC server. + + Args: + _sig: The signal number. + _frame: The current stack frame (not used). + """ if self.shutdown_initiated: return self.shutdown_initiated = True @@ -86,6 +135,9 @@ def signal_handler(_sig, _frame): self.logger.info("Automatic shutdown enabled. Monitoring subprocess activity...") def monitor_subprocesses(): + """ + Monitors the activity of subprocesses and initiates shutdown if no new subprocess starts within the expected time frame. 
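Reduced to a predicate, the shutdown heuristic described here is a comparison against twice the observed average gap; a standalone sketch under that reading (the function name is illustrative, the real logic sits inline in `monitor_subprocesses`):

```python
import time

def should_shut_down(intervals, last_end_time):
    """True once subprocesses have been idle longer than twice the
    average gap between previous ClientApp subprocess exits."""
    if last_end_time is None or not intervals:
        return False
    average_interval = sum(intervals) / len(intervals)
    return (time.time() - last_end_time) > 2 * average_interval
```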
+ """ try: main_subprocess = psutil.Process(supernode_process.pid) except psutil.NoSuchProcess: From 7b82004fbfcb0e84c689a5286c0d15b3435f3008 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Tue, 14 Jan 2025 12:04:52 -0800 Subject: [PATCH 042/107] improve docstrings Signed-off-by: kta-intel --- openfl/federated/task/runner_flower.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 741b3a31bc..e0e1eaff81 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -89,7 +89,7 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): def signal_handler(_sig, _frame): """ - Handles shutdown signals (SIGINT or SIGTERM) to terminate the supernode process and stop the gRPC server. + Handles shutdown signals (SIGINT or SIGTERM) to terminate the supernode process and stop the local gRPC server. Args: _sig: The signal number. @@ -121,9 +121,9 @@ def signal_handler(_sig, _frame): else: self.logger.info("Supernode process already terminated.") - self.logger.info("Shutting down gRPC server...") + self.logger.info("Shutting down local gRPC server...") server.stop(0) - self.logger.info("gRPC server stopped.") + self.logger.info("local gRPC server stopped.") termination_event.set() signal.signal(signal.SIGINT, signal_handler) From 78bd7fde74237cab38cc175805eac549447fdb3a Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 16 Jan 2025 15:01:49 -0800 Subject: [PATCH 043/107] patch flower Signed-off-by: kta-intel --- .../flower-app-pytorch/plan/plan.yaml | 13 +- .../src/app-pytorch/app_pytorch/task.py | 84 +++++++--- .../flower-app-pytorch/src/patch/__init__.py | 3 + .../src/patch/flower_superlink_patch.py | 11 ++ .../src/patch/flower_supernode_patch.py | 11 ++ .../src/patch/flwr_run_patch.py | 11 ++ .../src/patch/patch_flwr_build.py | 143 ++++++++++++++++++ .../src/patch/patch_flwr_telemetry.py | 50 ++++++ .../component/interoperability/connector.py | 2 +- .../interoperability/connector_flower.py | 17 ++- openfl/federated/task/runner_flower.py | 30 ++-- 11 files changed, 332 insertions(+), 43 deletions(-) create mode 100644 openfl-workspace/flower-app-pytorch/src/patch/__init__.py create mode 100644 openfl-workspace/flower-app-pytorch/src/patch/flower_superlink_patch.py create mode 100644 openfl-workspace/flower-app-pytorch/src/patch/flower_supernode_patch.py create mode 100644 openfl-workspace/flower-app-pytorch/src/patch/flwr_run_patch.py create mode 100644 openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py create mode 100644 openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_telemetry.py diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml index 3808dfdaaa..bb37a05e15 100644 --- a/openfl-workspace/flower-app-pytorch/plan/plan.yaml +++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml @@ -13,12 +13,14 @@ connector : settings : superlink_params : insecure : True - serverappio-api-address : 127.0.0.1:9091 + serverappio-api-address : 127.0.0.1:9091 # note [kta-intel]: ServerApp will connect here fleet-api-address : 127.0.0.1:9092 # note [kta-intel]: local gRPC client will connect here - exec-api-address : 127.0.0.1:9093 # note [kta-intel]: port for server-app toml - # flwr_run_params : - # flwr_app_name : "app-pytorch" - # federation_name : "local-poc" + exec-api-address : 127.0.0.1:9093 # note [kta-intel]: port for server-app toml (for flwr run) + patch : 
True + flwr_run_params : + flwr_app_name : "app-pytorch" + federation_name : "local-poc" + patch : True collaborator : defaults : plan/defaults/collaborator.yaml @@ -35,6 +37,7 @@ task_runner : template : openfl.federated.task.runner_flower.FlowerTaskRunner settings : auto_shutdown : True + patch : True network : defaults : plan/defaults/network.yaml diff --git a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py index 4a42b7009e..1a0604b623 100644 --- a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py +++ b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py @@ -5,9 +5,9 @@ import torch import torch.nn as nn import torch.nn.functional as F -from flwr_datasets import FederatedDataset -from flwr_datasets.partitioner import IidPartitioner -from torch.utils.data import DataLoader +# from flwr_datasets import FederatedDataset # NOTE: flwr_dataset will create ~/.flwr/source +# from flwr_datasets.partitioner import IidPartitioner +from torch.utils.data import DataLoader, Dataset from torchvision.transforms import Compose, Normalize, ToTensor @@ -32,34 +32,68 @@ def forward(self, x): return self.fc3(x) -fds = None # Cache FederatedDataset - +# fds = None # Cache FederatedDataset + + +# def load_data(partition_id: int, num_partitions: int): +# """Load partition CIFAR10 data.""" +# # Only initialize `FederatedDataset` once +# global fds +# if fds is None: +# partitioner = IidPartitioner(num_partitions=num_partitions) +# fds = FederatedDataset( +# dataset="uoft-cs/cifar10", +# partitioners={"train": partitioner}, +# ) +# partition = fds.load_partition(partition_id) +# # Divide data on each node: 80% train, 20% test +# partition_train_test = partition.train_test_split(test_size=0.2, seed=42) +# pytorch_transforms = Compose( +# [ToTensor(), Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] +# ) + +# def apply_transforms(batch): +# """Apply transforms to the partition from FederatedDataset.""" +# batch["img"] = [pytorch_transforms(img) for img in batch["img"]] +# return batch + +# partition_train_test = partition_train_test.with_transform(apply_transforms) +# trainloader = DataLoader(partition_train_test["train"], batch_size=32, shuffle=True) +# testloader = DataLoader(partition_train_test["test"], batch_size=32) +# return trainloader, testloader + +class DummyDataset(Dataset): + def __init__(self, num_samples, transform=None): + self.num_samples = num_samples + self.transform = transform + self.data = torch.randn(num_samples, 3, 32, 32) # Random images + self.targets = torch.randint(0, 10, (num_samples,)) # Random labels + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + sample = {'img': self.data[idx], 'label': self.targets[idx]} + if self.transform: + sample['img'] = self.transform(sample['img']) + return sample def load_data(partition_id: int, num_partitions: int): - """Load partition CIFAR10 data.""" - # Only initialize `FederatedDataset` once - global fds - if fds is None: - partitioner = IidPartitioner(num_partitions=num_partitions) - fds = FederatedDataset( - dataset="uoft-cs/cifar10", - partitioners={"train": partitioner}, - ) - partition = fds.load_partition(partition_id) - # Divide data on each node: 80% train, 20% test - partition_train_test = partition.train_test_split(test_size=0.2, seed=42) + """Load partition dummy CIFAR10 data.""" + num_samples = 50000 // num_partitions # Assuming 50,000 samples in total + num_train_samples 
= int(num_samples * 0.8) + num_test_samples = num_samples - num_train_samples + pytorch_transforms = Compose( - [ToTensor(), Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] + [Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] ) - def apply_transforms(batch): - """Apply transforms to the partition from FederatedDataset.""" - batch["img"] = [pytorch_transforms(img) for img in batch["img"]] - return batch + train_dataset = DummyDataset(num_train_samples, transform=pytorch_transforms) + test_dataset = DummyDataset(num_test_samples, transform=pytorch_transforms) + + trainloader = DataLoader(train_dataset, batch_size=32, shuffle=True) + testloader = DataLoader(test_dataset, batch_size=32) - partition_train_test = partition_train_test.with_transform(apply_transforms) - trainloader = DataLoader(partition_train_test["train"], batch_size=32, shuffle=True) - testloader = DataLoader(partition_train_test["test"], batch_size=32) return trainloader, testloader diff --git a/openfl-workspace/flower-app-pytorch/src/patch/__init__.py b/openfl-workspace/flower-app-pytorch/src/patch/__init__.py new file mode 100644 index 0000000000..d5df5b8668 --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/src/patch/__init__.py @@ -0,0 +1,3 @@ +# Copyright (C) 2020-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""You may copy this file as the starting point of your own model.""" diff --git a/openfl-workspace/flower-app-pytorch/src/patch/flower_superlink_patch.py b/openfl-workspace/flower-app-pytorch/src/patch/flower_superlink_patch.py new file mode 100644 index 0000000000..dd6fd0e01e --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/src/patch/flower_superlink_patch.py @@ -0,0 +1,11 @@ +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) + +import src.patch.patch_flwr_telemetry + +import re +from flwr.server.app import run_superlink +if __name__ == '__main__': + sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) + sys.exit(run_superlink()) diff --git a/openfl-workspace/flower-app-pytorch/src/patch/flower_supernode_patch.py b/openfl-workspace/flower-app-pytorch/src/patch/flower_supernode_patch.py new file mode 100644 index 0000000000..9960d48cc0 --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/src/patch/flower_supernode_patch.py @@ -0,0 +1,11 @@ +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) + +import src.patch.patch_flwr_telemetry + +import re +from flwr.client.supernode.app import run_supernode +if __name__ == '__main__': + sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) + sys.exit(run_supernode()) \ No newline at end of file diff --git a/openfl-workspace/flower-app-pytorch/src/patch/flwr_run_patch.py b/openfl-workspace/flower-app-pytorch/src/patch/flwr_run_patch.py new file mode 100644 index 0000000000..694169c410 --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/src/patch/flwr_run_patch.py @@ -0,0 +1,11 @@ +import os +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) + +import src.patch.patch_flwr_build + +import re +from flwr.cli.app import app +if __name__ == '__main__': + sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) + sys.exit(app()) \ No newline at end of file diff --git a/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py b/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py new file mode 100644 index 0000000000..7f987a2583 --- /dev/null 
+++ b/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py
@@ -0,0 +1,143 @@
+import flwr.cli.build
+from flwr.cli.build import write_to_zip, get_fab_filename
+from typing import Annotated, Optional
+import typer
+from pathlib import Path
+from flwr.cli.utils import is_valid_project_name
+from flwr.cli.config_utils import load_and_validate
+# import pathspec
+import tempfile
+import zipfile
+from flwr.common.constant import FAB_ALLOWED_EXTENSIONS
+import shutil
+import tomli_w
+import hashlib
+import os
+
+def patched_build(
+    app: Annotated[
+        Optional[Path],
+        typer.Option(help="Path of the Flower App to bundle into a FAB"),
+    ] = None,
+) -> tuple[str, str]:
+    """Build a Flower App into a Flower App Bundle (FAB).
+
+    You can run ``flwr build`` without any arguments to bundle the app located in the
+    current directory. Alternatively, you can specify a path using the ``--app``
+    option to bundle an app located at the provided path. For example:
+
+    ``flwr build --app ./apps/flower-hello-world``.
+    """
+    if app is None:
+        app = Path.cwd()
+
+    app = app.resolve()
+    if not app.is_dir():
+        typer.secho(
+            f"❌ The path {app} is not a valid path to a Flower app.",
+            fg=typer.colors.RED,
+            bold=True,
+        )
+        raise typer.Exit(code=1)
+
+    if not is_valid_project_name(app.name):
+        typer.secho(
+            f"❌ The project name {app.name} is invalid, "
+            "a valid project name must start with a letter, "
+            "and can only contain letters, digits, and hyphens.",
+            fg=typer.colors.RED,
+            bold=True,
+        )
+        raise typer.Exit(code=1)
+
+    conf, errors, warnings = load_and_validate(app / "pyproject.toml")
+    if conf is None:
+        typer.secho(
+            "Project configuration could not be loaded.\npyproject.toml is invalid:\n"
+            + "\n".join([f"- {line}" for line in errors]),
+            fg=typer.colors.RED,
+            bold=True,
+        )
+        raise typer.Exit(code=1)
+
+    if warnings:
+        typer.secho(
+            "Project configuration is missing the following "
+            "recommended properties:\n" + "\n".join([f"- {line}" for line in warnings]),
+            fg=typer.colors.RED,
+            bold=True,
+        )
+
+    # Load .gitignore rules if present
+    ignore_spec = flwr.cli.build._load_gitignore(app)
+
+    list_file_content = ""
+
+    # Remove the 'federations' field from 'tool.flwr' if it exists
+    if (
+        "tool" in conf
+        and "flwr" in conf["tool"]
+        and "federations" in conf["tool"]["flwr"]
+    ):
+        del conf["tool"]["flwr"]["federations"]
+
+    toml_contents = tomli_w.dumps(conf)
+
+    with tempfile.NamedTemporaryFile(suffix=".zip", dir='./', delete=False) as temp_file:
+        temp_filename = temp_file.name
+
+        with zipfile.ZipFile(temp_filename, "w", zipfile.ZIP_DEFLATED) as fab_file:
+            write_to_zip(fab_file, "pyproject.toml", toml_contents)
+
+            # Continue with adding other files
+            all_files = [
+                f
+                for f in app.rglob("*")
+                if not ignore_spec.match_file(f)
+                and f.name != temp_filename
+                and f.suffix in FAB_ALLOWED_EXTENSIONS
+                and f.name != "pyproject.toml"  # Exclude the original pyproject.toml
+            ]
+
+            for file_path in all_files:
+                # Read the file content manually
+                with open(file_path, "rb") as f:
+                    file_contents = f.read()
+
+                archive_path = file_path.relative_to(app)
+                write_to_zip(fab_file, str(archive_path), file_contents)
+
+                # Calculate file info
+                sha256_hash = hashlib.sha256(file_contents).hexdigest()
+                file_size_bits = os.path.getsize(file_path) * 8  # size in bits
+                list_file_content += f"{archive_path},{sha256_hash},{file_size_bits}\n"
+
+            # Add CONTENT and CONTENT.jwt to the zip file
+            write_to_zip(fab_file, ".info/CONTENT", list_file_content)
+
+    # Get hash of FAB file
+    content = 
Path(temp_filename).read_bytes() + fab_hash = hashlib.sha256(content).hexdigest() + + # Set the name of the zip file + fab_filename = get_fab_filename(conf, fab_hash) + + # Once the temporary zip file is created, rename it to the final filename + shutil.move(temp_filename, fab_filename) + + typer.secho( + f"🎊 Successfully built {fab_filename}", fg=typer.colors.GREEN, bold=True + ) + + return fab_filename, fab_hash + +# def _load_gitignore(app: Path) -> pathspec.PathSpec: +# """Load and parse .gitignore file, returning a pathspec.""" +# gitignore_path = app / ".gitignore" +# patterns = ["__pycache__/"] # Default pattern +# if gitignore_path.exists(): +# with open(gitignore_path, encoding="UTF-8") as file: +# patterns.extend(file.readlines()) +# return pathspec.PathSpec.from_lines("gitwildmatch", patterns) + +flwr.cli.build.build = patched_build diff --git a/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_telemetry.py b/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_telemetry.py new file mode 100644 index 0000000000..b6210f6d15 --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_telemetry.py @@ -0,0 +1,50 @@ +import flwr.common.telemetry +# import flwr_datasets.common.telemetry +from pathlib import Path +import os +import uuid + +def patched_get_source_id() -> str: + """Get existing or new source ID.""" + source_id = "unavailable" + # Check if .flwr in home exists + + #### + if os.getenv("FLWR_HOME"): + flwr_dir = Path(os.getenv("FLWR_HOME")) + else: + try: + home = flwr.common.telemetry._get_home() + except RuntimeError: + # If the home directory can’t be resolved, RuntimeError is raised. + return source_id + + flwr_dir = home.joinpath(".flwr") + + # Create .flwr directory if it does not exist yet. + try: + flwr_dir.mkdir(parents=True, exist_ok=True) + except PermissionError: + return source_id + + source_file = flwr_dir.joinpath("source") + + # If no source_file exists create one and write it + if not source_file.exists(): + try: + source_file.touch(exist_ok=True) + source_file.write_text(str(uuid.uuid4()), encoding="utf-8") + except PermissionError: + return source_id + + source_id = source_file.read_text(encoding="utf-8").strip() + + try: + uuid.UUID(source_id) + except ValueError: + source_id = "invalid" + + return source_id + +flwr.common.telemetry._get_source_id = patched_get_source_id +# flwr_datasets.common.telemetry._get_source_id = patched_get_source_id diff --git a/openfl/component/interoperability/connector.py b/openfl/component/interoperability/connector.py index e1b14923a2..18822af8f3 100644 --- a/openfl/component/interoperability/connector.py +++ b/openfl/component/interoperability/connector.py @@ -42,7 +42,7 @@ def stop(self): main_process = psutil.Process(self._process.pid) sub_processes = main_process.children(recursive=True) for sub_process in sub_processes: - self.logger.info(f"[OpenFL Connector] Stopping server subprocess with PID: {sub_process.pid}...") + self.logger.info(f"[OpenFL Connector] Stopping server subprocess with PID: {sub_process.pid}...") sub_process.terminate() _, still_alive = psutil.wait_procs(sub_processes, timeout=1) for p in still_alive: diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py index eed2731f4b..307f1f08d6 100644 --- a/openfl/component/interoperability/connector_flower.py +++ b/openfl/component/interoperability/connector_flower.py @@ -2,6 +2,10 @@ from openfl.component.interoperability.connector import Connector from 
openfl.transport.grpc.connector.flower.local_grpc_client import LocalGRPCClient +import os +# import pdb; pdb.set_trace() +# os.environ["FLWR_HOME"] = os.path.join(os.getcwd(), "src/.flwr") + class ConnectorFlower(Connector): """ Connector subclass for the Flower framework. @@ -34,7 +38,10 @@ def _build_command(self) -> list[str]: Returns: list[str]: A list representing the Flower server start command. """ - command = ["flower-superlink", "--fleet-api-type", "grpc-adapter"] + if self.superlink_params.get("patch"): + command = ["python", "src/patch/flower_superlink_patch.py", "--fleet-api-type", "grpc-adapter"] + else: + command = ["flower-superlink", "--fleet-api-type", "grpc-adapter"] if "insecure" in self.superlink_params: if self.superlink_params["insecure"]: @@ -64,8 +71,12 @@ def _build_flwr_run_command(self) -> list[str]: """ flwr_app_name = self.flwr_run_params.get("flwr_app_name") federation_name = self.flwr_run_params.get("federation_name") - - command = ["flwr", "run", f"./src/{flwr_app_name}"] + + if self.flwr_run_params.get("patch"): + command = ["python", "src/patch/flwr_run_patch.py", "run", f"./src/{flwr_app_name}"] + else: + command = ["flwr", "run", f"./src/{flwr_app_name}"] + if federation_name: command.append(federation_name) return command diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index e0e1eaff81..9a3aa3fb7a 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -43,6 +43,7 @@ def __init__(self, auto_shutdown=True, **kwargs): base_port = 5000 self.client_port = base_port + self.partition_id self.auto_shutdown = auto_shutdown + self.patch = kwargs.get('patch') self.shutdown_initiated = False # Flag to ensure signal handler runs only once def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): @@ -67,22 +68,33 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): 3. Stop the gRPC server. 4. Log the shutdown process and set the termination event to stop the server. 
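Stripped of logging and shutdown handling, the wiring in steps 1 and 2 is compact; a condensed sketch with illustrative port numbers and worker counts:

```python
from concurrent.futures import ThreadPoolExecutor
import subprocess

import grpc
from flwr.proto import grpcadapter_pb2_grpc

def start_adapter(servicer, local_port=9095, clientappio_port=5000):
    """Serve the GrpcAdapter locally, then point a Flower SuperNode at it
    in place of a remote SuperLink."""
    server = grpc.server(ThreadPoolExecutor(max_workers=4))
    grpcadapter_pb2_grpc.add_GrpcAdapterServicer_to_server(servicer, server)
    server.add_insecure_port(f"[::]:{local_port}")
    server.start()
    supernode = subprocess.Popen([
        "flower-supernode", "--insecure", "--grpc-adapter",
        "--superlink", f"127.0.0.1:{local_port}",
        "--clientappio-api-address", f"127.0.0.1:{clientappio_port}",
    ])
    return server, supernode
```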
""" - local_server_port = kwargs['local_server_port'] + local_server_port = kwargs.get('local_server_port') server = grpc.server(ThreadPoolExecutor(max_workers=cpu_count())) grpcadapter_pb2_grpc.add_GrpcAdapterServicer_to_server(LocalGRPCServer(openfl_client, collaborator_name), server) server.add_insecure_port(f'[::]:{local_server_port}') server.start() self.logger.info(f"OpenFL local gRPC server started, listening on port {local_server_port}.") + if self.patch: + command = [ + "python", + "src/patch/flower_supernode_patch.py", + "--insecure", + "--grpc-adapter", + "--superlink", f"127.0.0.1:{local_server_port}", + "--clientappio-api-address", f"127.0.0.1:{self.client_port}", + "--node-config", f"num-partitions={self.num_partitions} partition-id={self.partition_id}" + ] + else: + command = [ + "flower-supernode", + "--insecure", + "--grpc-adapter", + "--superlink", f"127.0.0.1:{local_server_port}", + "--clientappio-api-address", f"127.0.0.1:{self.client_port}", + "--node-config", f"num-partitions={self.num_partitions} partition-id={self.partition_id}" + ] - command = [ - "flower-supernode", - "--insecure", - "--grpc-adapter", - "--superlink", f"127.0.0.1:{local_server_port}", - "--clientappio-api-address", f"127.0.0.1:{self.client_port}", - "--node-config", f"num-partitions={self.num_partitions} partition-id={self.partition_id}" - ] supernode_process = subprocess.Popen(command, shell=False) termination_event = threading.Event() From f738e49cfe364d6847820effd9eb5f9daf491611 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 16 Jan 2025 15:03:54 -0800 Subject: [PATCH 044/107] method to ctrl+c shutdown Signed-off-by: kta-intel --- openfl/component/interoperability/connector.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/openfl/component/interoperability/connector.py b/openfl/component/interoperability/connector.py index 18822af8f3..44782922e7 100644 --- a/openfl/component/interoperability/connector.py +++ b/openfl/component/interoperability/connector.py @@ -1,5 +1,7 @@ import subprocess import psutil +import signal +import sys from logging import getLogger class Connector: @@ -21,6 +23,9 @@ def __init__(self, command: list[str], component_name: str = "Base", **kwargs): self.logger = getLogger(__name__) self.component_name = component_name + # Register signal handler for clean termination + signal.signal(signal.SIGINT, self._handle_sigint) + def start(self): """ Start the server process with the provided command. @@ -68,4 +73,12 @@ def print_Connector_info(self): """ Print information indicating which Connector component is being used. """ - self.logger.info(f"OpenFL Connector Enabled: {self.component_name}") \ No newline at end of file + self.logger.info(f"OpenFL Connector Enabled: {self.component_name}") + + def _handle_sigint(self, signum, frame): + """ + Handle the SIGINT signal (Ctrl+C) to cleanly stop the server process and its children. + """ + self.logger.info("[OpenFL Connector] SIGINT received. 
Terminating server process...") + self.stop() + sys.exit(0) \ No newline at end of file From eb959b20373e84d61aa491574b55600cf476e7db Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 17 Jan 2025 06:59:23 -0800 Subject: [PATCH 045/107] add os env variable to install flower in workspace Signed-off-by: kta-intel --- openfl/component/interoperability/connector_flower.py | 3 +-- openfl/federated/task/runner_flower.py | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py index 307f1f08d6..4bf15d0c9d 100644 --- a/openfl/component/interoperability/connector_flower.py +++ b/openfl/component/interoperability/connector_flower.py @@ -3,8 +3,7 @@ from openfl.transport.grpc.connector.flower.local_grpc_client import LocalGRPCClient import os -# import pdb; pdb.set_trace() -# os.environ["FLWR_HOME"] = os.path.join(os.getcwd(), "src/.flwr") +os.environ["FLWR_HOME"] = os.path.join(os.getcwd(), "src/.flwr") class ConnectorFlower(Connector): """ diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 9a3aa3fb7a..30a2624f0a 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -11,6 +11,8 @@ import psutil import time +import os +os.environ["FLWR_HOME"] = os.path.join(os.getcwd(), "src/.flwr") class FlowerTaskRunner(TaskRunner): """ From 753ab7ce2b93f81c4dd90c0dc6cf3d121529369a Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 17 Jan 2025 10:09:17 -0800 Subject: [PATCH 046/107] save tmp and fab in flwr home Signed-off-by: kta-intel --- .../flower-app-pytorch/src/patch/patch_flwr_build.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py b/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py index 7f987a2583..7b62515c36 100644 --- a/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py +++ b/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py @@ -83,7 +83,9 @@ def patched_build( toml_contents = tomli_w.dumps(conf) - with tempfile.NamedTemporaryFile(suffix=".zip", dir='./', delete=False) as temp_file: + flwr_home = os.getenv("FLWR_HOME") + + with tempfile.NamedTemporaryFile(suffix=".zip", dir=flwr_home, delete=False) as temp_file: temp_filename = temp_file.name with zipfile.ZipFile(temp_filename, "w", zipfile.ZIP_DEFLATED) as fab_file: @@ -123,13 +125,14 @@ def patched_build( fab_filename = get_fab_filename(conf, fab_hash) # Once the temporary zip file is created, rename it to the final filename - shutil.move(temp_filename, fab_filename) + final_path = Path(flwr_home) / fab_filename + shutil.move(temp_filename, final_path) typer.secho( f"🎊 Successfully built {fab_filename}", fg=typer.colors.GREEN, bold=True ) - return fab_filename, fab_hash + return final_path, fab_hash # def _load_gitignore(app: Path) -> pathspec.PathSpec: # """Load and parse .gitignore file, returning a pathspec.""" From 806975d6eeb287cd688f67535739fa30bd20d6fb Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 17 Jan 2025 10:22:18 -0800 Subject: [PATCH 047/107] make flwr_home if it doesn't exist Signed-off-by: kta-intel --- openfl/component/interoperability/connector_flower.py | 1 + openfl/federated/task/runner_flower.py | 1 + 2 files changed, 2 insertions(+) diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py index 
4bf15d0c9d..4325e8c869 100644 --- a/openfl/component/interoperability/connector_flower.py +++ b/openfl/component/interoperability/connector_flower.py @@ -4,6 +4,7 @@ import os os.environ["FLWR_HOME"] = os.path.join(os.getcwd(), "src/.flwr") +os.makedirs(os.environ["FLWR_HOME"], exist_ok=True) class ConnectorFlower(Connector): """ diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 30a2624f0a..29a4a304f7 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -13,6 +13,7 @@ import os os.environ["FLWR_HOME"] = os.path.join(os.getcwd(), "src/.flwr") +os.makedirs(os.environ["FLWR_HOME"], exist_ok=True) class FlowerTaskRunner(TaskRunner): """ From 8bb0740d66341966844ca848c87431d72ac33eae Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 17 Jan 2025 14:36:41 -0800 Subject: [PATCH 048/107] fixes to patch Signed-off-by: kta-intel --- .../src/patch/patch_flwr_build.py | 31 ++++++++++--------- .../src/patch/patch_flwr_telemetry.py | 10 +++--- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py b/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py index 7b62515c36..315151680b 100644 --- a/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py +++ b/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py @@ -5,7 +5,6 @@ from pathlib import Path from flwr.cli.utils import is_valid_project_name from flwr.cli.config_utils import load_and_validate -# import pathspec import tempfile import zipfile from flwr.common.constant import FAB_ALLOWED_EXTENSIONS @@ -14,7 +13,7 @@ import hashlib import os -def patched_build( +def build( app: Annotated[ Optional[Path], typer.Option(help="Path of the Flower App to bundle into a FAB"), @@ -83,9 +82,13 @@ def patched_build( toml_contents = tomli_w.dumps(conf) + ### PATCH ### + # REASONING: original code writes to /tmp/ by default. Writing to flwr_home allows us to consolidate written files + # This is useful for running in an SGX enclave with Gramine since we need to strictly control allowed/trusted files flwr_home = os.getenv("FLWR_HOME") - with tempfile.NamedTemporaryFile(suffix=".zip", dir=flwr_home, delete=False) as temp_file: + ############# + temp_filename = temp_file.name with zipfile.ZipFile(temp_filename, "w", zipfile.ZIP_DEFLATED) as fab_file: @@ -100,7 +103,11 @@ def patched_build( and f.suffix in FAB_ALLOWED_EXTENSIONS and f.name != "pyproject.toml" # Exclude the original pyproject.toml ] - + ### PATCH ### + # REASONING: order matters for creating a hash. This will force consistent ordering of files + # For SGX, to distribute the FAB pre-experiment, the hash must be consistent on all systems + all_files.sort() + ############# for file_path in all_files: # Read the file content manually with open(file_path, "rb") as f: @@ -124,8 +131,12 @@ def patched_build( # Set the name of the zip file fab_filename = get_fab_filename(conf, fab_hash) - # Once the temporary zip file is created, rename it to the final filename + ### PATCH ### + # REASONING: original code writes to /tmp/ by default. 
Writing to flwr_home allows us to consolidate written files + # Also, return final_path final_path = Path(flwr_home) / fab_filename + ############# + shutil.move(temp_filename, final_path) typer.secho( @@ -134,13 +145,5 @@ def patched_build( return final_path, fab_hash -# def _load_gitignore(app: Path) -> pathspec.PathSpec: -# """Load and parse .gitignore file, returning a pathspec.""" -# gitignore_path = app / ".gitignore" -# patterns = ["__pycache__/"] # Default pattern -# if gitignore_path.exists(): -# with open(gitignore_path, encoding="UTF-8") as file: -# patterns.extend(file.readlines()) -# return pathspec.PathSpec.from_lines("gitwildmatch", patterns) -flwr.cli.build.build = patched_build +flwr.cli.build.build = build diff --git a/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_telemetry.py b/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_telemetry.py index b6210f6d15..687dae43bd 100644 --- a/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_telemetry.py +++ b/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_telemetry.py @@ -1,17 +1,18 @@ import flwr.common.telemetry -# import flwr_datasets.common.telemetry from pathlib import Path import os import uuid -def patched_get_source_id() -> str: +def _get_source_id() -> str: """Get existing or new source ID.""" source_id = "unavailable" # Check if .flwr in home exists - #### + ### PATCH ### + # REASONING: consolidate written file locations if os.getenv("FLWR_HOME"): flwr_dir = Path(os.getenv("FLWR_HOME")) + ############# else: try: home = flwr.common.telemetry._get_home() @@ -46,5 +47,4 @@ def patched_get_source_id() -> str: return source_id -flwr.common.telemetry._get_source_id = patched_get_source_id -# flwr_datasets.common.telemetry._get_source_id = patched_get_source_id +flwr.common.telemetry._get_source_id = _get_source_id From cea7bd9fdce2a66a12c5e40ce1ed15b540b86a3c Mon Sep 17 00:00:00 2001 From: kta-intel Date: Wed, 22 Jan 2025 08:28:04 -0800 Subject: [PATCH 049/107] monitor output instead of subprocess calls Signed-off-by: kta-intel --- openfl/federated/task/runner_flower.py | 85 ++++++++++++++++---------- 1 file changed, 53 insertions(+), 32 deletions(-) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 29a4a304f7..193cd6b39a 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -62,7 +62,7 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): 1. Starts a local gRPC server to handle communication between the OpenFL client and the Flower SuperNode. 2. Launches the Flower SuperNode in a subprocess. 3. Sets up signal handlers for manual shutdown (via CTRL+C). - 4. If auto_shutdown is enabled, monitors subprocess activity and initiates shutdown if no new subprocess starts within the expected time frame. + 4. If auto_shutdown is enabled, monitors run activity and initiates shutdown if no new runs start within the expected time frame. 
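Step 4 now watches the SuperNode's piped output rather than waiting on child processes; the reading pattern it relies on, sketched here with a stand-in command that emits the two marker strings:

```python
import subprocess

proc = subprocess.Popen(
    ["python", "-c", "print('RUN 1'); print('Sent reply')"],  # stand-in command
    stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
)
for raw in iter(proc.stdout.readline, b""):
    line = raw.decode("utf-8").strip()
    print(line)                 # echo to the terminal, as the runner does
    if "Sent reply" in line:
        pass                    # a run finished: start the idle timer
    elif "RUN" in line:
        pass                    # a new run began: record the interval, reset
```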
Shutdown Process:
        - When a shutdown signal (SIGINT or SIGTERM) is received, the method will:
@@ -98,7 +98,10 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs):
             "--node-config", f"num-partitions={self.num_partitions} partition-id={self.partition_id}"
         ]

-        supernode_process = subprocess.Popen(command, shell=False)
+        if self.auto_shutdown:
+            supernode_process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False)
+        else:
+            supernode_process = subprocess.Popen(command, shell=False)

         termination_event = threading.Event()

@@ -147,43 +150,61 @@ def signal_handler(_sig, _frame):
         monitor_thread = None

         if self.auto_shutdown:
-            self.logger.info("Automatic shutdown enabled. Monitoring subprocess activity...")
+            self.logger.info("Automatic shutdown enabled. Monitoring runs...")

-            def monitor_subprocesses():
+            def monitor_runs():
                 """
-                Monitors the activity of subprocesses and initiates shutdown if no new subprocess starts within the expected time frame.
+                Monitors the activity of the runs and initiates shutdown if no new run starts within the expected time frame.
                 """
-                try:
-                    main_subprocess = psutil.Process(supernode_process.pid)
-                except psutil.NoSuchProcess:
-                    return
-
-                previous_end_time = None
-                intervals = []
-
-                while not termination_event.is_set():
-                    client_app_processes = main_subprocess.children(recursive=True)
-                    if client_app_processes:
-                        for client_app_process in client_app_processes:
-                            client_app_process.wait()
-                            end_time = time.time()
-                            if previous_end_time is not None:
-                                interval = end_time - previous_end_time
-                                intervals.append(interval)
-                            previous_end_time = end_time
-
-                    if previous_end_time is not None:
-                        running_timer = time.time() - previous_end_time
-                        if intervals:
-                            average_interval = sum(intervals) / len(intervals)
-                            if running_timer > 2 * average_interval:
-                                self.logger.info("No new subprocess started within the expected time. Initiating shutdown...")
+                start_time = None
+                total_time = 0
+                run_count = 0
+                average_time = 0
+                checking_time = False
+
+                def check_time():
+                    nonlocal start_time, average_time, checking_time
+                    while True:
+                        if checking_time and start_time is not None:
+                            elapsed_time = time.time() - start_time
+                            if elapsed_time > 2 * average_time and average_time > 0:
+                                self.logger.info("No new run started within the expected time. 
Initiating shutdown...") signal_handler(signal.SIGTERM, None) return + time.sleep(1) # Check every second - time.sleep(1) + # Start a thread to continuously check the elapsed time + time_check_thread = threading.Thread(target=check_time) + time_check_thread.daemon = True + time_check_thread.start() - monitor_thread = threading.Thread(target=monitor_subprocesses) + while not termination_event.is_set(): + output = supernode_process.stdout.readline() + if output == b'' and supernode_process.poll() is not None: + break + if output: + decoded_output = output.decode('utf-8').strip() + print(decoded_output) # Print the output to the terminal + + # Check for RUN message + if "RUN" in decoded_output: + if start_time is not None: + end_time = time.time() + run_time = end_time - start_time + total_time += run_time + run_count += 1 + average_time = total_time / run_count + + # Reset the start time for the new RUN + start_time = end_time + checking_time = False # Stop checking as a new RUN has started + + # Start the timer after "Sent reply" is detected + if "Sent reply" in decoded_output: + start_time = time.time() + checking_time = True # Start checking the elapsed time + + monitor_thread = threading.Thread(target=monitor_runs) monitor_thread.start() self.logger.info("Press CTRL+C to stop the server and supernode process.") From 3d750ef538ce22e8bf57dd143861ee0b8d4edb0e Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 23 Jan 2025 11:05:53 -0800 Subject: [PATCH 050/107] add exceptions for graceful shutdown to forcefully terminate Signed-off-by: kta-intel --- .../component/interoperability/connector.py | 38 +++--- openfl/federated/task/runner_flower.py | 125 ++++++++---------- 2 files changed, 77 insertions(+), 86 deletions(-) diff --git a/openfl/component/interoperability/connector.py b/openfl/component/interoperability/connector.py index 44782922e7..7ee463d0fd 100644 --- a/openfl/component/interoperability/connector.py +++ b/openfl/component/interoperability/connector.py @@ -42,24 +42,30 @@ def stop(self): Stop the server process if it is running. 
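The hunk below wraps the whole teardown in a single try/except so that any failure along the graceful path falls through to a plain kill(). Stripped of logging, the shape it converges on looks roughly like this (a sketch, not the exact method body; the helper name is illustrative):

```python
import subprocess
import psutil

def stop_process_tree(process: subprocess.Popen, timeout: float = 1.0) -> None:
    """Terminate a process and all of its children, escalating to kill()."""
    try:
        children = psutil.Process(process.pid).children(recursive=True)
        for child in children:
            child.terminate()  # polite SIGTERM to every child first
        _, alive = psutil.wait_procs(children, timeout=timeout)
        for child in alive:
            child.kill()  # escalate for children that ignored SIGTERM
        process.terminate()
        process.wait(timeout=timeout)
    except Exception:
        # Graceful path failed (process already gone, wait timed out, ...):
        # fall back to an unconditional kill, as the hunk below does.
        process.kill()
```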
""" if self._process: - self.logger.info(f"[OpenFL Connector] Stopping server process with PID: {self._process.pid}...") - # find and terminate sub_process processes - main_process = psutil.Process(self._process.pid) - sub_processes = main_process.children(recursive=True) - for sub_process in sub_processes: - self.logger.info(f"[OpenFL Connector] Stopping server subprocess with PID: {sub_process.pid}...") - sub_process.terminate() - _, still_alive = psutil.wait_procs(sub_processes, timeout=1) - for p in still_alive: - p.kill() - # Terminate the main process - self._process.terminate() try: - self._process.wait(timeout=1) - except subprocess.TimeoutExpired: + self.logger.info(f"[OpenFL Connector] Stopping server process with PID: {self._process.pid}...") + # find and terminate sub_process processes + main_process = psutil.Process(self._process.pid) + sub_processes = main_process.children(recursive=True) + for sub_process in sub_processes: + self.logger.info(f"[OpenFL Connector] Stopping server subprocess with PID: {sub_process.pid}...") + sub_process.terminate() + _, still_alive = psutil.wait_procs(sub_processes, timeout=1) + for p in still_alive: + p.kill() + # Terminate the main process + self._process.terminate() + try: + self._process.wait(timeout=1) + except subprocess.TimeoutExpired: + self._process.kill() + self._process = None + self.logger.info("[OpenFL Connector] Server process stopped.") + except Exception as e: + self.logger.error(f"Error during graceful shutdown: {e}") + self.logger.info("Attempting forceful termination of superlink process...") self._process.kill() - self._process = None - self.logger.info("[OpenFL Connector] Server process stopped.") + self.logger.info("Superlink process forcefully terminated.") else: self.logger.info("[OpenFL Connector] No server process is currently running.") diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 193cd6b39a..f098dec419 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -62,7 +62,7 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): 1. Starts a local gRPC server to handle communication between the OpenFL client and the Flower SuperNode. 2. Launches the Flower SuperNode in a subprocess. 3. Sets up signal handlers for manual shutdown (via CTRL+C). - 4. If auto_shutdown is enabled, monitors run activity and initiates shutdown if no new runs start within the expected time frame. + 4. If auto_shutdown is enabled, monitors run activity and initiates shutdown if no new subprocesses start within the expected time frame. Shutdown Process: - When a shutdown signal (SIGINT or SIGTERM) is received, the method will: @@ -98,10 +98,7 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): "--node-config", f"num-partitions={self.num_partitions} partition-id={self.partition_id}" ] - if self.auto_shutdown: - supernode_process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False) - else: - supernode_process = subprocess.Popen(command, shell=False) + supernode_process = subprocess.Popen(command, shell=False) termination_event = threading.Event() @@ -119,25 +116,31 @@ def signal_handler(_sig, _frame): self.logger.info("Received shutdown signal. 
Terminating supernode process...") - if supernode_process.poll() is None: - try: - main_subprocess = psutil.Process(supernode_process.pid) - client_app_processes = main_subprocess.children(recursive=True) - for client_app_process in client_app_processes: - client_app_process.terminate() - _, still_alive = psutil.wait_procs(client_app_processes, timeout=1) - for p in still_alive: - p.kill() - supernode_process.terminate() + try: + if supernode_process.poll() is None: try: - supernode_process.wait(timeout=1) - except subprocess.TimeoutExpired: - supernode_process.kill() - self.logger.info("Supernode process terminated.") - except psutil.NoSuchProcess: + main_subprocess = psutil.Process(supernode_process.pid) + client_app_processes = main_subprocess.children(recursive=True) + for client_app_process in client_app_processes: + client_app_process.terminate() + _, still_alive = psutil.wait_procs(client_app_processes, timeout=1) + for p in still_alive: + p.kill() + supernode_process.terminate() + try: + supernode_process.wait(timeout=1) + except subprocess.TimeoutExpired: + supernode_process.kill() + self.logger.info("Supernode process terminated.") + except psutil.NoSuchProcess: + self.logger.info("Supernode process already terminated.") + else: self.logger.info("Supernode process already terminated.") - else: - self.logger.info("Supernode process already terminated.") + except Exception as e: + self.logger.error(f"Error during graceful shutdown: {e}") + self.logger.info("Attempting forceful termination of supernode process...") + supernode_process.kill() + self.logger.info("Supernode process forcefully terminated.") self.logger.info("Shutting down local gRPC server...") server.stop(0) @@ -150,61 +153,43 @@ def signal_handler(_sig, _frame): monitor_thread = None if self.auto_shutdown: - self.logger.info("Automatic shutdown enabled. Monitoring runs...") + self.logger.info("Automatic shutdown enabled. Monitoring subprocess activity...") - def monitor_runs(): + def monitor_subprocesses(): """ - Monitors the activity of the runs and initiates shutdown if no new run starts within the expected time frame. + Monitors the activity of subprocesses and initiates shutdown if no new subprocess starts within the expected time frame. """ - start_time = None - total_time = 0 - run_count = 0 - average_time = 0 - checking_time = False - - def check_time(): - nonlocal start_time, average_time, checking_time - while True: - if checking_time and start_time is not None: - elapsed_time = time.time() - start_time - if elapsed_time > 2 * average_time and average_time > 0: - self.logger.info("No new run started started within the expected time. Initiating shutdown...") + try: + main_subprocess = psutil.Process(supernode_process.pid) + except psutil.NoSuchProcess: + return + + previous_end_time = None + intervals = [] + + while not termination_event.is_set(): + client_app_processes = main_subprocess.children(recursive=True) + if client_app_processes: + for client_app_process in client_app_processes: + client_app_process.wait() + end_time = time.time() + if previous_end_time is not None: + interval = end_time - previous_end_time + intervals.append(interval) + previous_end_time = end_time + + if previous_end_time is not None: + running_timer = time.time() - previous_end_time + if intervals: + average_interval = sum(intervals) / len(intervals) + if running_timer > 2 * average_interval: + self.logger.info("No new subprocess started within the expected time. 
Initiating shutdown...") signal_handler(signal.SIGTERM, None) return - time.sleep(1) # Check every second - # Start a thread to continuously check the elapsed time - time_check_thread = threading.Thread(target=check_time) - time_check_thread.daemon = True - time_check_thread.start() + time.sleep(1) - while not termination_event.is_set(): - output = supernode_process.stdout.readline() - if output == b'' and supernode_process.poll() is not None: - break - if output: - decoded_output = output.decode('utf-8').strip() - print(decoded_output) # Print the output to the terminal - - # Check for RUN message - if "RUN" in decoded_output: - if start_time is not None: - end_time = time.time() - run_time = end_time - start_time - total_time += run_time - run_count += 1 - average_time = total_time / run_count - - # Reset the start time for the new RUN - start_time = end_time - checking_time = False # Stop checking as a new RUN has started - - # Start the timer after "Sent reply" is detected - if "Sent reply" in decoded_output: - start_time = time.time() - checking_time = True # Start checking the elapsed time - - monitor_thread = threading.Thread(target=monitor_runs) + monitor_thread = threading.Thread(target=monitor_subprocesses) monitor_thread.start() self.logger.info("Press CTRL+C to stop the server and supernode process.") From 5fe3c37493c0db7627de247556c1e2327b0383ef Mon Sep 17 00:00:00 2001 From: kta-intel Date: Tue, 28 Jan 2025 09:02:02 -0800 Subject: [PATCH 051/107] updating auto shutdown mechanism to shut off supernode and local grpc server after final validation round completes --- .../component/interoperability/connector.py | 7 +- .../interoperability/connector_flower.py | 3 + openfl/federated/task/runner_flower.py | 82 ++++++------------- .../connector/flower/deserialize_message.py | 37 +++++++++ .../connector/flower/local_grpc_client.py | 1 + .../connector/flower/local_grpc_server.py | 27 +++--- 6 files changed, 84 insertions(+), 73 deletions(-) diff --git a/openfl/component/interoperability/connector.py b/openfl/component/interoperability/connector.py index 7ee463d0fd..db2e1db135 100644 --- a/openfl/component/interoperability/connector.py +++ b/openfl/component/interoperability/connector.py @@ -45,6 +45,7 @@ def stop(self): try: self.logger.info(f"[OpenFL Connector] Stopping server process with PID: {self._process.pid}...") # find and terminate sub_process processes + # main_process = psutil.Process('error') main_process = psutil.Process(self._process.pid) sub_processes = main_process.children(recursive=True) for sub_process in sub_processes: @@ -61,11 +62,9 @@ def stop(self): self._process.kill() self._process = None self.logger.info("[OpenFL Connector] Server process stopped.") - except Exception as e: - self.logger.error(f"Error during graceful shutdown: {e}") - self.logger.info("Attempting forceful termination of superlink process...") + except Exception: self._process.kill() - self.logger.info("Superlink process forcefully terminated.") + self.logger.info("[OpenFL Connector] Server process forcefully terminated.") else: self.logger.info("[OpenFL Connector] No server process is currently running.") diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py index 4325e8c869..4900ea1314 100644 --- a/openfl/component/interoperability/connector_flower.py +++ b/openfl/component/interoperability/connector_flower.py @@ -29,6 +29,7 @@ def __init__(self, superlink_params: dict, flwr_run_params: dict = None, **kwarg 
self.flwr_run_command = self._build_flwr_run_command() if flwr_run_params else None self.flwr_run_process = None + # import pdb; pdb.set_trace() def _build_command(self) -> list[str]: """ @@ -79,6 +80,8 @@ def _build_flwr_run_command(self) -> list[str]: if federation_name: command.append(federation_name) + + # import pdb; pdb.set_trace() return command def start(self): diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index f098dec419..93e94636c9 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -1,3 +1,4 @@ +import threading import grpc from concurrent.futures import ThreadPoolExecutor from flwr.proto import grpcadapter_pb2_grpc @@ -7,11 +8,10 @@ import subprocess from logging import getLogger import signal -import threading import psutil import time - import os + os.environ["FLWR_HOME"] = os.path.join(os.getcwd(), "src/.flwr") os.makedirs(os.environ["FLWR_HOME"], exist_ok=True) @@ -48,6 +48,7 @@ def __init__(self, auto_shutdown=True, **kwargs): self.auto_shutdown = auto_shutdown self.patch = kwargs.get('patch') self.shutdown_initiated = False # Flag to ensure signal handler runs only once + self.shutdown_requested = False def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): """ @@ -73,11 +74,23 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): """ local_server_port = kwargs.get('local_server_port') + def message_callback(): + """ + Callback function to handle messaging events. + If auto_shutdown is enabled, logs a message indicating that the final reply + has been sent and triggers the SIGTERM signal handler to initiate shutdown. + """ + # self.logger.info("Final reply sent") + self.shutdown_requested = True + server = grpc.server(ThreadPoolExecutor(max_workers=cpu_count())) - grpcadapter_pb2_grpc.add_GrpcAdapterServicer_to_server(LocalGRPCServer(openfl_client, collaborator_name), server) + grpcadapter_pb2_grpc.add_GrpcAdapterServicer_to_server( + LocalGRPCServer(openfl_client, collaborator_name, message_callback), server + ) server.add_insecure_port(f'[::]:{local_server_port}') server.start() self.logger.info(f"OpenFL local gRPC server started, listening on port {local_server_port}.") + if self.patch: command = [ "python", @@ -105,17 +118,15 @@ def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): def signal_handler(_sig, _frame): """ Handles shutdown signals (SIGINT or SIGTERM) to terminate the supernode process and stop the local gRPC server. - Args: _sig: The signal number. _frame: The current stack frame (not used). """ if self.shutdown_initiated: + # Avoid running the shutdown process multiple times return self.shutdown_initiated = True - self.logger.info("Received shutdown signal. 
Terminating supernode process...") - try: if supernode_process.poll() is None: try: @@ -123,12 +134,12 @@ def signal_handler(_sig, _frame): client_app_processes = main_subprocess.children(recursive=True) for client_app_process in client_app_processes: client_app_process.terminate() - _, still_alive = psutil.wait_procs(client_app_processes, timeout=1) + _, still_alive = psutil.wait_procs(client_app_processes, timeout=1) for p in still_alive: p.kill() supernode_process.terminate() try: - supernode_process.wait(timeout=1) + supernode_process.wait(timeout=1) except subprocess.TimeoutExpired: supernode_process.kill() self.logger.info("Supernode process terminated.") @@ -136,9 +147,7 @@ def signal_handler(_sig, _frame): self.logger.info("Supernode process already terminated.") else: self.logger.info("Supernode process already terminated.") - except Exception as e: - self.logger.error(f"Error during graceful shutdown: {e}") - self.logger.info("Attempting forceful termination of supernode process...") + except Exception: supernode_process.kill() self.logger.info("Supernode process forcefully terminated.") @@ -150,57 +159,12 @@ def signal_handler(_sig, _frame): signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) - monitor_thread = None - - if self.auto_shutdown: - self.logger.info("Automatic shutdown enabled. Monitoring subprocess activity...") - - def monitor_subprocesses(): - """ - Monitors the activity of subprocesses and initiates shutdown if no new subprocess starts within the expected time frame. - """ - try: - main_subprocess = psutil.Process(supernode_process.pid) - except psutil.NoSuchProcess: - return - - previous_end_time = None - intervals = [] - - while not termination_event.is_set(): - client_app_processes = main_subprocess.children(recursive=True) - if client_app_processes: - for client_app_process in client_app_processes: - client_app_process.wait() - end_time = time.time() - if previous_end_time is not None: - interval = end_time - previous_end_time - intervals.append(interval) - previous_end_time = end_time - - if previous_end_time is not None: - running_timer = time.time() - previous_end_time - if intervals: - average_interval = sum(intervals) / len(intervals) - if running_timer > 2 * average_interval: - self.logger.info("No new subprocess started within the expected time. Initiating shutdown...") - signal_handler(signal.SIGTERM, None) - return - - time.sleep(1) - - monitor_thread = threading.Thread(target=monitor_subprocesses) - monitor_thread.start() - self.logger.info("Press CTRL+C to stop the server and supernode process.") try: while not termination_event.is_set(): + if self.shutdown_requested: + signal_handler(signal.SIGTERM, None) time.sleep(0.1) except KeyboardInterrupt: - signal_handler(signal.SIGINT, None) - - if monitor_thread is not None: - monitor_thread.join() - - self.logger.info("Exiting Task Runner") \ No newline at end of file + signal_handler(signal.SIGINT, None) \ No newline at end of file diff --git a/openfl/transport/grpc/connector/flower/deserialize_message.py b/openfl/transport/grpc/connector/flower/deserialize_message.py index 46f37e8acc..886b77404b 100644 --- a/openfl/transport/grpc/connector/flower/deserialize_message.py +++ b/openfl/transport/grpc/connector/flower/deserialize_message.py @@ -1,6 +1,39 @@ import importlib +# import os from google.protobuf.message import DecodeError +# def get_next_log_filename(log_dir='./logs'): +# """ +# Get the next log filename based on the existing log files in the directory. 
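Alongside these commented-out logging helpers, the live entry point of this file resolves the protobuf class dynamically before parsing, which is what lets one adapter handle any Flower message type. A condensed sketch of that mechanism (module and class names are passed explicitly here purely for illustration; the real function reads them from the Flower message itself):

```python
import importlib

from google.protobuf.message import DecodeError

def deserialize_by_name(module_name: str, class_name: str, payload: bytes):
    """Resolve a protobuf message class by name and parse the payload."""
    module = importlib.import_module(module_name)  # e.g. a flwr.proto *_pb2 module
    message_cls = getattr(module, class_name)
    message = message_cls()
    try:
        message.ParseFromString(payload)
    except DecodeError as e:
        print(f"Failed to deserialize message content. Error: {e}")
        return None
    return message
```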
+ +# Args: +# log_dir: The directory where log files are stored. + +# Returns: +# The next log filename. +# """ +# if not os.path.exists(log_dir): +# os.makedirs(log_dir) + +# existing_logs = [f for f in os.listdir(log_dir) if f.endswith('.log')] +# if not existing_logs: +# return os.path.join(log_dir, '1.log') + +# existing_numbers = [int(f.split('.')[0]) for f in existing_logs] +# next_number = max(existing_numbers) + 1 +# return os.path.join(log_dir, f'{next_number}.log') + +# def save_message_to_log(message, log_filename): +# """ +# Save the message to a log file. + +# Args: +# message: The message object to save. +# log_filename: The filename of the log file. +# """ +# with open(log_filename, 'w') as log_file: +# log_file.write(str(message)) + def deserialize_flower_message(flower_message): """ Deserialize the grpc_message_content of a Flower message using the module and class name @@ -38,4 +71,8 @@ def deserialize_flower_message(flower_message): print(f"Failed to deserialize message content. Error: {e}") return None + # # Save the message to a log file + # log_filename = get_next_log_filename() + # save_message_to_log(message, log_filename) + # print(f"Message saved to {log_filename}") return message \ No newline at end of file diff --git a/openfl/transport/grpc/connector/flower/local_grpc_client.py b/openfl/transport/grpc/connector/flower/local_grpc_client.py index b56355b63f..0746a7d6d8 100644 --- a/openfl/transport/grpc/connector/flower/local_grpc_client.py +++ b/openfl/transport/grpc/connector/flower/local_grpc_client.py @@ -1,6 +1,7 @@ import grpc from flwr.proto import grpcadapter_pb2_grpc from openfl.transport.grpc.connector.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message +from openfl.transport.grpc.connector.flower.deserialize_message import deserialize_flower_message class LocalGRPCClient: """ diff --git a/openfl/transport/grpc/connector/flower/local_grpc_server.py b/openfl/transport/grpc/connector/flower/local_grpc_server.py index ad93883627..1d34456a00 100644 --- a/openfl/transport/grpc/connector/flower/local_grpc_server.py +++ b/openfl/transport/grpc/connector/flower/local_grpc_server.py @@ -1,40 +1,41 @@ import threading import queue +import grpc from flwr.proto import grpcadapter_pb2_grpc from openfl.transport.grpc.connector.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message +from openfl.transport.grpc.connector.flower.deserialize_message import deserialize_flower_message class LocalGRPCServer(grpcadapter_pb2_grpc.GrpcAdapterServicer): - """ + """ LocalGRPCServer is a gRPC server that handles requests from the Flower SuperNode and forwards them to the OpenFL Client. It uses a queue-based system to ensure that requests are processed sequentially, preventing concurrent request handling issues. """ - def __init__(self, openfl_client, collaborator_name): + def __init__(self, openfl_client, collaborator_name, message_callback): """ Initialize. Args: openfl_client: An instance of the OpenFL Client. collaborator_name: The name of the collaborator. + message_callback: A callback function to be called when a specific message is received. 
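The servicer below pairs every incoming request with its own single-slot response queue, so concurrent gRPC handlers block only on their own reply while one worker thread preserves arrival order. The pattern in isolation (a toy `process_fn` stands in for the OpenFL round trip; the class name is illustrative):

```python
import queue
import threading

class SequentialHandler:
    """Serialize request handling through one worker thread."""

    def __init__(self, process_fn):
        self._process_fn = process_fn
        self._requests = queue.Queue()
        threading.Thread(target=self._worker, daemon=True).start()

    def handle(self, request):
        response_q = queue.Queue(maxsize=1)
        self._requests.put((request, response_q))
        return response_q.get()  # block until the worker answers this request

    def _worker(self):
        while True:
            request, response_q = self._requests.get()
            response_q.put(self._process_fn(request))
            self._requests.task_done()

# Example: SequentialHandler(str.upper).handle("ping") returns "PING"
```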
""" self.openfl_client = openfl_client self.collaborator_name = collaborator_name + self.message_callback = message_callback self.request_queue = queue.Queue() self.processing_thread = threading.Thread(target=self.process_queue) self.processing_thread.daemon = True self.processing_thread.start() + self.shutting_down = False # Flag to indicate if the server is shutting down def SendReceive(self, request, context): - """ - Handles incoming gRPC requests by putting them into the request queue - and waiting for the response. - + """ Handles incoming gRPC requests by putting them into the request queue and waiting for the response. Args: request: The incoming gRPC request. context: The gRPC context. - Returns: The response from the OpenFL server. """ @@ -50,12 +51,18 @@ def process_queue(self): """ while True: request, response_queue = self.request_queue.get() - request = flower_to_openfl_message(request, header=None) + deserialized_message = deserialize_flower_message(request) + openfl_request = flower_to_openfl_message(request, header=None) # Send request to the OpenFL server - openfl_response = self.openfl_client.send_message_to_server(request, self.collaborator_name) - + openfl_response = self.openfl_client.send_message_to_server(openfl_request, self.collaborator_name) # Send response to Flower client flower_response = openfl_to_flower_message(openfl_response) + # Check for the specific conditions + if hasattr(deserialized_message, 'task_res_list'): + for task_res in deserialized_message.task_res_list: + # TODO: this needs to be able to be set by the plan or the toml, not hard coded in the local grpc server + if task_res.group_id == "3" and task_res.task.task_type == "evaluate": + self.message_callback() response_queue.put(flower_response) self.request_queue.task_done() \ No newline at end of file From eddb02835d16a1b10a652e0c87da0299aa06f0f0 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Tue, 28 Jan 2025 11:15:11 -0800 Subject: [PATCH 052/107] update to run shut down command from server Signed-off-by: kta-intel --- .../flower-app-pytorch/plan/plan.yaml | 2 +- .../component/interoperability/connector.py | 4 +- .../interoperability/connector_flower.py | 41 ++++++++++++++----- openfl/federated/task/runner_flower.py | 11 ++--- .../connector/flower/local_grpc_client.py | 14 ++++++- .../connector/flower/local_grpc_server.py | 14 +++---- .../connector/flower/message_conversion.py | 4 +- 7 files changed, 61 insertions(+), 29 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml index bb37a05e15..0310312085 100644 --- a/openfl-workspace/flower-app-pytorch/plan/plan.yaml +++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml @@ -11,6 +11,7 @@ connector : defaults : plan/defaults/connector.yaml template : openfl.component.ConnectorFlower settings : + flwr_app_name : "app-pytorch" superlink_params : insecure : True serverappio-api-address : 127.0.0.1:9091 # note [kta-intel]: ServerApp will connect here @@ -18,7 +19,6 @@ connector : exec-api-address : 127.0.0.1:9093 # note [kta-intel]: port for server-app toml (for flwr run) patch : True flwr_run_params : - flwr_app_name : "app-pytorch" federation_name : "local-poc" patch : True diff --git a/openfl/component/interoperability/connector.py b/openfl/component/interoperability/connector.py index db2e1db135..4b6c05e727 100644 --- a/openfl/component/interoperability/connector.py +++ b/openfl/component/interoperability/connector.py @@ -45,7 +45,6 @@ def stop(self): try: 
self.logger.info(f"[OpenFL Connector] Stopping server process with PID: {self._process.pid}...") # find and terminate sub_process processes - # main_process = psutil.Process('error') main_process = psutil.Process(self._process.pid) sub_processes = main_process.children(recursive=True) for sub_process in sub_processes: @@ -62,7 +61,8 @@ def stop(self): self._process.kill() self._process = None self.logger.info("[OpenFL Connector] Server process stopped.") - except Exception: + except Exception as e: + self.logger.debug(f"[OpenFL Connector] Error during graceful shutdown: {e}") self._process.kill() self.logger.info("[OpenFL Connector] Server process forcefully terminated.") else: diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py index 4900ea1314..0b8eaffbdd 100644 --- a/openfl/component/interoperability/connector_flower.py +++ b/openfl/component/interoperability/connector_flower.py @@ -1,4 +1,5 @@ import subprocess +import toml from openfl.component.interoperability.connector import Connector from openfl.transport.grpc.connector.flower.local_grpc_client import LocalGRPCClient @@ -12,30 +13,48 @@ class ConnectorFlower(Connector): Responsible for generating the Flower server command. """ - def __init__(self, superlink_params: dict, flwr_run_params: dict = None, **kwargs): + def __init__(self, flwr_app_name: dict, superlink_params: dict, flwr_run_params: dict = None, **kwargs): """ Initialize ConnectorFlower by building the server command from the superlink_params. Args: superlink_params (dict): A dictionary of Flower server settings. flwr_run_params (dict, optional): A dictionary containing the Flower run parameters. Defaults to None. """ + self.flwr_app_name = flwr_app_name self.superlink_params = superlink_params self.flwr_run_params = flwr_run_params command = self._build_command() + super().__init__(command, component_name="Flower") - connector_address = self.superlink_params.get("fleet-api-address", "0.0.0.0:9092") - self.local_grpc_client = LocalGRPCClient(connector_address) - + self.local_grpc_client = self._get_local_grpc_client() + self.flwr_run_command = self._build_flwr_run_command() if flwr_run_params else None self.flwr_run_process = None - # import pdb; pdb.set_trace() + + def _get_local_grpc_client(self): + """ + Create and return a LocalGRPCClient instance based on superlink_params + and the number of server rounds from the pyproject.toml file. + + Returns: + LocalGRPCClient: An instance of LocalGRPCClient initialized with the + connector address and number of server rounds. + """ + connector_address = self.superlink_params.get("fleet-api-address", "0.0.0.0:9092") + + # Load in the number of server rounds from the pyproject.toml file + toml_file_path = os.path.join('src', self.flwr_app_name, 'pyproject.toml') + toml_data = toml.load(toml_file_path) + + num_server_rounds = toml_data['tool']['flwr']['app']['config']['num-server-rounds'] + + return LocalGRPCClient(connector_address, num_server_rounds) def _build_command(self) -> list[str]: """ - Start the Flower SuperLink based on settings. - Args: - superlink_params (dict): Settings to configure the Flower server. + Start the Flower SuperLink based on superlink_params. + Returns: list[str]: A list representing the Flower server start command. """ @@ -67,16 +86,16 @@ def _build_command(self) -> list[str]: def _build_flwr_run_command(self) -> list[str]: """ Build the `flwr run` command to run the Flower application. 
+ Returns: list[str]: A list representing the flwr_run command. """ - flwr_app_name = self.flwr_run_params.get("flwr_app_name") federation_name = self.flwr_run_params.get("federation_name") if self.flwr_run_params.get("patch"): - command = ["python", "src/patch/flwr_run_patch.py", "run", f"./src/{flwr_app_name}"] + command = ["python", "src/patch/flwr_run_patch.py", "run", f"./src/{self.flwr_app_name}"] else: - command = ["flwr", "run", f"./src/{flwr_app_name}"] + command = ["flwr", "run", f"./src/{self.flwr_app_name}"] if federation_name: command.append(federation_name) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 93e94636c9..381eff3ef9 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -47,8 +47,8 @@ def __init__(self, auto_shutdown=True, **kwargs): self.client_port = base_port + self.partition_id self.auto_shutdown = auto_shutdown self.patch = kwargs.get('patch') - self.shutdown_initiated = False # Flag to ensure signal handler runs only once - self.shutdown_requested = False + self.shutdown_initiated = False # Flag to ensure signal handler runs only once + self.shutdown_requested = False # Flag signal shutdown def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): """ @@ -80,8 +80,8 @@ def message_callback(): If auto_shutdown is enabled, logs a message indicating that the final reply has been sent and triggers the SIGTERM signal handler to initiate shutdown. """ - # self.logger.info("Final reply sent") - self.shutdown_requested = True + if self.auto_shutdown: + self.shutdown_requested = True server = grpc.server(ThreadPoolExecutor(max_workers=cpu_count())) grpcadapter_pb2_grpc.add_GrpcAdapterServicer_to_server( @@ -147,7 +147,8 @@ def signal_handler(_sig, _frame): self.logger.info("Supernode process already terminated.") else: self.logger.info("Supernode process already terminated.") - except Exception: + except Exception as e: + self.logger.debug(f"Error during graceful shutdown: {e}") supernode_process.kill() self.logger.info("Supernode process forcefully terminated.") diff --git a/openfl/transport/grpc/connector/flower/local_grpc_client.py b/openfl/transport/grpc/connector/flower/local_grpc_client.py index 0746a7d6d8..5c0e120854 100644 --- a/openfl/transport/grpc/connector/flower/local_grpc_client.py +++ b/openfl/transport/grpc/connector/flower/local_grpc_client.py @@ -9,7 +9,7 @@ class LocalGRPCClient: and the OpenFL Server. It converts messages between OpenFL and Flower formats and handles the send-receive communication with the Flower SuperNode using gRPC. """ - def __init__(self, superlink_address): + def __init__(self, superlink_address, num_server_rounds): """ Initialize. @@ -18,6 +18,8 @@ def __init__(self, superlink_address): """ self.superlink_channel = grpc.insecure_channel(superlink_address) self.superlink_stub = grpcadapter_pb2_grpc.GrpcAdapterStub(self.superlink_channel) + self.num_server_rounds = num_server_rounds + self.end_experiment = False def send_receive(self, openfl_message, header): """ @@ -31,6 +33,14 @@ def send_receive(self, openfl_message, header): The response from the Flower SuperLink, converted back to OpenFL format. 
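The detection logic added just below can be read as a small predicate over the deserialized reply: the experiment is over once any task result carries the final round number and came from the evaluate task. As a standalone sketch (field names follow the task_res_list layout used at this point in the series; a later commit switches the check to messages_list):

```python
def is_final_evaluate_reply(deserialized_message, num_server_rounds: int) -> bool:
    """True once a task result belongs to the last round's evaluate task."""
    if not hasattr(deserialized_message, "task_res_list"):
        return False
    return any(
        task_res.group_id == str(num_server_rounds)
        and task_res.task.task_type == "evaluate"
        for task_res in deserialized_message.task_res_list
    )
```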
""" flower_message = openfl_to_flower_message(openfl_message) + deserialized_message = deserialize_flower_message(flower_message) + + # Check if clients completes last task for final round + if hasattr(deserialized_message, 'task_res_list'): + for task_res in deserialized_message.task_res_list: + if task_res.group_id == str(self.num_server_rounds) and task_res.task.task_type == "evaluate": + self.end_experiment = True + flower_response = self.superlink_stub.SendReceive(flower_message) - openfl_response = flower_to_openfl_message(flower_response, header=header) + openfl_response = flower_to_openfl_message(flower_response, header=header, end_experiment=self.end_experiment) return openfl_response diff --git a/openfl/transport/grpc/connector/flower/local_grpc_server.py b/openfl/transport/grpc/connector/flower/local_grpc_server.py index 1d34456a00..0bac3aade0 100644 --- a/openfl/transport/grpc/connector/flower/local_grpc_server.py +++ b/openfl/transport/grpc/connector/flower/local_grpc_server.py @@ -51,18 +51,18 @@ def process_queue(self): """ while True: request, response_queue = self.request_queue.get() - deserialized_message = deserialize_flower_message(request) + # deserialized_message = deserialize_flower_message(request) openfl_request = flower_to_openfl_message(request, header=None) # Send request to the OpenFL server openfl_response = self.openfl_client.send_message_to_server(openfl_request, self.collaborator_name) + + # Check to end experiment + if hasattr(openfl_response, 'metadata'): + if openfl_response.metadata['end_experiment'] == 'True': + self.message_callback() + # Send response to Flower client flower_response = openfl_to_flower_message(openfl_response) - # Check for the specific conditions - if hasattr(deserialized_message, 'task_res_list'): - for task_res in deserialized_message.task_res_list: - # TODO: this needs to be able to be set by the plan or the toml, not hard coded in the local grpc server - if task_res.group_id == "3" and task_res.task.task_type == "evaluate": - self.message_callback() response_queue.put(flower_response) self.request_queue.task_done() \ No newline at end of file diff --git a/openfl/transport/grpc/connector/flower/message_conversion.py b/openfl/transport/grpc/connector/flower/message_conversion.py index 448c991af9..f5279c64c1 100644 --- a/openfl/transport/grpc/connector/flower/message_conversion.py +++ b/openfl/transport/grpc/connector/flower/message_conversion.py @@ -2,7 +2,7 @@ from openfl.protocols import aggregator_pb2 # from openfl.transport.grpc.connector.flower.deserialize_message import deserialize_flower_message -def flower_to_openfl_message(flower_message, header=None): +def flower_to_openfl_message(flower_message, header=None, end_experiment=False): """ Convert a Flower MessageContainer to an OpenFL DropPod. 
@@ -35,6 +35,8 @@ def flower_to_openfl_message(flower_message, header=None): openfl_message.message.npbytes = serialized_flower_message openfl_message.message.size = len(serialized_flower_message) + # Add flag to check if experiment has ended + openfl_message.metadata.update({"end_experiment": str(end_experiment)}) return openfl_message def openfl_to_flower_message(openfl_message): From 121570cdfd9ec85130ea7e9cc4ccb1d1a7cf1376 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 30 Jan 2025 14:03:56 -0800 Subject: [PATCH 053/107] download data beforehand Signed-off-by: kta-intel --- .../src/app-pytorch/app_pytorch/task.py | 48 +++++++++---------- .../flower-app-pytorch/src/setup_data.py | 46 ++++++++++++++++++ 2 files changed, 70 insertions(+), 24 deletions(-) create mode 100644 openfl-workspace/flower-app-pytorch/src/setup_data.py diff --git a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py index 1a0604b623..b8112ec92a 100644 --- a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py +++ b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py @@ -9,6 +9,7 @@ # from flwr_datasets.partitioner import IidPartitioner from torch.utils.data import DataLoader, Dataset from torchvision.transforms import Compose, Normalize, ToTensor +import os class Net(nn.Module): @@ -34,7 +35,6 @@ def forward(self, x): # fds = None # Cache FederatedDataset - # def load_data(partition_id: int, num_partitions: int): # """Load partition CIFAR10 data.""" # # Only initialize `FederatedDataset` once @@ -60,40 +60,40 @@ def forward(self, x): # partition_train_test = partition_train_test.with_transform(apply_transforms) # trainloader = DataLoader(partition_train_test["train"], batch_size=32, shuffle=True) # testloader = DataLoader(partition_train_test["test"], batch_size=32) +# import pdb; pdb.set_trace() # return trainloader, testloader -class DummyDataset(Dataset): - def __init__(self, num_samples, transform=None): - self.num_samples = num_samples - self.transform = transform - self.data = torch.randn(num_samples, 3, 32, 32) # Random images - self.targets = torch.randint(0, 10, (num_samples,)) # Random labels - def __len__(self): - return self.num_samples +def load_partition_data(partition_id): + partition_dir = os.path.join('./data', f"{partition_id}") + + train_data_path = os.path.join(partition_dir, "train.pt") + test_data_path = os.path.join(partition_dir, "test.pt") + + train_data = torch.load(train_data_path) + test_data = torch.load(test_data_path) + + return train_data, test_data - def __getitem__(self, idx): - sample = {'img': self.data[idx], 'label': self.targets[idx]} - if self.transform: - sample['img'] = self.transform(sample['img']) - return sample def load_data(partition_id: int, num_partitions: int): - """Load partition dummy CIFAR10 data.""" - num_samples = 50000 // num_partitions # Assuming 50,000 samples in total - num_train_samples = int(num_samples * 0.8) - num_test_samples = num_samples - num_train_samples - + """Load partition CIFAR10 data.""" + train_data, test_data = load_partition_data(partition_id) pytorch_transforms = Compose( - [Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] + [ToTensor(), Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] ) - train_dataset = DummyDataset(num_train_samples, transform=pytorch_transforms) - test_dataset = DummyDataset(num_test_samples, transform=pytorch_transforms) + def apply_transforms(batch): + """Apply transforms to the partition 
from FederatedDataset.""" + batch["img"] = [pytorch_transforms(img) for img in batch["img"]] + return batch - trainloader = DataLoader(train_dataset, batch_size=32, shuffle=True) - testloader = DataLoader(test_dataset, batch_size=32) + train_data = train_data.with_transform(apply_transforms) + test_data = test_data.with_transform(apply_transforms) + trainloader = DataLoader(train_data, batch_size=32, shuffle=True) + testloader = DataLoader(test_data, batch_size=32) + import pdb; pdb.set_trace() return trainloader, testloader diff --git a/openfl-workspace/flower-app-pytorch/src/setup_data.py b/openfl-workspace/flower-app-pytorch/src/setup_data.py new file mode 100644 index 0000000000..7633f0c6f4 --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/src/setup_data.py @@ -0,0 +1,46 @@ +import os +import sys +import torch +from flwr_datasets import FederatedDataset +from flwr_datasets.partitioner import IidPartitioner + +def main(num_partitions): + # Directory to save the partitions + save_dir = "data" + + # Ensure the save directory exists + os.makedirs(save_dir, exist_ok=True) + + # Initialize FederatedDataset + partitioner = IidPartitioner(num_partitions=num_partitions) + fds = FederatedDataset( + dataset="uoft-cs/cifar10", + partitioners={"train": partitioner}, + ) + + # Function to save partition data + def save_partition_data(partition_id, partition_train_test): + partition_dir = os.path.join(save_dir, f"{partition_id}") + os.makedirs(partition_dir, exist_ok=True) + + train_data_path = os.path.join(partition_dir, "train.pt") + test_data_path = os.path.join(partition_dir, "test.pt") + + torch.save(partition_train_test["train"], train_data_path) + torch.save(partition_train_test["test"], test_data_path) + + # Download, split, and save the dataset + for partition_id in range(num_partitions): + partition = fds.load_partition(partition_id) + partition_train_test = partition.train_test_split(test_size=0.2, seed=42) + save_partition_data(partition_id, partition_train_test) + + print("Dataset downloaded, split, and saved successfully.") + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python setup_data.py ") + sys.exit(1) + + num_partitions = int(sys.argv[1]) + main(num_partitions) \ No newline at end of file From 0e74f1c74ff4475430dd576d348e1ca96e43ff99 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 30 Jan 2025 14:14:01 -0800 Subject: [PATCH 054/107] remove debug break Signed-off-by: kta-intel --- .../flower-app-pytorch/src/app-pytorch/app_pytorch/task.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py index b8112ec92a..ed96044de1 100644 --- a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py +++ b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py @@ -93,7 +93,6 @@ def apply_transforms(batch): trainloader = DataLoader(train_data, batch_size=32, shuffle=True) testloader = DataLoader(test_data, batch_size=32) - import pdb; pdb.set_trace() return trainloader, testloader From 7664cc67f8a4c0f6b507606a6c886e72ded468f1 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 31 Jan 2025 10:16:29 -0800 Subject: [PATCH 055/107] fixes around log and persistent db Signed-off-by: kta-intel --- openfl-workspace/flower-app-pytorch/plan/plan.yaml | 3 ++- openfl/component/aggregator/aggregator.py | 14 +++++++------- openfl/component/collaborator/collaborator.py | 3 ++- 3 files changed, 11 
insertions(+), 9 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml index 0310312085..d606d41b43 100644 --- a/openfl-workspace/flower-app-pytorch/plan/plan.yaml +++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml @@ -5,7 +5,8 @@ aggregator : defaults : plan/defaults/aggregator.yaml template : openfl.component.Aggregator settings : - rounds_to_train : 1 #DO NOT EDIT + rounds_to_train : 1 #DO NOT EDIT. This is to indicate OpenFL communication rounds + set persist_checkpoint: false connector : defaults : plan/defaults/connector.yaml diff --git a/openfl/component/aggregator/aggregator.py b/openfl/component/aggregator/aggregator.py index c2589d271b..b72174de03 100644 --- a/openfl/component/aggregator/aggregator.py +++ b/openfl/component/aggregator/aggregator.py @@ -1170,20 +1170,20 @@ def _end_of_round_check(self): if not self.is_connector_available(): # Compute all validation related metrics - logs = {} - for task_name in self.assigner.get_all_tasks_for_round(self.round_number): - logs.update(self._compute_validation_related_task_metrics(task_name)) + logs = {} + for task_name in self.assigner.get_all_tasks_for_round(self.round_number): + logs.update(self._compute_validation_related_task_metrics(task_name)) - # End of round callbacks. - self.callbacks.on_round_end(self.round_number, logs) + # End of round callbacks. + self.callbacks.on_round_end(self.round_number, logs) # Once all of the task results have been processed self._end_of_round_check_done[self.round_number] = True # Save the latest model if not self.is_connector_available(): - logger.info("Saving round %s model...", self.round_number) - self._save_model(self.round_number, self.last_state_path) + logger.info("Saving round %s model...", self.round_number) + self._save_model(self.round_number, self.last_state_path) self.round_number += 1 # resetting stragglers for task for a new round diff --git a/openfl/component/collaborator/collaborator.py b/openfl/component/collaborator/collaborator.py index d2c93fad01..a8e92b1f70 100644 --- a/openfl/component/collaborator/collaborator.py +++ b/openfl/component/collaborator/collaborator.py @@ -274,7 +274,8 @@ def do_task(self, task, round_number) -> dict: # TODO: better to use self.send_task_results(global_output_tensor_dict, round_number, task_name) # maybe set global_output_tensor to empty self.client.send_local_task_results(self.collaborator_name, round_number, task_name) - return + metrics = {'collaborator1/start_client_adapter': 'Completed'} + return metrics else: raise AttributeError(f"{func_name} is not callable on {self.task_runner}") else: From f5903e9bb80084a89b89d2f5797918e2d1c4ce8a Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 31 Jan 2025 10:35:05 -0800 Subject: [PATCH 056/107] fixes to workspace Signed-off-by: kta-intel --- openfl-workspace/flower-app-pytorch/plan/plan.yaml | 4 ++-- openfl-workspace/flower-app-pytorch/requirements.txt | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml index d606d41b43..a604ce90c6 100644 --- a/openfl-workspace/flower-app-pytorch/plan/plan.yaml +++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml @@ -5,8 +5,8 @@ aggregator : defaults : plan/defaults/aggregator.yaml template : openfl.component.Aggregator settings : - rounds_to_train : 1 #DO NOT EDIT. 
This is to indicate OpenFL communication rounds - set persist_checkpoint: false + rounds_to_train : 1 # DO NOT EDIT. This is to indicate OpenFL communication rounds + persist_checkpoint: false connector : defaults : plan/defaults/connector.yaml diff --git a/openfl-workspace/flower-app-pytorch/requirements.txt b/openfl-workspace/flower-app-pytorch/requirements.txt index aa2724e793..8141047b9d 100644 --- a/openfl-workspace/flower-app-pytorch/requirements.txt +++ b/openfl-workspace/flower-app-pytorch/requirements.txt @@ -1 +1,2 @@ ./src/app-pytorch +toml From ce9ceb5c12ac6b906870ded23f6a5e908ac3587e Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 31 Jan 2025 12:01:26 -0800 Subject: [PATCH 057/107] update auto shutdown and dataset Signed-off-by: kta-intel --- .../src/app-pytorch/app_pytorch/task.py | 15 ++++++++------- .../src/app-pytorch/pyproject.toml | 8 ++++---- .../flower-app-pytorch/src/setup_data.py | 9 ++++----- .../grpc/connector/flower/deserialize_message.py | 2 +- .../grpc/connector/flower/local_grpc_client.py | 11 ++++++----- 5 files changed, 23 insertions(+), 22 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py index ed96044de1..ac5244d1ac 100644 --- a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py +++ b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py @@ -1,13 +1,14 @@ """app-pytorch: A Flower / PyTorch app.""" from collections import OrderedDict +from datasets import load_from_disk import torch import torch.nn as nn import torch.nn.functional as F # from flwr_datasets import FederatedDataset # NOTE: flwr_dataset will create ~/.flwr/source # from flwr_datasets.partitioner import IidPartitioner -from torch.utils.data import DataLoader, Dataset +from torch.utils.data import DataLoader # Dataset from torchvision.transforms import Compose, Normalize, ToTensor import os @@ -64,14 +65,14 @@ def forward(self, x): # return trainloader, testloader -def load_partition_data(partition_id): - partition_dir = os.path.join('./data', f"{partition_id}") +def load_partition_data(partition_id, data_dir="data"): + partition_dir = os.path.join(data_dir, f"{partition_id}") - train_data_path = os.path.join(partition_dir, "train.pt") - test_data_path = os.path.join(partition_dir, "test.pt") + train_data_path = os.path.join(partition_dir, "train") + test_data_path = os.path.join(partition_dir, "test") - train_data = torch.load(train_data_path) - test_data = torch.load(test_data_path) + train_data = load_from_disk(train_data_path) + test_data = load_from_disk(test_data_path) return train_data, test_data diff --git a/openfl-workspace/flower-app-pytorch/src/app-pytorch/pyproject.toml b/openfl-workspace/flower-app-pytorch/src/app-pytorch/pyproject.toml index 24cf96c916..39e88d12a1 100644 --- a/openfl-workspace/flower-app-pytorch/src/app-pytorch/pyproject.toml +++ b/openfl-workspace/flower-app-pytorch/src/app-pytorch/pyproject.toml @@ -8,10 +8,10 @@ version = "1.0.0" description = "" license = "Apache-2.0" dependencies = [ - "flwr-nightly==1.14.0.dev20241205", - "flwr-datasets[vision]>=0.3.0", - "torch==2.3.1", - "torchvision==0.18.1", + "flwr>=1.15.0", + "flwr-datasets[vision]>=0.5.0", + "torch==2.5.1", + "torchvision==0.20.1", ] [tool.hatch.build.targets.wheel] diff --git a/openfl-workspace/flower-app-pytorch/src/setup_data.py b/openfl-workspace/flower-app-pytorch/src/setup_data.py index 7633f0c6f4..e8fb78d52a 100644 --- 
a/openfl-workspace/flower-app-pytorch/src/setup_data.py +++ b/openfl-workspace/flower-app-pytorch/src/setup_data.py @@ -1,6 +1,5 @@ import os import sys -import torch from flwr_datasets import FederatedDataset from flwr_datasets.partitioner import IidPartitioner @@ -23,11 +22,11 @@ def save_partition_data(partition_id, partition_train_test): partition_dir = os.path.join(save_dir, f"{partition_id}") os.makedirs(partition_dir, exist_ok=True) - train_data_path = os.path.join(partition_dir, "train.pt") - test_data_path = os.path.join(partition_dir, "test.pt") + train_data_path = os.path.join(partition_dir, "train") + test_data_path = os.path.join(partition_dir, "test") - torch.save(partition_train_test["train"], train_data_path) - torch.save(partition_train_test["test"], test_data_path) + partition_train_test["train"].save_to_disk(train_data_path) + partition_train_test["test"].save_to_disk(test_data_path) # Download, split, and save the dataset for partition_id in range(num_partitions): diff --git a/openfl/transport/grpc/connector/flower/deserialize_message.py b/openfl/transport/grpc/connector/flower/deserialize_message.py index 886b77404b..9de362b77b 100644 --- a/openfl/transport/grpc/connector/flower/deserialize_message.py +++ b/openfl/transport/grpc/connector/flower/deserialize_message.py @@ -71,7 +71,7 @@ def deserialize_flower_message(flower_message): print(f"Failed to deserialize message content. Error: {e}") return None - # # Save the message to a log file + # Save the message to a log file # log_filename = get_next_log_filename() # save_message_to_log(message, log_filename) # print(f"Message saved to {log_filename}") diff --git a/openfl/transport/grpc/connector/flower/local_grpc_client.py b/openfl/transport/grpc/connector/flower/local_grpc_client.py index 5c0e120854..6ff24dccd5 100644 --- a/openfl/transport/grpc/connector/flower/local_grpc_client.py +++ b/openfl/transport/grpc/connector/flower/local_grpc_client.py @@ -35,11 +35,12 @@ def send_receive(self, openfl_message, header): flower_message = openfl_to_flower_message(openfl_message) deserialized_message = deserialize_flower_message(flower_message) - # Check if clients completes last task for final round - if hasattr(deserialized_message, 'task_res_list'): - for task_res in deserialized_message.task_res_list: - if task_res.group_id == str(self.num_server_rounds) and task_res.task.task_type == "evaluate": - self.end_experiment = True + # Check if clients completes the evaluation task for the final server round + if hasattr(deserialized_message, 'messages_list'): + self.end_experiment = any( + message.metadata.group_id == str(self.num_server_rounds) and message.metadata.message_type == "evaluate" + for message in deserialized_message.messages_list + ) flower_response = self.superlink_stub.SendReceive(flower_message) openfl_response = flower_to_openfl_message(flower_response, header=header, end_experiment=self.end_experiment) From f9961994e09eecd60192309f74a91d0309d2a7d7 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 31 Jan 2025 12:25:26 -0800 Subject: [PATCH 058/107] give time for child processes to stop Signed-off-by: kta-intel --- openfl/federated/task/runner_flower.py | 42 ++++++++++++++++++-------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 381eff3ef9..1d0ae1e4b2 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -122,31 +122,49 @@ def signal_handler(_sig, _frame): _sig: 
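Taken together, the dataset changes above split preparation from training: `setup_data.py` writes each partition once with `save_to_disk`, and `task.py` reads it back with `load_from_disk`, so no collaborator downloads data at run time. The load side reduces to this sketch, mirroring the paths used in the diffs above:

```python
import os

from datasets import load_from_disk

def load_partition(partition_id: int, data_dir: str = "data"):
    """Load the train/test splits saved by setup_data.py for one partition."""
    partition_dir = os.path.join(data_dir, str(partition_id))
    train_data = load_from_disk(os.path.join(partition_dir, "train"))
    test_data = load_from_disk(os.path.join(partition_dir, "test"))
    return train_data, test_data
```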
The signal number. _frame: The current stack frame (not used). """ + # Avoid running the shutdown process multiple times if self.shutdown_initiated: - # Avoid running the shutdown process multiple times return self.shutdown_initiated = True + def terminate_process(process, timeout=5): + """ + Helper function to terminate a process gracefully. + Args: + process: The process to terminate. + timeout: The timeout for waiting for the process to terminate. + """ + try: + process.terminate() + process.wait(timeout=timeout) + except (psutil.NoSuchProcess, subprocess.TimeoutExpired): + process.kill() + try: if supernode_process.poll() is None: try: main_subprocess = psutil.Process(supernode_process.pid) client_app_processes = main_subprocess.children(recursive=True) + + # Wait for client app processes to complete + for client_app_process in client_app_processes: + try: + client_app_process.wait(timeout=5) + except psutil.NoSuchProcess: + pass + + # Terminate client app processes if they are still running for client_app_process in client_app_processes: - client_app_process.terminate() - _, still_alive = psutil.wait_procs(client_app_processes, timeout=1) - for p in still_alive: - p.kill() - supernode_process.terminate() - try: - supernode_process.wait(timeout=1) - except subprocess.TimeoutExpired: - supernode_process.kill() + if client_app_process.is_running(): + terminate_process(client_app_process) + + # Terminate the supernode process + terminate_process(supernode_process) self.logger.info("Supernode process terminated.") except psutil.NoSuchProcess: - self.logger.info("Supernode process already terminated.") + self.logger.info("Supernode process already terminated 2.") else: - self.logger.info("Supernode process already terminated.") + self.logger.info("Supernode process already terminated 1.") except Exception as e: self.logger.debug(f"Error during graceful shutdown: {e}") supernode_process.kill() From 3809ff91064d8a953406a3385aebb7024893efac Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 31 Jan 2025 15:29:43 -0800 Subject: [PATCH 059/107] write logs set to false to avoid issues with gramine Signed-off-by: kta-intel --- openfl-workspace/flower-app-pytorch/plan/plan.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml index a604ce90c6..48dcce106d 100644 --- a/openfl-workspace/flower-app-pytorch/plan/plan.yaml +++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml @@ -6,7 +6,8 @@ aggregator : template : openfl.component.Aggregator settings : rounds_to_train : 1 # DO NOT EDIT. 
 This is to indicate OpenFL communication rounds
-    persist_checkpoint: false
+    persist_checkpoint : false
+    write_logs : false
 
 connector :
   defaults : plan/defaults/connector.yaml

From 46fa4f999a4bdc5865990d41ead5d6f7778a744c9 Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Wed, 5 Feb 2025 13:01:49 -0800
Subject: [PATCH 060/107] fix condition to forcefully shutdown process

Signed-off-by: kta-intel
---
 openfl/federated/task/runner_flower.py       | 59 +++++++++----------
 .../connector/flower/local_grpc_server.py    |  2 -
 2 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py
index 1d0ae1e4b2..ce3b5eac78 100644
--- a/openfl/federated/task/runner_flower.py
+++ b/openfl/federated/task/runner_flower.py
@@ -137,38 +137,37 @@ def terminate_process(process, timeout=5):
                 try:
                     process.terminate()
                     process.wait(timeout=timeout)
-                except (psutil.NoSuchProcess, subprocess.TimeoutExpired):
+                except:
                     process.kill()
 
-            try:
-                if supernode_process.poll() is None:
-                    try:
-                        main_subprocess = psutil.Process(supernode_process.pid)
-                        client_app_processes = main_subprocess.children(recursive=True)
-
-                        # Wait for client app processes to complete
-                        for client_app_process in client_app_processes:
-                            try:
-                                client_app_process.wait(timeout=5)
-                            except psutil.NoSuchProcess:
-                                pass
-
-                        # Terminate client app processes if they are still running
-                        for client_app_process in client_app_processes:
-                            if client_app_process.is_running():
-                                terminate_process(client_app_process)
-
-                        # Terminate the supernode process
-                        terminate_process(supernode_process)
-                        self.logger.info("Supernode process terminated.")
-                    except psutil.NoSuchProcess:
-                        self.logger.info("Supernode process already terminated 2.")
-                else:
-                    self.logger.info("Supernode process already terminated 1.")
-            except Exception as e:
-                self.logger.debug(f"Error during graceful shutdown: {e}")
-                supernode_process.kill()
-                self.logger.info("Supernode process forcefully terminated.")
+            if supernode_process.poll() is None:
+                try:
+                    main_subprocess = psutil.Process(supernode_process.pid)
+                    client_app_processes = main_subprocess.children(recursive=True)
+
+                    # Wait for client app processes to complete
+                    for client_app_process in client_app_processes:
+                        try:
+                            client_app_process.wait(timeout=5)
+                        except psutil.NoSuchProcess:
+                            pass
+
+                    # Terminate client app processes if they are still running
+                    for client_app_process in client_app_processes:
+                        if client_app_process.is_running():
+                            terminate_process(client_app_process)
+
+                    # Terminate the supernode process
+                    terminate_process(main_subprocess)
+                    self.logger.info("Supernode process terminated.")
+                except Exception as e:
+                    self.logger.info(f"Error during graceful shutdown: {e}")
+                    # Directly shutdown the supernode_process
+                    # Gramine does not detect psutil.Process
+                    terminate_process(supernode_process)
+                    self.logger.info("Supernode process forcefully terminated.")
+            else:
+                self.logger.info("Supernode process already terminated 1.")
 
             self.logger.info("Shutting down local gRPC server...")
             server.stop(0)

diff --git a/openfl/transport/grpc/connector/flower/local_grpc_server.py b/openfl/transport/grpc/connector/flower/local_grpc_server.py
index 0bac3aade0..e6c85921b8 100644
--- a/openfl/transport/grpc/connector/flower/local_grpc_server.py
+++ b/openfl/transport/grpc/connector/flower/local_grpc_server.py
@@ -3,7 +3,6 @@
 import grpc
 from flwr.proto import grpcadapter_pb2_grpc
 from openfl.transport.grpc.connector.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message
-from openfl.transport.grpc.connector.flower.deserialize_message import deserialize_flower_message
 
 class LocalGRPCServer(grpcadapter_pb2_grpc.GrpcAdapterServicer):
     """
@@ -51,7 +50,6 @@ def process_queue(self):
         """
         while True:
             request, response_queue = self.request_queue.get()
-            # deserialized_message = deserialize_flower_message(request)
             openfl_request = flower_to_openfl_message(request, header=None)
 
             # Send request to the OpenFL server

From 5f486915612833de6b7af4e8a1f842401de232b9 Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Thu, 6 Feb 2025 08:00:54 -0800
Subject: [PATCH 061/107] add sleep to let client app close

Signed-off-by: kta-intel
---
 openfl/federated/task/runner_flower.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py
index ce3b5eac78..67cee4aa5e 100644
--- a/openfl/federated/task/runner_flower.py
+++ b/openfl/federated/task/runner_flower.py
@@ -164,6 +164,7 @@ def terminate_process(process, timeout=5):
                     self.logger.info(f"Error during graceful shutdown: {e}")
                     # Directly shutdown the supernode_process
                     # Gramine does not detect psutil.Process
+                    time.sleep(10)
                     terminate_process(supernode_process)
                     self.logger.info("Supernode process forcefully terminated.")
             else:
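Patches 060–061 converge on a terminate-the-children-first pattern: collect the SuperNode's child processes, stop them, then stop the parent, and only escalate to `kill()` when a process ignores the request. For readers who want that pattern in isolation, here is a minimal, self-contained sketch using psutil — the `shutdown_process_tree` name and the five-second grace period are illustrative assumptions, not the runner's real API:

```python
import subprocess

import psutil


def shutdown_process_tree(proc: subprocess.Popen, grace: float = 5.0):
    """Terminate a subprocess and all of its children, escalating to kill."""
    if proc.poll() is not None:
        return  # parent already exited
    parent = psutil.Process(proc.pid)
    # Snapshot the children before touching the parent so none are orphaned.
    children = parent.children(recursive=True)
    for p in children + [parent]:
        try:
            p.terminate()
        except psutil.NoSuchProcess:
            pass  # raced with a process that exited on its own
    _, alive = psutil.wait_procs(children + [parent], timeout=grace)
    for p in alive:
        p.kill()  # last resort, mirroring the patch's fallback
```

Collecting `children(recursive=True)` up front matters for the same reason the patch snapshots the ClientApp processes before terminating the SuperNode: once the parent dies, its child list can no longer be queried.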
""" - # Avoid running the shutdown process multiple times - if self.shutdown_initiated: - return - self.shutdown_initiated = True def terminate_process(process, timeout=5): """ @@ -137,38 +133,33 @@ def terminate_process(process, timeout=5): try: process.terminate() process.wait(timeout=timeout) - except: + except psutil.TimeoutExpired: + self.logger.debug(f"Timeout expired while waiting for process {process.pid} to terminate. Killing the process.") process.kill() + except psutil.NoSuchProcess: + self.logger.debug(f"Process {process.pid} does not exist. Skipping.") + pass if supernode_process.poll() is None: try: main_subprocess = psutil.Process(supernode_process.pid) client_app_processes = main_subprocess.children(recursive=True) - # Wait for client app processes to complete for client_app_process in client_app_processes: - try: - client_app_process.wait(timeout=5) - except psutil.NoSuchProcess: - pass - - # Terminate client app processes if they are still running - for client_app_process in client_app_processes: - if client_app_process.is_running(): - terminate_process(client_app_process) - - # Terminate the supernode process + terminate_process(client_app_process) + terminate_process(main_subprocess) - self.logger.info("Supernode process terminated.") + self.logger.info("SuperNode process terminated.") + except Exception as e: - self.logger.info(f"Error during graceful shutdown: {e}") - # Directly shutdown the supernode_process + self.logger.debug(f"Error during graceful shutdown: {e}") # Gramine does not detect psutil.Process + # Give time for clientapp to stop then directly shutdown the supernode_process time.sleep(10) terminate_process(supernode_process) - self.logger.info("Supernode process forcefully terminated.") + self.logger.info("SuperNode process terminated.") else: - self.logger.info("Supernode process already terminated 1.") + self.logger.info("SuperNode process already terminated.") self.logger.info("Shutting down local gRPC server...") server.stop(0) @@ -178,7 +169,7 @@ def terminate_process(process, timeout=5): signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) - self.logger.info("Press CTRL+C to stop the server and supernode process.") + self.logger.info("Press CTRL+C to stop the server and SuperNode process.") try: while not termination_event.is_set(): From c26c9d21891b17b9ca4425f8b025aaa1973fe712 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 6 Feb 2025 14:06:25 -0800 Subject: [PATCH 063/107] create a separate try-except block for subprocess Signed-off-by: kta-intel --- openfl/federated/task/runner_flower.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index be0c60f6e3..61b422a70f 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -155,8 +155,13 @@ def terminate_process(process, timeout=5): self.logger.debug(f"Error during graceful shutdown: {e}") # Gramine does not detect psutil.Process # Give time for clientapp to stop then directly shutdown the supernode_process - time.sleep(10) - terminate_process(supernode_process) + try: + supernode_process.terminate() + supernode_process.wait(timeout=10) + except: + self.logger.debug(f"Timeout expired while waiting for process {supernode_process.pid} to terminate. 
Killing the process.") + supernode_process.kill() + self.logger.info("SuperNode process terminated.") else: self.logger.info("SuperNode process already terminated.") From 96d8570cdcdbad5264c154883435b2787a59f1e9 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 7 Feb 2025 07:45:10 -0800 Subject: [PATCH 064/107] adjust exception in signal handler Signed-off-by: kta-intel --- openfl/federated/task/runner_flower.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 61b422a70f..66c58e9681 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -155,12 +155,14 @@ def terminate_process(process, timeout=5): self.logger.debug(f"Error during graceful shutdown: {e}") # Gramine does not detect psutil.Process # Give time for clientapp to stop then directly shutdown the supernode_process - try: - supernode_process.terminate() - supernode_process.wait(timeout=10) - except: - self.logger.debug(f"Timeout expired while waiting for process {supernode_process.pid} to terminate. Killing the process.") - supernode_process.kill() + time.sleep(10) + supernode_process.kill() + # try: + # supernode_process.terminate() + # supernode_process.wait(timeout=5) + # except: + # self.logger.debug(f"Timeout expired while waiting for process {supernode_process.pid} to terminate. Killing the process.") + # supernode_process.kill() self.logger.info("SuperNode process terminated.") else: From 80f515a6cd06ed0ddb87a7dce65221fc8a2aa6d9 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 10 Feb 2025 15:05:19 -0800 Subject: [PATCH 065/107] update connector.yaml Signed-off-by: kta-intel --- openfl-workspace/workspace/plan/defaults/connector.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openfl-workspace/workspace/plan/defaults/connector.yaml b/openfl-workspace/workspace/plan/defaults/connector.yaml index 0841d205d0..2b6645d22b 100644 --- a/openfl-workspace/workspace/plan/defaults/connector.yaml +++ b/openfl-workspace/workspace/plan/defaults/connector.yaml @@ -1 +1 @@ -template : openfl.component.FederatedLearningExchange \ No newline at end of file +template : openfl.component.Connector \ No newline at end of file From d4fa28c14e4b01e196748547ab3a9b80991358c7 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Tue, 11 Feb 2025 10:07:19 -0800 Subject: [PATCH 066/107] update flwr run command Signed-off-by: kta-intel --- openfl/component/interoperability/connector_flower.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py index 0b8eaffbdd..37da8799fb 100644 --- a/openfl/component/interoperability/connector_flower.py +++ b/openfl/component/interoperability/connector_flower.py @@ -1,5 +1,6 @@ import subprocess import toml +import json from openfl.component.interoperability.connector import Connector from openfl.transport.grpc.connector.flower.local_grpc_client import LocalGRPCClient @@ -30,7 +31,7 @@ def __init__(self, flwr_app_name: dict, superlink_params: dict, flwr_run_params: self.local_grpc_client = self._get_local_grpc_client() self.flwr_run_command = self._build_flwr_run_command() if flwr_run_params else None - self.flwr_run_process = None + self.run_id = None def _get_local_grpc_client(self): """ @@ -93,9 +94,9 @@ def _build_flwr_run_command(self) -> list[str]: federation_name = 
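Patches 062–064 are all reworking the body of one handler registered for both SIGINT and SIGTERM. Stripped of the Flower-specific teardown, the shape being iterated on is a tiny handler that only records the request and lets the main loop do the actual cleanup — a sketch with illustrative names, not the runner's real code:

```python
import signal
import threading

stop_event = threading.Event()


def handle_shutdown(signum, frame):
    # Keep the handler minimal: just record that shutdown was requested.
    stop_event.set()


signal.signal(signal.SIGINT, handle_shutdown)
signal.signal(signal.SIGTERM, handle_shutdown)

# Stand-in for the serving loop that the runner drives.
while not stop_event.is_set():
    stop_event.wait(timeout=0.1)
print("shutting down")
```

Doing the heavy teardown outside the handler also sidesteps the reentrancy concern that the removed `shutdown_initiated` guard was papering over.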
From d4fa28c14e4b01e196748547ab3a9b80991358c7 Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Tue, 11 Feb 2025 10:07:19 -0800
Subject: [PATCH 066/107] update flwr run command

Signed-off-by: kta-intel
---
 openfl/component/interoperability/connector_flower.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py
index 0b8eaffbdd..37da8799fb 100644
--- a/openfl/component/interoperability/connector_flower.py
+++ b/openfl/component/interoperability/connector_flower.py
@@ -1,5 +1,6 @@
 import subprocess
 import toml
+import json
 
 from openfl.component.interoperability.connector import Connector
 from openfl.transport.grpc.connector.flower.local_grpc_client import LocalGRPCClient
@@ -30,7 +31,7 @@ def __init__(self, flwr_app_name: dict, superlink_params: dict, flwr_run_params: dict = None, **kwargs):
 
         self.local_grpc_client = self._get_local_grpc_client()
         self.flwr_run_command = self._build_flwr_run_command() if flwr_run_params else None
-        self.flwr_run_process = None
+        self.run_id = None
 
     def _get_local_grpc_client(self):
         """
@@ -93,9 +94,9 @@ def _build_flwr_run_command(self) -> list[str]:
         federation_name = self.flwr_run_params.get("federation_name")
 
         if self.flwr_run_params.get("patch"):
-            command = ["python", "src/patch/flwr_run_patch.py", "run", f"./src/{self.flwr_app_name}"]
+            command = ["python", "src/patch/flwr_run_patch.py", "run", f"./src/{self.flwr_app_name}", "--format", "json"]
         else:
-            command = ["flwr", "run", f"./src/{self.flwr_app_name}"]
+            command = ["flwr", "run", f"./src/{self.flwr_app_name}", "--format", "json"]
 
         if federation_name:
             command.append(federation_name)
@@ -111,7 +112,9 @@ def start(self):
 
         if self.flwr_run_command:
             self.logger.info(f"[OpenFL Connector] Starting `flwr run` subprocess: {' '.join(self.flwr_run_command)}")
-            self.flwr_run_process = subprocess.Popen(self.flwr_run_command)
+            flwr_run_process = subprocess.run(self.flwr_run_command, capture_output=True, text=True)
+            stdout_output = json.loads(flwr_run_process.stdout)
+            self.run_id = stdout_output['run_id']
 
     def stop(self):
         """

From e859d36aab8ae885f363d7a02083c575252af5ab Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Tue, 11 Feb 2025 10:12:55 -0800
Subject: [PATCH 067/107] fix run-id flag

Signed-off-by: kta-intel
---
 openfl/component/interoperability/connector_flower.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py
index 37da8799fb..05f6e5519f 100644
--- a/openfl/component/interoperability/connector_flower.py
+++ b/openfl/component/interoperability/connector_flower.py
@@ -114,7 +114,7 @@ def start(self):
             flwr_run_process = subprocess.run(self.flwr_run_command, capture_output=True, text=True)
             stdout_output = json.loads(flwr_run_process.stdout)
-            self.run_id = stdout_output['run_id']
+            self.run_id = stdout_output['run-id']

From e9f1e360bb18647ba0b7af79a4bb059f0d1f9a4c Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Tue, 11 Feb 2025 10:37:37 -0800
Subject: [PATCH 068/107] run_id test

Signed-off-by: kta-intel
---
 openfl/component/interoperability/connector_flower.py     | 1 +
 .../transport/grpc/connector/flower/local_grpc_client.py  | 7 ++++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py
index 05f6e5519f..697f5fec42 100644
--- a/openfl/component/interoperability/connector_flower.py
+++ b/openfl/component/interoperability/connector_flower.py
@@ -115,6 +115,7 @@ def start(self):
             flwr_run_process = subprocess.run(self.flwr_run_command, capture_output=True, text=True)
             stdout_output = json.loads(flwr_run_process.stdout)
             self.run_id = stdout_output['run-id']
+            self.local_grpc_client.set_run_id(self.run_id)
 
     def stop(self):
         """

diff --git a/openfl/transport/grpc/connector/flower/local_grpc_client.py b/openfl/transport/grpc/connector/flower/local_grpc_client.py
index 6ff24dccd5..5011f7d447 100644
--- a/openfl/transport/grpc/connector/flower/local_grpc_client.py
+++ b/openfl/transport/grpc/connector/flower/local_grpc_client.py
@@ -9,7 +9,7 @@ class LocalGRPCClient:
     and the OpenFL Server. It converts messages between OpenFL and Flower formats and handles
     the send-receive communication with the Flower SuperNode using gRPC.
     """
-    def __init__(self, superlink_address, num_server_rounds):
+    def __init__(self, superlink_address, num_server_rounds, run_id=None):
         """
         Initialize.
 
@@ -20,6 +20,10 @@ def __init__(self, superlink_address, num_server_rounds):
         self.superlink_stub = grpcadapter_pb2_grpc.GrpcAdapterStub(self.superlink_channel)
         self.num_server_rounds = num_server_rounds
         self.end_experiment = False
+        self.run_id = run_id
+
+    def set_run_id(self, run_id):
+        self.run_id = run_id
 
     def send_receive(self, openfl_message, header):
         """
@@ -34,6 +38,7 @@ def send_receive(self, openfl_message, header):
         """
         flower_message = openfl_to_flower_message(openfl_message)
         deserialized_message = deserialize_flower_message(flower_message)
+        print(self.run_id)
 
         # Check if clients completes the evaluation task for the final server round
         if hasattr(deserialized_message, 'messages_list'):

From efb6752371cdb6d6f402e6290bdb14f7a991be45 Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Tue, 11 Feb 2025 11:10:25 -0800
Subject: [PATCH 069/107] add debug steps for sgx

Signed-off-by: kta-intel
---
 openfl/component/interoperability/connector_flower.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py
index 697f5fec42..2f555c7c6d 100644
--- a/openfl/component/interoperability/connector_flower.py
+++ b/openfl/component/interoperability/connector_flower.py
@@ -31,7 +31,6 @@ def __init__(self, flwr_app_name: dict, superlink_params: dict, flwr_run_params: dict = None, **kwargs):
 
         self.local_grpc_client = self._get_local_grpc_client()
         self.flwr_run_command = self._build_flwr_run_command() if flwr_run_params else None
-        self.run_id = None
 
     def _get_local_grpc_client(self):
         """
@@ -101,7 +100,6 @@ def _build_flwr_run_command(self) -> list[str]:
         if federation_name:
             command.append(federation_name)
 
-        # import pdb; pdb.set_trace()
         return command
 
     def start(self):
@@ -113,9 +111,11 @@ def start(self):
         if self.flwr_run_command:
             self.logger.info(f"[OpenFL Connector] Starting `flwr run` subprocess: {' '.join(self.flwr_run_command)}")
             flwr_run_process = subprocess.run(self.flwr_run_command, capture_output=True, text=True)
+            print(flwr_run_process) #DEBUG
+            print(flwr_run_process.stdout) #DEBUG
             stdout_output = json.loads(flwr_run_process.stdout)
-            self.run_id = stdout_output['run-id']
-            self.local_grpc_client.set_run_id(self.run_id)
+            print(stdout_output) #DEBUG
+            self.local_grpc_client.set_run_id(stdout_output['run-id'])
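Patches 066–069 teach the connector to launch `flwr run` with `--format json` and read the run id from its stdout; patch 067 is the reminder that the key is spelled `run-id`, not `run_id`. The essence, as a small standalone helper — assuming, as these patches do, that the CLI prints a JSON object containing a `run-id` field:

```python
import json
import subprocess


def launch_flwr_run(app_dir: str, federation: str | None = None) -> int:
    """Run a Flower app and return its run id (sketch, per patches 066-067)."""
    cmd = ["flwr", "run", app_dir, "--format", "json"]
    if federation:
        cmd.append(federation)
    proc = subprocess.run(cmd, capture_output=True, text=True, check=True)
    payload = json.loads(proc.stdout)
    return payload["run-id"]  # hyphenated key, per patch 067
```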
From 343de5e3cdeea057c1f543b67b52f8127f09f001 Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Tue, 11 Feb 2025 12:11:40 -0800
Subject: [PATCH 070/107] fix flwr run patch

Signed-off-by: kta-intel
---
 .../flower-app-pytorch/src/patch/patch_flwr_build.py  | 2 +-
 openfl/component/interoperability/connector_flower.py | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py b/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py
index 315151680b..ff069cc44e 100644
--- a/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py
+++ b/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py
@@ -134,7 +134,7 @@ def build(
     ### PATCH ###
     # REASONING: original code writes to /tmp/ by default. Writing to flwr_home allows us to consolidate written files
     # Also, return final_path
-    final_path = Path(flwr_home) / fab_filename
+    final_path = os.path.join(flwr_home, fab_filename)
     #############
 
     shutil.move(temp_filename, final_path)

diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py
index 2f555c7c6d..38c2ee1c59 100644
--- a/openfl/component/interoperability/connector_flower.py
+++ b/openfl/component/interoperability/connector_flower.py
@@ -110,11 +110,9 @@ def start(self):
         if self.flwr_run_command:
             self.logger.info(f"[OpenFL Connector] Starting `flwr run` subprocess: {' '.join(self.flwr_run_command)}")
-            flwr_run_process = subprocess.run(self.flwr_run_command, capture_output=True, text=True)
-            print(flwr_run_process) #DEBUG
-            print(flwr_run_process.stdout) #DEBUG
+            flwr_run_process = subprocess.run(self.flwr_run_command, stdout=subprocess.PIPE, text=True)
+            print(flwr_run_process.stdout)
             stdout_output = json.loads(flwr_run_process.stdout)
-            print(stdout_output) #DEBUG
             self.local_grpc_client.set_run_id(stdout_output['run-id'])

From 38b4f8829b9810c58ce4b8d85bed4d57ee33a602 Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Tue, 11 Feb 2025 13:43:46 -0800
Subject: [PATCH 071/107] new automatic shutdown mech

Signed-off-by: kta-intel
---
 openfl-workspace/flower-app-pytorch/plan/plan.yaml     |  7 ++--
 .../interoperability/connector_flower.py               | 33 +++++++++----------
 openfl/federated/task/runner_flower.py                 |  3 +-
 3 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml
index 48dcce106d..ac330b1e91 100644
--- a/openfl-workspace/flower-app-pytorch/plan/plan.yaml
+++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml
@@ -13,7 +13,7 @@ connector :
   defaults : plan/defaults/connector.yaml
   template : openfl.component.ConnectorFlower
   settings :
-    flwr_app_name : "app-pytorch"
+    auto_shutdown : True
     superlink_params :
       insecure : True
       serverappio-api-address : 127.0.0.1:9091 # note [kta-intel]: ServerApp will connect here
@@ -21,6 +21,7 @@ connector :
       exec-api-address : 127.0.0.1:9093 # note [kta-intel]: port for server-app toml (for flwr run)
       patch : True
     flwr_run_params :
+      flwr_app_name : "app-pytorch"
       federation_name : "local-poc"
       patch : True
 
@@ -38,7 +39,6 @@ task_runner :
   defaults : plan/defaults/task_runner.yaml
   template : openfl.federated.task.runner_flower.FlowerTaskRunner
   settings :
-    auto_shutdown : True
     patch : True
 
 network :
@@ -46,10 +46,11 @@ network :
 
 assigner :
   defaults : plan/defaults/assigner.yaml
-  template : openfl.component.ConnectorAssigner
+  template : openfl.component.RandomGroupedAssigner
   settings :
     task_groups :
       - name : Connector_Flower
+        percentage : 1.0
         tasks :
           - start_client_adapter

diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py
index 38c2ee1c59..0edb09b659 100644
--- a/openfl/component/interoperability/connector_flower.py
+++ b/openfl/component/interoperability/connector_flower.py
@@ -14,22 +14,21 @@ class ConnectorFlower(Connector):
     Responsible for generating the Flower server command.
     """
 
-    def __init__(self, flwr_app_name: dict, superlink_params: dict, flwr_run_params: dict = None, **kwargs):
+    def __init__(self, superlink_params: dict, flwr_run_params: dict = None,
+                 automatic_shutdown: bool = False, **kwargs):
         """
         Initialize ConnectorFlower by building the server command from the superlink_params.
 
         Args:
             superlink_params (dict): A dictionary of Flower server settings.
             flwr_run_params (dict, optional): A dictionary containing the Flower run parameters. Defaults to None.
         """
-        self.flwr_app_name = flwr_app_name
         self.superlink_params = superlink_params
-        self.flwr_run_params = flwr_run_params
         command = self._build_command()
-
         super().__init__(command, component_name="Flower")
 
-        self.local_grpc_client = self._get_local_grpc_client()
+        self.flwr_run_params = flwr_run_params
+        self.automatic_shutdown = automatic_shutdown
+        self.local_grpc_client = self._get_local_grpc_client()
         self.flwr_run_command = self._build_flwr_run_command() if flwr_run_params else None
 
     def _get_local_grpc_client(self):
@@ -43,13 +42,7 @@ def _get_local_grpc_client(self):
         """
         connector_address = self.superlink_params.get("fleet-api-address", "0.0.0.0:9092")
 
-        # Load in the number of server rounds from the pyproject.toml file
-        toml_file_path = os.path.join('src', self.flwr_app_name, 'pyproject.toml')
-        toml_data = toml.load(toml_file_path)
-
-        num_server_rounds = toml_data['tool']['flwr']['app']['config']['num-server-rounds']
-
-        return LocalGRPCClient(connector_address, num_server_rounds)
+        return LocalGRPCClient(connector_address, self.automatic_shutdown)
 
     def _build_command(self) -> list[str]:
         """
@@ -91,11 +84,12 @@ def _build_flwr_run_command(self) -> list[str]:
             list[str]: A list representing the flwr_run command.
         """
         federation_name = self.flwr_run_params.get("federation_name")
+        flwr_app_name = self.flwr_run_params.get("flwr_app_name")
 
         if self.flwr_run_params.get("patch"):
-            command = ["python", "src/patch/flwr_run_patch.py", "run", f"./src/{self.flwr_app_name}", "--format", "json"]
+            command = ["python", "src/patch/flwr_run_patch.py", "run", f"./src/{flwr_app_name}", "--format", "json"]
         else:
-            command = ["flwr", "run", f"./src/{self.flwr_app_name}", "--format", "json"]
+            command = ["flwr", "run", f"./src/{flwr_app_name}", "--format", "json"]
 
         if federation_name:
             command.append(federation_name)
@@ -111,9 +105,12 @@ def start(self):
         if self.flwr_run_command:
             self.logger.info(f"[OpenFL Connector] Starting `flwr run` subprocess: {' '.join(self.flwr_run_command)}")
             flwr_run_process = subprocess.run(self.flwr_run_command, stdout=subprocess.PIPE, text=True)
-            print(flwr_run_process.stdout)
-            stdout_output = json.loads(flwr_run_process.stdout)
-            self.local_grpc_client.set_run_id(stdout_output['run-id'])
+
+            if self.automatic_shutdown:
+                flwr_run_stdout_output = json.loads(flwr_run_process.stdout)
+                flwr_run_id = flwr_run_stdout_output['run-id']
+                flwr_app_name = self.flwr_run_params.get("flwr_app_name")
+                self.local_grpc_client.set_run_id(flwr_run_id, flwr_app_name)

diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py
index 66c58e9681..d5570b1af0 100644
--- a/openfl/federated/task/runner_flower.py
+++ b/openfl/federated/task/runner_flower.py
@@ -80,8 +80,7 @@ def message_callback():
             If auto_shutdown is enabled, logs a message indicating that the final reply has
             been sent and triggers the SIGTERM signal handler to initiate shutdown.
             """
-            if self.auto_shutdown:
-                self.shutdown_requested = True
+            self.shutdown_requested = True
 
         server = grpc.server(ThreadPoolExecutor(max_workers=cpu_count()))
         grpcadapter_pb2_grpc.add_GrpcAdapterServicer_to_server(

From 28fa32dabf1841adef1a2949076daca0cb591f85 Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Wed, 12 Feb 2025 08:27:22 -0800
Subject: [PATCH 072/107] changes to local_grpc_client to enable new shutdown
 mechanism

Signed-off-by: kta-intel
---
 .../connector/flower/local_grpc_client.py | 58 ++++++++++++++-----
 1 file changed, 44 insertions(+), 14 deletions(-)

diff --git a/openfl/transport/grpc/connector/flower/local_grpc_client.py b/openfl/transport/grpc/connector/flower/local_grpc_client.py
index 5011f7d447..97baea202e 100644
--- a/openfl/transport/grpc/connector/flower/local_grpc_client.py
+++ b/openfl/transport/grpc/connector/flower/local_grpc_client.py
@@ -1,4 +1,6 @@
 import grpc
+import subprocess
+import json
 from flwr.proto import grpcadapter_pb2_grpc
 from openfl.transport.grpc.connector.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message
 from openfl.transport.grpc.connector.flower.deserialize_message import deserialize_flower_message
@@ -9,7 +11,7 @@ class LocalGRPCClient:
     and the OpenFL Server. It converts messages between OpenFL and Flower formats and handles
     the send-receive communication with the Flower SuperNode using gRPC.
     """
-    def __init__(self, superlink_address, num_server_rounds, run_id=None):
+    def __init__(self, superlink_address, automatic_shutdown=False):
         """
         Initialize.
 
@@ -18,12 +20,12 @@ def __init__(self, superlink_address, num_server_rounds):
         """
         self.superlink_channel = grpc.insecure_channel(superlink_address)
         self.superlink_stub = grpcadapter_pb2_grpc.GrpcAdapterStub(self.superlink_channel)
-        self.num_server_rounds = num_server_rounds
+
+        self.automatic_shutdown = automatic_shutdown
         self.end_experiment = False
-        self.run_id = run_id
 
-    def set_run_id(self, run_id):
-        self.run_id = run_id
+        self.run_id = None
+        self.flwr_ls_command = None
 
     def send_receive(self, openfl_message, header):
         """
@@ -37,16 +39,44 @@ def send_receive(self, openfl_message, header):
             The response from the Flower SuperLink, converted back to OpenFL format.
         """
         flower_message = openfl_to_flower_message(openfl_message)
-        deserialized_message = deserialize_flower_message(flower_message)
-        print(self.run_id)
-
-        # Check if clients completes the evaluation task for the final server round
-        if hasattr(deserialized_message, 'messages_list'):
-            self.end_experiment = any(
-                message.metadata.group_id == str(self.num_server_rounds) and message.metadata.message_type == "evaluate"
-                for message in deserialized_message.messages_list
-            )
+        # deserialized_message = deserialize_flower_message(flower_message)
 
+        # # Check if clients completes the evaluation task for the final server round
+        # if hasattr(deserialized_message, 'messages_list'):
+        #     self.end_experiment = any(
+        #         message.metadata.group_id == str(self.num_server_rounds) and message.metadata.message_type == "evaluate"
+        #         for message in deserialized_message.messages_list
+        #     )
         flower_response = self.superlink_stub.SendReceive(flower_message)
+
+        if self.automatic_shutdown:
+            self.end_experiment = self.monitor_server_app()
+
         openfl_response = flower_to_openfl_message(flower_response, header=header, end_experiment=self.end_experiment)
         return openfl_response
+
+    def set_run_id(self, run_id, flwr_app_name):
+        """
+        Set the run ID for the Flower application and build the flwr_ls_command.
+
+        Args:
+            run_id: The run ID of the Flower application
+            flwr_app_name: The name of the Flower application
+        """
+        self.run_id = run_id
+        self.flwr_ls_command = ["flwr", "ls", f"./src/{flwr_app_name}", "--format", "json", "--run-id", str(self.run_id)]
+
+    def monitor_server_app(self) -> bool:
+        """
+        Run the `flwr ls` command to monitor the Flower application.
+
+        Returns:
+            bool: True if the experiment has ended, False otherwise.
+        """
+        flwr_ls_process = subprocess.run(self.flwr_ls_command, stdout=subprocess.PIPE, text=True)
+        flwr_ls_output = json.loads(flwr_ls_process.stdout)
+
+        for run in flwr_ls_output["runs"]:
+            if "finished" in run["status"]:
+                return True
+        return False
\ No newline at end of file
From c219c66af52b11ecbdc98438c8991dd7fbbf45db Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Wed, 12 Feb 2025 08:56:24 -0800
Subject: [PATCH 073/107] fix automatic_shutdown flag

Signed-off-by: kta-intel
---
 openfl-workspace/flower-app-pytorch/plan/plan.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml
index ac330b1e91..a45f7f5562 100644
--- a/openfl-workspace/flower-app-pytorch/plan/plan.yaml
+++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml
@@ -13,7 +13,7 @@ connector :
   defaults : plan/defaults/connector.yaml
   template : openfl.component.ConnectorFlower
   settings :
-    auto_shutdown : True
+    automatic_shutdown : True
     superlink_params :
       insecure : True
       serverappio-api-address : 127.0.0.1:9091 # note [kta-intel]: ServerApp will connect here

From 499dedc53baaae4233d3102b9ee60fde3174b0b4 Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Wed, 12 Feb 2025 12:59:09 -0800
Subject: [PATCH 074/107] debugging

Signed-off-by: kta-intel
---
 .../transport/grpc/connector/flower/local_grpc_client.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/openfl/transport/grpc/connector/flower/local_grpc_client.py b/openfl/transport/grpc/connector/flower/local_grpc_client.py
index 97baea202e..ad0df15238 100644
--- a/openfl/transport/grpc/connector/flower/local_grpc_client.py
+++ b/openfl/transport/grpc/connector/flower/local_grpc_client.py
@@ -39,7 +39,10 @@ def send_receive(self, openfl_message, header):
             The response from the Flower SuperLink, converted back to OpenFL format.
         """
         flower_message = openfl_to_flower_message(openfl_message)
-        # deserialized_message = deserialize_flower_message(flower_message)
+        deserialized_message = deserialize_flower_message(flower_message)
+        if hasattr(deserialized_message, 'messages_list'):
+            for message in deserialized_message.messages_list:
+                self.round = message.metadata.group_id
 
         # # Check if clients completes the evaluation task for the final server round
         # if hasattr(deserialized_message, 'messages_list'):
@@ -51,6 +54,7 @@ def send_receive(self, openfl_message, header):
 
         if self.automatic_shutdown:
             self.end_experiment = self.monitor_server_app()
+            print(self.end_experiment)
 
         openfl_response = flower_to_openfl_message(flower_response, header=header, end_experiment=self.end_experiment)
         return openfl_response
@@ -74,6 +78,8 @@ def monitor_server_app(self) -> bool:
             bool: True if the experiment has ended, False otherwise.
         """
         flwr_ls_process = subprocess.run(self.flwr_ls_command, stdout=subprocess.PIPE, text=True)
+        print(flwr_ls_process)
+        print(flwr_ls_process.stdout)
         flwr_ls_output = json.loads(flwr_ls_process.stdout)
 
         for run in flwr_ls_output["runs"]:

From 9607bb0982dfc2c83c86cbf825bad4656976b1c8 Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Wed, 12 Feb 2025 14:00:21 -0800
Subject: [PATCH 075/107] more debug lines

Signed-off-by: kta-intel
---
 openfl/component/interoperability/connector_flower.py       | 1 +
 .../transport/grpc/connector/flower/local_grpc_client.py    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py
index 0edb09b659..94ffc04407 100644
--- a/openfl/component/interoperability/connector_flower.py
+++ b/openfl/component/interoperability/connector_flower.py
@@ -110,6 +110,7 @@ def start(self):
             if self.automatic_shutdown:
                 flwr_run_stdout_output = json.loads(flwr_run_process.stdout)
                 flwr_run_id = flwr_run_stdout_output['run-id']
                 flwr_app_name = self.flwr_run_params.get("flwr_app_name")
+                print(flwr_run_id)
                 self.local_grpc_client.set_run_id(flwr_run_id, flwr_app_name)

diff --git a/openfl/transport/grpc/connector/flower/local_grpc_client.py b/openfl/transport/grpc/connector/flower/local_grpc_client.py
index ad0df15238..dd6ecd9e96 100644
--- a/openfl/transport/grpc/connector/flower/local_grpc_client.py
+++ b/openfl/transport/grpc/connector/flower/local_grpc_client.py
@@ -77,6 +77,7 @@ def monitor_server_app(self) -> bool:
         Returns:
             bool: True if the experiment has ended, False otherwise.
         """
+        print(self.flwr_ls_command)
         flwr_ls_process = subprocess.run(self.flwr_ls_command, stdout=subprocess.PIPE, text=True)
         print(flwr_ls_process)
         print(flwr_ls_process.stdout)

From 156ba1f5de8ec795c2a6c3b218c66e7b13e758c4 Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Wed, 12 Feb 2025 14:32:56 -0800
Subject: [PATCH 076/107] debug 3

Signed-off-by: kta-intel
---
 openfl/transport/grpc/connector/flower/local_grpc_client.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/openfl/transport/grpc/connector/flower/local_grpc_client.py b/openfl/transport/grpc/connector/flower/local_grpc_client.py
index dd6ecd9e96..8f6fb40785 100644
--- a/openfl/transport/grpc/connector/flower/local_grpc_client.py
+++ b/openfl/transport/grpc/connector/flower/local_grpc_client.py
@@ -50,7 +50,9 @@ def send_receive(self, openfl_message, header):
         #         message.metadata.group_id == str(self.num_server_rounds) and message.metadata.message_type == "evaluate"
         #         for message in deserialized_message.messages_list
         #     )
+        print("1")
         flower_response = self.superlink_stub.SendReceive(flower_message)
+        print("2")
 
         if self.automatic_shutdown:
             self.end_experiment = self.monitor_server_app()
From 951db9dd098262db464fa6e5003f3cd651283950 Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Thu, 13 Feb 2025 08:06:20 -0800
Subject: [PATCH 077/107] additional debug statements

Signed-off-by: kta-intel
---
 .../component/interoperability/connector_flower.py |  5 +++--
 .../grpc/connector/flower/local_grpc_client.py     | 14 +++++++++-----
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py
index 94ffc04407..b0cceb9035 100644
--- a/openfl/component/interoperability/connector_flower.py
+++ b/openfl/component/interoperability/connector_flower.py
@@ -1,5 +1,4 @@
 import subprocess
-import toml
 import json
 
 from openfl.component.interoperability.connector import Connector
 from openfl.transport.grpc.connector.flower.local_grpc_client import LocalGRPCClient
@@ -106,11 +105,13 @@ def start(self):
             flwr_run_process = subprocess.run(self.flwr_run_command, stdout=subprocess.PIPE, text=True)
 
+            self.logger.debug(f"{flwr_run_process.stdout}")
+
             if self.automatic_shutdown:
                 flwr_run_stdout_output = json.loads(flwr_run_process.stdout)
                 flwr_run_id = flwr_run_stdout_output['run-id']
                 flwr_app_name = self.flwr_run_params.get("flwr_app_name")
-                print(flwr_run_id)
+                self.logger.debug(f"{flwr_run_id}")
                 self.local_grpc_client.set_run_id(flwr_run_id, flwr_app_name)

diff --git a/openfl/transport/grpc/connector/flower/local_grpc_client.py b/openfl/transport/grpc/connector/flower/local_grpc_client.py
index 8f6fb40785..a005984304 100644
--- a/openfl/transport/grpc/connector/flower/local_grpc_client.py
+++ b/openfl/transport/grpc/connector/flower/local_grpc_client.py
@@ -4,6 +4,7 @@
 from flwr.proto import grpcadapter_pb2_grpc
 from openfl.transport.grpc.connector.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message
 from openfl.transport.grpc.connector.flower.deserialize_message import deserialize_flower_message
+from logging import getLogger
 
 class LocalGRPCClient:
     """
@@ -27,6 +28,8 @@ def __init__(self, superlink_address, automatic_shutdown=False):
         self.run_id = None
         self.flwr_ls_command = None
 
+        self.logger = getLogger(__name__)
+
     def send_receive(self, openfl_message, header):
         """
         Sends a message to the Flower SuperLink and receives the response.
@@ -39,6 +42,7 @@ def send_receive(self, openfl_message, header):
             The response from the Flower SuperLink, converted back to OpenFL format.
         """
         flower_message = openfl_to_flower_message(openfl_message)
+        self.logger.info(f"1")
         deserialized_message = deserialize_flower_message(flower_message)
         if hasattr(deserialized_message, 'messages_list'):
             for message in deserialized_message.messages_list:
@@ -50,11 +54,11 @@ def send_receive(self, openfl_message, header):
         #         message.metadata.group_id == str(self.num_server_rounds) and message.metadata.message_type == "evaluate"
         #         for message in deserialized_message.messages_list
         #     )
-        print("1")
         flower_response = self.superlink_stub.SendReceive(flower_message)
-        print("2")
+        self.logger.debug(f"2")
 
         if self.automatic_shutdown:
+            self.logger.debug(f"3")
             self.end_experiment = self.monitor_server_app()
             print(self.end_experiment)
 
@@ -79,11 +83,11 @@ def monitor_server_app(self) -> bool:
         Returns:
             bool: True if the experiment has ended, False otherwise.
         """
-        print(self.flwr_ls_command)
+        self.logger.debug(f"{self.flwr_ls_command}")
         flwr_ls_process = subprocess.run(self.flwr_ls_command, stdout=subprocess.PIPE, text=True)
-        print(flwr_ls_process)
-        print(flwr_ls_process.stdout)
+        self.logger.debug(f"{flwr_ls_process}")
         flwr_ls_output = json.loads(flwr_ls_process.stdout)
+        self.logger.debug(f"{flwr_ls_output}")
 
         for run in flwr_ls_output["runs"]:
             if "finished" in run["status"]:

From 3af36fe691bf9fe903c5e49f8d1c92da67a2489f Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Thu, 13 Feb 2025 10:34:03 -0800
Subject: [PATCH 078/107] use process mode and directly track serverapp

Signed-off-by: kta-intel
---
 .../interoperability/connector_flower.py  | 51 ++++++++++++++-----
 .../connector/flower/local_grpc_client.py | 49 ++----------------
 2 files changed, 43 insertions(+), 57 deletions(-)

diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py
index b0cceb9035..1d5f4d79bf 100644
--- a/openfl/component/interoperability/connector_flower.py
+++ b/openfl/component/interoperability/connector_flower.py
@@ -21,12 +21,13 @@ def __init__(self, superlink_params: dict, flwr_run_params: dict = None,
             superlink_params (dict): A dictionary of Flower server settings.
             flwr_run_params (dict, optional): A dictionary containing the Flower run parameters. Defaults to None.
         """
+        self.automatic_shutdown = automatic_shutdown
         self.superlink_params = superlink_params
+        self.flwr_serverapp_command = None
         command = self._build_command()
         super().__init__(command, component_name="Flower")
 
         self.flwr_run_params = flwr_run_params
-        self.automatic_shutdown = automatic_shutdown
         self.local_grpc_client = self._get_local_grpc_client()
         self.flwr_run_command = self._build_flwr_run_command() if flwr_run_params else None
 
@@ -41,7 +42,7 @@ def _get_local_grpc_client(self):
         """
         connector_address = self.superlink_params.get("fleet-api-address", "0.0.0.0:9092")
 
-        return LocalGRPCClient(connector_address, self.automatic_shutdown)
+        return LocalGRPCClient(connector_address, self.automatic_shutdown, self.is_flwr_serverapp_running)
 
     def _build_command(self) -> list[str]:
         """
@@ -58,8 +59,6 @@ def _build_command(self) -> list[str]:
         if "insecure" in self.superlink_params:
             if self.superlink_params["insecure"]:
                 command += ["--insecure"]
-            else:
-                command += ["--insecure"]
 
         if "serverappio-api-address" in self.superlink_params:
             command += ["--serverappio-api-address", str(self.superlink_params["serverappio-api-address"])]
@@ -73,8 +72,40 @@ def _build_command(self) -> list[str]:
             command += ["--exec-api-address", str(self.superlink_params["exec-api-address"])]
             # flwr default: 0.0.0.0:9093
 
+        if self.automatic_shutdown:
+            command += ["--isolation", "process"]
+            self.flwr_serverapp_command = self._build_flwr_serverapp_command()
+            # flwr will default to "--isolation subprocess"
+
         return command
 
+    def _build_flwr_serverapp_command(self) -> list[str]:
+        """
+        Build the `flwr-serverapp` command based on superlink_params.
+
+        Returns:
+            list[str]: A list representing the `flwr-serverapp` command.
+        """
+        command = ["flwr-serverapp", "--run-once"]
+
+        if "insecure" in self.superlink_params:
+            if self.superlink_params["insecure"]:
+                command += ["--insecure"]
+
+        if "serverappio-api-address" in self.superlink_params:
+            command += ["--serverappio-api-address", str(self.superlink_params["serverappio-api-address"])]
+            # flwr default: 0.0.0.0:9091
+
+        return command
+
+    def is_flwr_serverapp_running(self):
+        """
+        Check if the flwr_serverapp subprocess is still running.
+        """
+        if hasattr(self, 'flwr_serverapp_subprocess'):
+            return self.flwr_serverapp_subprocess.poll() is None
+        return False
+
     def _build_flwr_run_command(self) -> list[str]:
         """
         Build the `flwr run` command to run the Flower application.
@@ -103,16 +134,10 @@ def start(self):
 
         if self.flwr_run_command:
             self.logger.info(f"[OpenFL Connector] Starting `flwr run` subprocess: {' '.join(self.flwr_run_command)}")
-            flwr_run_process = subprocess.run(self.flwr_run_command, stdout=subprocess.PIPE, text=True)
-
-            self.logger.debug(f"{flwr_run_process.stdout}")
+            subprocess.run(self.flwr_run_command)
 
-            if self.automatic_shutdown:
-                flwr_run_stdout_output = json.loads(flwr_run_process.stdout)
-                flwr_run_id = flwr_run_stdout_output['run-id']
-                flwr_app_name = self.flwr_run_params.get("flwr_app_name")
-                self.logger.debug(f"{flwr_run_id}")
-                self.local_grpc_client.set_run_id(flwr_run_id, flwr_app_name)
+        if self.flwr_serverapp_command:
+            self.flwr_serverapp_subprocess = subprocess.Popen(self.flwr_serverapp_command)
 
     def stop(self):
         """

diff --git a/openfl/transport/grpc/connector/flower/local_grpc_client.py b/openfl/transport/grpc/connector/flower/local_grpc_client.py
index a005984304..d8a4fa71fe 100644
--- a/openfl/transport/grpc/connector/flower/local_grpc_client.py
+++ b/openfl/transport/grpc/connector/flower/local_grpc_client.py
@@ -12,7 +12,8 @@ class LocalGRPCClient:
     and the OpenFL Server. It converts messages between OpenFL and Flower formats and handles
     the send-receive communication with the Flower SuperNode using gRPC.
     """
-    def __init__(self, superlink_address, automatic_shutdown=False):
+    def __init__(self, superlink_address, automatic_shutdown=False,
+                 is_flwr_serverapp_running_callback=None):
         """
         Initialize.
 
@@ -24,9 +24,7 @@ def __init__(self, superlink_address, automatic_shutdown=False):
 
         self.automatic_shutdown = automatic_shutdown
         self.end_experiment = False
-
-        self.run_id = None
-        self.flwr_ls_command = None
+        self.is_flwr_serverapp_running_callback = is_flwr_serverapp_running_callback
 
         self.logger = getLogger(__name__)
 
@@ -42,54 +40,17 @@ def send_receive(self, openfl_message, header):
             The response from the Flower SuperLink, converted back to OpenFL format.
         """
         flower_message = openfl_to_flower_message(openfl_message)
-        self.logger.info(f"1")
+
         deserialized_message = deserialize_flower_message(flower_message)
         if hasattr(deserialized_message, 'messages_list'):
             for message in deserialized_message.messages_list:
                 self.round = message.metadata.group_id
 
-        # # Check if clients completes the evaluation task for the final server round
-        # if hasattr(deserialized_message, 'messages_list'):
-        #     self.end_experiment = any(
-        #         message.metadata.group_id == str(self.num_server_rounds) and message.metadata.message_type == "evaluate"
-        #         for message in deserialized_message.messages_list
-        #     )
         flower_response = self.superlink_stub.SendReceive(flower_message)
-        self.logger.debug(f"2")
 
         if self.automatic_shutdown:
-            self.logger.debug(f"3")
-            self.end_experiment = self.monitor_server_app()
-            print(self.end_experiment)
+            self.end_experiment = not self.is_flwr_serverapp_running_callback()
+            print(self.is_flwr_serverapp_running_callback())
 
         openfl_response = flower_to_openfl_message(flower_response, header=header, end_experiment=self.end_experiment)
         return openfl_response
-
-    def set_run_id(self, run_id, flwr_app_name):
-        """
-        Set the run ID for the Flower application and build the flwr_ls_command.
-
-        Args:
-            run_id: The run ID of the Flower application
-            flwr_app_name: The name of the Flower application
-        """
-        self.run_id = run_id
-        self.flwr_ls_command = ["flwr", "ls", f"./src/{flwr_app_name}", "--format", "json", "--run-id", str(self.run_id)]
-
-    def monitor_server_app(self) -> bool:
-        """
-        Run the `flwr ls` command to monitor the Flower application.
-
-        Returns:
-            bool: True if the experiment has ended, False otherwise.
-        """
-        flwr_ls_process = subprocess.run(self.flwr_ls_command, stdout=subprocess.PIPE, text=True)
-        flwr_ls_output = json.loads(flwr_ls_process.stdout)
-
-        for run in flwr_ls_output["runs"]:
-            if "finished" in run["status"]:
-                return True
-        return False
\ No newline at end of file
""" flower_message = openfl_to_flower_message(openfl_message) - self.logger.info(f"1") + deserialized_message = deserialize_flower_message(flower_message) if hasattr(deserialized_message, 'messages_list'): for message in deserialized_message.messages_list: self.round = message.metadata.group_id - # # Check if clients completes the evaluation task for the final server round - # if hasattr(deserialized_message, 'messages_list'): - # self.end_experiment = any( - # message.metadata.group_id == str(self.num_server_rounds) and message.metadata.message_type == "evaluate" - # for message in deserialized_message.messages_list - # ) flower_response = self.superlink_stub.SendReceive(flower_message) - self.logger.debug(f"2") if self.automatic_shutdown: - self.logger.debug(f"3") - self.end_experiment = self.monitor_server_app() - print(self.end_experiment) + self.end_experiment = not self.is_flwr_serverapp_running_callback() + print(self.is_flwr_serverapp_running_callback()) openfl_response = flower_to_openfl_message(flower_response, header=header, end_experiment=self.end_experiment) return openfl_response - - def set_run_id(self, run_id, flwr_app_name): - """ - Set the run ID for the Flower application and build the flwr_ls_command. - - Args: - run_id: The run ID of the Flower application - flwr_app_name: The name of the Flower application - """ - self.run_id = run_id - self.flwr_ls_command = ["flwr", "ls", f"./src/{flwr_app_name}", "--format", "json", "--run-id", str(self.run_id)] - - def monitor_server_app(self) -> bool: - """ - Run the `flwr ls` command to monitor the Flower application. - - Returns: - bool: True if the experiment has ended, False otherwise. - """ - self.logger.debug(f"{self.flwr_ls_command}") - flwr_ls_process = subprocess.run(self.flwr_ls_command, stdout=subprocess.PIPE, text=True) - self.logger.debug(f"{flwr_ls_process}") - flwr_ls_output = json.loads(flwr_ls_process.stdout) - self.logger.debug(f"{flwr_ls_output}") - - for run in flwr_ls_output["runs"]: - if "finished" in run["status"]: - return True - return False \ No newline at end of file From 02c45db6022a678d441e08c95ed5d41052a1c071 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 13 Feb 2025 11:14:40 -0800 Subject: [PATCH 079/107] remove some debug stuff Signed-off-by: kta-intel --- openfl/federated/task/runner_flower.py | 7 ------- .../transport/grpc/connector/flower/local_grpc_client.py | 4 ++-- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index d5570b1af0..557d7edefc 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -47,7 +47,6 @@ def __init__(self, auto_shutdown=True, **kwargs): self.client_port = base_port + self.partition_id self.auto_shutdown = auto_shutdown self.patch = kwargs.get('patch') - self.shutdown_initiated = False # Flag to ensure signal handler runs only once self.shutdown_requested = False # Flag signal shutdown def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): @@ -156,12 +155,6 @@ def terminate_process(process, timeout=5): # Give time for clientapp to stop then directly shutdown the supernode_process time.sleep(10) supernode_process.kill() - # try: - # supernode_process.terminate() - # supernode_process.wait(timeout=5) - # except: - # self.logger.debug(f"Timeout expired while waiting for process {supernode_process.pid} to terminate. 
Killing the process.") - # supernode_process.kill() self.logger.info("SuperNode process terminated.") else: diff --git a/openfl/transport/grpc/connector/flower/local_grpc_client.py b/openfl/transport/grpc/connector/flower/local_grpc_client.py index d8a4fa71fe..8d6cacfc0d 100644 --- a/openfl/transport/grpc/connector/flower/local_grpc_client.py +++ b/openfl/transport/grpc/connector/flower/local_grpc_client.py @@ -25,6 +25,7 @@ def __init__(self, superlink_address, automatic_shutdown=False, is_flwr_serverap self.automatic_shutdown = automatic_shutdown self.end_experiment = False self.is_flwr_serverapp_running_callback = is_flwr_serverapp_running_callback + self.round_number = 0 self.logger = getLogger(__name__) @@ -44,13 +45,12 @@ def send_receive(self, openfl_message, header): deserialized_message = deserialize_flower_message(flower_message) if hasattr(deserialized_message, 'messages_list'): for message in deserialized_message.messages_list: - self.round = message.metadata.group_id + self.round_number = message.metadata.group_id flower_response = self.superlink_stub.SendReceive(flower_message) if self.automatic_shutdown: self.end_experiment = not self.is_flwr_serverapp_running_callback() - print(self.is_flwr_serverapp_running_callback()) openfl_response = flower_to_openfl_message(flower_response, header=header, end_experiment=self.end_experiment) return openfl_response From d9c9913cae9da15750fb3c5b5635ad010f968655 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 13 Feb 2025 11:29:03 -0800 Subject: [PATCH 080/107] remove auto shutdown Signed-off-by: kta-intel --- openfl/federated/task/runner_flower.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 557d7edefc..20b5b1ccbe 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -29,7 +29,7 @@ class FlowerTaskRunner(TaskRunner): - Automatic Shutdown: If enabled, the system will monitor the activity of subprocesses and automatically shut down if no new subprocess starts within a certain time frame. """ - def __init__(self, auto_shutdown=True, **kwargs): + def __init__(self, **kwargs): """ Initializes the FlowerTaskRunner. 
@@ -45,7 +45,6 @@ def __init__(self, auto_shutdown=True, **kwargs): base_port = 5000 self.client_port = base_port + self.partition_id - self.auto_shutdown = auto_shutdown self.patch = kwargs.get('patch') self.shutdown_requested = False # Flag signal shutdown From 160ab8a0b12badc6d46a4f64be208efec62792c6 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 13 Feb 2025 12:01:51 -0800 Subject: [PATCH 081/107] adding additional failsafes Signed-off-by: kta-intel --- .../interoperability/connector_flower.py | 34 ++++++++++++++++--- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py index 1d5f4d79bf..5ad044f950 100644 --- a/openfl/component/interoperability/connector_flower.py +++ b/openfl/component/interoperability/connector_flower.py @@ -30,6 +30,7 @@ def __init__(self, superlink_params: dict, flwr_run_params: dict = None, self.flwr_run_params = flwr_run_params self.local_grpc_client = self._get_local_grpc_client() self.flwr_run_command = self._build_flwr_run_command() if flwr_run_params else None + self.signal_shutdown_sent = False def _get_local_grpc_client(self): """ @@ -102,9 +103,31 @@ def is_flwr_serverapp_running(self): """ Check if the flwr_serverapp subprocess is still running. """ - if hasattr(self, 'flwr_serverapp_subprocess'): - return self.flwr_serverapp_subprocess.poll() is None + if not hasattr(self, 'flwr_serverapp_subprocess'): + self.logger.debug("[OpenFL Connector] ServerApp was never started.") + return False + + if self.flwr_serverapp_subprocess.poll() is None: + self.logger.debug("[OpenFL Connector] ServerApp is still running.") + return True + + if not self.signal_shutdown_sent: + self.signal_shutdown_sent = True + self.logger.info("[OpenFL Connector] Experiment has ended. Sending signal to shut down Flower components.") + return False + + def _stop_flwr_serverapp(self): + """ + Stop the `flwr_serverapp` subprocess if it is still running. + """ + if hasattr(self, 'flwr_serverapp_subprocess') and self.flwr_serverapp_subprocess.poll() is None: + self.logger.debug("[OpenFL Connector] ServerApp still running. Stopping...") + self.flwr_serverapp_subprocess.terminate() + try: + self.flwr_serverapp_subprocess.wait(timeout=5) + except subprocess.TimeoutExpired: + self.flwr_serverapp_subprocess.kill() def _build_flwr_run_command(self) -> list[str]: """ @@ -117,9 +140,9 @@ def _build_flwr_run_command(self) -> list[str]: flwr_app_name = self.flwr_run_params.get("flwr_app_name") if self.flwr_run_params.get("patch"): - command = ["python", "src/patch/flwr_run_patch.py", "run", f"./src/{flwr_app_name}", "--format", "json"] + command = ["python", "src/patch/flwr_run_patch.py", "run", f"./src/{flwr_app_name}"] else: - command = ["flwr", "run", f"./src/{flwr_app_name}", "--format", "json"] + command = ["flwr", "run", f"./src/{flwr_app_name}"] if federation_name: command.append(federation_name) @@ -143,4 +166,5 @@ def stop(self): """ Stop the `flower-superlink` subprocess. 
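The failsafe patch 081 adds to `stop()` is the standard terminate-then-kill escalation for a `subprocess.Popen`; in isolation, with the same five-second timeout, it reads:

```python
import subprocess


def stop_popen(proc: subprocess.Popen, timeout: float = 5.0):
    """Ask a process to exit; kill it if it ignores the request."""
    if proc.poll() is not None:
        return  # already gone
    proc.terminate()
    try:
        proc.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        proc.kill()
        proc.wait()  # reap the process so it does not linger as a zombie
```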
""" - super().stop() \ No newline at end of file + super().stop() + self._stop_flwr_serverapp() \ No newline at end of file From 04d173cbf33473d9d741667b831537557d12c5b2 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 14 Feb 2025 12:01:39 -0800 Subject: [PATCH 082/107] logic to save out model Signed-off-by: kta-intel --- .../src/app-pytorch/app_pytorch/server_app.py | 75 ++++++++++++++++++- openfl/component/collaborator/collaborator.py | 2 +- openfl/federated/task/runner_flower.py | 46 +++++++++++- 3 files changed, 120 insertions(+), 3 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/server_app.py b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/server_app.py index 6e42b57beb..ec9361b46b 100644 --- a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/server_app.py +++ b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/server_app.py @@ -6,6 +6,78 @@ from app_pytorch.task import Net, get_weights +#################################################################################### +# TODO: Consider moving this to a separate file and importing SaveModelStrategy + +from openfl.protocols import utils +from openfl.pipelines import NoCompressionPipeline +def save_model(tensor_dict, round_number, file_path): + model = utils.construct_model_proto( + tensor_dict, round_number, NoCompressionPipeline() + ) + utils.dump_proto(model, file_path) + + +# from flwr.server.strategy import FedAvg +from flwr.server.client_proxy import ClientProxy +from flwr.common import FitRes, Scalar, Parameters, parameters_to_ndarrays, Metrics +from typing import Optional, Union, OrderedDict, List, Tuple +import numpy as np + +net = Net() + +class SaveModelStrategy(FedAvg): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.largest_loss = 1e9 + + def aggregate_fit( + self, + server_round: int, + results: list[tuple[ClientProxy, FitRes]], + failures: list[Union[tuple[ClientProxy, FitRes], BaseException]], + ) -> tuple[Optional[Parameters], dict[str, Scalar]]: + """Aggregate model weights using weighted average and store checkpoint""" + + # Call aggregate_fit from base class (FedAvg) to aggregate parameters and metrics + aggregated_parameters, aggregated_metrics = super().aggregate_fit( + server_round, results, failures + ) + + if aggregated_parameters is not None: + print(f"Saving round {server_round} aggregated_parameters...") + + # Convert `Parameters` to `list[np.ndarray]` + aggregated_ndarrays: list[np.ndarray] = parameters_to_ndarrays( + aggregated_parameters + ) + + + params_dict = OrderedDict(zip(net.state_dict().keys(), aggregated_ndarrays)) + + # Save the model to disk + save_model(params_dict, server_round, './save/last.pbuf') + + if aggregated_metrics["train_loss"] < self.largest_loss: + self.largest_loss = aggregated_metrics["train_loss"] + save_model(params_dict, server_round, './save/best.pbuf') + + return aggregated_parameters, aggregated_metrics + + +def weighted_average(metrics: List[Tuple[int, Metrics]]) -> Metrics: + # Multiply accuracy of each client by number of examples used + + print(metrics) + losses = [num_examples * m["train_loss"] for num_examples, m in metrics] + examples = [num_examples for num_examples, _ in metrics] + + # Aggregate and return custom metric (weighted average) + return {"train_loss": sum(losses) / sum(examples)} + +##################################################################################### + + def server_fn(context: Context): # Read from config num_rounds = 
context.run_config["num-server-rounds"] @@ -16,7 +88,8 @@ def server_fn(context: Context): parameters = ndarrays_to_parameters(ndarrays) # Define strategy - strategy = FedAvg( + strategy = SaveModelStrategy( + fit_metrics_aggregation_fn=weighted_average, fraction_fit=fraction_fit, fraction_evaluate=1.0, min_available_clients=2, diff --git a/openfl/component/collaborator/collaborator.py b/openfl/component/collaborator/collaborator.py index a8e92b1f70..70e5a49cc7 100644 --- a/openfl/component/collaborator/collaborator.py +++ b/openfl/component/collaborator/collaborator.py @@ -274,7 +274,7 @@ def do_task(self, task, round_number) -> dict: # TODO: better to use self.send_task_results(global_output_tensor_dict, round_number, task_name) # maybe set global_output_tensor to empty self.client.send_local_task_results(self.collaborator_name, round_number, task_name) - metrics = {'collaborator1/start_client_adapter': 'Completed'} + metrics = {f'{self.collaborator_name}/start_client_adapter': 'Completed'} return metrics else: raise AttributeError(f"{func_name} is not callable on {self.task_runner}") diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 20b5b1ccbe..8173c609e4 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -11,6 +11,8 @@ import psutil import time import os +import numpy as np +from pathlib import Path os.environ["FLWR_HOME"] = os.path.join(os.getcwd(), "src/.flwr") os.makedirs(os.environ["FLWR_HOME"], exist_ok=True) @@ -39,6 +41,7 @@ def __init__(self, **kwargs): **kwargs: Additional parameters to pass to the functions. """ super().__init__(**kwargs) + self.model = None self.logger = getLogger(__name__) self.num_partitions = self.data_loader.get_node_configs()[0] self.partition_id = self.data_loader.get_node_configs()[1] @@ -175,4 +178,45 @@ def terminate_process(process, timeout=5): signal_handler(signal.SIGTERM, None) time.sleep(0.1) except KeyboardInterrupt: - signal_handler(signal.SIGINT, None) \ No newline at end of file + signal_handler(signal.SIGINT, None) + + def set_tensor_dict(self, tensor_dict, with_opt_vars=False): + """Set the tensor dictionary. + To be framework agnostic, this method will not attempt to load the weights into the model + and save out the native format. Instead, it will load and save the dictionary directly + + Args: + tensor_dict (dict): The tensor dictionary. + with_opt_vars (bool): This argument is inherited from the parent class + but is not used in the FlowerTaskRunner. + """ + self.tensor_dict = tensor_dict + + def save_native( + self, + filepath, + **kwargs, + ): + """ + Save model weights in a .npz file specified by the filepath. + The model weights are stored as a dictionary of np.ndarray + + Args: + filepath (str): Path to the .npz file to be created by np.savez(). + **kwargs: Additional parameters (currently not used). + + Returns: + None + + Raises: + AssertionError: If the file extension is not '.npz'. + """ + # Ensure the file extension is .npz + if isinstance(filepath, Path): + filepath = str(filepath) + + # Ensure the file extension is .npz + assert filepath.endswith('.npz'), "Currently, only '.npz' file type is supported." + + # Save the tensor dictionary to a .npz file + np.savez(filepath, **self.tensor_dict) From 6a558c759d1d1dc009aec2920de23a46d3e0e1b4 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 20 Feb 2025 07:16:35 -0800 Subject: [PATCH 083/107] stable commit. TODO: fix docstrings. 
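Patch 082's `save_native` writes the tensor dictionary with `np.savez`, which makes reading it back symmetric. A quick round-trip sketch — the array names here are invented for illustration:

```python
import numpy as np

tensors = {
    "conv1.weight": np.random.rand(6, 3, 5, 5).astype(np.float32),
    "conv1.bias": np.zeros(6, dtype=np.float32),
}

np.savez("model.npz", **tensors)  # one named array per tensor

with np.load("model.npz") as data:  # NpzFile is a lazy, closeable mapping
    restored = {name: data[name] for name in data.files}

assert all(np.array_equal(tensors[k], restored[k]) for k in tensors)
```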
NotImplemented: callbacks Signed-off-by: kta-intel --- .../interoperability/connector_flower.py | 22 ++++++----- .../connector/flower/deserialize_message.py | 37 ------------------- .../connector/flower/local_grpc_client.py | 19 ++++------ .../connector/flower/local_grpc_server.py | 4 +- .../connector/flower/message_conversion.py | 5 ++- 5 files changed, 24 insertions(+), 63 deletions(-) diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py index 5ad044f950..d6b99ce153 100644 --- a/openfl/component/interoperability/connector_flower.py +++ b/openfl/component/interoperability/connector_flower.py @@ -1,5 +1,4 @@ import subprocess -import json from openfl.component.interoperability.connector import Connector from openfl.transport.grpc.connector.flower.local_grpc_client import LocalGRPCClient @@ -13,13 +12,16 @@ class ConnectorFlower(Connector): Responsible for generating the Flower server command. """ - def __init__(self, superlink_params: dict, flwr_run_params: dict = None, - automatic_shutdown: bool = False, **kwargs): + def __init__(self, + superlink_params: dict, + flwr_run_params: dict, + automatic_shutdown: bool = False, + **kwargs): """ Initialize ConnectorFlower by building the server command from the superlink_params. Args: superlink_params (dict): A dictionary of Flower server settings. - flwr_run_params (dict, optional): A dictionary containing the Flower run parameters. Defaults to None. + flwr_run_params (dict): A dictionary containing the Flower run parameters. """ self.automatic_shutdown = automatic_shutdown self.superlink_params = superlink_params @@ -28,8 +30,9 @@ def __init__(self, superlink_params: dict, flwr_run_params: dict = None, super().__init__(command, component_name="Flower") self.flwr_run_params = flwr_run_params + self.flwr_run_command = self._build_flwr_run_command() + self.local_grpc_client = self._get_local_grpc_client() - self.flwr_run_command = self._build_flwr_run_command() if flwr_run_params else None self.signal_shutdown_sent = False def _get_local_grpc_client(self): @@ -155,12 +158,11 @@ def start(self): """ super().start() - if self.flwr_run_command: - self.logger.info(f"[OpenFL Connector] Starting `flwr run` subprocess: {' '.join(self.flwr_run_command)}") - subprocess.run(self.flwr_run_command) + self.logger.info(f"[OpenFL Connector] Starting `flwr run` subprocess: {' '.join(self.flwr_run_command)}") + subprocess.run(self.flwr_run_command) - if self.flwr_serverapp_command: - self.flwr_serverapp_subprocess = subprocess.Popen(self.flwr_serverapp_command) + if self.flwr_serverapp_command: + self.flwr_serverapp_subprocess = subprocess.Popen(self.flwr_serverapp_command) def stop(self): """ diff --git a/openfl/transport/grpc/connector/flower/deserialize_message.py b/openfl/transport/grpc/connector/flower/deserialize_message.py index 9de362b77b..46f37e8acc 100644 --- a/openfl/transport/grpc/connector/flower/deserialize_message.py +++ b/openfl/transport/grpc/connector/flower/deserialize_message.py @@ -1,39 +1,6 @@ import importlib -# import os from google.protobuf.message import DecodeError -# def get_next_log_filename(log_dir='./logs'): -# """ -# Get the next log filename based on the existing log files in the directory. - -# Args: -# log_dir: The directory where log files are stored. - -# Returns: -# The next log filename. 
-# """ -# if not os.path.exists(log_dir): -# os.makedirs(log_dir) - -# existing_logs = [f for f in os.listdir(log_dir) if f.endswith('.log')] -# if not existing_logs: -# return os.path.join(log_dir, '1.log') - -# existing_numbers = [int(f.split('.')[0]) for f in existing_logs] -# next_number = max(existing_numbers) + 1 -# return os.path.join(log_dir, f'{next_number}.log') - -# def save_message_to_log(message, log_filename): -# """ -# Save the message to a log file. - -# Args: -# message: The message object to save. -# log_filename: The filename of the log file. -# """ -# with open(log_filename, 'w') as log_file: -# log_file.write(str(message)) - def deserialize_flower_message(flower_message): """ Deserialize the grpc_message_content of a Flower message using the module and class name @@ -71,8 +38,4 @@ def deserialize_flower_message(flower_message): print(f"Failed to deserialize message content. Error: {e}") return None - # Save the message to a log file - # log_filename = get_next_log_filename() - # save_message_to_log(message, log_filename) - # print(f"Message saved to {log_filename}") return message \ No newline at end of file diff --git a/openfl/transport/grpc/connector/flower/local_grpc_client.py b/openfl/transport/grpc/connector/flower/local_grpc_client.py index 8d6cacfc0d..90e8c78330 100644 --- a/openfl/transport/grpc/connector/flower/local_grpc_client.py +++ b/openfl/transport/grpc/connector/flower/local_grpc_client.py @@ -1,9 +1,6 @@ import grpc -import subprocess -import json from flwr.proto import grpcadapter_pb2_grpc from openfl.transport.grpc.connector.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message -from openfl.transport.grpc.connector.flower.deserialize_message import deserialize_flower_message from logging import getLogger class LocalGRPCClient: @@ -12,7 +9,8 @@ class LocalGRPCClient: and the OpenFL Server. It converts messages between OpenFL and Flower formats and handles the send-receive communication with the Flower SuperNode using gRPC. """ - def __init__(self, superlink_address, automatic_shutdown=False, is_flwr_serverapp_running_callback=None): + def __init__(self, superlink_address, automatic_shutdown=False, + is_flwr_serverapp_running_callback=None): """ Initialize. @@ -41,16 +39,15 @@ def send_receive(self, openfl_message, header): The response from the Flower SuperLink, converted back to OpenFL format. 
""" flower_message = openfl_to_flower_message(openfl_message) - - deserialized_message = deserialize_flower_message(flower_message) - if hasattr(deserialized_message, 'messages_list'): - for message in deserialized_message.messages_list: - self.round_number = message.metadata.group_id - flower_response = self.superlink_stub.SendReceive(flower_message) if self.automatic_shutdown: + # Check if the flwr_serverapp subprocess is still running, if it isn't + # then the experiment has completed self.end_experiment = not self.is_flwr_serverapp_running_callback() - openfl_response = flower_to_openfl_message(flower_response, header=header, end_experiment=self.end_experiment) + openfl_response = flower_to_openfl_message(flower_response, + header=header, + end_experiment=self.end_experiment) + return openfl_response diff --git a/openfl/transport/grpc/connector/flower/local_grpc_server.py b/openfl/transport/grpc/connector/flower/local_grpc_server.py index e6c85921b8..f6a71aed35 100644 --- a/openfl/transport/grpc/connector/flower/local_grpc_server.py +++ b/openfl/transport/grpc/connector/flower/local_grpc_server.py @@ -1,6 +1,5 @@ import threading import queue -import grpc from flwr.proto import grpcadapter_pb2_grpc from openfl.transport.grpc.connector.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message @@ -28,7 +27,6 @@ def __init__(self, openfl_client, collaborator_name, message_callback): self.processing_thread = threading.Thread(target=self.process_queue) self.processing_thread.daemon = True self.processing_thread.start() - self.shutting_down = False # Flag to indicate if the server is shutting down def SendReceive(self, request, context): """ Handles incoming gRPC requests by putting them into the request queue and waiting for the response. @@ -50,7 +48,7 @@ def process_queue(self): """ while True: request, response_queue = self.request_queue.get() - openfl_request = flower_to_openfl_message(request, header=None) + openfl_request = flower_to_openfl_message(request) # Send request to the OpenFL server openfl_response = self.openfl_client.send_message_to_server(openfl_request, self.collaborator_name) diff --git a/openfl/transport/grpc/connector/flower/message_conversion.py b/openfl/transport/grpc/connector/flower/message_conversion.py index f5279c64c1..ded2235659 100644 --- a/openfl/transport/grpc/connector/flower/message_conversion.py +++ b/openfl/transport/grpc/connector/flower/message_conversion.py @@ -1,8 +1,9 @@ from flwr.proto import grpcadapter_pb2 from openfl.protocols import aggregator_pb2 -# from openfl.transport.grpc.connector.flower.deserialize_message import deserialize_flower_message -def flower_to_openfl_message(flower_message, header=None, end_experiment=False): +def flower_to_openfl_message(flower_message, + header=None, + end_experiment=False): """ Convert a Flower MessageContainer to an OpenFL DropPod. 
From 306f7ffe34cd51f0a53b3fe94f5f5c65ffc364b4 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 20 Feb 2025 09:15:16 -0800 Subject: [PATCH 084/107] stable commit 2.0: switched order of stop to kill serverapp first Signed-off-by: kta-intel --- openfl/component/interoperability/connector_flower.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py index d6b99ce153..bdf97c3ee8 100644 --- a/openfl/component/interoperability/connector_flower.py +++ b/openfl/component/interoperability/connector_flower.py @@ -168,5 +168,5 @@ def stop(self): """ Stop the `flower-superlink` subprocess. """ - super().stop() - self._stop_flwr_serverapp() \ No newline at end of file + self._stop_flwr_serverapp() + super().stop() \ No newline at end of file From 32e96d0362eb497e3035b62fab9bff6715e39f37 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 20 Feb 2025 10:11:38 -0800 Subject: [PATCH 085/107] stable commit 3.0: new self attribute for serverapp Signed-off-by: kta-intel --- openfl/component/interoperability/connector_flower.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py index bdf97c3ee8..69e71823ca 100644 --- a/openfl/component/interoperability/connector_flower.py +++ b/openfl/component/interoperability/connector_flower.py @@ -25,7 +25,6 @@ def __init__(self, """ self.automatic_shutdown = automatic_shutdown self.superlink_params = superlink_params - self.flwr_serverapp_command = None command = self._build_command() super().__init__(command, component_name="Flower") @@ -161,7 +160,7 @@ def start(self): self.logger.info(f"[OpenFL Connector] Starting `flwr run` subprocess: {' '.join(self.flwr_run_command)}") subprocess.run(self.flwr_run_command) - if self.flwr_serverapp_command: + if hasattr(self, 'flwr_serverapp_command') and self.flwr_serverapp_command: self.flwr_serverapp_subprocess = subprocess.Popen(self.flwr_serverapp_command) def stop(self): From 7c413f173cd25b5ec9df213f8b5e67b4d5d18c12 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 20 Feb 2025 10:19:07 -0800 Subject: [PATCH 086/107] stable commit 4.0: adding flower taskrunner and dataloader to init Signed-off-by: kta-intel --- openfl/federated/__init__.py | 6 ++++++ openfl/federated/data/__init__.py | 4 ++++ openfl/federated/task/__init__.py | 3 +++ 3 files changed, 13 insertions(+) diff --git a/openfl/federated/__init__.py b/openfl/federated/__init__.py index 1e01b03986..192ec16e31 100644 --- a/openfl/federated/__init__.py +++ b/openfl/federated/__init__.py @@ -27,6 +27,12 @@ from openfl.federated.data import XGBoostDataLoader from openfl.federated.task import FederatedModel # NOQA from openfl.federated.task import XGBoostTaskRunner +if util.find_spec("flwr") is not None: + from openfl.federated.data import FederatedDataSet # NOQA + from openfl.federated.data import FlowerDataLoader + from openfl.federated.task import FederatedModel # NOQA + from openfl.federated.task import FlowerTaskRunner + __all__ = [ "Plan", diff --git a/openfl/federated/data/__init__.py b/openfl/federated/data/__init__.py index fe12bd9f5a..e9efc0718c 100644 --- a/openfl/federated/data/__init__.py +++ b/openfl/federated/data/__init__.py @@ -26,3 +26,7 @@ if util.find_spec("xgboost") is not None: from openfl.federated.data.federated_data import FederatedDataSet # NOQA from openfl.federated.data.loader_xgb import 
XGBoostDataLoader # NOQA + +if util.find_spec("flwr") is not None: + from openfl.federated.data.federated_data import FederatedDataSet # NOQA + from openfl.federated.data.loader_flower import FlowerDataLoader # NOQA diff --git a/openfl/federated/task/__init__.py b/openfl/federated/task/__init__.py index 5b79f28193..8e1fb91bb5 100644 --- a/openfl/federated/task/__init__.py +++ b/openfl/federated/task/__init__.py @@ -24,3 +24,6 @@ if util.find_spec("xgboost") is not None: from openfl.federated.task.fl_model import FederatedModel # NOQA from openfl.federated.task.runner_xgb import XGBoostTaskRunner # NOQA +if util.find_spec("flwr") is not None: + from openfl.federated.task.fl_model import FederatedModel # NOQA + from openfl.federated.task.runner_flower import FlowerTaskRunner # NOQA From 676c2737c463a2630bc0243a80b7989a27ae3f57 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 20 Feb 2025 11:51:01 -0800 Subject: [PATCH 087/107] keep openfl_client at the collaborator Signed-off-by: kta-intel --- openfl/component/collaborator/collaborator.py | 4 +++- openfl/federated/task/runner_flower.py | 7 ++++--- .../transport/grpc/connector/flower/local_grpc_server.py | 9 ++++++--- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/openfl/component/collaborator/collaborator.py b/openfl/component/collaborator/collaborator.py index 70e5a49cc7..fd45cf53a3 100644 --- a/openfl/component/collaborator/collaborator.py +++ b/openfl/component/collaborator/collaborator.py @@ -14,6 +14,7 @@ from openfl.pipelines import NoCompressionPipeline, TensorCodec from openfl.protocols import utils from openfl.utilities import TensorKey +from openfl.transport.grpc.connector.flower.local_grpc_server import LocalGRPCServer logger = logging.getLogger(__name__) @@ -270,7 +271,8 @@ def do_task(self, task, round_number) -> dict: if hasattr(self.task_runner, func_name): method = getattr(self.task_runner, func_name) if callable(method): - method(self.client, self.collaborator_name, **kwargs) + local_grpc_server = LocalGRPCServer(self.client, self.collaborator_name) + method(local_grpc_server, **kwargs) # TODO: better to use self.send_task_results(global_output_tensor_dict, round_number, task_name) # maybe set global_output_tensor to empty self.client.send_local_task_results(self.collaborator_name, round_number, task_name) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 8173c609e4..4a6ea529e7 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -4,7 +4,6 @@ from flwr.proto import grpcadapter_pb2_grpc from multiprocessing import cpu_count from openfl.federated.task.runner import TaskRunner -from openfl.transport.grpc.connector.flower.local_grpc_server import LocalGRPCServer import subprocess from logging import getLogger import signal @@ -51,7 +50,7 @@ def __init__(self, **kwargs): self.patch = kwargs.get('patch') self.shutdown_requested = False # Flag signal shutdown - def start_client_adapter(self, openfl_client, collaborator_name, **kwargs): + def start_client_adapter(self, local_grpc_server, **kwargs): """ Starts the local gRPC server and the Flower SuperNode. 
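
[Editor's note] The hunks below replace a constructor-injected callback with a setter (`set_end_experiment_callback`), so the servicer can be constructed by the collaborator while the shutdown policy stays with the task runner. A runnable sketch of that pattern follows; all names here are illustrative rather than taken from OpenFL:

```python
import threading

class Servicer:
    """Stand-in for the local gRPC servicer; the real class is registered with grpc."""
    def __init__(self):
        self._end_experiment_callback = None

    def set_end_experiment_callback(self, callback):
        self._end_experiment_callback = callback

    def on_final_message(self):
        # Invoked when a response arrives with the end-of-experiment flag set.
        if self._end_experiment_callback is not None:
            self._end_experiment_callback()

class Runner:
    def __init__(self, servicer):
        self._shutdown = threading.Event()
        # The runner owns the shutdown decision; the servicer only reports the event.
        servicer.set_end_experiment_callback(self._shutdown.set)

    def run(self):
        # Stand-in for the loop that supervises the SuperNode subprocess.
        while not self._shutdown.wait(timeout=0.1):
            pass

servicer = Servicer()
runner = Runner(servicer)
threading.Timer(0.3, servicer.on_final_message).start()
runner.run()  # returns once the callback fires
```
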
@@ -83,9 +82,11 @@ def message_callback(): """ self.shutdown_requested = True + local_grpc_server.set_end_experiment_callback(message_callback) + server = grpc.server(ThreadPoolExecutor(max_workers=cpu_count())) grpcadapter_pb2_grpc.add_GrpcAdapterServicer_to_server( - LocalGRPCServer(openfl_client, collaborator_name, message_callback), server + local_grpc_server, server ) server.add_insecure_port(f'[::]:{local_server_port}') server.start() diff --git a/openfl/transport/grpc/connector/flower/local_grpc_server.py b/openfl/transport/grpc/connector/flower/local_grpc_server.py index f6a71aed35..56b27a730c 100644 --- a/openfl/transport/grpc/connector/flower/local_grpc_server.py +++ b/openfl/transport/grpc/connector/flower/local_grpc_server.py @@ -11,7 +11,7 @@ class LocalGRPCServer(grpcadapter_pb2_grpc.GrpcAdapterServicer): request handling issues. """ - def __init__(self, openfl_client, collaborator_name, message_callback): + def __init__(self, openfl_client, collaborator_name): """ Initialize. @@ -22,12 +22,15 @@ def __init__(self, openfl_client, collaborator_name, message_callback): """ self.openfl_client = openfl_client self.collaborator_name = collaborator_name - self.message_callback = message_callback + self.end_experiment_callback = None self.request_queue = queue.Queue() self.processing_thread = threading.Thread(target=self.process_queue) self.processing_thread.daemon = True self.processing_thread.start() + def set_end_experiment_callback(self, callback): + self.end_experiment_callback = callback + def SendReceive(self, request, context): """ Handles incoming gRPC requests by putting them into the request queue and waiting for the response. Args: @@ -56,7 +59,7 @@ def process_queue(self): # Check to end experiment if hasattr(openfl_response, 'metadata'): if openfl_response.metadata['end_experiment'] == 'True': - self.message_callback() + self.end_experiment_callback() # Send response to Flower client flower_response = openfl_to_flower_message(openfl_response) From 7dc9de02a10778040cff223ac2f873303ef84a8c Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 20 Feb 2025 12:31:50 -0800 Subject: [PATCH 088/107] move serverapp callback out Signed-off-by: kta-intel --- openfl/component/interoperability/connector_flower.py | 3 ++- .../grpc/connector/flower/local_grpc_client.py | 11 ++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py index 69e71823ca..3bd7d5e9d6 100644 --- a/openfl/component/interoperability/connector_flower.py +++ b/openfl/component/interoperability/connector_flower.py @@ -45,7 +45,7 @@ def _get_local_grpc_client(self): """ connector_address = self.superlink_params.get("fleet-api-address", "0.0.0.0:9092") - return LocalGRPCClient(connector_address, self.automatic_shutdown, self.is_flwr_serverapp_running) + return LocalGRPCClient(connector_address, self.automatic_shutdown) def _build_command(self) -> list[str]: """ @@ -161,6 +161,7 @@ def start(self): subprocess.run(self.flwr_run_command) if hasattr(self, 'flwr_serverapp_command') and self.flwr_serverapp_command: + self.local_grpc_client.set_is_flwr_serverapp_running_callback(self.is_flwr_serverapp_running) self.flwr_serverapp_subprocess = subprocess.Popen(self.flwr_serverapp_command) def stop(self): diff --git a/openfl/transport/grpc/connector/flower/local_grpc_client.py b/openfl/transport/grpc/connector/flower/local_grpc_client.py index 90e8c78330..90d7618be0 100644 --- 
a/openfl/transport/grpc/connector/flower/local_grpc_client.py +++ b/openfl/transport/grpc/connector/flower/local_grpc_client.py @@ -9,8 +9,7 @@ class LocalGRPCClient: and the OpenFL Server. It converts messages between OpenFL and Flower formats and handles the send-receive communication with the Flower SuperNode using gRPC. """ - def __init__(self, superlink_address, automatic_shutdown=False, - is_flwr_serverapp_running_callback=None): + def __init__(self, superlink_address, automatic_shutdown=False): """ Initialize. @@ -22,11 +21,13 @@ def __init__(self, superlink_address, automatic_shutdown=False, self.automatic_shutdown = automatic_shutdown self.end_experiment = False - self.is_flwr_serverapp_running_callback = is_flwr_serverapp_running_callback - self.round_number = 0 + self.is_flwr_serverapp_running_callback = None self.logger = getLogger(__name__) + def set_is_flwr_serverapp_running_callback(self, is_flwr_serverapp_running_callback): + self.is_flwr_serverapp_running_callback = is_flwr_serverapp_running_callback + def send_receive(self, openfl_message, header): """ Sends a message to the Flower SuperLink and receives the response. @@ -41,7 +42,7 @@ def send_receive(self, openfl_message, header): flower_message = openfl_to_flower_message(openfl_message) flower_response = self.superlink_stub.SendReceive(flower_message) - if self.automatic_shutdown: + if self.automatic_shutdown and self.is_flwr_serverapp_running_callback: # Check if the flwr_serverapp subprocess is still running, if it isn't # then the experiment has completed self.end_experiment = not self.is_flwr_serverapp_running_callback() From 70b02a80c708f9931cdd86c91af3261b4b78d5b5 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 20 Feb 2025 14:00:31 -0800 Subject: [PATCH 089/107] attempts to decouple lgs from task runner - still WIP Signed-off-by: kta-intel --- .../flower-app-pytorch/plan/plan.yaml | 2 + openfl/component/collaborator/collaborator.py | 4 +- openfl/federated/task/runner_flower.py | 99 ++----------------- openfl/transport/grpc/connector/__init__.py | 3 + .../connector/flower/local_grpc_server.py | 73 +++++++++++++- 5 files changed, 87 insertions(+), 94 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml index a45f7f5562..34b8d75f45 100644 --- a/openfl-workspace/flower-app-pytorch/plan/plan.yaml +++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml @@ -56,6 +56,8 @@ assigner : tasks : defaults : plan/defaults/tasks_connector.yaml + settings : + connect_to : Flower compression_pipeline : defaults : plan/defaults/compression_pipeline.yaml \ No newline at end of file diff --git a/openfl/component/collaborator/collaborator.py b/openfl/component/collaborator/collaborator.py index fd45cf53a3..197fe2911b 100644 --- a/openfl/component/collaborator/collaborator.py +++ b/openfl/component/collaborator/collaborator.py @@ -14,7 +14,7 @@ from openfl.pipelines import NoCompressionPipeline, TensorCodec from openfl.protocols import utils from openfl.utilities import TensorKey -from openfl.transport.grpc.connector.flower.local_grpc_server import LocalGRPCServer +from openfl.transport.grpc import connector logger = logging.getLogger(__name__) @@ -271,6 +271,8 @@ def do_task(self, task, round_number) -> dict: if hasattr(self.task_runner, func_name): method = getattr(self.task_runner, func_name) if callable(method): + framework = self.task_config['settings']["connect_to"] + LocalGRPCServer = connector.get_local_grpc_server(framework) local_grpc_server = 
LocalGRPCServer(self.client, self.collaborator_name) method(local_grpc_server, **kwargs) # TODO: better to use self.send_task_results(global_output_tensor_dict, round_number, task_name) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 4a6ea529e7..63459baded 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -53,44 +53,15 @@ def __init__(self, **kwargs): def start_client_adapter(self, local_grpc_server, **kwargs): """ Starts the local gRPC server and the Flower SuperNode. - - Args: - openfl_client: The OpenFL client instance used to communicate with the OpenFL server. - collaborator_name: The name of the collaborator. - **kwargs: Additional parameters, including 'local_server_port'. - - The method performs the following steps: - 1. Starts a local gRPC server to handle communication between the OpenFL client and the Flower SuperNode. - 2. Launches the Flower SuperNode in a subprocess. - 3. Sets up signal handlers for manual shutdown (via CTRL+C). - 4. If auto_shutdown is enabled, monitors run activity and initiates shutdown if no new subprocesses start within the expected time frame. - - Shutdown Process: - - When a shutdown signal (SIGINT or SIGTERM) is received, the method will: - 1. Terminate all child processes of the SuperNode subprocess. - 2. Terminate the main SuperNode subprocess. - 3. Stop the gRPC server. - 4. Log the shutdown process and set the termination event to stop the server. """ local_server_port = kwargs.get('local_server_port') def message_callback(): - """ - Callback function to handle messaging events. - If auto_shutdown is enabled, logs a message indicating that the final reply - has been sent and triggers the SIGTERM signal handler to initiate shutdown. - """ self.shutdown_requested = True + # TODO: Can we isolate the local_grpc_server from the task runner? local_grpc_server.set_end_experiment_callback(message_callback) - - server = grpc.server(ThreadPoolExecutor(max_workers=cpu_count())) - grpcadapter_pb2_grpc.add_GrpcAdapterServicer_to_server( - local_grpc_server, server - ) - server.add_insecure_port(f'[::]:{local_server_port}') - server.start() - self.logger.info(f"OpenFL local gRPC server started, listening on port {local_server_port}.") + local_grpc_server.start_server(local_server_port) if self.patch: command = [ @@ -113,73 +84,19 @@ def message_callback(): ] supernode_process = subprocess.Popen(command, shell=False) - - termination_event = threading.Event() - - def signal_handler(_sig, _frame): - """ - Handles shutdown signals (SIGINT or SIGTERM) to terminate the SuperNode process and stop the local gRPC server. - Args: - _sig: The signal number. - _frame: The current stack frame (not used). - """ - - def terminate_process(process, timeout=5): - """ - Helper function to terminate a process gracefully. - Args: - process: The process to terminate. - timeout: The timeout for waiting for the process to terminate. - """ - try: - process.terminate() - process.wait(timeout=timeout) - except psutil.TimeoutExpired: - self.logger.debug(f"Timeout expired while waiting for process {process.pid} to terminate. Killing the process.") - process.kill() - except psutil.NoSuchProcess: - self.logger.debug(f"Process {process.pid} does not exist. 
Skipping.") - pass - - if supernode_process.poll() is None: - try: - main_subprocess = psutil.Process(supernode_process.pid) - client_app_processes = main_subprocess.children(recursive=True) - - for client_app_process in client_app_processes: - terminate_process(client_app_process) - - terminate_process(main_subprocess) - self.logger.info("SuperNode process terminated.") - - except Exception as e: - self.logger.debug(f"Error during graceful shutdown: {e}") - # Gramine does not detect psutil.Process - # Give time for clientapp to stop then directly shutdown the supernode_process - time.sleep(10) - supernode_process.kill() - - self.logger.info("SuperNode process terminated.") - else: - self.logger.info("SuperNode process already terminated.") - - self.logger.info("Shutting down local gRPC server...") - server.stop(0) - self.logger.info("local gRPC server stopped.") - termination_event.set() - - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) + local_grpc_server.handle_signals(supernode_process) self.logger.info("Press CTRL+C to stop the server and SuperNode process.") try: - while not termination_event.is_set(): + while not local_grpc_server.termination_event.is_set(): if self.shutdown_requested: - signal_handler(signal.SIGTERM, None) + local_grpc_server.terminate_supernode_process(supernode_process) + local_grpc_server.stop_server() time.sleep(0.1) except KeyboardInterrupt: - signal_handler(signal.SIGINT, None) + local_grpc_server.terminate_supernode_process(supernode_process) + local_grpc_server.stop_server() def set_tensor_dict(self, tensor_dict, with_opt_vars=False): """Set the tensor dictionary. diff --git a/openfl/transport/grpc/connector/__init__.py b/openfl/transport/grpc/connector/__init__.py index e69de29bb2..8bc7eca6d9 100644 --- a/openfl/transport/grpc/connector/__init__.py +++ b/openfl/transport/grpc/connector/__init__.py @@ -0,0 +1,3 @@ +from openfl.transport.grpc.connector.utils import get_local_grpc_server + +__all__ = ['get_local_grpc_server'] \ No newline at end of file diff --git a/openfl/transport/grpc/connector/flower/local_grpc_server.py b/openfl/transport/grpc/connector/flower/local_grpc_server.py index 56b27a730c..7000380824 100644 --- a/openfl/transport/grpc/connector/flower/local_grpc_server.py +++ b/openfl/transport/grpc/connector/flower/local_grpc_server.py @@ -1,7 +1,17 @@ +import logging import threading import queue +import grpc +from concurrent.futures import ThreadPoolExecutor from flwr.proto import grpcadapter_pb2_grpc from openfl.transport.grpc.connector.flower.message_conversion import flower_to_openfl_message, openfl_to_flower_message +from multiprocessing import cpu_count +import signal +import psutil +import time + +logger = logging.getLogger(__name__) + class LocalGRPCServer(grpcadapter_pb2_grpc.GrpcAdapterServicer): """ @@ -18,7 +28,6 @@ def __init__(self, openfl_client, collaborator_name): Args: openfl_client: An instance of the OpenFL Client. collaborator_name: The name of the collaborator. - message_callback: A callback function to be called when a specific message is received. 
""" self.openfl_client = openfl_client self.collaborator_name = collaborator_name @@ -27,10 +36,28 @@ def __init__(self, openfl_client, collaborator_name): self.processing_thread = threading.Thread(target=self.process_queue) self.processing_thread.daemon = True self.processing_thread.start() + self.server = None + self.termination_event = threading.Event() def set_end_experiment_callback(self, callback): self.end_experiment_callback = callback + def start_server(self, local_server_port): + """Starts the gRPC server.""" + self.server = grpc.server(ThreadPoolExecutor(max_workers=cpu_count())) + grpcadapter_pb2_grpc.add_GrpcAdapterServicer_to_server(self, self.server) + self.server.add_insecure_port(f'[::]:{local_server_port}') + self.server.start() + logger.info(f"OpenFL local gRPC server started, listening on port {local_server_port}.") + + def stop_server(self): + """Stops the gRPC server.""" + if self.server: + logger.info("Shutting down local gRPC server...") + self.server.stop(0) + logger.info("local gRPC server stopped.") + self.termination_event.set() + def SendReceive(self, request, context): """ Handles incoming gRPC requests by putting them into the request queue and waiting for the response. Args: @@ -64,4 +91,46 @@ def process_queue(self): # Send response to Flower client flower_response = openfl_to_flower_message(openfl_response) response_queue.put(flower_response) - self.request_queue.task_done() \ No newline at end of file + self.request_queue.task_done() + + def handle_signals(self, supernode_process): + """Sets up signal handlers for graceful shutdown.""" + def signal_handler(_sig, _frame): + self.terminate_supernode_process(supernode_process) + self.stop_server() + + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + def terminate_supernode_process(self, supernode_process): + """Terminates the SuperNode process.""" + if supernode_process.poll() is None: + try: + main_subprocess = psutil.Process(supernode_process.pid) + client_app_processes = main_subprocess.children(recursive=True) + + for client_app_process in client_app_processes: + self.terminate_process(client_app_process) + + self.terminate_process(main_subprocess) + logger.info("SuperNode process terminated.") + + except Exception as e: + logger.debug(f"Error during graceful shutdown: {e}") + time.sleep(10) + supernode_process.kill() + logger.info("SuperNode process terminated.") + else: + logger.info("SuperNode process already terminated.") + + def terminate_process(self, process, timeout=5): + """Helper function to terminate a process gracefully.""" + try: + process.terminate() + process.wait(timeout=timeout) + except psutil.TimeoutExpired: + logger.debug(f"Timeout expired while waiting for process {process.pid} to terminate. Killing the process.") + process.kill() + except psutil.NoSuchProcess: + logger.debug(f"Process {process.pid} does not exist. 
Skipping.") + pass \ No newline at end of file From 9265d219458cda61bd909976152a049ec6f2faa2 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 20 Feb 2025 14:04:30 -0800 Subject: [PATCH 090/107] add connector utils Signed-off-by: kta-intel --- openfl/transport/grpc/connector/utils.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 openfl/transport/grpc/connector/utils.py diff --git a/openfl/transport/grpc/connector/utils.py b/openfl/transport/grpc/connector/utils.py new file mode 100644 index 0000000000..36d8b75c6b --- /dev/null +++ b/openfl/transport/grpc/connector/utils.py @@ -0,0 +1,10 @@ +import importlib + +def get_local_grpc_server(framework: str = 'Flower') -> object: + if framework == 'Flower': + try: + module = importlib.import_module('openfl.transport.grpc.connector.flower.local_grpc_server') + return module.LocalGRPCServer + except ImportError: + print("Flower is not installed.") + return None \ No newline at end of file From 54cb6290236ebb4eb95d81857ea7f7589384ac13 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 21 Feb 2025 08:15:36 -0800 Subject: [PATCH 091/107] more seamless separate of base class and extensibility of sub class Signed-off-by: kta-intel --- .../component/interoperability/connector.py | 76 +++++------------- .../interoperability/connector_flower.py | 80 +++++++++++++------ 2 files changed, 72 insertions(+), 84 deletions(-) diff --git a/openfl/component/interoperability/connector.py b/openfl/component/interoperability/connector.py index 1040ccc480..ec371a4387 100644 --- a/openfl/component/interoperability/connector.py +++ b/openfl/component/interoperability/connector.py @@ -1,89 +1,49 @@ -import subprocess -import psutil import signal import sys from logging import getLogger +from abc import ABC, abstractmethod -class Connector: +class Connector(ABC): """ - A skeletal base class for managing a server process of an external federated learning framework and - the connection with OpenFL's server + Abstract base class for managing a server process of an external federated learning framework + and the connection with OpenFL's server. """ - def __init__(self, command: list[str], component_name: str = "Base", **kwargs): + def __init__(self, component_name: str = "Base", **kwargs): """ - Initialize the OpenFL Connector. + Initialize the BaseConnector. + Args: command (list[str]): The command to run the server process. component_name (str): The name of the specific Connector component being used. """ - self.local_grpc_client = None - self._command = command - self._process = None self.logger = getLogger(__name__) self.component_name = component_name + self.local_grpc_client = None # Register signal handler for clean termination signal.signal(signal.SIGINT, self._handle_sigint) + @abstractmethod def start(self): - """ - Start the server process with the provided command. - """ - if self._process is None: - self.logger.info(f"[OpenFL Connector] Starting server process: {' '.join(self._command)}") - self._process = subprocess.Popen(self._command) - self.logger.info(f"[OpenFL Connector] server process started with PID: {self._process.pid}") - else: - self.logger.info("[OpenFL Connector] server process is already running.") + """Start the server process with the provided command.""" + pass + @abstractmethod def stop(self): - """ - Stop the server process if it is running. 
- """ - if self._process: - try: - self.logger.info(f"[OpenFL Connector] Stopping server process with PID: {self._process.pid}...") - # find and terminate sub_process processes - main_process = psutil.Process(self._process.pid) - sub_processes = main_process.children(recursive=True) - for sub_process in sub_processes: - self.logger.info(f"[OpenFL Connector] Stopping server subprocess with PID: {sub_process.pid}...") - sub_process.terminate() - _, still_alive = psutil.wait_procs(sub_processes, timeout=1) - for p in still_alive: - p.kill() - # Terminate the main process - try: - self._process.terminate() - self._process.wait(timeout=5) - except subprocess.TimeoutExpired: - self._process.kill() - self._process = None - self.logger.info("[OpenFL Connector] Server process stopped.") - except Exception as e: - self.logger.debug(f"[OpenFL Connector] Error during graceful shutdown: {e}") - self._process.kill() - self.logger.info("[OpenFL Connector] Server process forcefully terminated.") - else: - self.logger.info("[OpenFL Connector] No server process is currently running.") + """Stop the server process if it is running.""" + pass def get_local_grpc_client(self): - """ - Get the local gRPC client. - """ + """Get the local gRPC client.""" return self.local_grpc_client - def print_Connector_info(self): - """ - Print information indicating which Connector component is being used. - """ + def print_connector_info(self): + """Print information indicating which Connector component is being used.""" self.logger.info(f"OpenFL Connector Enabled: {self.component_name}") def _handle_sigint(self, signum, frame): - """ - Handle the SIGINT signal (Ctrl+C) to cleanly stop the server process and its children. - """ + """Handle the SIGINT signal (Ctrl+C) to cleanly stop the server process and its children.""" self.logger.info("[OpenFL Connector] SIGINT received. Terminating server process...") self.stop() sys.exit(0) \ No newline at end of file diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py index 3bd7d5e9d6..25c73bd25d 100644 --- a/openfl/component/interoperability/connector_flower.py +++ b/openfl/component/interoperability/connector_flower.py @@ -2,6 +2,9 @@ from openfl.component.interoperability.connector import Connector from openfl.transport.grpc.connector.flower.local_grpc_client import LocalGRPCClient +import subprocess +import psutil + import os os.environ["FLWR_HOME"] = os.path.join(os.getcwd(), "src/.flwr") os.makedirs(os.environ["FLWR_HOME"], exist_ok=True) @@ -18,21 +21,25 @@ def __init__(self, automatic_shutdown: bool = False, **kwargs): """ - Initialize ConnectorFlower by building the server command from the superlink_params. + Initialize ConnectorFlower by building the server command. + Args: superlink_params (dict): A dictionary of Flower server settings. flwr_run_params (dict): A dictionary containing the Flower run parameters. 
""" + super().__init__(component_name="Flower") + self._process = None + self.automatic_shutdown = automatic_shutdown + self.signal_shutdown_sent = False + self.superlink_params = superlink_params - command = self._build_command() - super().__init__(command, component_name="Flower") + self.flwr_superlink_command = self._build_flwr_superlink_command() self.flwr_run_params = flwr_run_params self.flwr_run_command = self._build_flwr_run_command() self.local_grpc_client = self._get_local_grpc_client() - self.signal_shutdown_sent = False def _get_local_grpc_client(self): """ @@ -44,12 +51,11 @@ def _get_local_grpc_client(self): connector address and number of server rounds. """ connector_address = self.superlink_params.get("fleet-api-address", "0.0.0.0:9092") - return LocalGRPCClient(connector_address, self.automatic_shutdown) - def _build_command(self) -> list[str]: + def _build_flwr_superlink_command(self) -> list[str]: """ - Start the Flower SuperLink based on superlink_params. + Build the command to start the Flower SuperLink based on superlink_params. Returns: list[str]: A list representing the Flower server start command. @@ -59,9 +65,8 @@ def _build_command(self) -> list[str]: else: command = ["flower-superlink", "--fleet-api-type", "grpc-adapter"] - if "insecure" in self.superlink_params: - if self.superlink_params["insecure"]: - command += ["--insecure"] + if "insecure" in self.superlink_params and self.superlink_params["insecure"]: + command += ["--insecure"] if "serverappio-api-address" in self.superlink_params: command += ["--serverappio-api-address", str(self.superlink_params["serverappio-api-address"])] @@ -84,26 +89,27 @@ def _build_command(self) -> list[str]: def _build_flwr_serverapp_command(self) -> list[str]: """ - Start the Flower SuperLink based on superlink_params. + Build the command to start the Flower ServerApp based on superlink_params. Returns: list[str]: A list representing the Flower server start command. """ command = ["flwr-serverapp", "--run-once"] - if "insecure" in self.superlink_params: - if self.superlink_params["insecure"]: - command += ["--insecure"] + if "insecure" in self.superlink_params and self.superlink_params["insecure"]: + command += ["--insecure"] if "serverappio-api-address" in self.superlink_params: command += ["--serverappio-api-address", str(self.superlink_params["serverappio-api-address"])] - # flwr default: 0.0.0.0:9091 return command def is_flwr_serverapp_running(self): """ Check if the flwr_serverapp subprocess is still running. + + Returns: + bool: True if the ServerApp is running, False otherwise. """ if not hasattr(self, 'flwr_serverapp_subprocess'): self.logger.debug("[OpenFL Connector] ServerApp was never started.") @@ -120,9 +126,7 @@ def is_flwr_serverapp_running(self): return False def _stop_flwr_serverapp(self): - """ - Stop the `flwr_serverapp` subprocess if it is still running. - """ + """Stop the `flwr_serverapp` subprocess if it is still running.""" if hasattr(self, 'flwr_serverapp_subprocess') and self.flwr_serverapp_subprocess.poll() is None: self.logger.debug("[OpenFL Connector] ServerApp still running. Stopping...") self.flwr_serverapp_subprocess.terminate() @@ -152,10 +156,13 @@ def _build_flwr_run_command(self) -> list[str]: return command def start(self): - """ - Start the `flower-superlink` and `flwr run` subprocesses with the provided commands. 
- """ - super().start() + """Start the `flower-superlink` and `flwr run` subprocesses with the provided commands.""" + if self._process is None: + self.logger.info(f"[OpenFL Connector] Starting server process: {' '.join(self.flwr_superlink_command)}") + self._process = subprocess.Popen(self.flwr_superlink_command) + self.logger.info(f"[OpenFL Connector] Server process started with PID: {self._process.pid}") + else: + self.logger.info("[OpenFL Connector] Server process is already running.") self.logger.info(f"[OpenFL Connector] Starting `flwr run` subprocess: {' '.join(self.flwr_run_command)}") subprocess.run(self.flwr_run_command) @@ -165,8 +172,29 @@ def start(self): self.flwr_serverapp_subprocess = subprocess.Popen(self.flwr_serverapp_command) def stop(self): - """ - Stop the `flower-superlink` subprocess. - """ + """Stop the `flower-superlink` subprocess.""" self._stop_flwr_serverapp() - super().stop() \ No newline at end of file + if self._process: + try: + self.logger.info(f"[OpenFL Connector] Stopping server process with PID: {self._process.pid}...") + main_process = psutil.Process(self._process.pid) + sub_processes = main_process.children(recursive=True) + for sub_process in sub_processes: + self.logger.info(f"[OpenFL Connector] Stopping server subprocess with PID: {sub_process.pid}...") + sub_process.terminate() + _, still_alive = psutil.wait_procs(sub_processes, timeout=1) + for p in still_alive: + p.kill() + try: + self._process.terminate() + self._process.wait(timeout=5) + except subprocess.TimeoutExpired: + self._process.kill() + self._process = None + self.logger.info("[OpenFL Connector] Server process stopped.") + except Exception as e: + self.logger.debug(f"[OpenFL Connector] Error during graceful shutdown: {e}") + self._process.kill() + self.logger.info("[OpenFL Connector] Server process forcefully terminated.") + else: + self.logger.info("[OpenFL Connector] No server process is currently running.") \ No newline at end of file From 806c0f8e7e52cbc45242c363eb58112593c58588 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 21 Feb 2025 14:01:44 -0800 Subject: [PATCH 092/107] run local grpc server on separate ports Signed-off-by: kta-intel --- openfl/federated/task/runner_flower.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 63459baded..6d60a5c0b9 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -46,7 +46,9 @@ def __init__(self, **kwargs): self.partition_id = self.data_loader.get_node_configs()[1] base_port = 5000 + # Only necessary to local runs in order to avoid port conflicts self.client_port = base_port + self.partition_id + self.patch = kwargs.get('patch') self.shutdown_requested = False # Flag signal shutdown @@ -56,6 +58,9 @@ def start_client_adapter(self, local_grpc_server, **kwargs): """ local_server_port = kwargs.get('local_server_port') + # Only necessary to local runs in order to avoid port conflicts + local_server_port = local_server_port - self.partition_id + def message_callback(): self.shutdown_requested = True From 1a4bb4b81414496c9117aec9d71d4918d4736537 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 24 Feb 2025 08:16:59 -0800 Subject: [PATCH 093/107] add flower app installation and fab installation to task runner Signed-off-by: kta-intel --- .../flower-app-pytorch/plan/plan.yaml | 1 + .../flower-app-pytorch/requirements.txt | 2 - openfl/federated/task/runner_flower.py | 55 ++++++++++++++++--- 
openfl/interface/plan.py | 3 + openfl/interface/workspace.py | 1 - 5 files changed, 50 insertions(+), 12 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml index 34b8d75f45..41012b3d11 100644 --- a/openfl-workspace/flower-app-pytorch/plan/plan.yaml +++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml @@ -40,6 +40,7 @@ task_runner : template : openfl.federated.task.runner_flower.FlowerTaskRunner settings : patch : True + flwr_app_name : "app-pytorch" network : defaults : plan/defaults/network.yaml diff --git a/openfl-workspace/flower-app-pytorch/requirements.txt b/openfl-workspace/flower-app-pytorch/requirements.txt index 8141047b9d..e69de29bb2 100644 --- a/openfl-workspace/flower-app-pytorch/requirements.txt +++ b/openfl-workspace/flower-app-pytorch/requirements.txt @@ -1,2 +0,0 @@ -./src/app-pytorch -toml diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 6d60a5c0b9..49a05bf875 100644 --- a/openfl/federated/task/runner_flower.py +++ b/openfl/federated/task/runner_flower.py @@ -1,17 +1,11 @@ -import threading -import grpc -from concurrent.futures import ThreadPoolExecutor -from flwr.proto import grpcadapter_pb2_grpc -from multiprocessing import cpu_count from openfl.federated.task.runner import TaskRunner import subprocess from logging import getLogger -import signal -import psutil import time import os import numpy as np from pathlib import Path +import sys os.environ["FLWR_HOME"] = os.path.join(os.getcwd(), "src/.flwr") os.makedirs(os.environ["FLWR_HOME"], exist_ok=True) @@ -40,6 +34,16 @@ def __init__(self, **kwargs): **kwargs: Additional parameters to pass to the functions. """ super().__init__(**kwargs) + + self.patch = kwargs.get('patch') + if self.data_loader is None: + flwr_app_name = kwargs.get('flwr_app_name') + install_flower_app(flwr_app_name) + + if self.patch: + install_flower_FAB(flwr_app_name) + return + self.model = None self.logger = getLogger(__name__) self.num_partitions = self.data_loader.get_node_configs()[0] @@ -48,8 +52,6 @@ def __init__(self, **kwargs): base_port = 5000 # Only necessary to local runs in order to avoid port conflicts self.client_port = base_port + self.partition_id - - self.patch = kwargs.get('patch') self.shutdown_requested = False # Flag signal shutdown def start_client_adapter(self, local_grpc_server, **kwargs): @@ -143,3 +145,38 @@ def save_native( # Save the tensor dictionary to a .npz file np.savez(filepath, **self.tensor_dict) + + +def install_flower_app(flwr_app_name): + """Install the Flower application.""" + subprocess.check_call( + [sys.executable, "-m", "pip", "install", f"./src/{flwr_app_name}"], + shell=False, + ) + +def install_flower_FAB(flwr_app_name): + """Build and install the patch for the Flower application.""" + flwr_dir = os.environ["FLWR_HOME"] + + # Run the build command + subprocess.check_call([ + sys.executable, + "src/patch/flwr_run_patch.py", + "build", + "--app", + f"./src/{flwr_app_name}" + ]) + + # List .fab files after running the build command + fab_files = list(Path(flwr_dir).glob("*.fab")) + + # Determine the newest .fab file + newest_fab_file = max(fab_files, key=os.path.getmtime) + + # Run the install command using the newest .fab file + subprocess.check_call([ + sys.executable, + "src/patch/flwr_run_patch.py", + "install", + str(newest_fab_file) + ]) \ No newline at end of file diff --git a/openfl/interface/plan.py b/openfl/interface/plan.py index 7a6bda9a77..19ad443480 100644 --- 
a/openfl/interface/plan.py +++ b/openfl/interface/plan.py @@ -154,6 +154,9 @@ def initialize( if 'connector' in plan.config: logger.info("OpenFL Connector enabled: %s", plan.config['connector']) + # Only need to initialize task runner to install apps/packages + # that were not installable via requirements.txt + plan.get_task_runner(data_loader=None) else: init_state_path = plan.config["aggregator"]["settings"]["init_state_path"] # This is needed to bypass data being locally available diff --git a/openfl/interface/workspace.py b/openfl/interface/workspace.py index 7328f3c4fe..522ff99b5f 100644 --- a/openfl/interface/workspace.py +++ b/openfl/interface/workspace.py @@ -139,7 +139,6 @@ def create(prefix, template): requirements_filename = "requirements.txt" if os.path.isfile(f"{str(prefix)}/{requirements_filename}"): - os.chdir(prefix) check_call( [ executable, From 27866fc8cce533264d3607172bcc7679690ac404 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 24 Feb 2025 14:09:56 -0800 Subject: [PATCH 094/107] update dataloader to accept datapath Signed-off-by: kta-intel --- .../src/app-pytorch/app_pytorch/client_app.py | 5 +- .../src/app-pytorch/app_pytorch/task.py | 47 +++---------------- .../flower-app-pytorch/src/setup_data.py | 2 +- .../plan/defaults/tasks_connector.yaml | 2 +- openfl/federated/data/loader_flower.py | 22 ++------- openfl/federated/task/runner_flower.py | 35 +++++++++----- .../connector/flower/local_grpc_server.py | 9 +++- 7 files changed, 45 insertions(+), 77 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/client_app.py b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/client_app.py index ea0412c948..38f8b01047 100644 --- a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/client_app.py +++ b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/client_app.py @@ -40,9 +40,8 @@ def evaluate(self, parameters, config): def client_fn(context: Context): # Load model and data net = Net() - partition_id = context.node_config["partition-id"] - num_partitions = context.node_config["num-partitions"] - trainloader, valloader = load_data(partition_id, num_partitions) + data_path = context.node_config["data-path"] + trainloader, valloader = load_data(data_path) local_epochs = context.run_config["local-epochs"] # Return Client instance diff --git a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py index ac5244d1ac..dd4515d969 100644 --- a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py +++ b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py @@ -6,9 +6,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -# from flwr_datasets import FederatedDataset # NOTE: flwr_dataset will create ~/.flwr/source -# from flwr_datasets.partitioner import IidPartitioner -from torch.utils.data import DataLoader # Dataset +from torch.utils.data import DataLoader from torchvision.transforms import Compose, Normalize, ToTensor import os @@ -34,42 +32,9 @@ def forward(self, x): return self.fc3(x) -# fds = None # Cache FederatedDataset - -# def load_data(partition_id: int, num_partitions: int): -# """Load partition CIFAR10 data.""" -# # Only initialize `FederatedDataset` once -# global fds -# if fds is None: -# partitioner = IidPartitioner(num_partitions=num_partitions) -# fds = FederatedDataset( -# dataset="uoft-cs/cifar10", -# partitioners={"train": partitioner}, -# ) -# 
partition = fds.load_partition(partition_id)
-#     # Divide data on each node: 80% train, 20% test
-#     partition_train_test = partition.train_test_split(test_size=0.2, seed=42)
-#     pytorch_transforms = Compose(
-#         [ToTensor(), Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
-#     )
-
-#     def apply_transforms(batch):
-#         """Apply transforms to the partition from FederatedDataset."""
-#         batch["img"] = [pytorch_transforms(img) for img in batch["img"]]
-#         return batch
-
-#     partition_train_test = partition_train_test.with_transform(apply_transforms)
-#     trainloader = DataLoader(partition_train_test["train"], batch_size=32, shuffle=True)
-#     testloader = DataLoader(partition_train_test["test"], batch_size=32)
-#     import pdb; pdb.set_trace()
-#     return trainloader, testloader
-
-
-def load_partition_data(partition_id, data_dir="data"):
-    partition_dir = os.path.join(data_dir, f"{partition_id}")
-
-    train_data_path = os.path.join(partition_dir, "train")
-    test_data_path = os.path.join(partition_dir, "test")
+def load_partition_data(data_path):
+    train_data_path = os.path.join(data_path, "train")
+    test_data_path = os.path.join(data_path, "test")

     train_data = load_from_disk(train_data_path)
     test_data = load_from_disk(test_data_path)
@@ -77,9 +42,9 @@ def load_partition_data(partition_id, data_dir="data"):
     return train_data, test_data


-def load_data(partition_id: int, num_partitions: int):
+def load_data(data_path: str):
     """Load partition CIFAR10 data."""
-    train_data, test_data = load_partition_data(partition_id)
+    train_data, test_data = load_partition_data(data_path)
     pytorch_transforms = Compose(
         [ToTensor(), Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
     )
diff --git a/openfl-workspace/flower-app-pytorch/src/setup_data.py b/openfl-workspace/flower-app-pytorch/src/setup_data.py
index e8fb78d52a..f22f8aeae4 100644
--- a/openfl-workspace/flower-app-pytorch/src/setup_data.py
+++ b/openfl-workspace/flower-app-pytorch/src/setup_data.py
@@ -19,7 +19,7 @@ def main(num_partitions):

     # Function to save partition data
     def save_partition_data(partition_id, partition_train_test):
-        partition_dir = os.path.join(save_dir, f"{partition_id}")
+        partition_dir = os.path.join(save_dir, f"{partition_id+1}")
         os.makedirs(partition_dir, exist_ok=True)

         train_data_path = os.path.join(partition_dir, "train")
diff --git a/openfl-workspace/workspace/plan/defaults/tasks_connector.yaml b/openfl-workspace/workspace/plan/defaults/tasks_connector.yaml
index 3bc37b3060..999da5a8a6 100644
--- a/openfl-workspace/workspace/plan/defaults/tasks_connector.yaml
+++ b/openfl-workspace/workspace/plan/defaults/tasks_connector.yaml
@@ -1,4 +1,4 @@
 start_client_adapter:
   function : start_client_adapter
   kwargs   :
-    local_server_port : 9090 # local grpc server
+    local_server_port : 0 # local grpc server, 0 to dynamically allocate
diff --git a/openfl/federated/data/loader_flower.py b/openfl/federated/data/loader_flower.py
index 6cfb7764d7..3c91c04d6d 100644
--- a/openfl/federated/data/loader_flower.py
+++ b/openfl/federated/data/loader_flower.py
@@ -17,33 +17,21 @@ class FlowerDataLoader(DataLoader):
-        num_partitions (int): The number of partitions to divide the dataset into.
     """

-    def __init__(self, data_path, collaborator_count, **kwargs):
+    def __init__(self, data_path, **kwargs):
         """
         Initialize the FlowerDataLoader.

         Args:
-            data_path (str or int): The shard number of the dataset.
-            collaborator_count (int): The number of partitions to divide the dataset into.
+            data_path (str): The directory containing this collaborator's dataset.
             **kwargs: Additional keyword arguments to pass to the parent DataLoader class.
-
-        Raises:
-            ValueError: If collaborator_count is not provided or if data_path is not a number.
         """
-        if collaborator_count is None:
-            raise ValueError("collaborator_count must be set and cannot be None.")
-
-        try:
-            partition_id = int(data_path)
-        except ValueError:
-            raise ValueError("data_path must be a number corresponding to the shard.")
-
-        if partition_id >= collaborator_count:
-            raise ValueError("data_path is used as the partition_id and therefore cannot be greater than or equal to the collaborator count.")
-
         super().__init__(**kwargs)
-        self.partition_id = partition_id
-        self.num_partitions = collaborator_count
-
+        self.data_path = data_path
+
     def get_node_configs(self):
         """
         Get the configuration for each node.
@@ -54,7 +42,7 @@ def get_node_configs(self):
         Returns:
-            tuple: A tuple containing the number of partitions and the data shard.
+            str: The data path for this collaborator.
         """
-        return self.num_partitions, self.partition_id
+        return self.data_path

     def get_feature_shape(self):
         """
diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py
index 49a05bf875..3e5fee32b5 100644
--- a/openfl/federated/task/runner_flower.py
+++ b/openfl/federated/task/runner_flower.py
@@ -6,6 +6,7 @@
 import numpy as np
 from pathlib import Path
 import sys
+import socket

 os.environ["FLWR_HOME"] = os.path.join(os.getcwd(), "src/.flwr")
 os.makedirs(os.environ["FLWR_HOME"], exist_ok=True)
@@ -46,12 +47,13 @@ def __init__(self, **kwargs):

         self.model = None
         self.logger = getLogger(__name__)
-        self.num_partitions = self.data_loader.get_node_configs()[0]
-        self.partition_id = self.data_loader.get_node_configs()[1]

-        base_port = 5000
-        # Only necessary to local runs in order to avoid port conflicts
-        self.client_port = base_port + self.partition_id
+        self.data_path = self.data_loader.get_node_configs()
+
+        self.client_port = kwargs.get('client_port')
+        if self.client_port is None:
+            self.client_port = get_dynamic_port()
+
         self.shutdown_requested = False  # Flag to signal shutdown

     def start_client_adapter(self, local_grpc_server, **kwargs):
@@ -60,15 +62,14 @@ def start_client_adapter(self, local_grpc_server, **kwargs):
         """
         local_server_port = kwargs.get('local_server_port')

-        # Only necessary to local runs in order to avoid port conflicts
-        local_server_port = local_server_port - self.partition_id
-
         def message_callback():
             self.shutdown_requested = True

         # TODO: Can we isolate the local_grpc_server from the task runner?
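        # [Editor's note, not part of the patch] `start_server(0)` below relies on the
        # OS choosing an ephemeral port: grpc's `Server.add_insecure_port()` returns the
        # port that was actually bound, e.g.
        #     port = server.add_insecure_port("[::]:0")  # nonzero once bound
        # which is why the runner can immediately read it back via `get_port()`.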
local_grpc_server.set_end_experiment_callback(message_callback) - local_grpc_server.start_server(local_server_port) + local_grpc_server.start_server(0) + + local_server_port = local_grpc_server.get_port() if self.patch: command = [ @@ -78,7 +79,7 @@ def message_callback(): "--grpc-adapter", "--superlink", f"127.0.0.1:{local_server_port}", "--clientappio-api-address", f"127.0.0.1:{self.client_port}", - "--node-config", f"num-partitions={self.num_partitions} partition-id={self.partition_id}" + "--node-config", f"data-path='{self.data_path}'" ] else: command = [ @@ -87,7 +88,7 @@ def message_callback(): "--grpc-adapter", "--superlink", f"127.0.0.1:{local_server_port}", "--clientappio-api-address", f"127.0.0.1:{self.client_port}", - "--node-config", f"num-partitions={self.num_partitions} partition-id={self.partition_id}" + "--node-config", f"data-path='{self.data_path}'" ] supernode_process = subprocess.Popen(command, shell=False) @@ -179,4 +180,14 @@ def install_flower_FAB(flwr_app_name): "src/patch/flwr_run_patch.py", "install", str(newest_fab_file) - ]) \ No newline at end of file + ]) + +def get_dynamic_port(): + # Create a socket + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + # Bind to port 0 to let the OS assign an available port + s.bind(('', 0)) + # Get the assigned port number + port = s.getsockname()[1] + return port + \ No newline at end of file diff --git a/openfl/transport/grpc/connector/flower/local_grpc_server.py b/openfl/transport/grpc/connector/flower/local_grpc_server.py index 7000380824..85314948c1 100644 --- a/openfl/transport/grpc/connector/flower/local_grpc_server.py +++ b/openfl/transport/grpc/connector/flower/local_grpc_server.py @@ -38,6 +38,7 @@ def __init__(self, openfl_client, collaborator_name): self.processing_thread.start() self.server = None self.termination_event = threading.Event() + self.port = None def set_end_experiment_callback(self, callback): self.end_experiment_callback = callback @@ -46,9 +47,13 @@ def start_server(self, local_server_port): """Starts the gRPC server.""" self.server = grpc.server(ThreadPoolExecutor(max_workers=cpu_count())) grpcadapter_pb2_grpc.add_GrpcAdapterServicer_to_server(self, self.server) - self.server.add_insecure_port(f'[::]:{local_server_port}') + self.port = self.server.add_insecure_port(f'[::]:{local_server_port}') self.server.start() - logger.info(f"OpenFL local gRPC server started, listening on port {local_server_port}.") + logger.info(f"OpenFL local gRPC server started, listening on port {self.port}.") + + def get_port(self): + # Return the port that was assigned + return self.port def stop_server(self): """Stops the gRPC server.""" From 95bc634af82c63dd39dd534747530313807b2d39 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 24 Feb 2025 14:34:58 -0800 Subject: [PATCH 095/107] move installation back to requirements.txt Signed-off-by: kta-intel --- openfl-workspace/flower-app-pytorch/requirements.txt | 1 + openfl/federated/task/runner_flower.py | 9 --------- openfl/interface/workspace.py | 1 + 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/requirements.txt b/openfl-workspace/flower-app-pytorch/requirements.txt index e69de29bb2..016dbec06b 100644 --- a/openfl-workspace/flower-app-pytorch/requirements.txt +++ b/openfl-workspace/flower-app-pytorch/requirements.txt @@ -0,0 +1 @@ +./src/app-pytorch \ No newline at end of file diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py index 3e5fee32b5..d667ebcb26 100644 --- 
a/openfl/federated/task/runner_flower.py
+++ b/openfl/federated/task/runner_flower.py
@@ -39,8 +39,6 @@ def __init__(self, **kwargs):
         self.patch = kwargs.get('patch')
         if self.data_loader is None:
             flwr_app_name = kwargs.get('flwr_app_name')
-            install_flower_app(flwr_app_name)
-
             if self.patch:
                 install_flower_FAB(flwr_app_name)
             return
@@ -148,13 +146,6 @@ def save_native(
         # Save the tensor dictionary to a .npz file
         np.savez(filepath, **self.tensor_dict)
 
-def install_flower_app(flwr_app_name):
-    """Install the Flower application."""
-    subprocess.check_call(
-        [sys.executable, "-m", "pip", "install", f"./src/{flwr_app_name}"],
-        shell=False,
-    )
-
 def install_flower_FAB(flwr_app_name):
     """Build and install the patch for the Flower application."""
     flwr_dir = os.environ["FLWR_HOME"]
diff --git a/openfl/interface/workspace.py b/openfl/interface/workspace.py
index 522ff99b5f..7328f3c4fe 100644
--- a/openfl/interface/workspace.py
+++ b/openfl/interface/workspace.py
@@ -139,6 +139,7 @@ def create(prefix, template):
     requirements_filename = "requirements.txt"
 
     if os.path.isfile(f"{str(prefix)}/{requirements_filename}"):
+        os.chdir(prefix)
         check_call(
             [
                 executable,

From 1a34378d0e88c5e4dde131a5c94ba2323197f8be Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Tue, 25 Feb 2025 12:08:42 -0800
Subject: [PATCH 096/107] add initialize_tensorkeys_for_functions method with pass

Signed-off-by: kta-intel

---
 openfl/federated/plan/plan.py          | 8 ++------
 openfl/federated/task/runner_flower.py | 3 +++
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/openfl/federated/plan/plan.py b/openfl/federated/plan/plan.py
index 53c8de7ccd..1d68bc2c01 100644
--- a/openfl/federated/plan/plan.py
+++ b/openfl/federated/plan/plan.py
@@ -491,12 +491,8 @@ def get_task_runner(self, data_loader):
         if self.runner_ is None:
             self.runner_ = Plan.build(**defaults)
 
-        # Define task dependencies after taskrunner has been initialized
-        if 'Flower' in defaults['template']:
-            return self.runner_
-        else:
-            self.runner_.initialize_tensorkeys_for_functions()
-            return self.runner_
+        self.runner_.initialize_tensorkeys_for_functions()
+        return self.runner_
 
     # Python interactive api
     def get_core_task_runner(self, data_loader=None, model_provider=None, task_keeper=None):
diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py
index d667ebcb26..5b86876603 100644
--- a/openfl/federated/task/runner_flower.py
+++ b/openfl/federated/task/runner_flower.py
@@ -145,6 +145,9 @@ def save_native(
         # Save the tensor dictionary to a .npz file
         np.savez(filepath, **self.tensor_dict)
 
+    def initialize_tensorkeys_for_functions(self, with_opt_vars=False):
+        pass
+
 
 def install_flower_FAB(flwr_app_name):
     """Build and install the patch for the Flower application."""

From b32054b16d1b8345b55fe3515030112ea83e9c07 Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Tue, 25 Feb 2025 14:28:49 -0800
Subject: [PATCH 097/107] debug

Signed-off-by: kta-intel

---
 openfl/federated/plan/plan.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/openfl/federated/plan/plan.py b/openfl/federated/plan/plan.py
index 1d68bc2c01..2971fa5b6d 100644
--- a/openfl/federated/plan/plan.py
+++ b/openfl/federated/plan/plan.py
@@ -340,13 +340,12 @@ def get_assigner(self):
         return self.assigner_
 
     def get_connector(self):
-        """Get federated learning exchange object."""
+        """Get OpenFL Connector object."""
        defaults = self.config.get("connector")
+        self.logger.info("Connector defaults: %s", defaults)
 
         if self.connector_ is None and defaults:
             self.connector_ = Plan.build(**defaults)
-        else:
-
self.connector_ = None return self.connector_ From 092c55d03a0b6731698c46a556b4c351d8ad234f Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 28 Feb 2025 11:09:09 -0800 Subject: [PATCH 098/107] update data loader Signed-off-by: kta-intel --- .../src/app-pytorch/app_pytorch/server_app.py | 57 +++++-- .../src/app-pytorch/app_pytorch/task.py | 159 +++++++++++++++--- .../flower-app-pytorch/src/setup_data.py | 20 ++- 3 files changed, 189 insertions(+), 47 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/server_app.py b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/server_app.py index ec9361b46b..b14698b4d5 100644 --- a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/server_app.py +++ b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/server_app.py @@ -20,9 +20,12 @@ def save_model(tensor_dict, round_number, file_path): # from flwr.server.strategy import FedAvg from flwr.server.client_proxy import ClientProxy -from flwr.common import FitRes, Scalar, Parameters, parameters_to_ndarrays, Metrics +from flwr.common import FitRes, EvaluateRes, Scalar, Parameters, parameters_to_ndarrays from typing import Optional, Union, OrderedDict, List, Tuple import numpy as np +from flwr.server.strategy.aggregate import weighted_loss_avg +from flwr.common.logger import log +from logging import WARNING net = Net() @@ -30,6 +33,7 @@ class SaveModelStrategy(FedAvg): def __init__(self, **kwargs): super().__init__(**kwargs) self.largest_loss = 1e9 + self.params_dict = None def aggregate_fit( self, @@ -45,35 +49,54 @@ def aggregate_fit( ) if aggregated_parameters is not None: - print(f"Saving round {server_round} aggregated_parameters...") - # Convert `Parameters` to `list[np.ndarray]` aggregated_ndarrays: list[np.ndarray] = parameters_to_ndarrays( aggregated_parameters ) - params_dict = OrderedDict(zip(net.state_dict().keys(), aggregated_ndarrays)) + self.params_dict = OrderedDict(zip(net.state_dict().keys(), aggregated_ndarrays)) # Save the model to disk - save_model(params_dict, server_round, './save/last.pbuf') - - if aggregated_metrics["train_loss"] < self.largest_loss: - self.largest_loss = aggregated_metrics["train_loss"] - save_model(params_dict, server_round, './save/best.pbuf') + save_model(self.params_dict , server_round, './save/last.pbuf') return aggregated_parameters, aggregated_metrics + def aggregate_evaluate( + self, + server_round: int, + results: list[tuple[ClientProxy, EvaluateRes]], + failures: list[Union[tuple[ClientProxy, EvaluateRes], BaseException]], + ) -> tuple[Optional[float], dict[str, Scalar]]: + """Aggregate evaluation losses using weighted average.""" + if not results: + return None, {} + # Do not aggregate if there are failures and failures are not accepted + if not self.accept_failures and failures: + return None, {} + + # Aggregate loss + loss_aggregated = weighted_loss_avg( + [ + (evaluate_res.num_examples, evaluate_res.loss) + for _, evaluate_res in results + ] + ) + -def weighted_average(metrics: List[Tuple[int, Metrics]]) -> Metrics: - # Multiply accuracy of each client by number of examples used + # Aggregate custom metrics if aggregation fn was provided + metrics_aggregated = {} + if self.evaluate_metrics_aggregation_fn: + eval_metrics = [(res.num_examples, res.metrics) for _, res in results] + metrics_aggregated = self.evaluate_metrics_aggregation_fn(eval_metrics) + elif server_round == 1: # Only log this warning once + log(WARNING, "No evaluate_metrics_aggregation_fn provided") - 
print(metrics) - losses = [num_examples * m["train_loss"] for num_examples, m in metrics] - examples = [num_examples for num_examples, _ in metrics] + if loss_aggregated < self.largest_loss: + self.largest_loss = loss_aggregated + save_model(self.params_dict, server_round, './save/best.pbuf') - # Aggregate and return custom metric (weighted average) - return {"train_loss": sum(losses) / sum(examples)} + return loss_aggregated, metrics_aggregated ##################################################################################### @@ -89,7 +112,7 @@ def server_fn(context: Context): # Define strategy strategy = SaveModelStrategy( - fit_metrics_aggregation_fn=weighted_average, + # fit_metrics_aggregation_fn=weighted_average, fraction_fit=fraction_fit, fraction_evaluate=1.0, min_available_clients=2, diff --git a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py index dd4515d969..de21a9e14f 100644 --- a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py +++ b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py @@ -1,13 +1,125 @@ +# """app-pytorch: A Flower / PyTorch app.""" + +# from collections import OrderedDict +# from datasets import load_from_disk + +# import torch +# import torch.nn as nn +# import torch.nn.functional as F +# from torch.utils.data import DataLoader +# from torchvision.transforms import Compose, Normalize, ToTensor +# import os + + +# class Net(nn.Module): +# """Model (simple CNN adapted from 'PyTorch: A 60 Minute Blitz')""" + +# def __init__(self): +# super(Net, self).__init__() +# self.conv1 = nn.Conv2d(3, 6, 5) +# self.pool = nn.MaxPool2d(2, 2) +# self.conv2 = nn.Conv2d(6, 16, 5) +# self.fc1 = nn.Linear(16 * 5 * 5, 120) +# self.fc2 = nn.Linear(120, 84) +# self.fc3 = nn.Linear(84, 10) + +# def forward(self, x): +# x = self.pool(F.relu(self.conv1(x))) +# x = self.pool(F.relu(self.conv2(x))) +# x = x.view(-1, 16 * 5 * 5) +# x = F.relu(self.fc1(x)) +# x = F.relu(self.fc2(x)) +# return self.fc3(x) + + +# def load_partition_data(data_path): +# train_data_path = os.path.join(data_path, "train") +# test_data_path = os.path.join(data_path, "test") + +# train_data = load_from_disk(train_data_path) +# test_data = load_from_disk(test_data_path) + +# return train_data, test_data + + +# def load_data(data_path): +# """Load partition CIFAR10 data.""" +# train_data, test_data = load_partition_data(data_path) +# pytorch_transforms = Compose( +# [ToTensor(), Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] +# ) + +# def apply_transforms(batch): +# """Apply transforms to the partition from FederatedDataset.""" +# batch["img"] = [pytorch_transforms(img) for img in batch["img"]] +# return batch + +# train_data = train_data.with_transform(apply_transforms) +# test_data = test_data.with_transform(apply_transforms) + +# trainloader = DataLoader(train_data, batch_size=32, shuffle=True) +# testloader = DataLoader(test_data, batch_size=32) +# return trainloader, testloader + + +# def train(net, trainloader, epochs, device): +# """Train the model on the training set.""" +# net.to(device) # move model to GPU if available +# criterion = torch.nn.CrossEntropyLoss().to(device) +# optimizer = torch.optim.Adam(net.parameters(), lr=0.01) +# net.train() +# running_loss = 0.0 +# for _ in range(epochs): +# for batch in trainloader: +# images = batch["img"] +# labels = batch["label"] +# optimizer.zero_grad() +# loss = criterion(net(images.to(device)), labels.to(device)) +# 
loss.backward() +# optimizer.step() +# running_loss += loss.item() + +# avg_trainloss = running_loss / len(trainloader) +# return avg_trainloss + + +# def test(net, testloader, device): +# """Validate the model on the test set.""" +# net.to(device) +# criterion = torch.nn.CrossEntropyLoss() +# correct, loss = 0, 0.0 +# with torch.no_grad(): +# for batch in testloader: +# images = batch["img"].to(device) +# labels = batch["label"].to(device) +# outputs = net(images) +# loss += criterion(outputs, labels).item() +# correct += (torch.max(outputs.data, 1)[1] == labels).sum().item() +# accuracy = correct / len(testloader.dataset) +# loss = loss / len(testloader) +# return loss, accuracy + + +# def get_weights(net): +# return [val.cpu().numpy() for _, val in net.state_dict().items()] + + +# def set_weights(net, parameters): +# params_dict = zip(net.state_dict().keys(), parameters) +# state_dict = OrderedDict({k: torch.tensor(v) for k, v in params_dict}) +# net.load_state_dict(state_dict, strict=True) + """app-pytorch: A Flower / PyTorch app.""" from collections import OrderedDict -from datasets import load_from_disk import torch import torch.nn as nn import torch.nn.functional as F from torch.utils.data import DataLoader -from torchvision.transforms import Compose, Normalize, ToTensor +from torchvision import transforms +from torchvision.datasets import ImageFolder +from torch.utils.data import DataLoader import os @@ -30,35 +142,35 @@ def forward(self, x): x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) return self.fc3(x) - - + def load_partition_data(data_path): train_data_path = os.path.join(data_path, "train") test_data_path = os.path.join(data_path, "test") - train_data = load_from_disk(train_data_path) - test_data = load_from_disk(test_data_path) + # Use ImageFolder to load images from directories + train_data = ImageFolder(root=train_data_path, transform=None) + test_data = ImageFolder(root=test_data_path, transform=None) return train_data, test_data - -def load_data(data_path: int): +def load_data(data_path): """Load partition CIFAR10 data.""" train_data, test_data = load_partition_data(data_path) - pytorch_transforms = Compose( - [ToTensor(), Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] - ) - - def apply_transforms(batch): - """Apply transforms to the partition from FederatedDataset.""" - batch["img"] = [pytorch_transforms(img) for img in batch["img"]] - return batch - - train_data = train_data.with_transform(apply_transforms) - test_data = test_data.with_transform(apply_transforms) + + # Define PyTorch transforms + pytorch_transforms = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) + ]) + + # Apply transforms to the datasets + train_data.transform = pytorch_transforms + test_data.transform = pytorch_transforms + # Create DataLoaders trainloader = DataLoader(train_data, batch_size=32, shuffle=True) testloader = DataLoader(test_data, batch_size=32) + return trainloader, testloader @@ -70,9 +182,7 @@ def train(net, trainloader, epochs, device): net.train() running_loss = 0.0 for _ in range(epochs): - for batch in trainloader: - images = batch["img"] - labels = batch["label"] + for images, labels in trainloader: optimizer.zero_grad() loss = criterion(net(images.to(device)), labels.to(device)) loss.backward() @@ -89,9 +199,8 @@ def test(net, testloader, device): criterion = torch.nn.CrossEntropyLoss() correct, loss = 0, 0.0 with torch.no_grad(): - for batch in testloader: - images = batch["img"].to(device) - labels = 
batch["label"].to(device) + for images, labels in testloader: + images, labels = images.to(device), labels.to(device) outputs = net(images) loss += criterion(outputs, labels).item() correct += (torch.max(outputs.data, 1)[1] == labels).sum().item() diff --git a/openfl-workspace/flower-app-pytorch/src/setup_data.py b/openfl-workspace/flower-app-pytorch/src/setup_data.py index f22f8aeae4..f86ccf8c3d 100644 --- a/openfl-workspace/flower-app-pytorch/src/setup_data.py +++ b/openfl-workspace/flower-app-pytorch/src/setup_data.py @@ -2,6 +2,8 @@ import sys from flwr_datasets import FederatedDataset from flwr_datasets.partitioner import IidPartitioner +from PIL import Image +import numpy as np def main(num_partitions): # Directory to save the partitions @@ -22,11 +24,19 @@ def save_partition_data(partition_id, partition_train_test): partition_dir = os.path.join(save_dir, f"{partition_id+1}") os.makedirs(partition_dir, exist_ok=True) - train_data_path = os.path.join(partition_dir, "train") - test_data_path = os.path.join(partition_dir, "test") - - partition_train_test["train"].save_to_disk(train_data_path) - partition_train_test["test"].save_to_disk(test_data_path) + for split, dataset in partition_train_test.items(): + split_dir = os.path.join(partition_dir, split) + os.makedirs(split_dir, exist_ok=True) + + for idx, example in enumerate(dataset): + img_array = np.array(example['img']) + label = example['label'] + label_dir = os.path.join(split_dir, str(label)) + os.makedirs(label_dir, exist_ok=True) + + img = Image.fromarray(img_array) + img_path = os.path.join(label_dir, f"{idx}.png") + img.save(img_path) # Download, split, and save the dataset for partition_id in range(num_partitions): From 0525520b4f1077ca6c0589bdd516150ed78380d4 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 28 Feb 2025 12:55:03 -0800 Subject: [PATCH 099/107] fix defaults Signed-off-by: kta-intel --- openfl-workspace/flower-app-pytorch/plan/plan.yaml | 6 ------ openfl/component/interoperability/connector_flower.py | 11 ++++++----- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml index 41012b3d11..18c7e1dd8b 100644 --- a/openfl-workspace/flower-app-pytorch/plan/plan.yaml +++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml @@ -13,17 +13,14 @@ connector : defaults : plan/defaults/connector.yaml template : openfl.component.ConnectorFlower settings : - automatic_shutdown : True superlink_params : insecure : True serverappio-api-address : 127.0.0.1:9091 # note [kta-intel]: ServerApp will connect here fleet-api-address : 127.0.0.1:9092 # note [kta-intel]: local gRPC client will connect here exec-api-address : 127.0.0.1:9093 # note [kta-intel]: port for server-app toml (for flwr run) - patch : True flwr_run_params : flwr_app_name : "app-pytorch" federation_name : "local-poc" - patch : True collaborator : defaults : plan/defaults/collaborator.yaml @@ -38,9 +35,6 @@ data_loader : task_runner : defaults : plan/defaults/task_runner.yaml template : openfl.federated.task.runner_flower.FlowerTaskRunner - settings : - patch : True - flwr_app_name : "app-pytorch" network : defaults : plan/defaults/network.yaml diff --git a/openfl/component/interoperability/connector_flower.py b/openfl/component/interoperability/connector_flower.py index 25c73bd25d..c8a2dcb93d 100644 --- a/openfl/component/interoperability/connector_flower.py +++ b/openfl/component/interoperability/connector_flower.py @@ -17,8 +17,8 @@ class 
ConnectorFlower(Connector):
     def __init__(self,
                  superlink_params: dict,
-                 flwr_run_params: dict,
-                 automatic_shutdown: bool = False,
+                 flwr_run_params: dict = None,
+                 automatic_shutdown: bool = True,
                  **kwargs):
         """
         Initialize ConnectorFlower by building the server command.
@@ -37,7 +37,7 @@ def __init__(self,
         self.flwr_superlink_command = self._build_flwr_superlink_command()
 
         self.flwr_run_params = flwr_run_params
-        self.flwr_run_command = self._build_flwr_run_command()
+        self.flwr_run_command = self._build_flwr_run_command() if self.flwr_run_params else None
 
         self.local_grpc_client = self._get_local_grpc_client()
 
@@ -164,8 +164,9 @@ def start(self):
         else:
             self.logger.info("[OpenFL Connector] Server process is already running.")
 
-        self.logger.info(f"[OpenFL Connector] Starting `flwr run` subprocess: {' '.join(self.flwr_run_command)}")
-        subprocess.run(self.flwr_run_command)
+        if hasattr(self, 'flwr_run_command') and self.flwr_run_command:
+            self.logger.info(f"[OpenFL Connector] Starting `flwr run` subprocess: {' '.join(self.flwr_run_command)}")
+            subprocess.run(self.flwr_run_command)
 
         if hasattr(self, 'flwr_serverapp_command') and self.flwr_serverapp_command:
             self.local_grpc_client.set_is_flwr_serverapp_running_callback(self.is_flwr_serverapp_running)

From 40b58e45473c20a40f5c534de0eb4c009124163a Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Fri, 28 Feb 2025 12:55:21 -0800
Subject: [PATCH 100/107] clean up save function

Signed-off-by: kta-intel

---
 .../src/app-pytorch/app_pytorch/server_app.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/server_app.py b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/server_app.py
index b14698b4d5..04175879bf 100644
--- a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/server_app.py
+++ b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/server_app.py
@@ -6,9 +6,7 @@
 from app_pytorch.task import Net, get_weights
 
-####################################################################################
-# TODO: Consider moving this to a separate file and importing SaveModelStrategy
-
+############################# Save Model ##########################################
 from openfl.protocols import utils
 from openfl.pipelines import NoCompressionPipeline
 def save_model(tensor_dict, round_number, file_path):
@@ -97,7 +95,6 @@ def aggregate_evaluate(
             save_model(self.params_dict, server_round, './save/best.pbuf')
 
         return loss_aggregated, metrics_aggregated
-
 #####################################################################################
 

From 7f25dd02a09544e250f84a74ab080d13c953b6d4 Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Fri, 28 Feb 2025 12:55:31 -0800
Subject: [PATCH 101/107] update README.md

Signed-off-by: kta-intel

---
 openfl-workspace/flower-app-pytorch/README.md | 151 ++++++++++--------
 1 file changed, 86 insertions(+), 65 deletions(-)

diff --git a/openfl-workspace/flower-app-pytorch/README.md b/openfl-workspace/flower-app-pytorch/README.md
index b0fa089cee..2886d8922c 100644
--- a/openfl-workspace/flower-app-pytorch/README.md
+++ b/openfl-workspace/flower-app-pytorch/README.md
@@ -1,19 +1,17 @@
 # Open(FL)ower
 
-This workspace demonstrates a new functionality in OpenFL to interoperate with [Flower](https://flower.ai/). In particular, a user can now use the Flower API to run on an OpenFL infrastructure. OpenFL will act as an intermediary step between the Flower SuperLink and Flower SuperNode to relay messages across the network using OpenFL's transport mechanisms while Flower manages the experiment.
+This workspace demonstrates a new functionality in OpenFL to interoperate with [Flower](https://flower.ai/). In particular, a user can now use the Flower API to run on OpenFL infrastructure. OpenFL will act as an intermediary step between the Flower SuperLink and Flower SuperNode to relay messages across the network using OpenFL's transport mechanisms.
 
 ## Overview
 
-In this repository, you'll notice a directory under `src` called `app-pytorch`. This is effectively a Flower PyTorch app created using Flower's `flwr new` command that has been modified to run a local federation. The client and server apps dictate what will be run by the client and server respectively. `Task.py` defines the logic that will be executed by each app, such as the model definition, train/test tasks, etc.
+In this repository, you'll notice a directory under `src` called `app-pytorch`. This is essentially a Flower PyTorch app created using Flower's `flwr new` command that has been modified to run a local federation. The `client_app.py` and `server_app.py` dictate what will be run by the client and server respectively. `task.py` defines the logic that will be executed by each app, such as the model definition, train/test tasks, etc.
 
 Under `server_app.py` a section titled "Save Model" is added in order to save the `best.pbuf` and `last.pbuf` models from the experiment in your local workspace under `./save`. This uses native OpenFL logic to store the model as a `.pbuf` in order to later be retrieved by `fx model save` into a native format (limited to `.npz` to be deep learning framework agnostic), but this can be overridden to save the model directly following Flower's recommended method for [saving model checkpoints](https://flower.ai/docs/framework/how-to-save-and-load-model-checkpoints.html).
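+
+For example, after a run completes, the saved protobuf can be converted back into a native format with the OpenFL CLI. This is only a sketch; the exact flags are illustrative, so check `fx model save --help` in your OpenFL version:
+
+```bash
+fx model save -i ./save/best.pbuf -o ./save/best_model
+```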
 
 ## Execution Methods
 
 There are two ways to execute this:
-1. Run `flwr run` as a sub-process of the aggregator alongside the superlink. (default)
-2. Run `flwr run` as a [separate process](#invoke-flower-experiment-as-a-separate-command) after initializing the `SuperLink` and `SuperNode` at the aggregator and collaborators respectively.
-
-In addition, there are options to run the `SuperLink` and `SuperNode` as [long-lived components](#long-lived-superlink-and-supernode) that will indefinitely wait for new runs or, by default, as a short-lived component (similar to OpenFL's task runner) that terminates at the end of the experiment.
+1. Automatic shutdown, which will spawn a `server-app` in isolation and trigger experiment termination once it shuts down. (Default/Recommended)
+2. Running `SuperLink` and `SuperNode` as [long-lived components](#long-lived-superlink-and-supernode) that will indefinitely wait for new runs. (Limited Functionality)
 
 ## Getting Started
 
 ### Configure the Experiment
 Notice under `./plan`, you will find the familiar OpenFL YAML files to configure the experiment. `col.yaml` and `data.yaml` will be populated by the collaborators that will run the Flower client app and the respective data shard or directory they will perform their training and testing on.
-plan.yaml configures the experiment itself.
The Open-Flower integration makes a few key changes to the `plan.yaml`: +`plan.yaml` configures the experiment itself. The Open-Flower integration makes a few key changes to the `plan.yaml`: -1. Introduction of a new top-level key (`connector`) to configure a newly introduced component called `Connector`. Specifically, the Flower integration uses a `Connector` subclass called `ConnectorFlower`. This component is run by the aggregator and is responsible for initializing the Flower SuperLink and connecting to the OpenFL server. The superlink parameters can be configured using `connector.settings.superlink_params`. If nothing is supplied, it will simply run `flower-superlink --insecure` with the command's default settings as dictated by Flower. It also includes the option to run the flwr run command via `connector.settings.flwr_run_params`. Without setting these commands, the aggregator will not invoke `flwr run` and it will be up to the user to run this process separately to start a Flower experiment. +1. Introduction of a new top-level key (`connector`) to configure a newly introduced component called `Connector`. Specifically, the Flower integration uses a `Connector` subclass called `ConnectorFlower`. This component is run by the aggregator and is responsible for initializing the Flower `SuperLink` and connecting to the OpenFL server. The `SuperLink` parameters can be configured using `connector.settings.superlink_params`. If nothing is supplied, it will simply run `flower-superlink --insecure` with the command's default settings as dictated by Flower. It also includes the option to run the flwr run command via `connector.settings.flwr_run_params`. If `flwr_run_params` are not provided, the user will be expected to run `flwr run ` from the aggregator machine to initiate the experiment. ```yaml connector: @@ -141,23 +139,24 @@ fx aggregator start This will prepare the workspace and start the OpenFL aggregator, Flower superlink, and Flower serverapp. You should see something like: ```SH -INFO 🧿 Starting the Aggregator Service. aggregator.py:70 -INFO Building `openfl.component.FLEXAssigner` Module. plan.py:226 -INFO Building `openfl.pipelines.NoCompressionPipeline` Module. plan.py:226 -INFO Building `openfl.component.straggler_handling_functions.CutoffTimeBasedStragglerHandling` Module. plan.py:226 -WARNING CutoffTimeBasedStragglerHandling is disabled as straggler_cutoff_time is set to np.inf. cutoff_time_based_straggler_handling.py:46 -INFO Building `openfl.component.FLEXFlower` Module. plan.py:226 -INFO Building `openfl.component.Aggregator` Module. plan.py:226 -use_tls=True -INFO [OpenFL Connector] Starting server process: flower-superlink --fleet-api-type grpc-adapter --insecure connector.py:28 - --serverappio-api-address 127.0.0.1:9091 --fleet-api-address 127.0.0.1:9092 --exec-api-address 127.0.1:9093 -INFO [OpenFL Connector] server process started with PID: 1972825 connector.py:30 -INFO Starting Aggregator gRPC Server aggregator_server.py:389 +INFO 🧿 Starting the Aggregator Service. +. +. +. INFO : Starting Flower SuperLink WARNING : Option `--insecure` was set. Starting insecure HTTP server. INFO : Flower Deployment Engine: Starting Exec API on 127.0.0.1:9093 INFO : Flower ECE: Starting ServerAppIo API (gRPC-rere) on 127.0.0.1:9091 INFO : Flower ECE: Starting Fleet API (GrpcAdapter) on 127.0.0.1:9092 +. +. +. 
+INFO : [INIT] +INFO : Using initial global parameters provided by strategy +INFO : Starting evaluation of initial global parameters +INFO : Evaluation returned no results (`None`) +INFO : +INFO : [ROUND 1] ``` ### Start Collaborators @@ -173,20 +172,15 @@ fx collaborator start -n collaborator2 This will start the collaborator nodes, the Flower `SuperNode`, and Flower `ClientApp`, and begin running the Flower experiment. You should see something like: ```SH -INFO 🧿 Starting a Collaborator Service. collaborator.py:85 -INFO Building `openfl.federated.data.loader_flower.FlowerDataLoader` Module. plan.py:226 -INFO Building `openfl.federated.task.runner_flower.FlowerTaskRunner` Module. plan.py:226 -INFO Building `openfl.pipelines.NoCompressionPipeline` Module. plan.py:226 -INFO Building `openfl.component.Collaborator` Module. plan.py:226 -INFO Waiting for tasks... collaborator.py:222 -INFO Received the following tasks: [name: "start_client_adapter" collaborator.py:172 - ] -INFO OpenFL local gRPC server started, listening on port 9090. runner_flower.py:61 -INFO Automatic shutdown enabled. Monitoring subprocess activity... runner_flower.py:105 -INFO Press CTRL+C to stop the server and supernode process. runner_flower.py:139 + INFO 🧿 Starting a Collaborator Service. +. +. +. INFO : Starting Flower SuperNode -WARNING : Option `--insecure` was set. Starting insecure HTTP channel to 127.0.0.1:9090. -INFO : Starting Flower ClientAppIo gRPC server on 127.0.0.1:5000 +WARNING : Option `--insecure` was set. Starting insecure HTTP channel to 127.0.0.1:... +INFO : Starting Flower ClientAppIo gRPC server on 127.0.0.1:... +INFO : +INFO : [RUN 297994661073077505, ROUND 1] ``` ### Completion of the Experiment Upon the completion of the experiment, on the `aggregator` terminal, the Flower components should send an experiment summary as the `SuperLink `continues to receive requests from the supernode: @@ -197,49 +191,54 @@ INFO : History (loss, distributed): INFO : round 1: 2.0937052175497555 INFO : round 2: 1.8027011854633406 INFO : round 3: 1.6812996898487116 -INFO : GrpcAdapter.PullTaskIns -INFO : GrpcAdapter.PullTaskIns -INFO : GrpcAdapter.PullTaskIns ``` -If `autoshutdown` is enabled, this will be shortly followed by the OpenFL `aggregator` receiving "results" from the `collaborator` and subsequently shutting down: +If `automatic_shutdown` is enabled, this will be shortly followed by the OpenFL `aggregator` receiving "results" from the `collaborator` and subsequently shutting down: ```SH -INFO Collaborator collaborator1 is sending task results for start_client_adapter, round 0 aggregator.py:633 -INFO Round: 0, Collaborators that have completed all tasks: ['collaborator1'] aggregator.py:1095 -INFO : GrpcAdapter.DeleteNode -INFO Collaborator collaborator2 is sending task results for start_client_adapter, round 0 aggregator.py:633 -INFO Round: 0, Collaborators that have completed all tasks: ['collaborator1', 'collaborator2'] aggregator.py:1095 -INFO Experiment Completed. Cleaning up... aggregator.py:1053 -INFO Sending signal to collaborator collaborator2 to shutdown... aggregator.py:360 -INFO Sending signal to collaborator collaborator1 to shutdown... aggregator.py:360 -INFO [OpenFL Connector] Stopping server process with PID: 1963348... connector.py:39 -INFO [OpenFL Connector] Stopping server subprocess with PID: 1964099... connector.py:44 -INFO [OpenFL Connector] Server process stopped. +INFO Round 0: Collaborators that have completed all tasks: ['collaborator1', 'collaborator2'] +INFO Experiment Completed. 
Cleaning up... +INFO Sending signal to collaborator collaborator2 to shutdown... +INFO Sending signal to collaborator collaborator1 to shutdown... +INFO [OpenFL Connector] Stopping server process with PID: ... +INFO : SuperLink terminated gracefully. +INFO [OpenFL Connector] Server process stopped. ``` Upon the completion of the experiment, on the `collaborator` terminals, the Flower components should be outputting the information about the run: ```SH INFO : [RUN ..., ROUND 3] -INFO : Received: evaluate message 53e1ad1c-ffeb-41cc-9857-3d1b83273bd9 -INFO : Starting Flower ClientApp -INFO : Pulling ClientAppInputs for token ... -INFO : Pushing ClientAppOutputs for token ... +INFO : Received: evaluate message +INFO : Start `flwr-clientapp` process +INFO : [flwr-clientapp] Pull `ClientAppInputs` for token ... +INFO : [flwr-clientapp] Push `ClientAppOutputs` for token ... ``` -If `autoshutdown` is enabled, this will be shortly followed by the OpenFL `collaborator` shutting down: +If `automatic_shutdown` is enabled, this will be shortly followed by the OpenFL `collaborator` shutting down: ```SH -INFO : Disconnect and shut down -INFO Supernode process terminated. Shutting down gRPC server... runner_flower.py:96 -INFO gRPC server stopped. runner_flower.py:98 -INFO Waiting for tasks... collaborator.py:222 -INFO End of Federation reached. Exiting... +INFO : SuperNode terminated gracefully. +INFO SuperNode process terminated. +INFO Shutting down local gRPC server... +INFO local gRPC server stopped. +INFO Waiting for tasks... +INFO Received shutdown signal. Exiting... ``` Congratulations, you have run a Flower experiment through OpenFL's task runner! ## Advanced Usage ### Long-lived SuperLink and SuperNode -If `autoshutdown` is not enabled, Flower's `ServerApp` and `ClientApp` will shut down at the completion of the Flower experiment, but the `SuperLink` and `SuperNode` will continue to run. As a result, on the `aggregator` terminal, you will see a constant request coming from the `SuperNode`: +A user can set `automatic_shutdown: False` in the `Connector` settings of the `plan.yaml`. + +```yaml +connector : + defaults : plan/defaults/connector.yaml + template : openfl.component.ConnectorFlower + settings : + automatic_shutdown: False +``` + +By doing so, Flower's `ServerApp` and `ClientApp` will still shut down at the completion of the Flower experiment, but the `SuperLink` and `SuperNode` will continue to run. As a result, on the `aggregator` terminal, you will see a constant request coming from the `SuperNode`: + ```SH INFO : GrpcAdapter.PullTaskIns INFO : GrpcAdapter.PullTaskIns @@ -251,9 +250,31 @@ flwr run ./src/app-pytorch ``` It will run another experiment. Once you are done, you can manually shut down OpenFL's `collaborator` and Flower's `SuperNode` with `CTRL+C`. This will trigger a task-completion by the task runner that'll subsequently begin the graceful shutdown process of the OpenFL and Flower components. -### Invoke Flower experiment as a separate command -If you did not set `flwr_run_params` in the `plan.yaml`, the OpenFL `Connector` will not automatically start a Flower experiment. Instead, you should open a terminal, navigate to this workspace, and run -```SH -flwr run ./src/app-pytorch -``` -separately to begin the experiment. \ No newline at end of file +### Running in SGX Enclave +Gramine does not support all Linux system calls. Flower FAB is built and installed at runtime. 
During this, `utime()` is called, which is an [unsupported call](https://gramine.readthedocs.io/en/latest/devel/features.html#list-of-system-calls), resulting in errors or unexpected behavior. To work around this, when running in an SGX enclave, we opt to build and install the FAB during initialization and package it alongside the OpenFL workspace. To make this work, we introduce some patches to Flower's build command. In addition, since secure enclaves have strict read/write permissions, dictated by a set of trusted/allowed files, we also patch Flower's telemetry command in order to consolidate written file locations.
+
+To run these patches, simply add `patch: True` to the `Connector` and `Task Runner` settings. For the `Task Runner`, also include the name of the Flower app for building and installation.
+
+```yaml
+connector :
+  defaults : plan/defaults/connector.yaml
+  template : openfl.component.ConnectorFlower
+  settings :
+    superlink_params :
+      insecure : True
+      serverappio-api-address : 127.0.0.1:9091
+      fleet-api-address : 127.0.0.1:9092
+      exec-api-address : 127.0.0.1:9093
+      patch : True
+    flwr_run_params :
+      flwr_app_name : "app-pytorch"
+      federation_name : "local-poc"
+      patch : True
+
+task_runner :
+  defaults : plan/defaults/task_runner.yaml
+  template : openfl.federated.task.runner_flower.FlowerTaskRunner
+  settings :
+    patch : True
+    flwr_app_name : "app-pytorch"
+```
\ No newline at end of file

From 12531507768d23d4c1fd9a687017c1c85d51b4b1 Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Mon, 3 Mar 2025 07:16:03 -0800
Subject: [PATCH 102/107] pass local_server_port key from yaml to taskrunner

Signed-off-by: kta-intel

---
 openfl/federated/task/runner_flower.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openfl/federated/task/runner_flower.py b/openfl/federated/task/runner_flower.py
index 5b86876603..ae58e05630 100644
--- a/openfl/federated/task/runner_flower.py
+++ b/openfl/federated/task/runner_flower.py
@@ -65,7 +65,7 @@ def message_callback():
             self.shutdown_requested = True
 
         # TODO: Can we isolate the local_grpc_server from the task runner?
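+        # local_server_port comes from the plan's tasks_connector.yaml; the
+        # default of 0 still lets the OS assign a free port dynamically.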
local_grpc_server.set_end_experiment_callback(message_callback) - local_grpc_server.start_server(0) + local_grpc_server.start_server(local_server_port) local_server_port = local_grpc_server.get_port() From 5188b3c617e41c16e17d79a74a3fd53e607bb08c Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 3 Mar 2025 13:36:29 -0800 Subject: [PATCH 103/107] remove atomic connection by default Signed-off-by: kta-intel --- .../workspace/plan/defaults/network.yaml | 1 + openfl/transport/grpc/aggregator_client.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/openfl-workspace/workspace/plan/defaults/network.yaml b/openfl-workspace/workspace/plan/defaults/network.yaml index 11e03c1890..654667240e 100644 --- a/openfl-workspace/workspace/plan/defaults/network.yaml +++ b/openfl-workspace/workspace/plan/defaults/network.yaml @@ -7,3 +7,4 @@ settings: client_reconnect_interval : 5 require_client_auth : True cert_folder : cert + enable_atomic_connections : False \ No newline at end of file diff --git a/openfl/transport/grpc/aggregator_client.py b/openfl/transport/grpc/aggregator_client.py index 3105b163cd..269b52e8b4 100644 --- a/openfl/transport/grpc/aggregator_client.py +++ b/openfl/transport/grpc/aggregator_client.py @@ -135,9 +135,9 @@ def intercept_stream_unary(self, continuation, client_call_details, request_iter def _atomic_connection(func): - # TODO: Need to investigate how to handle atomic connection when - # two requests are send in very quick succession def wrapper(self, *args, **kwargs): + if not self.enable_atomic_connections: + return func(self, *args, **kwargs) self.reconnect() response = func(self, *args, **kwargs) self.disconnect() @@ -148,6 +148,8 @@ def wrapper(self, *args, **kwargs): def _resend_data_on_reconnection(func): def wrapper(self, *args, **kwargs): + if not self.resend_data_on_reconnection: + return func(self, *args, **kwargs) while True: try: response = func(self, *args, **kwargs) @@ -201,6 +203,8 @@ def __init__( federation_uuid=None, single_col_cert_common_name=None, refetch_server_cert_callback=None, + enable_atomic_connections=True, + resend_data_on_reconnection=True, **kwargs, ): """ @@ -231,9 +235,13 @@ def __init__( self.certificate = certificate self.private_key = private_key self.sleeping_policy = ConstantBackoff( - int(kwargs.get("client_reconnect_interval", 1)), getLogger(__name__), self.uri + int(kwargs.get("client_reconnect_interval", 1)), + getLogger(__name__), + self.uri, ) self.logger = getLogger(__name__) + self.enable_atomic_connections = enable_atomic_connections + self.resend_data_on_reconnection = resend_data_on_reconnection if not self.use_tls: self.logger.warning("gRPC is running on insecure channel with TLS disabled.") From d99d62f9fdede98deba985ac74c715066de7718e Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 6 Mar 2025 13:56:38 -0800 Subject: [PATCH 104/107] updates Signed-off-by: kta-intel --- .../src/app-pytorch/app_pytorch/task.py | 111 ------------------ openfl/federated/__init__.py | 2 - openfl/federated/data/__init__.py | 1 - openfl/federated/task/__init__.py | 1 - 4 files changed, 115 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py index de21a9e14f..c17e9fa05a 100644 --- a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py +++ b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/task.py @@ -1,114 +1,3 @@ -# """app-pytorch: A Flower / PyTorch app.""" - 
-# from collections import OrderedDict -# from datasets import load_from_disk - -# import torch -# import torch.nn as nn -# import torch.nn.functional as F -# from torch.utils.data import DataLoader -# from torchvision.transforms import Compose, Normalize, ToTensor -# import os - - -# class Net(nn.Module): -# """Model (simple CNN adapted from 'PyTorch: A 60 Minute Blitz')""" - -# def __init__(self): -# super(Net, self).__init__() -# self.conv1 = nn.Conv2d(3, 6, 5) -# self.pool = nn.MaxPool2d(2, 2) -# self.conv2 = nn.Conv2d(6, 16, 5) -# self.fc1 = nn.Linear(16 * 5 * 5, 120) -# self.fc2 = nn.Linear(120, 84) -# self.fc3 = nn.Linear(84, 10) - -# def forward(self, x): -# x = self.pool(F.relu(self.conv1(x))) -# x = self.pool(F.relu(self.conv2(x))) -# x = x.view(-1, 16 * 5 * 5) -# x = F.relu(self.fc1(x)) -# x = F.relu(self.fc2(x)) -# return self.fc3(x) - - -# def load_partition_data(data_path): -# train_data_path = os.path.join(data_path, "train") -# test_data_path = os.path.join(data_path, "test") - -# train_data = load_from_disk(train_data_path) -# test_data = load_from_disk(test_data_path) - -# return train_data, test_data - - -# def load_data(data_path): -# """Load partition CIFAR10 data.""" -# train_data, test_data = load_partition_data(data_path) -# pytorch_transforms = Compose( -# [ToTensor(), Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] -# ) - -# def apply_transforms(batch): -# """Apply transforms to the partition from FederatedDataset.""" -# batch["img"] = [pytorch_transforms(img) for img in batch["img"]] -# return batch - -# train_data = train_data.with_transform(apply_transforms) -# test_data = test_data.with_transform(apply_transforms) - -# trainloader = DataLoader(train_data, batch_size=32, shuffle=True) -# testloader = DataLoader(test_data, batch_size=32) -# return trainloader, testloader - - -# def train(net, trainloader, epochs, device): -# """Train the model on the training set.""" -# net.to(device) # move model to GPU if available -# criterion = torch.nn.CrossEntropyLoss().to(device) -# optimizer = torch.optim.Adam(net.parameters(), lr=0.01) -# net.train() -# running_loss = 0.0 -# for _ in range(epochs): -# for batch in trainloader: -# images = batch["img"] -# labels = batch["label"] -# optimizer.zero_grad() -# loss = criterion(net(images.to(device)), labels.to(device)) -# loss.backward() -# optimizer.step() -# running_loss += loss.item() - -# avg_trainloss = running_loss / len(trainloader) -# return avg_trainloss - - -# def test(net, testloader, device): -# """Validate the model on the test set.""" -# net.to(device) -# criterion = torch.nn.CrossEntropyLoss() -# correct, loss = 0, 0.0 -# with torch.no_grad(): -# for batch in testloader: -# images = batch["img"].to(device) -# labels = batch["label"].to(device) -# outputs = net(images) -# loss += criterion(outputs, labels).item() -# correct += (torch.max(outputs.data, 1)[1] == labels).sum().item() -# accuracy = correct / len(testloader.dataset) -# loss = loss / len(testloader) -# return loss, accuracy - - -# def get_weights(net): -# return [val.cpu().numpy() for _, val in net.state_dict().items()] - - -# def set_weights(net, parameters): -# params_dict = zip(net.state_dict().keys(), parameters) -# state_dict = OrderedDict({k: torch.tensor(v) for k, v in params_dict}) -# net.load_state_dict(state_dict, strict=True) - """app-pytorch: A Flower / PyTorch app.""" from collections import OrderedDict diff --git a/openfl/federated/__init__.py b/openfl/federated/__init__.py index 562b5d13f9..3afb2f3e4e 100644 --- 
a/openfl/federated/__init__.py +++ b/openfl/federated/__init__.py @@ -22,9 +22,7 @@ from openfl.federated.data import XGBoostDataLoader from openfl.federated.task import XGBoostTaskRunner if util.find_spec("flwr") is not None: - from openfl.federated.data import FederatedDataSet # NOQA from openfl.federated.data import FlowerDataLoader - from openfl.federated.task import FederatedModel # NOQA from openfl.federated.task import FlowerTaskRunner diff --git a/openfl/federated/data/__init__.py b/openfl/federated/data/__init__.py index 59bac75f78..29667f7b23 100644 --- a/openfl/federated/data/__init__.py +++ b/openfl/federated/data/__init__.py @@ -18,5 +18,4 @@ from openfl.federated.data.loader_xgb import XGBoostDataLoader # NOQA if util.find_spec("flwr") is not None: - from openfl.federated.data.federated_data import FederatedDataSet # NOQA from openfl.federated.data.loader_flower import FlowerDataLoader # NOQA diff --git a/openfl/federated/task/__init__.py b/openfl/federated/task/__init__.py index 83c20f2efd..1763b3c54d 100644 --- a/openfl/federated/task/__init__.py +++ b/openfl/federated/task/__init__.py @@ -15,5 +15,4 @@ if util.find_spec("xgboost") is not None: from openfl.federated.task.runner_xgb import XGBoostTaskRunner # NOQA if util.find_spec("flwr") is not None: - from openfl.federated.task.fl_model import FederatedModel # NOQA from openfl.federated.task.runner_flower import FlowerTaskRunner # NOQA From 1f3de0a0d843359a9b75c94483f160f9de78bc82 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 6 Mar 2025 16:22:21 -0800 Subject: [PATCH 105/107] more changes Signed-off-by: kta-intel --- .../src/app-pytorch/app_pytorch/server_app.py | 28 +++---------------- 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/server_app.py b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/server_app.py index 04175879bf..4610346e0b 100644 --- a/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/server_app.py +++ b/openfl-workspace/flower-app-pytorch/src/app-pytorch/app_pytorch/server_app.py @@ -5,18 +5,6 @@ from flwr.server.strategy import FedAvg from app_pytorch.task import Net, get_weights - -############################# Save Model ########################################## -from openfl.protocols import utils -from openfl.pipelines import NoCompressionPipeline -def save_model(tensor_dict, round_number, file_path): - model = utils.construct_model_proto( - tensor_dict, round_number, NoCompressionPipeline() - ) - utils.dump_proto(model, file_path) - - -# from flwr.server.strategy import FedAvg from flwr.server.client_proxy import ClientProxy from flwr.common import FitRes, EvaluateRes, Scalar, Parameters, parameters_to_ndarrays from typing import Optional, Union, OrderedDict, List, Tuple @@ -25,13 +13,11 @@ def save_model(tensor_dict, round_number, file_path): from flwr.common.logger import log from logging import WARNING -net = Net() - class SaveModelStrategy(FedAvg): def __init__(self, **kwargs): super().__init__(**kwargs) self.largest_loss = 1e9 - self.params_dict = None + self.aggregated_ndarrays = None def aggregate_fit( self, @@ -48,15 +34,11 @@ def aggregate_fit( if aggregated_parameters is not None: # Convert `Parameters` to `list[np.ndarray]` - aggregated_ndarrays: list[np.ndarray] = parameters_to_ndarrays( + self.aggregated_ndarrays: list[np.ndarray] = parameters_to_ndarrays( aggregated_parameters ) - - self.params_dict = OrderedDict(zip(net.state_dict().keys(), aggregated_ndarrays)) - - # Save 
the model to disk - save_model(self.params_dict , server_round, './save/last.pbuf') + np.savez(f"last.npz", *self.aggregated_ndarrays) return aggregated_parameters, aggregated_metrics @@ -92,11 +74,9 @@ def aggregate_evaluate( if loss_aggregated < self.largest_loss: self.largest_loss = loss_aggregated - save_model(self.params_dict, server_round, './save/best.pbuf') + np.savez(f"best.npz", *self.aggregated_ndarrays) return loss_aggregated, metrics_aggregated -##################################################################################### - def server_fn(context: Context): # Read from config From 4cba60a405136543c606a6df2ddfbb8cb7120444 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 7 Mar 2025 08:03:43 -0800 Subject: [PATCH 106/107] fix headers Signed-off-by: kta-intel --- openfl/federated/plan/plan.py | 2 +- openfl/transport/grpc/aggregator_client.py | 9 +++++++-- openfl/transport/grpc/aggregator_server.py | 9 ++++++++- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/openfl/federated/plan/plan.py b/openfl/federated/plan/plan.py index 4f1daf272f..58f9e28b7d 100644 --- a/openfl/federated/plan/plan.py +++ b/openfl/federated/plan/plan.py @@ -334,7 +334,7 @@ def get_assigner(self): def get_connector(self): """Get OpenFL Connector object.""" defaults = self.config.get("connector") - self.logger.info("Connector defaults: %s", defaults) + logger.info("Connector defaults: %s", defaults) if self.connector_ is None and defaults: self.connector_ = Plan.build(**defaults) diff --git a/openfl/transport/grpc/aggregator_client.py b/openfl/transport/grpc/aggregator_client.py index b035fa9e25..c4f3fdc925 100644 --- a/openfl/transport/grpc/aggregator_client.py +++ b/openfl/transport/grpc/aggregator_client.py @@ -417,8 +417,13 @@ def send_message_to_server(self, openfl_message, collaborator_name): Returns: The response from the OpenFL server """ - self._set_header(collaborator_name) - openfl_message.header.CopyFrom(self.header) + header = create_header( + sender=collaborator_name, + receiver=self.aggregator_uuid, + federation_uuid=self.federation_uuid, + single_col_cert_common_name=self.single_col_cert_common_name, + ) + openfl_message.header.CopyFrom(header) openfl_response = self.stub.PelicanDrop(openfl_message) self.validate_response(openfl_response, collaborator_name) return openfl_response diff --git a/openfl/transport/grpc/aggregator_server.py b/openfl/transport/grpc/aggregator_server.py index f531a27c28..f71af9322c 100644 --- a/openfl/transport/grpc/aggregator_server.py +++ b/openfl/transport/grpc/aggregator_server.py @@ -303,8 +303,15 @@ def PelicanDrop(self, request, context): self.check_request(request) collaborator_name = request.header.sender + header = create_header( + sender=self.aggregator.uuid, + receiver=collaborator_name, + federation_uuid=self.aggregator.federation_uuid, + single_col_cert_common_name=self.aggregator.single_col_cert_common_name, + ) + # Forward the incoming OpenFL message to the local gRPC client - return self.local_grpc_client.send_receive(request, header=self.get_header(collaborator_name)) + return self.local_grpc_client.send_receive(request, header=header) def serve(self): """Starts the aggregator gRPC server.""" From fb266609096a58075cd7400feed1358ec8ef5ca6 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 7 Mar 2025 10:27:47 -0800 Subject: [PATCH 107/107] add .workspace Signed-off-by: kta-intel --- openfl-workspace/flower-app-pytorch/.workspace | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 
openfl-workspace/flower-app-pytorch/.workspace diff --git a/openfl-workspace/flower-app-pytorch/.workspace b/openfl-workspace/flower-app-pytorch/.workspace new file mode 100644 index 0000000000..3c2c5d08b4 --- /dev/null +++ b/openfl-workspace/flower-app-pytorch/.workspace @@ -0,0 +1,2 @@ +current_plan_name: default +