Skip to content

Commit 3a54e12

Browse files
committed
fix NC scalability
1 parent 3ff8466 commit 3a54e12

9 files changed

Lines changed: 8598 additions & 4861 deletions

File tree

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ cover/
5959
*.pot
6060

6161
# Django stuff:
62-
*.log
62+
#*.log
6363
*.csv
6464
local_settings.py
6565
db.sqlite3

benchmark/benchmark_NC_Distributed-PyG.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,8 @@
1010

1111
# Distributed PyG imports
1212
from torch_geometric.loader import NeighborLoader
13-
from torch_geometric.nn.models import GCN as PyGGCN
1413
from torch.distributed import init_process_group, destroy_process_group
1514
from torch.nn.parallel import DistributedDataParallel as DDP
16-
import torch.multiprocessing as mp
1715
import os
1816

1917
DATASETS = ['cora', 'citeseer', 'pubmed']

benchmark/figure/NC_comm_costs/NC10.log

Lines changed: 1999 additions & 1144 deletions
Large diffs are not rendered by default.

benchmark/figure/NC_comm_costs/NC15.log

Lines changed: 2319 additions & 0 deletions
Large diffs are not rendered by default.

benchmark/figure/NC_comm_costs/NC20.log

Lines changed: 2344 additions & 0 deletions
Large diffs are not rendered by default.

benchmark/figure/NC_comm_costs/NC5.log

Lines changed: 1905 additions & 3587 deletions
Large diffs are not rendered by default.

benchmark/figure/NC_comm_costs/client_scalability_analysis.py

Lines changed: 25 additions & 122 deletions
Original file line numberDiff line numberDiff line change
@@ -13,29 +13,25 @@
1313
sns.set_context("talk")
1414

1515

16-
def extract_nc_scalability_data(logfile):
17-
"""Extract training and communication time data from NC log files"""
16+
def extract_nc_scalability_data(logfile, expected_trainers=None):
1817
with open(logfile, "r", encoding="utf-8", errors="replace") as f:
1918
log_content = f.read()
2019

2120
results = []
2221

23-
# Find CSV FORMAT RESULT sections
2422
csv_sections = re.findall(
25-
r"CSV FORMAT RESULT:.*?DS,IID,BS,Time\[s\],FinalAcc\[%\],CompTime\[s\],CommCost\[MB\],PeakMem\[MB\],AvgRoundTime\[s\],ModelSize\[MB\],TotalParams\n(.*?)\n",
23+
r"CSV FORMAT RESULT:.*?DS,IID,BS,TotalTime\[s\],PureTrainingTime\[s\],CommTime\[s\],FinalAcc\[%\],CommCost\[MB\],PeakMem\[MB\],AvgRoundTime\[s\],ModelSize\[MB\],TotalParams\n(.*?)\n",
2624
log_content,
2725
re.DOTALL,
2826
)
2927

30-
# Extract number of trainers from experiment configuration
3128
trainer_matches = re.findall(r"Trainers: (\d+)", log_content)
3229

3330
for csv_idx, csv_line in enumerate(csv_sections):
3431
parts = csv_line.strip().split(",")
35-
if len(parts) >= 11:
32+
if len(parts) >= 12:
3633
try:
37-
# Get number of trainers for this experiment
38-
num_trainers = (
34+
num_trainers = expected_trainers if expected_trainers else (
3935
int(trainer_matches[csv_idx])
4036
if csv_idx < len(trainer_matches)
4137
else 10
@@ -44,17 +40,16 @@ def extract_nc_scalability_data(logfile):
4440
result = {
4541
"Dataset": parts[0],
4642
"IID_Beta": float(parts[1]),
47-
"Batch_Size": int(parts[2]),
43+
"Batch_Size": int(parts[2]) if parts[2] != '-1' else -1,
4844
"Total_Time": float(parts[3]),
49-
"Final_Accuracy": float(parts[4]),
50-
"Training_Time": float(parts[5]), # CompTime[s]
51-
"Communication_Cost": float(
52-
parts[6]
53-
), # CommCost[MB] - will convert to time
54-
"Peak_Memory": float(parts[7]),
55-
"Avg_Round_Time": float(parts[8]),
56-
"Model_Size": float(parts[9]),
57-
"Total_Params": int(float(parts[10])),
45+
"Training_Time": float(parts[4]),
46+
"Communication_Time": float(parts[5]),
47+
"Final_Accuracy": float(parts[6]),
48+
"Communication_Cost": float(parts[7]),
49+
"Peak_Memory": float(parts[8]),
50+
"Avg_Round_Time": float(parts[9]),
51+
"Model_Size": float(parts[10]),
52+
"Total_Params": int(float(parts[11])),
5853
"Num_Trainers": num_trainers,
5954
}
6055
results.append(result)
@@ -64,90 +59,54 @@ def extract_nc_scalability_data(logfile):
6459
return pd.DataFrame(results)
6560

6661

67-
def estimate_communication_time(comm_cost_mb, num_trainers):
68-
"""Estimate communication time based on communication cost and network assumptions"""
69-
# Assume network bandwidth: 100 Mbps = 12.5 MB/s
70-
# This is a reasonable assumption for federated learning scenarios
71-
network_bandwidth_mbps = 100 / 8 # Convert to MB/s
72-
73-
# Communication time = Total communication cost / bandwidth
74-
comm_time = comm_cost_mb / network_bandwidth_mbps
75-
76-
return comm_time
77-
78-
7962
def load_all_nc_logs():
80-
"""Load data from all NC log files"""
81-
log_files = ["NC.log", "NC5.log", "NC10.log", "NC20.log", "NC40.log"]
82-
trainer_counts = [10, 5, 10, 20, 40] # Default mapping
63+
log_files = ["NC5.log", "NC10.log", "NC15.log", "NC20.log"]
64+
trainer_counts = [5, 10, 15, 20]
8365

8466
all_data = []
8567

86-
for log_file, default_trainers in zip(log_files, trainer_counts):
68+
for log_file, expected_trainers in zip(log_files, trainer_counts):
8769
if os.path.exists(log_file):
88-
df = extract_nc_scalability_data(log_file)
70+
df = extract_nc_scalability_data(log_file, expected_trainers)
8971
if not df.empty:
90-
# If trainer count not detected, use default
91-
if "Num_Trainers" not in df.columns or df["Num_Trainers"].isna().all():
92-
df["Num_Trainers"] = default_trainers
72+
df["Num_Trainers"] = expected_trainers
9373
all_data.append(df)
94-
print(
95-
f"Loaded {len(df)} records from {log_file} (Trainers: {default_trainers})"
96-
)
9774

9875
if all_data:
9976
combined_df = pd.concat(all_data, ignore_index=True)
10077
return combined_df
10178
else:
102-
print("No NC log files found")
10379
return pd.DataFrame()
10480

10581

10682
def create_scalability_plot(df):
107-
"""Create scalability plot showing training time and communication time vs number of clients"""
108-
10983
if df.empty:
110-
print("No data available for plotting")
11184
return
11285

113-
# Filter for IID_Beta = 10.0 (as specified in your benchmark)
11486
df_filtered = df[df["IID_Beta"] == 10.0].copy()
11587

11688
if df_filtered.empty:
117-
print("No data found for IID_Beta = 10.0")
11889
return
11990

120-
# Add estimated communication time
121-
df_filtered["Communication_Time"] = df_filtered.apply(
122-
lambda row: estimate_communication_time(
123-
row["Communication_Cost"], row["Num_Trainers"]
124-
),
125-
axis=1,
126-
)
127-
128-
# Group by number of trainers and calculate average times
12991
scalability_data = (
13092
df_filtered.groupby("Num_Trainers")
13193
.agg(
13294
{
13395
"Training_Time": "mean",
13496
"Communication_Time": "mean",
13597
"Total_Time": "mean",
98+
"Final_Accuracy": "mean",
99+
"Communication_Cost": "mean",
100+
"Peak_Memory": "mean",
136101
}
137102
)
138103
.reset_index()
139104
)
140105

141-
# Sort by number of trainers
142106
scalability_data = scalability_data.sort_values("Num_Trainers")
143107

144-
print("Scalability Data Summary:")
145-
print(scalability_data)
146-
147-
# Create the plot
148108
plt.figure(figsize=(12, 8))
149109

150-
# Plot training time
151110
plt.plot(
152111
scalability_data["Num_Trainers"],
153112
scalability_data["Training_Time"],
@@ -158,7 +117,6 @@ def create_scalability_plot(df):
158117
label="Training Time",
159118
)
160119

161-
# Plot communication time
162120
plt.plot(
163121
scalability_data["Num_Trainers"],
164122
scalability_data["Communication_Time"],
@@ -169,7 +127,6 @@ def create_scalability_plot(df):
169127
label="Communication Time",
170128
)
171129

172-
# Add value labels on points
173130
for _, row in scalability_data.iterrows():
174131
plt.annotate(
175132
f'{row["Training_Time"]:.1f}s',
@@ -191,19 +148,16 @@ def create_scalability_plot(df):
191148
color="#ff7f0e",
192149
)
193150

194-
# Customize plot
195151
plt.xlabel("Number of Clients", fontsize=16)
196152
plt.ylabel("Time (seconds)", fontsize=16)
197153
plt.title("Federated Learning Scalability Analysis", fontsize=18, fontweight="bold")
198154
plt.legend(fontsize=14, loc="upper left")
199155
plt.grid(True, alpha=0.3)
200156

201-
# Set x-axis to show all client numbers
202157
client_numbers = sorted(scalability_data["Num_Trainers"].unique())
203158
plt.xticks(client_numbers, fontsize=14)
204159
plt.yticks(fontsize=14)
205160

206-
# Add some padding to y-axis
207161
y_max = max(
208162
scalability_data["Training_Time"].max(),
209163
scalability_data["Communication_Time"].max(),
@@ -214,66 +168,15 @@ def create_scalability_plot(df):
214168
plt.savefig("federated_learning_scalability.pdf", dpi=300, bbox_inches="tight")
215169
plt.close()
216170

217-
print("Generated: federated_learning_scalability.pdf")
218-
219-
# Create additional analysis table
220-
scalability_data["Training_Growth"] = (
221-
scalability_data["Training_Time"] / scalability_data["Training_Time"].iloc[0]
222-
)
223-
scalability_data["Communication_Growth"] = (
224-
scalability_data["Communication_Time"]
225-
/ scalability_data["Communication_Time"].iloc[0]
226-
)
227-
228-
print(f"\n{'='*60}")
229-
print("SCALABILITY ANALYSIS SUMMARY")
230-
print("=" * 60)
231-
print(
232-
f"{'Clients':<8} {'Train Time':<12} {'Comm Time':<12} {'Train Growth':<13} {'Comm Growth':<12}"
233-
)
234-
print("-" * 60)
235-
236-
for _, row in scalability_data.iterrows():
237-
print(
238-
f"{row['Num_Trainers']:<8.0f} "
239-
f"{row['Training_Time']:<12.1f} "
240-
f"{row['Communication_Time']:<12.1f} "
241-
f"{row['Training_Growth']:<13.2f}x "
242-
f"{row['Communication_Growth']:<12.2f}x"
243-
)
244-
245-
# Save detailed results
246171
scalability_data.to_csv("scalability_analysis.csv", index=False)
247-
print(f"\nDetailed results saved to: scalability_analysis.csv")
248172

249173

250174
def main():
251-
"""Main function to analyze federated learning scalability"""
252-
print("Loading federated learning scalability data...")
253-
254-
# Load all NC log data
255175
df = load_all_nc_logs()
256-
257-
if df.empty:
258-
print("No data found. Please check if NC log files exist:")
259-
print("- NC.log, NC5.log, NC10.log, NC20.log, NC40.log")
260-
return
261-
262-
print(f"\nLoaded data summary:")
263-
print(f"Total records: {len(df)}")
264-
print(f"Client counts: {sorted(df['Num_Trainers'].unique())}")
265-
print(f"Datasets: {list(df['Dataset'].unique())}")
266-
print(f"IID Betas: {sorted(df['IID_Beta'].unique())}")
267-
268-
# Create scalability analysis
269-
print("\nGenerating scalability analysis...")
270-
create_scalability_plot(df)
271-
272-
print(f"\nScalability analysis completed!")
273-
print("Generated files:")
274-
print("- federated_learning_scalability.pdf")
275-
print("- scalability_analysis.csv")
176+
177+
if not df.empty:
178+
create_scalability_plot(df)
276179

277180

278181
if __name__ == "__main__":
279-
main()
182+
main()
Binary file not shown.
Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
Num_Trainers,Training_Time,Communication_Time,Total_Time,Training_Growth,Communication_Growth
2-
5,71.67500000000001,20.494,106.875,1.0,1.0
3-
10,49.2,40.989999999999995,84.725,0.6864318102546215,2.0000975895384014
4-
20,48.9,81.64,87.9,0.6822462504359957,3.983604957548551
5-
40,77.30000000000001,251.90800000000002,113.5,1.0784792465992326,12.291792719820435
1+
Num_Trainers,Training_Time,Communication_Time,Total_Time,Final_Accuracy,Communication_Cost,Peak_Memory
2+
5,33.2,2.625,112.4,0.5725,256.175,750.325
3+
10,12.7,4.475,87.175,0.47250000000000003,512.375,752.075
4+
15,6.575,16.150000000000002,93.45,0.5425,768.55,735.95
5+
20,5.824999999999999,8.075,82.175,0.5425,973.4749999999999,713.675

0 commit comments

Comments
 (0)