|
| 1 | +import re |
| 2 | + |
| 3 | +import matplotlib.pyplot as plt |
| 4 | +import numpy as np |
| 5 | +import pandas as pd |
| 6 | +import seaborn as sns |
| 7 | + |
| 8 | + |
def _search_float(pattern, text):
    """Return group 1 of the first match of *pattern* in *text* as a float, or None."""
    match = re.search(pattern, text)
    return float(match.group(1)) if match else None


def _comm_ratio(actual, theoretical):
    """Return ``actual / theoretical``, ``inf`` for a zero baseline, or None.

    None is returned when *actual* is unknown, or when both values are zero
    (the ratio is then undefined rather than infinite).
    """
    if actual is None:
        return None
    if theoretical > 0:
        return actual / theoretical
    return float("inf") if actual > 0 else None


def extract_comm_costs(logfile):
    """Parse per-experiment communication costs out of a benchmark log.

    The log is split on the ``Running experiment i/n:`` banner (preceded by a
    line of 80 dashes).  For each experiment the algorithm, dataset, trainer
    count, optional average test accuracy, and the theoretical / actual
    communication costs (in MB) are extracted.  Experiments lacking the
    algorithm/dataset/trainers header or either theoretical cost are skipped.

    Args:
        logfile: Path of the log file to read.

    Returns:
        A ``pandas.DataFrame`` with one row per parsed experiment.  The frame
        always carries the same columns; values that could not be determined
        (missing actual costs, undefined ratios, missing accuracy) are NaN so
        that every metric column stays numeric and can be aggregated.
    """
    # A plain decimal number.  Unlike ``[\d.]+`` this cannot swallow a trailing
    # sentence period ("0.75.") and therefore never yields an unparsable token.
    number = r"(\d+(?:\.\d+)?)"
    columns = [
        "Algorithm",
        "Dataset",
        "Trainers",
        "Theoretical_Pretrain_MB",
        "Theoretical_Train_MB",
        "Actual_Pretrain_MB",
        "Actual_Train_MB",
        "Accuracy",
        "Theoretical_Total_MB",
        "Actual_Total_MB",
        "Pretrain_Ratio",
        "Train_Ratio",
        "Total_Ratio",
    ]
    missing = float("nan")

    def or_missing(value):
        # NaN (not None) keeps all-missing columns at float dtype.
        return missing if value is None else value

    # Stray undecodable bytes in a log should not abort the whole report.
    with open(logfile, "r", encoding="utf-8", errors="replace") as f:
        log_content = f.read()

    experiments = re.split(r"-{80}\nRunning experiment \d+/\d+:", log_content)
    results = []

    # experiments[0] is whatever precedes the first banner; it is not a run.
    for exp in experiments[1:]:
        algo_match = re.search(r"Algorithm: (\w+)", exp)
        dataset_match = re.search(r"Dataset: ([A-Z0-9-]+)", exp)
        trainers_match = re.search(r"Trainers: (\d+)", exp)
        if not (algo_match and dataset_match and trainers_match):
            continue

        theoretical_pretrain_all = re.findall(
            rf"//Log Theoretical Pretrain Comm Cost: {number} MB //end", exp
        )
        theoretical_train_all = re.findall(
            rf"//Log Theoretical Train Comm Cost: {number} MB //end", exp
        )
        if not (theoretical_pretrain_all and theoretical_train_all):
            continue

        # The theoretical cost may be logged repeatedly; the last entry wins.
        theoretical_pretrain = float(theoretical_pretrain_all[-1])
        theoretical_train = float(theoretical_train_all[-1])
        theoretical_total = theoretical_pretrain + theoretical_train

        actual_pretrain = _search_float(
            rf"//Log Total Actual Pretrain Comm Cost: {number} MB //end", exp
        )
        actual_train = _search_float(
            rf"//Log Total Actual Train Comm Cost: {number} MB //end", exp
        )
        accuracy = _search_float(rf"Average test accuracy: {number}", exp)

        # A total only makes sense when both phases were measured.
        if actual_pretrain is not None and actual_train is not None:
            actual_total = actual_pretrain + actual_train
        else:
            actual_total = None

        # A zero (or missing) measured pretrain cost yields no pretrain ratio,
        # even when the theoretical cost is positive; the train and total
        # ratios report 0.0 in the analogous situation.
        pretrain_ratio = (
            _comm_ratio(actual_pretrain, theoretical_pretrain)
            if actual_pretrain
            else None
        )
        train_ratio = _comm_ratio(actual_train, theoretical_train)
        total_ratio = _comm_ratio(actual_total, theoretical_total)

        results.append(
            {
                "Algorithm": algo_match.group(1),
                "Dataset": dataset_match.group(1),
                "Trainers": int(trainers_match.group(1)),
                "Theoretical_Pretrain_MB": theoretical_pretrain,
                "Theoretical_Train_MB": theoretical_train,
                "Actual_Pretrain_MB": or_missing(actual_pretrain),
                "Actual_Train_MB": or_missing(actual_train),
                "Accuracy": or_missing(accuracy),
                "Theoretical_Total_MB": theoretical_total,
                "Actual_Total_MB": or_missing(actual_total),
                "Pretrain_Ratio": or_missing(pretrain_ratio),
                "Train_Ratio": or_missing(train_ratio),
                "Total_Ratio": or_missing(total_ratio),
            }
        )

    return pd.DataFrame(results, columns=columns)
| 115 | + |
| 116 | + |
def generate_dataset_comparisons(df, output_prefix="comm_cost"):
    """Write per-dataset algorithm comparison tables and bar charts.

    The metric columns are averaged per (Dataset, Algorithm) pair and saved to
    ``{output_prefix}_dataset_algorithm_comparison.csv``.  For every dataset in
    *df* a formatted table is written to
    ``{output_prefix}_{dataset}_comparison.csv`` and a bar chart of theoretical
    vs. actual training cost to ``{output_prefix}_{dataset}_train_comparison.png``.

    Args:
        df: Frame with ``Dataset`` and ``Algorithm`` columns plus the metric
            columns listed below.  Metric columns that are absent or hold
            non-numeric values are treated as missing (NaN) instead of
            aborting the report.
        output_prefix: Prefix (may include a directory) for all output files.

    Returns:
        A list of ``(dataset, table)`` tuples, one per dataset, where *table*
        is the formatted per-algorithm DataFrame that was written to disk.
    """
    metric_columns = [
        "Theoretical_Pretrain_MB",
        "Theoretical_Train_MB",
        "Theoretical_Total_MB",
        "Actual_Pretrain_MB",
        "Actual_Train_MB",
        "Actual_Total_MB",
        "Train_Ratio",
        "Accuracy",
    ]
    table_columns = [
        "Algorithm",
        "Theoretical Train (MB)",
        "Actual Train (MB)",
        "Train Overhead (MB)",
        "Accuracy",
    ]

    def fmt(value, spec):
        # Missing values are shown as "N/A" rather than "nan".
        return format(value, spec) if pd.notna(value) else "N/A"

    # Build a numeric working copy: a column of only None values has object
    # dtype and cannot be averaged, and a column may be absent altogether when
    # no experiment in the log reported actual costs.
    metrics = df[["Dataset", "Algorithm"]].copy()
    for column in metric_columns:
        if column in df.columns:
            metrics[column] = pd.to_numeric(df[column], errors="coerce")
        else:
            metrics[column] = float("nan")

    comparison_data = (
        metrics.groupby(["Dataset", "Algorithm"])
        .agg({column: "mean" for column in metric_columns})
        .reset_index()
    )

    comparison_data.to_csv(
        f"{output_prefix}_dataset_algorithm_comparison.csv", index=False
    )

    report_tables = []

    for dataset in df["Dataset"].unique():
        dataset_data = comparison_data[comparison_data["Dataset"] == dataset]

        table_rows = [
            {
                "Algorithm": row["Algorithm"],
                "Theoretical Train (MB)": fmt(row["Theoretical_Train_MB"], ".2f"),
                "Actual Train (MB)": fmt(row["Actual_Train_MB"], ".2f"),
                "Train Overhead (MB)": fmt(
                    row["Actual_Train_MB"] - row["Theoretical_Train_MB"], ".2f"
                ),
                "Accuracy": fmt(row["Accuracy"], ".4f"),
            }
            for row in dataset_data.to_dict("records")
        ]

        dataset_table = pd.DataFrame(table_rows, columns=table_columns)
        dataset_table.to_csv(f"{output_prefix}_{dataset}_comparison.csv", index=False)
        report_tables.append((dataset, dataset_table))

        # Bar chart of theoretical vs. actual training communication cost.
        fig = plt.figure(figsize=(12, 8))
        try:
            plot_data = pd.melt(
                dataset_data,
                id_vars=["Algorithm"],
                value_vars=["Theoretical_Train_MB", "Actual_Train_MB"],
                var_name="Type",
                value_name="Communication Cost (MB)",
            )
            sns.barplot(
                x="Algorithm", y="Communication Cost (MB)", hue="Type", data=plot_data
            )
            plt.title(f"{dataset} - Theoretical vs Actual Training Communication Costs")
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.savefig(f"{output_prefix}_{dataset}_train_comparison.png", dpi=300)
        finally:
            # Release the figure even if plotting or saving fails, so a long
            # run over many datasets does not accumulate open figures.
            plt.close(fig)

    return report_tables
| 185 | + |
| 186 | + |
def generate_report(logfile, output_prefix="comm_cost"):
    """Parse *logfile* and write the full set of communication-cost reports.

    Files written (all prefixed with *output_prefix*): ``_raw.csv`` with the
    parsed per-experiment rows, the per-dataset outputs produced by
    ``generate_dataset_comparisons``, ``_consolidated_report.csv`` with all
    per-dataset tables stacked together, and ``_algorithm_summary.csv`` with
    per-algorithm means and the average training overhead.

    Args:
        logfile: Path of the benchmark log to parse.
        output_prefix: Prefix (may include a directory) for all output files.

    Returns:
        The consolidated report DataFrame (per-dataset tables with an added
        ``Dataset`` column), or None when the log contains no usable data.
    """
    df = extract_comm_costs(logfile)
    if df.empty:
        print("No communication cost data found in log file.")
        return None

    df.to_csv(f"{output_prefix}_raw.csv", index=False)

    report_tables = generate_dataset_comparisons(df, output_prefix)

    # Tag each table with its dataset on a copy and concatenate once; growing
    # a frame by repeated concat re-copies all earlier rows on every step.
    tagged_tables = [
        dataset_table.assign(Dataset=dataset)
        for dataset, dataset_table in report_tables
    ]
    consolidated_report = pd.concat(tagged_tables) if tagged_tables else pd.DataFrame()

    consolidated_report.to_csv(f"{output_prefix}_consolidated_report.csv", index=False)

    summary_columns = ["Theoretical_Train_MB", "Actual_Train_MB", "Accuracy"]
    # Coerce to numeric first: a column holding only None has object dtype,
    # which cannot be averaged.
    summary_source = df[["Algorithm"]].copy()
    for column in summary_columns:
        summary_source[column] = pd.to_numeric(df[column], errors="coerce")

    algorithm_summary = (
        summary_source.groupby("Algorithm")
        .agg({column: "mean" for column in summary_columns})
        .reset_index()
    )

    algorithm_summary["Average Overhead (MB)"] = (
        algorithm_summary["Actual_Train_MB"] - algorithm_summary["Theoretical_Train_MB"]
    )

    algorithm_summary.to_csv(f"{output_prefix}_algorithm_summary.csv", index=False)

    return consolidated_report
| 224 | + |
| 225 | + |
if __name__ == "__main__":
    import sys

    # Optional positional arguments: [logfile [output_prefix]].
    cli_args = sys.argv[1:]
    logfile = cli_args[0] if cli_args else "GC.log"
    output_prefix = cli_args[1] if len(cli_args) > 1 else "comm_cost"

    consolidated_report = generate_report(logfile, output_prefix)

    # generate_report returns None when the log held no usable data.
    if consolidated_report is not None:
        display_columns = [
            "Algorithm",
            "Theoretical Train (MB)",
            "Actual Train (MB)",
            "Accuracy",
        ]

        print("\nComparison by Dataset and Algorithm:")
        for dataset in consolidated_report["Dataset"].unique():
            print(f"\n=== Dataset: {dataset} ===")
            in_dataset = consolidated_report["Dataset"] == dataset
            print(consolidated_report.loc[in_dataset, display_columns])
0 commit comments