From 6c4cfaccf594b8e43059fa84321aa673469beab3 Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Thu, 10 Feb 2022 14:11:14 -0800
Subject: [PATCH 1/6] add perf plots

---
 src/common/perf.py                            | 319 ++++++++++++------
 .../generate_data/conda_env.yaml              |   1 +
 .../lightgbm_data2bin/conda_env.yml           |   1 +
 .../partition_data/conda_env.yml              |   1 +
 .../inferencing/custom_win_cli/conda_env.yaml |   1 +
 .../treelite_python/conda_env.yaml            |   1 +
 .../treelite_compile/conda_env.yaml           |   1 +
 src/scripts/sample/conda_env.yaml             |   1 +
 8 files changed, 224 insertions(+), 102 deletions(-)

diff --git a/src/common/perf.py b/src/common/perf.py
index ea02dd06..56066ae5 100644
--- a/src/common/perf.py
+++ b/src/common/perf.py
@@ -9,6 +9,7 @@
 import threading
 import time
 import psutil
+import numpy as np
 
 
 class PerformanceReportingThread(threading.Thread):
@@ -187,6 +188,9 @@ def append_perf_metrics(self, perf_metrics):
 
 class PerfReportPlotter():
     """Once collected all perf reports from all nodes"""
+    PERF_DISK_NET_PLOT_BINS = 50
+    PERF_DISK_NET_PLOT_FIGSIZE = (16,12)
+
     def __init__(self, metrics_logger):
         self.all_reports = {}
         self.metrics_logger = metrics_logger
@@ -197,107 +201,218 @@ def add_perf_reports(self, perf_reports, node):
         self.all_reports[node] = perf_reports
 
     def report_nodes_perf(self):
+        """Report performance via metrics/plots for all nodes (loop)"""
         # Currently reporting one metric per node
         for node in self.all_reports:
-            # CPU UTILIZATION
-            cpu_avg_utilization = [ report["cpu_pct_per_cpu_avg"] for report in self.all_reports[node] ]
-
-            self.metrics_logger.log_metric(
-                "max_t_(cpu_pct_per_cpu_avg)",
-                max(cpu_avg_utilization),
-                step=node
-            )
-            self.metrics_logger.log_metric(
-                "cpu_avg_utilization_pct",
-                sum(cpu_avg_utilization)/len(cpu_avg_utilization),
-                step=node
-            )
-            self.metrics_logger.log_metric(
-                "cpu_avg_utilization_at100_pct",
-                sum( [ utilization >= 100.0 for utilization in cpu_avg_utilization])/len(cpu_avg_utilization)*100.0,
-                step=node
-            )
-            self.metrics_logger.log_metric(
-                "cpu_avg_utilization_over80_pct",
-                sum( [ utilization >= 80.0 for utilization in cpu_avg_utilization])/len(cpu_avg_utilization)*100.0,
-                step=node
-            )
-            self.metrics_logger.log_metric(
-                "cpu_avg_utilization_over40_pct",
-                sum( [ utilization >= 40.0 for utilization in cpu_avg_utilization])/len(cpu_avg_utilization)*100.0,
-                step=node
-            )
-            self.metrics_logger.log_metric(
-                "cpu_avg_utilization_over20_pct",
-                sum( [ utilization >= 20.0 for utilization in cpu_avg_utilization])/len(cpu_avg_utilization)*100.0,
-                step=node
-            )
-            self.metrics_logger.log_metric(
-                "max_t_(cpu_pct_per_cpu_min)",
-                max([ report["cpu_pct_per_cpu_min"] for report in self.all_reports[node] ]),
-                step=node
-            )
-            self.metrics_logger.log_metric(
-                "max_t_(cpu_pct_per_cpu_max)",
-                max([ report["cpu_pct_per_cpu_max"] for report in self.all_reports[node] ]),
-                step=node
-            )
-
-            # "CPU HOURS"
-            job_internal_cpu_hours = (time.time() - self.all_reports[node][0]["timestamp"]) * psutil.cpu_count() / 60 / 60
-            self.metrics_logger.log_metric(
-                "node_cpu_hours",
-                job_internal_cpu_hours,
-                step=node
-            )
-            self.metrics_logger.log_metric(
-                "node_unused_cpu_hours",
-                job_internal_cpu_hours * (100.0 - sum(cpu_avg_utilization)/len(cpu_avg_utilization)) / 100.0,
-                step=node
-            )
-
-            # MEM
-            self.metrics_logger.log_metric(
-                "max_t_(mem_percent)",
-                max([ report["mem_percent"] for report in self.all_reports[node] ]),
-                step=node
-            )
-
-            # DISK
-            self.metrics_logger.log_metric(
-                "max_t_disk_usage_percent",
-                max([ report["disk_usage_percent"] for report in self.all_reports[node] ]),
-                step=node
-            )
-            self.metrics_logger.log_metric(
-                "total_disk_io_read_mb",
-                max([ report["disk_io_read_mb"] for report in self.all_reports[node] ]),
-                step=node
-            )
-            self.metrics_logger.log_metric(
-                "total_disk_io_write_mb",
-                max([ report["disk_io_write_mb"] for report in self.all_reports[node] ]),
-                step=node
-            )
-
-            # NET I/O
-            self.metrics_logger.log_metric(
-                "total_net_io_lo_sent_mb",
-                max([ report["net_io_lo_sent_mb"] for report in self.all_reports[node] ]),
-                step=node
-            )
-            self.metrics_logger.log_metric(
-                "total_net_io_ext_sent_mb",
-                max([ report["net_io_ext_sent_mb"] for report in self.all_reports[node] ]),
-                step=node
-            )
-            self.metrics_logger.log_metric(
-                "total_net_io_lo_recv_mb",
-                max([ report["net_io_lo_recv_mb"] for report in self.all_reports[node] ]),
-                step=node
-            )
-            self.metrics_logger.log_metric(
-                "total_net_io_ext_recv_mb",
-                max([ report["net_io_ext_recv_mb"] for report in self.all_reports[node] ]),
-                step=node
-            )
+            self.report_node_perf(node)
+
+    def diff_at_partitions(self, a, p, t):
+        """Computes the difference (end minus begin) within each partition of an array.
+
+        Args:
+            a (np.array): np array of increasing values
+            p (int): a given number of partitions
+            t (np.array): np array of timestamps aligned with a
+
+        Returns:
+            partitioned_diff (np.array): per-partition differences, dimension p
+            partitioned_time (np.array): timestamp at the start of each partition, dimension p
+        """
+        if len(a) > (2*p):
+            partitioned_diff = []
+            partitioned_time = []
+            index_partitions = np.array_split(np.arange(len(a)), p)
+            print(f"i={index_partitions}")
+            for part in index_partitions:
+                partitioned_diff.append(a[part[-1]] - a[part[0]]) # diff between last and first value of partition
+                partitioned_time.append(t[part[0]])
+            return np.array(partitioned_diff), np.array(partitioned_time)
+        else:
+            return np.ediff1d(a), t[1::]
+
+    def report_node_perf(self, node):
+        """Report performance via metrics/plots for one given node"""
+        # CPU UTILIZATION
+        cpu_avg_utilization = [ report["cpu_pct_per_cpu_avg"] for report in self.all_reports[node] ]
+        self.metrics_logger.log_metric(
+            "max_t_(cpu_pct_per_cpu_avg)",
+            max(cpu_avg_utilization),
+            step=node
+        )
+        self.metrics_logger.log_metric(
+            "cpu_avg_utilization_pct",
+            sum(cpu_avg_utilization)/len(cpu_avg_utilization),
+            step=node
+        )
+        self.metrics_logger.log_metric(
+            "cpu_avg_utilization_at100_pct",
+            sum( [ utilization >= 100.0 for utilization in cpu_avg_utilization])/len(cpu_avg_utilization)*100.0,
+            step=node
+        )
+        self.metrics_logger.log_metric(
+            "cpu_avg_utilization_over80_pct",
+            sum( [ utilization >= 80.0 for utilization in cpu_avg_utilization])/len(cpu_avg_utilization)*100.0,
+            step=node
+        )
+        self.metrics_logger.log_metric(
+            "cpu_avg_utilization_over40_pct",
+            sum( [ utilization >= 40.0 for utilization in cpu_avg_utilization])/len(cpu_avg_utilization)*100.0,
+            step=node
+        )
+        self.metrics_logger.log_metric(
+            "cpu_avg_utilization_over20_pct",
+            sum( [ utilization >= 20.0 for utilization in cpu_avg_utilization])/len(cpu_avg_utilization)*100.0,
+            step=node
+        )
+        self.metrics_logger.log_metric(
+            "max_t_(cpu_pct_per_cpu_min)",
+            max([ report["cpu_pct_per_cpu_min"] for report in self.all_reports[node] ]),
+            step=node
+        )
+        self.metrics_logger.log_metric(
+            "max_t_(cpu_pct_per_cpu_max)",
+            max([ report["cpu_pct_per_cpu_max"] for report in self.all_reports[node] ]),
+            step=node
+        )
+
+        # "CPU HOURS"
+        job_internal_cpu_hours = (time.time() - self.all_reports[node][0]["timestamp"]) * psutil.cpu_count() / 60 / 60
+        self.metrics_logger.log_metric(
+            "node_cpu_hours",
+            job_internal_cpu_hours,
+            step=node
+        )
+        self.metrics_logger.log_metric(
+            "node_unused_cpu_hours",
+            job_internal_cpu_hours * (100.0 - sum(cpu_avg_utilization)/len(cpu_avg_utilization)) / 100.0,
+            step=node
+        )
+
+        # MEM
+        mem_utilization = [ report["mem_percent"] for report in self.all_reports[node] ]
+        self.metrics_logger.log_metric(
+            "max_t_(mem_percent)",
+            max(mem_utilization),
+            step=node
+        )
+
+        # DISK
+        self.metrics_logger.log_metric(
+            "max_t_disk_usage_percent",
+            max([ report["disk_usage_percent"] for report in self.all_reports[node] ]),
+            step=node
+        )
+
+        disk_io_read = np.array([ report["disk_io_read_mb"] for report in self.all_reports[node] ])
+        self.metrics_logger.log_metric(
+            "total_disk_io_read_mb",
+            np.max(disk_io_read),
+            step=node
+        )
+        disk_io_write = np.array([ report["disk_io_write_mb"] for report in self.all_reports[node] ])
+        self.metrics_logger.log_metric(
+            "total_disk_io_write_mb",
+            np.max(disk_io_write),
+            step=node
+        )
+
+        # NET I/O
+        self.metrics_logger.log_metric(
+            "total_net_io_lo_sent_mb",
+            max([ report["net_io_lo_sent_mb"] for report in self.all_reports[node] ]),
+            step=node
+        )
+        net_io_ext_sent = np.array([ report["net_io_ext_sent_mb"] for report in self.all_reports[node] ])
+        self.metrics_logger.log_metric(
+            "total_net_io_ext_sent_mb",
+            np.max(net_io_ext_sent),
+            step=node
+        )
+        self.metrics_logger.log_metric(
+            "total_net_io_lo_recv_mb",
+            max([ report["net_io_lo_recv_mb"] for report in self.all_reports[node] ]),
+            step=node
+        )
+        net_io_ext_recv = np.array([ report["net_io_ext_recv_mb"] for report in self.all_reports[node] ])
+        self.metrics_logger.log_metric(
+            "total_net_io_ext_recv_mb",
+            np.max(net_io_ext_recv),
+            step=node
+        )
+
+        # then plot everything
+        timestamps = np.array([ report["timestamp"] for report in self.all_reports[node] ])
+        self.plot_all_perf(node, timestamps, disk_io_read, disk_io_write, net_io_ext_sent, net_io_ext_recv, cpu_avg_utilization, mem_utilization)
+
+    def plot_all_perf(self, node, timestamps, disk_io_read, disk_io_write, net_io_ext_sent, net_io_ext_recv, cpu_utilization, mem_utilization):
+        """Plots mem/cpu/disk/net using matplotlib.
+
+        Args:
+            node (int): node id
+            timestamps (np.array): timestamps for all following arrays
+            disk_io_read (np.array): disk read cumulative sum (increasing)
+            disk_io_write (np.array): disk write cumulative sum (increasing)
+            net_io_ext_sent (np.array): net sent cumulative sum (increasing)
+            net_io_ext_recv (np.array): net recv cumulative sum (increasing)
+            cpu_utilization (np.array): cpu utilization
+            mem_utilization (np.array): mem utilization
+
+        Returns:
+            None
+
+        NOTE: we're expecting all arrays to have the same length.
+        """
+        timestamps = timestamps - timestamps[0]
+        disk_io_read = disk_io_read - disk_io_read[0] # cumsum starting at 0
+        disk_io_write = disk_io_write - disk_io_write[0] # cumsum starting at 0
+        net_io_ext_sent = net_io_ext_sent - net_io_ext_sent[0] # cumsum starting at 0
+        net_io_ext_recv = net_io_ext_recv - net_io_ext_recv[0] # cumsum starting at 0
+
+        # import matplotlib just-in-time
+        import matplotlib.pyplot as plt
+        #plt.switch_backend('agg')
+
+        fig, ax = plt.subplots(nrows=1, ncols=1, sharex=False, figsize=(16,8))
+
+        #ax = axes[0]
+        ax.set_xlabel('job time')
+        ax.set_ylabel('mb')
+
+        reduced_disk_io_read, reduced_timestamps = self.diff_at_partitions(disk_io_read, PerfReportPlotter.PERF_DISK_NET_PLOT_BINS, timestamps)
+        reduced_disk_io_write, _ = self.diff_at_partitions(disk_io_write, PerfReportPlotter.PERF_DISK_NET_PLOT_BINS, timestamps)
+        reduced_net_io_ext_sent, _ = self.diff_at_partitions(net_io_ext_sent, PerfReportPlotter.PERF_DISK_NET_PLOT_BINS, timestamps)
+        reduced_net_io_ext_recv, _ = self.diff_at_partitions(net_io_ext_recv, PerfReportPlotter.PERF_DISK_NET_PLOT_BINS, timestamps)
+
+        # identify a good normalization value for disk+net plots
+        max_disk_net_normalization_value = max(
+            max(reduced_disk_io_read),
+            max(reduced_disk_io_write),
+            max(reduced_net_io_ext_sent),
+            max(reduced_net_io_ext_recv),
+        ) * 1.05 # 5% more to keep some visual margin on the plot
+
+        width = max(reduced_timestamps) / len(reduced_timestamps) / 5
+        ax.bar(reduced_timestamps, reduced_disk_io_read, width=width, label="disk read", color="springgreen")
+        ax.bar(reduced_timestamps + width, reduced_disk_io_write, width=width, label="disk write", color="darkgreen")
+        ax.bar(reduced_timestamps + 2*width, reduced_net_io_ext_sent, width=width, label="net sent", color="hotpink")
+        ax.bar(reduced_timestamps + 3*width, reduced_net_io_ext_recv, width=width, label="net recv", color="purple")
+        ax.set_ylim([0.0, max_disk_net_normalization_value])
+        ax.legend(loc='upper left')
+
+        ax2 = ax.twinx()
+        #ax2 = axes[1]
+        ax2.set_ylabel('%')
+        ax2.plot(timestamps, cpu_utilization, label="cpu", color="r")
+        ax2.plot(timestamps, mem_utilization, label="mem", color="b")
+        ax2.set_ylim([0.0, 100.0])
+        ax2.legend(loc='upper right')
+        #ax.set_title(f"Disk+Net I/O normalized plot (max = {max_disk_net_normalization_value:.2f}mb")
+
+        #plt.xlabel("job time")
+        #plt.ylim(0.0, max_disk_net_normalization_value)
+        plt.legend(loc='best')
+
+        plt.show()
+
+        # record in mlflow
+        self.metrics_logger.log_figure(fig, f"disk_and_net_plot_node{node}.png")
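Reviewer note on diff_at_partitions: the perf counters above (disk_io_read_mb, net_io_ext_sent_mb, ...) are cumulative, so plotting them directly only shows a rising line; the method buckets the samples into PERF_DISK_NET_PLOT_BINS partitions and keeps the (last - first) delta per bucket, i.e. an approximate per-bin throughput. A minimal standalone sketch of that bucketing logic, with illustrative data (not part of the patch):

    import numpy as np

    def diff_at_partitions(a, p, t):
        # same logic as PerfReportPlotter.diff_at_partitions (minus the debug print):
        # split the sample indices into p buckets, keep (last - first) per bucket
        if len(a) > (2 * p):
            index_partitions = np.array_split(np.arange(len(a)), p)
            diffs = np.array([a[part[-1]] - a[part[0]] for part in index_partitions])
            times = np.array([t[part[0]] for part in index_partitions])
            return diffs, times
        # too few samples to bucket: fall back to first-order differences
        return np.ediff1d(a), t[1:]

    # illustrative data: a cumulative disk-read counter (MB) sampled once per second
    mb_read = np.array([0., 2., 4., 4., 4., 10., 18., 18., 20., 21.])
    ts = np.arange(10.)
    diffs, times = diff_at_partitions(mb_read, 3, ts)
    print(diffs)  # [ 4. 14.  3.] -> MB read within each of the 3 bins
    print(times)  # [0. 4. 7.]    -> timestamp at the start of each bin

With fewer than 2*p samples, the method degrades gracefully to plain first-order differences via np.ediff1d.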
diff --git a/src/scripts/data_processing/generate_data/conda_env.yaml b/src/scripts/data_processing/generate_data/conda_env.yaml
index 223c34f7..9480a920 100644
--- a/src/scripts/data_processing/generate_data/conda_env.yaml
+++ b/src/scripts/data_processing/generate_data/conda_env.yaml
@@ -10,3 +10,4 @@ dependencies:
   - azureml-defaults==1.35.0
   - azureml-mlflow==1.35.0
   - psutil==5.8.0
+  - matplotlib==3.4.3
diff --git a/src/scripts/data_processing/lightgbm_data2bin/conda_env.yml b/src/scripts/data_processing/lightgbm_data2bin/conda_env.yml
index 2c403f37..3245a774 100644
--- a/src/scripts/data_processing/lightgbm_data2bin/conda_env.yml
+++ b/src/scripts/data_processing/lightgbm_data2bin/conda_env.yml
@@ -11,3 +11,4 @@ dependencies:
   - azureml-mlflow==1.35.0
   - psutil==5.8.0
   - lightgbm==3.2.1
+  - matplotlib==3.4.3
diff --git a/src/scripts/data_processing/partition_data/conda_env.yml b/src/scripts/data_processing/partition_data/conda_env.yml
index 39dabefc..f15a18f7 100644
--- a/src/scripts/data_processing/partition_data/conda_env.yml
+++ b/src/scripts/data_processing/partition_data/conda_env.yml
@@ -10,4 +10,5 @@ dependencies:
   - azureml-defaults==1.35.0
   - azureml-mlflow==1.35.0
   - psutil==5.8.0
+  - matplotlib==3.4.3
diff --git a/src/scripts/inferencing/custom_win_cli/conda_env.yaml b/src/scripts/inferencing/custom_win_cli/conda_env.yaml
index 78eed94f..a15650c6 100644
--- a/src/scripts/inferencing/custom_win_cli/conda_env.yaml
+++ b/src/scripts/inferencing/custom_win_cli/conda_env.yaml
@@ -8,3 +8,4 @@ dependencies:
   - azureml-defaults==1.35.0
   - azureml-mlflow==1.35.0
   - psutil==5.8.0
+  - matplotlib==3.4.3
diff --git a/src/scripts/inferencing/treelite_python/conda_env.yaml b/src/scripts/inferencing/treelite_python/conda_env.yaml
index b31a7368..0d08ea4c 100644
--- a/src/scripts/inferencing/treelite_python/conda_env.yaml
+++ b/src/scripts/inferencing/treelite_python/conda_env.yaml
@@ -12,3 +12,4 @@ dependencies:
   - treelite_runtime==2.1.0
   - pandas>=1.1,<1.2
   - numpy>=1.10,<1.20
+  - matplotlib==3.4.3
diff --git a/src/scripts/model_transformation/treelite_compile/conda_env.yaml b/src/scripts/model_transformation/treelite_compile/conda_env.yaml
index b31a7368..0d08ea4c 100644
--- a/src/scripts/model_transformation/treelite_compile/conda_env.yaml
+++ b/src/scripts/model_transformation/treelite_compile/conda_env.yaml
@@ -12,3 +12,4 @@ dependencies:
   - treelite_runtime==2.1.0
   - pandas>=1.1,<1.2
   - numpy>=1.10,<1.20
+  - matplotlib==3.4.3
diff --git a/src/scripts/sample/conda_env.yaml b/src/scripts/sample/conda_env.yaml
index 0201788d..2b11a210 100644
--- a/src/scripts/sample/conda_env.yaml
+++ b/src/scripts/sample/conda_env.yaml
@@ -8,3 +8,4 @@ dependencies:
   - azureml-defaults==1.35.0
   - azureml-mlflow==1.35.0
   - psutil==5.8.0
+  - matplotlib==3.4.3
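Reviewer note on the plot layout in plot_all_perf: a single axes carries the MB-scaled bars, and ax.twinx() overlays a second y-axis (0-100%) for cpu/mem on the same x-axis. A minimal sketch of the dual-axis pattern with synthetic data (all names and values illustrative, not part of the patch):

    import numpy as np
    import matplotlib
    matplotlib.use('agg')  # headless backend, safe without a display
    import matplotlib.pyplot as plt

    t = np.linspace(0.0, 60.0, 50)            # job time in seconds
    io_mb = np.abs(np.sin(t / 7.0)) * 5.0     # synthetic per-bin I/O in MB
    cpu_pct = 50.0 + 30.0 * np.sin(t / 10.0)  # synthetic cpu utilization in %

    fig, ax = plt.subplots(figsize=(16, 8))
    ax.bar(t, io_mb, width=0.5, color='silver', label='io (MB)')
    ax.set_ylabel('MB')
    ax.legend(loc='upper left')

    ax2 = ax.twinx()  # second y-axis on the right, same x-axis as ax
    ax2.plot(t, cpu_pct, color='r', label='cpu')
    ax2.set_ylim([0.0, 100.0])
    ax2.set_ylabel('%')
    ax2.legend(loc='upper right')

    fig.savefig('dual_axis_sketch.png')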
From d4f44377e0b341f7c0ea75a3a70d5b9af91eb793 Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Thu, 10 Feb 2022 14:12:23 -0800
Subject: [PATCH 2/6] use global var figsize

---
 src/common/perf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/common/perf.py b/src/common/perf.py
index 56066ae5..5075d61b 100644
--- a/src/common/perf.py
+++ b/src/common/perf.py
@@ -189,7 +189,7 @@ def append_perf_metrics(self, perf_metrics):
 
 class PerfReportPlotter():
     """Once collected all perf reports from all nodes"""
     PERF_DISK_NET_PLOT_BINS = 50
-    PERF_DISK_NET_PLOT_FIGSIZE = (16,12)
+    PERF_DISK_NET_PLOT_FIGSIZE = (16,8)
 
     def __init__(self, metrics_logger):
@@ -372,7 +372,7 @@ def plot_all_perf(self, node, timestamps, disk_io_read, disk_io_write, net_io_ex
         import matplotlib.pyplot as plt
         #plt.switch_backend('agg')
 
-        fig, ax = plt.subplots(nrows=1, ncols=1, sharex=False, figsize=(16,8))
+        fig, ax = plt.subplots(nrows=1, ncols=1, sharex=False, figsize=PerfReportPlotter.PERF_DISK_NET_PLOT_FIGSIZE)
 
         #ax = axes[0]
         ax.set_xlabel('job time')
From 6050f379aba5a8ab17c187957de08d1d403ad604 Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Fri, 11 Feb 2022 13:18:14 -0800
Subject: [PATCH 3/6] polish plot for accessibility (hopefully)

---
 src/common/perf.py | 44 ++++++++++++++++++++------------------------
 1 file changed, 20 insertions(+), 24 deletions(-)

diff --git a/src/common/perf.py b/src/common/perf.py
index 5075d61b..bfe6d96c 100644
--- a/src/common/perf.py
+++ b/src/common/perf.py
@@ -369,15 +369,15 @@ def plot_all_perf(self, node, timestamps, disk_io_read, disk_io_write, net_io_ex
 
         # import matplotlib just-in-time
         import matplotlib.pyplot as plt
-        #plt.switch_backend('agg')
+        plt.switch_backend('agg') # to enable it in thread
 
-        fig, ax = plt.subplots(nrows=1, ncols=1, sharex=False, figsize=PerfReportPlotter.PERF_DISK_NET_PLOT_FIGSIZE)
+        fig, ax = plt.subplots(
+            nrows=1,
+            ncols=1,
+            sharex=False, # single axes; cpu/mem share its x axis (job time) via twinx() below
+            figsize=PerfReportPlotter.PERF_DISK_NET_PLOT_FIGSIZE
+        )
 
-        #ax = axes[0]
-        ax.set_xlabel('job time')
-        ax.set_ylabel('mb')
-
         reduced_disk_io_read, reduced_timestamps = self.diff_at_partitions(disk_io_read, PerfReportPlotter.PERF_DISK_NET_PLOT_BINS, timestamps)
         reduced_disk_io_write, _ = self.diff_at_partitions(disk_io_write, PerfReportPlotter.PERF_DISK_NET_PLOT_BINS, timestamps)
         reduced_net_io_ext_sent, _ = self.diff_at_partitions(net_io_ext_sent, PerfReportPlotter.PERF_DISK_NET_PLOT_BINS, timestamps)
@@ -391,28 +391,24 @@ def plot_all_perf(self, node, timestamps, disk_io_read, disk_io_write, net_io_ex
             max(reduced_net_io_ext_recv),
         ) * 1.05 # 5% more to keep some visual margin on the plot
 
-        width = max(reduced_timestamps) / len(reduced_timestamps) / 5
-        ax.bar(reduced_timestamps, reduced_disk_io_read, width=width, label="disk read", color="springgreen")
-        ax.bar(reduced_timestamps + width, reduced_disk_io_write, width=width, label="disk write", color="darkgreen")
-        ax.bar(reduced_timestamps + 2*width, reduced_net_io_ext_sent, width=width, label="net sent", color="hotpink")
-        ax.bar(reduced_timestamps + 3*width, reduced_net_io_ext_recv, width=width, label="net recv", color="purple")
+        width = max(reduced_timestamps) / len(reduced_timestamps) / 5 # barplot width = 1/5 of timestamp unit
+        ax.bar(reduced_timestamps, reduced_disk_io_read, width=width, label="disk read", color="w", hatch='/', edgecolor="dimgray")
+        ax.bar(reduced_timestamps + width, reduced_disk_io_write, width=width, label="disk write", color="w", hatch='-', edgecolor="dimgray")
+        ax.bar(reduced_timestamps + 2*width, reduced_net_io_ext_sent, width=width, label="net sent", color="silver")
+        ax.bar(reduced_timestamps + 3*width, reduced_net_io_ext_recv, width=width, label="net recv", color="gray")
         ax.set_ylim([0.0, max_disk_net_normalization_value])
         ax.legend(loc='upper left')
+        ax.set_ylabel('MB')
 
-        ax2 = ax.twinx()
-        #ax2 = axes[1]
-        ax2.set_ylabel('%')
+        ax2 = ax.twinx() # superimposes on ax, but shows its own axis on the right
         ax2.plot(timestamps, cpu_utilization, label="cpu", color="r")
-        ax2.plot(timestamps, mem_utilization, label="mem", color="b")
-        ax2.set_ylim([0.0, 100.0])
+        ax2.plot(timestamps, mem_utilization, label="mem", color="b", linestyle='dotted')
+        ax2.set_ylim([0.0, max(max(cpu_utilization), max(mem_utilization), 100.0)])
         ax2.legend(loc='upper right')
+        ax2.set_ylabel('%')
-        #ax.set_title(f"Disk+Net I/O normalized plot (max = {max_disk_net_normalization_value:.2f}mb")
-
-        #plt.xlabel("job time")
-        #plt.ylim(0.0, max_disk_net_normalization_value)
-        plt.legend(loc='best')
+        ax.set_xlabel("time in seconds (job duration={:.2f}s)".format(timestamps[-1]-timestamps[0]))
+        plt.title(f"Perf plot for node {node}")
 
-        plt.show()
-
         # record in mlflow
-        self.metrics_logger.log_figure(fig, f"disk_and_net_plot_node{node}.png")
+        self.metrics_logger.log_figure(fig, f"perf_plot_node{node}.png")
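Reviewer note on the accessibility polish: the restyle replaces hue-only encoding (springgreen/darkgreen/hotpink/purple) with hatch patterns, gray levels and a dotted linestyle, which stay distinguishable in grayscale print and for color-blind readers. A minimal sketch of hatch-coded grouped bars (synthetic data, illustrative names; not part of the patch):

    import numpy as np
    import matplotlib
    matplotlib.use('agg')
    import matplotlib.pyplot as plt

    rng = np.random.default_rng(0)
    x = np.arange(5, dtype=float)
    fig, ax = plt.subplots()
    # encode the four series with hatches and gray levels, not hue alone
    ax.bar(x,       rng.random(5), width=0.2, color='w', hatch='/', edgecolor='dimgray', label='disk read')
    ax.bar(x + 0.2, rng.random(5), width=0.2, color='w', hatch='-', edgecolor='dimgray', label='disk write')
    ax.bar(x + 0.4, rng.random(5), width=0.2, color='silver', label='net sent')
    ax.bar(x + 0.6, rng.random(5), width=0.2, color='gray', label='net recv')
    ax.legend(loc='upper left')
    fig.savefig('hatched_bars_sketch.png')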
From 9b74effc9e095e38ac69abb16ae3e7cd0fcc248b Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Fri, 11 Feb 2022 14:23:33 -0800
Subject: [PATCH 4/6] add log_figure to component tests

---
 tests/common/test_component.py   | 6 ++++--
 tests/common/test_distributed.py | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/common/test_component.py b/tests/common/test_component.py
index 2865f9e3..6d3bb9ea 100644
--- a/tests/common/test_component.py
+++ b/tests/common/test_component.py
@@ -102,11 +102,12 @@ def run(self, args, logger, metrics_logger, unknown_args):
         time.sleep(1)
 
 @patch('mlflow.end_run')
+@patch('mlflow.log_figure')
 @patch('mlflow.log_artifact')
 @patch('mlflow.log_metric')
 @patch('mlflow.set_tags')
 @patch('mlflow.start_run')
-def test_single_node_script_metrics(mlflow_start_run_mock, mlflow_set_tags_mock, mlflow_log_metric_mock, mlflow_log_artifact_mock, mlflow_end_run_mock):
+def test_single_node_script_metrics(mlflow_start_run_mock, mlflow_set_tags_mock, mlflow_log_metric_mock, mlflow_log_artifact_mock, mlflow_log_figure_mock, mlflow_end_run_mock):
     # just run main
     test_component = FakeSingleNodeScript.main(
         [
@@ -132,7 +133,8 @@ def test_single_node_script_metrics(mlflow_start_run_mock, mlflow_set_tags_mock,
         mlflow_log_metric_mock
     )
 
-    mlflow_log_artifact_mock.assert_called_once()
+    mlflow_log_artifact_mock.assert_called_once() # perf data exported in json
+    mlflow_log_figure_mock.assert_called_once() # perf plot
 
 
 class FailingSingleNodeScript(SingleNodeScript):
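Reviewer note on the @patch stack above: unittest.mock applies stacked decorators bottom-up, so the decorator closest to the function supplies the first mock argument; that is why mlflow_log_figure_mock lands between mlflow_log_artifact_mock and mlflow_end_run_mock in the test signature. A self-contained illustration (os functions used purely as stand-in patch targets):

    from unittest.mock import patch

    @patch('os.remove')  # outermost decorator -> last mock argument
    @patch('os.rename')
    @patch('os.mkdir')   # innermost decorator -> first mock argument
    def check(mkdir_mock, rename_mock, remove_mock):
        import os
        os.mkdir('a')        # intercepted by mkdir_mock, filesystem untouched
        os.rename('a', 'b')  # intercepted by rename_mock
        os.remove('b')       # intercepted by remove_mock
        mkdir_mock.assert_called_once()
        rename_mock.assert_called_once()
        remove_mock.assert_called_once()

    check()

The same rule explains the bug in the next file: a duplicated @patch('mlflow.log_artifact') still injects its own argument, so mlflow_log_figure_mock silently receives a second log_artifact mock — which is what [PATCH 6/6] later corrects.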
diff --git a/tests/common/test_distributed.py b/tests/common/test_distributed.py
index 23424af1..561212eb 100644
--- a/tests/common/test_distributed.py
+++ b/tests/common/test_distributed.py
@@ -27,11 +27,12 @@ def run(self, args, logger, metrics_logger, unknown_args):
 
 @patch('mlflow.end_run')
 @patch('mlflow.log_artifact')
+@patch('mlflow.log_artifact')
 @patch('mlflow.log_metric')
 @patch('mlflow.set_tags')
 @patch('mlflow.start_run')
 @patch('common.distributed.MultiNodeMPIDriver')
-def test_multi_node_script(mpi_driver_mock, mlflow_start_run_mock, mlflow_set_tags_mock, mlflow_log_metric_mock, mlflow_log_artifact_mock, mlflow_end_run_mock):
+def test_multi_node_script(mpi_driver_mock, mlflow_start_run_mock, mlflow_set_tags_mock, mlflow_log_metric_mock, mlflow_log_artifact_mock, mlflow_log_figure_mock, mlflow_end_run_mock):
     # fake mpi initialization + config
     mpi_driver_mock().get_multinode_config.return_value = multinode_config_class(
         1, # world_size
@@ -65,7 +66,8 @@ def test_multi_node_script(mpi_driver_mock, mlflow_start_run_mock, mlflow_set_ta
         mlflow_log_metric_mock
     )
 
-    mlflow_log_artifact_mock.assert_called_once()
+    mlflow_log_artifact_mock.assert_called_once() # perf data exported in json
+    mlflow_log_figure_mock.assert_called_once() # perf plot
 
 
 class FailingMultiNodeScript(MultiNodeScript):
From 805ce93bbd5ba1b482274c8f7fc2309a32b78213 Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Fri, 11 Feb 2022 15:03:22 -0800
Subject: [PATCH 5/6] update tests

---
 src/common/perf.py               | 6 +++++-
 tests/common/test_component.py   | 2 +-
 tests/common/test_distributed.py | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/common/perf.py b/src/common/perf.py
index 310e2146..906b51dc 100644
--- a/src/common/perf.py
+++ b/src/common/perf.py
@@ -197,6 +197,7 @@ class PerfReportPlotter():
     def __init__(self, metrics_logger):
         self.all_reports = {}
         self.metrics_logger = metrics_logger
+        self.logger = logging.getLogger(__name__)
 
     def save_to(self, perf_report_file_path=None):
         """Saves all reports into a json file"""
@@ -234,7 +235,6 @@ def diff_at_partitions(self, a, p, t):
             partitioned_diff = []
             partitioned_time = []
             index_partitions = np.array_split(np.arange(len(a)), p)
-            print(f"i={index_partitions}")
             for part in index_partitions:
                 partitioned_diff.append(a[part[-1]] - a[part[0]]) # diff between last and first value of partition
                 partitioned_time.append(t[part[0]])
@@ -374,6 +374,10 @@ def plot_all_perf(self, node, timestamps, disk_io_read, disk_io_write, net_io_ex
 
         NOTE: we're expecting all arrays to have the same length.
         """
+        if len(timestamps) < 3:
+            self.logger.warning(f"Not enough perf data points ({len(timestamps)}); skipping the perf plot.")
+            return
+
         timestamps = timestamps - timestamps[0]
         disk_io_read = disk_io_read - disk_io_read[0] # cumsum starting at 0
         disk_io_write = disk_io_write - disk_io_write[0] # cumsum starting at 0
diff --git a/tests/common/test_component.py b/tests/common/test_component.py
index 6d3bb9ea..5c7655cb 100644
--- a/tests/common/test_component.py
+++ b/tests/common/test_component.py
@@ -99,7 +99,7 @@ def __init__(self):
     def run(self, args, logger, metrics_logger, unknown_args):
         # don't do anything
         with metrics_logger.log_time_block("fake_time_block", step=1):
-            time.sleep(1)
+            time.sleep(5)
 
 @patch('mlflow.end_run')
 @patch('mlflow.log_figure')
diff --git a/tests/common/test_distributed.py b/tests/common/test_distributed.py
index 561212eb..79ff6c1d 100644
--- a/tests/common/test_distributed.py
+++ b/tests/common/test_distributed.py
@@ -22,7 +22,7 @@ def __init__(self):
     def run(self, args, logger, metrics_logger, unknown_args):
         # don't do anything
         with metrics_logger.log_time_block("fake_time_block", step=1):
-            time.sleep(1)
+            time.sleep(5)
 
 
 @patch('mlflow.end_run')
From a41a699a8f18295d51243e23e7cca0a85e32a26e Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Fri, 11 Feb 2022 15:15:43 -0800
Subject: [PATCH 6/6] fix unit tests

---
 tests/common/test_distributed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/common/test_distributed.py b/tests/common/test_distributed.py
index 79ff6c1d..80662093 100644
--- a/tests/common/test_distributed.py
+++ b/tests/common/test_distributed.py
@@ -26,7 +26,7 @@ def run(self, args, logger, metrics_logger, unknown_args):
 
 @patch('mlflow.end_run')
-@patch('mlflow.log_artifact')
+@patch('mlflow.log_figure')
 @patch('mlflow.log_artifact')
 @patch('mlflow.log_metric')
 @patch('mlflow.set_tags')
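Reviewer note: the tests patch 'mlflow.log_figure' directly, which suggests that metrics_logger.log_figure is a thin wrapper over that mlflow API (an assumption — the wrapper itself is not shown in this diff). A minimal usage sketch of the underlying call, assuming mlflow>=1.13 where mlflow.log_figure(figure, artifact_file) is available:

    import matplotlib
    matplotlib.use('agg')
    import matplotlib.pyplot as plt
    import mlflow

    with mlflow.start_run():
        fig, ax = plt.subplots()
        ax.plot([0, 1, 2], [10.0, 20.0, 15.0])
        # stores the figure as a run artifact under the given relative path
        mlflow.log_figure(fig, "perf_plot_node0.png")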