From 0207ac2db38226845d3e23ba5768b47d435b4e40 Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Mon, 29 Nov 2021 09:17:23 -0800
Subject: [PATCH 1/3] unify latency metrics inside a helper function

---
 src/common/metrics.py                    | 66 +++++++++++++++++++
 .../inferencing/lightgbm_c_api/score.py  | 25 ++-----
 .../inferencing/lightgbm_python/score.py | 29 +++++---
 3 files changed, 91 insertions(+), 29 deletions(-)

diff --git a/src/common/metrics.py b/src/common/metrics.py
index f0993945..b146b469 100644
--- a/src/common/metrics.py
+++ b/src/common/metrics.py
@@ -165,6 +165,72 @@ def log_time_block(self, metric_name):
         # see class below with proper __enter__ and __exit__
         return LogTimeBlock(metric_name)
 
+    def log_inferencing_latencies(self, time_per_batch, batch_length=1, factor_to_usecs=1000000.0):
+        """Logs prediction latencies (for inferencing) as metrics, percentiles and histogram plots.
+
+        Args:
+            time_per_batch (List[float]): time spent on each inferencing batch
+            batch_length (Union[List[int], int]): length of each batch (list, or constant applied to all batches)
+            factor_to_usecs (float): factor to apply to time_per_batch to convert it to microseconds
+        """
+        if isinstance(batch_length, list):
+            sum_batch_lengths = sum(batch_length)
+        else:
+            sum_batch_lengths = batch_length * len(time_per_batch)
+
+        # log metadata
+        self.log_metric("prediction_batches", len(time_per_batch))
+        self.log_metric("prediction_queries", sum_batch_lengths)
+
+        if len(time_per_batch) > 0:
+            self.log_metric("prediction_latency_avg", (sum(time_per_batch) * factor_to_usecs) / sum_batch_lengths)  # usecs
+
+        # if there's more than 1 batch, compute percentiles
+        if len(time_per_batch) > 1:
+            import numpy as np
+            import matplotlib.pyplot as plt
+
+            # latency per batch
+            batch_run_times = np.array(time_per_batch) * factor_to_usecs
+            self.log_metric("batch_latency_p50_usecs", np.percentile(batch_run_times, 50))
+            self.log_metric("batch_latency_p75_usecs", np.percentile(batch_run_times, 75))
+            self.log_metric("batch_latency_p90_usecs", np.percentile(batch_run_times, 90))
+            self.log_metric("batch_latency_p95_usecs", np.percentile(batch_run_times, 95))
+            self.log_metric("batch_latency_p99_usecs", np.percentile(batch_run_times, 99))
+
+            # show the distribution of batch latencies
+            fig, ax = plt.subplots(1)
+            ax.hist(batch_run_times, bins=100)
+            ax.set_title("Latency-per-batch histogram (log scale)")
+            plt.xlabel("usecs")
+            plt.ylabel("occurrence")
+            plt.yscale('log')
+
+            # record in mlflow
+            self.log_figure(fig, "batch_latency_log_histogram.png")
+
+            # latency per query
+            if isinstance(batch_length, list):
+                prediction_latencies = np.array(time_per_batch) * factor_to_usecs / np.array(batch_length)
+            else:
+                prediction_latencies = np.array(time_per_batch) * factor_to_usecs / batch_length
+
+            self.log_metric("prediction_latency_p50_usecs", np.percentile(prediction_latencies, 50))
+            self.log_metric("prediction_latency_p75_usecs", np.percentile(prediction_latencies, 75))
+            self.log_metric("prediction_latency_p90_usecs", np.percentile(prediction_latencies, 90))
+            self.log_metric("prediction_latency_p95_usecs", np.percentile(prediction_latencies, 95))
+            self.log_metric("prediction_latency_p99_usecs", np.percentile(prediction_latencies, 99))
+
+            # show the distribution of prediction latencies
+            fig, ax = plt.subplots(1)
+            ax.hist(prediction_latencies, bins=100)
+            ax.set_title("Latency-per-prediction histogram (log scale)")
+            plt.xlabel("usecs")
+            plt.ylabel("occurrence")
+            plt.yscale('log')
+
+            # record in mlflow
+            self.log_figure(fig, "prediction_latency_log_histogram.png")
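Note on usage: the sketch below is hypothetical caller code, not part of this patch series; it assumes MetricsLogger is importable from common.metrics (the module patched above) and uses placeholder names (batches, model) for whatever produces the predictions. It shows the intended calling pattern when latencies are collected per batch of varying size:

    import time
    from common.metrics import MetricsLogger  # assumed import path, matching src/common/metrics.py

    metrics_logger = MetricsLogger()
    time_per_batch = []   # seconds spent on each batch
    batch_lengths = []    # number of queries in each batch

    for batch in batches:  # 'batches' and 'model' are placeholders for this sketch
        start = time.monotonic()
        model.predict(batch)
        time_per_batch.append(time.monotonic() - start)
        batch_lengths.append(len(batch))

    # seconds -> microseconds; passing per-batch sizes makes the per-query percentiles meaningful
    metrics_logger.log_inferencing_latencies(
        time_per_batch,
        batch_length=batch_lengths,
        factor_to_usecs=1000000.0,
    )

The two score.py changes below are special cases of this pattern: the C API script already has per-row timings in microseconds (batch_length=1, factor_to_usecs=1.0), while the Python script times a single large batch measured in seconds.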
diff --git a/src/scripts/inferencing/lightgbm_c_api/score.py b/src/scripts/inferencing/lightgbm_c_api/score.py
index 1353daa6..32a6ecb7 100644
--- a/src/scripts/inferencing/lightgbm_c_api/score.py
+++ b/src/scripts/inferencing/lightgbm_c_api/score.py
@@ -210,25 +210,12 @@ def run(self, args, logger, metrics_logger, unknown_args):
             else:
                 logger.warning(f"log metric {line} does not match expected pattern {row_pattern}")
-
-        if len(time_inferencing_per_query) > 1:
-            batch_run_times = np.array(time_inferencing_per_query)
-            metrics_logger.log_metric("batch_time_inferencing_p50_usecs", np.percentile(batch_run_times, 50))
-            metrics_logger.log_metric("batch_time_inferencing_p75_usecs", np.percentile(batch_run_times, 75))
-            metrics_logger.log_metric("batch_time_inferencing_p90_usecs", np.percentile(batch_run_times, 90))
-            metrics_logger.log_metric("batch_time_inferencing_p95_usecs", np.percentile(batch_run_times, 95))
-            metrics_logger.log_metric("batch_time_inferencing_p99_usecs", np.percentile(batch_run_times, 99))
-
-            # show the distribution prediction latencies
-            fig, ax = plt.subplots(1)
-            ax.hist(batch_run_times, bins=100)
-            ax.set_title("Latency-per-query histogram (log scale)")
-            plt.xlabel("usecs")
-            plt.ylabel("occurence")
-            plt.yscale('log')
-
-            # record in mlflow
-            metrics_logger.log_figure(fig, "latency_log_histogram.png")
+        # use helper to log latency with the right metric names
+        metrics_logger.log_inferencing_latencies(
+            time_inferencing_per_query,
+            batch_length=1, # in this exec, each row is just 1 prediction call
+            factor_to_usecs=1.0 # values are already in usecs
+        )
 
         if args.output:
             np.savetxt(
diff --git a/src/scripts/inferencing/lightgbm_python/score.py b/src/scripts/inferencing/lightgbm_python/score.py
index bc69dbab..2f1d20b7 100644
--- a/src/scripts/inferencing/lightgbm_python/score.py
+++ b/src/scripts/inferencing/lightgbm_python/score.py
@@ -8,9 +8,10 @@
 import sys
 import argparse
 import logging
+import time
+import numpy as np
 from distutils.util import strtobool
 import lightgbm
-import numpy
 
 # Add the right path to PYTHONPATH
 # so that you can import from common.*
@@ -104,15 +105,24 @@ def run(self, args, logger, metrics_logger, unknown_args):
         )
 
         logger.info(f"Running .predict()")
-        with metrics_logger.log_time_block("time_inferencing"):
-            predictions_array = booster.predict(
-                data=inference_raw_data,
-                num_threads=args.num_threads,
-                predict_disable_shape_check=bool(args.predict_disable_shape_check)
-            )
-
+        batch_start_time = time.monotonic()
+        predictions_array = booster.predict(
+            data=inference_raw_data,
+            num_threads=args.num_threads,
+            predict_disable_shape_check=bool(args.predict_disable_shape_check)
+        )
+        prediction_time = (time.monotonic() - batch_start_time)
+        metrics_logger.log_metric("time_inferencing", prediction_time)
+
+        # use helper to log latency with the right metric names
+        metrics_logger.log_inferencing_latencies(
+            [prediction_time], # only one big batch
+            batch_length=inference_data.num_data(),
+            factor_to_usecs=1000000.0 # values are in seconds
+        )
+
         if args.output:
-            numpy.savetxt(
+            np.savetxt(
                 args.output,
                 predictions_array,
                 fmt='%f',
@@ -135,4 +145,3 @@ def main(cli_args=None):
 
 if __name__ == "__main__":
     main()
-

From f988aa3e36e90af79e5624ade26fb1b6d9f7f944 Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Mon, 29 Nov 2021 09:37:37 -0800
Subject: [PATCH 2/3] add unit tests

---
 tests/common/test_metrics.py | 37 ++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/tests/common/test_metrics.py b/tests/common/test_metrics.py
index 7af3af66..42d09560 100644
--- a/tests/common/test_metrics.py
+++ b/tests/common/test_metrics.py
@@ -208,3 +208,40 @@ def test_metrics_logger_log_time_block(mlflow_log_metric_mock):
         time.sleep(0.01)
 
     mlflow_log_metric_mock.assert_called_once()
+
+
+@patch('mlflow.log_figure')
+@patch('mlflow.log_metric')
+def test_log_inferencing_latencies(mlflow_log_metric_mock, mlflow_log_figure_mock):
+    """ Tests MetricsLogger().log_inferencing_latencies() """
+    metrics_logger = MetricsLogger()
+    metrics_logger.close()
+
+    test_latencies = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 5.0]
+    test_batch_sizes = [1, 1, 1, 1, 1, 1, 1, 5]
+
+    metrics_logger.log_inferencing_latencies(test_latencies, batch_length=test_batch_sizes, factor_to_usecs=1000000.0)
+
+    mlflow_log_metric_mock.assert_has_calls(
+        [
+            call("prediction_batches", 8, step=None), # len(test_latencies)
+            call("prediction_queries", 12, step=None), # sum(test_batch_sizes)
+
+            # reference values based on test_latencies above
+
+            call("prediction_latency_avg", 650000.0, step=None), # sum(test_batch_sizes)
+
+            call("batch_latency_p50_usecs", 450000.0, step=None), # sum(test_batch_sizes)
+            call("batch_latency_p75_usecs", 625000.0, step=None), # sum(test_batch_sizes)
+            call("batch_latency_p90_usecs", 1989999.999999999, step=None), # sum(test_batch_sizes)
+            call("batch_latency_p95_usecs", 3494999.9999999977, step=None), # sum(test_batch_sizes)
+            call("batch_latency_p99_usecs", 4698999.999999998, step=None), # sum(test_batch_sizes)
+
+            call("prediction_latency_p50_usecs", 450000.0, step=None), # sum(test_batch_sizes)
+            call("prediction_latency_p75_usecs", 625000.0, step=None), # sum(test_batch_sizes)
+            call("prediction_latency_p90_usecs", 790000.0, step=None), # sum(test_batch_sizes)
+            call("prediction_latency_p95_usecs", 894999.9999999998, step=None), # sum(test_batch_sizes)
+            call('prediction_latency_p99_usecs', 979000.0, step=None)
+        ],
+        any_order=False
+    )

From 00cc37456ea62223eed1bff4a431ba18a7a96dbf Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Mon, 29 Nov 2021 09:58:28 -0800
Subject: [PATCH 3/3] align unit tests

---
 tests/common/test_metrics.py | 38 ++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/tests/common/test_metrics.py b/tests/common/test_metrics.py
index 42d09560..06997ca9 100644
--- a/tests/common/test_metrics.py
+++ b/tests/common/test_metrics.py
@@ -222,26 +222,34 @@ def test_log_inferencing_latencies(mlflow_log_metric_mock, mlflow_log_figure_moc
     metrics_logger.log_inferencing_latencies(test_latencies, batch_length=test_batch_sizes, factor_to_usecs=1000000.0)
 
+    #assert mlflow_log_metric_mock.call_args_list == []
+
     mlflow_log_metric_mock.assert_has_calls(
         [
            call("prediction_batches", 8, step=None), # len(test_latencies)
            call("prediction_queries", 12, step=None), # sum(test_batch_sizes)
 
            # reference values based on test_latencies above
-
-            call("prediction_latency_avg", 650000.0, step=None), # sum(test_batch_sizes)
-
-            call("batch_latency_p50_usecs", 450000.0, step=None), # sum(test_batch_sizes)
-            call("batch_latency_p75_usecs", 625000.0, step=None), # sum(test_batch_sizes)
-            call("batch_latency_p90_usecs", 1989999.999999999, step=None), # sum(test_batch_sizes)
-            call("batch_latency_p95_usecs", 3494999.9999999977, step=None), # sum(test_batch_sizes)
-            call("batch_latency_p99_usecs", 4698999.999999998, step=None), # sum(test_batch_sizes)
-
-            call("prediction_latency_p50_usecs", 450000.0, step=None), # sum(test_batch_sizes)
call("prediction_latency_p75_usecs", 625000.0, step=None), # sum(test_batch_sizes) - call("prediction_latency_p90_usecs", 790000.0, step=None), # sum(test_batch_sizes) - call("prediction_latency_p95_usecs", 894999.9999999998, step=None), # sum(test_batch_sizes) - call('prediction_latency_p99_usecs', 979000.0, step=None) + call('prediction_latency_avg', 650000.0, step=None), + call('batch_latency_p50_usecs', 450000.0, step=None), + call('batch_latency_p75_usecs', 625000.0, step=None), + call('batch_latency_p90_usecs', 1989999.9999999993, step=None), + call('batch_latency_p95_usecs', 3494999.9999999977, step=None), + call('batch_latency_p99_usecs', 4698999.999999999, step=None), + call('prediction_latency_p50_usecs', 450000.0, step=None), + call('prediction_latency_p75_usecs', 625000.0, step=None), + call('prediction_latency_p90_usecs', 790000.0, step=None), + call('prediction_latency_p95_usecs', 894999.9999999999, step=None), + call('prediction_latency_p99_usecs', 978999.9999999999, step=None), ], - any_order=False + any_order=True ) + + # testing logging figures + figure_calls = mlflow_log_figure_mock.call_args_list + assert mlflow_log_figure_mock.call_count == 2 + assert len(figure_calls) == 2 + + # second argument of each call is file name + assert (figure_calls[0].args[1] == "batch_latency_log_histogram.png") + assert (figure_calls[1].args[1] == "prediction_latency_log_histogram.png")