mlpiper · PMjoydas · Mar 2, 2019 · Mar 2, 2019 · Mar 2, 2019 · Mar 2, 2019
diff --git a/components/Python/Scikit-learn/scikit_learn_predictor/__init__.py b/components/Python/Scikit-learn/scikit_learn_predictor/__init__.py
diff --git a/components/Python/Scikit-learn/scikit_learn_predictor/component.json b/components/Python/Scikit-learn/scikit_learn_predictor/component.json
@@ -0,0 +1,41 @@
+{
+  "engineType": "Python",
+  "language": "Python",
+  "userStandalone": false,
+  "name": "scikit_learn_predictor",
+  "label": "Scikit Learn Predictor",
+  "program": "main.py",
+  "componentClass": "MCenterComponentAdapter",
+  "modelBehavior": "ModelConsumer",
+  "useMLOps": true,
+  "inputInfo": [
+    {
+      "description": "Pandas Dataframe",
+      "label": "dataframe",
+      "defaultComponent": "",
+      "type": "dataframe",
+      "group": "data"
+    }
+  ],
+  "outputInfo": [
+    {
+      "description": "Pandas Dataframe",
+      "label": "dataframe",
+      "defaultComponent": "",
+      "type": "dataframe",
+      "group": "data"
+    }
+  ],
+  "group": "Algorithms",
+  "arguments": [
+    {
+      "key": "input-model",
+      "label": "Model input file",
+      "type": "str",
+      "description": "File to use for loading the model",
+      "optional": true,
+      "tag": "input_model_path"
+    }
+  ],
+  "version": 1
+}
diff --git a/components/Python/Scikit-learn/scikit_learn_predictor/main.py b/components/Python/Scikit-learn/scikit_learn_predictor/main.py
@@ -0,0 +1,105 @@
+from __future__ import print_function
+
+import argparse
+import sys
+import time
+import os
+import pickle
+import pandas as pd
+import numpy as np
+
+from sklearn.exceptions import NotFittedError
+from parallelm.components import ConnectableComponent
+from parallelm.mlops import mlops as mlops
+from parallelm.mlops.stats.bar_graph import BarGraph
+from parallelm.mlops.stats.multi_line_graph import MultiLineGraph
+from parallelm.mlops.predefined_stats import PredefinedStats
+
+
+class MCenterComponentAdapter(ConnectableComponent):
+    """
+    Adapter for the do_predict
+    """
+    def __init__(self, engine):
+        super(self.__class__, self).__init__(engine)
+
+    def _materialize(self, parent_data_objs, user_data):
+        df_infer_data = parent_data_objs[0]
+        input_model = self._params.get("input-model")
+        return[do_predict(df_infer_data, input_model)]
+
+
+def do_predict(df_infer, input_model):
+    """
+    Perform predictions:
+        a) load scikit-learn model
+        b) on dataset run predict, collect stats assocaited to predictions
+        c) obtain class probability stats
+        d) report stats using mlops APIs
+    """
+    prog_start_time = time.time()
+    mlops.init()
+    model = pickle.load(open(input_model, "rb"))
+    data_features = df_infer.values
+
+    # Start timer (inference)
+    inference_start_time = time.time()
+    # Predict labels
+    predict_results = model.predict(df_infer)
+    # End timer (inference)
+    inference_elapsed_time = time.time() - inference_start_time
+
+    # Predict probability
+    class_probability = model.predict_proba(df_infer)
+    maximum_prob = np.max(class_probability, axis=1)
+    # Tag samples that are below a certain probability and write to a file
+    confidence = 0.7
+
+    low_prob_predictions = predict_results[np.where(maximum_prob < confidence)]
+    unique_elements_low, counts_elements_low = np.unique(low_prob_predictions, return_counts=True)
+    unique_elements_low = [str(i) for i in unique_elements_low]
+    # self._logger.info("Low confidence predictions: \n {0} \n with frequency {1}".format(unique_elements_low, counts_elements_low))
+
+    # ########## Start of MCenter instrumentation ##############
+    # # BarGraph showing distribution of low confidence labels
+    bar = BarGraph().name("Low confidence label distribution").cols(unique_elements_low).data(counts_elements_low.tolist())
+    # self._logger.info("Low bar : ", type(bar), "->", bar)
+    mlops.set_stat(bar)
+
+    # ########## End of MCenter instrumentation ################
+    #
+    # # Samples with high probability
+    high_prob_predictions = predict_results[np.where(maximum_prob > confidence)]
+    unique_elements_high, counts_elements_high = np.unique(high_prob_predictions, return_counts=True)
+    unique_elements_high = [str(i) for i in unique_elements_high]
+    # self._logger.info("High confidence predictions: \n {0} \n with frequency {1}".format(unique_elements_high,
+    #                                                                                 counts_elements_low))
+
+    # ########## Start of MCenter instrumentation ##############
+    # #  BarGraph showing distribution of high confidence labels
+    bar = BarGraph().name("High confidence label distribution").cols(unique_elements_high).data(counts_elements_high.tolist())
+    # self._logger.info("High bar : ", type(bar), "->", bar)
+    mlops.set_stat(bar)
+    ########## End of MCenter instrumentation ################
+
+    ########  Report PM stats ###########
+    mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, len(data_features))
+    prog_elapsed_time = time.time() - prog_start_time
+    mlt_time = MultiLineGraph().name("Time Elapsed").labels(["Program Time", "Inference Time"])
+    mlt_time.data([prog_elapsed_time, inference_elapsed_time])
+    mlops.set_stat(mlt_time)
+    ###  End of PM stats reporting #####
+
+    mlops.done()
+
+    # return predict_results
+    return class_probability
+
+
+def parse_args():
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input-model", help="Path to load model from")
+    options = parser.parse_args()
+    return options
+
diff --git a/components/Python/featureEng/df_label_encoding/__init__.py b/components/Python/featureEng/df_label_encoding/__init__.py
diff --git a/components/Python/featureEng/df_label_encoding/component.json b/components/Python/featureEng/df_label_encoding/component.json
@@ -0,0 +1,32 @@
+{
+  "engineType": "Python",
+  "language": "Python",
+  "userStandalone": false,
+  "name": "df_label_encoding",
+  "label": "Feature Engineering: Label Encoding DataFrame",
+  "program": "main.py",
+  "componentClass": "MCenterComponentAdapter",
+  "modelBehavior": "Auxiliary",
+  "useMLOps": true,
+  "inputInfo": [
+    {
+      "description": "Pandas Dataframe",
+      "label": "dataframe",
+      "defaultComponent": "",
+      "type": "dataframe",
+      "group": "data"
+    }
+  ],
+  "outputInfo": [
+    {
+      "description": "Pandas Dataframe",
+      "label": "dataframe",
+      "defaultComponent": "",
+      "type": "dataframe",
+      "group": "data"
+    }
+  ],
+  "group": "FeatureEng",
+  "arguments": [],
+  "version": 1
+}
diff --git a/components/Python/featureEng/df_label_encoding/main.py b/components/Python/featureEng/df_label_encoding/main.py
@@ -0,0 +1,55 @@
+from __future__ import print_function
+
+import argparse
+import sys
+import time
+import os
+import pandas
+import numpy as np
+
+from sklearn.exceptions import NotFittedError
+from sklearn import preprocessing
+from parallelm.components import ConnectableComponent
+from parallelm.mlops import mlops as mlops
+from parallelm.mlops.stats.bar_graph import BarGraph
+from parallelm.mlops.stats.multi_line_graph import MultiLineGraph
+from parallelm.mlops.predefined_stats import PredefinedStats
+
+
+class MCenterComponentAdapter(ConnectableComponent):
+    """
+    Adapter for the do_label_encoding
+    """
+
+    def __init__(self, engine):
+        super(self.__class__, self).__init__(engine)
+
+    def _materialize(self, parent_data_objs, user_data):
+        df_data = parent_data_objs[0]
+        return[do_label_encoding(df_data)]
+
+
+def do_label_encoding(df_data):
+    """
+    Cleanup: (Feature Engineering)
+        a) simple label encoding, convert string to real values
+        b) remove NaN's drop rows with NaN
+    """
+    for column in df_data.columns:
+        if df_data[column].dtype == type(object):
+            le = preprocessing.LabelEncoder()
+            df_data[column] = le.fit_transform(df_data[column])
+
+    df_data = df_data.dropna()
+
+    # Initialize MLOps Library
+    mlops.init()
+
+    # Output Health Statistics to MCenter
+    # MLOps API to report the distribution statistics of each feature
+    mlops.set_data_distribution_stat(df_data)
+
+    # Terminate MLOPs
+    mlops.done()
+    return df_data
+
diff --git a/components/Python/fileConnectors/dataframe_to_file/__init__.py b/components/Python/fileConnectors/dataframe_to_file/__init__.py
diff --git a/components/Python/fileConnectors/dataframe_to_file/component.json b/components/Python/fileConnectors/dataframe_to_file/component.json
@@ -0,0 +1,40 @@
+{
+  "engineType": "Python",
+  "language": "Python",
+  "userStandalone": false,
+  "name": "dataframe_to_file",
+  "label": "Sink DataFrame to File",
+  "program": "main.py",
+  "componentClass": "MCenterComponentAdapter",
+  "modelBehavior": "Auxiliary",
+  "useMLOps": true,
+  "inputInfo": [
+    {
+      "description": "Pandas DataFrame",
+      "label": "dataframe",
+      "defaultComponent": "",
+      "type": "dataframe",
+      "group": "data"
+    }
+  ],
+  "outputInfo": [
+    {
+      "description": "File name",
+      "label": "filename",
+      "defaultComponent": "",
+      "type": "str",
+      "group": "data"
+    }
+  ],
+  "group": "Sinks",
+  "arguments": [
+    {
+      "key": "file-path",
+      "label": "save to file",
+      "type": "str",
+      "description": "Save DataFrame to file",
+      "optional": true
+    }
+  ],
+  "version": 1
+}
diff --git a/components/Python/fileConnectors/dataframe_to_file/main.py b/components/Python/fileConnectors/dataframe_to_file/main.py
@@ -0,0 +1,47 @@
+from __future__ import print_function
+
+import argparse
+import sys
+import time
+import os
+import pandas
+
+from parallelm.components import ConnectableComponent
+from parallelm.mlops.stats.multi_line_graph import MultiLineGraph
+from parallelm.mlops import mlops as mlops
+
+class MCenterComponentAdapter(ConnectableComponent):
+    """
+    Adapter for df_to_file 
+    """
+
+    def __init__(self, engine):
+        super(self.__class__, self).__init__(engine)
+
+    def _materialize(self, parent_data_objs, user_data):
+        df_results = parent_data_objs[0]
+        results_path = self._params.get('file_path')
+        return [df_to_file(df_results, results_path)]
+
+
+def df_to_file(df_predict_results, filepath):
+    """
+    Save DataFrame to file
+    """
+    prog_start_time = time.time()
+    mlops.init()
+    suffix_time_stamp = str(int(time.time()))
+    save_file = filepath + '.' + suffix_time_stamp
+    sfile = open(save_file, 'w+')
+    pandas.DataFrame(df_predict_results).to_csv(save_file)
+    sfile.close()
+    mlops.done()
+    return save_file
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--file-path", default='/tmp/results', help="Save DataFrame to file")
+    options = parser.parse_args()
+    return options
+
diff --git a/components/Python/fileConnectors/file_to_dataframe/__init__.py b/components/Python/fileConnectors/file_to_dataframe/__init__.py
diff --git a/components/Python/fileConnectors/file_to_dataframe/component.json b/components/Python/fileConnectors/file_to_dataframe/component.json
@@ -0,0 +1,38 @@
+{
+  "engineType": "Python",
+  "language": "Python",
+  "userStandalone": false,
+  "name": "file_to_dataframe",
+  "label": "Source File to DataFrame",
+  "program": "main.py",
+  "componentClass": "MCenterComponentAdapter",
+  "modelBehavior": "Auxiliary",
+  "useMLOps": true,
+  "inputInfo": [{
+    "description": "File to read contents",
+    "label": "File-Name",
+    "defaultComponent": "",
+    "type": "str",
+    "group": "data"
+  }],
+  "outputInfo": [
+    {
+      "description": "Pandas Dataframe",
+      "label": "dataframe",
+      "defaultComponent": "",
+      "type": "dataframe",
+      "group": "data"
+    }
+  ],
+  "group": "Connectors",
+  "arguments": [
+    {
+      "key": "file-path",
+      "label": "Dataset file to read",
+      "type": "str",
+      "description": "File to use for loading DataSet into DataFrame",
+      "optional": true
+    }
+  ],
+  "version": 1
+}