Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
{
"engineType": "Python",
"language": "Python",
"userStandalone": false,
"name": "scikit_learn_predictor",
"label": "Scikit Learn Predictor",
"program": "main.py",
"componentClass": "MCenterComponentAdapter",
"modelBehavior": "ModelConsumer",
"useMLOps": true,
"inputInfo": [
{
"description": "Pandas Dataframe",
"label": "dataframe",
"defaultComponent": "",
"type": "dataframe",
"group": "data"
}
],
"outputInfo": [
{
"description": "Pandas Dataframe",
"label": "dataframe",
"defaultComponent": "",
"type": "dataframe",
"group": "data"
}
],
"group": "Algorithms",
"arguments": [
{
"key": "input-model",
"label": "Model input file",
"type": "str",
"description": "File to use for loading the model",
"optional": true,
"tag": "input_model_path"
}
],
"version": 1
}
105 changes: 105 additions & 0 deletions components/Python/Scikit-learn/scikit_learn_predictor/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
from __future__ import print_function

import argparse
import sys
import time
import os
import pickle
import pandas as pd
import numpy as np

from sklearn.exceptions import NotFittedError
from parallelm.components import ConnectableComponent
from parallelm.mlops import mlops as mlops
from parallelm.mlops.stats.bar_graph import BarGraph
from parallelm.mlops.stats.multi_line_graph import MultiLineGraph
from parallelm.mlops.predefined_stats import PredefinedStats


class MCenterComponentAdapter(ConnectableComponent):
"""
Adapter for the do_predict
"""
def __init__(self, engine):
super(self.__class__, self).__init__(engine)

def _materialize(self, parent_data_objs, user_data):
df_infer_data = parent_data_objs[0]
input_model = self._params.get("input-model")
return[do_predict(df_infer_data, input_model)]


def do_predict(df_infer, input_model):
"""
Perform predictions:
a) load scikit-learn model
b) on dataset run predict, collect stats assocaited to predictions
c) obtain class probability stats
d) report stats using mlops APIs
"""
prog_start_time = time.time()
mlops.init()
model = pickle.load(open(input_model, "rb"))
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we "try" model loading. In case model not is ready no to get an exception.

data_features = df_infer.values

# Start timer (inference)
inference_start_time = time.time()
# Predict labels
predict_results = model.predict(df_infer)
# End timer (inference)
inference_elapsed_time = time.time() - inference_start_time

# Predict probability
class_probability = model.predict_proba(df_infer)
maximum_prob = np.max(class_probability, axis=1)
# Tag samples that are below a certain probability and write to a file
confidence = 0.7

low_prob_predictions = predict_results[np.where(maximum_prob < confidence)]
unique_elements_low, counts_elements_low = np.unique(low_prob_predictions, return_counts=True)
unique_elements_low = [str(i) for i in unique_elements_low]
# self._logger.info("Low confidence predictions: \n {0} \n with frequency {1}".format(unique_elements_low, counts_elements_low))

# ########## Start of MCenter instrumentation ##############
# # BarGraph showing distribution of low confidence labels
bar = BarGraph().name("Low confidence label distribution").cols(unique_elements_low).data(counts_elements_low.tolist())
# self._logger.info("Low bar : ", type(bar), "->", bar)
mlops.set_stat(bar)

# ########## End of MCenter instrumentation ################
#
# # Samples with high probability
high_prob_predictions = predict_results[np.where(maximum_prob > confidence)]
unique_elements_high, counts_elements_high = np.unique(high_prob_predictions, return_counts=True)
unique_elements_high = [str(i) for i in unique_elements_high]
# self._logger.info("High confidence predictions: \n {0} \n with frequency {1}".format(unique_elements_high,
# counts_elements_low))

# ########## Start of MCenter instrumentation ##############
# # BarGraph showing distribution of high confidence labels
bar = BarGraph().name("High confidence label distribution").cols(unique_elements_high).data(counts_elements_high.tolist())
# self._logger.info("High bar : ", type(bar), "->", bar)
mlops.set_stat(bar)
########## End of MCenter instrumentation ################

######## Report PM stats ###########
mlops.set_stat(PredefinedStats.PREDICTIONS_COUNT, len(data_features))
prog_elapsed_time = time.time() - prog_start_time
mlt_time = MultiLineGraph().name("Time Elapsed").labels(["Program Time", "Inference Time"])
mlt_time.data([prog_elapsed_time, inference_elapsed_time])
mlops.set_stat(mlt_time)
### End of PM stats reporting #####

mlops.done()

# return predict_results
return class_probability
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We return the prediction probability per class and not the predictions. Maybe at least have a configuration for this?
Also do we need another post processing stage to go back to the labels?



def parse_args():

parser = argparse.ArgumentParser()
parser.add_argument("--input-model", help="Path to load model from")
options = parser.parse_args()
return options

Empty file.
32 changes: 32 additions & 0 deletions components/Python/featureEng/df_label_encoding/component.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
"engineType": "Python",
"language": "Python",
"userStandalone": false,
"name": "df_label_encoding",
"label": "Feature Engineering: Label Encoding DataFrame",
"program": "main.py",
"componentClass": "MCenterComponentAdapter",
"modelBehavior": "Auxiliary",
"useMLOps": true,
"inputInfo": [
{
"description": "Pandas Dataframe",
"label": "dataframe",
"defaultComponent": "",
"type": "dataframe",
"group": "data"
}
],
"outputInfo": [
{
"description": "Pandas Dataframe",
"label": "dataframe",
"defaultComponent": "",
"type": "dataframe",
"group": "data"
}
],
"group": "FeatureEng",
"arguments": [],
"version": 1
}
55 changes: 55 additions & 0 deletions components/Python/featureEng/df_label_encoding/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from __future__ import print_function

import argparse
import sys
import time
import os
import pandas
import numpy as np

from sklearn.exceptions import NotFittedError
from sklearn import preprocessing
from parallelm.components import ConnectableComponent
from parallelm.mlops import mlops as mlops
from parallelm.mlops.stats.bar_graph import BarGraph
from parallelm.mlops.stats.multi_line_graph import MultiLineGraph
from parallelm.mlops.predefined_stats import PredefinedStats


class MCenterComponentAdapter(ConnectableComponent):
"""
Adapter for the do_label_encoding
"""

def __init__(self, engine):
super(self.__class__, self).__init__(engine)

def _materialize(self, parent_data_objs, user_data):
df_data = parent_data_objs[0]
return[do_label_encoding(df_data)]


def do_label_encoding(df_data):
"""
Cleanup: (Feature Engineering)
a) simple label encoding, convert string to real values
b) remove NaN's drop rows with NaN
"""
for column in df_data.columns:
if df_data[column].dtype == type(object):
le = preprocessing.LabelEncoder()
df_data[column] = le.fit_transform(df_data[column])

df_data = df_data.dropna()

# Initialize MLOps Library
mlops.init()

# Output Health Statistics to MCenter
# MLOps API to report the distribution statistics of each feature
mlops.set_data_distribution_stat(df_data)

# Terminate MLOPs
mlops.done()
return df_data

Empty file.
40 changes: 40 additions & 0 deletions components/Python/fileConnectors/dataframe_to_file/component.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{
"engineType": "Python",
"language": "Python",
"userStandalone": false,
"name": "dataframe_to_file",
"label": "Sink DataFrame to File",
"program": "main.py",
"componentClass": "MCenterComponentAdapter",
"modelBehavior": "Auxiliary",
"useMLOps": true,
"inputInfo": [
{
"description": "Pandas DataFrame",
"label": "dataframe",
"defaultComponent": "",
"type": "dataframe",
"group": "data"
}
],
"outputInfo": [
{
"description": "File name",
"label": "filename",
"defaultComponent": "",
"type": "str",
"group": "data"
}
],
"group": "Sinks",
"arguments": [
{
"key": "file-path",
"label": "save to file",
"type": "str",
"description": "Save DataFrame to file",
"optional": true
}
],
"version": 1
}
47 changes: 47 additions & 0 deletions components/Python/fileConnectors/dataframe_to_file/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from __future__ import print_function

import argparse
import sys
import time
import os
import pandas

from parallelm.components import ConnectableComponent
from parallelm.mlops.stats.multi_line_graph import MultiLineGraph
from parallelm.mlops import mlops as mlops

class MCenterComponentAdapter(ConnectableComponent):
"""
Adapter for df_to_file
"""

def __init__(self, engine):
super(self.__class__, self).__init__(engine)

def _materialize(self, parent_data_objs, user_data):
df_results = parent_data_objs[0]
results_path = self._params.get('file_path')
return [df_to_file(df_results, results_path)]


def df_to_file(df_predict_results, filepath):
"""
Save DataFrame to file
"""
prog_start_time = time.time()
mlops.init()
suffix_time_stamp = str(int(time.time()))
save_file = filepath + '.' + suffix_time_stamp
sfile = open(save_file, 'w+')
pandas.DataFrame(df_predict_results).to_csv(save_file)
sfile.close()
mlops.done()
return save_file


def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--file-path", default='/tmp/results', help="Save DataFrame to file")
options = parser.parse_args()
return options

Empty file.
38 changes: 38 additions & 0 deletions components/Python/fileConnectors/file_to_dataframe/component.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"engineType": "Python",
"language": "Python",
"userStandalone": false,
"name": "file_to_dataframe",
"label": "Source File to DataFrame",
"program": "main.py",
"componentClass": "MCenterComponentAdapter",
"modelBehavior": "Auxiliary",
"useMLOps": true,
"inputInfo": [{
"description": "File to read contents",
"label": "File-Name",
"defaultComponent": "",
"type": "str",
"group": "data"
}],
"outputInfo": [
{
"description": "Pandas Dataframe",
"label": "dataframe",
"defaultComponent": "",
"type": "dataframe",
"group": "data"
}
],
"group": "Connectors",
"arguments": [
{
"key": "file-path",
"label": "Dataset file to read",
"type": "str",
"description": "File to use for loading DataSet into DataFrame",
"optional": true
}
],
"version": 1
}
Loading