From 83c2e6c4bcfd4dde7dbb49e1d12e03ccac9adc2f Mon Sep 17 00:00:00 2001 From: Lior Amar Date: Tue, 4 Jun 2019 13:50:52 -0700 Subject: [PATCH 1/3] [REF-6255] source/sink: adding string-source and string-sink o Good as examples. o Useful as general source/sinks for demos --- .../Scikit-learn/sink/string-sink/__init__.py | 0 .../sink/string-sink/component.json | 30 ++++++++++++++++++ .../sink/string-sink/string_sink.py | 15 +++++++++ .../source/string-source/__init__.py | 0 .../source/string-source/component.json | 31 +++++++++++++++++++ .../source/string-source/string_source.py | 11 +++++++ 6 files changed, 87 insertions(+) create mode 100644 components/Python/Scikit-learn/sink/string-sink/__init__.py create mode 100644 components/Python/Scikit-learn/sink/string-sink/component.json create mode 100644 components/Python/Scikit-learn/sink/string-sink/string_sink.py create mode 100644 components/Python/Scikit-learn/source/string-source/__init__.py create mode 100644 components/Python/Scikit-learn/source/string-source/component.json create mode 100644 components/Python/Scikit-learn/source/string-source/string_source.py diff --git a/components/Python/Scikit-learn/sink/string-sink/__init__.py b/components/Python/Scikit-learn/sink/string-sink/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/components/Python/Scikit-learn/sink/string-sink/component.json b/components/Python/Scikit-learn/sink/string-sink/component.json new file mode 100644 index 0000000..5cf0d7a --- /dev/null +++ b/components/Python/Scikit-learn/sink/string-sink/component.json @@ -0,0 +1,30 @@ +{ + "version": 1, + "engineType": "Generic", + "userStandalone": false, + "language": "Python", + "name": "string-sink", + "label": "Simple String Sink", + "program": "string_sink.py", + "componentClass": "StringSink", + "group": "Sinks", + "useMLOps": true, + "inputInfo": [ + { + "description": "String", + "label": "string", + "defaultComponent": "", + "type": "str", + "group": "data" + }], + 
"outputInfo": [], + "arguments": [ + { + "key": "expected-value", + "label": "Expected value to compare to", + "type": "str", + "description": "This is the value expected as input. If no value is provided no check is done", + "optional": true + } + ] +} diff --git a/components/Python/Scikit-learn/sink/string-sink/string_sink.py b/components/Python/Scikit-learn/sink/string-sink/string_sink.py new file mode 100644 index 0000000..cf3df83 --- /dev/null +++ b/components/Python/Scikit-learn/sink/string-sink/string_sink.py @@ -0,0 +1,15 @@ +from parallelm.components import ConnectableComponent + + +class StringSink(ConnectableComponent): + + def __init__(self, engine): + super(self.__class__, self).__init__(engine) + + def _materialize(self, parent_data_objs, user_data): + expected_str_value = self._params.get('expected-value', "") + actual_value = parent_data_objs[0] + + if len(expected_str_value) > 0 and expected_str_value != actual_value: + raise Exception("Actual [{}] != Expected [{}]".format(actual_value, expected_str_value)) + return [] diff --git a/components/Python/Scikit-learn/source/string-source/__init__.py b/components/Python/Scikit-learn/source/string-source/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/components/Python/Scikit-learn/source/string-source/component.json b/components/Python/Scikit-learn/source/string-source/component.json new file mode 100644 index 0000000..8e1e6f1 --- /dev/null +++ b/components/Python/Scikit-learn/source/string-source/component.json @@ -0,0 +1,31 @@ +{ + "version": 1, + "engineType": "Generic", + "userStandalone": false, + "language": "Python", + "name": "string-source", + "label": "Simple String Source", + "program": "string_source.py", + "componentClass": "StringSource", + "group": "Connectors", + "useMLOps": true, + "inputInfo": [], + "outputInfo": [ + { + "description": "String", + "label": "string", + "defaultComponent": "", + "type": "str", + "group": "data" + } + ], + "arguments": [ + { + "key": 
"value", + "type": "str", + "label": "String value", + "description": "String value to provide as output", + "optional": false + } + ] +} diff --git a/components/Python/Scikit-learn/source/string-source/string_source.py b/components/Python/Scikit-learn/source/string-source/string_source.py new file mode 100644 index 0000000..f43621e --- /dev/null +++ b/components/Python/Scikit-learn/source/string-source/string_source.py @@ -0,0 +1,11 @@ +from parallelm.components import ConnectableComponent + + +class StringSource(ConnectableComponent): + + def __init__(self, engine): + super(self.__class__, self).__init__(engine) + + def _materialize(self, parent_data_objs, user_data): + str_value = self._params.get('value', "default-string-value") + return [str_value] From 2e0e7a5ebfda187fa58661a4852d8df6e887fda4 Mon Sep 17 00:00:00 2001 From: Lior Amar Date: Thu, 6 Jun 2019 12:33:05 -0700 Subject: [PATCH 2/3] [REF-6255] generic: adding a Generic engine section with some misc components o Adding file_to_dataframe o Adding dataframe_stats --- .../Generic/dataframe_stats/__init__.py | 1 + .../Generic/dataframe_stats/component.json | 57 +++++++++++++++++++ .../dataframe_stats/dataframe_stats.py | 44 ++++++++++++++ .../Generic/file_to_dataframe/__init__.py | 1 + .../Generic/file_to_dataframe/component.json | 38 +++++++++++++ .../file_to_dataframe/file_to_dataframe.py | 38 +++++++++++++ 6 files changed, 179 insertions(+) create mode 100644 components/Python/Generic/dataframe_stats/__init__.py create mode 100644 components/Python/Generic/dataframe_stats/component.json create mode 100644 components/Python/Generic/dataframe_stats/dataframe_stats.py create mode 100644 components/Python/Generic/file_to_dataframe/__init__.py create mode 100644 components/Python/Generic/file_to_dataframe/component.json create mode 100644 components/Python/Generic/file_to_dataframe/file_to_dataframe.py diff --git a/components/Python/Generic/dataframe_stats/__init__.py 
b/components/Python/Generic/dataframe_stats/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/components/Python/Generic/dataframe_stats/__init__.py @@ -0,0 +1 @@ + diff --git a/components/Python/Generic/dataframe_stats/component.json b/components/Python/Generic/dataframe_stats/component.json new file mode 100644 index 0000000..e5ac1b1 --- /dev/null +++ b/components/Python/Generic/dataframe_stats/component.json @@ -0,0 +1,57 @@ +{ + "version": 1, + "engineType": "Generic", + "language": "Python", + "userStandalone": false, + "name": "dataframe_stats", + "label": "DataFrame Statistics", + "description": "Generate statistics on the input dataframe.", + "program": "dataframe_stats.py", + "componentClass": "MCenterStatsComponentAdapter", + "modelBehavior": "Auxiliary", + "useMLOps": true, + "inputInfo": [ + { + "description": "In Pandas Dataframe", + "label": "dataframe", + "defaultComponent": "", + "type": "dataframe", + "group": "data" + } + ], + "outputInfo": [ + { + "description": "Out Pandas Dataframe", + "label": "dataframe", + "defaultComponent": "", + "type": "dataframe", + "group": "data" + } + ], + "group": "FeatureEng", + "arguments": [ + { + "key": "dataframe_is", + "label": "Dataframe Is", + "description": "What this dataframe represents", + "type": "string", + "uiType": "select", + "options": [ + { + "label": "Input Data", + "value": "input_data" + }, + { + "label": "Categorical Predictions Probabilities", + "value": "categorical_predictions_probabilities" + }, + { + "label": "Other", + "value": "other" + } + ], + "defaultValue": "input_data", + "optional": false + } + ] +} diff --git a/components/Python/Generic/dataframe_stats/dataframe_stats.py b/components/Python/Generic/dataframe_stats/dataframe_stats.py new file mode 100644 index 0000000..a4ea148 --- /dev/null +++ b/components/Python/Generic/dataframe_stats/dataframe_stats.py @@ -0,0 +1,44 @@ +from __future__ import print_function + +from parallelm.components import
class MCenterStatsComponentAdapter(ConnectableComponent):
    """Pass-through component that reports statistics about a DataFrame.

    The ``dataframe_is`` parameter selects what is reported:

    * ``"input_data"`` (default) - the frame is handed to
      ``mlops.set_data_distribution_stat``.
    * ``"categorical_predictions_probabilities"`` - a bar graph of the
      fraction of rows in which each column holds the row maximum.
    * ``"other"`` - nothing is reported.

    Any other value is logged as an error.  The input frame is always
    returned unchanged.
    """

    def __init__(self, engine):
        # Name the class explicitly: ``super(self.__class__, self)`` recurses
        # forever as soon as this component is subclassed.
        super(MCenterStatsComponentAdapter, self).__init__(engine)

    def _materialize(self, parent_data_objs, user_data):
        """Report statistics for the first upstream DataFrame.

        :param parent_data_objs: outputs of the upstream components; only the
            first element (a DataFrame) is read.
        :param user_data: not used by this component.
        :return: a one-element list holding the same DataFrame.
        """
        mlops.init()
        try:
            df = parent_data_objs[0]
            dataframe_is = self._params.get("dataframe_is", "input_data")

            if dataframe_is == "input_data":
                self._handle_input_data(df)
            elif dataframe_is == "categorical_predictions_probabilities":
                self._handle_categorical_predictions(df)
            elif dataframe_is != "other":
                # The logger object itself is not callable; go through a
                # level method.
                # NOTE(review): assumes ``_logger`` is a standard
                # ``logging.Logger`` - confirm.
                self._logger.error("Error: argument value is not supported: {}".format(dataframe_is))
        finally:
            # Pair every ``mlops.init()`` with ``mlops.done()``, even when a
            # handler raises.
            mlops.done()

        return [df]

    def _handle_input_data(self, df):
        """Report ``df`` through ``mlops.set_data_distribution_stat``."""
        mlops.set_data_distribution_stat(df)

    def _handle_categorical_predictions(self, df):
        """Report how often each column is the per-row maximum.

        :param df: DataFrame with one column per category.
        """
        winning_col = df.idxmax(axis=1)
        share_by_col = winning_col.value_counts(normalize=True)

        # A column that never holds the row maximum is absent from
        # ``share_by_col``; report it as 0.0 instead of raising KeyError.
        col_values = [share_by_col.get(col, 0.0) for col in df.columns]

        bg = BarGraph().name("Categorical Prediction Distribution").cols(list(df.columns)).data(col_values)
        mlops.set_stat(bg)
"modelBehavior": "Auxiliary", + "useMLOps": true, + "inputInfo": [{ + "description": "File to read contents", + "label": "filename", + "defaultComponent": "", + "type": "str", + "group": "data" + }], + "outputInfo": [ + { + "description": "Pandas Dataframe", + "label": "dataframe", + "defaultComponent": "", + "type": "dataframe", + "group": "data" + } + ], + "group": "Connectors", + "arguments": [ + { + "key": "filename", + "label": "Dataset file to read", + "type": "str", + "description": "File to use for loading DataSet into DataFrame", + "optional": true + } + ] +} diff --git a/components/Python/Generic/file_to_dataframe/file_to_dataframe.py b/components/Python/Generic/file_to_dataframe/file_to_dataframe.py new file mode 100644 index 0000000..24344de --- /dev/null +++ b/components/Python/Generic/file_to_dataframe/file_to_dataframe.py @@ -0,0 +1,38 @@ +from __future__ import print_function + +import sys +import os +import pandas + +from parallelm.components import ConnectableComponent + + +class MCenterComponentAdapter(ConnectableComponent): + """ + Adapter for read_file_to_df + """ + + def __init__(self, engine): + super(self.__class__, self).__init__(engine) + + def _materialize(self, parent_data_objs, user_data): + if len(parent_data_objs) is not 0: + file_path = str(parent_data_objs[0]) + else: + file_path = self._params.get('filename') + + self._logger.info("file: {}".format(file_path)) + df = self.read_file_to_df(file_path) + return [df] + + def read_file_to_df(self, filepath): + """ + Read file and return DataFrame + """ + + if not os.path.exists(filepath): + self._logger.info("stderr- failed to find {}".format(filepath), file=sys.stderr) + raise Exception("file path does not exist: {}".format(filepath)) + + df = pandas.read_csv(filepath) + return df From 39580357a1f7a96526494505e7e62417173ef304 Mon Sep 17 00:00:00 2001 From: Lior Amar Date: Thu, 6 Jun 2019 12:43:08 -0700 Subject: [PATCH 3/3] [REF-6255] restructure: moving non Scikit learn components to 
Generic engine --- .../{Scikit-learn/sink => Generic}/s3_file_sink/__init__.py | 0 .../{Scikit-learn/sink => Generic}/s3_file_sink/component.json | 0 .../Python/{Scikit-learn/sink => Generic}/s3_file_sink/main.py | 0 .../sink/string-sink => Generic/s3_file_source}/__init__.py | 0 .../source => Generic}/s3_file_source/component.json | 0 .../{Scikit-learn/source => Generic}/s3_file_source/main.py | 0 .../source/s3_file_source => Generic/string-sink}/__init__.py | 0 .../{Scikit-learn/sink => Generic}/string-sink/component.json | 0 .../{Scikit-learn/sink => Generic}/string-sink/string_sink.py | 0 .../{Scikit-learn/source => Generic}/string-source/__init__.py | 0 .../{Scikit-learn/source => Generic}/string-source/component.json | 0 .../source => Generic}/string-source/string_source.py | 0 12 files changed, 0 insertions(+), 0 deletions(-) rename components/Python/{Scikit-learn/sink => Generic}/s3_file_sink/__init__.py (100%) rename components/Python/{Scikit-learn/sink => Generic}/s3_file_sink/component.json (100%) rename components/Python/{Scikit-learn/sink => Generic}/s3_file_sink/main.py (100%) rename components/Python/{Scikit-learn/sink/string-sink => Generic/s3_file_source}/__init__.py (100%) rename components/Python/{Scikit-learn/source => Generic}/s3_file_source/component.json (100%) rename components/Python/{Scikit-learn/source => Generic}/s3_file_source/main.py (100%) rename components/Python/{Scikit-learn/source/s3_file_source => Generic/string-sink}/__init__.py (100%) rename components/Python/{Scikit-learn/sink => Generic}/string-sink/component.json (100%) rename components/Python/{Scikit-learn/sink => Generic}/string-sink/string_sink.py (100%) rename components/Python/{Scikit-learn/source => Generic}/string-source/__init__.py (100%) rename components/Python/{Scikit-learn/source => Generic}/string-source/component.json (100%) rename components/Python/{Scikit-learn/source => Generic}/string-source/string_source.py (100%) diff --git 
a/components/Python/Scikit-learn/sink/s3_file_sink/__init__.py b/components/Python/Generic/s3_file_sink/__init__.py similarity index 100% rename from components/Python/Scikit-learn/sink/s3_file_sink/__init__.py rename to components/Python/Generic/s3_file_sink/__init__.py diff --git a/components/Python/Scikit-learn/sink/s3_file_sink/component.json b/components/Python/Generic/s3_file_sink/component.json similarity index 100% rename from components/Python/Scikit-learn/sink/s3_file_sink/component.json rename to components/Python/Generic/s3_file_sink/component.json diff --git a/components/Python/Scikit-learn/sink/s3_file_sink/main.py b/components/Python/Generic/s3_file_sink/main.py similarity index 100% rename from components/Python/Scikit-learn/sink/s3_file_sink/main.py rename to components/Python/Generic/s3_file_sink/main.py diff --git a/components/Python/Scikit-learn/sink/string-sink/__init__.py b/components/Python/Generic/s3_file_source/__init__.py similarity index 100% rename from components/Python/Scikit-learn/sink/string-sink/__init__.py rename to components/Python/Generic/s3_file_source/__init__.py diff --git a/components/Python/Scikit-learn/source/s3_file_source/component.json b/components/Python/Generic/s3_file_source/component.json similarity index 100% rename from components/Python/Scikit-learn/source/s3_file_source/component.json rename to components/Python/Generic/s3_file_source/component.json diff --git a/components/Python/Scikit-learn/source/s3_file_source/main.py b/components/Python/Generic/s3_file_source/main.py similarity index 100% rename from components/Python/Scikit-learn/source/s3_file_source/main.py rename to components/Python/Generic/s3_file_source/main.py diff --git a/components/Python/Scikit-learn/source/s3_file_source/__init__.py b/components/Python/Generic/string-sink/__init__.py similarity index 100% rename from components/Python/Scikit-learn/source/s3_file_source/__init__.py rename to components/Python/Generic/string-sink/__init__.py diff 
--git a/components/Python/Scikit-learn/sink/string-sink/component.json b/components/Python/Generic/string-sink/component.json similarity index 100% rename from components/Python/Scikit-learn/sink/string-sink/component.json rename to components/Python/Generic/string-sink/component.json diff --git a/components/Python/Scikit-learn/sink/string-sink/string_sink.py b/components/Python/Generic/string-sink/string_sink.py similarity index 100% rename from components/Python/Scikit-learn/sink/string-sink/string_sink.py rename to components/Python/Generic/string-sink/string_sink.py diff --git a/components/Python/Scikit-learn/source/string-source/__init__.py b/components/Python/Generic/string-source/__init__.py similarity index 100% rename from components/Python/Scikit-learn/source/string-source/__init__.py rename to components/Python/Generic/string-source/__init__.py diff --git a/components/Python/Scikit-learn/source/string-source/component.json b/components/Python/Generic/string-source/component.json similarity index 100% rename from components/Python/Scikit-learn/source/string-source/component.json rename to components/Python/Generic/string-source/component.json diff --git a/components/Python/Scikit-learn/source/string-source/string_source.py b/components/Python/Generic/string-source/string_source.py similarity index 100% rename from components/Python/Scikit-learn/source/string-source/string_source.py rename to components/Python/Generic/string-source/string_source.py