From 633089f5800c3b7472a2d7237daebfc2adf322d8 Mon Sep 17 00:00:00 2001
From: Dhananjoy Das
Date: Fri, 1 Mar 2019 16:13:05 -0800
Subject: [PATCH 1/3] [REF-5768] component to Read file into DataFrame

---
Review note: __init__ now names the class in super() instead of using
self.__class__, which recurses without end as soon as the adapter is
subclassed. Blob ids on the index lines are kept from the original
submission.

 .../file_to_dataframe/__init__.py             |  0
 .../file_to_dataframe/component.json          | 38 +++++++++++++++
 .../fileConnectors/file_to_dataframe/main.py  | 47 +++++++++++++++++++
 3 files changed, 85 insertions(+)
 create mode 100644 components/Python/fileConnectors/file_to_dataframe/__init__.py
 create mode 100644 components/Python/fileConnectors/file_to_dataframe/component.json
 create mode 100644 components/Python/fileConnectors/file_to_dataframe/main.py

diff --git a/components/Python/fileConnectors/file_to_dataframe/__init__.py b/components/Python/fileConnectors/file_to_dataframe/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/components/Python/fileConnectors/file_to_dataframe/component.json b/components/Python/fileConnectors/file_to_dataframe/component.json
new file mode 100644
index 0000000..bf22bdc
--- /dev/null
+++ b/components/Python/fileConnectors/file_to_dataframe/component.json
@@ -0,0 +1,38 @@
+{
+    "engineType": "Python",
+    "language": "Python",
+    "userStandalone": false,
+    "name": "file_to_dataframe",
+    "label": "Source File to DataFrame",
+    "program": "main.py",
+    "componentClass": "MCenterComponentAdapter",
+    "modelBehavior": "Auxiliary",
+    "useMLOps": true,
+    "inputInfo": [{
+        "description": "File to read contents",
+        "label": "File-Name",
+        "defaultComponent": "",
+        "type": "str",
+        "group": "data"
+    }],
+    "outputInfo": [
+        {
+            "description": "Pandas Dataframe",
+            "label": "dataframe",
+            "defaultComponent": "",
+            "type": "dataframe",
+            "group": "data"
+        }
+    ],
+    "group": "Connectors",
+    "arguments": [
+        {
+            "key": "file-path",
+            "label": "Dataset file to read",
+            "type": "str",
+            "description": "File to use for loading DataSet into DataFrame",
+            "optional": true
+        }
+    ],
+    "version": 1
+}
diff --git a/components/Python/fileConnectors/file_to_dataframe/main.py b/components/Python/fileConnectors/file_to_dataframe/main.py
new file mode 100644
index 0000000..591d398
--- /dev/null
+++ b/components/Python/fileConnectors/file_to_dataframe/main.py
@@ -0,0 +1,47 @@
+from __future__ import print_function
+
+import argparse
+import sys
+import time
+import os
+import pandas
+
+from parallelm.components import ConnectableComponent
+from parallelm.mlops.stats.multi_line_graph import MultiLineGraph
+from parallelm.mlops import mlops as mlops
+
+class MCenterComponentAdapter(ConnectableComponent):
+    """
+    Adapter for read_file_to_df
+    """
+
+    def __init__(self, engine):
+        super(MCenterComponentAdapter, self).__init__(engine)
+
+    def _materialize(self, parent_data_objs, user_data):
+        file_path = str(parent_data_objs[0])
+        if file_path is None:
+            file_path = self._params.get('file_path')
+        return [read_file_to_df(file_path)]
+
+
+def read_file_to_df(filepath):
+    """
+    Read file and return DataFrame
+    """
+    mlops.init()
+    if not os.path.exists(filepath):
+        print("stderr- failed to find {}".format(filepath), file=sys.stderr)
+        raise Exception("file path does not exist: {}".format(filepath))
+
+    test_data = pandas.read_csv(filepath)
+    mlops.done()
+    return test_data
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--file-path", default='/tmp/test-data.csv', help="Dataset to read")
+    options = parser.parse_args()
+    return options
+

From d51505beb953e3614776afda9e9ab1c231f1094c Mon Sep 17 00:00:00 2001
From: Dhananjoy Das
Date: Fri, 1 Mar 2019 18:12:37 -0800
Subject: [PATCH 2/3] [REF-5768] minor:fix print

---
Review note: the logger call no longer passes file=sys.stderr. That
keyword belongs to print(); assuming self._logger is a standard
logging.Logger, it would raise TypeError and hide the "file path does
not exist" exception raised on the next line. Blob ids on the index
line are kept from the original submission.

 components/Python/fileConnectors/file_to_dataframe/main.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/components/Python/fileConnectors/file_to_dataframe/main.py b/components/Python/fileConnectors/file_to_dataframe/main.py
index 591d398..30a12f0 100644
--- a/components/Python/fileConnectors/file_to_dataframe/main.py
+++ b/components/Python/fileConnectors/file_to_dataframe/main.py
@@ -22,16 +22,16 @@ def _materialize(self, parent_data_objs, user_data):
         file_path = str(parent_data_objs[0])
         if file_path is None:
             file_path = self._params.get('file_path')
-        return [read_file_to_df(file_path)]
+        return [read_file_to_df(self, file_path)]
 
 
-def read_file_to_df(filepath):
+def read_file_to_df(self, filepath):
     """
     Read file and return DataFrame
     """
     mlops.init()
     if not os.path.exists(filepath):
-        print("stderr- failed to find {}".format(filepath), file=sys.stderr)
+        self._logger.info("stderr- failed to find {}".format(filepath))
         raise Exception("file path does not exist: {}".format(filepath))
 
     test_data = pandas.read_csv(filepath)

From 183c6893226d508d5c83e818ad075681a6ce5c4c Mon Sep 17 00:00:00 2001
From: Dhananjoy Das
Date: Fri, 1 Mar 2019 19:20:54 -0800
Subject: [PATCH 3/3] [REF-5768] minor param-check

---
Review note: the emptiness check uses truthiness instead of
"len(...) is not 0". Identity comparison against an int literal relies
on CPython small-int caching and warns on Python 3.8+; the truthiness
test also falls back to the 'file-path' parameter when
parent_data_objs is None. Blob ids on the index line are kept from the
original submission.

 components/Python/fileConnectors/file_to_dataframe/main.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/components/Python/fileConnectors/file_to_dataframe/main.py b/components/Python/fileConnectors/file_to_dataframe/main.py
index 30a12f0..385ff52 100644
--- a/components/Python/fileConnectors/file_to_dataframe/main.py
+++ b/components/Python/fileConnectors/file_to_dataframe/main.py
@@ -19,9 +19,10 @@ def __init__(self, engine):
         super(MCenterComponentAdapter, self).__init__(engine)
 
     def _materialize(self, parent_data_objs, user_data):
-        file_path = str(parent_data_objs[0])
-        if file_path is None:
-            file_path = self._params.get('file_path')
+        if parent_data_objs:
+            file_path = str(parent_data_objs[0])
+        else:
+            file_path = self._params.get('file-path')
         return [read_file_to_df(self, file_path)]
 
 