Skip to content

Commit 8a684b2

Browse files
committed
[REF-6255] generic: adding a Generic engine section with some misc components
o Adding file_to_dataframe o Adding dataframe_stats
1 parent 7f8f72f commit 8a684b2

File tree

6 files changed

+179
-0
lines changed

6 files changed

+179
-0
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
{
2+
"version": 1,
3+
"engineType": "Generic",
4+
"language": "Python",
5+
"userStandalone": false,
6+
"name": "dataframe_stats",
7+
"label": "DataFrame Statistics",
8+
"description": "Generate statistics on the input dataframe.",
9+
"program": "dataframe_stats.py",
10+
"componentClass": "MCenterStatsComponentAdapter",
11+
"modelBehavior": "Auxiliary",
12+
"useMLOps": true,
13+
"inputInfo": [
14+
{
15+
"description": "In Pandas Dataframe",
16+
"label": "dataframe",
17+
"defaultComponent": "",
18+
"type": "dataframe",
19+
"group": "data"
20+
}
21+
],
22+
"outputInfo": [
23+
{
24+
"description": "Out Pandas Dataframe",
25+
"label": "dataframe",
26+
"defaultComponent": "",
27+
"type": "dataframe",
28+
"group": "data"
29+
}
30+
],
31+
"group": "FeatureEng",
32+
"arguments": [
33+
{
34+
"key": "dataframe_is",
35+
"label": "Dataframe Is",
36+
"description": "What this dataframe represents",
37+
"type": "string",
38+
"uiType": "select",
39+
"options": [
40+
{
41+
"label": "Input Data",
42+
"value": "input_data"
43+
},
44+
{
45+
"label": "Categorical Predictions Probabilities",
46+
"value": "categorical_predictions_probabilities"
47+
},
48+
{
49+
"label": "Other",
50+
"value": "other"
51+
}
52+
],
53+
"defaultValue": "input_data",
54+
"optional": false
55+
}
56+
]
57+
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from __future__ import print_function
2+
3+
from parallelm.components import ConnectableComponent
4+
from parallelm.mlops import mlops as mlops
5+
from parallelm.mlops.stats.bar_graph import BarGraph
6+
7+
8+
class MCenterStatsComponentAdapter(ConnectableComponent):
    """
    Component that reports statistics about the incoming dataframe to MLOps.

    The ``dataframe_is`` parameter selects what is reported:

    * ``"input_data"`` - the dataframe is handed to
      ``mlops.set_data_distribution_stat``.
    * ``"categorical_predictions_probabilities"`` - a bar graph with the
      share of rows won by each column is reported.
    * ``"other"`` - nothing is reported.

    The dataframe itself is passed through to the output unchanged.
    """

    def __init__(self, engine):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this adapter is subclassed.
        super(MCenterStatsComponentAdapter, self).__init__(engine)

    def _materialize(self, parent_data_objs, user_data):
        """
        Report statistics for the first parent object and pass it through.

        :param parent_data_objs: outputs of the upstream components; only the
            first element (the dataframe) is used
        :param user_data: unused by this component
        :return: single-element list holding the same dataframe
        """
        mlops.init()

        df = parent_data_objs[0]
        dataframe_is = self._params.get("dataframe_is", "input_data")

        if dataframe_is == "input_data":
            self._handle_input_data(df)
        elif dataframe_is == "categorical_predictions_probabilities":
            self._handle_categorical_predictions(df)
        elif dataframe_is == "other":
            pass
        else:
            # NOTE(review): assumes self._logger is a standard logger object
            # (the sibling file_to_dataframe component calls
            # self._logger.info) - confirm against ConnectableComponent.
            self._logger.error("Error: argument value is not supported: {}".format(dataframe_is))

        mlops.done()

        return [df]

    def _handle_input_data(self, df):
        """Report the dataframe through ``mlops.set_data_distribution_stat``."""
        mlops.set_data_distribution_stat(df)

    def _handle_categorical_predictions(self, df):
        """
        Report which fraction of the rows each column "wins".

        For every row the column holding the largest value is picked; the
        normalized counts of those picks are reported as a bar graph with one
        bar per dataframe column.
        """
        df_max_col = df.idxmax(axis=1)
        series_value_count = df_max_col.value_counts(normalize=True)

        # value_counts() only lists columns that were the maximum at least
        # once, so a never-winning column must fall back to 0.0 instead of
        # raising KeyError.
        col_values = [series_value_count.get(col, 0.0) for col in df.columns]

        bg = BarGraph().name("Categorical Prediction Distribution").cols(list(df.columns)).data(col_values)
        mlops.set_stat(bg)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
{
2+
"version": 1,
3+
"engineType": "Generic",
4+
"language": "Python",
5+
"userStandalone": false,
6+
"name": "file_to_dataframe",
7+
"label": "Source File to DataFrame",
8+
"program": "file_to_dataframe.py",
9+
"componentClass": "MCenterComponentAdapter",
10+
"modelBehavior": "Auxiliary",
11+
"useMLOps": true,
12+
"inputInfo": [{
13+
"description": "File to read contents",
14+
"label": "filename",
15+
"defaultComponent": "",
16+
"type": "str",
17+
"group": "data"
18+
}],
19+
"outputInfo": [
20+
{
21+
"description": "Pandas Dataframe",
22+
"label": "dataframe",
23+
"defaultComponent": "",
24+
"type": "dataframe",
25+
"group": "data"
26+
}
27+
],
28+
"group": "Connectors",
29+
"arguments": [
30+
{
31+
"key": "filename",
32+
"label": "Dataset file to read",
33+
"type": "str",
34+
"description": "File to use for loading DataSet into DataFrame",
35+
"optional": true
36+
}
37+
]
38+
}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from __future__ import print_function
2+
3+
import sys
4+
import os
5+
import pandas
6+
7+
from parallelm.components import ConnectableComponent
8+
9+
10+
class MCenterComponentAdapter(ConnectableComponent):
    """
    Adapter for read_file_to_df

    Reads a CSV file into a pandas DataFrame. The path comes from the first
    upstream output when one is connected, otherwise from the ``filename``
    component argument.
    """

    def __init__(self, engine):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this adapter is subclassed.
        super(MCenterComponentAdapter, self).__init__(engine)

    def _materialize(self, parent_data_objs, user_data):
        """
        Resolve the input path and load it as a DataFrame.

        :param parent_data_objs: outputs of the upstream components; when not
            empty, the first element is used as the file path
        :param user_data: unused by this component
        :return: single-element list holding the loaded DataFrame
        :raises Exception: when no path is available or the file is missing
        """
        # Truthiness instead of "len(...) is not 0": identity comparison with
        # an int literal is implementation-dependent, and this form also
        # tolerates None.
        if parent_data_objs:
            file_path = str(parent_data_objs[0])
        else:
            file_path = self._params.get('filename')

        # 'filename' is optional, so with no upstream input it may be unset;
        # fail with a clear message rather than passing None to os.path.
        if file_path is None:
            raise Exception("no input file: connect an upstream component or set the 'filename' argument")

        self._logger.info("file: {}".format(file_path))
        df = self.read_file_to_df(file_path)
        return [df]

    def read_file_to_df(self, filepath):
        """
        Read file and return DataFrame

        :param filepath: path of the CSV file to load
        :return: DataFrame produced by ``pandas.read_csv``
        :raises Exception: when ``filepath`` does not exist
        """

        if not os.path.exists(filepath):
            # Logging calls take no "file" keyword (that belongs to print());
            # passing it raised TypeError and hid the exception below.
            self._logger.error("failed to find {}".format(filepath))
            raise Exception("file path does not exist: {}".format(filepath))

        df = pandas.read_csv(filepath)
        return df

0 commit comments

Comments
 (0)