-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmunging.py
More file actions
executable file
·150 lines (110 loc) · 5.26 KB
/
munging.py
File metadata and controls
executable file
·150 lines (110 loc) · 5.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python
""" A set of data munging functions to getting things done quickly and cleanly """
import numpy as np
def asList(x):
    """ Convert a variable to a list.

    Lists are returned unchanged (the same object, not a copy), tuples are
    converted to a new list, and any single value is wrapped in a one-element
    list. Strings count as single values, so a lone column name such as
    ``'flag'`` becomes ``['flag']``.

    :param x: an object that you would like to convert to a list.

    :returns: ``x`` itself if it is a list, otherwise a new list.

    :raises ValueError: if ``x`` is iterable but is not a list, tuple or
        string (for example a set, dict or generator).

    """
    if isinstance(x, list):
        return x

    if isinstance(x, tuple):
        return list(x)

    # Strings have no ``__iter__`` on Python 2 but do on Python 3, so test for
    # them explicitly; otherwise a single column name would be rejected as an
    # unsupported iterable on Python 3.
    if isinstance(x, (str, bytes)) or not hasattr(x, '__iter__'):
        return [x]

    raise ValueError('Could not convert {} to a list'.format(str(x)))
def _conditionGiven(value):
    """ Return True when a keep|drop condition was actually supplied.

    ``None`` and empty containers/strings count as "not supplied". Scalars
    (including a flag value of ``0``) count as supplied.

    """
    if value is None:
        return False
    try:
        return len(value) > 0
    except TypeError:
        # Scalars have no length; any non-None scalar is a real condition.
        return True


def mergeAndDrop(df, flags, left_on, right_on, flagName, keep_in_list=None,
                 keep_logic=None, drop_in_list=None, drop_logic=None,
                 drop_when='all'):
    """ Merge on a set of flags and drop depending on a criteria.

    The flags are left-merged onto ``df``, every flag column is turned into a
    boolean "keep this row" column using exactly one of the four keep|drop
    conditions, and those columns are then combined per row according to
    ``drop_when``.

    The resulting row mask is applied to ``df`` by position, so ``flags``
    should hold at most one row per merge key; duplicate keys make the merged
    frame longer than ``df`` and the mask will no longer line up.

    Arguments:
        :type df: pandas.DataFrame
        :param df: A data frame that you want to merge flags on to.

        :type flags: pandas.DataFrame
        :param flags: A data frame of flags that you want to merge.

        :param list left_on: Column(s) name that you want to merge in your
            main data frame.

        :param list right_on: Column(s) name that you want to merge in your
            flags data frame.

        :param list flagName: Column name(s) with flags. A single name or a
            list/tuple of names.

        :param list keep_in_list: Flag value(s) to keep.

        :param str keep_logic: A string with the logical operator for which
            flags to keep. For example, ('== 0', '<= 1', '>= 10', '!= 2').

        :param list drop_in_list: Flag value(s) to drop.

        :param str drop_logic: A string with the logical operator for which
            flags to drop. For example, ('== 0', '<= 1', '>= 10', '!= 2').

        :type drop_when: str|int
        :param drop_when: How the per-flag results are combined for each row.
            If 'all', a row is kept only when every flag column says keep.
            If 'any', a row is kept when at least one flag column says keep.
            If a number between 1 and 100, it is used as a percentage cutoff:
            for example, with drop_when=50 a row is kept when at least 50% of
            the flag columns say keep.

    Returns:
        :rtype: pd.DataFrame
        :return: The rows of ``df`` that pass the keep|drop condition.

    Raises:
        :raises ValueError: if none or more than one of keep_in_list,
            keep_logic, drop_in_list, drop_logic is given, or if drop_when is
            not 'all', 'any' or a number between 1 and 100.

    """
    # Make sure at least 1 and only 1 keep|drop condition is set. Plain Python
    # is used instead of a numpy array because mixing lists and None in
    # np.array() is rejected by recent numpy versions.
    conditions = (keep_in_list, keep_logic, drop_in_list, drop_logic)
    nGiven = sum(1 for cond in conditions if _conditionGiven(cond))

    if nGiven > 1:
        raise ValueError("keep_in_list, keep_logic, drop_in_list, drop_logic "
                         "are mutually exclusive. Please only provide one.")
    if nGiven == 0:
        raise ValueError("Please provide at least one of keep_in_list, "
                         "keep_logic, drop_in_list, drop_logic.")

    # Merge datasets
    ## Re-index first to make merging easier.
    dfI = df.reset_index()
    flagsI = flags.reset_index()
    merged = dfI.merge(flagsI, how='left', left_on=left_on, right_on=right_on)

    # Make sure flagName is a list so that merged[nameList] is a DataFrame
    nameList = asList(flagName)

    # Create Boolean Mask using keep|drop conditions; True means "keep".
    if _conditionGiven(keep_in_list):
        print("Keeping when {} is in the given list.".format(flagName))
        cleanMask = merged[nameList].isin(asList(keep_in_list))

    elif _conditionGiven(drop_in_list):
        print("Dropping when {} is in given list.".format(flagName))
        cleanMask = ~merged[nameList].isin(asList(drop_in_list))

    elif _conditionGiven(keep_logic):
        print("Keeping when {} {}.".format(flagName, keep_logic))
        # NOTE(security): eval() executes keep_logic as Python code. Only pass
        # trusted, hard-coded comparison strings; never user-supplied input.
        cleanMask = eval('merged[nameList] ' + keep_logic)

    else:
        print("Dropping when {} {}.".format(flagName, drop_logic))
        # NOTE(security): eval() executes drop_logic as Python code. Only pass
        # trusted, hard-coded comparison strings; never user-supplied input.
        cleanMask = ~eval('merged[nameList] ' + drop_logic)

    # Collapse the per-flag mask to one boolean per row
    if drop_when == 'all':
        rowMask = cleanMask.values.all(axis=1)
    elif drop_when == 'any':
        rowMask = cleanMask.values.any(axis=1)
    else:
        try:
            isPercent = 1 <= drop_when <= 100
        except TypeError:
            # e.g. an unrecognised string on Python 3
            isPercent = False

        if not isPercent:
            raise ValueError('"drop_when" must have a value of "all", "any", '
                             'or a number between 1 and 100')

        # Multiply by 100.0 before dividing: this forces float division on
        # Python 2 and avoids rounding such as 29 / 100 * 100 < 29.
        rowProp = cleanMask.sum(axis=1) * 100.0 / cleanMask.count(axis=1)
        rowMask = (rowProp >= drop_when).values

    # Return df with specific rows
    return df[rowMask]
def orderDf(df, varList):
    """ Move a chosen set of columns to the front of a data frame.

    Arguments:
        :type df: pandas.DataFrame
        :param df: The data frame whose columns should be re-ordered.

        :param list varList: Column names to place first, in the order given.

    Returns:
        :rtype: pandas.DataFrame
        :returns: A copy of ``df`` with the ``varList`` columns first,
            followed by all remaining columns in their original order.

    """
    # Collect every column that is not being moved, preserving its position
    # relative to the other unmoved columns.
    remaining = []
    for colName in df.columns:
        if colName not in varList:
            remaining.append(colName)

    # Requested columns first, then everything else.
    newOrder = varList + remaining
    return df[newOrder].copy()
# Script entry point. Running this file directly does nothing; the module only
# provides functions for other code to import.
if __name__ == '__main__':
    pass