-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path5G_Extractor.py
More file actions
276 lines (264 loc) · 12.6 KB
/
5G_Extractor.py
File metadata and controls
276 lines (264 loc) · 12.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
import pandas as pd
import numpy as np
import os
class Extractor:
    """Turn a packet-level CSV into per-subflow traffic features.

    The methods form a pipeline in which each step consumes attributes set
    by the previous one, so they must be called in this order:

        dropNaN -> convertColumns -> partitionFlows -> linkKeys ->
        findIndices -> partitionSubflows -> extractSubflowFeatures ->
        (shuffleSubflows) -> featuresToCSV

    A *flow* is the set of packets sharing the same values in ``id_cols``.
    A *subflow* is a time slice of a flow, cut either at fixed-length
    intervals (``method != "timeout"``) or wherever the gap between two
    consecutive packets exceeds a timeout (``method == "timeout"``).
    """

    def __init__(self, path, file, method):
        """Load ``path + file`` with pandas and set pipeline defaults.

        Args:
            path: Directory prefix; it is concatenated directly with
                ``file``, so it must end with a path separator.
            file: CSV file name.
            method: ``"timeout"`` selects timeout-based subflow splitting;
                any other value selects fixed-interval splitting.
        """
        print(f"Loading: {file}")
        self.df = pd.read_csv(path+file)
        self.path = path
        self.file = file
        self.method = method
        self.threshold = 2  # Min packets for flow analysis
        # Ignoring source should improve DDoS detection
        self.id_cols = ['ip.dst', 'dstport', 'ip.proto']
        self.feature_cols = [
            'Pkts_Per_Sec', 'MBits_Per_Sec',
            'Pkt_Size_Avg', 'Pkt_Size_Std', 'Pkt_Size_Q1', 'Pkt_Size_Q2',
            'Pkt_Size_Q3', 'Pkt_Size_Min', 'Pkt_Size_Max',
            # 'TCP_Flags_Avg', 'TCP_Flags_Std', 'TCP_Flags_Q1', 'TCP_Flags_Q2', 'TCP_Flags_Q3', 'TCP_Flags_Min', 'TCP_Flags_Max',
            # 'TTL_Avg', 'TTL_Std', 'TTL_Q1', 'TTL_Q2', 'TTL_Q3', 'TTL_Min', 'TTL_Max',
            'Anomaly',
        ]

    def getSubflowFeatures(self):
        """Return the feature DataFrame built by ``extractSubflowFeatures``."""
        return self.subflow_features

    def getFlowInfo(self):
        """Return a DataFrame with columns 'Num Subflows' and 'Flow ID'.

        One row per retained flow (see ``linkKeys``). The count is the
        number of subflows the flow is partitioned into, before the
        per-subflow packet threshold is applied.
        """
        # In timeout mode each entry of subflow_indices holds the split
        # points, and N split points produce N + 1 subflows. In interval
        # mode each entry already holds one (start, end) tuple per subflow.
        extra = 1 if self.method == "timeout" else 0
        flow_num = pd.Series([len(ix) + extra for ix in self.subflow_indices])
        flow_keys = pd.Series(self.keys)
        flow_df = pd.DataFrame([flow_num, flow_keys]).T
        flow_df.columns = ['Num Subflows', 'Flow ID']
        return flow_df

    def dropNaN(self):
        """Clean ``self.df`` in place: drop non-IP rows and stray header rows."""
        print("Cleaning data...")
        df = self.df
        # NAN values
        df.dropna(subset=['ip.proto'], inplace=True)  # Drop non-IP packets
        df.fillna(0, inplace=True)  # Remaining values can be 0
        # Invalid rows from concatenation of CSV files: a repeated header
        # line shows up as a row whose 'ip.len' cell is the text 'ip.len'.
        header_rows = df[df['ip.len'] == 'ip.len'].index
        df.drop(index=header_rows, inplace=True)

    def convertColumns(self):
        """Cast raw columns of ``self.df`` to the types later steps expect.

        Merges the TCP/UDP port columns into 'srcport' / 'dstport' and
        converts the ID columns to strings.
        """
        print("Converting column types...")
        df = self.df
        id_cols = self.id_cols
        # Numeric
        df[['ip.len', 'ip.ttl']] = df[['ip.len', 'ip.ttl']].astype(int)
        # Flags are parsed as base-16 text
        df['tcp.flags'] = df['tcp.flags'].astype(str).apply(int, base=16)
        df['frame.time_epoch'] = pd.to_datetime(df['frame.time_epoch'], unit='s')
        # TCP or UDP ports: take the larger of the two columns per row
        # (missing values were filled with 0 by dropNaN).
        df['srcport'] = df[['tcp.srcport', 'udp.srcport']].astype(int).max(axis=1)
        df['dstport'] = df[['tcp.dstport', 'udp.dstport']].astype(int).max(axis=1)
        df.drop(['tcp.srcport', 'udp.srcport', 'tcp.dstport', 'udp.dstport'],
                axis=1, inplace=True)
        # ID columns
        # First convert ip proto to int (this prevents duplicates)
        df['ip.proto'] = df['ip.proto'].astype(int)
        df[id_cols] = df[id_cols].astype(str)

    def partitionFlows(self):
        """Split ``self.df`` into one raw-feature frame per unique flow ID.

        Sets:
            self.partitions: list of DataFrames (raw feature columns only),
                one per flow, in order of each flow's first appearance.
            self.fid_frame: DataFrame holding the first row's ID columns for
                each flow, aligned positionally with ``self.partitions``.

        Rows whose ID columns contain NaN belong to no flow (groupby skips
        them). Index labels of ``self.df`` are assumed unique, as the later
        label-based slicing steps also require.
        """
        print("Partitioning by flow. This may take awhile...")
        df = self.df
        id_cols = self.id_cols
        raw_cols = ['frame.time_epoch', 'ip.len', 'ip.ttl', 'tcp.flags']  # Raw features
        partitions = []
        first_rows = []
        # A single grouped pass replaces re-scanning the whole frame once
        # per unique ID; sort=False keeps first-appearance order.
        for _, group in df.groupby(id_cols, sort=False):
            partitions.append(group[raw_cols])
            first_rows.append(group.index[0])
        self.partitions = partitions
        self.fid_frame = df.loc[first_rows, id_cols]

    def linkKeys(self):
        """Map flow IDs to their partitions, dropping flows that are too small.

        Sets:
            self.fid_dict: ``str(key) -> partition`` for every flow with at
                least ``self.threshold`` packets.
            self.keys: the retained keys (lists of ID values), in order.
        """
        print("Linking keys to flows...")
        fid_keys = self.fid_frame.values.tolist()  # Convert fid_frame to keys
        fid_dict = {}
        kept_keys = []
        for key, partition in zip(fid_keys, self.partitions):
            # Partitions below the packet minimum are ignored entirely
            if partition.shape[0] >= self.threshold:
                fid_dict[str(key)] = partition
                kept_keys.append(key)
        self.fid_dict = fid_dict
        self.keys = kept_keys

    def findIndices(self, interval=10):
        """Compute subflow boundaries for every retained flow.

        In timeout mode this delegates to ``findIndicesByTimeout``.
        Otherwise each flow is cut into consecutive windows of at most
        ``interval`` seconds, measured from the first packet of the window.

        Args:
            interval: Max subflow length in seconds (interval mode only).

        Sets:
            self.subflow_indices: in interval mode, a list (one entry per
                flow) of lists of ``(start_label, end_label)`` tuples.
        """
        print("Finding indices for subflows...")
        if self.method == "timeout":
            self.findIndicesByTimeout()
            return
        # 2D list of tuples (start and end indices) for each flow's subflows
        subflow_indices = []
        for key in self.keys:
            # Packet arrival times in this flow
            flow_pkt_times = self.fid_dict[str(key)]['frame.time_epoch']
            start_index = flow_pkt_times.index[0]
            index_tuples = []
            while True:
                start_time = flow_pkt_times.loc[start_index]
                # Seconds elapsed since the window's first packet
                elapsed = flow_pkt_times.loc[start_index:] - start_time
                elapsed = elapsed/np.timedelta64(1, 's')
                beyond = elapsed > interval
                if not beyond.any():
                    # The remaining flow fits inside one interval
                    index_tuples.append((start_index, elapsed.index[-1]))
                    break
                # Close the window at the last packet <= interval and
                # restart at the first packet > interval.
                end_index = elapsed[elapsed <= interval].index[-1]
                index_tuples.append((start_index, end_index))
                start_index = elapsed[beyond].index[0]
            subflow_indices.append(index_tuples)
        self.subflow_indices = subflow_indices

    def findIndicesByTimeout(self, timeout_interval=2):
        """Find, per flow, the packets that start a new subflow.

        A packet starts a new subflow when it arrives more than
        ``timeout_interval`` seconds after the previous packet of the flow.

        Args:
            timeout_interval: Max seconds since last packet arrival.

        Sets:
            self.subflow_indices: list (one entry per flow) of pandas Index
                objects holding the labels of those split packets.
        """
        subflow_indices = []
        for key in self.keys:
            flow_pkt_times = self.fid_dict[str(key)]['frame.time_epoch']
            # Inter-arrival time of each packet relative to the one before
            gaps = flow_pkt_times.diff()/np.timedelta64(1, 's')
            subflow_indices.append(gaps[gaps > timeout_interval].index)
        self.subflow_indices = subflow_indices

    def partitionSubflows(self):
        """Slice every flow into subflows using ``self.subflow_indices``.

        Sets:
            self.subflows: flat list of DataFrames, one per subflow.
        """
        print("Partitioning subflows...")
        if self.method == "timeout":
            self.partitionSubflowsByTimeout()
            return
        subflows = []
        for key, index_tuples in zip(self.keys, self.subflow_indices):
            pkt_list = self.fid_dict[str(key)]
            for first, last in index_tuples:
                # Label-based slice: both endpoints are included
                subflows.append(pkt_list.loc[first:last])
        self.subflows = subflows

    def partitionSubflowsByTimeout(self):
        """Slice every flow at the split packets found by the timeout method.

        Sets:
            self.subflows: flat list of DataFrames, one per subflow.
        """
        subflows = []
        for key, split_labels in zip(self.keys, self.subflow_indices):
            pkt_list = self.fid_dict[str(key)]
            start = pkt_list.index[0]
            for end in split_labels:
                # `end` is the first packet of the NEXT subflow; .loc slicing
                # includes it, so the last row is dropped.
                subflows.append(pkt_list.loc[start:end][:-1])
                start = end
            # Final subflow for this flow
            subflows.append(pkt_list.loc[start:])
        self.subflows = subflows

    def extractSubflowFeatures(self):
        """Compute one feature row per subflow with enough packets.

        Subflows with fewer than ``self.threshold`` packets are skipped.
        Durations shorter than one second are treated as one second.

        Sets:
            self.subflow_features: DataFrame with columns
                ``self.feature_cols`` (empty, but with those columns, when
                no subflow qualifies).
        """
        print("Extracting subflow features...")
        subflow_features = []
        for subflow in self.subflows:
            num_pkts = subflow.shape[0]
            # Discard subflow if too few packets
            if num_pkts < self.threshold:
                continue
            # Duration in seconds between first and last packet
            times = subflow['frame.time_epoch']
            subflow_dur = (times.iloc[-1] - times.iloc[0])/np.timedelta64(1, 's')
            if subflow_dur < 1:
                subflow_dur = 1
            pkt_sizes = subflow['ip.len']
            total_mbytes = pkt_sizes.sum()/1e6  # Convert to MB
            sub_features = [
                num_pkts/subflow_dur,            # Packets per second
                (total_mbytes * 8)/subflow_dur,  # MB to MBit/s
                # Packet size statistics
                pkt_sizes.mean(),
                pkt_sizes.std(),
                pkt_sizes.quantile(.25),
                pkt_sizes.median(),
                pkt_sizes.quantile(.75),
                pkt_sizes.min(),
                pkt_sizes.max(),
            ]
            # Disabled features (matching the commented-out feature_cols):
            # # TCP statistics
            # tcp_flags = subflow['tcp.flags']
            # sub_features.append(tcp_flags.mean())
            # sub_features.append(tcp_flags.std())
            # sub_features.append(tcp_flags.quantile(.25))
            # sub_features.append(tcp_flags.median())
            # sub_features.append(tcp_flags.quantile(.75))
            # sub_features.append(tcp_flags.min())
            # sub_features.append(tcp_flags.max())
            # # TTL statistics
            # ttl = subflow['ip.ttl']
            # sub_features.append(ttl.mean())
            # sub_features.append(ttl.std())
            # sub_features.append(ttl.quantile(.25))
            # sub_features.append(ttl.median())
            # sub_features.append(ttl.quantile(.75))
            # sub_features.append(ttl.min())
            # sub_features.append(ttl.max())
            # No anomalies in nominal data
            sub_features.append(0)
            subflow_features.append(sub_features)
        # Passing columns to the constructor (instead of assigning them
        # afterwards) keeps this working when no subflow qualified.
        self.subflow_features = pd.DataFrame(subflow_features,
                                             columns=self.feature_cols)

    def shuffleSubflows(self):
        """Randomly reorder the rows of ``self.subflow_features``."""
        self.subflow_features = self.subflow_features.sample(frac=1)

    def featuresToCSV(self):
        """Write the features to ``<file stem>_features.csv`` and print a summary.

        The output directory is ``self.path`` with its last four characters
        replaced by ``features/``; it is created if missing.
        """
        print("Saving features to CSV...")
        # NOTE(review): stripping 4 characters presumably removes a trailing
        # 'csv' + separator from the input directory - confirm for other paths.
        out_dir = self.path[0:-4] + 'features/'
        out_name = self.file.split('.')[0]+"_features.csv"
        os.makedirs(out_dir, exist_ok=True)
        self.subflow_features.to_csv(out_dir+out_name, encoding="utf-8", index=False)
        # info() prints its report itself and returns None, so it must not
        # be wrapped in print().
        self.subflow_features.info()
        print()
# 5G Data
# Run the full extraction pipeline only when this file is executed as a
# script, so that loading the module elsewhere does not trigger the CSV
# read/write below. The names stay at module level so they remain
# inspectable after an interactive run.
if __name__ == "__main__":
    path = 'C:\\Users\\Michael\\Dropbox\\Backup\\Michael\\Shared\\Documents\\VTEC\\5G Code\\csv\\'
    file = 'combined_5G_pcaps.csv'
    method = "timeout"  # Options: "timeout" or "interval" (default/recommended)
    extractor = Extractor(path, file, method)
    # Each step consumes attributes set by the one before it; keep the order.
    extractor.dropNaN()
    extractor.convertColumns()
    extractor.partitionFlows()
    extractor.linkKeys()
    extractor.findIndices()
    extractor.partitionSubflows()
    extractor.extractSubflowFeatures()
    extractor.shuffleSubflows()
    extractor.featuresToCSV()
    # DEBUG
    feats = extractor.getSubflowFeatures()
    flows = extractor.getFlowInfo()