-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsonar.py
More file actions
48 lines (40 loc) · 1.96 KB
/
sonar.py
File metadata and controls
48 lines (40 loc) · 1.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# Load and prepare Sonar data. For a detailed description of the Sonar
# dataset, see: https://archive.ics.uci.edu/ml/datasets/Connectionist+Bench+(Sonar,+Mines+vs.+Rocks)
# Load the entire dataset into a large (208, 61) NumPy array:
import csv
import numpy as np
# Read every CSV row of the Sonar file into a list of string lists. The
# `with` block closes the file handle as soon as the rows have been read, and
# newline="" is what the csv module asks for when it is given a file object.
with open("sonar_data/sonar.all-data", newline="") as sonar_file:
    data_as_python_list = list(csv.reader(sonar_file))
data_without_bias_column = np.array(data_as_python_list)

# Prepend a bias column, resulting in a (208, 62) matrix. The matrix still
# holds strings at this point, so the inserted 1 is stored as a string too;
# the conversion to numbers happens further down.
data = np.insert(data_without_bias_column, 0, 1, axis=1)

# Shuffle data. This is important, because the Sonar dataset contains all
# "Rock" examples first, and all "Metal" examples later. If we don't shuffle it,
# we'll end up with a test set composed exclusively of "Metal" examples when
# we split the dataset later.
np.random.seed(1234)     # Have the same predictable shuffle every time
np.random.shuffle(data)  # Shuffle matrix rows in place

# Extract a (208, 61) input matrix:
# - [:, 0:-1] stands for: "all rows, all columns except the last one"
# - Convert all strings to float
X = data[:, 0:-1].astype(np.float32)

# Extract a (208, 1) matrix of labels:
# - [:, -1] stands for: "extract all rows, but only the last column"
# - Reshape to be 1 column and as many rows as necessary
# - Convert all 'M's to True and all 'R's to False
# - Convert True and False to 1 and 0, respectively
labels = data[:, -1].reshape(-1, 1)
Y_unencoded = (labels == 'M').astype(np.int_)

# Split into training and test set. np.vsplit cuts the rows at the given
# index, so the first 160 rows go to training and the rest to testing:
SIZE_OF_TRAINING_SET = 160  # Keep the remaining 48 elements for testing
X_train, X_test = np.vsplit(X, [SIZE_OF_TRAINING_SET])
Y_train_unencoded, Y_test = np.vsplit(Y_unencoded, [SIZE_OF_TRAINING_SET])
# One hot encode the training set, but not the test set:
def one_hot_encode(Y, n_classes=2):
    """Return a one-hot encoded copy of the integer class labels in Y.

    Args:
        Y: NumPy array of integer labels whose first axis indexes the
            examples, such as an (n, 1) column matrix or a flat (n,) vector.
            Each label must be a valid column index into the result.
        n_classes: Number of columns in the encoded matrix. Defaults to 2,
            the value that used to be hard-coded here.

    Returns:
        A float array of shape (n_labels, n_classes) that is 0 everywhere
        except for a 1 in each row at the column(s) named by that row of Y.

    Raises:
        IndexError: If a label falls outside the range of valid columns.
    """
    n_labels = Y.shape[0]
    encoded_Y = np.zeros((n_labels, n_classes))
    # View the labels as exactly one row per example, whatever the trailing
    # shape of Y is. The explicit width (instead of -1) keeps the reshape
    # valid for an empty Y as well.
    labels_per_row = int(np.prod(Y.shape[1:]))
    columns = Y.reshape(n_labels, labels_per_row)
    # A column vector of row numbers broadcasts against `columns`, so a single
    # indexed assignment sets every (row, label) cell without a Python loop.
    rows = np.arange(n_labels).reshape(-1, 1)
    encoded_Y[rows, columns] = 1
    return encoded_Y
# Y_train gets one column per class; Y_test (split off above) stays a single
# column of 0/1 labels.
Y_train = one_hot_encode(Y_train_unencoded)