Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
254 changes: 254 additions & 0 deletions KNN_RF_classification_Khudiakov.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
{
"metadata": {
"name": "",
"signature": "sha256:6c6a9db99ed7aca8fabca83238325003c94cd8e0bd87bb51d4c424f0762b60cd"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.utils import shuffle\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from matplotlib.colors import ListedColormap"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Let's see what is in the file:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"red_dots = np.loadtxt(\"red.txt\")\n",
"blue_dots = np.loadtxt(\"blue.txt\")\n",
"plt.plot(red_dots[:, 0], red_dots[:, 1], '.', color='red')\n",
"plt.plot(blue_dots[:, 0], blue_dots[:, 1], '.', color='blue')\n",
"plt.show()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 2,
"text": [
"[<matplotlib.lines.Line2D at 0x1077c4650>]"
]
}
],
"prompt_number": 2
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Class definition:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"red_class = np.array(([1])*len(red_dots)) # red is 1\n",
"red_class = red_class.reshape(red_class.size, 1)\n",
"blue_class = np.array(([0])*len(blue_dots)) # blue is 0\n",
"blue_class = blue_class.reshape(blue_class.size, 1)\n",
"total_class = np.concatenate((red_class, blue_class))\n",
"total_dots = np.concatenate((red_dots, blue_dots))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Mixing dataset and its division on the train set (950 dots) and test set (50 dots)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"shuffled_total_dots, shuffled_total_class = shuffle(total_dots, total_class)\n",
"shuffled_total_class = shuffled_total_class.reshape(total_class.size, )\n",
"dots_train = shuffled_total_dots[:950]\n",
"class_train = shuffled_total_class[:950]\n",
"dots_test = shuffled_total_dots[950:]\n",
"class_test = shuffled_total_class[950:]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"KNN"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"knn = KNeighborsClassifier(n_neighbors=7)\n",
"knn.fit(dots_train, class_train)\n",
"print 'Accuracy of KNN train set:', knn.score(dots_train, class_train)\n",
"print 'Accuracy of KNN test set:', knn.score(dots_test, class_test)\n",
"def plot_knn(X):\n",
" h = .018\n",
" cmap_light = ListedColormap(['#0000FF', '#FF0000'])\n",
" x_min, x_max = X[:, 0].min(), X[:, 0].max()\n",
" y_min, y_max = X[:, 1].min(), X[:, 1].max()\n",
" xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n",
" Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])\n",
" Z = Z.reshape(xx.shape)\n",
" plt.scatter(shuffled_total_dots[:, 0], shuffled_total_dots[:, 1], c=shuffled_total_class, cmap=cmap_light)\n",
" plt.scatter(xx, yy, Z)\n",
" plt.title('k Nearest Neighbors')\n",
" plt.show()\n",
"\n",
"plot_knn(dots_test)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Accuracy of KNN train set: 0.896842105263\n",
"Accuracy of KNN test set: 0.94\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"RF"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"rfc = RandomForestClassifier(n_estimators=10)\n",
"rfc.fit(dots_train, class_train)\n",
"print 'Accuracy of Random Forest train set:', rfc.score(dots_train, class_train)\n",
"print 'Accuracy of Random Forest test set:', rfc.score(dots_test, class_test)\n",
"\n",
"def plot_rfc(X):\n",
" h = .018\n",
" cmap_light = ListedColormap(['#0000FF', '#FF0000'])\n",
" x_min, x_max = X[:, 0].min(), X[:, 0].max()\n",
" y_min, y_max = X[:, 1].min(), X[:, 1].max()\n",
" xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n",
" Z = rfc.predict(np.c_[xx.ravel(), yy.ravel()])\n",
" Z = Z.reshape(xx.shape)\n",
" plt.scatter(shuffled_total_dots[:, 0], shuffled_total_dots[:, 1], c = shuffled_total_class, cmap=cmap_light)\n",
" plt.scatter(xx, yy, Z)\n",
" plt.title('Random Forest')\n",
" plt.show()\n",
"\n",
"plot_rfc(dots_test)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Accuracy of Random Forest train set: 0.983157894737\n",
"Accuracy of Random Forest test set: 0.92\n"
]
}
],
"prompt_number": 8
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Color prediction"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"\n",
"def what_is_the_color_of_the_dot_knn(coord_x, coord_y):\n",
" list = [coord_x, coord_y]\n",
" if knn.predict(list) == 1:\n",
" return 'KNN decision: Red'\n",
" else:\n",
" return 'KNN decision: Blue'\n",
"\n",
"def what_is_the_color_of_the_dot_rfc(coord_x, coord_y):\n",
" list = [coord_x, coord_y]\n",
" if rfc.predict(list) == 1:\n",
" return 'RF decision: Red'\n",
" else:\n",
" return 'RF decision: Blue'\n",
" \n",
"print what_is_the_color_of_the_dot_knn(0.6, 0.47)\n",
"print what_is_the_color_of_the_dot_rfc(0.6, 0.47)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"KNN decision: Blue\n",
"RF decision: Red\n"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
75 changes: 75 additions & 0 deletions khudiakov/A1_components.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@

import sys
import threading



def main():

# read number of vertices
f = open('components.in')
first_string = f.readline().strip().split(' ')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Лучше делать .split(), чем .split(' ')

n = int(first_string[0])
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Видимо, вторым числом было количество рёбер, и оно наверное пригодилось бы, чтобы сделать дальнейший ввод проще


#read edges list
next_string = ' '
edges_list = []
inner_list = []
while next_string != ['']:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Очень сложное условие итерирования, можно было бы просто

for line in f:
    u, v = [int(x) for x in line.split()]
    edges_list.append((u, v))

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Вообще, надо стараться писать итерацию так, чтобы не писать один код дважды.

В этом цикле два дублирования: проверка next_string != [''] и присваивание inner_list = []

Присваивание можно было сделать один раз в начале цикла.

next_string = f.readline().strip().split(' ')
if next_string != ['']:
inner_list.append(int(next_string[0])-1)
inner_list.append(int(next_string[1])-1)
edges_list.append(inner_list)
inner_list = []
f.close()

# make full edges list with reverse edges
rev_edges_list = []
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Вообще, это отлично переписывается через comprehension
rev_edges = [(v, u) for (u, v) in edges]

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Кстати, вообще в питоне list и tuple почти одно и то же, но всё же тут логичнее использовать для рёбер tuple

for i in edges_list:
a = i[::-1]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

зачем тут эта переменная?

Можно было сразу написать rev_edges_list.append(i[::-1])

rev_edges_list.append(a)
full_edges_list = edges_list + rev_edges_list

#convert edges list to adjacency list
adjacency_list = []
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

опять же, comprehension

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Скажу поподробнее =)

Видимо, вначале тут был код adjacency_list = [[]] * n, который не работает, потому что это n ссылок на один и тот же список. Правильно писать adjacency_list = [[] for _ in range(n)].

Просто для сравнения, матрицу из нулей удобно создавать так:

m = [[0] * n for _ in range(n)]


for i in range(n):
adjacency_list.append([])

for i in full_edges_list:
a = (i[0])
adjacency_list[a].append(i[1])
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

загадочная индексация.

Проще так:

for edge in full_edges_list:
    u, v = edge
    adjacency_list[u].append(v)


#dfs
visited = [False] * n
comp_number = 0
components_list = [-1] * n
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Нет необходимости хранить и visited, и components_list, потому что visited[i] — это то же самое, что и components_list[i] != -1


def dfs(v):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Вообще это не очень хорошо -- объявлять большую функцию в середине огромной функции

components_list[v] = comp_number
visited[v] = True
for w in adjacency_list[v]:
if not visited[w]:
dfs(w)

#counting components
for v in range(n):
if not visited[v]:
dfs(v)
comp_number += 1


#write to file

ans = open('components.out', 'w')
ans.write(str(comp_number) + '\n')
for w in components_list:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Возможно лучше опять использовать comprehension =)

ans.write(' '.join(str(x + 1) for x in components_list))

ans.write(str(int(w) + 1) + ' ')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

int(w) == w?

ans.close()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Вообще, main слишком огромная функция. Лучше сделать так

def read_input():
    ....
    return graph

def dfs(graph, v, components): # no global vars!
    ...

def find_components(graph):
    ...
    return components

def print_answer(components):
    ....

def main():
    g = read_input()
    c = find_components(g)
    print_answer(c)



# Run main() in a dedicated thread. threading.stack_size() only applies to
# threads created after the call, so it has to be set before the Thread is
# constructed. The larger stack and the raised recursion limit are there for
# the recursive dfs() defined inside main().
threading.stack_size(2 ** 26) # 64 MB stack size
sys.setrecursionlimit(1000000000) # recursion depth
thread = threading.Thread(target=main)
thread.start()
50 changes: 50 additions & 0 deletions khudiakov/A2_Shortest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@

import sys
import threading

def main():

# read number of vertices
f = open('pathbge1.in')
v, e = (int(i) for i in f.readline().strip().split())
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

.strip().split() эквивалентен .split()



#adjacency list
adjacency_list=[[] for i in range(v)]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍, только есть convention использовать _ вместо i если переменная не используется

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

В смысле, for _ in range() ?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ога

for edge in f:
x, y = ((int(i) - 1) for i in edge.strip().split())
adjacency_list[x].append(y)
adjacency_list[y].append(x)

#bfs
visited = [False] * v
distance = [-1 for i in range(v)]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

опять же, достаточно только distance


def bfs(start, adjacency_list):
queue = [start]
visited[start] = True
distance[start] = 0 # distance from start vertex to current one
while len(queue) > 0:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

while queue:


x = queue.pop(0)

for i in adjacency_list[x]:
if visited[i] == False:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if not visited[i]

Есть две вещи, которые не надо делать с boolean-ами:

if x == True:
if x:
    return True
else:
    return False

queue.append(i)
visited[i] = True
if distance[i] == -1:
distance[i] = distance[x] + 1
return distance

#write to file
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Каждый раз, когда чувствуется необходимость написать комментарий, лучше подумать, как бы переписать код так, чтобы он не был нужен.

В данном случае, #write to file намекает на необходимость функции def write_to_file(distances).

Коммент #bfs выше сигнализирует о том, что надо вынести bfs в отдельную глобальную функцию и избавиться от 'глобальных' переменных visited и distance


ans = open('pathbge1.out', 'w')
for w in bfs(0, adjacency_list):
ans.write(str(int(w)) + ' ')
ans.close()


threading.stack_size(2 ** 26) # 64 MB stack size
sys.setrecursionlimit(1000000000) # recursion depth
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Зачем, тут же нет рекурсии?

thread = threading.Thread(target=main)
thread.start()
Loading