-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrecommender.py
More file actions
81 lines (65 loc) · 3.16 KB
/
recommender.py
File metadata and controls
81 lines (65 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#import necessary libraries
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
#load the MovieLens movie and tag tables from the local ml-latest folder,
#keeping only the columns used below (this drops e.g. the timestamp column)
movies = pd.read_csv("./ml-latest/movies.csv")[['movieId', 'title', 'genres']]
tags = pd.read_csv("./ml-latest/tags.csv")[['movieId', 'tag']]
#a function to turn movies with no genres listed to NaN
def clean_genres(s):
    """Map the "(no genres listed)" placeholder to NaN.

    Args:
        s: A raw genres value.

    Returns:
        ``np.nan`` when ``s`` equals the placeholder string
        "(no genres listed)"; otherwise ``s`` unchanged.
    """
    if s == "(no genres listed)":
        return np.nan
    return s
#--- tags: one row per movie ---
#missing tags become '' and all tags of a movie are glued into one comma-separated string
tags = tags.fillna('').groupby('movieId')['tag'].apply(lambda group: ','.join(group)).reset_index()
#keep only the rows that have a movieId, then store the ids as integers
tags = tags.dropna(subset=['movieId'])
tags['movieId'] = tags['movieId'].astype(int)

#--- movies: drop entries without usable genres ---
#the "(no genres listed)" placeholder is turned into NaN by clean_genres
movies['genres'] = movies['genres'].map(clean_genres)
#keep only the rows that have both a movieId and genres, then store the ids as integers
movies = movies.dropna(subset=['movieId', 'genres'])
movies['movieId'] = movies['movieId'].astype(int)
#genres are "|"-separated in the data; remove spaces, lowercase and split them into a list
movies['genres'] = movies['genres'].apply(lambda g: g.replace(" ", "").lower().split("|"))

#--- attach the tags to the movies (only movies with a row in tags are kept) ---
movies = pd.merge(movies, tags, on="movieId")
#each tag is squeezed into a single lowercase word, then the string is split into a list
movies['tag'] = movies['tag'].apply(lambda t: t.replace(" ", "").lower().split(","))
#creates a metadata column that contains all the tags and genres for the movie;
#the two lists are concatenated BEFORE joining so that the last genre and the
#first tag are separated by a space instead of being fused into one bogus word
movies['metadata'] = movies.apply(lambda x: " ".join(x["genres"] + x["tag"]), axis=1)
#creates a vectorizer that counts the words of each metadata string
vec = CountVectorizer(stop_words="english")
#puts the word counts into a matrix (one row per movie, in the row order of movies)
matrix = vec.fit_transform(movies['metadata'])
#gets the cosine similarity between every pair of movies
cos_sim = cosine_similarity(matrix, matrix)
#creates a mapping of each title to its row position in cos_sim; positions are
#used rather than index labels because cos_sim is addressed by position
mapping = pd.Series(np.arange(len(movies)), index=movies['title'])
#a title that occurs more than once keeps only its first row, so that
#mapping[title] is always a single position and never a Series
mapping = mapping[~mapping.index.duplicated(keep='first')]
def select_movie(input):
    """Return the title of the first movie whose title contains ``input``.

    The search is a literal substring match against the ``title`` column of
    the module-level ``movies`` frame. An exact-case match is tried first;
    only when that finds nothing is the search repeated ignoring case.

    Args:
        input: Text to look for inside the movie titles.

    Returns:
        The title of the first matching row of ``movies``.

    Raises:
        IndexError: If no title contains ``input``.
    """
    titles = movies['title']
    #regex=False: titles contain characters such as "(" and ")" that a regex
    #search would treat as syntax; na=False: missing titles never match
    hits = titles[titles.str.contains(input, regex=False, na=False)]
    if hits.empty:
        #makes the search more flexible: retry without case sensitivity
        hits = titles[titles.str.contains(input, case=False, regex=False, na=False)]
    if hits.empty:
        raise IndexError(f"no movie title contains {input!r}")
    #grabs just the title of the first matching movie
    return hits.iloc[0]
def movie_recommender(input, top_n=10):
    """Recommend the movies most similar to the one matching ``input``.

    The query is resolved to a title with ``select_movie``, the title is
    looked up in ``mapping`` to get its row in ``cos_sim``, and the other
    movies are ranked by their similarity score in that row.

    Args:
        input: Text used to find the reference movie by title.
        top_n: How many recommendations to return (default 10).

    Returns:
        The ``title`` entries of ``movies`` for the ``top_n`` most similar
        movies, most similar first. The reference movie itself is never
        included.

    Raises:
        ValueError: If ``top_n`` is negative.
        IndexError: Propagated from ``select_movie`` when nothing matches.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    mov_input = select_movie(input)
    print("Recommend a movie similar to: ", mov_input)
    #finds the row of the movie based on its title
    movie = mapping[mov_input]
    if isinstance(movie, pd.Series):
        #a title that occurs more than once yields several rows; use the first
        movie = movie.iloc[0]
    #pairs every other movie with its similarity score; the reference movie is
    #skipped explicitly because it is not guaranteed to sort first when another
    #movie has identical metadata (and therefore the same top score)
    sim_score = [(i, score) for i, score in enumerate(cos_sim[movie]) if i != movie]
    #orders the movies from most to least similar (ties keep their row order)
    sim_score = sorted(sim_score, key=lambda x: x[1], reverse=True)
    #gets the row positions of the top_n most similar movies
    indices = [i for i, _ in sim_score[:top_n]]
    #returns the titles of those movies
    return movies['title'].iloc[indices]
#example run: look up a title containing "Avengers" and show its recommendations
recommendations = movie_recommender("Avengers")
print(recommendations)