-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathreddit_project_functions.py
More file actions
128 lines (83 loc) · 4.17 KB
/
reddit_project_functions.py
File metadata and controls
128 lines (83 loc) · 4.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from datetime import datetime
from functools import reduce
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone joblib package and fall back to the legacy path.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
def pipe(obj, *fns):
    """Thread *obj* through each function in *fns*, left to right.

    ``pipe(x, f, g)`` is equivalent to ``g(f(x))``; with no functions the
    object is returned unchanged.
    """
    result = obj
    for fn in fns:
        result = fn(result)
    return result
def data_load_and_clean(data_file_directoy) -> pd.DataFrame:
    '''
    Load a CSV of reddit activity and filter it down to mid-sized subreddits.

    Parameters
    ----------
    data_file_directoy : str
        Path to a CSV with (at least) columns [subreddit, utc, username].
        (Misspelled name kept for backward compatibility with keyword callers.)

    Returns
    -------
    pd.DataFrame
        Rows whose subreddit has between 1000 and 400000 entries (exclusive),
        with the dominant 'AskReddit' subreddit removed outright.
    '''
    print('Loading dataset...\n')
    df = pd.read_csv(data_file_directoy)
    print('Cleaning dataset...\n')
    # Count entries per subreddit; work with the Series directly so the code
    # does not depend on the column name value_counts() produces (it changed
    # to 'count' in pandas 2.x).
    counts = df['subreddit'].value_counts()
    keep = counts[(counts > 1000) & (counts < 400000)].index
    # Remove the largest subreddit.
    clean_df = df[df['subreddit'] != 'AskReddit']
    # BUG FIX: the original used str.contains('|'.join(clean_list)), which
    # treats subreddit names as regex alternatives and matches substrings
    # (e.g. 'art' would match 'artificial'); isin() does exact membership.
    return clean_df[clean_df['subreddit'].isin(keep)]
def build_datastructure(clean_dataframe: pd.DataFrame):
    '''
    Compute per-user interest ratings for each subreddit.

    A user's rating for a subreddit is the share of all their entries that
    landed in that subreddit, so one user's ratings sum to 1.  Users active
    in only one subreddit (rating == 1) carry no preference signal and are
    dropped.

    Parameters
    ----------
    clean_dataframe : pd.DataFrame
        Columns [username, subreddit, utc] (output of data_load_and_clean).

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        (pivot, ratings): pivot has one row per username and one column per
        subreddit holding the rating (0 where absent); ratings is the
        long-form [username, subreddit, rating] frame.
        NOTE: the original annotated the return as pd.DataFrame although it
        returns a 2-tuple; the annotation is corrected here.
    '''
    print('Building 1st data structure...\n')
    # Entries per (user, subreddit): aggfunc='count' counts the utc column.
    agg_df = clean_dataframe.pivot_table(index=['username', 'subreddit'], aggfunc='count').sort_values(by='username', ascending=False)
    # Total entries per user.
    user_df = pd.DataFrame(agg_df.groupby('username')['utc'].sum())
    # Attach each user's total so a per-row share can be computed.
    final_df = pd.merge(agg_df.reset_index(), user_df, on='username', how='left')
    # rating = entries in this subreddit / user's total entries.
    final_df['rating'] = final_df['utc_x'] / final_df['utc_y']
    # Keep only the columns the model needs.
    final_df = final_df[['username', 'subreddit', 'rating']]
    # rating == 1 means the user only ever appeared in one subreddit.
    final_df = final_df[final_df['rating'] < 1]
    # Wide users x subreddits matrix for the clustering model.
    return final_df.pivot_table(index='username', columns='subreddit', values='rating', aggfunc='max').fillna(0).reset_index(), final_df
def fit_model(clean_pivot: pd.DataFrame, n_clusters: int = 25, random_state=None):
    '''
    Train a KMeans clustering model on the user/subreddit rating matrix.

    Parameters
    ----------
    clean_pivot : pd.DataFrame
        One row per user: a 'username' column plus one rating column per
        subreddit (first element returned by build_datastructure).
    n_clusters : int, optional
        Number of clusters; defaults to 25, the original hard-coded value.
    random_state : int or None, optional
        Seed for KMeans centroid initialisation.  Default None preserves the
        original (nondeterministic) behaviour; pass an int for reproducible
        clusters.

    Returns
    -------
    The fitted KMeans model.  Also persisted via joblib to
    'subreddit_clustering_model.sav' in the current directory.
    '''
    print('\nTraining model...\n')
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    # 'username' is an identifier, not a feature - drop it before fitting.
    subreddit_clustering_model = kmeans.fit(clean_pivot.drop('username', axis=1))
    print('Sending saved model to file called "subreddit_clustering_model.sav"...\n')
    file_name = 'subreddit_clustering_model.sav'
    joblib.dump(subreddit_clustering_model, file_name)
    return subreddit_clustering_model
def join_labels(trained_model, train_df_orig, train_df):
    '''
    Attach cluster labels to each user and write the labeled data to CSV.

    Parameters
    ----------
    trained_model
        A fitted model exposing predict() (the KMeans from fit_model).
    train_df_orig : pd.DataFrame
        Wide users x subreddits rating matrix with a 'username' column.
        Mutated in place: gains a 'labels' column.
    train_df : pd.DataFrame
        Long-form [username, subreddit, rating] frame to receive labels.

    Side effects: writes 'labeled_dataset.csv' to the current directory.
    Returns None.
    '''
    # BUG FIX: the original called fit_predict(), which retrains the model
    # from scratch and discards the training done in fit_model; predict()
    # assigns clusters using the already-fitted centroids.
    train_df_orig['labels'] = trained_model.predict(train_df_orig.drop('username', axis=1))
    train_merged = train_df.merge(train_df_orig[['username', 'labels']], on='username', how='left')
    print('Sending labeled data to CSV file called "labeled_dataset"...')
    train_merged.to_csv('labeled_dataset.csv', index=False)
def run_subreddit_clustering_program():
    '''
    End-to-end driver: load and clean 'reddit_data.csv', build the user
    rating matrix, train the clustering model, and label the dataset.

    Side effects: writes 'subreddit_pivot_table.csv',
    'subreddit_clustering_model.sav' and 'labeled_dataset.csv' to the
    current directory.  Returns the result of join_labels (None).
    '''
    # pipe() with a single stage is just build_datastructure applied to the
    # cleaned data; unpack its (pivot, long-form) 2-tuple result.
    modeling_datastructure, master_dataframe = pipe(data_load_and_clean('reddit_data.csv'),
                                                    build_datastructure
                                                    )
    print('\nSending subreddit table to a csv called "subreddit_table"')
    modeling_datastructure.to_csv('subreddit_pivot_table.csv', index = False)
    model = fit_model(modeling_datastructure)
    return join_labels(model, modeling_datastructure, master_dataframe)
# Guard the pipeline behind a main check so importing this module for its
# functions does not trigger a full training run as a side effect.
if __name__ == '__main__':
    print('\nRunning clustering program...\n')
    run_subreddit_clustering_program()