-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathreddit_project_functions.py
More file actions
128 lines (83 loc) · 4.17 KB
/
reddit_project_functions.py
File metadata and controls
128 lines (83 loc) · 4.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from datetime import datetime
from functools import reduce
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone joblib package and fall back to the legacy path.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
def pipe(obj, *fns):
    """Thread *obj* through each function in *fns*, left to right.

    ``pipe(x, f, g)`` is equivalent to ``g(f(x))``; with no functions the
    object is returned unchanged.
    """
    result = obj
    for fn in fns:
        result = fn(result)
    return result
def data_load_and_clean(data_file_directoy) -> pd.DataFrame:
    '''
    Load a CSV of reddit activity and filter it down to mid-sized subreddits.

    Parameters
    ----------
    data_file_directoy : str
        Path to a CSV with (at least) columns [subreddit, utc, username].
        (Misspelled name kept for backward compatibility with keyword callers.)

    Returns
    -------
    pd.DataFrame
        Rows whose subreddit has between 1000 and 400000 entries (exclusive),
        with the dominant 'AskReddit' subreddit removed outright.
    '''
    print('Loading dataset...\n')
    df = pd.read_csv(data_file_directoy)
    print('Cleaning dataset...\n')
    # Count entries per subreddit; work with the Series directly so the code
    # does not depend on the column name value_counts() produces (it changed
    # to 'count' in pandas 2.x).
    counts = df['subreddit'].value_counts()
    keep = counts[(counts > 1000) & (counts < 400000)].index
    # Remove the largest subreddit.
    clean_df = df[df['subreddit'] != 'AskReddit']
    # BUG FIX: the original used str.contains('|'.join(clean_list)), which
    # treats subreddit names as regex alternatives and matches substrings
    # (e.g. 'art' would match 'artificial'); isin() does exact membership.
    return clean_df[clean_df['subreddit'].isin(keep)]
def build_datastructure(clean_dataframe: pd.DataFrame):
    '''
    Compute per-user interest ratings for each subreddit.

    A user's rating for a subreddit is the share of all their entries that
    landed in that subreddit, so one user's ratings sum to 1.  Users active
    in only one subreddit (rating == 1) carry no preference signal and are
    dropped.

    Parameters
    ----------
    clean_dataframe : pd.DataFrame
        Columns [username, subreddit, utc] (output of data_load_and_clean).

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        (pivot, ratings): pivot has one row per username and one column per
        subreddit holding the rating (0 where absent); ratings is the
        long-form [username, subreddit, rating] frame.
        NOTE: the original annotated the return as pd.DataFrame although it
        returns a 2-tuple; the annotation is corrected here.
    '''
    print('Building 1st data structure...\n')
    # Entries per (user, subreddit): aggfunc='count' counts the utc column.
    agg_df = clean_dataframe.pivot_table(index=['username', 'subreddit'], aggfunc='count').sort_values(by='username', ascending=False)
    # Total entries per user.
    user_df = pd.DataFrame(agg_df.groupby('username')['utc'].sum())
    # Attach each user's total so a per-row share can be computed.
    final_df = pd.merge(agg_df.reset_index(), user_df, on='username', how='left')
    # rating = entries in this subreddit / user's total entries.
    final_df['rating'] = final_df['utc_x'] / final_df['utc_y']
    # Keep only the columns the model needs.
    final_df = final_df[['username', 'subreddit', 'rating']]
    # rating == 1 means the user only ever appeared in one subreddit.
    final_df = final_df[final_df['rating'] < 1]
    # Wide users x subreddits matrix for the clustering model.
    return final_df.pivot_table(index='username', columns='subreddit', values='rating', aggfunc='max').fillna(0).reset_index(), final_df
def fit_model(clean_pivot: pd.DataFrame, n_clusters: int = 25, random_state=None):
    '''
    Train a KMeans clustering model on the user/subreddit rating matrix.

    Parameters
    ----------
    clean_pivot : pd.DataFrame
        One row per user: a 'username' column plus one rating column per
        subreddit (first element returned by build_datastructure).
    n_clusters : int, optional
        Number of clusters; defaults to 25, the original hard-coded value.
    random_state : int or None, optional
        Seed for KMeans centroid initialisation.  Default None preserves the
        original (nondeterministic) behaviour; pass an int for reproducible
        clusters.

    Returns
    -------
    The fitted KMeans model.  Also persisted via joblib to
    'subreddit_clustering_model.sav' in the current directory.
    '''
    print('\nTraining model...\n')
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    # 'username' is an identifier, not a feature - drop it before fitting.
    subreddit_clustering_model = kmeans.fit(clean_pivot.drop('username', axis=1))
    print('Sending saved model to file called "subreddit_clustering_model.sav"...\n')
    file_name = 'subreddit_clustering_model.sav'
    joblib.dump(subreddit_clustering_model, file_name)
    return subreddit_clustering_model
def join_labels(trained_model, train_df_orig, train_df):
    '''
    Attach cluster labels to each user and write the labeled data to CSV.

    Parameters
    ----------
    trained_model
        A fitted model exposing predict() (the KMeans from fit_model).
    train_df_orig : pd.DataFrame
        Wide users x subreddits rating matrix with a 'username' column.
        Mutated in place: gains a 'labels' column.
    train_df : pd.DataFrame
        Long-form [username, subreddit, rating] frame to receive labels.

    Side effects: writes 'labeled_dataset.csv' to the current directory.
    Returns None.
    '''
    # BUG FIX: the original called fit_predict(), which retrains the model
    # from scratch and discards the training done in fit_model; predict()
    # assigns clusters using the already-fitted centroids.
    train_df_orig['labels'] = trained_model.predict(train_df_orig.drop('username', axis=1))
    train_merged = train_df.merge(train_df_orig[['username', 'labels']], on='username', how='left')
    print('Sending labeled data to CSV file called "labeled_dataset"...')
    train_merged.to_csv('labeled_dataset.csv', index=False)
def run_subreddit_clustering_program():
    '''
    End-to-end driver: load and clean 'reddit_data.csv', build the user
    rating matrix, train the clustering model, and label the dataset.

    Side effects: writes 'subreddit_pivot_table.csv',
    'subreddit_clustering_model.sav' and 'labeled_dataset.csv' to the
    current directory.  Returns the result of join_labels (None).
    '''
    # pipe() with a single stage is just build_datastructure applied to the
    # cleaned data; unpack its (pivot, long-form) 2-tuple result.
    modeling_datastructure, master_dataframe = pipe(data_load_and_clean('reddit_data.csv'),
                                                    build_datastructure
                                                    )
    print('\nSending subreddit table to a csv called "subreddit_table"')
    modeling_datastructure.to_csv('subreddit_pivot_table.csv', index = False)
    model = fit_model(modeling_datastructure)
    return join_labels(model, modeling_datastructure, master_dataframe)
# Guard the pipeline behind a main check so importing this module for its
# functions does not trigger a full training run as a side effect.
if __name__ == '__main__':
    print('\nRunning clustering program...\n')
    run_subreddit_clustering_program()