-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathpull.py
More file actions
57 lines (45 loc) · 1.75 KB
/
pull.py
File metadata and controls
57 lines (45 loc) · 1.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""
This file will be responsible for housing the functions that pull and clean the data
"""
# DEPENDENCIES
import pandas as pd
import numpy as np
import sqlite3
# Query the DB and select a random 100k rows.
# export it to CSV
# Load the user/subreddit visit counts: reuse the cached CSV when present,
# otherwise sample 100k random (user, subreddit, visits) rows from the DB
# and cache them for the next run.
try:
    # Fast path: the cached export from a previous run.
    data = pd.read_csv('reddit_user_subreddit.csv')
except FileNotFoundError:
    # Cache miss only — any other error (corrupt CSV, bad dtype) should
    # surface instead of silently triggering an expensive re-query.
    connection = sqlite3.connect('reddit_data.db')
    data = pd.read_sql_query(
        """
        SELECT O.username_id, O.subreddit_id, COUNT(O.utc) AS visits
        FROM
        Observation O
        GROUP BY O.username_id, O.subreddit_id
        ORDER BY RANDOM()
        LIMIT 100000;
        """,
        connection
    )
    # Release the DB handle — sqlite3 connections are not closed by GC timing
    # guarantees, and a dangling handle can hold the file lock.
    connection.close()
    # index=False keeps the pandas index out of the cache file, so a later
    # read_csv returns exactly the columns the SQL query produced
    # (no spurious 'Unnamed: 0' column).
    data.to_csv('reddit_user_subreddit.csv', index=False)
# Build the subreddit_id -> subreddit name lookup table, to be consulted
# after the recommender runs; cached to CSV like the main dataset.
try:
    # Fast path: the cached mapper export from a previous run.
    mapper_df = pd.read_csv('subreddit_mapper.csv')
except FileNotFoundError:
    # Cache miss only — don't let unrelated read errors fall through here.
    connection = sqlite3.connect('reddit_data.db')
    mapper_df = pd.read_sql_query(
        """
        SELECT S.subreddit_id, S.subreddit FROM Subreddit S;
        """, connection)
    # Release the DB handle once the query result is materialized.
    connection.close()
    # index=False so the cached file round-trips with the same two columns
    # the SQL query produced.
    mapper_df.to_csv('subreddit_mapper.csv', index=False)
# Ramon's mapper - need to turn this into a function to build the mapper automatically
# this mapper will include all subreddits from the original db, not just from our 100k rows
dict_subreddits = mapper_df['subreddit_id']  # Series of ids (kept for backward compatibility)
dict_subreddits.index = mapper_df['subreddit']  # indexed by subreddit name
dict_subreddits = dict_subreddits.to_dict()  # name -> id dict
# Build subreddit_id -> lower-cased name directly from the dataframe columns.
# The old approach (reverse the name->id dict) silently dropped ids whenever
# two rows shared a subreddit name, because dict keys collided on the first pass.
sr_map = {sid: name.lower() for sid, name in zip(mapper_df['subreddit_id'], mapper_df['subreddit'])}
# print(sr_map[8]) # print a specific key's value