-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathpull.py
More file actions
57 lines (45 loc) · 1.75 KB
/
pull.py
File metadata and controls
57 lines (45 loc) · 1.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""
This file will be responsible for housing the functions that pull and clean the data
"""
# DEPENDENCIES
import pandas as pd
import numpy as np
import sqlite3
# Query the DB and select a random 100k rows.
# export it to CSV
# Load the user/subreddit visit counts: reuse the cached CSV when present,
# otherwise sample 100k random (user, subreddit, visits) rows from the DB
# and cache them for the next run.
try:
    # Fast path: the cached export from a previous run.
    data = pd.read_csv('reddit_user_subreddit.csv')
except FileNotFoundError:
    # Cache miss only — any other error (corrupt CSV, bad dtype) should
    # surface instead of silently triggering an expensive re-query.
    connection = sqlite3.connect('reddit_data.db')
    data = pd.read_sql_query(
        """
        SELECT O.username_id, O.subreddit_id, COUNT(O.utc) AS visits
        FROM
        Observation O
        GROUP BY O.username_id, O.subreddit_id
        ORDER BY RANDOM()
        LIMIT 100000;
        """,
        connection
    )
    # Release the DB handle — sqlite3 connections are not closed by GC timing
    # guarantees, and a dangling handle can hold the file lock.
    connection.close()
    # index=False keeps the pandas index out of the cache file, so a later
    # read_csv returns exactly the columns the SQL query produced
    # (no spurious 'Unnamed: 0' column).
    data.to_csv('reddit_user_subreddit.csv', index=False)
# Build the subreddit_id -> subreddit name lookup table, to be consulted
# after the recommender runs; cached to CSV like the main dataset.
try:
    # Fast path: the cached mapper export from a previous run.
    mapper_df = pd.read_csv('subreddit_mapper.csv')
except FileNotFoundError:
    # Cache miss only — don't let unrelated read errors fall through here.
    connection = sqlite3.connect('reddit_data.db')
    mapper_df = pd.read_sql_query(
        """
        SELECT S.subreddit_id, S.subreddit FROM Subreddit S;
        """, connection)
    # Release the DB handle once the query result is materialized.
    connection.close()
    # index=False so the cached file round-trips with the same two columns
    # the SQL query produced.
    mapper_df.to_csv('subreddit_mapper.csv', index=False)
# Ramon's mapper - need to turn this into a function to build the mapper automatically
# this mapper will include all subreddits from the original db, not just from our 100k rows
dict_subreddits = mapper_df['subreddit_id']  # Series of ids (kept for backward compatibility)
dict_subreddits.index = mapper_df['subreddit']  # indexed by subreddit name
dict_subreddits = dict_subreddits.to_dict()  # name -> id dict
# Build subreddit_id -> lower-cased name directly from the dataframe columns.
# The old approach (reverse the name->id dict) silently dropped ids whenever
# two rows shared a subreddit name, because dict keys collided on the first pass.
sr_map = {sid: name.lower() for sid, name in zip(mapper_df['subreddit_id'], mapper_df['subreddit'])}
# print(sr_map[8]) # print a specific key's value