Creepathon/setconnection.py at master · pde-bakk/Creepathon · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from pymongo import MongoClient
import pandas as pd
import sys


# Show every column on one line instead of wrapping columns under each other.
# Option keys are written out in full: pandas treats a key as a regex pattern,
# and an abbreviated pattern such as 'max_columns' can match more than one
# registered option, in which case set_option raises OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 0)
pd.set_option('display.max_seq_items', None)


def _connect_mongo(host, port, username, password, db):
	"""Connect to a MongoDB server and return a handle to database `db`.

	When both `username` and `password` are truthy, the connection is made
	through a `mongodb://` URI that embeds the credentials; otherwise the
	client is created from `host` and `port` alone.

	:param host: server host name or address
	:param port: server port
	:param username: user name, or None/empty for an unauthenticated connection
	:param password: password, or None/empty for an unauthenticated connection
	:param db: name of the database to return
	:return: the result of indexing the client with `db`
	"""

	# For us, we don't need a username and password, but if you did, here's how
	if username and password:
		# Imported here so the function does not depend on a file-level import.
		from urllib.parse import quote_plus

		# Credentials must be percent-escaped before being placed in the URI;
		# otherwise characters such as '@', ':', '/' or '%' corrupt it.
		mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (
			quote_plus(username), quote_plus(password), host, port, db)
		conn = MongoClient(mongo_uri)
	else:
		conn = MongoClient(host, port)

	return conn[db]


def aggregate_collection(arg, db='test', collection='venmo', host='localhost', port=27017, username=None, password=None, no_id=True):
	""" Run the default aggregation on a collection to carve out a smaller, more useful subset.

	The pipeline keeps the documents whose "payment.target.user.first_name"
	equals 'Jason' and sends the result to the "Jasons" collection via $out.
	`arg` and `no_id` are accepted but not used by this function.
	"""

	database = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

	# The one default aggregation: filter on the first name, then write the matches out
	default_pipeline = [
		{'$match': {"payment.target.user.first_name": 'Jason'}},
		{'$out': "Jasons"},
	]
	database.get_collection(collection).aggregate(default_pipeline)


def read_mongo(db='test', collection='venmo', query=None, host='localhost', port=27017, username=None, password=None, no_id=True):
	""" Read from Mongo and Store into DataFrame

	:param db: name of the database to read from
	:param collection: name of the collection to query
	:param query: filter document passed to find(); None means an empty filter
	:param host: server host, forwarded to _connect_mongo
	:param port: server port, forwarded to _connect_mongo
	:param username: optional user name, forwarded to _connect_mongo
	:param password: optional password, forwarded to _connect_mongo
	:param no_id: when true, drop the '_id' column from the result
	:return: a pandas DataFrame built from at most 10000 matching documents
	"""

	# A fresh dict per call avoids sharing one mutable default between calls.
	if query is None:
		query = {}

	# Let's set up a connection to the mongoDB server we host on our own machine.
	db = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

	# Make a query to the specific Database and collection
	# For testing purposes, we limit the response to 10000
	cursor = db[collection].find(query).limit(10000)

	# Expand the cursor and construct the DataFrame
	df = pd.DataFrame(list(cursor))

	# Delete the _id. An empty result yields a DataFrame without any columns,
	# so only delete the column when it is actually present.
	if no_id and '_id' in df.columns:
		del df['_id']

	return df


# Echo the script name and the argument count before doing any work
print(sys.argv[0])
print(len(sys.argv))

if len(sys.argv) <= 2:
	# Fewer than two arguments: read with the default database and collection,
	# then take a sneak peek at the first (10) results
	dataframe = read_mongo()
	print(dataframe.head(10))

elif sys.argv[1] == 'aggregate':
	# Aggregate (i.e. split) the dataset instead of printing it
	aggregate_collection(sys.argv[2])

else:
	# The two arguments are the database and the collection to read from;
	# show the first (10) results to get a feel for the data
	dataframe = read_mongo(sys.argv[1], sys.argv[2])
	print(dataframe.head(10))