Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
170 changes: 170 additions & 0 deletions dmc_database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
# -*- coding: utf-8 -*-
from flask import Flask
from flask import render_template
from flask import request
from flask import Response

import json
import time
import sys
import random
import math

import pyorient

from Queue import Queue

from sklearn import preprocessing
from sklearn import svm

import numpy as np

# Flask application object; the @app.route decorators in this file register
# their view functions on it.
app = Flask(__name__)

# Queue of status messages: getData() puts progress text on it and
# event_stream() takes the messages off again to send them to the browser.
q = Queue()

def remap(value, min1, max1, min2, max2):
    """Linearly map value from the range [min1, max1] onto [min2, max2].

    Every argument is converted with float(), so numeric strings are
    accepted. Values outside [min1, max1] are extrapolated, not clamped.

    Returns a float. When the source range is empty (min1 == max1) there is
    no slope to apply, so the lower target bound min2 is returned instead of
    failing with ZeroDivisionError.
    """
    value, min1, max1 = float(value), float(min1), float(max1)
    min2, max2 = float(min2), float(max2)

    source_span = max1 - min1
    if source_span == 0:
        return min2
    return min2 + (value - min1) * (max2 - min2) / source_span

def event_stream():
    """Yield one 'data: ...' frame for every message taken from q.

    The generator blocks in q.get() until a message is available and never
    finishes on its own.
    """
    frame = 'data: %s\n\n'
    while 1:
        message = q.get()
        yield frame % str(message)


# Filter out database
def filter_database():
    """Print the Checkins records whose text contains both search keywords.

    Connects to the OrientDB server on localhost:2424, opens the "weibo"
    database, runs one full-text query and prints the matching records as a
    list. Nothing in the database is modified or deleted.

    Returns None. Calls sys.exit() when the database does not exist.
    """
    client = pyorient.OrientDB("localhost", 2424)
    client.connect("root", "admin")
    db_name = "weibo"
    db_username = "admin"
    db_password = "admin"

    if not client.db_exists(db_name, pyorient.STORAGE_TYPE_MEMORY):
        print("database [" + db_name + "] does not exist! session ending...")
        sys.exit()

    client.db_open(db_name, db_username, db_password)
    print(db_name + " opened successfully")

    try:
        # Select the check-ins whose text contains BOTH keywords.
        # NOTE(review): the other queries in this file use the class name
        # `Checkin`; confirm that `Checkins` is the intended class here.
        query = 'SELECT FROM Checkins WHERE text containstext "东莞" and text containstext "太子"'
        records = client.command(query)
        print(list(records))
    finally:
        # Close the database even when the query raises.
        client.db_close()

# End Filter
@app.route('/eventSource/')
def sse_source():
    """Serve the event_stream() generator as a text/event-stream response."""
    stream = event_stream()
    return Response(stream, mimetype='text/event-stream')

@app.route("/")
def index():
    """Render the index.html template for the root URL."""
    page = render_template("index.html")
    return page

def _link_places(places):
    """Score places and build connecting lines from shared visitors.

    Two places are linked when more than one user appears in both of their
    'users' sets.

    places: dict mapping a record id to a dict with the keys 'lat', 'lng'
        and 'users' (a set of user ids). It is not modified.

    Returns (scores, lines): scores maps every record id to the number of
    places it is linked to, and lines holds one dict per linked pair with
    the keys 'from', 'to' and 'coordinates'.
    """
    scores = dict.fromkeys(places, 0)
    lines = []

    # Walk every unordered pair once; working on a snapshot list avoids
    # removing entries from the dict while its keys are being iterated.
    entries = list(places.items())
    for position, (rid1, info1) in enumerate(entries):
        for rid2, info2 in entries[position + 1:]:
            if len(info1['users'] & info2['users']) > 1:
                scores[rid1] += 1
                scores[rid2] += 1
                lines.append({
                    'from': rid1,
                    'to': rid2,
                    'coordinates': [info1['lat'], info1['lng'],
                                    info2['lat'], info2['lng']],
                })

    return scores, lines


@app.route("/getData/")
def getData():
    """Return linked "Outdoors" places as a JSON feature collection.

    Reads the lat1/lng1/lat2/lng2 query parameters (currently only logged),
    loads every Place with cat_1 = "Outdoors", collects the users connected
    to each place through Checkin edges, and links places that share more
    than one user. Progress text is pushed onto q along the way and 'idle'
    is pushed when the request is done.

    Returns a JSON string with "type", "features" (one entry per place that
    has at least one link) and "lines" (one entry per linked pair). When the
    database does not exist, a JSON error response with status 500 is
    returned instead.
    """
    q.put("starting data query...")

    lat1 = str(request.args.get('lat1'))
    lng1 = str(request.args.get('lng1'))
    lat2 = str(request.args.get('lat2'))
    lng2 = str(request.args.get('lng2'))

    print("received coordinates: [" + lat1 + ", " + lat2 + "], [" + lng1 + ", " + lng2 + "]")

    client = pyorient.OrientDB("localhost", 2424)
    client.connect("root", "admin")
    db_name = "weibo"
    db_username = "admin"
    db_password = "admin"

    if not client.db_exists(db_name, pyorient.STORAGE_TYPE_MEMORY):
        # Answer the request with an error instead of calling sys.exit()
        # inside a request handler, and reset the status shown to the client.
        message = "database [" + db_name + "] does not exist! session ending..."
        print(message)
        q.put('idle')
        return Response(json.dumps({"error": message}), status=500,
                        mimetype='application/json')

    client.db_open(db_name, db_username, db_password)
    print(db_name + " opened successfully")

    try:
        # The bounding-box query that used to run first was dropped: its
        # result was overwritten by this query before ever being read, and it
        # interpolated raw request arguments into the SQL text.
        # NOTE(review): confirm that a projection query still returns usable
        # record ids in `_rid`, since they feed the TRAVERSE below.
        query = 'SELECT lat, lng, cat_2, title FROM Place WHERE cat_1 = "Outdoors"'
        records = client.command(query)

        numListings = len(records)
        print('received ' + str(numListings) + ' records')

        placesDict = {}
        for place in records:
            placesDict[place._rid] = {'lat': place.lat, 'lng': place.lng}

        s = "SELECT * FROM (TRAVERSE in(Checkin) FROM {}) WHERE @class = 'User'"
        for i, rid in enumerate(list(placesDict)):
            q.put('processing ' + str(i) + ' out of ' + str(numListings) + ' places...')
            people = client.command(s.format(rid))
            placesDict[rid]['users'] = set(person.uid for person in people)
    finally:
        # Close the database even when one of the queries raises.
        client.db_close()

    q.put('matching records...')
    scoreDict, lines = _link_places(placesDict)

    output = {"type": "FeatureCollection", "features": []}

    for record in records:
        score = scoreDict[record._rid]
        if score < 1:
            # Places without any link are left out of the response.
            continue
        output["features"].append({
            "type": "Feature",
            "id": record._rid,
            "properties": {
                "name": record.title,
                # The query selects cat_2, not cat_1, so cat_2 is the
                # category field available on these records.
                "cat": record.cat_2,
                "score": score,
            },
            # Order kept as [lat, lng], exactly as before.
            # NOTE(review): GeoJSON normally orders positions as [lng, lat];
            # confirm what the client expects.
            "geometry": {"type": "Point", "coordinates": [record.lat, record.lng]},
        })

    output["lines"] = lines

    q.put('idle')
    return json.dumps(output)


if __name__ == "__main__":
    # NOTE(review): debug=True together with host 0.0.0.0 makes the debug
    # server reachable from other machines; confirm this only runs on a
    # trusted network.
    server_options = dict(host='0.0.0.0', port=5000, debug=True, threaded=True)
    app.run(**server_options)
49 changes: 49 additions & 0 deletions documentation_database_Xi.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
Data Processing

The database we used is the Weibo dataset, and we looked into its check-in information to geo-locate the places where prostitution may have happened.
In the first test, we wanted to create a subset of the Weibo data, narrowing the range of data to make the query process more stable. In this round, we targeted the Dongguan Prince Hotel and queried the check-in information based either on the geo-location of the spot or on check-in text containing the words 'dongguan（东莞）' and 'prince（太子）'.
The filtering code is as follows:

def filter_database():
query = 'SELECT FROM Checkins WHERE text containstext "东莞" and text containstext "太子"'
records = client.command(query) # My guess: records is the set of all records
cluster_id = 3
rec_list = []
for rec in records:
rec_list.append(rec)
print rec_list
client.db_close()
The results of this code were quite limited: only about 20 check-ins came out. At this stage we therefore kept trying different OrientDB queries to find better results.
We also wrote queries based on a time range in order to filter for a specific type of data. For instance, we wanted to query the check-ins made between 8:00 pm and 4:00 am every day, which are more closely related to the "yellow" (sex) industry. The following queries, joined with UNION, check that specific time range.
SELECT * FROM Checkin WHERE lat BETWEEN 22.53 AND 22.56 AND lng BETWEEN 114.04 AND 114.08 AND time REGEXP "^2014-{1}[01]{1}[0-9]{1}-{1}[0-3]{1}[0-9]{1}\s2{1}[0-3]{1}:.*$"

UNION

SELECT * FROM Checkin WHERE lat BETWEEN 22.53 AND 22.56 AND lng BETWEEN 114.04 AND 114.08 AND time REGEXP "^2014-{1}[01]{1}[0-9]{1}-{1}[0-3]{1}[0-9]{1}\s0{1}[0-3]{1}:.*$"

This query language cannot be read by Python, and the results may need further filtering because the type of each check-in is uncertain.

Then we slightly changed our target from a single spot, the 'Prince Hotel', to a whole district, 'Houjie', another area that is also known for this kind of industry. In this area, we queried one specific one-day period after the "yellow clearance" (anti-vice) action; this gave us the original places as our set A.

query = 'SELECT FROM Checkin WHERE lat BETWEEN 22.929935 AND 22.961751 AND lng BETWEEN 113.639837 AND 113.693017 AND time BETWEEN "2014-09-03 03:00:00" and "2014-09-04 04:00:00"'
numListings = len(records)
print 'received ' + str(numListings) + ' Checkins'

Next, we used the 60 results from the previous step to look up the user profiles that issued those Weibo check-ins. We then used the 'TRAVERSE' command to build a new query that finds the other places this group of people goes to. In this way, we tried to build the connection between our original places (set A) and the extended places (set C).

uniqueUsers = []
originPlaces = []
connectedPlaces = []

output = {"type":"FeatureCollection","originP":[],"connectedP":[]}

for record in records:
user = str(record.out)

if user in uniqueUsers:
continue
uniqueUsers.append(user)

places = client.command("SELECT * FROM (TRAVERSE out(Checkin) FROM (SELECT * FROM {})) WHERE @class = 'Place'".format(record.out))
print 'received ' + str(len(places)) + ' connected places from ' + str(record._in)
Loading