From 6d1a5ac936b973b7d3516ebe666fb401b17da295 Mon Sep 17 00:00:00 2001
From: William Gayde <gayde2@illinois.edu>
Date: Sun, 30 Sep 2018 18:17:02 -0500
Subject: [PATCH 1/8] starting csv parse for sbserver

---
 cmd/sbserver/parse-csv.py | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 cmd/sbserver/parse-csv.py

diff --git a/cmd/sbserver/parse-csv.py b/cmd/sbserver/parse-csv.py
new file mode 100644
index 0000000..9882d61
--- /dev/null
+++ b/cmd/sbserver/parse-csv.py
@@ -0,0 +1,35 @@
+import requests
+import json
+import csv
+
+url = "https://safebrowsing.googleapis.com/v4/threatMatches:find?key="
+
+x = {
+	'threatInfo': {
+	'threatTypes': ['ANY_TYPE'], #TODO check this
+		'threadEntries': []
+	}
+}
+
+entries = [
+	{'url':'google.com'},
+]
+
+#print (entries[0]['url'])
+
+with open('blacklist-entries.csv') as csvfile:
+    spamreader = csv.reader(csvfile, delimiter=',')
+    next(spamreader)
+    for row in spamreader:
+		if(row[0][:2] == '//'):
+			newEntry = 	{'url':row[0][2:]}
+			
+		else:
+			newEntry = 	{'url':row[0]}
+
+		#print(newEntry)	
+		entries.append(newEntry)
+#print(entries)
+
+x['threatInfo']['threadEntries'] = entries
+print (json.dumps(x))
\ No newline at end of file

From d503247af1c5faf20f9f813007cfa50c0ab89c0a Mon Sep 17 00:00:00 2001
From: William Gayde <gayde2@illinois.edu>
Date: Fri, 5 Oct 2018 11:13:14 -0500
Subject: [PATCH 2/8] parse-csv: POST first 10 blacklist entries to local sbserver

---
 cmd/sbserver/parse-csv.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/cmd/sbserver/parse-csv.py b/cmd/sbserver/parse-csv.py
index 9882d61..8f2cec7 100644
--- a/cmd/sbserver/parse-csv.py
+++ b/cmd/sbserver/parse-csv.py
@@ -2,7 +2,7 @@
 import json
 import csv
 
-url = "https://safebrowsing.googleapis.com/v4/threatMatches:find?key="
+url = "127.0.0.1:8080/v4/threatMatches:find"
 
 x = {
 	'threatInfo': {
@@ -16,11 +16,13 @@
 ]
 
 #print (entries[0]['url'])
-
+i = 0;
 with open('blacklist-entries.csv') as csvfile:
-    spamreader = csv.reader(csvfile, delimiter=',')
-    next(spamreader)
-    for row in spamreader:
+	spamreader = csv.reader(csvfile, delimiter=',')
+	next(spamreader)
+	for row in spamreader:
+		if (i == 10):
+			break
 		if(row[0][:2] == '//'):
 			newEntry = 	{'url':row[0][2:]}
 			
@@ -29,7 +31,10 @@
 
 		#print(newEntry)	
 		entries.append(newEntry)
+		i = i + 1;
 #print(entries)
 
 x['threatInfo']['threadEntries'] = entries
-print (json.dumps(x))
\ No newline at end of file
+request = requests.post(url, params=x)
+print request.txt
+#print (json.dumps(x))
\ No newline at end of file

From 6159410e6997b3c65347a69983e12cfb920f1a00 Mon Sep 17 00:00:00 2001
From: gayde2 <gayde2@illinois.edu>
Date: Fri, 5 Oct 2018 17:03:01 -0500
Subject: [PATCH 3/8] parse-csv: query fullHashes:find with hard-coded hash prefixes

---
 cmd/sbserver/parse-csv.py | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/cmd/sbserver/parse-csv.py b/cmd/sbserver/parse-csv.py
index 8f2cec7..aa71e60 100644
--- a/cmd/sbserver/parse-csv.py
+++ b/cmd/sbserver/parse-csv.py
@@ -2,12 +2,26 @@
 import json
 import csv
 
-url = "127.0.0.1:8080/v4/threatMatches:find"
+url = "https://safebrowsing.googleapis.com/v4/fullHashes:find?key=AIzaSyC4EjprVfp6YX6aSlHqFZhaUrmbgRMoi7w"
 
 x = {
+
+        'client':{
+        'clientId': 'IllinoisNSRG',
+        'clientVersion': '1.0',
+        },
 	'threatInfo': {
-	'threatTypes': ['ANY_TYPE'], #TODO check this
-		'threadEntries': []
+	'threatTypes': ['THREAT_TYPE_UNSPECIFIED'],
+	'platformTypes': ['ANY_PLATFORM'],
+	'threatEntryTypes':['URL'],
+	'threatEntries': [
+      {"hash": "WwuJdQ=="},
+      {"hash": "771MOg=="},
+      {"hash": "5eOrwQ=="}	]
+	},
+	'apiClient':{
+	'clientId': 'IllinoisNSRG',
+	'clientVersion': '1.0',
 	}
 }
 
@@ -25,16 +39,19 @@
 			break
 		if(row[0][:2] == '//'):
 			newEntry = 	{'url':row[0][2:]}
-			
+
 		else:
 			newEntry = 	{'url':row[0]}
 
-		#print(newEntry)	
+		#print(newEntry)
 		entries.append(newEntry)
 		i = i + 1;
 #print(entries)
 
-x['threatInfo']['threadEntries'] = entries
-request = requests.post(url, params=x)
-print request.txt
-#print (json.dumps(x))
\ No newline at end of file
+
+#x['threatInfo']['threatEntries'] = entries
+#print(x)
+response = requests.post(url, json=x)
+print (response.text)
+#print (response.json())
+print (json.dumps(x))

From 40884b587bfff977edbc64be53d6544e87e11eeb Mon Sep 17 00:00:00 2001
From: gayde2 <gayde2@illinois.edu>
Date: Mon, 8 Oct 2018 16:16:50 -0500
Subject: [PATCH 4/8] work on csv parser and hashing

---
 cmd/sbserver/parse-csv.py | 217 +++++++++++++++++++++++++++++---------
 1 file changed, 165 insertions(+), 52 deletions(-)

diff --git a/cmd/sbserver/parse-csv.py b/cmd/sbserver/parse-csv.py
index aa71e60..0af833f 100644
--- a/cmd/sbserver/parse-csv.py
+++ b/cmd/sbserver/parse-csv.py
@@ -1,57 +1,170 @@
+from functools import wraps
+
+try:
+	import urllib, urlparse
+except ImportError:
+	import urllib.parse as urllib
+	from urllib import parse as urlparse
+
+import struct
+import time
+import posixpath
+import re
+import hashlib
+import socket
+import random
+import base64
 import requests
 import json
 import csv
 
-url = "https://safebrowsing.googleapis.com/v4/fullHashes:find?key=AIzaSyC4EjprVfp6YX6aSlHqFZhaUrmbgRMoi7w"
-
-x = {
-
-        'client':{
-        'clientId': 'IllinoisNSRG',
-        'clientVersion': '1.0',
-        },
-	'threatInfo': {
-	'threatTypes': ['THREAT_TYPE_UNSPECIFIED'],
-	'platformTypes': ['ANY_PLATFORM'],
-	'threatEntryTypes':['URL'],
-	'threatEntries': [
-      {"hash": "WwuJdQ=="},
-      {"hash": "771MOg=="},
-      {"hash": "5eOrwQ=="}	]
-	},
-	'apiClient':{
-	'clientId': 'IllinoisNSRG',
-	'clientVersion': '1.0',
+def full_unescape(u):
+	uu = urllib.unquote(u)
+	if uu == u:
+		return uu
+	else:
+		return full_unescape(uu)
+
+def quote(s):
+	safe_chars = '!"$&\'()*+,-./:;<=>?@[\\]^_`{|}~'
+	return urllib.quote(s, safe=safe_chars)
+
+def canonical(s):
+	url = s.strip()
+	url = url.replace('\n', '').replace('\r', '').replace('\t', '')
+	url = url.split('#', 1)[0]
+	if url.startswith('//'):
+		 url = 'http:' + url
+	if len(url.split('://')) <= 1:
+		url = 'http://' + url
+	url = quote(full_unescape(url))
+	url_parts = urlparse.urlsplit(url)
+	if not url_parts[0]:
+		url = 'http://%s' % url
+		url_parts = urlparse.urlsplit(url)
+	protocol = url_parts.scheme
+	host = full_unescape(url_parts.hostname)
+	path = full_unescape(url_parts.path)
+	query = url_parts.query
+	if not query and '?' not in url:
+		query = None
+	if not path:
+		path = '/'
+	has_trailing_slash = (path[-1] == '/')
+	path = posixpath.normpath(path).replace('//', '/')
+	if has_trailing_slash and path[-1] != '/':
+		path = path + '/'
+	port = url_parts.port
+	host = host.strip('.')
+	host = re.sub(r'\.+', '.', host).lower()
+	if host.isdigit():
+		try:
+			host = socket.inet_ntoa(struct.pack("!I", int(host)))
+		except:
+			pass
+	if host.startswith('0x') and '.' not in host:
+		try:
+			host = socket.inet_ntoa(struct.pack("!I", int(host, 16)))
+		except:
+			pass
+	quoted_path = quote(path)
+	quoted_host = quote(host)
+	if port is not None:
+		quoted_host = '%s:%s' % (quoted_host, port)
+	canonical_url = '%s://%s%s' % (protocol, quoted_host, quoted_path)
+	if query is not None:
+		canonical_url = '%s?%s' % (canonical_url, query)
+#	print("canonical url is " + canonical_url)
+	return canonical_url
+
+def url_host_permutations(host):
+	if re.match(r'\d+\.\d+\.\d+\.\d+', host):
+		yield host
+		return
+	parts = host.split('.')
+	l = min(len(parts),5)
+	if l > 4:
+		yield host
+	for i in range(l-1):
+		yield '.'.join(parts[i-l:])
+
+def url_path_permutations(path):
+	yield path
+	query = None
+	if '?' in path:
+		path, query =  path.split('?', 1)
+	if query is not None:
+		yield path
+	path_parts = path.split('/')[0:-1]
+	curr_path = ''
+	for i in range(min(4, len(path_parts) )):
+		curr_path = curr_path + path_parts[i] + '/'
+		yield curr_path
+
+def url_permutations(url):
+#	print("in url_permutations, url is " + url)
+	protocol, address_str = urllib.splittype(url)
+	host, path = urllib.splithost(address_str)
+	user, host = urllib.splituser(str(host))
+	host, port = urllib.splitport(host)
+	host = host.strip('/')
+	seen_permutations = set()
+	for h in url_host_permutations(host):
+		for p in url_path_permutations(path):
+			u = '%s%s' % (h, p)
+			if u not in seen_permutations:
+				yield u
+
+				seen_permutations.add(u)
+
+def digest(url):
+	return hashlib.sha256(url.encode('utf-8')).digest()
+
+def hashes(url):
+#	print("in hash function, url is " + url)
+	url_hash = digest(url)
+	yield url_hash
+
+
+if __name__ == '__main__':
+	url = "https://safebrowsing.googleapis.com/v4/fullHashes:find?key="
+
+	x = {
+			'client':{},
+			'clientStates':[],
+			'threatInfo': {
+			'threatTypes': ['THREAT_TYPE_UNSPECIFIED'],
+			'platformTypes': ['ANY_PLATFORM'],
+			'threatEntryTypes':['URL'],
+			'threatEntries': []
+			},
+			'apiClient':{},
 	}
-}
-
-entries = [
-	{'url':'google.com'},
-]
-
-#print (entries[0]['url'])
-i = 0;
-with open('blacklist-entries.csv') as csvfile:
-	spamreader = csv.reader(csvfile, delimiter=',')
-	next(spamreader)
-	for row in spamreader:
-		if (i == 10):
-			break
-		if(row[0][:2] == '//'):
-			newEntry = 	{'url':row[0][2:]}
-
-		else:
-			newEntry = 	{'url':row[0]}
-
-		#print(newEntry)
-		entries.append(newEntry)
-		i = i + 1;
-#print(entries)
-
-
-#x['threatInfo']['threatEntries'] = entries
-#print(x)
-response = requests.post(url, json=x)
-print (response.text)
-#print (response.json())
-print (json.dumps(x))
+
+	entries = [	]
+
+	i = 0;
+	with open('blacklist-entries.csv') as csvfile:
+			spamreader = csv.reader(csvfile, delimiter=',')
+			next(spamreader)
+			for row in spamreader:
+				if (i == 475):
+						break
+				if(row[0][:2] == '//'):
+					inputURL = row[0][2:]
+				else:
+					inputURL = row[0]
+				for permutations in url_permutations(canonical(inputURL)):
+					for hashed in hashes(permutations):
+						hashValue = base64.b64encode(hashed[0:4])
+						hashValue.replace('\n', '')
+						newEntry = {'hash': hashValue}
+						entries.append(newEntry)
+				i = i + 1;
+	x['threatInfo']['threatEntries'] = entries
+	request = requests.post(url, json=x)
+	print (request.text)
+
+
+
+

From ff889ebcc85bda6b638c9be62dce6153d52a1675 Mon Sep 17 00:00:00 2001
From: gayde2 <gayde2@illinois.edu>
Date: Fri, 12 Oct 2018 14:09:42 -0500
Subject: [PATCH 5/8] python script now does all entries in file in blocks of
 size 496

---
 cmd/sbserver/parse-csv.py | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/cmd/sbserver/parse-csv.py b/cmd/sbserver/parse-csv.py
index 0af833f..621918a 100644
--- a/cmd/sbserver/parse-csv.py
+++ b/cmd/sbserver/parse-csv.py
@@ -74,7 +74,7 @@ def canonical(s):
 	canonical_url = '%s://%s%s' % (protocol, quoted_host, quoted_path)
 	if query is not None:
 		canonical_url = '%s?%s' % (canonical_url, query)
-#	print("canonical url is " + canonical_url)
+	#print("canonical url is " + canonical_url)
 	return canonical_url
 
 def url_host_permutations(host):
@@ -127,7 +127,7 @@ def hashes(url):
 
 
 if __name__ == '__main__':
-	url = "https://safebrowsing.googleapis.com/v4/fullHashes:find?key="
+	url = "https://safebrowsing.googleapis.com/v4/fullHashes:find?key=AIzaSyBU6G2w4ItQUaWMTQCAzgViEX2mN-a1sxc"
 
 	x = {
 			'client':{},
@@ -143,27 +143,37 @@ def hashes(url):
 
 	entries = [	]
 
+
+	#print (entries[0]['url'])
 	i = 0;
+	send = 0
 	with open('blacklist-entries.csv') as csvfile:
 			spamreader = csv.reader(csvfile, delimiter=',')
 			next(spamreader)
 			for row in spamreader:
-				if (i == 475):
-						break
+				if (i % 496 == 0):
+						send = 1
 				if(row[0][:2] == '//'):
 					inputURL = row[0][2:]
 				else:
 					inputURL = row[0]
 				for permutations in url_permutations(canonical(inputURL)):
 					for hashed in hashes(permutations):
+						#print (hashed[0:4])
 						hashValue = base64.b64encode(hashed[0:4])
 						hashValue.replace('\n', '')
+						#print(hashValue)
 						newEntry = {'hash': hashValue}
 						entries.append(newEntry)
 				i = i + 1;
-	x['threatInfo']['threatEntries'] = entries
-	request = requests.post(url, json=x)
-	print (request.text)
+
+				if send == 1:
+					x['threatInfo']['threatEntries'] = entries
+					request = requests.post(url, json=x)
+#					print(json.dumps(x))
+					print (request.text)
+					send = 0
+					entries = []
 
 
 

From de39e09def49498732f10acede8556575b6daa9f Mon Sep 17 00:00:00 2001
From: gayde2 <gayde2@illinois.edu>
Date: Fri, 12 Oct 2018 14:34:57 -0500
Subject: [PATCH 6/8] parse python function now returns results

---
 cmd/sbserver/parse-csv.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/cmd/sbserver/parse-csv.py b/cmd/sbserver/parse-csv.py
index 621918a..83b36d7 100644
--- a/cmd/sbserver/parse-csv.py
+++ b/cmd/sbserver/parse-csv.py
@@ -127,13 +127,16 @@ def hashes(url):
 
 
 if __name__ == '__main__':
-	url = "https://safebrowsing.googleapis.com/v4/fullHashes:find?key=AIzaSyBU6G2w4ItQUaWMTQCAzgViEX2mN-a1sxc"
+	url = "https://safebrowsing.googleapis.com/v4/fullHashes:find?key="
 
 	x = {
-			'client':{},
+			'client':{
+			'clientId': 'NSRG',
+			'clientVersion': '1.0'
+			},
 			'clientStates':[],
 			'threatInfo': {
-			'threatTypes': ['THREAT_TYPE_UNSPECIFIED'],
+			'threatTypes': ['MALWARE', 'SOCIAL_ENGINEERING', 'UNWANTED_SOFTWARE', 'POTENTIALLY_HARMFUL_APPLICATION', 'THREAT_TYPE_UNSPECIFIED'],
 			'platformTypes': ['ANY_PLATFORM'],
 			'threatEntryTypes':['URL'],
 			'threatEntries': []

From 17e0c5154e471fc2cfb2ff9f8410c0df0499f53c Mon Sep 17 00:00:00 2001
From: gayde2 <gayde2@illinois.edu>
Date: Mon, 15 Oct 2018 14:26:17 -0500
Subject: [PATCH 7/8] parse-csv: take API key and CSV paths from argv, write matches to output CSV

---
 cmd/sbserver/parse-csv.py | 74 +++++++++++++++++++++++++++++++--------
 1 file changed, 59 insertions(+), 15 deletions(-)

diff --git a/cmd/sbserver/parse-csv.py b/cmd/sbserver/parse-csv.py
index 83b36d7..48691b6 100644
--- a/cmd/sbserver/parse-csv.py
+++ b/cmd/sbserver/parse-csv.py
@@ -1,3 +1,5 @@
+# Usage: python parse-csv.py <api-key> <input-csv> <output-csv>
+
 from functools import wraps
 
 try:
@@ -17,6 +19,11 @@
 import requests
 import json
 import csv
+import datetime
+import sys
+
+urlLookup = dict()
+partialHashLookup = dict()
 
 def full_unescape(u):
 	uu = urllib.unquote(u)
@@ -118,16 +125,17 @@ def url_permutations(url):
 				seen_permutations.add(u)
 
 def digest(url):
-	return hashlib.sha256(url.encode('utf-8')).digest()
+	digest = hashlib.sha256(url.encode('utf-8')).digest()
+	urlLookup[base64.b64encode(digest)] = url
+	return digest
 
 def hashes(url):
-#	print("in hash function, url is " + url)
 	url_hash = digest(url)
 	yield url_hash
 
 
 if __name__ == '__main__':
-	url = "https://safebrowsing.googleapis.com/v4/fullHashes:find?key="
+	url = "https://safebrowsing.googleapis.com/v4/fullHashes:find?key=" + sys.argv[1]
 
 	x = {
 			'client':{
@@ -147,36 +155,72 @@ def hashes(url):
 	entries = [	]
 
 
-	#print (entries[0]['url'])
 	i = 0;
 	send = 0
-	with open('blacklist-entries.csv') as csvfile:
-			spamreader = csv.reader(csvfile, delimiter=',')
-			next(spamreader)
-			for row in spamreader:
-				if (i % 496 == 0):
-						send = 1
+	with open(sys.argv[2]) as csvfile:
+		with open(sys.argv[3], "w+") as outfile:
+			blacklistWriter = csv.writer(outfile, delimiter=',')
+			blacklistReader = csv.reader(csvfile, delimiter=',')
+			blacklistWriter.writerow(['URL', 'Full Hash', 'Partial Hash', 'UTC Time Stamp', 'Match Type', 'Match Metadata', 'Platform'])
+			next(blacklistReader)
+			for row in blacklistReader:
+				if (i % 200 == 0):
+					send = 1
 				if(row[0][:2] == '//'):
 					inputURL = row[0][2:]
 				else:
 					inputURL = row[0]
 				for permutations in url_permutations(canonical(inputURL)):
 					for hashed in hashes(permutations):
-						#print (hashed[0:4])
 						hashValue = base64.b64encode(hashed[0:4])
+						partialHashLookup[base64.b64encode(hashed)] = hashValue
 						hashValue.replace('\n', '')
-						#print(hashValue)
 						newEntry = {'hash': hashValue}
 						entries.append(newEntry)
 				i = i + 1;
 
 				if send == 1:
 					x['threatInfo']['threatEntries'] = entries
-					request = requests.post(url, json=x)
-#					print(json.dumps(x))
-					print (request.text)
+					response = requests.post(url, json=x)
+					responseJSON = json.loads(response.text)
+					if 'matches' in responseJSON:
+						for hits in responseJSON['matches']:
+							if 'threatType' in hits:
+								threatType = hits['threatType']
+							if 'platformType' in hits:
+								platformType = hits['platformType']
+							currentHash = ''
+							if 'threat' in hits:
+								currentHash = hits['threat']['hash']
+							malwareType = ''
+							timestamp = datetime.datetime.utcnow()
+							if 'threatEntryMetadata' in hits:
+								if 'entries' in hits['threatEntryMetadata']:
+									malwareTypeHash = hits['threatEntryMetadata']['entries'][0]['value']
+									if 'TEFORElORw' in malwareTypeHash:
+										malwareType = 'MALWARE LANDING'
+									if 'RElTVFJJQlVUSU9O' in malwareTypeHash:
+										malwareType = 'MALWARE DISTRIBUTION'
+
+							if currentHash in urlLookup:
+								currentURL = urlLookup[currentHash]
+								urlLookup.pop(currentHash)
+							else:
+								currentURL = ""
+
+							if currentHash in partialHashLookup:
+								partialHash = partialHashLookup[currentHash]
+								partialHashLookup.pop(currentHash)
+							else:
+								partialHash = ""
+							blacklistWriter.writerow([currentURL, currentHash, partialHash, timestamp, threatType, malwareType, platformType])
 					send = 0
 					entries = []
+			leftovers = urlLookup.items()
+			for extra in leftovers:
+				blacklistWriter.writerow([extra[1], extra[0], partialHashLookup[extra[0]], timestamp, "", "", ""])
+
+
 
 
 

From bd6ac636a9d2bf339ffb19b587f6c79908dd200c Mon Sep 17 00:00:00 2001
From: gayde2 <gayde2@illinois.edu>
Date: Mon, 15 Oct 2018 16:16:03 -0500
Subject: [PATCH 8/8] fixed variable names and useless lines

---
 cmd/sbserver/parse-csv.py | 151 ++++++++++++++++++++------------------
 1 file changed, 79 insertions(+), 72 deletions(-)

diff --git a/cmd/sbserver/parse-csv.py b/cmd/sbserver/parse-csv.py
index 48691b6..eaf78a6 100644
--- a/cmd/sbserver/parse-csv.py
+++ b/cmd/sbserver/parse-csv.py
@@ -22,8 +22,8 @@
 import datetime
 import sys
 
-urlLookup = dict()
-partialHashLookup = dict()
+URL_lookup = dict() #K:Hash, V: URL
+partial_hash_lookup = dict() #K:Hash, V:partial_hash
 
 def full_unescape(u):
 	uu = urllib.unquote(u)
@@ -120,13 +120,12 @@ def url_permutations(url):
 		for p in url_path_permutations(path):
 			u = '%s%s' % (h, p)
 			if u not in seen_permutations:
-				yield u
-
+				yield u			
 				seen_permutations.add(u)
 
 def digest(url):
 	digest = hashlib.sha256(url.encode('utf-8')).digest()
-	urlLookup[base64.b64encode(digest)] = url
+	URL_lookup[base64.b64encode(digest)] = url
 	return digest
 
 def hashes(url):
@@ -155,73 +154,81 @@ def hashes(url):
 	entries = [	]
 
 
-	i = 0;
-	send = 0
-	with open(sys.argv[2]) as csvfile:
-		with open(sys.argv[3], "w+") as outfile:
-			blacklistWriter = csv.writer(outfile, delimiter=',')
-			blacklistReader = csv.reader(csvfile, delimiter=',')
-			blacklistWriter.writerow(['URL', 'Full Hash', 'Partial Hash', 'UTC Time Stamp', 'Match Type', 'Match Metadata', 'Platform'])
-			next(blacklistReader)
-			for row in blacklistReader:
-				if (i % 200 == 0):
-					send = 1
-				if(row[0][:2] == '//'):
-					inputURL = row[0][2:]
-				else:
-					inputURL = row[0]
-				for permutations in url_permutations(canonical(inputURL)):
-					for hashed in hashes(permutations):
-						hashValue = base64.b64encode(hashed[0:4])
-						partialHashLookup[base64.b64encode(hashed)] = hashValue
-						hashValue.replace('\n', '')
-						newEntry = {'hash': hashValue}
-						entries.append(newEntry)
-				i = i + 1;
-
-				if send == 1:
-					x['threatInfo']['threatEntries'] = entries
-					response = requests.post(url, json=x)
-					responseJSON = json.loads(response.text)
-					if 'matches' in responseJSON:
-						for hits in responseJSON['matches']:
-							if 'threatType' in hits:
-								threatType = hits['threatType']
-							if 'platformType' in hits:
-								platformType = hits['platformType']
-							currentHash = ''
-							if 'threat' in hits:
-								currentHash = hits['threat']['hash']
-							malwareType = ''
-							timestamp = datetime.datetime.utcnow()
-							if 'threatEntryMetadata' in hits:
-								if 'entries' in hits['threatEntryMetadata']:
-									malwareTypeHash = hits['threatEntryMetadata']['entries'][0]['value']
-									if 'TEFORElORw' in malwareTypeHash:
-										malwareType = 'MALWARE LANDING'
-									if 'RElTVFJJQlVUSU9O' in malwareTypeHash:
-										malwareType = 'MALWARE DISTRIBUTION'
-
-							if currentHash in urlLookup:
-								currentURL = urlLookup[currentHash]
-								urlLookup.pop(currentHash)
-							else:
-								currentURL = ""
-
-							if currentHash in partialHashLookup:
-								partialHash = partialHashLookup[currentHash]
-								partialHashLookup.pop(currentHash)
-							else:
-								partialHash = ""
-							blacklistWriter.writerow([currentURL, currentHash, partialHash, timestamp, threatType, malwareType, platformType])
-					send = 0
-					entries = []
-			leftovers = urlLookup.items()
-			for extra in leftovers:
-				blacklistWriter.writerow([extra[1], extra[0], partialHashLookup[extra[0]], timestamp, "", "", ""])
-
-
-
 
+i = 0;
+send = 0
+with open(sys.argv[2]) as csvfile:
+	with open(sys.argv[3], "w+") as outfile:
+		csv_writer = csv.writer(outfile, delimiter=',')
+		blacklist_reader = csv.reader(csvfile, delimiter=',')
+		csv_writer.writerow(['URL', 'Full Hash', 'Partial Hash', 'UTC Time Stamp', 'Match Type', 'Match Metadata', 'Platform'])
+		next(blacklist_reader)
+		for row in blacklist_reader:
+			if (i % 200 == 0):
+				send = 1
+			if(row[0][:2] == '//'):
+				input_URL = row[0][2:]
+			else:
+				input_URL = row[0]
+
+			seen_input_hashes = set()
+			for permutation in url_permutations(canonical(input_URL)):
+				sha256_hash = digest(permutation)
+				if sha256_hash not in seen_input_hashes:
+					partial_hash = base64.b64encode(sha256_hash[0:4])
+					partial_hash_lookup[base64.b64encode(sha256_hash)] = partial_hash
+					new_entry = {'hash': partial_hash}
+					entries.append(new_entry)
+					seen_input_hashes.add(sha256_hash)
+			i = i + 1;
+
+			if send == 1:
+				x['threatInfo']['threatEntries'] = entries
+				response = requests.post(url, json=x)
+				response_JSON = json.loads(response.text)
+				seen_output_hashes = set()
+				if 'matches' in response_JSON:
+					for hits in response_JSON['matches']:
+						if 'threatType' in hits:
+							threat_type = hits['threatType']
+						if 'platformType' in hits:
+							platform_type = hits['platformType']
+						current_hash = ''
+						if 'threat' in hits:
+							current_hash = hits['threat']['hash']
+						timestamp = datetime.datetime.utcnow()
+						malware_type = ''
+						if 'threatEntryMetadata' in hits:
+							if 'entries' in hits['threatEntryMetadata']:
+								malware_type_hash = hits['threatEntryMetadata']['entries'][0]['value']
+								if 'TEFORElORw' in malware_type_hash:
+									malware_type = 'MALWARE LANDING'
+								if 'RElTVFJJQlVUSU9O' in malware_type_hash:
+									malware_type = 'MALWARE DISTRIBUTION'
+
+						if current_hash in URL_lookup:
+							current_URL = URL_lookup[current_hash]
+							URL_lookup.pop(current_hash)
+						else:
+							current_URL = ""
+
+						if current_hash in partial_hash_lookup:
+							partial_hash = partial_hash_lookup[current_hash]
+							partial_hash_lookup.pop(current_hash)
+						else:
+							decoded_hash = base64.b64decode(current_hash)[0:4]
+							partial_hash = base64.b64encode(decoded_hash)
+						new_row = [current_URL, current_hash, partial_hash, timestamp, threat_type, malware_type, platform_type]
+						if new_row[1] not in seen_output_hashes:
+							csv_writer.writerow(new_row)
+							seen_output_hashes.add(new_row[1])
+				send = 0
+				entries = []
+		leftovers = URL_lookup.items()
+		seen_leftover_hashes = set()
+		for extra in leftovers:
+			if extra[0] not in seen_leftover_hashes:
+				csv_writer.writerow([extra[1], extra[0], partial_hash_lookup[extra[0]], timestamp, "", "", ""])
+			seen_leftover_hashes.add(extra[0])