Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions DataGetter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,16 @@
if __name__ == "__main__":
searcher = searchPage.SearchPage()
# urls = searcher.get_page_urls()
urls = searcher.load_urls("./data/case_urls.csv")
data = searcher.collect_data(urls)
data.to_csv("./unodc_export.csv", index=False)
# with open("./all_urls.csv", "w") as f:
# for url in urls:
# f.write(url[0])
# f.write("\n")
urls = searcher.load_urls("./all_urls.csv")
data, new_urls = searcher.collect_data(urls)
with open("./all_urls_fixed.csv", "w") as f:
for url in new_urls:
f.write(url)
f.write("\n")
print(data)
data.to_csv("./unodc_export_full.csv", index=False)
data.to_pickle("./unodc_export_full.pkl")
33 changes: 22 additions & 11 deletions UNODC_WebScrapper/casePage.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def set_url(self, url):
self.html = bs(text, 'html.parser')

def get_all_data(self):
COLUMNS = ["Title", "UNODC_NO", "Summary", "Keywords", "Procedural_Fields", "Procedural_Text", "Victims", "Defendants",
COLUMNS = ["UNODC_NO", "Title", "YEAR", "Country", "Summary", "Keywords", "Procedural_Fields", "Procedural_Text", "Victims", "Defendants",
"Charges"]

title = self.get_title()
Expand All @@ -31,9 +31,17 @@ def get_all_data(self):
if title == "404 Error":
self.set_url(self.url.replace("drugCrimetype","migrantsmugglingcrimetype"))
title = self.get_title()

if title == "404 Error":
self.set_url(self.url.replace("migrantsmugglingcrimetype","moneylaunderingcrimetype"))
title = self.get_title()
if title == "404 Error":
self.set_url(self.url.replace("moneylaunderingcrimetype","cybercrimecrimetype"))
title = self.get_title()
print(self.url)


year = self.url.split("/")[-2]
country = self.get_country()
unodc_no = self.get_unodc_no()
summary = self.get_summary()
keyword_dict = self.get_keywords()
Expand All @@ -42,11 +50,10 @@ def get_all_data(self):
defendants = self.get_defendants()
charges = self.get_charges()

dataframe = pd.DataFrame([[title, unodc_no, summary, keyword_dict, procedural_dict, procedural_text, victims,
dataframe = pd.DataFrame([[unodc_no, title, year, country, summary, keyword_dict, procedural_dict, procedural_text, victims,
defendants, charges]], columns=COLUMNS)


return dataframe
return dataframe, self.url


def get_title(self):
Expand All @@ -57,6 +64,10 @@ def get_unodc_no(self):
decisionDate = self.html.find("div", {"class", "decisionVerdictDate field line"})
return decisionDate.find("div",{"class":"value"}).text if decisionDate else None

def get_country(self):
    """Return the country name shown in the page's flag widget, or None.

    Looks up the flag container div; when present, reads the text of its
    inner ``span.text`` element. Returns ``None`` when the page has no
    country/flag block (e.g. a 404 or malformed case page).
    """
    flag_box = self.html.find("div", {"class": "country icon flip pull-left vcenter"})
    if not flag_box:
        # No flag widget on this page — no country to report.
        return None
    return flag_box.find("span", {"class": "text"}).text

def get_summary(self):
summary = self.html.find("div", {"class": "factSummary"})
if summary:
Expand All @@ -76,7 +87,7 @@ def get_keywords(self):
keyword_dict[label] = []
tags = keyword.find_all("div", {"class": "tags"})
for tag in tags:
keyword_dict[label].append(str(tag.text))
keyword_dict[label].append(tag.text.strip())
return keyword_dict if keywords else None

def get_procedural_info(self):
Expand All @@ -92,25 +103,25 @@ def get_procedural_info(self):
value = keyword.find("div", {"class": "value"})
if label:
label = label.text.replace(":", "")
procedural_dict[label] = value.text
elif "class" in keyword and "proceduralHistoryDescription" in keyword["class"]:
procedural_dict[label] = value.text.strip()
elif keyword.get("class") and "proceduralHistoryDescription" in keyword.get("class"):
procedure_text = " ".join([x.text for x in keyword.find_all("p")])
return (procedural_dict, procedure_text) if keywords else (None, "")

def get_people(self, className):
victim_list = []
victims = self.html.find("div", {"class":className})
victims = self.html.find("div", {"class": className})
if not victims:
return None
for victim in victims.find_all("div", {"class":"person"}):
victim_dict = {}
for keyword in victim.find_all("div"):
label = keyword.find("div", {"class": "label"})
value = keyword.find("div", {"class": "value"})
paragraphs = " ".join([x.text for x in keyword.find_all("p")])
if label and value:
label = label.text.replace(":", "").replace("\n", "")
paragraphs = " ".join([x.text for x in keyword.find_all("p")])
victim_dict[label] = value.text + paragraphs
victim_dict[label.strip()] = value.text.strip() + paragraphs.strip()


victim_list.append(victim_dict)
Expand Down
24 changes: 15 additions & 9 deletions UNODC_WebScrapper/searchPage.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@

class SearchPage:

def __init__(self, search_url="https://sherloc.unodc.org/cld/v3/htms/cldb/search.html?lng=en#?c=%7B%22filters%22:%5B%7B%22fieldName%22:%22en%23caseLaw@country_label_s%22,%22value%22:%22United%20States%20of%20America%22%7D%5D%7D"):
def __init__(self, search_url="https://sherloc.unodc.org/cld/v3/htms/cldb/search.html?lng=en#?c=%7B%22filters%22:%5B%5D,%22sortings%22:%22%22%7D"):
    """Store the SHERLOC case-law search URL to scrape.

    Args:
        search_url (str): Search-results page URL. The default's URL-encoded
            fragment decodes to ``{"filters":[],"sortings":""}`` — i.e. an
            unfiltered, unsorted search across all cases.
    """
    self.search_url = search_url

def get_html(self):
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.implicitly_wait(30)
driver.get(self.search_url)

SCROLL_PAUSE_TIME = 10
SCROLL_PAUSE_TIME = 5

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
Expand All @@ -40,17 +40,21 @@ def get_page_urls(self):
soup = self.get_html()
dates = soup.find_all('div', {"class": "pull-right flip date-value ng-binding"})
titles = soup.find_all("strong", {"class": "ng-binding"})
title_array = [strong.text.replace(" ", "_") for strong in titles]
title_array = [strong.text.replace(" ", "_").replace("/","") for strong in titles]

year_array = [div.text.replace('\n', "").replace(',', '').replace(' ', '').split("-")[0]
for i, div in enumerate(dates)
if i % 2 == 0]

BASE_URL = "https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa"
flags = soup.find_all("img")[2:]
flag_array = [x.get("src").split(".png")[0].split("/")[-1] for x in flags]

BASE_URL = "https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype"
target_urls = []
for title, year in zip(title_array, year_array):
target_urls.append("{base}/{year}//{title}.html?lng=en&tmpl=htms".format(
base=BASE_URL, year=year, title=title).replace("///", "/"))
for title, year, flag in zip(title_array, year_array, flag_array):
target_urls.append( ("{base}/{flag}/{year}/{title}.html?lng=en&tmpl=htms".format(
flag=flag, base=BASE_URL, year=year, title=title).replace("///", "/"),
year))

return target_urls

Expand All @@ -66,11 +70,13 @@ def load_urls(self, path):
def collect_data(self, urls):

df = None
new_urls =[]

for i, url in enumerate(urls):

page = casePage.CasePage(url)
pageData = page.get_all_data()
pageData, new_url = page.get_all_data()
new_urls.append(new_url)
print(pageData)
if df is None:
df = pageData
Expand All @@ -79,7 +85,7 @@ def collect_data(self, urls):

df.reset_index(inplace=True)
df.drop("index", axis=1, inplace=True)
return df
return df, new_urls



Expand Down
4 changes: 2 additions & 2 deletions data/case_urls.csv
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,8 @@ https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/1993/
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/1986//United_States_v._William_Alexander_Lewis_.html?lng=en&tmpl=htms
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/1985//United_States_v._Steven_Lane_Crawford_85-2105.html?lng=en&tmpl=htms
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/1981//United_States_v._Booker.html?lng=en&tmpl=htms
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/YYYY//Case_1014.html?lng=en&tmpl=htms
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/YYYY//Case_1314.html?lng=en&tmpl=htms
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/Case_1014.html?lng=en&tmpl=htms
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/Case_1314.html?lng=en&tmpl=htms
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/United_States_v._Abdenasser_Ennassime.html?lng=en&tmpl=htms
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/United_States_v._Jacobo_Dominguez_Vazquez.html?lng=en&tmpl=htms
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/United_States_v._Greg_Parsons.html?lng=en&tmpl=htms
Expand Down