diff --git a/DataGetter.py b/DataGetter.py
index 0264b8c..6499b03 100644
--- a/DataGetter.py
+++ b/DataGetter.py
@@ -3,6 +3,16 @@
 if __name__ == "__main__":
     searcher = searchPage.SearchPage()
     # urls = searcher.get_page_urls()
-    urls = searcher.load_urls("./data/case_urls.csv")
-    data = searcher.collect_data(urls)
-    data.to_csv("./unodc_export.csv", index=False)
+    # with open("./all_urls.csv", "w") as f:
+    #     for url in urls:
+    #         f.write(url[0])
+    #         f.write("\n")
+    urls = searcher.load_urls("./all_urls.csv")
+    data, new_urls = searcher.collect_data(urls)
+    with open("./all_urls_fixed.csv", "w") as f:
+        for url in new_urls:
+            f.write(url)
+            f.write("\n")
+    print(data)
+    data.to_csv("./unodc_export_full.csv", index=False)
+    data.to_pickle("./unodc_export_full.pkl")
diff --git a/UNODC_WebScrapper/casePage.py b/UNODC_WebScrapper/casePage.py
index a25ad8b..398c3c0 100644
--- a/UNODC_WebScrapper/casePage.py
+++ b/UNODC_WebScrapper/casePage.py
@@ -17,7 +17,7 @@ def set_url(self, url):
         self.html = bs(text, 'html.parser')
 
     def get_all_data(self):
-        COLUMNS = ["Title", "UNODC_NO", "Summary", "Keywords", "Procedural_Fields", "Procedural_Text", "Victims", "Defendants",
+        COLUMNS = ["UNODC_NO", "Title", "YEAR", "Country", "Summary", "Keywords", "Procedural_Fields", "Procedural_Text", "Victims", "Defendants",
                    "Charges"]
 
         title = self.get_title()
@@ -31,9 +31,17 @@ def get_all_data(self):
         if title == "404 Error":
             self.set_url(self.url.replace("drugCrimetype","migrantsmugglingcrimetype"))
             title = self.get_title()
-
+        if title == "404 Error":
+            self.set_url(self.url.replace("migrantsmugglingcrimetype","moneylaunderingcrimetype"))
+            title = self.get_title()
+        if title == "404 Error":
+            self.set_url(self.url.replace("moneylaunderingcrimetype","cybercrimecrimetype"))
+            title = self.get_title()
         print(self.url)
+
+        year = self.url.split("/")[-2]
+        country = self.get_country()
 
         unodc_no = self.get_unodc_no()
         summary = self.get_summary()
         keyword_dict = self.get_keywords()
@@ -42,11 +50,10 @@ def get_all_data(self):
         defendants = self.get_defendants()
         charges = self.get_charges()
 
-        dataframe = pd.DataFrame([[title, unodc_no, summary, keyword_dict, procedural_dict, procedural_text, victims,
+        dataframe = pd.DataFrame([[unodc_no, title, year, country, summary, keyword_dict, procedural_dict, procedural_text, victims,
                                    defendants, charges]], columns=COLUMNS)
-
-        return dataframe
+        return dataframe, self.url
 
     def get_title(self):
@@ -57,6 +64,10 @@ def get_unodc_no(self):
         decisionDate = self.html.find("div", {"class", "decisionVerdictDate field line"})
         return decisionDate.find("div",{"class":"value"}).text if decisionDate else None
 
+    def get_country(self):
+        country_flag = self.html.find("div", {"class":"country icon flip pull-left vcenter"})
+        return country_flag.find("span", {"class": "text"}).text if country_flag else None
+
     def get_summary(self):
         summary = self.html.find("div", {"class": "factSummary"})
         if summary:
@@ -76,7 +87,7 @@ def get_keywords(self):
                 keyword_dict[label] = []
                 tags = keyword.find_all("div", {"class": "tags"})
                 for tag in tags:
-                    keyword_dict[label].append(str(tag.text))
+                    keyword_dict[label].append(tag.text.strip())
         return keyword_dict if keywords else None
 
     def get_procedural_info(self):
@@ -92,14 +103,14 @@ def get_procedural_info(self):
             value = keyword.find("div", {"class": "value"})
             if label:
                 label = label.text.replace(":", "")
-                procedural_dict[label] = value.text
-            elif "class" in keyword and "proceduralHistoryDescription" in keyword["class"]:
+                procedural_dict[label] = value.text.strip()
+            elif keyword.get("class") and "proceduralHistoryDescription" in keyword.get("class"):
                 procedure_text = " ".join([x.text for x in keyword.find_all("p")])
         return (procedural_dict, procedure_text) if keywords else (None, "")
 
     def get_people(self, className):
         victim_list = []
-        victims = self.html.find("div", {"class":className})
+        victims = self.html.find("div", {"class": className})
         if not victims:
             return None
         for victim in victims.find_all("div", {"class":"person"}):
@@ -107,10 +118,10 @@ def get_people(self, className):
             for keyword in victim.find_all("div"):
                 label = keyword.find("div", {"class": "label"})
                 value = keyword.find("div", {"class": "value"})
+                paragraphs = " ".join([x.text for x in keyword.find_all("p")])
                 if label and value:
                     label = label.text.replace(":", "").replace("\n", "")
-                    paragraphs = " ".join([x.text for x in keyword.find_all("p")])
-                    victim_dict[label] = value.text + paragraphs
+                    victim_dict[label.strip()] = value.text.strip() + paragraphs.strip()
 
             victim_list.append(victim_dict)
diff --git a/UNODC_WebScrapper/searchPage.py b/UNODC_WebScrapper/searchPage.py
index 26fcb48..69425ff 100644
--- a/UNODC_WebScrapper/searchPage.py
+++ b/UNODC_WebScrapper/searchPage.py
@@ -8,7 +8,7 @@
 
 
 class SearchPage:
-    def __init__(self, search_url="https://sherloc.unodc.org/cld/v3/htms/cldb/search.html?lng=en#?c=%7B%22filters%22:%5B%7B%22fieldName%22:%22en%23caseLaw@country_label_s%22,%22value%22:%22United%20States%20of%20America%22%7D%5D%7D"):
+    def __init__(self, search_url="https://sherloc.unodc.org/cld/v3/htms/cldb/search.html?lng=en#?c=%7B%22filters%22:%5B%5D,%22sortings%22:%22%22%7D"):
         self.search_url = search_url
 
     def get_html(self):
@@ -16,7 +16,7 @@ def get_html(self):
         driver.implicitly_wait(30)
         driver.get(self.search_url)
 
-        SCROLL_PAUSE_TIME = 10
+        SCROLL_PAUSE_TIME = 5
 
         # Get scroll height
         last_height = driver.execute_script("return document.body.scrollHeight")
@@ -40,17 +40,21 @@ def get_page_urls(self):
         soup = self.get_html()
         dates = soup.find_all('div', {"class": "pull-right flip date-value ng-binding"})
         titles = soup.find_all("strong", {"class": "ng-binding"})
-        title_array = [strong.text.replace(" ", "_") for strong in titles]
+        title_array = [strong.text.replace(" ", "_").replace("/","") for strong in titles]
         year_array = [div.text.replace('\n', "").replace(',', '').replace(' ', '').split("-")[0]
                       for i, div in enumerate(dates) if i % 2 == 0]
 
-        BASE_URL = "https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa"
+        flags = soup.find_all("img")[2:]
+        flag_array = [x.get("src").split(".png")[0].split("/")[-1] for x in flags]
+
+        BASE_URL = "https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype"
 
         target_urls = []
-        for title, year in zip(title_array, year_array):
-            target_urls.append("{base}/{year}//{title}.html?lng=en&tmpl=htms".format(
-                base=BASE_URL, year=year, title=title).replace("///", "/"))
+        for title, year, flag in zip(title_array, year_array, flag_array):
+            target_urls.append(("{base}/{flag}/{year}/{title}.html?lng=en&tmpl=htms".format(
+                flag=flag, base=BASE_URL, year=year, title=title).replace("///", "/"),
+                year))
 
         return target_urls
@@ -66,11 +70,13 @@ def load_urls(self, path):
 
     def collect_data(self, urls):
         df = None
+        new_urls = []
 
         for i, url in enumerate(urls):
             page = casePage.CasePage(url)
-            pageData = page.get_all_data()
+            pageData, new_url = page.get_all_data()
+            new_urls.append(new_url)
             print(pageData)
             if df is None:
                 df = pageData
@@ -79,7 +85,7 @@ def collect_data(self, urls):
 
         df.reset_index(inplace=True)
         df.drop("index", axis=1, inplace=True)
 
-        return df
+        return df, new_urls
diff --git a/data/case_urls.csv b/data/case_urls.csv
index adf51d5..dade403 100644
--- a/data/case_urls.csv
+++ b/data/case_urls.csv
@@ -161,8 +161,8 @@ https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/1993/
 https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/1986//United_States_v._William_Alexander_Lewis_.html?lng=en&tmpl=htms
 https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/1985//United_States_v._Steven_Lane_Crawford_85-2105.html?lng=en&tmpl=htms
 https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/1981//United_States_v._Booker.html?lng=en&tmpl=htms
-https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/YYYY//Case_1014.html?lng=en&tmpl=htms
-https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/YYYY//Case_1314.html?lng=en&tmpl=htms
+https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/Case_1014.html?lng=en&tmpl=htms
+https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/Case_1314.html?lng=en&tmpl=htms
 https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/United_States_v._Abdenasser_Ennassime.html?lng=en&tmpl=htms
 https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/United_States_v._Jacobo_Dominguez_Vazquez.html?lng=en&tmpl=htms
 https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/United_States_v._Greg_Parsons.html?lng=en&tmpl=htms