Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions DataGetter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,16 @@
if __name__ == "__main__":
searcher = searchPage.SearchPage()
# urls = searcher.get_page_urls()
urls = searcher.load_urls("./data/case_urls.csv")
data = searcher.collect_data(urls)
data.to_csv("./unodc_export.csv", index=False)
# with open("./all_urls.csv", "w") as f:
# for url in urls:
# f.write(url[0])
# f.write("\n")
urls = searcher.load_urls("./all_urls.csv")
data, new_urls = searcher.collect_data(urls)
with open("./all_urls_fixed.csv", "w") as f:
for url in new_urls:
f.write(url)
f.write("\n")
print(data)
data.to_csv("./unodc_export_full.csv", index=False)
data.to_pickle("./unodc_export_full.pkl")
33 changes: 22 additions & 11 deletions UNODC_WebScrapper/casePage.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def set_url(self, url):
self.html = bs(text, 'html.parser')

def get_all_data(self):
COLUMNS = ["Title", "UNODC_NO", "Summary", "Keywords", "Procedural_Fields", "Procedural_Text", "Victims", "Defendants",
COLUMNS = ["UNODC_NO", "Title", "YEAR", "Country", "Summary", "Keywords", "Procedural_Fields", "Procedural_Text", "Victims", "Defendants",
"Charges"]

title = self.get_title()
Expand All @@ -31,9 +31,17 @@ def get_all_data(self):
if title == "404 Error":
self.set_url(self.url.replace("drugCrimetype","migrantsmugglingcrimetype"))
title = self.get_title()

if title == "404 Error":
self.set_url(self.url.replace("migrantsmugglingcrimetype","moneylaunderingcrimetype"))
title = self.get_title()
if title == "404 Error":
self.set_url(self.url.replace("moneylaunderingcrimetype","cybercrimecrimetype"))
title = self.get_title()
print(self.url)


year = self.url.split("/")[-2]
country = self.get_country()
unodc_no = self.get_unodc_no()
summary = self.get_summary()
keyword_dict = self.get_keywords()
Expand All @@ -42,11 +50,10 @@ def get_all_data(self):
defendants = self.get_defendants()
charges = self.get_charges()

dataframe = pd.DataFrame([[title, unodc_no, summary, keyword_dict, procedural_dict, procedural_text, victims,
dataframe = pd.DataFrame([[unodc_no, title, year, country, summary, keyword_dict, procedural_dict, procedural_text, victims,
defendants, charges]], columns=COLUMNS)


return dataframe
return dataframe, self.url


def get_title(self):
Expand All @@ -57,6 +64,10 @@ def get_unodc_no(self):
decisionDate = self.html.find("div", {"class", "decisionVerdictDate field line"})
return decisionDate.find("div",{"class":"value"}).text if decisionDate else None

def get_country(self):
    """Return the country name shown in the page's flag widget, or None.

    Looks up the flag container div; when present, reads the text of its
    inner ``span.text`` element. Returns ``None`` when the page has no
    country/flag block (e.g. a 404 or malformed case page).
    """
    flag_box = self.html.find("div", {"class": "country icon flip pull-left vcenter"})
    if not flag_box:
        # No flag widget on this page — no country to report.
        return None
    return flag_box.find("span", {"class": "text"}).text

def get_summary(self):
summary = self.html.find("div", {"class": "factSummary"})
if summary:
Expand All @@ -76,7 +87,7 @@ def get_keywords(self):
keyword_dict[label] = []
tags = keyword.find_all("div", {"class": "tags"})
for tag in tags:
keyword_dict[label].append(str(tag.text))
keyword_dict[label].append(tag.text.strip())
return keyword_dict if keywords else None

def get_procedural_info(self):
Expand All @@ -92,25 +103,25 @@ def get_procedural_info(self):
value = keyword.find("div", {"class": "value"})
if label:
label = label.text.replace(":", "")
procedural_dict[label] = value.text
elif "class" in keyword and "proceduralHistoryDescription" in keyword["class"]:
procedural_dict[label] = value.text.strip()
elif keyword.get("class") and "proceduralHistoryDescription" in keyword.get("class"):
procedure_text = " ".join([x.text for x in keyword.find_all("p")])
return (procedural_dict, procedure_text) if keywords else (None, "")

def get_people(self, className):
victim_list = []
victims = self.html.find("div", {"class":className})
victims = self.html.find("div", {"class": className})
if not victims:
return None
for victim in victims.find_all("div", {"class":"person"}):
victim_dict = {}
for keyword in victim.find_all("div"):
label = keyword.find("div", {"class": "label"})
value = keyword.find("div", {"class": "value"})
paragraphs = " ".join([x.text for x in keyword.find_all("p")])
if label and value:
label = label.text.replace(":", "").replace("\n", "")
paragraphs = " ".join([x.text for x in keyword.find_all("p")])
victim_dict[label] = value.text + paragraphs
victim_dict[label.strip()] = value.text.strip() + paragraphs.strip()


victim_list.append(victim_dict)
Expand Down
24 changes: 15 additions & 9 deletions UNODC_WebScrapper/searchPage.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@

class SearchPage:

def __init__(self, search_url="https://sherloc.unodc.org/cld/v3/htms/cldb/search.html?lng=en#?c=%7B%22filters%22:%5B%7B%22fieldName%22:%22en%23caseLaw@country_label_s%22,%22value%22:%22United%20States%20of%20America%22%7D%5D%7D"):
def __init__(self, search_url="https://sherloc.unodc.org/cld/v3/htms/cldb/search.html?lng=en#?c=%7B%22filters%22:%5B%5D,%22sortings%22:%22%22%7D"):
    """Store the SHERLOC case-law search URL to scrape.

    Args:
        search_url (str): Search-results page URL. The default's URL-encoded
            fragment decodes to ``{"filters":[],"sortings":""}`` — i.e. an
            unfiltered, unsorted search across all cases.
    """
    self.search_url = search_url

def get_html(self):
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.implicitly_wait(30)
driver.get(self.search_url)

SCROLL_PAUSE_TIME = 10
SCROLL_PAUSE_TIME = 5

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
Expand All @@ -40,17 +40,21 @@ def get_page_urls(self):
soup = self.get_html()
dates = soup.find_all('div', {"class": "pull-right flip date-value ng-binding"})
titles = soup.find_all("strong", {"class": "ng-binding"})
title_array = [strong.text.replace(" ", "_") for strong in titles]
title_array = [strong.text.replace(" ", "_").replace("/","") for strong in titles]

year_array = [div.text.replace('\n', "").replace(',', '').replace(' ', '').split("-")[0]
for i, div in enumerate(dates)
if i % 2 == 0]

BASE_URL = "https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa"
flags = soup.find_all("img")[2:]
flag_array = [x.get("src").split(".png")[0].split("/")[-1] for x in flags]

BASE_URL = "https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype"
target_urls = []
for title, year in zip(title_array, year_array):
target_urls.append("{base}/{year}//{title}.html?lng=en&tmpl=htms".format(
base=BASE_URL, year=year, title=title).replace("///", "/"))
for title, year, flag in zip(title_array, year_array, flag_array):
target_urls.append( ("{base}/{flag}/{year}/{title}.html?lng=en&tmpl=htms".format(
flag=flag, base=BASE_URL, year=year, title=title).replace("///", "/"),
year))

return target_urls

Expand All @@ -66,11 +70,13 @@ def load_urls(self, path):
def collect_data(self, urls):

df = None
new_urls =[]

for i, url in enumerate(urls):

page = casePage.CasePage(url)
pageData = page.get_all_data()
pageData, new_url = page.get_all_data()
new_urls.append(new_url)
print(pageData)
if df is None:
df = pageData
Expand All @@ -79,7 +85,7 @@ def collect_data(self, urls):

df.reset_index(inplace=True)
df.drop("index", axis=1, inplace=True)
return df
return df, new_urls



Expand Down
4 changes: 2 additions & 2 deletions data/case_urls.csv
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,8 @@ https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/1993/
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/1986//United_States_v._William_Alexander_Lewis_.html?lng=en&tmpl=htms
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/1985//United_States_v._Steven_Lane_Crawford_85-2105.html?lng=en&tmpl=htms
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/1981//United_States_v._Booker.html?lng=en&tmpl=htms
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/YYYY//Case_1014.html?lng=en&tmpl=htms
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/YYYY//Case_1314.html?lng=en&tmpl=htms
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/Case_1014.html?lng=en&tmpl=htms
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/Case_1314.html?lng=en&tmpl=htms
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/United_States_v._Abdenasser_Ennassime.html?lng=en&tmpl=htms
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/United_States_v._Jacobo_Dominguez_Vazquez.html?lng=en&tmpl=htms
https://sherloc.unodc.org/cld/case-law-doc/traffickingpersonscrimetype/usa/United_States_v._Greg_Parsons.html?lng=en&tmpl=htms
Expand Down