From 693fd30d3e2c637ca5d9cb1ec36ef0ae4155b917 Mon Sep 17 00:00:00 2001 From: akeylg Date: Fri, 18 Feb 2022 18:08:09 -0500 Subject: [PATCH 01/17] added a scraper for baccalaureate majors. SCUFFED --- src/baccalaureate_scraper.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 src/baccalaureate_scraper.py diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py new file mode 100644 index 0000000..54ef20d --- /dev/null +++ b/src/baccalaureate_scraper.py @@ -0,0 +1,26 @@ +from bs4 import BeautifulSoup +import requests +import time +start_time = time.time() +r = requests.get("http://catalog.rpi.edu/content.php?catoid=22&navoid=542") +#print(r.content) + +baccalaureate = {} +soup = BeautifulSoup(r.content, 'html5lib') +#print(soup) +table = soup.find("td", class_="block_content") +major_elements = table.find_all("ul", class_="program-list") +ba_elements = major_elements[0].find_all("a") +for z in ba_elements: + #print(z.text.strip()) + major_title = z.text.strip() + major_link = z.get('href') + baccalaureate[major_title] = {"description": " ", "requirements": " "} + major_webpage = requests.get("http://catalog.rpi.edu/" + major_link) + major_soup = BeautifulSoup(major_webpage.content, 'html5lib') + major_table = major_soup.find("table", class_="table_default") + description_tr = major_table.find("tr") + requirement_tr = description_tr.find_next("tr") + baccalaureate[major_title]["description"] = description_tr + baccalaureate[major_title]["requirements"] = requirement_tr +print(time.time() - start_time) From c0a890476ec5cfa3aa236b56c44278caa0a186ed Mon Sep 17 00:00:00 2001 From: akeylg Date: Sun, 20 Feb 2022 09:32:55 -0500 Subject: [PATCH 02/17] fixed issue and lowered runtime by 6s --- src/baccalaureate_scraper.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py index 54ef20d..296cbba 100644 --- a/src/baccalaureate_scraper.py +++ b/src/baccalaureate_scraper.py @@ -1,10 +1,10 @@ +from tracemalloc import start from bs4 import BeautifulSoup import requests import time -start_time = time.time() r = requests.get("http://catalog.rpi.edu/content.php?catoid=22&navoid=542") #print(r.content) - +start_time = time.time() baccalaureate = {} soup = BeautifulSoup(r.content, 'html5lib') #print(soup) @@ -12,15 +12,16 @@ major_elements = table.find_all("ul", class_="program-list") ba_elements = major_elements[0].find_all("a") for z in ba_elements: - #print(z.text.strip()) major_title = z.text.strip() major_link = z.get('href') baccalaureate[major_title] = {"description": " ", "requirements": " "} major_webpage = requests.get("http://catalog.rpi.edu/" + major_link) major_soup = BeautifulSoup(major_webpage.content, 'html5lib') - major_table = major_soup.find("table", class_="table_default") - description_tr = major_table.find("tr") - requirement_tr = description_tr.find_next("tr") + major_table = major_soup.find("td", class_="block_content") + major_table_def = major_table.find("table", class_="table_default") + trs = major_table_def.find_all("tr") + description_tr = trs[0] + requirement_tr = trs[3] baccalaureate[major_title]["description"] = description_tr - baccalaureate[major_title]["requirements"] = requirement_tr -print(time.time() - start_time) + baccalaureate[major_title]["requirements"] = requirement_tr +print(baccalaureate) From 4ec7b11d43c3885f6b9906f09aa2b0ab62404d6d Mon Sep 17 00:00:00 2001 From: akeylg Date: Sun, 20 Feb 2022 09:33:45 -0500 Subject: [PATCH 03/17] got rid of accidental test print --- src/baccalaureate_scraper.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py index 296cbba..07f49c9 100644 --- a/src/baccalaureate_scraper.py +++ b/src/baccalaureate_scraper.py @@ -1,13 +1,9 @@ from tracemalloc import start from bs4 import BeautifulSoup import requests -import time r = requests.get("http://catalog.rpi.edu/content.php?catoid=22&navoid=542") -#print(r.content) -start_time = time.time() baccalaureate = {} soup = BeautifulSoup(r.content, 'html5lib') -#print(soup) table = soup.find("td", class_="block_content") major_elements = table.find_all("ul", class_="program-list") ba_elements = major_elements[0].find_all("a") @@ -24,4 +20,3 @@ requirement_tr = trs[3] baccalaureate[major_title]["description"] = description_tr baccalaureate[major_title]["requirements"] = requirement_tr -print(baccalaureate) From c55d2a9c666227f0ee3030d72e89e708fa622f10 Mon Sep 17 00:00:00 2001 From: akeylg Date: Fri, 4 Mar 2022 17:02:24 -0500 Subject: [PATCH 04/17] started to implement further scraper --- src/baccalaureate_scraper.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py index 07f49c9..9ad1a78 100644 --- a/src/baccalaureate_scraper.py +++ b/src/baccalaureate_scraper.py @@ -10,7 +10,10 @@ for z in ba_elements: major_title = z.text.strip() major_link = z.get('href') - baccalaureate[major_title] = {"description": " ", "requirements": " "} + baccalaureate[major_title] = {"description": " ", "requirements": " ", + "years": [[], [], [], []], + "other-content": {}} + baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " "} major_webpage = requests.get("http://catalog.rpi.edu/" + major_link) major_soup = BeautifulSoup(major_webpage.content, 'html5lib') major_table = major_soup.find("td", class_="block_content") @@ -19,4 +22,13 @@ description_tr = trs[0] requirement_tr = trs[3] baccalaureate[major_title]["description"] = description_tr - baccalaureate[major_title]["requirements"] = requirement_tr + baccalaureate[major_title]["requirements"] = requirement_tr + yearsHTML = requirement_tr.find("div", class_="custom_leftpad_20") + yeartitlelist = yearsHTML.find_all("div", class_ = "acalog-core") + yeardescriptionlist = yearsHTML.find_all("div", class_="custom_leftpad_20") + for x in range(0, len(yeartitlelist)): + baccalaureate[major_title]["years"][x].add(yeartitlelist[x]) + baccalaureate[major_title]["years"][x].add(yeardescriptionlist[x]) + + print("----------------------") + break From b863e476098eafe2f847e4570bb3f893614a9f9a Mon Sep 17 00:00:00 2001 From: akeylg Date: Tue, 15 Mar 2022 16:15:21 -0400 Subject: [PATCH 05/17] progress --- src/baccalaureate_scraper.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py index 9ad1a78..8881750 100644 --- a/src/baccalaureate_scraper.py +++ b/src/baccalaureate_scraper.py @@ -26,9 +26,11 @@ yearsHTML = requirement_tr.find("div", class_="custom_leftpad_20") yeartitlelist = yearsHTML.find_all("div", class_ = "acalog-core") yeardescriptionlist = yearsHTML.find_all("div", class_="custom_leftpad_20") + #print(yeartitlelist) + print(yeardescriptionlist) for x in range(0, len(yeartitlelist)): - baccalaureate[major_title]["years"][x].add(yeartitlelist[x]) - baccalaureate[major_title]["years"][x].add(yeardescriptionlist[x]) + baccalaureate[major_title]["years"][x].append(yeartitlelist[x]) + baccalaureate[major_title]["years"][x].append(yeardescriptionlist[x]) print("----------------------") break From 123cc8f189a7cb161b10ec5e7ad2f95a24490c6a Mon Sep 17 00:00:00 2001 From: akeylg Date: Tue, 15 Mar 2022 16:32:28 -0400 Subject: [PATCH 06/17] year raw HTML now works --- src/baccalaureate_scraper.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py index 8881750..b17b508 100644 --- a/src/baccalaureate_scraper.py +++ b/src/baccalaureate_scraper.py @@ -24,13 +24,22 @@ baccalaureate[major_title]["description"] = description_tr baccalaureate[major_title]["requirements"] = requirement_tr yearsHTML = requirement_tr.find("div", class_="custom_leftpad_20") - yeartitlelist = yearsHTML.find_all("div", class_ = "acalog-core") - yeardescriptionlist = yearsHTML.find_all("div", class_="custom_leftpad_20") - #print(yeartitlelist) - print(yeardescriptionlist) - for x in range(0, len(yeartitlelist)): - baccalaureate[major_title]["years"][x].append(yeartitlelist[x]) - baccalaureate[major_title]["years"][x].append(yeardescriptionlist[x]) + yeartitlelist = yearsHTML.find("div", class_ = "acalog-core") + yeardescriptionlist = yearsHTML.find("div", class_="custom_leftpad_20") + count = 0 + while(yeartitlelist != None): + if count > 3: + break + baccalaureate[major_title]["years"][count].append(yeartitlelist) + baccalaureate[major_title]["years"][count].append(yeartitlelist.next_sibling) + yeartitlelist = yeartitlelist.next_sibling.next_sibling + count += 1 + for x in baccalaureate[major_title]["years"]: + print(x) + print("--------------------------------------") + #for x in range(0, len(yeartitlelist)): + #baccalaureate[major_title]["years"][x].append(yeartitlelist[x]) + #baccalaureate[major_title]["years"][x].append(yeardescriptionlist[x]) print("----------------------") break From e382e9c54c9704b67c01d2b4537f5e17118a87c7 Mon Sep 17 00:00:00 2001 From: akeylg Date: Tue, 15 Mar 2022 16:44:02 -0400 Subject: [PATCH 07/17] proto year description working, still needs work --- src/baccalaureate_scraper.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py index b17b508..b1a6d8d 100644 --- a/src/baccalaureate_scraper.py +++ b/src/baccalaureate_scraper.py @@ -11,7 +11,7 @@ major_title = z.text.strip() major_link = z.get('href') baccalaureate[major_title] = {"description": " ", "requirements": " ", - "years": [[], [], [], []], + "years": [[], [], [], [], []], "other-content": {}} baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " "} major_webpage = requests.get("http://catalog.rpi.edu/" + major_link) @@ -24,22 +24,20 @@ baccalaureate[major_title]["description"] = description_tr baccalaureate[major_title]["requirements"] = requirement_tr yearsHTML = requirement_tr.find("div", class_="custom_leftpad_20") - yeartitlelist = yearsHTML.find("div", class_ = "acalog-core") - yeardescriptionlist = yearsHTML.find("div", class_="custom_leftpad_20") + if yearsHTML == None: + continue + yearlist = yearsHTML.find("div", class_ = "acalog-core") count = 0 - while(yeartitlelist != None): + while(yearlist != None): if count > 3: break - baccalaureate[major_title]["years"][count].append(yeartitlelist) - baccalaureate[major_title]["years"][count].append(yeartitlelist.next_sibling) - yeartitlelist = yeartitlelist.next_sibling.next_sibling + baccalaureate[major_title]["years"][count].append(yearlist) + baccalaureate[major_title]["years"][count].append(yearlist.next_sibling) + if yearlist.next_sibling == None or yearlist.next_sibling.next_sibing == None: + break + yearlist = yearlist.next_sibling.next_sibling count += 1 - for x in baccalaureate[major_title]["years"]: - print(x) - print("--------------------------------------") - #for x in range(0, len(yeartitlelist)): - #baccalaureate[major_title]["years"][x].append(yeartitlelist[x]) - #baccalaureate[major_title]["years"][x].append(yeardescriptionlist[x]) + print("----------------------") break From 3eef023c0aa4f3dbb3f66737a9922e9b670dc714 Mon Sep 17 00:00:00 2001 From: akeylg Date: Tue, 22 Mar 2022 16:57:55 -0400 Subject: [PATCH 08/17] fixed some stuff and implemented footnotes --- src/baccalaureate_scraper.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py index b1a6d8d..e096da8 100644 --- a/src/baccalaureate_scraper.py +++ b/src/baccalaureate_scraper.py @@ -1,5 +1,6 @@ -from tracemalloc import start +from cgi import test from bs4 import BeautifulSoup +from numpy import equal import requests r = requests.get("http://catalog.rpi.edu/content.php?catoid=22&navoid=542") baccalaureate = {} @@ -7,13 +8,16 @@ table = soup.find("td", class_="block_content") major_elements = table.find_all("ul", class_="program-list") ba_elements = major_elements[0].find_all("a") + for z in ba_elements: major_title = z.text.strip() major_link = z.get('href') + baccalaureate[major_title] = {"description": " ", "requirements": " ", "years": [[], [], [], [], []], "other-content": {}} baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " "} + major_webpage = requests.get("http://catalog.rpi.edu/" + major_link) major_soup = BeautifulSoup(major_webpage.content, 'html5lib') major_table = major_soup.find("td", class_="block_content") @@ -24,20 +28,30 @@ baccalaureate[major_title]["description"] = description_tr baccalaureate[major_title]["requirements"] = requirement_tr yearsHTML = requirement_tr.find("div", class_="custom_leftpad_20") + if yearsHTML == None: continue yearlist = yearsHTML.find("div", class_ = "acalog-core") count = 0 + yearcount = 0 + print(yearsHTML) + print("---------------------------") while(yearlist != None): - if count > 3: - break + print(yearlist) + try: + testString = yearlist.h2.a + if "Footnotes" in testString: + baccalaureate[major_title]["other_content"]["footnotes"].append(yearlist) + except: + print("no h2") baccalaureate[major_title]["years"][count].append(yearlist) - baccalaureate[major_title]["years"][count].append(yearlist.next_sibling) - if yearlist.next_sibling == None or yearlist.next_sibling.next_sibing == None: - break - yearlist = yearlist.next_sibling.next_sibling - count += 1 - + yearlist = yearlist.next_sibling + yearcount += 1 + if yearcount > 1: + yearcount = 0 + count += 1 + print("-------") + #print(baccalaureate[major_title]["years"]) print("----------------------") - break + break \ No newline at end of file From b9470c302af7c3fb09a3c6291fb451f94771cb91 Mon Sep 17 00:00:00 2001 From: akeylg Date: Tue, 22 Mar 2022 17:06:24 -0400 Subject: [PATCH 09/17] added options and transfer credit policy --- src/baccalaureate_scraper.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py index e096da8..7d7493c 100644 --- a/src/baccalaureate_scraper.py +++ b/src/baccalaureate_scraper.py @@ -34,17 +34,26 @@ yearlist = yearsHTML.find("div", class_ = "acalog-core") count = 0 yearcount = 0 - print(yearsHTML) - print("---------------------------") + #print(yearsHTML) + #print("---------------------------") while(yearlist != None): print(yearlist) - try: - testString = yearlist.h2.a - if "Footnotes" in testString: - baccalaureate[major_title]["other_content"]["footnotes"].append(yearlist) - except: - print("no h2") - baccalaureate[major_title]["years"][count].append(yearlist) + if (count > 3 and major_title != "Architecture") or (count > 4 and major_title == "Architecture"): + try: + testString = yearlist.h2.a + print(testString) + if "Footnotes" in testString: + baccalaureate[major_title]["other_content"]["footnotes"].append(yearlist) + if "Capstone" in testString: + baccalaureate[major_title]["other_content"]["capstone"].append(yearlist) + if "Transfer Credit Policy" in testString: + baccalaureate[major_title]["other_content"]["transfer_policy"].append(yearlist) + if "Options" in testString: + baccalaureate[major_title]["other_content"]["options"].append(yearlist) + except: + print("no h2") + else: + baccalaureate[major_title]["years"][count].append(yearlist) yearlist = yearlist.next_sibling yearcount += 1 if yearcount > 1: @@ -53,5 +62,4 @@ print("-------") #print(baccalaureate[major_title]["years"]) - print("----------------------") - break \ No newline at end of file + print("----------------------") \ No newline at end of file From 9f5e1689e51b0d9365caf83f587dfc7ae70adbf9 Mon Sep 17 00:00:00 2001 From: akeylg Date: Tue, 22 Mar 2022 17:10:46 -0400 Subject: [PATCH 10/17] added list for misc information incase a major has one --- src/baccalaureate_scraper.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py index 7d7493c..a1839d5 100644 --- a/src/baccalaureate_scraper.py +++ b/src/baccalaureate_scraper.py @@ -16,7 +16,7 @@ baccalaureate[major_title] = {"description": " ", "requirements": " ", "years": [[], [], [], [], []], "other-content": {}} - baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " "} + baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " ", "misc": " "} major_webpage = requests.get("http://catalog.rpi.edu/" + major_link) major_soup = BeautifulSoup(major_webpage.content, 'html5lib') @@ -37,21 +37,23 @@ #print(yearsHTML) #print("---------------------------") while(yearlist != None): - print(yearlist) + #print(yearlist) if (count > 3 and major_title != "Architecture") or (count > 4 and major_title == "Architecture"): try: testString = yearlist.h2.a - print(testString) + #print(testString) if "Footnotes" in testString: baccalaureate[major_title]["other_content"]["footnotes"].append(yearlist) - if "Capstone" in testString: + elif "Capstone" in testString: baccalaureate[major_title]["other_content"]["capstone"].append(yearlist) - if "Transfer Credit Policy" in testString: + elif "Transfer Credit Policy" in testString: baccalaureate[major_title]["other_content"]["transfer_policy"].append(yearlist) - if "Options" in testString: + elif "Options" in testString: baccalaureate[major_title]["other_content"]["options"].append(yearlist) + else: + baccalaureate[major_title]["other_content"]["misc"].append(yearlist) except: - print("no h2") + continue else: baccalaureate[major_title]["years"][count].append(yearlist) yearlist = yearlist.next_sibling @@ -59,7 +61,7 @@ if yearcount > 1: yearcount = 0 count += 1 - print("-------") + #print("-------") #print(baccalaureate[major_title]["years"]) - print("----------------------") \ No newline at end of file + #print("----------------------") \ No newline at end of file From fe911a0d6895eae3b3e9c9d3f6cbd962e9c21d5c Mon Sep 17 00:00:00 2001 From: akeylg Date: Thu, 24 Mar 2022 01:04:45 -0400 Subject: [PATCH 11/17] fixed some other things --- src/baccalaureate_scraper.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py index a1839d5..c98f374 100644 --- a/src/baccalaureate_scraper.py +++ b/src/baccalaureate_scraper.py @@ -16,7 +16,7 @@ baccalaureate[major_title] = {"description": " ", "requirements": " ", "years": [[], [], [], [], []], "other-content": {}} - baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " ", "misc": " "} + baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " ", "misc": []} major_webpage = requests.get("http://catalog.rpi.edu/" + major_link) major_soup = BeautifulSoup(major_webpage.content, 'html5lib') @@ -38,22 +38,23 @@ #print("---------------------------") while(yearlist != None): #print(yearlist) + #print(major_title) if (count > 3 and major_title != "Architecture") or (count > 4 and major_title == "Architecture"): try: - testString = yearlist.h2.a - #print(testString) + #print(yearlist) + testString = yearlist.h2.a['name'] if "Footnotes" in testString: - baccalaureate[major_title]["other_content"]["footnotes"].append(yearlist) + baccalaureate[major_title]["other-content"]["footnotes"] = yearlist elif "Capstone" in testString: - baccalaureate[major_title]["other_content"]["capstone"].append(yearlist) + baccalaureate[major_title]["other-content"]["capstone"] = yearlist elif "Transfer Credit Policy" in testString: - baccalaureate[major_title]["other_content"]["transfer_policy"].append(yearlist) + baccalaureate[major_title]["other-content"]["transfer_policy"] = yearlist elif "Options" in testString: - baccalaureate[major_title]["other_content"]["options"].append(yearlist) + baccalaureate[major_title]["other-content"]["options"] = yearlist else: - baccalaureate[major_title]["other_content"]["misc"].append(yearlist) - except: - continue + baccalaureate[major_title]["other-content"]["misc"].append(yearlist) + except Exception as e: + baccalaureate[major_title]["other-content"]["misc"].append(yearlist) else: baccalaureate[major_title]["years"][count].append(yearlist) yearlist = yearlist.next_sibling @@ -61,7 +62,9 @@ if yearcount > 1: yearcount = 0 count += 1 - #print("-------") + #print("-------------------------------------------") - #print(baccalaureate[major_title]["years"]) - #print("----------------------") \ No newline at end of file +for x in baccalaureate: + print(baccalaureate[x]) + print("-------------------------------------------------------") + break From b5c3b228b6f36ed287ba60c5a6c31ac26cff730c Mon Sep 17 00:00:00 2001 From: akeylg Date: Thu, 24 Mar 2022 01:39:06 -0400 Subject: [PATCH 12/17] added notes for each part to make things clear --- src/baccalaureate_scraper.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py index c98f374..e75d0ef 100644 --- a/src/baccalaureate_scraper.py +++ b/src/baccalaureate_scraper.py @@ -1,7 +1,7 @@ -from cgi import test from bs4 import BeautifulSoup -from numpy import equal import requests +import time +time_start = time.time() r = requests.get("http://catalog.rpi.edu/content.php?catoid=22&navoid=542") baccalaureate = {} soup = BeautifulSoup(r.content, 'html5lib') @@ -10,39 +10,55 @@ ba_elements = major_elements[0].find_all("a") for z in ba_elements: + #grabbing major title as well as the link to the major page major_title = z.text.strip() major_link = z.get('href') + #defining the inner values of the mega dictionary baccalaureate[major_title] = {"description": " ", "requirements": " ", "years": [[], [], [], [], []], "other-content": {}} baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " ", "misc": []} + #getting the HTML from the webpage for the individual major major_webpage = requests.get("http://catalog.rpi.edu/" + major_link) major_soup = BeautifulSoup(major_webpage.content, 'html5lib') + + #parsing through the large HTML from the webpage in order to get the description of the major as well as the massive list of requirements major_table = major_soup.find("td", class_="block_content") major_table_def = major_table.find("table", class_="table_default") trs = major_table_def.find_all("tr") description_tr = trs[0] requirement_tr = trs[3] baccalaureate[major_title]["description"] = description_tr - baccalaureate[major_title]["requirements"] = requirement_tr - yearsHTML = requirement_tr.find("div", class_="custom_leftpad_20") + baccalaureate[major_title]["requirements"] = requirement_tr + #yearHTML is all of the raw description HTML for the entire major. + yearsHTML = requirement_tr.find("div", class_="custom_leftpad_20") if yearsHTML == None: continue + + #yearHTML will be used to get the raw HTML for each year. yearlist = yearsHTML.find("div", class_ = "acalog-core") + + #count and yearcount will be used to put the raw HTML in each value of the years array. count = 0 yearcount = 0 #print(yearsHTML) #print("---------------------------") + + #if there are no more HTML chunks, then the loop ends while(yearlist != None): #print(yearlist) #print(major_title) + + #checks to see if there are any extra bits of HTML after getting through the inital 4/5 years. if (count > 3 and major_title != "Architecture") or (count > 4 and major_title == "Architecture"): try: #print(yearlist) testString = yearlist.h2.a['name'] + + #checks to see if they fall into a specific category that we defined in the mega dictionary if "Footnotes" in testString: baccalaureate[major_title]["other-content"]["footnotes"] = yearlist elif "Capstone" in testString: @@ -57,14 +73,22 @@ baccalaureate[major_title]["other-content"]["misc"].append(yearlist) else: baccalaureate[major_title]["years"][count].append(yearlist) + + #switches to the next HTML chunk yearlist = yearlist.next_sibling + + #yearcount increases by 1 until it is 2. If it's greater than 2, yearcount goes to 0 and count increases by 1. + #this is to ensure that each year gets not only the title HTML, but the information for that year as well. yearcount += 1 if yearcount > 1: yearcount = 0 count += 1 #print("-------------------------------------------") +#printing all the information in the mass dictionary for x in baccalaureate: print(baccalaureate[x]) print("-------------------------------------------------------") - break + +#runtime +print(time.time() - time_start) \ No newline at end of file From e214e471b670e623e22b9a8b46ddb260a2d63ebd Mon Sep 17 00:00:00 2001 From: akeylg Date: Fri, 25 Mar 2022 16:08:36 -0400 Subject: [PATCH 13/17] commented out some debug outputs and further commented --- src/baccalaureate_scraper.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py index e75d0ef..a1a5456 100644 --- a/src/baccalaureate_scraper.py +++ b/src/baccalaureate_scraper.py @@ -1,6 +1,7 @@ from bs4 import BeautifulSoup import requests import time + time_start = time.time() r = requests.get("http://catalog.rpi.edu/content.php?catoid=22&navoid=542") baccalaureate = {} @@ -44,17 +45,17 @@ #count and yearcount will be used to put the raw HTML in each value of the years array. count = 0 yearcount = 0 - #print(yearsHTML) - #print("---------------------------") #if there are no more HTML chunks, then the loop ends while(yearlist != None): + #debug print #print(yearlist) #print(major_title) #checks to see if there are any extra bits of HTML after getting through the inital 4/5 years. if (count > 3 and major_title != "Architecture") or (count > 4 and major_title == "Architecture"): try: + #debug print #print(yearlist) testString = yearlist.h2.a['name'] @@ -83,12 +84,11 @@ if yearcount > 1: yearcount = 0 count += 1 - #print("-------------------------------------------") - -#printing all the information in the mass dictionary -for x in baccalaureate: - print(baccalaureate[x]) - print("-------------------------------------------------------") -#runtime +#printing all the information in the mass dictionary (DEBUG) +#for x in baccalaureate: + #print(baccalaureate[x]) + #print("-------------------------------------------------------") + +#runtime (DEBUG) print(time.time() - time_start) \ No newline at end of file From 0952edba25906a78855b42cb1e8c65c385d81b07 Mon Sep 17 00:00:00 2001 From: akeylg Date: Sat, 2 Apr 2022 10:51:23 -0400 Subject: [PATCH 14/17] prototype parsed baccalaurete scraper --- file.txt | 192 +++++++++++++++++++++++++++++ src/baccalaureate_scraper.py | 232 +++++++++++++++++++++++------------ 2 files changed, 347 insertions(+), 77 deletions(-) create mode 100644 file.txt diff --git a/file.txt b/file.txt new file mode 100644 index 0000000..fc0b523 --- /dev/null +++ b/file.txt @@ -0,0 +1,192 @@ +Accelerated Science, Technology, and Society (Law) B.S./J.D. + + + + + + + + + +
+ + Rensselaer Catalog 2021-2022
+

Accelerated Science, Technology, and Society (Law) B.S./J.D.

+ +
+ + +
+
+
+

Return to {$returnto_text} Return to: Programs

This template is designed to advise students participating in the accelerated STSO/Law (B.S./J.D.) program (SSLW). The degree requirements are the same as the B.S. in STSO, however, this template provides for accelerated completion of STSO major requirements in the program’s first three years (at Rensselaer) as well as the coursework of any first-year law program that allows for remote-completion of the requirements of the B.S. in STSO.

+ + + + +

First Year


Fall


    +
  • Science Core Elective Credit Hours: 4
    + (See footnote 6 below)
  • +
  • HASS Core Elective Credit Hours: 4
    + (See footnote 5 below)
  • +
+

Spring


    +
  • Science Core Elective Credit Hours: 4
    + (See footnote 6 below)
  • +
  • HASS Core Elective Credit Hours: 4
    + (See footnote 5 below)
  • +
  • Intermediate STS course Credit Hours: 4
    + (See footnote 2 below)
  • +
+

Second Year


Fall


    +
  • HASS Core Elective Credit Hours: 4
    + (See footnote 5 below)
  • +
  • Science Core Elective Credit Hours: 4
    + (See footnote 6 below)
  • +
  • CAS Course Credit Hours: 4
    + (See footnote 7 below)
  • +
+

Spring


    +
  • CAS Course Credit Hours: 4
  • +
  • HASS Core Elective Credit Hours: 4
    + (See footnote 5 below)
  • +
  • Science Core Elective Credit Hours: 4
    + (See footnote 6 below)
  • +
  • Advanced STS Course Credit Hours: 4
    + (See footnote 3 below)
  • +
+

Third Year


Note: The third year of this accelerated degree program is the last year in which courses are taken at Rensselaer. As such, students in the accelerated degree program are exempt from the Arch summer semester and away semester experience

+

Fall


    +
  • CAS Course Credit Hours: 4
  • +
  • Advanced STS Course Credit Hours: 4
  • +
+

Spring


    +
  • HASS Core Elective Credit Hours: 4
    + (See footnote 5 below)
  • +
  • CAS Course Credit Hours: 4
  • +
+

Fourth Year


Courses this year will be taken at law school as part of the first-year curriculum of a J.D. and will complete STSO’s B.S. requirements.

+

Fall


    +
  • Federal Civil Procedure Credit Hours: 4
  • +
  • Contracts Credit Hours: 3
  • +
  • Property I Credit Hours: 2
  • +
  • Torts Credit Hours: 4
  • +
  • Introduction to Lawyering Credit Hours: 3
  • +
+

Spring


    +
  • Constitutional Law Credit Hours: 4
    + (Course applied to HASS Core Elective)
  • +
  • Criminal Law Credit Hours: 3
  • +
  • Property II Credit Hours: 4
  • +
  • Introduction to Lawyering Credit Hours: 3
  • +
  • Contracts Credit Hours: 2
  • +
+

Footnotes


    +
  1. STSS/STSH 1110 Science, Technology, and Society may be substituted with ITWS 1220 IT and Society.
  2. +
  3. Intermediate STS course options include STSS 2300 Environment and Society and other STSS or STSH courses at the 2000-level.
  4. +
  5. Advanced STS Courses include STSH 4510 History of American Technology and other STSS or STSH courses at the 4000-level.
  6. +
  7. STSS 4980 Research Design is a prerequisite for STSS/STSH 4990 STS and Sustainability Senior Project.
  8. +
  9. All Rensselaer students must complete a HASS Core. This includes a communication intensive course, a HASS Inquiry course, and a 12-credit integrative pathway. In the case of the Accelerated STSO program (SSLW), students complete 20 credits of their HASS Core requirements at Rensselaer and the final 4 credits at law school through their Constitutional Law course. 
  10. +
  11. All Rensselaer students must complete a 24-credit Science Core. This includes at least 2 mathematics courses (8 credits), such as MATH 1010, MATH 1020, MATH 1500, MATH 1520, or others.
  12. +
  13. The 16-credit Complementary Area of Study (CAS) provides STSO majors with a depth of understanding from an external field that will inform their STS research and coursework. The CAS courses should come from a single department (not STS). Two courses should be taken at the 4000-level.
  14. +
+ +

 

+
+ +[[

First Year


,

Fall


    +
  • Science Core Elective Credit Hours: 4
    + (See footnote 6 below)
  • +
  • HASS Core Elective Credit Hours: 4
    + (See footnote 5 below)
  • +
+

Spring


    +
  • Science Core Elective Credit Hours: 4
    + (See footnote 6 below)
  • +
  • HASS Core Elective Credit Hours: 4
    + (See footnote 5 below)
  • +
  • Intermediate STS course Credit Hours: 4
    + (See footnote 2 below)
  • +
+
], [

Second Year


,

Fall


    +
  • HASS Core Elective Credit Hours: 4
    + (See footnote 5 below)
  • +
  • Science Core Elective Credit Hours: 4
    + (See footnote 6 below)
  • +
  • CAS Course Credit Hours: 4
    + (See footnote 7 below)
  • +
+

Spring


    +
  • CAS Course Credit Hours: 4
  • +
  • HASS Core Elective Credit Hours: 4
    + (See footnote 5 below)
  • +
  • Science Core Elective Credit Hours: 4
    + (See footnote 6 below)
  • +
  • Advanced STS Course Credit Hours: 4
    + (See footnote 3 below)
  • +
+
], [

Third Year


Note: The third year of this accelerated degree program is the last year in which courses are taken at Rensselaer. As such, students in the accelerated degree program are exempt from the Arch summer semester and away semester experience

+
,

Fall


    +
  • CAS Course Credit Hours: 4
  • +
  • Advanced STS Course Credit Hours: 4
  • +
+

Spring


    +
  • HASS Core Elective Credit Hours: 4
    + (See footnote 5 below)
  • +
  • CAS Course Credit Hours: 4
  • +
+
], [

Fourth Year


Courses this year will be taken at law school as part of the first-year curriculum of a J.D. and will complete STSO’s B.S. requirements.

+
,

Fall


    +
  • Federal Civil Procedure Credit Hours: 4
  • +
  • Contracts Credit Hours: 3
  • +
  • Property I Credit Hours: 2
  • +
  • Torts Credit Hours: 4
  • +
  • Introduction to Lawyering Credit Hours: 3
  • +
+

Spring


    +
  • Constitutional Law Credit Hours: 4
    + (Course applied to HASS Core Elective)
  • +
  • Criminal Law Credit Hours: 3
  • +
  • Property II Credit Hours: 4
  • +
  • Introduction to Lawyering Credit Hours: 3
  • +
  • Contracts Credit Hours: 2
  • +
+
], []] +{'options': ' ', 'capstone': ' ', 'transfer_policy': ' ', 'footnotes':

Footnotes


    +
  1. STSS/STSH 1110 Science, Technology, and Society may be substituted with ITWS 1220 IT and Society.
  2. +
  3. Intermediate STS course options include STSS 2300 Environment and Society and other STSS or STSH courses at the 2000-level.
  4. +
  5. Advanced STS Courses include STSH 4510 History of American Technology and other STSS or STSH courses at the 4000-level.
  6. +
  7. STSS 4980 Research Design is a prerequisite for STSS/STSH 4990 STS and Sustainability Senior Project.
  8. +
  9. All Rensselaer students must complete a HASS Core. This includes a communication intensive course, a HASS Inquiry course, and a 12-credit integrative pathway. In the case of the Accelerated STSO program (SSLW), students complete 20 credits of their HASS Core requirements at Rensselaer and the final 4 credits at law school through their Constitutional Law course. 
  10. +
  11. All Rensselaer students must complete a 24-credit Science Core. This includes at least 2 mathematics courses (8 credits), such as MATH 1010, MATH 1020, MATH 1500, MATH 1520, or others.
  12. +
  13. The 16-credit Complementary Area of Study (CAS) provides STSO majors with a depth of understanding from an external field that will inform their STS research and coursework. The CAS courses should come from a single department (not STS). Two courses should be taken at the 4000-level.
  14. +
+ +

 

+
, 'misc': []} +Aeronautical Engineering Curriculum + + + + + + + + + +
+ + Rensselaer Catalog 2021-2022
+

Aeronautical Engineering Curriculum

+ +
+ + +
+
+
+

Return to {$returnto_text} Return to: Programs

Baccalaureate Programs

+ +

Freshmen or sophomores who have identified aeronautical engineering as their major may follow the baccalaureate program below in lieu of the general core engineering program.  Dual major programs which lead to a single baccalaureate degree embracing two fields are also available and are described in more detail in the MANE Handbook (emailed to you when you declare a major in MANE, and available from the MANE Student Services Office, JEC 2012).

+ + diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py index a1a5456..311845a 100644 --- a/src/baccalaureate_scraper.py +++ b/src/baccalaureate_scraper.py @@ -1,94 +1,172 @@ +from itertools import count +from readline import insert_text from bs4 import BeautifulSoup +from numpy import empty import requests import time -time_start = time.time() -r = requests.get("http://catalog.rpi.edu/content.php?catoid=22&navoid=542") -baccalaureate = {} -soup = BeautifulSoup(r.content, 'html5lib') -table = soup.find("td", class_="block_content") -major_elements = table.find_all("ul", class_="program-list") -ba_elements = major_elements[0].find_all("a") +def baccalurate_grab_html(): + r = requests.get("http://catalog.rpi.edu/content.php?catoid=22&navoid=542") + soup = BeautifulSoup(r.content, 'html5lib') + table = soup.find("td", class_="block_content") + major_elements = table.find_all("ul", class_="program-list") + ba_elements = major_elements[0].find_all("a") -for z in ba_elements: - #grabbing major title as well as the link to the major page - major_title = z.text.strip() - major_link = z.get('href') - - #defining the inner values of the mega dictionary - baccalaureate[major_title] = {"description": " ", "requirements": " ", - "years": [[], [], [], [], []], - "other-content": {}} - baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " ", "misc": []} - - #getting the HTML from the webpage for the individual major - major_webpage = requests.get("http://catalog.rpi.edu/" + major_link) - major_soup = BeautifulSoup(major_webpage.content, 'html5lib') + for z in ba_elements: + #grabbing major title as well as the link to the major page + major_title = z.text.strip() + major_link = z.get('href') + + #defining the inner values of the mega dictionary + baccalaureate_parsed[major_title] = {"description": " ", "requirements": " ", + "years": [[], [], [], [], []], + "other-content": {}} + baccalaureate_parsed[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " ", "misc": []} - #parsing through the large HTML from the webpage in order to get the description of the major as well as the massive list of requirements - major_table = major_soup.find("td", class_="block_content") - major_table_def = major_table.find("table", class_="table_default") - trs = major_table_def.find_all("tr") - description_tr = trs[0] - requirement_tr = trs[3] - baccalaureate[major_title]["description"] = description_tr - baccalaureate[major_title]["requirements"] = requirement_tr - - #yearHTML is all of the raw description HTML for the entire major. - yearsHTML = requirement_tr.find("div", class_="custom_leftpad_20") - if yearsHTML == None: - continue + baccalaureate[major_title] = {"description": " ", "requirements": " ", + "years": [[], [], [], [], []], + "other-content": {}} + baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " ", "misc": []} + + #getting the HTML from the webpage for the individual major + major_webpage = requests.get("http://catalog.rpi.edu/" + major_link) + major_soup = BeautifulSoup(major_webpage.content, 'html5lib') - #yearHTML will be used to get the raw HTML for each year. - yearlist = yearsHTML.find("div", class_ = "acalog-core") - - #count and yearcount will be used to put the raw HTML in each value of the years array. - count = 0 - yearcount = 0 + #parsing through the large HTML from the webpage in order to get the description of the major as well as the massive list of requirements + major_table = major_soup.find("td", class_="block_content") + major_table_def = major_table.find("table", class_="table_default") + trs = major_table_def.find_all("tr") + description_tr = trs[0] + requirement_tr = trs[3] + baccalaureate[major_title]["description"] = description_tr + baccalaureate[major_title]["requirements"] = requirement_tr + + #yearHTML is all of the raw description HTML for the entire major. + yearsHTML = requirement_tr.find("div", class_="custom_leftpad_20") + if yearsHTML == None: + continue - #if there are no more HTML chunks, then the loop ends - while(yearlist != None): - #debug print - #print(yearlist) - #print(major_title) + #yearHTML will be used to get the raw HTML for each year. + yearlist = yearsHTML.find("div", class_ = "acalog-core") - #checks to see if there are any extra bits of HTML after getting through the inital 4/5 years. - if (count > 3 and major_title != "Architecture") or (count > 4 and major_title == "Architecture"): - try: - #debug print - #print(yearlist) - testString = yearlist.h2.a['name'] + #count and yearcount will be used to put the raw HTML in each value of the years array. + count = 0 + yearcount = 0 + + #if there are no more HTML chunks, then the loop ends + while(yearlist != None): + #debug print + #print(yearlist) + #print(major_title) + + #checks to see if there are any extra bits of HTML after getting through the inital 4/5 years. + if (count > 3 and major_title != "Architecture") or (count > 4 and major_title == "Architecture"): + try: + #debug print + #print(yearlist) + testString = yearlist.h2.a['name'] - #checks to see if they fall into a specific category that we defined in the mega dictionary - if "Footnotes" in testString: - baccalaureate[major_title]["other-content"]["footnotes"] = yearlist - elif "Capstone" in testString: - baccalaureate[major_title]["other-content"]["capstone"] = yearlist - elif "Transfer Credit Policy" in testString: - baccalaureate[major_title]["other-content"]["transfer_policy"] = yearlist - elif "Options" in testString: - baccalaureate[major_title]["other-content"]["options"] = yearlist - else: + #checks to see if they fall into a specific category that we defined in the mega dictionary + if "Footnotes" in testString: + baccalaureate[major_title]["other-content"]["footnotes"] = yearlist + elif "Capstone" in testString: + baccalaureate[major_title]["other-content"]["capstone"] = yearlist + elif "Transfer Credit Policy" in testString: + baccalaureate[major_title]["other-content"]["transfer_policy"] = yearlist + elif "Options" in testString: + baccalaureate[major_title]["other-content"]["options"] = yearlist + else: + baccalaureate[major_title]["other-content"]["misc"].append(yearlist) + except Exception as e: baccalaureate[major_title]["other-content"]["misc"].append(yearlist) + else: + baccalaureate[major_title]["years"][count].append(yearlist) + + #switches to the next HTML chunk + yearlist = yearlist.next_sibling + + #yearcount increases by 1 until it is 2. If it's greater than 2, yearcount goes to 0 and count increases by 1. + #this is to ensure that each year gets not only the title HTML, but the information for that year as well. + yearcount += 1 + if yearcount > 1: + yearcount = 0 + count += 1 + +def baccalurate_parse_html(): + for x in baccalaureate: + baccalaureate_parsed[x]["description"] = baccalaureate[x]["description"].find_all("p")[1].string + yearcounter = 0 + for y in baccalaureate[x]["years"]: + try: + #print(y) + #print(y[0].h2.a["name"]) + baccalaureate_parsed[x]["years"][yearcounter] = [] + baccalaureate_parsed[x]["years"][yearcounter].append(y[0].h2.a["name"]) + baccalaureate_parsed[x]["years"][yearcounter].append([]) + baccalaureate_parsed[x]["years"][yearcounter].append([]) + #print(baccalaureate_parsed[x]["years"]) + #print("------------------------------------------------") + #baccalaureate_parsed[x]["years"][1] = [] + yearlist = y[1].find_all("div") + for z in yearlist: + #print(z.find_all("li")) + for i in z.find_all("li"): + #print(i.get_text()) + if "or" == i.get_text(): + continue + else: + inserted_string = i.get_text().replace("\xa0", " ") + inserted_string = inserted_string.replace("\n", " ") + inserted_string = inserted_string.replace("\t", " ") + footnote_value = " " + footnote_found = False + credit_hour_found = False + credit_hour_value = "" + if "(See footnote" in inserted_string: + #print(inserted_string.index("(See footnote ")) + footnote_value = inserted_string[inserted_string.index("(See footnote ") + 14] + inserted_string = inserted_string.replace("(See footnote " + footnote_value + " below)", "") + footnote_found = True + + if "Credit Hours" in inserted_string: + #print(inserted_string.index("Credit Hours: ")) + credit_hour_value = inserted_string[inserted_string.index("Credit Hours: ") + 14] + #print(credit_hour_value) + inserted_string = inserted_string.replace("Credit Hours: " + credit_hour_value, "") + credit_hour_found = True + + if footnote_found == True: + inserted_string = inserted_string + "[FOOTNOTE: " + footnote_value + "]" + if credit_hour_found == True: + inserted_string = inserted_string + " [CREDIT HOURS: " + credit_hour_value + "]" + if "Fall" in z.h3.a["name"]: + baccalaureate_parsed[x]["years"][yearcounter][1].append(inserted_string) + if "Spring" in z.h3.a["name"]: + baccalaureate_parsed[x]["years"][yearcounter][2].append(inserted_string) + #print("-----------------------") + + #print("------------------------------------------------") + yearcounter += 1 except Exception as e: - baccalaureate[major_title]["other-content"]["misc"].append(yearlist) - else: - baccalaureate[major_title]["years"][count].append(yearlist) - - #switches to the next HTML chunk - yearlist = yearlist.next_sibling - - #yearcount increases by 1 until it is 2. If it's greater than 2, yearcount goes to 0 and count increases by 1. - #this is to ensure that each year gets not only the title HTML, but the information for that year as well. - yearcount += 1 - if yearcount > 1: - yearcount = 0 - count += 1 + #print(e) + #print("------------------------------------------------") + continue + #print("------------------") + for y in baccalaureate[x]["other-content"]: + if len(baccalaureate[x]["other-content"][y]) == 0 or baccalaureate[x]["other-content"][y] == " ": + print("empty") + else: + print(baccalaureate[x]["other-content"][y]) + print("--------------------") + break #printing all the information in the mass dictionary (DEBUG) -#for x in baccalaureate: - #print(baccalaureate[x]) - #print("-------------------------------------------------------") +time_start = time.time() +baccalaureate = {} +baccalaureate_parsed = {} +baccalurate_grab_html() +baccalurate_parse_html() + #runtime (DEBUG) print(time.time() - time_start) \ No newline at end of file From 2da081dbd6f601dde690f4aa939bdadd7e2929a2 Mon Sep 17 00:00:00 2001 From: akeylg Date: Sun, 10 Apr 2022 20:31:56 -0400 Subject: [PATCH 15/17] parser done minus some issues --- src/baccalaureate_scraper.py | 56 +++++++++++++++++++++++++++++------- src/course.py | 7 +++-- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py index 311845a..52c070d 100644 --- a/src/baccalaureate_scraper.py +++ b/src/baccalaureate_scraper.py @@ -1,4 +1,4 @@ -from itertools import count + from readline import insert_text from bs4 import BeautifulSoup from numpy import empty @@ -17,11 +17,11 @@ def baccalurate_grab_html(): major_title = z.text.strip() major_link = z.get('href') - #defining the inner values of the mega dictionary - baccalaureate_parsed[major_title] = {"description": " ", "requirements": " ", + #defining the inner values of the mega dictionaries + baccalaureate_parsed[major_title] = {"description": [], "years": [[], [], [], [], []], "other-content": {}} - baccalaureate_parsed[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " ", "misc": []} + baccalaureate_parsed[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": [], "misc": []} baccalaureate[major_title] = {"description": " ", "requirements": " ", "years": [[], [], [], [], []], @@ -37,7 +37,19 @@ def baccalurate_grab_html(): major_table_def = major_table.find("table", class_="table_default") trs = major_table_def.find_all("tr") description_tr = trs[0] - requirement_tr = trs[3] + #print(description_tr.get_text()) + #print(description_tr.get_text().find("Return to: Programs")) + #print(description_tr.get_text()[description_tr.get_text().find("Return to: Programs") + 19]) + if description_tr.get_text()[description_tr.get_text().find("Return to: Programs") + 19].isalpha(): + description_tr = description_tr + else: + description_tr = "NO DESCRIPTION" + #print(description_tr) + #print("-------------------------------") + if major_title == "Architecture": + requirement_tr = trs[4] + else: + requirement_tr = trs[3] baccalaureate[major_title]["description"] = description_tr baccalaureate[major_title]["requirements"] = requirement_tr @@ -94,7 +106,16 @@ def baccalurate_grab_html(): def baccalurate_parse_html(): for x in baccalaureate: - baccalaureate_parsed[x]["description"] = baccalaureate[x]["description"].find_all("p")[1].string + print(x) + print("------------------------") + print(baccalaureate[x]["description"]) + if baccalaureate[x]["description"] != "NO DESCRIPTION": + for y in baccalaureate[x]["description"].find_all("p"): + if y.get_text() != " Return to: Programs": + inserted_string = y.get_text().replace("\xa0", " ") + inserted_string = inserted_string.replace("\n", " ") + inserted_string = inserted_string.replace("\t", " ") + baccalaureate_parsed[x]["description"].append(inserted_string) yearcounter = 0 for y in baccalaureate[x]["years"]: try: @@ -154,11 +175,21 @@ def baccalurate_parse_html(): #print("------------------") for y in baccalaureate[x]["other-content"]: if len(baccalaureate[x]["other-content"][y]) == 0 or baccalaureate[x]["other-content"][y] == " ": - print("empty") + print(" ") else: + #print(y) + print("---------------------------------") print(baccalaureate[x]["other-content"][y]) - print("--------------------") - break + try: + li_list = baccalaureate[x]["other-content"][y].find_all("li"); + for z in li_list: + #print(z.get_text()) + baccalaureate_parsed[x]["other-content"][y].append(z.get_text()) + except: + print(baccalaureate[x]["other-content"][y]) + #print(li_list) + #print("--------------------") + #printing all the information in the mass dictionary (DEBUG) time_start = time.time() @@ -166,7 +197,12 @@ def baccalurate_parse_html(): baccalaureate_parsed = {} baccalurate_grab_html() baccalurate_parse_html() - + +for x in baccalaureate_parsed: + print(x) + print("----------") + print(baccalaureate_parsed[x]) + print("-------------------------------------------------") #runtime (DEBUG) print(time.time() - time_start) \ No newline at end of file diff --git a/src/course.py b/src/course.py index 3fb077f..b070959 100644 --- a/src/course.py +++ b/src/course.py @@ -10,6 +10,7 @@ import json import re import time +import os # Do you want to output the time it took for the operations to complete timeit = True @@ -21,7 +22,9 @@ # output will be redirected to 'sis_courses_TEST.json' if True small_search = False +os.makedirs('data/', exist_ok=True) output_file = 'sis_courses_TEST.json' if small_search else 'sis_courses_data.json' +output_file = 'data/' + output_file host = "https://sis.rpi.edu" url_pre = '/rss/bwckctlg.p_display_courses?term_in=' @@ -83,7 +86,7 @@ def fetch_course_links(year, term): # if the course has a link, store the link and description in a dictionary if class_info.a: classes_links_dict[class_title.a.text] = [host+class_info.a['href'], desc] - + return classes_links_dict @@ -204,4 +207,4 @@ def fetch_course_info(link, desc): if timeit: print(f'\nTook {time.time() - before} to go to all sublinks') print(f'outputting into {output_file}') - json.dump(store, open(output_file, 'w')) + json.dump(store, open(output_file, 'w')) \ No newline at end of file From 0ea3747da0e93a2f3e56bc06fef65063dce6c67f Mon Sep 17 00:00:00 2001 From: akeylg Date: Mon, 11 Apr 2022 11:06:33 -0400 Subject: [PATCH 16/17] worked on ITWS fixes --- src/baccalaureate_scraper.py | 66 ++++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py index 52c070d..ab8bc78 100644 --- a/src/baccalaureate_scraper.py +++ b/src/baccalaureate_scraper.py @@ -1,9 +1,7 @@ - -from readline import insert_text from bs4 import BeautifulSoup -from numpy import empty import requests import time +import re def baccalurate_grab_html(): r = requests.get("http://catalog.rpi.edu/content.php?catoid=22&navoid=542") @@ -68,9 +66,24 @@ def baccalurate_grab_html(): #if there are no more HTML chunks, then the loop ends while(yearlist != None): #debug print - #print(yearlist) + #print(yearlist.h2.a['name']) #print(major_title) + + #for SOME reason, ITWS has an extensive amount of major information that needs to be parsed before getting to + #each major year, I have to add extra lists to add the HTML information to the mega dictionary. + if major_title == "Information Technology and Web Science" and len(baccalaureate[major_title]["years"]) == 5: + print("CHECKED!!!") + for g in range(6): + baccalaureate[major_title]["years"].append([]) + for g in range(4, 11): + baccalaureate[major_title]["years"][g].append(yearlist) + yearlist = yearlist.next_sibling + baccalaureate[major_title]["years"][g].append(yearlist) + yearlist = yearlist.next_sibling + + + #checks to see if there are any extra bits of HTML after getting through the inital 4/5 years. if (count > 3 and major_title != "Architecture") or (count > 4 and major_title == "Architecture"): try: @@ -90,25 +103,27 @@ def baccalurate_grab_html(): else: baccalaureate[major_title]["other-content"]["misc"].append(yearlist) except Exception as e: + #print(e) baccalaureate[major_title]["other-content"]["misc"].append(yearlist) else: baccalaureate[major_title]["years"][count].append(yearlist) + yearcount += 1 + if yearcount > 1: + yearcount = 0 + count += 1 #switches to the next HTML chunk yearlist = yearlist.next_sibling #yearcount increases by 1 until it is 2. If it's greater than 2, yearcount goes to 0 and count increases by 1. #this is to ensure that each year gets not only the title HTML, but the information for that year as well. - yearcount += 1 - if yearcount > 1: - yearcount = 0 - count += 1 + def baccalurate_parse_html(): for x in baccalaureate: - print(x) - print("------------------------") - print(baccalaureate[x]["description"]) + #print(x) + #print("------------------------") + #print(baccalaureate[x]["description"]) if baccalaureate[x]["description"] != "NO DESCRIPTION": for y in baccalaureate[x]["description"].find_all("p"): if y.get_text() != " Return to: Programs": @@ -175,18 +190,22 @@ def baccalurate_parse_html(): #print("------------------") for y in baccalaureate[x]["other-content"]: if len(baccalaureate[x]["other-content"][y]) == 0 or baccalaureate[x]["other-content"][y] == " ": - print(" ") + print("NO CONTENT HERE") else: #print(y) - print("---------------------------------") - print(baccalaureate[x]["other-content"][y]) + #print("---------------------------------") + #print(baccalaureate[x]["other-content"][y]) try: li_list = baccalaureate[x]["other-content"][y].find_all("li"); for z in li_list: #print(z.get_text()) - baccalaureate_parsed[x]["other-content"][y].append(z.get_text()) - except: - print(baccalaureate[x]["other-content"][y]) + inserted_string = z.get_text().replace("\xa0", " ") + inserted_string = inserted_string.replace("\n", " ") + inserted_string = inserted_string.replace("\t", " ") + baccalaureate_parsed[x]["other-content"][y].append(inserted_string) + except Exception as e: + #print(baccalaureate[x]["other-content"][y]) + print(e) #print(li_list) #print("--------------------") @@ -198,11 +217,14 @@ def baccalurate_parse_html(): baccalurate_grab_html() baccalurate_parse_html() -for x in baccalaureate_parsed: - print(x) - print("----------") - print(baccalaureate_parsed[x]) - print("-------------------------------------------------") +print(baccalaureate["Information Technology and Web Science"]["years"]) +print("-------------------------------------") +print(baccalaureate_parsed["Information Technology and Web Science"]) +#for x in baccalaureate_parsed: +# print(x) +# print("----------") +# print(baccalaureate_parsed[x]) +# print("-------------------------------------------------") #runtime (DEBUG) print(time.time() - time_start) \ No newline at end of file From b99abb5acc792f57ecc4eca54efb6cbfac2de307 Mon Sep 17 00:00:00 2001 From: akeylg Date: Mon, 11 Apr 2022 11:09:12 -0400 Subject: [PATCH 17/17] fixed footnote/credit hours bug --- src/baccalaureate_scraper.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py index ab8bc78..67e5da0 100644 --- a/src/baccalaureate_scraper.py +++ b/src/baccalaureate_scraper.py @@ -127,9 +127,9 @@ def baccalurate_parse_html(): if baccalaureate[x]["description"] != "NO DESCRIPTION": for y in baccalaureate[x]["description"].find_all("p"): if y.get_text() != " Return to: Programs": - inserted_string = y.get_text().replace("\xa0", " ") - inserted_string = inserted_string.replace("\n", " ") - inserted_string = inserted_string.replace("\t", " ") + inserted_string = y.get_text().replace("\xa0", "") + inserted_string = inserted_string.replace("\n", "") + inserted_string = inserted_string.replace("\t", "") baccalaureate_parsed[x]["description"].append(inserted_string) yearcounter = 0 for y in baccalaureate[x]["years"]: @@ -151,9 +151,9 @@ def baccalurate_parse_html(): if "or" == i.get_text(): continue else: - inserted_string = i.get_text().replace("\xa0", " ") - inserted_string = inserted_string.replace("\n", " ") - inserted_string = inserted_string.replace("\t", " ") + inserted_string = i.get_text().replace("\xa0", "") + inserted_string = inserted_string.replace("\n", "") + inserted_string = inserted_string.replace("\t", "") footnote_value = " " footnote_found = False credit_hour_found = False @@ -199,9 +199,9 @@ def baccalurate_parse_html(): li_list = baccalaureate[x]["other-content"][y].find_all("li"); for z in li_list: #print(z.get_text()) - inserted_string = z.get_text().replace("\xa0", " ") - inserted_string = inserted_string.replace("\n", " ") - inserted_string = inserted_string.replace("\t", " ") + inserted_string = z.get_text().replace("\xa0", "") + inserted_string = inserted_string.replace("\n", "") + inserted_string = inserted_string.replace("\t", "") baccalaureate_parsed[x]["other-content"][y].append(inserted_string) except Exception as e: #print(baccalaureate[x]["other-content"][y])