diff --git a/file.txt b/file.txt
new file mode 100644
index 0000000..fc0b523
--- /dev/null
+++ b/file.txt
@@ -0,0 +1,192 @@
+Accelerated Science, Technology, and Society (Law) B.S./J.D.
+
+
+
+
+
+
+ Rensselaer Catalog 2021-2022
+ Accelerated Science, Technology, and Society (Law) B.S./J.D.
+
+ |
+
+
+
+
+ |
+
+
+ Return to: Programs
This template is designed to advise students participating in the accelerated STSO/Law (B.S./J.D.) program (SSLW). The degree requirements are the same as the B.S. in STSO, however, this template provides for accelerated completion of STSO major requirements in the program’s first three years (at Rensselaer) as well as the coursework of any first-year law program that allows for remote-completion of the requirements of the B.S. in STSO.
+ |
+
+
+
+ Fall
+ - Science Core Elective Credit Hours: 4
+ (See footnote 6 below)
+ - HASS Core Elective Credit Hours: 4
+ (See footnote 5 below)
+
+ Spring
+ - Science Core Elective Credit Hours: 4
+ (See footnote 6 below)
+ - HASS Core Elective Credit Hours: 4
+ (See footnote 5 below)
+ - Intermediate STS course Credit Hours: 4
+ (See footnote 2 below)
+
+ Fall
+ - HASS Core Elective Credit Hours: 4
+ (See footnote 5 below)
+ - Science Core Elective Credit Hours: 4
+ (See footnote 6 below)
+ - CAS Course Credit Hours: 4
+ (See footnote 7 below)
+
+ Spring
+ - CAS Course Credit Hours: 4
+ - HASS Core Elective Credit Hours: 4
+ (See footnote 5 below)
+ - Science Core Elective Credit Hours: 4
+ (See footnote 6 below)
+ - Advanced STS Course Credit Hours: 4
+ (See footnote 3 below)
+
+ Third Year
Note: The third year of this accelerated degree program is the last year in which courses are taken at Rensselaer. As such, students in the accelerated degree program are exempt from the Arch summer semester and away semester experience.
+ Fall
+ - CAS Course Credit Hours: 4
+ - Advanced STS Course Credit Hours: 4
+
+ Spring
+ - HASS Core Elective Credit Hours: 4
+ (See footnote 5 below)
+ - CAS Course Credit Hours: 4
+
+ Fourth Year
Courses this year will be taken at law school as part of the first-year curriculum of a J.D. and will complete STSO’s B.S. requirements.
+ Fall
+ - Federal Civil Procedure Credit Hours: 4
+ - Contracts Credit Hours: 3
+ - Property I Credit Hours: 2
+ - Torts Credit Hours: 4
+ - Introduction to Lawyering Credit Hours: 3
+
+ Spring
+ - Constitutional Law Credit Hours: 4
+ (Course applied to HASS Core Elective)
+ - Criminal Law Credit Hours: 3
+ - Property II Credit Hours: 4
+ - Introduction to Lawyering Credit Hours: 3
+ - Contracts Credit Hours: 2
+
+ Footnotes
+ - STSS/STSH 1110 Science, Technology, and Society may be substituted with ITWS 1220 IT and Society.
+ - Intermediate STS course options include STSS 2300 Environment and Society and other STSS or STSH courses at the 2000-level.
+ - Advanced STS Courses include STSH 4510 History of American Technology and other STSS or STSH courses at the 4000-level.
+ - STSS 4980 Research Design is a prerequisite for STSS/STSH 4990 STS and Sustainability Senior Project.
+ - All Rensselaer students must complete a HASS Core. This includes a communication intensive course, a HASS Inquiry course, and a 12-credit integrative pathway. In the case of the Accelerated STSO program (SSLW), students complete 20 credits of their HASS Core requirements at Rensselaer and the final 4 credits at law school through their Constitutional Law course.
+ - All Rensselaer students must complete a 24-credit Science Core. This includes at least 2 mathematics courses (8 credits), such as MATH 1010, MATH 1020, MATH 1500, MATH 1520, or others.
+ - The 16-credit Complementary Area of Study (CAS) provides STSO majors with a depth of understanding from an external field that will inform their STS research and coursework. The CAS courses should come from a single department (not STS). Two courses should be taken at the 4000-level.
+
+
+
+ |
+
+[[, Fall
+ - Science Core Elective Credit Hours: 4
+ (See footnote 6 below)
+ - HASS Core Elective Credit Hours: 4
+ (See footnote 5 below)
+
+
Spring
+ - Science Core Elective Credit Hours: 4
+ (See footnote 6 below)
+ - HASS Core Elective Credit Hours: 4
+ (See footnote 5 below)
+ - Intermediate STS course Credit Hours: 4
+ (See footnote 2 below)
+
+
], [, Fall
+ - HASS Core Elective Credit Hours: 4
+ (See footnote 5 below)
+ - Science Core Elective Credit Hours: 4
+ (See footnote 6 below)
+ - CAS Course Credit Hours: 4
+ (See footnote 7 below)
+
+
Spring
+ - CAS Course Credit Hours: 4
+ - HASS Core Elective Credit Hours: 4
+ (See footnote 5 below)
+ - Science Core Elective Credit Hours: 4
+ (See footnote 6 below)
+ - Advanced STS Course Credit Hours: 4
+ (See footnote 3 below)
+
+
], [Third Year
Note: The third year of this accelerated degree program is the last year in which courses are taken at Rensselaer. As such, students in the accelerated degree program are exempt from the Arch summer semester and away semester experience.
+
, Fall
+ - CAS Course Credit Hours: 4
+ - Advanced STS Course Credit Hours: 4
+
+
Spring
+ - HASS Core Elective Credit Hours: 4
+ (See footnote 5 below)
+ - CAS Course Credit Hours: 4
+
+
], [Fourth Year
Courses this year will be taken at law school as part of the first-year curriculum of a J.D. and will complete STSO’s B.S. requirements.
+
, Fall
+ - Federal Civil Procedure Credit Hours: 4
+ - Contracts Credit Hours: 3
+ - Property I Credit Hours: 2
+ - Torts Credit Hours: 4
+ - Introduction to Lawyering Credit Hours: 3
+
+
Spring
+ - Constitutional Law Credit Hours: 4
+ (Course applied to HASS Core Elective)
+ - Criminal Law Credit Hours: 3
+ - Property II Credit Hours: 4
+ - Introduction to Lawyering Credit Hours: 3
+ - Contracts Credit Hours: 2
+
+
], []]
+{'options': ' ', 'capstone': ' ', 'transfer_policy': ' ', 'footnotes': Footnotes
+ - STSS/STSH 1110 Science, Technology, and Society may be substituted with ITWS 1220 IT and Society.
+ - Intermediate STS course options include STSS 2300 Environment and Society and other STSS or STSH courses at the 2000-level.
+ - Advanced STS Courses include STSH 4510 History of American Technology and other STSS or STSH courses at the 4000-level.
+ - STSS 4980 Research Design is a prerequisite for STSS/STSH 4990 STS and Sustainability Senior Project.
+ - All Rensselaer students must complete a HASS Core. This includes a communication intensive course, a HASS Inquiry course, and a 12-credit integrative pathway. In the case of the Accelerated STSO program (SSLW), students complete 20 credits of their HASS Core requirements at Rensselaer and the final 4 credits at law school through their Constitutional Law course.
+ - All Rensselaer students must complete a 24-credit Science Core. This includes at least 2 mathematics courses (8 credits), such as MATH 1010, MATH 1020, MATH 1500, MATH 1520, or others.
+ - The 16-credit Complementary Area of Study (CAS) provides STSO majors with a depth of understanding from an external field that will inform their STS research and coursework. The CAS courses should come from a single department (not STS). Two courses should be taken at the 4000-level.
+
+
+
+
, 'misc': []}
+Aeronautical Engineering Curriculum
+
+
+
+
+
+
+ Rensselaer Catalog 2021-2022
+ Aeronautical Engineering Curriculum
+
+ |
+
+
+
+
+ |
+
+
+ Return to: Programs
Baccalaureate Programs
+
+Freshmen or sophomores who have identified aeronautical engineering as their major may follow the baccalaureate program below in lieu of the general core engineering program. Dual major programs which lead to a single baccalaureate degree embracing two fields are also available and are described in more detail in the MANE Handbook (emailed to you when you declare a major in MANE, and available from the MANE Student Services Office, JEC 2012).
+ |
+
diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py
new file mode 100644
index 0000000..67e5da0
--- /dev/null
+++ b/src/baccalaureate_scraper.py
@@ -0,0 +1,230 @@
+from bs4 import BeautifulSoup
+import requests
+import time
+import re
+
+def baccalurate_grab_html():
+    """Download the raw HTML chunks for every baccalaureate program.
+
+    Fetches the catalog index page, follows every link found in the first
+    ``ul.program-list`` element and stores BeautifulSoup nodes in the
+    module-level ``baccalaureate`` dict, keyed by program title:
+
+    * ``description``   - first table row of the program page, or the string
+      ``"NO DESCRIPTION"`` when the character right after the
+      "Return to: Programs" text is missing or not alphabetic.
+    * ``requirements``  - table row 3 of the program page (row 4 for
+      "Architecture").
+    * ``years``         - one list per year; chunks are filed two at a time
+      (presumably a heading chunk followed by its content chunk).
+    * ``other-content`` - chunks found after the regular years, sorted by the
+      name of their heading anchor; anything unrecognised goes to ``misc``.
+
+    An empty skeleton for the same title is created in
+    ``baccalaureate_parsed``.
+
+    Returns nothing; results are written to the two module-level dicts.
+    Errors raised by ``requests`` and lookup errors caused by an unexpected
+    page layout (missing table, too few rows) are not handled here and
+    propagate to the caller.
+    """
+    # Without a timeout a stalled connection would block the scrape forever.
+    request_timeout = 30
+    index_page = requests.get(
+        "http://catalog.rpi.edu/content.php?catoid=22&navoid=542",
+        timeout=request_timeout,
+    )
+    soup = BeautifulSoup(index_page.content, 'html5lib')
+    table = soup.find("td", class_="block_content")
+    major_elements = table.find_all("ul", class_="program-list")
+    ba_elements = major_elements[0].find_all("a")
+
+    for anchor in ba_elements:
+        # grabbing major title as well as the link to the major page
+        major_title = anchor.text.strip()
+        major_link = anchor.get('href')
+        is_architecture = major_title == "Architecture"
+
+        # skeleton that baccalurate_parse_html fills with plain strings
+        baccalaureate_parsed[major_title] = {
+            "description": [],
+            "years": [[], [], [], [], []],
+            "other-content": {
+                "options": " ",
+                "capstone": " ",
+                "transfer_policy": " ",
+                "footnotes": [],
+                "misc": [],
+            },
+        }
+
+        # raw entry; " " marks a field that has not been found (yet)
+        raw = {
+            "description": " ",
+            "requirements": " ",
+            "years": [[], [], [], [], []],
+            "other-content": {
+                "options": " ",
+                "capstone": " ",
+                "transfer_policy": " ",
+                "footnotes": " ",
+                "misc": [],
+            },
+        }
+        baccalaureate[major_title] = raw
+
+        # getting the HTML from the webpage for the individual major
+        major_webpage = requests.get(
+            "http://catalog.rpi.edu/" + major_link,
+            timeout=request_timeout,
+        )
+        major_soup = BeautifulSoup(major_webpage.content, 'html5lib')
+
+        # the first row holds the description, a later row the requirements
+        major_table = major_soup.find("td", class_="block_content")
+        major_table_def = major_table.find("table", class_="table_default")
+        trs = major_table_def.find_all("tr")
+
+        # A description exists when a letter directly follows the
+        # "Return to: Programs" text.  partition() avoids the IndexError the
+        # old fixed-offset lookup raised when nothing follows the marker;
+        # when the marker is absent the check cannot be applied and the row
+        # is kept as it is.
+        description_tr = trs[0]
+        _, marker_found, tail = description_tr.get_text().partition("Return to: Programs")
+        if marker_found and not tail[:1].isalpha():
+            description_tr = "NO DESCRIPTION"
+
+        requirement_tr = trs[4] if is_architecture else trs[3]
+        raw["description"] = description_tr
+        raw["requirements"] = requirement_tr
+
+        # yearsHTML is the container of all curriculum chunks for the major
+        yearsHTML = requirement_tr.find("div", class_="custom_leftpad_20")
+        if yearsHTML is None:
+            continue
+
+        # first chunk; the loop below walks its siblings one by one
+        chunk = yearsHTML.find("div", class_="acalog-core")
+
+        years = raw["years"]
+        other = raw["other-content"]
+        # chunks beyond this many years are filed under "other-content"
+        regular_years = 5 if is_architecture else 4
+        # year_index selects the year list, chunks_in_year counts the chunks
+        # already filed for that year (a year is complete after two chunks)
+        year_index = 0
+        chunks_in_year = 0
+
+        # if there are no more HTML chunks, then the loop ends
+        while chunk is not None:
+            # ITWS lists a lot of extra information before the first year:
+            # six more lists are added and the next 14 chunks are stored,
+            # two per list, in years[4] .. years[10].
+            if major_title == "Information Technology and Web Science" and len(years) == 5:
+                print("CHECKED!!!")
+                for _ in range(6):
+                    years.append([])
+                for g in range(4, 11):
+                    for _ in range(2):
+                        # stop instead of crashing when the page has fewer
+                        # chunks than expected
+                        if chunk is None:
+                            break
+                        years[g].append(chunk)
+                        chunk = chunk.next_sibling
+                if chunk is None:
+                    break
+
+            if year_index >= regular_years:
+                # extra chunk after the regular years: file it by the name of
+                # its heading anchor
+                try:
+                    section_name = chunk.h2.a['name']
+                except (AttributeError, TypeError, KeyError):
+                    # no <h2><a name=...> heading (e.g. a text node)
+                    other["misc"].append(chunk)
+                else:
+                    if "Footnotes" in section_name:
+                        other["footnotes"] = chunk
+                    elif "Capstone" in section_name:
+                        other["capstone"] = chunk
+                    elif "Transfer Credit Policy" in section_name:
+                        other["transfer_policy"] = chunk
+                    elif "Options" in section_name:
+                        other["options"] = chunk
+                    else:
+                        other["misc"].append(chunk)
+            else:
+                years[year_index].append(chunk)
+                chunks_in_year += 1
+                # after the second chunk the year is complete
+                if chunks_in_year > 1:
+                    chunks_in_year = 0
+                    year_index += 1
+
+            # switches to the next HTML chunk
+            chunk = chunk.next_sibling
+
+def baccalurate_parse_html():
+    """Convert the raw HTML stored in ``baccalaureate`` into plain strings.
+
+    For every program in the module-level ``baccalaureate`` dict the matching
+    ``baccalaureate_parsed`` entry is filled in:
+
+    * ``description``   - cleaned text of every ``<p>`` in the description
+      row, except the " Return to: Programs" paragraph.
+    * ``years``         - one ``[heading, fall_entries, spring_entries]``
+      list per year that could be parsed.  A year that cannot be parsed is
+      skipped and does not use up a slot.
+    * ``other-content`` - cleaned ``<li>`` texts of every section that has
+      content.  A ``" "`` placeholder is replaced by a list only when list
+      items were actually found.
+
+    Returns nothing; prints "NO CONTENT HERE" for every empty section.
+    """
+    for title, raw in baccalaureate.items():
+        parsed = baccalaureate_parsed[title]
+
+        if raw["description"] != "NO DESCRIPTION":
+            for paragraph in raw["description"].find_all("p"):
+                text = paragraph.get_text()
+                if text != " Return to: Programs":
+                    parsed["description"].append(_clean_text(text))
+
+        slot = 0
+        for chunks in raw["years"]:
+            # the raw data may hold more year lists than there are parsed
+            # slots (extra lists are added for ITWS); the surplus is ignored
+            if slot >= len(parsed["years"]):
+                break
+            year = _parse_year(chunks)
+            if year is None:
+                continue
+            parsed["years"][slot] = year
+            slot += 1
+
+        for key, section in raw["other-content"].items():
+            if len(section) == 0 or section == " ":
+                print("NO CONTENT HERE")
+                continue
+            # "misc" is a list of chunks, every other section a single chunk
+            nodes = section if isinstance(section, list) else [section]
+            texts = []
+            for node in nodes:
+                find_all = getattr(node, "find_all", None)
+                if find_all is None:
+                    # text node between two chunks - nothing to extract
+                    continue
+                texts.extend(_clean_text(item.get_text()) for item in find_all("li"))
+            if not texts:
+                continue
+            # options / capstone / transfer_policy start out as the string
+            # " "; appending to that string used to fail, so switch the slot
+            # to a list first
+            if not isinstance(parsed["other-content"][key], list):
+                parsed["other-content"][key] = []
+            parsed["other-content"][key].extend(texts)
+
+
+# Characters removed from every scraped text fragment.
+_STRIP_TABLE = str.maketrans("", "", "\xa0\n\t")
+# "(See footnote 6 below)" - group 1 is the footnote reference.
+_FOOTNOTE_RE = re.compile(r"\(See footnotes? ([^()]+?) below\)")
+# "Credit Hours: 4" - group 1 is the value, which may have several digits.
+_CREDIT_HOURS_RE = re.compile(r"Credit Hours:\s*(\d+(?:\.\d+)?)")
+
+
+def _clean_text(text):
+    """Return *text* without non-breaking spaces, newlines and tabs."""
+    return text.translate(_STRIP_TABLE)
+
+
+def _annotate_course(text):
+    """Move footnote and credit-hour notes of a course line into tags.
+
+    "Name Credit Hours: 4(See footnote 6 below)" becomes
+    "Name [FOOTNOTE: 6] [CREDIT HOURS: 4]".  The whole value is captured, so
+    footnote 10 or 12 credit hours are no longer cut to their first digit.
+    """
+    footnote = _FOOTNOTE_RE.search(text)
+    if footnote:
+        text = text.replace(footnote.group(0), "")
+    credit = _CREDIT_HOURS_RE.search(text)
+    if credit:
+        text = text.replace(credit.group(0), "")
+    if footnote:
+        text = text + "[FOOTNOTE: " + footnote.group(1) + "]"
+    if credit:
+        text = text + " [CREDIT HOURS: " + credit.group(1) + "]"
+    return text
+
+
+def _parse_year(chunks):
+    """Return ``[heading, fall, spring]`` for one raw year, or ``None``.
+
+    *chunks* is one entry of the raw ``years`` list: the heading chunk
+    followed by the content chunk.  ``None`` is returned when either chunk is
+    missing or the heading has no ``<h2><a name=...>`` anchor.
+    """
+    if len(chunks) < 2:
+        return None
+    try:
+        heading = chunks[0].h2.a["name"]
+    except (AttributeError, TypeError, KeyError):
+        return None
+    find_all = getattr(chunks[1], "find_all", None)
+    if find_all is None:
+        return None
+
+    fall = []
+    spring = []
+    for div in find_all("div"):
+        items = div.find_all("li")
+        if not items:
+            continue
+        try:
+            term = div.h3.a["name"]
+        except (AttributeError, TypeError, KeyError):
+            # a <div> without a term heading is skipped on its own instead of
+            # discarding the whole year
+            continue
+        for item in items:
+            text = item.get_text()
+            # list items that only say "or" separate alternatives
+            if text == "or":
+                continue
+            entry = _annotate_course(_clean_text(text))
+            if "Fall" in term:
+                fall.append(entry)
+            if "Spring" in term:
+                spring.append(entry)
+    return [heading, fall, spring]
+
+
+#printing all the information in the mass dictionary (DEBUG)
+time_start = time.time()
+baccalaureate = {}
+baccalaureate_parsed = {}
+baccalurate_grab_html()
+baccalurate_parse_html()
+
+print(baccalaureate["Information Technology and Web Science"]["years"])
+print("-------------------------------------")
+print(baccalaureate_parsed["Information Technology and Web Science"])
+#for x in baccalaureate_parsed:
+# print(x)
+# print("----------")
+# print(baccalaureate_parsed[x])
+# print("-------------------------------------------------")
+
+#runtime (DEBUG)
+print(time.time() - time_start)
\ No newline at end of file
diff --git a/src/course.py b/src/course.py
index 3fb077f..b070959 100644
--- a/src/course.py
+++ b/src/course.py
@@ -10,6 +10,7 @@
import json
import re
import time
+import os
# Do you want to output the time it took for the operations to complete
timeit = True
@@ -21,7 +22,9 @@
# output will be redirected to 'sis_courses_TEST.json' if True
small_search = False
+os.makedirs('data/', exist_ok=True)
output_file = 'sis_courses_TEST.json' if small_search else 'sis_courses_data.json'
+output_file = 'data/' + output_file
host = "https://sis.rpi.edu"
url_pre = '/rss/bwckctlg.p_display_courses?term_in='
@@ -83,7 +86,7 @@ def fetch_course_links(year, term):
# if the course has a link, store the link and description in a dictionary
if class_info.a:
classes_links_dict[class_title.a.text] = [host+class_info.a['href'], desc]
-
+
return classes_links_dict
@@ -204,4 +207,4 @@ def fetch_course_info(link, desc):
if timeit:
print(f'\nTook {time.time() - before} to go to all sublinks')
print(f'outputting into {output_file}')
- json.dump(store, open(output_file, 'w'))
+ json.dump(store, open(output_file, 'w'))
\ No newline at end of file