From 693fd30d3e2c637ca5d9cb1ec36ef0ae4155b917 Mon Sep 17 00:00:00 2001
From: akeylg <graya4@rpi.edu>
Date: Fri, 18 Feb 2022 18:08:09 -0500
Subject: [PATCH 01/17] added a scraper for baccalaureate majors. SCUFFED

---
 src/baccalaureate_scraper.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 src/baccalaureate_scraper.py

diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py
new file mode 100644
index 0000000..54ef20d
--- /dev/null
+++ b/src/baccalaureate_scraper.py
@@ -0,0 +1,26 @@
+from bs4 import BeautifulSoup
+import requests
+import time
+start_time = time.time()
+r = requests.get("http://catalog.rpi.edu/content.php?catoid=22&navoid=542")
+#print(r.content)
+
+baccalaureate = {}
+soup = BeautifulSoup(r.content, 'html5lib')
+#print(soup)
+table = soup.find("td", class_="block_content")
+major_elements = table.find_all("ul", class_="program-list")
+ba_elements = major_elements[0].find_all("a")
+for z in ba_elements:
+    #print(z.text.strip())
+    major_title = z.text.strip()
+    major_link = z.get('href')
+    baccalaureate[major_title] = {"description": " ", "requirements": " "}
+    major_webpage = requests.get("http://catalog.rpi.edu/" + major_link)
+    major_soup = BeautifulSoup(major_webpage.content, 'html5lib')
+    major_table = major_soup.find("table", class_="table_default")
+    description_tr = major_table.find("tr")
+    requirement_tr = description_tr.find_next("tr")
+    baccalaureate[major_title]["description"] = description_tr
+    baccalaureate[major_title]["requirements"] = requirement_tr     
+print(time.time() - start_time)

From c0a890476ec5cfa3aa236b56c44278caa0a186ed Mon Sep 17 00:00:00 2001
From: akeylg <graya4@rpi.edu>
Date: Sun, 20 Feb 2022 09:32:55 -0500
Subject: [PATCH 02/17] fixed issue and lowered runtime by 6s

---
 src/baccalaureate_scraper.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py
index 54ef20d..296cbba 100644
--- a/src/baccalaureate_scraper.py
+++ b/src/baccalaureate_scraper.py
@@ -1,10 +1,10 @@
+from tracemalloc import start
 from bs4 import BeautifulSoup
 import requests
 import time
-start_time = time.time()
 r = requests.get("http://catalog.rpi.edu/content.php?catoid=22&navoid=542")
 #print(r.content)
-
+start_time = time.time()
 baccalaureate = {}
 soup = BeautifulSoup(r.content, 'html5lib')
 #print(soup)
@@ -12,15 +12,16 @@
 major_elements = table.find_all("ul", class_="program-list")
 ba_elements = major_elements[0].find_all("a")
 for z in ba_elements:
-    #print(z.text.strip())
     major_title = z.text.strip()
     major_link = z.get('href')
     baccalaureate[major_title] = {"description": " ", "requirements": " "}
     major_webpage = requests.get("http://catalog.rpi.edu/" + major_link)
     major_soup = BeautifulSoup(major_webpage.content, 'html5lib')
-    major_table = major_soup.find("table", class_="table_default")
-    description_tr = major_table.find("tr")
-    requirement_tr = description_tr.find_next("tr")
+    major_table = major_soup.find("td", class_="block_content")
+    major_table_def = major_table.find("table", class_="table_default")
+    trs = major_table_def.find_all("tr")
+    description_tr = trs[0]
+    requirement_tr = trs[3]
     baccalaureate[major_title]["description"] = description_tr
-    baccalaureate[major_title]["requirements"] = requirement_tr     
-print(time.time() - start_time)
+    baccalaureate[major_title]["requirements"] = requirement_tr    
+print(baccalaureate)

From 4ec7b11d43c3885f6b9906f09aa2b0ab62404d6d Mon Sep 17 00:00:00 2001
From: akeylg <graya4@rpi.edu>
Date: Sun, 20 Feb 2022 09:33:45 -0500
Subject: [PATCH 03/17] got rid of accidental test print

---
 src/baccalaureate_scraper.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py
index 296cbba..07f49c9 100644
--- a/src/baccalaureate_scraper.py
+++ b/src/baccalaureate_scraper.py
@@ -1,13 +1,9 @@
 from tracemalloc import start
 from bs4 import BeautifulSoup
 import requests
-import time
 r = requests.get("http://catalog.rpi.edu/content.php?catoid=22&navoid=542")
-#print(r.content)
-start_time = time.time()
 baccalaureate = {}
 soup = BeautifulSoup(r.content, 'html5lib')
-#print(soup)
 table = soup.find("td", class_="block_content")
 major_elements = table.find_all("ul", class_="program-list")
 ba_elements = major_elements[0].find_all("a")
@@ -24,4 +20,3 @@
     requirement_tr = trs[3]
     baccalaureate[major_title]["description"] = description_tr
     baccalaureate[major_title]["requirements"] = requirement_tr    
-print(baccalaureate)

From c55d2a9c666227f0ee3030d72e89e708fa622f10 Mon Sep 17 00:00:00 2001
From: akeylg <graya4@rpi.edu>
Date: Fri, 4 Mar 2022 17:02:24 -0500
Subject: [PATCH 04/17] started to implement further scraper

---
 src/baccalaureate_scraper.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py
index 07f49c9..9ad1a78 100644
--- a/src/baccalaureate_scraper.py
+++ b/src/baccalaureate_scraper.py
@@ -10,7 +10,10 @@
 for z in ba_elements:
     major_title = z.text.strip()
     major_link = z.get('href')
-    baccalaureate[major_title] = {"description": " ", "requirements": " "}
+    baccalaureate[major_title] = {"description": " ", "requirements": " ", 
+    "years": [[], [], [], []], 
+    "other-content": {}}
+    baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " "}
     major_webpage = requests.get("http://catalog.rpi.edu/" + major_link)
     major_soup = BeautifulSoup(major_webpage.content, 'html5lib')
     major_table = major_soup.find("td", class_="block_content")
@@ -19,4 +22,13 @@
     description_tr = trs[0]
     requirement_tr = trs[3]
     baccalaureate[major_title]["description"] = description_tr
-    baccalaureate[major_title]["requirements"] = requirement_tr    
+    baccalaureate[major_title]["requirements"] = requirement_tr   
+    yearsHTML = requirement_tr.find("div", class_="custom_leftpad_20")
+    yeartitlelist = yearsHTML.find_all("div", class_ = "acalog-core")
+    yeardescriptionlist = yearsHTML.find_all("div", class_="custom_leftpad_20")
+    for x in range(0, len(yeartitlelist)):
+        baccalaureate[major_title]["years"][x].add(yeartitlelist[x])
+        baccalaureate[major_title]["years"][x].add(yeardescriptionlist[x])
+    
+    print("----------------------")
+    break

From b863e476098eafe2f847e4570bb3f893614a9f9a Mon Sep 17 00:00:00 2001
From: akeylg <graya4@rpi.edu>
Date: Tue, 15 Mar 2022 16:15:21 -0400
Subject: [PATCH 05/17] progress

---
 src/baccalaureate_scraper.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py
index 9ad1a78..8881750 100644
--- a/src/baccalaureate_scraper.py
+++ b/src/baccalaureate_scraper.py
@@ -26,9 +26,11 @@
     yearsHTML = requirement_tr.find("div", class_="custom_leftpad_20")
     yeartitlelist = yearsHTML.find_all("div", class_ = "acalog-core")
     yeardescriptionlist = yearsHTML.find_all("div", class_="custom_leftpad_20")
+    #print(yeartitlelist)
+    print(yeardescriptionlist)
     for x in range(0, len(yeartitlelist)):
-        baccalaureate[major_title]["years"][x].add(yeartitlelist[x])
-        baccalaureate[major_title]["years"][x].add(yeardescriptionlist[x])
+        baccalaureate[major_title]["years"][x].append(yeartitlelist[x])
+        baccalaureate[major_title]["years"][x].append(yeardescriptionlist[x])
     
     print("----------------------")
     break

From 123cc8f189a7cb161b10ec5e7ad2f95a24490c6a Mon Sep 17 00:00:00 2001
From: akeylg <graya4@rpi.edu>
Date: Tue, 15 Mar 2022 16:32:28 -0400
Subject: [PATCH 06/17] year raw HTML now works

---
 src/baccalaureate_scraper.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py
index 8881750..b17b508 100644
--- a/src/baccalaureate_scraper.py
+++ b/src/baccalaureate_scraper.py
@@ -24,13 +24,22 @@
     baccalaureate[major_title]["description"] = description_tr
     baccalaureate[major_title]["requirements"] = requirement_tr   
     yearsHTML = requirement_tr.find("div", class_="custom_leftpad_20")
-    yeartitlelist = yearsHTML.find_all("div", class_ = "acalog-core")
-    yeardescriptionlist = yearsHTML.find_all("div", class_="custom_leftpad_20")
-    #print(yeartitlelist)
-    print(yeardescriptionlist)
-    for x in range(0, len(yeartitlelist)):
-        baccalaureate[major_title]["years"][x].append(yeartitlelist[x])
-        baccalaureate[major_title]["years"][x].append(yeardescriptionlist[x])
+    yeartitlelist = yearsHTML.find("div", class_ = "acalog-core")
+    yeardescriptionlist = yearsHTML.find("div", class_="custom_leftpad_20")
+    count = 0
+    while(yeartitlelist != None):
+        if count > 3:
+            break
+        baccalaureate[major_title]["years"][count].append(yeartitlelist)
+        baccalaureate[major_title]["years"][count].append(yeartitlelist.next_sibling)
+        yeartitlelist = yeartitlelist.next_sibling.next_sibling
+        count += 1
+    for x in baccalaureate[major_title]["years"]:
+        print(x)
+        print("--------------------------------------")
+    #for x in range(0, len(yeartitlelist)):
+        #baccalaureate[major_title]["years"][x].append(yeartitlelist[x])
+        #baccalaureate[major_title]["years"][x].append(yeardescriptionlist[x])
     
     print("----------------------")
     break

From e382e9c54c9704b67c01d2b4537f5e17118a87c7 Mon Sep 17 00:00:00 2001
From: akeylg <graya4@rpi.edu>
Date: Tue, 15 Mar 2022 16:44:02 -0400
Subject: [PATCH 07/17] proto year description working, still needs work

---
 src/baccalaureate_scraper.py | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py
index b17b508..b1a6d8d 100644
--- a/src/baccalaureate_scraper.py
+++ b/src/baccalaureate_scraper.py
@@ -11,7 +11,7 @@
     major_title = z.text.strip()
     major_link = z.get('href')
     baccalaureate[major_title] = {"description": " ", "requirements": " ", 
-    "years": [[], [], [], []], 
+    "years": [[], [], [], [], []], 
     "other-content": {}}
     baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " "}
     major_webpage = requests.get("http://catalog.rpi.edu/" + major_link)
@@ -24,22 +24,20 @@
     baccalaureate[major_title]["description"] = description_tr
     baccalaureate[major_title]["requirements"] = requirement_tr   
     yearsHTML = requirement_tr.find("div", class_="custom_leftpad_20")
-    yeartitlelist = yearsHTML.find("div", class_ = "acalog-core")
-    yeardescriptionlist = yearsHTML.find("div", class_="custom_leftpad_20")
+    if yearsHTML == None:
+        continue 
+    yearlist = yearsHTML.find("div", class_ = "acalog-core")
     count = 0
-    while(yeartitlelist != None):
+    while(yearlist != None):
         if count > 3:
             break
-        baccalaureate[major_title]["years"][count].append(yeartitlelist)
-        baccalaureate[major_title]["years"][count].append(yeartitlelist.next_sibling)
-        yeartitlelist = yeartitlelist.next_sibling.next_sibling
+        baccalaureate[major_title]["years"][count].append(yearlist)
+        baccalaureate[major_title]["years"][count].append(yearlist.next_sibling)
+        if yearlist.next_sibling == None or yearlist.next_sibling.next_sibing == None:
+            break
+        yearlist = yearlist.next_sibling.next_sibling
         count += 1
-    for x in baccalaureate[major_title]["years"]:
-        print(x)
-        print("--------------------------------------")
-    #for x in range(0, len(yeartitlelist)):
-        #baccalaureate[major_title]["years"][x].append(yeartitlelist[x])
-        #baccalaureate[major_title]["years"][x].append(yeardescriptionlist[x])
+    
     
     print("----------------------")
     break

From 3eef023c0aa4f3dbb3f66737a9922e9b670dc714 Mon Sep 17 00:00:00 2001
From: akeylg <graya4@rpi.edu>
Date: Tue, 22 Mar 2022 16:57:55 -0400
Subject: [PATCH 08/17] fixed some stuff and implemented footnotes

---
 src/baccalaureate_scraper.py | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py
index b1a6d8d..e096da8 100644
--- a/src/baccalaureate_scraper.py
+++ b/src/baccalaureate_scraper.py
@@ -1,5 +1,6 @@
-from tracemalloc import start
+from cgi import test
 from bs4 import BeautifulSoup
+from numpy import equal
 import requests
 r = requests.get("http://catalog.rpi.edu/content.php?catoid=22&navoid=542")
 baccalaureate = {}
@@ -7,13 +8,16 @@
 table = soup.find("td", class_="block_content")
 major_elements = table.find_all("ul", class_="program-list")
 ba_elements = major_elements[0].find_all("a")
+
 for z in ba_elements:
     major_title = z.text.strip()
     major_link = z.get('href')
+    
     baccalaureate[major_title] = {"description": " ", "requirements": " ", 
     "years": [[], [], [], [], []], 
     "other-content": {}}
     baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " "}
+    
     major_webpage = requests.get("http://catalog.rpi.edu/" + major_link)
     major_soup = BeautifulSoup(major_webpage.content, 'html5lib')
     major_table = major_soup.find("td", class_="block_content")
@@ -24,20 +28,30 @@
     baccalaureate[major_title]["description"] = description_tr
     baccalaureate[major_title]["requirements"] = requirement_tr   
     yearsHTML = requirement_tr.find("div", class_="custom_leftpad_20")
+    
     if yearsHTML == None:
         continue 
     yearlist = yearsHTML.find("div", class_ = "acalog-core")
     count = 0
+    yearcount = 0
+    print(yearsHTML)
+    print("---------------------------")
     while(yearlist != None):
-        if count > 3:
-            break
+        print(yearlist)
+        try:
+            testString = yearlist.h2.a
+            if "Footnotes" in testString:
+                baccalaureate[major_title]["other_content"]["footnotes"].append(yearlist)
+        except:
+            print("no h2")
         baccalaureate[major_title]["years"][count].append(yearlist)
-        baccalaureate[major_title]["years"][count].append(yearlist.next_sibling)
-        if yearlist.next_sibling == None or yearlist.next_sibling.next_sibing == None:
-            break
-        yearlist = yearlist.next_sibling.next_sibling
-        count += 1
-    
+        yearlist = yearlist.next_sibling
+        yearcount += 1
+        if yearcount > 1:
+            yearcount = 0
+            count += 1
+        print("-------")
     
+    #print(baccalaureate[major_title]["years"])
     print("----------------------")
-    break
+    break
\ No newline at end of file

From b9470c302af7c3fb09a3c6291fb451f94771cb91 Mon Sep 17 00:00:00 2001
From: akeylg <graya4@rpi.edu>
Date: Tue, 22 Mar 2022 17:06:24 -0400
Subject: [PATCH 09/17] added options and transfer credit policy

---
 src/baccalaureate_scraper.py | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py
index e096da8..7d7493c 100644
--- a/src/baccalaureate_scraper.py
+++ b/src/baccalaureate_scraper.py
@@ -34,17 +34,26 @@
     yearlist = yearsHTML.find("div", class_ = "acalog-core")
     count = 0
     yearcount = 0
-    print(yearsHTML)
-    print("---------------------------")
+    #print(yearsHTML)
+    #print("---------------------------")
     while(yearlist != None):
         print(yearlist)
-        try:
-            testString = yearlist.h2.a
-            if "Footnotes" in testString:
-                baccalaureate[major_title]["other_content"]["footnotes"].append(yearlist)
-        except:
-            print("no h2")
-        baccalaureate[major_title]["years"][count].append(yearlist)
+        if (count > 3 and major_title != "Architecture") or (count > 4 and major_title == "Architecture"):
+            try:
+                testString = yearlist.h2.a
+                print(testString)
+                if "Footnotes" in testString:
+                    baccalaureate[major_title]["other_content"]["footnotes"].append(yearlist)
+                if "Capstone" in testString:
+                    baccalaureate[major_title]["other_content"]["capstone"].append(yearlist)
+                if "Transfer Credit Policy" in testString:
+                    baccalaureate[major_title]["other_content"]["transfer_policy"].append(yearlist)
+                if "Options" in testString:
+                    baccalaureate[major_title]["other_content"]["options"].append(yearlist)
+            except:
+                print("no h2")
+        else:
+            baccalaureate[major_title]["years"][count].append(yearlist)
         yearlist = yearlist.next_sibling
         yearcount += 1
         if yearcount > 1:
@@ -53,5 +62,4 @@
         print("-------")
     
     #print(baccalaureate[major_title]["years"])
-    print("----------------------")
-    break
\ No newline at end of file
+    print("----------------------")
\ No newline at end of file

From 9f5e1689e51b0d9365caf83f587dfc7ae70adbf9 Mon Sep 17 00:00:00 2001
From: akeylg <graya4@rpi.edu>
Date: Tue, 22 Mar 2022 17:10:46 -0400
Subject: [PATCH 10/17] added list for misc information in case a major has one

---
 src/baccalaureate_scraper.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py
index 7d7493c..a1839d5 100644
--- a/src/baccalaureate_scraper.py
+++ b/src/baccalaureate_scraper.py
@@ -16,7 +16,7 @@
     baccalaureate[major_title] = {"description": " ", "requirements": " ", 
     "years": [[], [], [], [], []], 
     "other-content": {}}
-    baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " "}
+    baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " ", "misc": " "}
     
     major_webpage = requests.get("http://catalog.rpi.edu/" + major_link)
     major_soup = BeautifulSoup(major_webpage.content, 'html5lib')
@@ -37,21 +37,23 @@
     #print(yearsHTML)
     #print("---------------------------")
     while(yearlist != None):
-        print(yearlist)
+        #print(yearlist)
         if (count > 3 and major_title != "Architecture") or (count > 4 and major_title == "Architecture"):
             try:
                 testString = yearlist.h2.a
-                print(testString)
+                #print(testString)
                 if "Footnotes" in testString:
                     baccalaureate[major_title]["other_content"]["footnotes"].append(yearlist)
-                if "Capstone" in testString:
+                elif "Capstone" in testString:
                     baccalaureate[major_title]["other_content"]["capstone"].append(yearlist)
-                if "Transfer Credit Policy" in testString:
+                elif "Transfer Credit Policy" in testString:
                     baccalaureate[major_title]["other_content"]["transfer_policy"].append(yearlist)
-                if "Options" in testString:
+                elif "Options" in testString:
                     baccalaureate[major_title]["other_content"]["options"].append(yearlist)
+                else:
+                    baccalaureate[major_title]["other_content"]["misc"].append(yearlist)
             except:
-                print("no h2")
+                continue
         else:
             baccalaureate[major_title]["years"][count].append(yearlist)
         yearlist = yearlist.next_sibling
@@ -59,7 +61,7 @@
         if yearcount > 1:
             yearcount = 0
             count += 1
-        print("-------")
+        #print("-------")
     
     #print(baccalaureate[major_title]["years"])
-    print("----------------------")
\ No newline at end of file
+    #print("----------------------")
\ No newline at end of file

From fe911a0d6895eae3b3e9c9d3f6cbd962e9c21d5c Mon Sep 17 00:00:00 2001
From: akeylg <graya4@rpi.edu>
Date: Thu, 24 Mar 2022 01:04:45 -0400
Subject: [PATCH 11/17] fixed some other things

---
 src/baccalaureate_scraper.py | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py
index a1839d5..c98f374 100644
--- a/src/baccalaureate_scraper.py
+++ b/src/baccalaureate_scraper.py
@@ -16,7 +16,7 @@
     baccalaureate[major_title] = {"description": " ", "requirements": " ", 
     "years": [[], [], [], [], []], 
     "other-content": {}}
-    baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " ", "misc": " "}
+    baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " ", "misc": []}
     
     major_webpage = requests.get("http://catalog.rpi.edu/" + major_link)
     major_soup = BeautifulSoup(major_webpage.content, 'html5lib')
@@ -38,22 +38,23 @@
     #print("---------------------------")
     while(yearlist != None):
         #print(yearlist)
+        #print(major_title)
         if (count > 3 and major_title != "Architecture") or (count > 4 and major_title == "Architecture"):
             try:
-                testString = yearlist.h2.a
-                #print(testString)
+                #print(yearlist)
+                testString = yearlist.h2.a['name']
                 if "Footnotes" in testString:
-                    baccalaureate[major_title]["other_content"]["footnotes"].append(yearlist)
+                    baccalaureate[major_title]["other-content"]["footnotes"] = yearlist
                 elif "Capstone" in testString:
-                    baccalaureate[major_title]["other_content"]["capstone"].append(yearlist)
+                    baccalaureate[major_title]["other-content"]["capstone"] = yearlist
                 elif "Transfer Credit Policy" in testString:
-                    baccalaureate[major_title]["other_content"]["transfer_policy"].append(yearlist)
+                    baccalaureate[major_title]["other-content"]["transfer_policy"] = yearlist
                 elif "Options" in testString:
-                    baccalaureate[major_title]["other_content"]["options"].append(yearlist)
+                    baccalaureate[major_title]["other-content"]["options"] = yearlist
                 else:
-                    baccalaureate[major_title]["other_content"]["misc"].append(yearlist)
-            except:
-                continue
+                    baccalaureate[major_title]["other-content"]["misc"].append(yearlist)
+            except Exception as e:
+                baccalaureate[major_title]["other-content"]["misc"].append(yearlist)
         else:
             baccalaureate[major_title]["years"][count].append(yearlist)
         yearlist = yearlist.next_sibling
@@ -61,7 +62,9 @@
         if yearcount > 1:
             yearcount = 0
             count += 1
-        #print("-------")
+        #print("-------------------------------------------")
     
-    #print(baccalaureate[major_title]["years"])
-    #print("----------------------")
\ No newline at end of file
+for x in baccalaureate:
+    print(baccalaureate[x])
+    print("-------------------------------------------------------")
+    break

From b5c3b228b6f36ed287ba60c5a6c31ac26cff730c Mon Sep 17 00:00:00 2001
From: akeylg <graya4@rpi.edu>
Date: Thu, 24 Mar 2022 01:39:06 -0400
Subject: [PATCH 12/17] added notes for each part to make things clear

---
 src/baccalaureate_scraper.py | 34 +++++++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py
index c98f374..e75d0ef 100644
--- a/src/baccalaureate_scraper.py
+++ b/src/baccalaureate_scraper.py
@@ -1,7 +1,7 @@
-from cgi import test
 from bs4 import BeautifulSoup
-from numpy import equal
 import requests
+import time
+time_start = time.time()
 r = requests.get("http://catalog.rpi.edu/content.php?catoid=22&navoid=542")
 baccalaureate = {}
 soup = BeautifulSoup(r.content, 'html5lib')
@@ -10,39 +10,55 @@
 ba_elements = major_elements[0].find_all("a")
 
 for z in ba_elements:
+    #grabbing major title as well as the link to the major page
     major_title = z.text.strip()
     major_link = z.get('href')
     
+    #defining the inner values of the mega dictionary
     baccalaureate[major_title] = {"description": " ", "requirements": " ", 
     "years": [[], [], [], [], []], 
     "other-content": {}}
     baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " ", "misc": []}
     
+    #getting the HTML from the webpage for the individual major
     major_webpage = requests.get("http://catalog.rpi.edu/" + major_link)
     major_soup = BeautifulSoup(major_webpage.content, 'html5lib')
+
+    #parsing through the large HTML from the webpage in order to  get the description of the major as well as the massive list of requirements
     major_table = major_soup.find("td", class_="block_content")
     major_table_def = major_table.find("table", class_="table_default")
     trs = major_table_def.find_all("tr")
     description_tr = trs[0]
     requirement_tr = trs[3]
     baccalaureate[major_title]["description"] = description_tr
-    baccalaureate[major_title]["requirements"] = requirement_tr   
-    yearsHTML = requirement_tr.find("div", class_="custom_leftpad_20")
+    baccalaureate[major_title]["requirements"] = requirement_tr  
     
+    #yearHTML is all of the raw description HTML for the entire major.
+    yearsHTML = requirement_tr.find("div", class_="custom_leftpad_20")
     if yearsHTML == None:
         continue 
+
+    #yearHTML will be used to get the raw HTML for each year.
     yearlist = yearsHTML.find("div", class_ = "acalog-core")
+    
+    #count and yearcount will be used to  put the raw HTML in each value of the years array.
     count = 0
     yearcount = 0
     #print(yearsHTML)
     #print("---------------------------")
+
+    #if there are no more HTML chunks, then the loop ends
     while(yearlist != None):
         #print(yearlist)
         #print(major_title)
+        
+        #checks to see if there are any extra bits of HTML after getting through the inital 4/5 years.
         if (count > 3 and major_title != "Architecture") or (count > 4 and major_title == "Architecture"):
             try:
                 #print(yearlist)
                 testString = yearlist.h2.a['name']
+
+                #checks to see if they fall into a specific category that we defined in the mega dictionary
                 if "Footnotes" in testString:
                     baccalaureate[major_title]["other-content"]["footnotes"] = yearlist
                 elif "Capstone" in testString:
@@ -57,14 +73,22 @@
                 baccalaureate[major_title]["other-content"]["misc"].append(yearlist)
         else:
             baccalaureate[major_title]["years"][count].append(yearlist)
+        
+        #switches to the next HTML chunk
         yearlist = yearlist.next_sibling
+        
+        #yearcount increases by 1 until it is 2. If it's greater than 2, yearcount goes to 0 and count increases by 1.
+        #this is to ensure that each year gets not only the title HTML, but the information for that year as well.
         yearcount += 1
         if yearcount > 1:
             yearcount = 0
             count += 1
         #print("-------------------------------------------")
     
+#printing all the information in the mass dictionary
 for x in baccalaureate:
     print(baccalaureate[x])
     print("-------------------------------------------------------")
-    break
+
+#runtime
+print(time.time() - time_start)
\ No newline at end of file

From e214e471b670e623e22b9a8b46ddb260a2d63ebd Mon Sep 17 00:00:00 2001
From: akeylg <graya4@rpi.edu>
Date: Fri, 25 Mar 2022 16:08:36 -0400
Subject: [PATCH 13/17] commented out some debug outputs and further commented

---
 src/baccalaureate_scraper.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py
index e75d0ef..a1a5456 100644
--- a/src/baccalaureate_scraper.py
+++ b/src/baccalaureate_scraper.py
@@ -1,6 +1,7 @@
 from bs4 import BeautifulSoup
 import requests
 import time
+
 time_start = time.time()
 r = requests.get("http://catalog.rpi.edu/content.php?catoid=22&navoid=542")
 baccalaureate = {}
@@ -44,17 +45,17 @@
     #count and yearcount will be used to  put the raw HTML in each value of the years array.
     count = 0
     yearcount = 0
-    #print(yearsHTML)
-    #print("---------------------------")
 
     #if there are no more HTML chunks, then the loop ends
     while(yearlist != None):
+        #debug print
         #print(yearlist)
         #print(major_title)
         
         #checks to see if there are any extra bits of HTML after getting through the inital 4/5 years.
         if (count > 3 and major_title != "Architecture") or (count > 4 and major_title == "Architecture"):
             try:
+                #debug print
                 #print(yearlist)
                 testString = yearlist.h2.a['name']
 
@@ -83,12 +84,11 @@
         if yearcount > 1:
             yearcount = 0
             count += 1
-        #print("-------------------------------------------")
-    
-#printing all the information in the mass dictionary
-for x in baccalaureate:
-    print(baccalaureate[x])
-    print("-------------------------------------------------------")
 
-#runtime
+#printing all the information in the mass dictionary (DEBUG)
+#for x in baccalaureate:
+    #print(baccalaureate[x])
+    #print("-------------------------------------------------------")
+
+#runtime (DEBUG)
 print(time.time() - time_start)
\ No newline at end of file

From 0952edba25906a78855b42cb1e8c65c385d81b07 Mon Sep 17 00:00:00 2001
From: akeylg <graya4@rpi.edu>
Date: Sat, 2 Apr 2022 10:51:23 -0400
Subject: [PATCH 14/17] prototype parsed baccalaureate scraper

---
 file.txt                     | 192 +++++++++++++++++++++++++++++
 src/baccalaureate_scraper.py | 232 +++++++++++++++++++++++------------
 2 files changed, 347 insertions(+), 77 deletions(-)
 create mode 100644 file.txt

diff --git a/file.txt b/file.txt
new file mode 100644
index 0000000..fc0b523
--- /dev/null
+++ b/file.txt
@@ -0,0 +1,192 @@
+Accelerated Science, Technology, and Society (Law) B.S./J.D.
+<tr>
+		<td colspan="4">
+			<table class="table_default">
+				<tbody><tr>
+					<td>
+						<div class="help_block"><a class="help acalog-highlight-ignore" href="help.php" onclick="acalogPopup('help.php', 'help', 770, 530, 'yes');return false;" target="_blank"><strong>HELP</strong></a></div>
+	<span class="acalog_catalog_name">Rensselaer Catalog 2021-2022</span>																	<br/>
+	    <h1 id="acalog-content">Accelerated Science, Technology, and Society (Law) B.S./J.D.</h1>
+				<div style="float: right"><a alt="Text Version" class="print_link acalog-highlight-ignore" href="/preview_program.php?catoid=22&amp;poid=5522&amp;returnto=542&amp;print" onclick="acalogPopup('/preview_program.php?catoid=22&amp;poid=5522&amp;returnto=542&amp;print', 'print_preview', 770, 530, 'yes');return false;" rel="nofollow" target="_blank" title="Print-Friendly Page (opens a new window)"><span class="sr-only">Print-Friendly Page (opens a new window)</span></a></div><div class="acalog-social-media-links float_right"> </div>
+			<div align="left">
+				<a class="portfolio_link acalog-highlight-ignore acalog-icon" href="javascript:void(0)" onclick="acalogPopup('portfolio.php?catoid=22&amp;add=1&amp;poid=5522#programs', 'portfolio', 770, 530, 'yes');return false;" target="_blank" title="Add to Portfolio (opens a new window)">
+				</a>
+			</div>																</td>
+															</tr>
+															<tr>
+																<td>
+																	<hr/>
+																</td>
+															</tr>
+														</tbody></table>
+		<p class="acalog-breadcrumb acalog-highlight-ignore"><img alt="Return to {$returnto_text}" class="return-to" src="/return.gif"/> Return to: <a href="content.php?catoid=22&amp;navoid=542">Programs</a></p><p>This template is designed to advise students participating in the accelerated STSO/Law (B.S./J.D.) program (SSLW). The degree requirements are the same as the B.S. in STSO, however, this template provides for accelerated completion of STSO major requirements in the program�s first three years (at Rensselaer) as well as the coursework of any first-year law program that allows for remote-completion of the requirements of the B.S. in STSO.</p>
+													</td>
+												</tr>
+<tr>
+													<td class="width" colspan="4">
+		<div class="custom_leftpad_20"><div class="acalog-core"><h2><a name="FirstYear"></a><a id="core_34896" name="firstyear"></a>First Year</h2><hr/></div><div class="custom_leftpad_20"><div class="acalog-core"><h3><a name="Fall"></a><a id="core_34897" name="fall"></a>Fall</h3><hr/><ul>
+	<li>Science Core Elective�<em>Credit Hours: 4</em><br/>
+	(See footnote 6 below)</li>
+	<li>HASS Core Elective�<em>Credit Hours: 4</em><br/>
+	(See footnote 5 below)</li>
+</ul>
+<ul><li class="acalog-course"><span><a aria-expanded="false" href="#" onclick="showCourse('22', '44313',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~34897~;}'); return false;">STSO 1110 - Science, Technology, and Society</a> <em>Credit Hours:</em> <em>4</em></span></li><li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;"><p>or</p></li><li class="acalog-course"><span><a aria-expanded="false" href="#" onclick="showCourse('22', '43961',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~34897~;}'); return false;">MATH 1500 - Calculus for Architecture, Management, and HASS</a> <em>Credit Hours:</em> <em>4</em></span></li><li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;"><p>or</p></li><li class="acalog-course"><span><a aria-expanded="false" href="#" onclick="showCourse('22', '43959',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~34897~;}'); return false;">MATH 1010 - Calculus I</a> <em>Credit Hours:</em> <em>4</em></span></li><li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;"><p>(See footnote 6 below)</p></li></ul></div><div class="acalog-core"><h3><a name="Spring"></a><a id="core_34898" name="spring"></a>Spring</h3><hr/><ul>
+	<li>Science Core Elective�<em>Credit Hours: 4</em><br/>
+	(See footnote 6 below)</li>
+	<li>HASS Core Elective�<em>Credit Hours: 4</em><br/>
+	(See footnote 5 below)</li>
+	<li>Intermediate STS course�<em>Credit Hours: 4</em><br/>
+	(See footnote 2 below)</li>
+</ul>
+<ul><li class="acalog-course"><span><a aria-expanded="false" href="#" onclick="showCourse('22', '43962',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~34898~;}'); return false;">MATH 1520 - Mathematical Methods in Management and Economics</a> <em>Credit Hours:</em> <em>4</em></span></li><li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;"><p>or</p></li><li class="acalog-course"><span><a aria-expanded="false" href="#" onclick="showCourse('22', '43960',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~34898~;}'); return false;">MATH 1020 - Calculus II</a> <em>Credit Hours:</em> <em>4</em></span></li></ul></div></div><div class="acalog-core"><h2><a name="SecondYear"></a><a id="core_34899" name="secondyear"></a>Second Year</h2><hr/></div><div class="custom_leftpad_20"><div class="acalog-core"><h3><a name="Fall"></a><a id="core_34900" name="fall"></a>Fall</h3><hr/><ul>
+	<li>HASS Core Elective�<em>Credit Hours: 4</em><br/>
+	(See footnote 5 below)</li>
+	<li>Science Core Elective�<em>Credit Hours: 4</em><br/>
+	(See footnote 6 below)</li>
+	<li>CAS Course�<em>Credit Hours: 4</em><br/>
+	(See footnote 7 below)</li>
+</ul>
+<ul><li class="acalog-course"><span><a aria-expanded="false" href="#" onclick="showCourse('22', '44872',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~34900~;}'); return false;">STSO 2100 - Investigating Society</a> <em>Credit Hours:</em> <em>4</em></span></li></ul></div><div class="acalog-core"><h3><a name="Spring"></a><a id="core_34901" name="spring"></a>Spring</h3><hr/><ul>
+	<li>CAS Course�<em>Credit Hours: 4</em></li>
+	<li>HASS Core Elective�<em>Credit Hours: 4</em><br/>
+	(See footnote 5 below)</li>
+	<li>Science Core Elective�<em>Credit Hours: 4</em><br/>
+	(See footnote 6 below)</li>
+	<li>Advanced STS Course�<em>Credit Hours: 4</em><br/>
+	(See footnote 3 below)</li>
+</ul>
+</div></div><div class="acalog-core"><h2><a name="ThirdYear"></a><a id="core_34902" name="thirdyear"></a>Third Year</h2><hr/><p><em>Note: The third year of this accelerated degree program is the last year in which courses are taken at Rensselaer. As such, students in the accelerated degree program are exempt from the Arch summer semester and away semester experience</em>.�</p>
+</div><div class="custom_leftpad_20"><div class="acalog-core"><h3><a name="Fall"></a><a id="core_34903" name="fall"></a>Fall</h3><hr/><ul>
+	<li>CAS Course�<em>Credit Hours: 4</em></li>
+	<li>Advanced STS Course�<em>Credit Hours: 4</em></li>
+</ul>
+<ul><li class="acalog-course"><span><a aria-expanded="false" href="#" onclick="showCourse('22', '44870',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~34903~;}'); return false;">STSO 4980 - Research Design</a> <em>Credit Hours:</em> <em>4</em></span></li><li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;"><p>(See footnote 4 below)</p></li></ul></div><div class="acalog-core"><h3><a name="Spring"></a><a id="core_34904" name="spring"></a>Spring</h3><hr/><ul>
+	<li>HASS Core Elective�<em>Credit Hours: 4</em><br/>
+	(See footnote 5 below)</li>
+	<li>CAS Course�<em>Credit Hours: 4</em></li>
+</ul>
+<ul><li class="acalog-course"><span><a aria-expanded="false" href="#" onclick="showCourse('22', '44334',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~34904~;}'); return false;">STSO 4990 - STS and Sustainability Senior Project</a> <em>Credit Hours:</em> <em>4</em></span></li></ul></div></div><div class="acalog-core"><h2><a name="FourthYear"></a><a id="core_34905" name="fourthyear"></a>Fourth Year</h2><hr/><p><em>Courses this year will be taken at law school�as part of the first-year curriculum of a J.D. and will complete STSO�s B.S. requirements.</em></p>
+</div><div class="custom_leftpad_20"><div class="acalog-core"><h3><a name="Fall"></a><a id="core_34906" name="fall"></a>Fall</h3><hr/><ul>
+	<li>Federal Civil Procedure�<em>Credit Hours: 4</em></li>
+	<li>Contracts�C<em>redit Hours: 3</em></li>
+	<li>Property I�<em>Credit Hours: 2</em></li>
+	<li>Torts�<em>Credit Hours: 4</em></li>
+	<li>Introduction to Lawyering�<em>Credit Hours: 3</em></li>
+</ul>
+</div><div class="acalog-core"><h3><a name="Spring"></a><a id="core_34907" name="spring"></a>Spring</h3><hr/><ul>
+	<li>Constitutional Law�<em>Credit Hours: 4</em><br/>
+	(Course applied to HASS Core Elective)</li>
+	<li>Criminal Law�<em>Credit Hours: 3</em></li>
+	<li>Property II�<em>Credit Hours: 4</em></li>
+	<li>Introduction to Lawyering�<em>Credit Hours: 3</em></li>
+	<li>Contracts�<em>Credit Hours: 2</em></li>
+</ul>
+</div></div><div class="acalog-core"><h2><a name="Footnotes"></a><a id="core_34908" name="footnotes"></a>Footnotes</h2><hr/><ol start="1">
+	<li><em>STSS/STSH 1110 Science, Technology, and Society may be substituted with ITWS 1220 IT and Society.</em></li>
+	<li><em>Intermediate STS course options include STSS 2300 Environment and Society�and other STSS or STSH courses at the 2000-level.</em></li>
+	<li><em>Advanced STS Courses include STSH 4510 History of American Technology and other STSS or STSH courses at the 4000-level.</em></li>
+	<li><em>STSS 4980 Research Design is a prerequisite for STSS/STSH 4990 STS and Sustainability Senior Project.</em></li>
+	<li><em>All Rensselaer students must complete a HASS Core. This includes a communication intensive course, a HASS Inquiry course, and a 12-credit integrative pathway. In the case of the Accelerated STSO�program (SSLW), students complete 20 credits of their HASS Core requirements at Rensselaer and the final 4 credits at law school through their Constitutional Law course.�</em></li>
+	<li><em>All Rensselaer students must complete a 24-credit Science Core.�This includes at least 2 mathematics courses (8 credits), such as MATH 1010, MATH 1020, MATH 1500, MATH 1520, or others.</em></li>
+	<li><em>The 16-credit Complementary Area of Study (CAS) provides STSO majors with a depth of understanding from an external field that will inform their STS research and coursework. The CAS courses should come from a single department (not STS). Two courses should be taken at the 4000-level.</em></li>
+</ol>
+
+<p>�</p>
+</div></div>													</td>
+												</tr>
+[[<div class="acalog-core"><h2><a name="FirstYear"></a><a id="core_34896" name="firstyear"></a>First Year</h2><hr/></div>, <div class="custom_leftpad_20"><div class="acalog-core"><h3><a name="Fall"></a><a id="core_34897" name="fall"></a>Fall</h3><hr/><ul>
+	<li>Science Core Elective�<em>Credit Hours: 4</em><br/>
+	(See footnote 6 below)</li>
+	<li>HASS Core Elective�<em>Credit Hours: 4</em><br/>
+	(See footnote 5 below)</li>
+</ul>
+<ul><li class="acalog-course"><span><a aria-expanded="false" href="#" onclick="showCourse('22', '44313',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~34897~;}'); return false;">STSO 1110 - Science, Technology, and Society</a> <em>Credit Hours:</em> <em>4</em></span></li><li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;"><p>or</p></li><li class="acalog-course"><span><a aria-expanded="false" href="#" onclick="showCourse('22', '43961',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~34897~;}'); return false;">MATH 1500 - Calculus for Architecture, Management, and HASS</a> <em>Credit Hours:</em> <em>4</em></span></li><li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;"><p>or</p></li><li class="acalog-course"><span><a aria-expanded="false" href="#" onclick="showCourse('22', '43959',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~34897~;}'); return false;">MATH 1010 - Calculus I</a> <em>Credit Hours:</em> <em>4</em></span></li><li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;"><p>(See footnote 6 below)</p></li></ul></div><div class="acalog-core"><h3><a name="Spring"></a><a id="core_34898" name="spring"></a>Spring</h3><hr/><ul>
+	<li>Science Core Elective�<em>Credit Hours: 4</em><br/>
+	(See footnote 6 below)</li>
+	<li>HASS Core Elective�<em>Credit Hours: 4</em><br/>
+	(See footnote 5 below)</li>
+	<li>Intermediate STS course�<em>Credit Hours: 4</em><br/>
+	(See footnote 2 below)</li>
+</ul>
+<ul><li class="acalog-course"><span><a aria-expanded="false" href="#" onclick="showCourse('22', '43962',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~34898~;}'); return false;">MATH 1520 - Mathematical Methods in Management and Economics</a> <em>Credit Hours:</em> <em>4</em></span></li><li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;"><p>or</p></li><li class="acalog-course"><span><a aria-expanded="false" href="#" onclick="showCourse('22', '43960',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~34898~;}'); return false;">MATH 1020 - Calculus II</a> <em>Credit Hours:</em> <em>4</em></span></li></ul></div></div>], [<div class="acalog-core"><h2><a name="SecondYear"></a><a id="core_34899" name="secondyear"></a>Second Year</h2><hr/></div>, <div class="custom_leftpad_20"><div class="acalog-core"><h3><a name="Fall"></a><a id="core_34900" name="fall"></a>Fall</h3><hr/><ul>
+	<li>HASS Core Elective�<em>Credit Hours: 4</em><br/>
+	(See footnote 5 below)</li>
+	<li>Science Core Elective�<em>Credit Hours: 4</em><br/>
+	(See footnote 6 below)</li>
+	<li>CAS Course�<em>Credit Hours: 4</em><br/>
+	(See footnote 7 below)</li>
+</ul>
+<ul><li class="acalog-course"><span><a aria-expanded="false" href="#" onclick="showCourse('22', '44872',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~34900~;}'); return false;">STSO 2100 - Investigating Society</a> <em>Credit Hours:</em> <em>4</em></span></li></ul></div><div class="acalog-core"><h3><a name="Spring"></a><a id="core_34901" name="spring"></a>Spring</h3><hr/><ul>
+	<li>CAS Course�<em>Credit Hours: 4</em></li>
+	<li>HASS Core Elective�<em>Credit Hours: 4</em><br/>
+	(See footnote 5 below)</li>
+	<li>Science Core Elective�<em>Credit Hours: 4</em><br/>
+	(See footnote 6 below)</li>
+	<li>Advanced STS Course�<em>Credit Hours: 4</em><br/>
+	(See footnote 3 below)</li>
+</ul>
+</div></div>], [<div class="acalog-core"><h2><a name="ThirdYear"></a><a id="core_34902" name="thirdyear"></a>Third Year</h2><hr/><p><em>Note: The third year of this accelerated degree program is the last year in which courses are taken at Rensselaer. As such, students in the accelerated degree program are exempt from the Arch summer semester and away semester experience</em>.�</p>
+</div>, <div class="custom_leftpad_20"><div class="acalog-core"><h3><a name="Fall"></a><a id="core_34903" name="fall"></a>Fall</h3><hr/><ul>
+	<li>CAS Course�<em>Credit Hours: 4</em></li>
+	<li>Advanced STS Course�<em>Credit Hours: 4</em></li>
+</ul>
+<ul><li class="acalog-course"><span><a aria-expanded="false" href="#" onclick="showCourse('22', '44870',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~34903~;}'); return false;">STSO 4980 - Research Design</a> <em>Credit Hours:</em> <em>4</em></span></li><li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;"><p>(See footnote 4 below)</p></li></ul></div><div class="acalog-core"><h3><a name="Spring"></a><a id="core_34904" name="spring"></a>Spring</h3><hr/><ul>
+	<li>HASS Core Elective�<em>Credit Hours: 4</em><br/>
+	(See footnote 5 below)</li>
+	<li>CAS Course�<em>Credit Hours: 4</em></li>
+</ul>
+<ul><li class="acalog-course"><span><a aria-expanded="false" href="#" onclick="showCourse('22', '44334',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~34904~;}'); return false;">STSO 4990 - STS and Sustainability Senior Project</a> <em>Credit Hours:</em> <em>4</em></span></li></ul></div></div>], [<div class="acalog-core"><h2><a name="FourthYear"></a><a id="core_34905" name="fourthyear"></a>Fourth Year</h2><hr/><p><em>Courses this year will be taken at law school�as part of the first-year curriculum of a J.D. and will complete STSO�s B.S. requirements.</em></p>
+</div>, <div class="custom_leftpad_20"><div class="acalog-core"><h3><a name="Fall"></a><a id="core_34906" name="fall"></a>Fall</h3><hr/><ul>
+	<li>Federal Civil Procedure�<em>Credit Hours: 4</em></li>
+	<li>Contracts�C<em>redit Hours: 3</em></li>
+	<li>Property I�<em>Credit Hours: 2</em></li>
+	<li>Torts�<em>Credit Hours: 4</em></li>
+	<li>Introduction to Lawyering�<em>Credit Hours: 3</em></li>
+</ul>
+</div><div class="acalog-core"><h3><a name="Spring"></a><a id="core_34907" name="spring"></a>Spring</h3><hr/><ul>
+	<li>Constitutional Law�<em>Credit Hours: 4</em><br/>
+	(Course applied to HASS Core Elective)</li>
+	<li>Criminal Law�<em>Credit Hours: 3</em></li>
+	<li>Property II�<em>Credit Hours: 4</em></li>
+	<li>Introduction to Lawyering�<em>Credit Hours: 3</em></li>
+	<li>Contracts�<em>Credit Hours: 2</em></li>
+</ul>
+</div></div>], []]
+{'options': ' ', 'capstone': ' ', 'transfer_policy': ' ', 'footnotes': <div class="acalog-core"><h2><a name="Footnotes"></a><a id="core_34908" name="footnotes"></a>Footnotes</h2><hr/><ol start="1">
+	<li><em>STSS/STSH 1110 Science, Technology, and Society may be substituted with ITWS 1220 IT and Society.</em></li>
+	<li><em>Intermediate STS course options include STSS 2300 Environment and Society�and other STSS or STSH courses at the 2000-level.</em></li>
+	<li><em>Advanced STS Courses include STSH 4510 History of American Technology and other STSS or STSH courses at the 4000-level.</em></li>
+	<li><em>STSS 4980 Research Design is a prerequisite for STSS/STSH 4990 STS and Sustainability Senior Project.</em></li>
+	<li><em>All Rensselaer students must complete a HASS Core. This includes a communication intensive course, a HASS Inquiry course, and a 12-credit integrative pathway. In the case of the Accelerated STSO�program (SSLW), students complete 20 credits of their HASS Core requirements at Rensselaer and the final 4 credits at law school through their Constitutional Law course.�</em></li>
+	<li><em>All Rensselaer students must complete a 24-credit Science Core.�This includes at least 2 mathematics courses (8 credits), such as MATH 1010, MATH 1020, MATH 1500, MATH 1520, or others.</em></li>
+	<li><em>The 16-credit Complementary Area of Study (CAS) provides STSO majors with a depth of understanding from an external field that will inform their STS research and coursework. The CAS courses should come from a single department (not STS). Two courses should be taken at the 4000-level.</em></li>
+</ol>
+
+<p>�</p>
+</div>, 'misc': []}
+Aeronautical Engineering Curriculum
+<tr>
+		<td colspan="4">
+			<table class="table_default">
+				<tbody><tr>
+					<td>
+						<div class="help_block"><a class="help acalog-highlight-ignore" href="help.php" onclick="acalogPopup('help.php', 'help', 770, 530, 'yes');return false;" target="_blank"><strong>HELP</strong></a></div>
+	<span class="acalog_catalog_name">Rensselaer Catalog 2021-2022</span>																	<br/>
+	    <h1 id="acalog-content">Aeronautical Engineering Curriculum</h1>
+				<div style="float: right"><a alt="Text Version" class="print_link acalog-highlight-ignore" href="/preview_program.php?catoid=22&amp;poid=5307&amp;hl=%22Aeronautical%22&amp;returnto=search&amp;print" onclick="acalogPopup('/preview_program.php?catoid=22&amp;poid=5307&amp;hl=%22Aeronautical%22&amp;returnto=search&amp;print', 'print_preview', 770, 530, 'yes');return false;" rel="nofollow" target="_blank" title="Print-Friendly Page (opens a new window)"><span class="sr-only">Print-Friendly Page (opens a new window)</span></a></div><div class="acalog-social-media-links float_right"> </div>
+			<div align="left">
+				<a class="portfolio_link acalog-highlight-ignore acalog-icon" href="javascript:void(0)" onclick="acalogPopup('portfolio.php?catoid=22&amp;add=1&amp;poid=5307#programs', 'portfolio', 770, 530, 'yes');return false;" target="_blank" title="Add to Portfolio (opens a new window)">
+				</a>
+			</div>																</td>
+															</tr>
+															<tr>
+																<td>
+																	<hr/>
+																</td>
+															</tr>
+														</tbody></table>
+		<p class="acalog-breadcrumb acalog-highlight-ignore"><img alt="Return to {$returnto_text}" class="return-to" src="/return.gif"/> Return to: <a href="content.php?catoid=22&amp;navoid=542">Programs</a></p><h3>Baccalaureate Programs</h3>
+
+<p>Freshmen or sophomores who have identified aeronautical engineering as their major may follow the baccalaureate program below in lieu of the general core engineering program.� Dual major programs which lead to a single baccalaureate degree embracing two fields are also available and are described in more detail in the MANE Handbook (emailed to you when you declare a major in MANE, and available from the MANE Student Services Office, JEC 2012).</p>
+													</td>
+												</tr>
diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py
index a1a5456..311845a 100644
--- a/src/baccalaureate_scraper.py
+++ b/src/baccalaureate_scraper.py
@@ -1,94 +1,172 @@
+from itertools import count
+from readline import insert_text
 from bs4 import BeautifulSoup
+from numpy import empty
 import requests
 import time
 
-time_start = time.time()
-r = requests.get("http://catalog.rpi.edu/content.php?catoid=22&navoid=542")
-baccalaureate = {}
-soup = BeautifulSoup(r.content, 'html5lib')
-table = soup.find("td", class_="block_content")
-major_elements = table.find_all("ul", class_="program-list")
-ba_elements = major_elements[0].find_all("a")
+def baccalurate_grab_html():
+    r = requests.get("http://catalog.rpi.edu/content.php?catoid=22&navoid=542")
+    soup = BeautifulSoup(r.content, 'html5lib')
+    table = soup.find("td", class_="block_content")
+    major_elements = table.find_all("ul", class_="program-list")
+    ba_elements = major_elements[0].find_all("a")
 
-for z in ba_elements:
-    #grabbing major title as well as the link to the major page
-    major_title = z.text.strip()
-    major_link = z.get('href')
-    
-    #defining the inner values of the mega dictionary
-    baccalaureate[major_title] = {"description": " ", "requirements": " ", 
-    "years": [[], [], [], [], []], 
-    "other-content": {}}
-    baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " ", "misc": []}
-    
-    #getting the HTML from the webpage for the individual major
-    major_webpage = requests.get("http://catalog.rpi.edu/" + major_link)
-    major_soup = BeautifulSoup(major_webpage.content, 'html5lib')
+    for z in ba_elements:
+        #grabbing major title as well as the link to the major page
+        major_title = z.text.strip()
+        major_link = z.get('href')
+        
+        #defining the inner values of the mega dictionary
+        baccalaureate_parsed[major_title] = {"description": " ", "requirements": " ", 
+        "years": [[], [], [], [], []], 
+        "other-content": {}}
+        baccalaureate_parsed[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " ", "misc": []}
 
-    #parsing through the large HTML from the webpage in order to  get the description of the major as well as the massive list of requirements
-    major_table = major_soup.find("td", class_="block_content")
-    major_table_def = major_table.find("table", class_="table_default")
-    trs = major_table_def.find_all("tr")
-    description_tr = trs[0]
-    requirement_tr = trs[3]
-    baccalaureate[major_title]["description"] = description_tr
-    baccalaureate[major_title]["requirements"] = requirement_tr  
-    
-    #yearHTML is all of the raw description HTML for the entire major.
-    yearsHTML = requirement_tr.find("div", class_="custom_leftpad_20")
-    if yearsHTML == None:
-        continue 
+        baccalaureate[major_title] = {"description": " ", "requirements": " ", 
+        "years": [[], [], [], [], []], 
+        "other-content": {}}
+        baccalaureate[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " ", "misc": []}
+        
+        #getting the HTML from the webpage for the individual major
+        major_webpage = requests.get("http://catalog.rpi.edu/" + major_link)
+        major_soup = BeautifulSoup(major_webpage.content, 'html5lib')
 
-    #yearHTML will be used to get the raw HTML for each year.
-    yearlist = yearsHTML.find("div", class_ = "acalog-core")
-    
-    #count and yearcount will be used to  put the raw HTML in each value of the years array.
-    count = 0
-    yearcount = 0
+        #parsing through the large HTML from the webpage in order to  get the description of the major as well as the massive list of requirements
+        major_table = major_soup.find("td", class_="block_content")
+        major_table_def = major_table.find("table", class_="table_default")
+        trs = major_table_def.find_all("tr")
+        description_tr = trs[0]
+        requirement_tr = trs[3]
+        baccalaureate[major_title]["description"] = description_tr
+        baccalaureate[major_title]["requirements"] = requirement_tr  
+        
+        #yearHTML is all of the raw description HTML for the entire major.
+        yearsHTML = requirement_tr.find("div", class_="custom_leftpad_20")
+        if yearsHTML == None:
+            continue 
 
-    #if there are no more HTML chunks, then the loop ends
-    while(yearlist != None):
-        #debug print
-        #print(yearlist)
-        #print(major_title)
+        #yearHTML will be used to get the raw HTML for each year.
+        yearlist = yearsHTML.find("div", class_ = "acalog-core")
         
-        #checks to see if there are any extra bits of HTML after getting through the inital 4/5 years.
-        if (count > 3 and major_title != "Architecture") or (count > 4 and major_title == "Architecture"):
-            try:
-                #debug print
-                #print(yearlist)
-                testString = yearlist.h2.a['name']
+        #count and yearcount will be used to  put the raw HTML in each value of the years array.
+        count = 0
+        yearcount = 0
+
+        #if there are no more HTML chunks, then the loop ends
+        while(yearlist != None):
+            #debug print
+            #print(yearlist)
+            #print(major_title)
+            
+            #checks to see if there are any extra bits of HTML after getting through the inital 4/5 years.
+            if (count > 3 and major_title != "Architecture") or (count > 4 and major_title == "Architecture"):
+                try:
+                    #debug print
+                    #print(yearlist)
+                    testString = yearlist.h2.a['name']
 
-                #checks to see if they fall into a specific category that we defined in the mega dictionary
-                if "Footnotes" in testString:
-                    baccalaureate[major_title]["other-content"]["footnotes"] = yearlist
-                elif "Capstone" in testString:
-                    baccalaureate[major_title]["other-content"]["capstone"] = yearlist
-                elif "Transfer Credit Policy" in testString:
-                    baccalaureate[major_title]["other-content"]["transfer_policy"] = yearlist
-                elif "Options" in testString:
-                    baccalaureate[major_title]["other-content"]["options"] = yearlist
-                else:
+                    #checks to see if they fall into a specific category that we defined in the mega dictionary
+                    if "Footnotes" in testString:
+                        baccalaureate[major_title]["other-content"]["footnotes"] = yearlist
+                    elif "Capstone" in testString:
+                        baccalaureate[major_title]["other-content"]["capstone"] = yearlist
+                    elif "Transfer Credit Policy" in testString:
+                        baccalaureate[major_title]["other-content"]["transfer_policy"] = yearlist
+                    elif "Options" in testString:
+                        baccalaureate[major_title]["other-content"]["options"] = yearlist
+                    else:
+                        baccalaureate[major_title]["other-content"]["misc"].append(yearlist)
+                except Exception as e:
                     baccalaureate[major_title]["other-content"]["misc"].append(yearlist)
+            else:
+                baccalaureate[major_title]["years"][count].append(yearlist)
+            
+            #switches to the next HTML chunk
+            yearlist = yearlist.next_sibling
+            
+            #yearcount increases by 1 until it is 2. If it's greater than 2, yearcount goes to 0 and count increases by 1.
+            #this is to ensure that each year gets not only the title HTML, but the information for that year as well.
+            yearcount += 1
+            if yearcount > 1:
+                yearcount = 0
+                count += 1
+
+def baccalurate_parse_html():
+    for x in baccalaureate:
+        baccalaureate_parsed[x]["description"] = baccalaureate[x]["description"].find_all("p")[1].string
+        yearcounter = 0
+        for y in baccalaureate[x]["years"]:
+            try:
+                #print(y)
+                #print(y[0].h2.a["name"])
+                baccalaureate_parsed[x]["years"][yearcounter] = []
+                baccalaureate_parsed[x]["years"][yearcounter].append(y[0].h2.a["name"])
+                baccalaureate_parsed[x]["years"][yearcounter].append([])
+                baccalaureate_parsed[x]["years"][yearcounter].append([])
+                #print(baccalaureate_parsed[x]["years"])
+                #print("------------------------------------------------")
+                #baccalaureate_parsed[x]["years"][1] = []
+                yearlist = y[1].find_all("div")
+                for z in yearlist:
+                    #print(z.find_all("li"))
+                    for i in z.find_all("li"):
+                        #print(i.get_text())
+                        if "or" == i.get_text():
+                            continue
+                        else:
+                            inserted_string = i.get_text().replace("\xa0", " ")
+                            inserted_string = inserted_string.replace("\n", " ")
+                            inserted_string = inserted_string.replace("\t", " ")
+                            footnote_value = " "
+                            footnote_found = False
+                            credit_hour_found = False
+                            credit_hour_value = ""
+                            if "(See footnote" in inserted_string:
+                                #print(inserted_string.index("(See footnote "))
+                                footnote_value = inserted_string[inserted_string.index("(See footnote ") + 14]
+                                inserted_string = inserted_string.replace("(See footnote " + footnote_value + " below)", "")
+                                footnote_found = True
+                            
+                            if "Credit Hours" in inserted_string:
+                                #print(inserted_string.index("Credit Hours: "))
+                                credit_hour_value = inserted_string[inserted_string.index("Credit Hours: ") + 14]
+                                #print(credit_hour_value)
+                                inserted_string = inserted_string.replace("Credit Hours: " + credit_hour_value, "")
+                                credit_hour_found = True  
+                            
+                            if footnote_found == True:
+                                inserted_string = inserted_string + "[FOOTNOTE: " + footnote_value + "]"
+                            if credit_hour_found == True:
+                                inserted_string = inserted_string + " [CREDIT HOURS: " + credit_hour_value + "]"
+                            if "Fall" in z.h3.a["name"]:
+                                baccalaureate_parsed[x]["years"][yearcounter][1].append(inserted_string)
+                            if "Spring" in z.h3.a["name"]:
+                                baccalaureate_parsed[x]["years"][yearcounter][2].append(inserted_string)
+                        #print("-----------------------")
+                
+                #print("------------------------------------------------")
+                yearcounter += 1
             except Exception as e:
-                baccalaureate[major_title]["other-content"]["misc"].append(yearlist)
-        else:
-            baccalaureate[major_title]["years"][count].append(yearlist)
-        
-        #switches to the next HTML chunk
-        yearlist = yearlist.next_sibling
-        
-        #yearcount increases by 1 until it is 2. If it's greater than 2, yearcount goes to 0 and count increases by 1.
-        #this is to ensure that each year gets not only the title HTML, but the information for that year as well.
-        yearcount += 1
-        if yearcount > 1:
-            yearcount = 0
-            count += 1
+                #print(e)
+                #print("------------------------------------------------")
+                continue
+        #print("------------------")
+        for y in baccalaureate[x]["other-content"]:
+            if len(baccalaureate[x]["other-content"][y]) == 0 or baccalaureate[x]["other-content"][y] == " ":
+                print("empty")
+            else:
+                print(baccalaureate[x]["other-content"][y])
+            print("--------------------")
+        break
 
 #printing all the information in the mass dictionary (DEBUG)
-#for x in baccalaureate:
-    #print(baccalaureate[x])
-    #print("-------------------------------------------------------")
+time_start = time.time()
+baccalaureate = {}
+baccalaureate_parsed = {}
+baccalurate_grab_html()
+baccalurate_parse_html()
+        
 
 #runtime (DEBUG)
 print(time.time() - time_start)
\ No newline at end of file

From 2da081dbd6f601dde690f4aa939bdadd7e2929a2 Mon Sep 17 00:00:00 2001
From: akeylg <graya4@rpi.edu>
Date: Sun, 10 Apr 2022 20:31:56 -0400
Subject: [PATCH 15/17] parser done minus some issues

---
 src/baccalaureate_scraper.py | 56 +++++++++++++++++++++++++++++-------
 src/course.py                |  7 +++--
 2 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py
index 311845a..52c070d 100644
--- a/src/baccalaureate_scraper.py
+++ b/src/baccalaureate_scraper.py
@@ -1,4 +1,4 @@
-from itertools import count
+
 from readline import insert_text
 from bs4 import BeautifulSoup
 from numpy import empty
@@ -17,11 +17,11 @@ def baccalurate_grab_html():
         major_title = z.text.strip()
         major_link = z.get('href')
         
-        #defining the inner values of the mega dictionary
-        baccalaureate_parsed[major_title] = {"description": " ", "requirements": " ", 
+        #defining the inner values of the mega dictionaries
+        baccalaureate_parsed[major_title] = {"description": [], 
         "years": [[], [], [], [], []], 
         "other-content": {}}
-        baccalaureate_parsed[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": " ", "misc": []}
+        baccalaureate_parsed[major_title]["other-content"] = {"options": " ", "capstone": " ", "transfer_policy": " ", "footnotes": [], "misc": []}
 
         baccalaureate[major_title] = {"description": " ", "requirements": " ", 
         "years": [[], [], [], [], []], 
@@ -37,7 +37,19 @@ def baccalurate_grab_html():
         major_table_def = major_table.find("table", class_="table_default")
         trs = major_table_def.find_all("tr")
         description_tr = trs[0]
-        requirement_tr = trs[3]
+        #print(description_tr.get_text())
+        #print(description_tr.get_text().find("Return to: Programs"))
+        #print(description_tr.get_text()[description_tr.get_text().find("Return to: Programs") + 19])
+        if description_tr.get_text()[description_tr.get_text().find("Return to: Programs") + 19].isalpha():
+            description_tr = description_tr
+        else:
+            description_tr = "NO DESCRIPTION"
+        #print(description_tr)
+        #print("-------------------------------")
+        if major_title == "Architecture":
+            requirement_tr = trs[4]
+        else:
+            requirement_tr = trs[3]
         baccalaureate[major_title]["description"] = description_tr
         baccalaureate[major_title]["requirements"] = requirement_tr  
         
@@ -94,7 +106,16 @@ def baccalurate_grab_html():
 
 def baccalurate_parse_html():
     for x in baccalaureate:
-        baccalaureate_parsed[x]["description"] = baccalaureate[x]["description"].find_all("p")[1].string
+        print(x)
+        print("------------------------")
+        print(baccalaureate[x]["description"])
+        if baccalaureate[x]["description"] != "NO DESCRIPTION":
+            for y in baccalaureate[x]["description"].find_all("p"):
+                if y.get_text() != " Return to: Programs":
+                    inserted_string = y.get_text().replace("\xa0", " ")
+                    inserted_string = inserted_string.replace("\n", " ")
+                    inserted_string = inserted_string.replace("\t", " ")
+                    baccalaureate_parsed[x]["description"].append(inserted_string)
         yearcounter = 0
         for y in baccalaureate[x]["years"]:
             try:
@@ -154,11 +175,21 @@ def baccalurate_parse_html():
         #print("------------------")
         for y in baccalaureate[x]["other-content"]:
             if len(baccalaureate[x]["other-content"][y]) == 0 or baccalaureate[x]["other-content"][y] == " ":
-                print("empty")
+                print(" ")
             else:
+                #print(y)
+                print("---------------------------------")
                 print(baccalaureate[x]["other-content"][y])
-            print("--------------------")
-        break
+                try:
+                    li_list = baccalaureate[x]["other-content"][y].find_all("li");
+                    for z in li_list:
+                        #print(z.get_text())
+                        baccalaureate_parsed[x]["other-content"][y].append(z.get_text())
+                except:
+                    print(baccalaureate[x]["other-content"][y])
+                #print(li_list)
+            #print("--------------------")
+        
 
 #printing all the information in the mass dictionary (DEBUG)
 time_start = time.time()
@@ -166,7 +197,12 @@ def baccalurate_parse_html():
 baccalaureate_parsed = {}
 baccalurate_grab_html()
 baccalurate_parse_html()
-        
+
+for x in baccalaureate_parsed:
+    print(x)
+    print("----------")
+    print(baccalaureate_parsed[x])
+    print("-------------------------------------------------")
 
 #runtime (DEBUG)
 print(time.time() - time_start)
\ No newline at end of file
diff --git a/src/course.py b/src/course.py
index 3fb077f..b070959 100644
--- a/src/course.py
+++ b/src/course.py
@@ -10,6 +10,7 @@
 import json
 import re
 import time
+import os
 
 # Do you want to output the time it took for the operations to complete
 timeit = True
@@ -21,7 +22,9 @@
 # output will be redirected to 'sis_courses_TEST.json' if True
 small_search = False
 
+os.makedirs('data/', exist_ok=True)
 output_file = 'sis_courses_TEST.json' if small_search else 'sis_courses_data.json'
+output_file = 'data/' + output_file
 
 host = "https://sis.rpi.edu"
 url_pre = '/rss/bwckctlg.p_display_courses?term_in='
@@ -83,7 +86,7 @@ def fetch_course_links(year, term):
             # if the course has a link, store the link and description in a dictionary
             if class_info.a:
                 classes_links_dict[class_title.a.text] = [host+class_info.a['href'], desc]
-        
+
         return classes_links_dict
 
 
@@ -204,4 +207,4 @@ def fetch_course_info(link, desc):
     if timeit:
         print(f'\nTook {time.time() - before} to go to all sublinks')
     print(f'outputting into {output_file}')
-    json.dump(store, open(output_file, 'w'))
+    json.dump(store, open(output_file, 'w'))
\ No newline at end of file

From 0ea3747da0e93a2f3e56bc06fef65063dce6c67f Mon Sep 17 00:00:00 2001
From: akeylg <graya4@rpi.edu>
Date: Mon, 11 Apr 2022 11:06:33 -0400
Subject: [PATCH 16/17] worked on ITWS fixes

---
 src/baccalaureate_scraper.py | 66 ++++++++++++++++++++++++------------
 1 file changed, 44 insertions(+), 22 deletions(-)

diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py
index 52c070d..ab8bc78 100644
--- a/src/baccalaureate_scraper.py
+++ b/src/baccalaureate_scraper.py
@@ -1,9 +1,7 @@
-
-from readline import insert_text
 from bs4 import BeautifulSoup
-from numpy import empty
 import requests
 import time
+import re
 
 def baccalurate_grab_html():
     r = requests.get("http://catalog.rpi.edu/content.php?catoid=22&navoid=542")
@@ -68,9 +66,24 @@ def baccalurate_grab_html():
         #if there are no more HTML chunks, then the loop ends
         while(yearlist != None):
             #debug print
-            #print(yearlist)
+            #print(yearlist.h2.a['name'])
             #print(major_title)
+
+            #ITWS has an extensive amount of major information that must be captured before reaching each major year,
+            #so extra lists are appended to "years" to hold that HTML in the mega dictionary.
+            if major_title == "Information Technology and Web Science" and len(baccalaureate[major_title]["years"]) == 5:
+                print("CHECKED!!!")
+                for g in range(6):
+                    baccalaureate[major_title]["years"].append([])
+                for g in range(4, 11):
+                    baccalaureate[major_title]["years"][g].append(yearlist)
+                    yearlist = yearlist.next_sibling
+                    baccalaureate[major_title]["years"][g].append(yearlist)
+                yearlist = yearlist.next_sibling
+                
             
+
+
             #checks to see if there are any extra bits of HTML after getting through the inital 4/5 years.
             if (count > 3 and major_title != "Architecture") or (count > 4 and major_title == "Architecture"):
                 try:
@@ -90,25 +103,27 @@ def baccalurate_grab_html():
                     else:
                         baccalaureate[major_title]["other-content"]["misc"].append(yearlist)
                 except Exception as e:
+                    #print(e)
                     baccalaureate[major_title]["other-content"]["misc"].append(yearlist)
             else:
                 baccalaureate[major_title]["years"][count].append(yearlist)
+                yearcount += 1
+                if yearcount > 1:
+                    yearcount = 0
+                    count += 1
             
             #switches to the next HTML chunk
             yearlist = yearlist.next_sibling
             
             #yearcount increases by 1 until it is 2. If it's greater than 2, yearcount goes to 0 and count increases by 1.
             #this is to ensure that each year gets not only the title HTML, but the information for that year as well.
-            yearcount += 1
-            if yearcount > 1:
-                yearcount = 0
-                count += 1
+            
 
 def baccalurate_parse_html():
     for x in baccalaureate:
-        print(x)
-        print("------------------------")
-        print(baccalaureate[x]["description"])
+        #print(x)
+        #print("------------------------")
+        #print(baccalaureate[x]["description"])
         if baccalaureate[x]["description"] != "NO DESCRIPTION":
             for y in baccalaureate[x]["description"].find_all("p"):
                 if y.get_text() != " Return to: Programs":
@@ -175,18 +190,22 @@ def baccalurate_parse_html():
         #print("------------------")
         for y in baccalaureate[x]["other-content"]:
             if len(baccalaureate[x]["other-content"][y]) == 0 or baccalaureate[x]["other-content"][y] == " ":
-                print(" ")
+                print("NO CONTENT HERE")
             else:
                 #print(y)
-                print("---------------------------------")
-                print(baccalaureate[x]["other-content"][y])
+                #print("---------------------------------")
+                #print(baccalaureate[x]["other-content"][y])
                 try:
                     li_list = baccalaureate[x]["other-content"][y].find_all("li");
                     for z in li_list:
                         #print(z.get_text())
-                        baccalaureate_parsed[x]["other-content"][y].append(z.get_text())
-                except:
-                    print(baccalaureate[x]["other-content"][y])
+                        inserted_string = z.get_text().replace("\xa0", " ")
+                        inserted_string = inserted_string.replace("\n", " ")
+                        inserted_string = inserted_string.replace("\t", " ")
+                        baccalaureate_parsed[x]["other-content"][y].append(inserted_string)
+                except Exception as e:
+                    #print(baccalaureate[x]["other-content"][y])
+                    print(e)
                 #print(li_list)
             #print("--------------------")
         
@@ -198,11 +217,14 @@ def baccalurate_parse_html():
 baccalurate_grab_html()
 baccalurate_parse_html()
 
-for x in baccalaureate_parsed:
-    print(x)
-    print("----------")
-    print(baccalaureate_parsed[x])
-    print("-------------------------------------------------")
+print(baccalaureate["Information Technology and Web Science"]["years"])
+print("-------------------------------------")
+print(baccalaureate_parsed["Information Technology and Web Science"])
+#for x in baccalaureate_parsed:
+#    print(x)
+#    print("----------")
+#    print(baccalaureate_parsed[x])
+#    print("-------------------------------------------------")
 
 #runtime (DEBUG)
 print(time.time() - time_start)
\ No newline at end of file

From b99abb5acc792f57ecc4eca54efb6cbfac2de307 Mon Sep 17 00:00:00 2001
From: akeylg <graya4@rpi.edu>
Date: Mon, 11 Apr 2022 11:09:12 -0400
Subject: [PATCH 17/17] fixed footnote/credit hours bug

---
 src/baccalaureate_scraper.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/baccalaureate_scraper.py b/src/baccalaureate_scraper.py
index ab8bc78..67e5da0 100644
--- a/src/baccalaureate_scraper.py
+++ b/src/baccalaureate_scraper.py
@@ -127,9 +127,9 @@ def baccalurate_parse_html():
         if baccalaureate[x]["description"] != "NO DESCRIPTION":
             for y in baccalaureate[x]["description"].find_all("p"):
                 if y.get_text() != " Return to: Programs":
-                    inserted_string = y.get_text().replace("\xa0", " ")
-                    inserted_string = inserted_string.replace("\n", " ")
-                    inserted_string = inserted_string.replace("\t", " ")
+                    inserted_string = y.get_text().replace("\xa0", "")
+                    inserted_string = inserted_string.replace("\n", "")
+                    inserted_string = inserted_string.replace("\t", "")
                     baccalaureate_parsed[x]["description"].append(inserted_string)
         yearcounter = 0
         for y in baccalaureate[x]["years"]:
@@ -151,9 +151,9 @@ def baccalurate_parse_html():
                         if "or" == i.get_text():
                             continue
                         else:
-                            inserted_string = i.get_text().replace("\xa0", " ")
-                            inserted_string = inserted_string.replace("\n", " ")
-                            inserted_string = inserted_string.replace("\t", " ")
+                            inserted_string = i.get_text().replace("\xa0", "")
+                            inserted_string = inserted_string.replace("\n", "")
+                            inserted_string = inserted_string.replace("\t", "")
                             footnote_value = " "
                             footnote_found = False
                             credit_hour_found = False
@@ -199,9 +199,9 @@ def baccalurate_parse_html():
                     li_list = baccalaureate[x]["other-content"][y].find_all("li");
                     for z in li_list:
                         #print(z.get_text())
-                        inserted_string = z.get_text().replace("\xa0", " ")
-                        inserted_string = inserted_string.replace("\n", " ")
-                        inserted_string = inserted_string.replace("\t", " ")
+                        inserted_string = z.get_text().replace("\xa0", "")
+                        inserted_string = inserted_string.replace("\n", "")
+                        inserted_string = inserted_string.replace("\t", "")
                         baccalaureate_parsed[x]["other-content"][y].append(inserted_string)
                 except Exception as e:
                     #print(baccalaureate[x]["other-content"][y])