This repository was archived by the owner on Feb 9, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhtmlparser.py
More file actions
78 lines (71 loc) · 2.34 KB
/
htmlparser.py
File metadata and controls
78 lines (71 loc) · 2.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from html.parser import HTMLParser
import json
class Parser(HTMLParser):
    """Collect HTML table cells column by column and dump each table as JSON.

    Every ``<th>`` registers a new column stored in ``tableDict`` under the
    key ``'header<N>'`` (N counts up from 0 within a table).  The value is a
    list whose first item is the header text; the text of each following
    ``<td>`` is appended to the column selected by ``currentDataHeader``,
    which advances after every cell and wraps back to 0 after the last
    column.  When ``</table>`` is reached the collected dict is serialised
    by ``end_table`` and the per-table state is cleared.

    Limitations kept from the original design:

    * Only the first text chunk inside a ``<th>``/``<td>`` is recorded; the
      ``entering...`` flag is cleared as soon as one chunk has been consumed.
    * Nested tables are not tracked separately.
    """

    # File that finished tables are appended to.  Override on a subclass or
    # an instance to redirect the output.
    outputPath = 'tables.json'

    def __init__(self, **kwargs):
        """Create a parser with its own, unshared table state.

        Keyword arguments are forwarded unchanged to ``HTMLParser.__init__``.
        """
        super().__init__(**kwargs)
        # Mutable state lives on the instance: as class attributes the dict
        # and lists were shared (and mutated) by every Parser instance.
        self.tableDict = {}
        self.headerList = []
        # Not used by the parser itself; retained for backward compatibility
        # with code that may read the attribute.
        self.dataList = []
        self.inTable = False
        self.enteringHeaderData = False
        self.enteringTableRow = False
        self.enteringTableData = False
        self.currentDataHeader = 0

    def end_table(self):
        """Append the current table to ``outputPath`` and reset table state.

        The table is written as one ``json.dumps`` line followed by ``',\\n'``.
        The file as a whole is therefore a sequence of comma-terminated JSON
        objects, not a single valid JSON document.

        Raises:
            OSError: if the file can be neither appended to nor rewritten.
        """
        tableStr = json.dumps(self.tableDict)
        try:
            with open(self.outputPath, 'a') as outputFile:
                outputFile.write(tableStr + ',\n')
        except OSError:
            # Mode 'a' already creates a missing file, so this branch is only
            # reached when the append itself failed; fall back to rewriting
            # the file from scratch as the original code did.
            with open(self.outputPath, 'w') as outputFile:
                outputFile.write(f'{{\n{tableStr},\n')
        self.tableDict = {}
        self.headerList = []
        # A table closed without a final </tr> must not leave a stale column
        # index behind for the next table.
        self.currentDataHeader = 0

    def handle_starttag(self, tag, attrs):
        """Raise the flag that tells ``handle_data`` where the next text goes."""
        if tag == 'table':
            self.inTable = True
        elif tag == 'th':
            self.enteringHeaderData = True
        elif tag == 'td':
            self.enteringTableData = True
        elif tag == 'tr':
            self.enteringTableRow = True
            # Every row starts at the first column, even when the previous
            # row's </tr> was omitted in the markup.
            self.currentDataHeader = 0

    def handle_endtag(self, tag):
        """Lower the flag for the closed tag and finish empty cells/tables."""
        if tag == 'table':
            self.inTable = False
            self.end_table()
        elif tag == 'th':
            if self.enteringHeaderData:
                # Flag still set means no text was seen: register an empty
                # header so the column count stays correct.
                self._add_header('')
            self.enteringHeaderData = False
        elif tag == 'td':
            if self.enteringTableData:
                # Empty cell: store '' so later cells stay in their columns.
                self._add_cell('')
            self.enteringTableData = False
        elif tag == 'tr':
            self.enteringTableRow = False
            self.currentDataHeader = 0

    def handle_data(self, data):
        """Route a text chunk to the pending header or data cell, if any."""
        if self.enteringHeaderData:
            self._add_header(data)
        elif self.enteringTableData:
            self._add_cell(data)

    def _add_header(self, text):
        """Register a new column named after the next free header index."""
        index = self.headerList[-1] + 1 if self.headerList else 0
        # The first list item of every column is the header text itself.
        self.tableDict['header' + str(index)] = [text]
        self.headerList.append(index)
        self.enteringHeaderData = False

    def _add_cell(self, text):
        """Append ``text`` to the current column and advance to the next one."""
        self.enteringTableData = False
        if not self.headerList:
            # No <th> has been seen for this table, so there is no column to
            # store the cell in; skip it instead of raising KeyError.
            return
        self.tableDict['header' + str(self.currentDataHeader)].append(text)
        # Wrap around after the last registered column.
        if self.currentDataHeader >= self.headerList[-1]:
            self.currentDataHeader = 0
        else:
            self.currentDataHeader += 1