-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathauto_summary_parser.py
More file actions
67 lines (56 loc) · 1.94 KB
/
auto_summary_parser.py
File metadata and controls
67 lines (56 loc) · 1.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# pip install requests BeautifulSoup4 pdfplumber pandas pypdf2 fpdf2
from datetime import datetime,timezone,timedelta
import re
import sys
import pdfplumber
import pandas as pd
# Start to parse the PDF
# Input PDF (looked up under ./pdf/) and the CSV that accumulates summaries.
filename = 'processed_latest.pdf'
output_summary = 'data/auto_summary.csv'

# Crop box passed to page.within_bbox() to isolate the summary table on the
# first page. An earlier crop box is kept here for reference:
# bounding_box = (50, 405, 360, 710)
bounding_box = (50, 280, 320, 500)

# Table-finder settings handed to pdfplumber's extract_table().
table_settings = {
    "vertical_strategy": "lines",
    "horizontal_strategy": "lines",
    "snap_tolerance": 3,
}

# Open the PDF in a context manager so the underlying file handle is closed
# once extraction finishes. The extraction must happen inside the block,
# while the file is still open.
with pdfplumber.open('./pdf/' + filename) as pdf:
    page = pdf.pages[0]
    # Start convert Summary from page 1
    page_crop = page.within_bbox(bounding_box)
    # Debugging aids (uncomment to dump snapshots of the cropped region):
    # page_crop.to_image(resolution=200).save("./snapshot/summary_crop.png", format="PNG")
    # im = page_crop.to_image(resolution=200)
    # im.reset().draw_hline(780, stroke='black', stroke_width=3)
    # im.debug_tablefinder(table_settings)
    # im.save("./snapshot/summary.png", format="PNG")
    summaryTable = page_crop.extract_table(table_settings)

print(summaryTable)

# extract_table() yields None when no table is detected in the cropped area.
# Stop here with a clear message (and a non-zero exit status) instead of
# failing later with an opaque TypeError when the table is indexed.
if summaryTable is None:
    sys.exit("No summary table found on page 1 of ./pdf/" + filename)
# Save the summary CSV
# Build the row timestamp in JST (UTC+9). datetime.now(timezone.utc) gives the
# same aware UTC instant as the deprecated datetime.utcnow() + replace(tzinfo).
utcNow = datetime.now(timezone.utc)
jstNow = utcNow.astimezone(timezone(timedelta(hours=9)))  # Change Timezone to JST
current_time = jstNow.strftime("%Y/%m/%d %H:%M")
today = jstNow.strftime("%Y/%m/%d")

# One CSV row: the timestamp followed by fixed [row][column] cells picked out
# of the extracted summary table.
data = [
    current_time,
    summaryTable[15][1],
    summaryTable[1][1],
    summaryTable[3][3],
    summaryTable[4][3],
    summaryTable[5][1],
    summaryTable[6][1],
    summaryTable[7][1],
    summaryTable[10][1],
    summaryTable[11][2],
    summaryTable[12][2],
    summaryTable[13][1],
]
# Strip the '※' reference mark from every value before it is stored.
data = [item.replace('※', '') for item in data]

csvDf = pd.read_csv(output_summary, sep=',', encoding="utf-8")

# Rows already recorded for today's date. Match the date literally
# (regex=False) and treat missing timestamps as non-matching (na=False) so the
# boolean mask never contains NaN.
indexNames = csvDf[
    csvDf['更新時間'].str.contains(today, regex=False, na=False)
].index

# Update only when the value read from the PDF differs from the second column
# of the last stored row; a CSV with no rows yet always needs an update.
# NOTE(review): the comparison reads summaryTable[14][1] while the value
# stored in that column is summaryTable[15][1] - confirm this is intended.
if csvDf.empty or int(summaryTable[14][1]) != int(csvDf.iloc[-1, 1]):
    # Replace any rows already written today with the fresh one.
    csvDf.drop(indexNames, inplace=True)
    # Renumber the index so that len(csvDf) is guaranteed to be an unused
    # label; without this, dropping non-trailing rows would make the
    # assignment below overwrite an existing row instead of appending.
    csvDf.reset_index(drop=True, inplace=True)
    csvDf.loc[len(csvDf)] = data
    csvDf.to_csv(output_summary, index=False)
    print("Summary CSV updated at: data/auto_summary.csv")
else:
    print("No Summary update")