-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathauto_dl_process.py
More file actions
95 lines (77 loc) · 2.78 KB
/
auto_dl_process.py
File metadata and controls
95 lines (77 loc) · 2.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# pip install requests BeautifulSoup4 pdfplumber pandas pypdf2 fpdf2
import os
import re
import sys
import time
import urllib.parse
import urllib.request

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
from PyPDF2 import PdfFileWriter, PdfFileReader
# DOWNLOAD FILE
domain = 'https://www.pref.okinawa.lg.jp'
url = domain + '/site/hoken/kansen/soumu/press/20200214_covid19_pr1.html'
# Fetch the HTML page that is parsed below for the PDF link.
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() aborts on an HTTP error (4xx/5xx) instead of letting
# an error page be parsed as if it were the real content.
response = requests.get(url, timeout=30)
response.raise_for_status()
def remove_invisible_chars(chars):
    """Print every character record whose non-stroking colour is (1, 1, 1).

    Despite its name, this helper neither removes nor returns anything; it
    only prints the matching records, which makes it a debugging aid.

    Args:
        chars: Iterable of dict-like character records. Each record is
            checked for a ``'non_stroking_color'`` entry.
            NOTE(review): presumably these are pdfplumber ``page.chars``
            entries and (1, 1, 1) is RGB white -- confirm against the caller.

    Returns:
        None.
    """
    for char in chars:
        # .get() treats a record without the key as "not matching" instead of
        # aborting the whole scan with a KeyError.
        if char.get('non_stroking_color') == (1, 1, 1):
            print(char)
## Get file link and change file name
soup = BeautifulSoup(response.text, "html.parser")
contents = soup.find(id="tmp_contents")
if contents is None:
    # Fail with a readable message instead of an AttributeError on None.
    sys.exit('Element with id "tmp_contents" not found at ' + url)
anchors = contents.find_all('a')
if len(anchors) < 3:
    sys.exit('Expected at least 3 links inside "tmp_contents", found '
             + str(len(anchors)))
# The PDF link is taken to be the third anchor inside the content element.
link = anchors[2]['href']
# Alternative regex-based rename (not in use):
# find_pattern = r"\/documents\/(?P<report>\d*)\D*(?P<cases>\d*)\D*.pdf"
# replace_pattern = lambda number: number.group('report') + '_' + number.group('cases') + '.pdf'
# Keep only what follows the first "documents/" and strip the wording from the
# name, e.g. "12hou34reime.pdf" -> "12_34.pdf".
_, marker, tail = link.partition('documents/')
if not marker:
    # Without this check a missing marker would silently yield a garbage name.
    sys.exit('Unexpected PDF link (no "documents/" segment): ' + link)
filename = tail.replace('hou', '_').replace('rei', '').replace('me', '')
## Download the file
# urljoin handles root-relative, page-relative and absolute hrefs alike;
# plain concatenation with the domain only works for root-relative ones.
download_url = urllib.parse.urljoin(url, link)
# Make sure the target directory exists before writing into it.
os.makedirs('./pdf', exist_ok=True)
urllib.request.urlretrieve(download_url, './pdf/' + filename)
print("PDF downloaded at: pdf/" + filename)
# Preprocess PDF
## Create lines
# Build a three-page overlay PDF that contains nothing but thin filled black
# rectangles (ruling lines). It is written to component/line.pdf so it can be
# stamped onto the downloaded report afterwards.
linePDF = FPDF()
# One entry per overlay page: (add_page keyword arguments, rectangles).
# Each rectangle is (x, y, width, height) in the document's default unit.
linePages = [
    ### Page with lines for summary table
    ({'orientation': 'P', 'format': 'A4'}, [
        # vertical rules
        (38, 101.2, 0.25, 72),
        (66, 101.2, 0.25, 72),
        (77, 101.2, 0.25, 72),
        (97, 101.2, 0.25, 63),
        (106.5, 101.2, 0.25, 63),
        # horizontal rules
        (38, 168, 39, 0.25),
        (38, 173, 39, 0.25),
    ]),
    ### Page with lines for cases table
    ({'orientation': 'P', 'format': 'A4'}, [
        (32, 282, 160, 0.5),
    ]),
    # Third page is added with the library's default page settings.
    ({}, [
        (32, 288.5, 160, 0.5),
    ]),
]
for pageArgs, rects in linePages:
    linePDF.add_page(**pageArgs)
    # Setting the fill colour once per page is enough for all its rectangles.
    linePDF.set_fill_color(0, 0, 0)
    for x, y, w, h in rects:
        linePDF.rect(x, y, w, h, 'F')
# Make sure the output directory exists before writing into it.
os.makedirs('component', exist_ok=True)
# A file name alone makes output() write to disk; the extra 'F' destination
# argument is deprecated in fpdf2 and only triggers a warning there.
linePDF.output('component/line.pdf')
## Add lines to the downloaded PDF
# NOTE(review): the nesting below is the most plausible reading of this
# section: only the first page (pageNum == 0) gets the overlay merged in and
# is written to the output; every other page is skipped. Confirm this is the
# intended behaviour (see the disabled "or pageNum >= 2" condition).
outputPDF = PdfFileWriter()
# Both input files stay open until the output has been written, because the
# page objects handed to the writer still refer to their readers. The "with"
# blocks then close all three handles, which were previously never closed.
with open('./pdf/' + filename, "rb") as sourceStream:
    with open("component/line.pdf", "rb") as lineStream:
        sourcePDF = PdfFileReader(sourceStream)
        # print how many pages sourcePDF has:
        # print (sourcePDF.numPages)
        linePDF = PdfFileReader(lineStream)
        for pageNum in range(sourcePDF.numPages):
            # Choose the overlay page: overlay 0 for the first source page,
            # overlay 1 for the third source page, overlay 2 for the rest.
            if pageNum == 0:
                linePlace = linePDF.getPage(0)
            elif pageNum == 2:
                linePlace = linePDF.getPage(1)
            else:
                linePlace = linePDF.getPage(2)
            if pageNum == 0:  # or pageNum >= 2:
                currentPage = sourcePDF.getPage(pageNum)
                currentPage.mergePage(linePlace)
                outputPDF.addPage(currentPage)
        ## finally, write "outputPDF" to ./pdf/processed_latest.pdf
        with open('./pdf/processed_latest.pdf', "wb") as outputPDFStream:
            outputPDF.write(outputPDFStream)
print('PDF preprocess finished')