# covid19-pdf-parser/auto_dl_process.py (Code-for-OKINAWA/covid19-pdf-parser)
# pip install requests BeautifulSoup4 pdfplumber pandas pypdf2 fpdf2
import requests
import urllib.request
from bs4 import BeautifulSoup

from fpdf import FPDF
from PyPDF2 import PdfFileWriter, PdfFileReader

import re
import sys
import pdfplumber
import pandas as pd
import time

# DOWNLOAD FILE
domain = 'https://www.pref.okinawa.lg.jp'
url = domain + '/site/hoken/kansen/soumu/press/20200214_covid19_pr1.html'

# Fetch the press-release page that links to the report PDF.
# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of letting an
# error page be parsed as if it were the real press-release page.
response = requests.get(url, timeout=30)
response.raise_for_status()

def remove_invisible_chars(chars):
    """Print every entry of ``chars`` whose non-stroking color is ``(1, 1, 1)``.

    Despite the name, nothing is removed and nothing is returned: matching
    entries are only written to stdout, in their original order.

    Args:
        chars: iterable of mappings that each have a 'non_stroking_color'
            key (presumably pdfplumber char dicts -- confirm with callers).

    Raises:
        KeyError: if an entry has no 'non_stroking_color' key.
    """
    white = (1, 1, 1)
    # The generator filters lazily, so entries are checked and printed one
    # at a time, in input order.
    for glyph in (c for c in chars if c['non_stroking_color'] == white):
        print(glyph)

## Get file link and change file name
soup = BeautifulSoup(response.text, "html.parser")

# The PDF link is taken from the third anchor inside the element with
# id="tmp_contents". Check each step explicitly so a changed page layout
# fails with a clear message rather than an opaque AttributeError/IndexError.
contents = soup.find(id="tmp_contents")
if contents is None:
    raise RuntimeError('No element with id "tmp_contents" found in ' + url)

anchors = contents.find_all('a')
if len(anchors) < 3:
    raise RuntimeError(
        'Expected at least 3 links in "tmp_contents", found '
        + str(len(anchors)) + ' in ' + url
    )

link = anchors[2].get('href')
if not link:
    raise RuntimeError('Third link in "tmp_contents" has no href in ' + url)

# Unused regex-based alternative, kept for reference:
# find_pattern = r"\/documents\/(?P<report>\d*)\D*(?P<cases>\d*)\D*.pdf"
# replace_pattern = lambda number: number.group('report') + '_' + number.group('cases') + '.pdf'

# Keep everything after the first 'documents/' and shorten it:
# 'hou' -> '_', and 'rei' / 'me' are dropped
# (e.g. '12hou345reime.pdf' -> '12_345.pdf').
# partition() reports a missing marker, whereas find() would return -1 and
# silently slice the link at the wrong offset.
_, marker, tail = link.partition('documents/')
if not marker or not tail:
    raise RuntimeError('Unexpected PDF link format: ' + link)
filename = tail.replace('hou', '_').replace('rei', '').replace('me', '')

## Download the file
# NOTE: the ./pdf directory must already exist; urlretrieve does not create it.
download_url = domain + link
urllib.request.urlretrieve(download_url, './pdf/' + filename)
print("PDF downloaded at: pdf/" + filename)

# Preprocess PDF

## Build the overlay PDF that holds the extra ruling lines
linePDF = FPDF()

### Page 1: rules for the summary table
linePDF.add_page(orientation='P', format='A4')
summary_rules = (
    # (x, y, width, height) -- thin tall rectangles act as vertical rules
    (38, 101.2, 0.25, 72),
    (66, 101.2, 0.25, 72),
    (77, 101.2, 0.25, 72),
    (97, 101.2, 0.25, 63),
    (106.5, 101.2, 0.25, 63),
    # wide flat rectangles act as horizontal rules
    (38, 168, 39, 0.25),
    (38, 173, 39, 0.25),
)
for x, y, w, h in summary_rules:
    # Black fill is (re)selected before every rectangle.
    linePDF.set_fill_color(0, 0, 0)
    linePDF.rect(x, y, w, h, 'F')

### Pages 2 and 3: one horizontal rule each for the cases table,
### placed at a different height on each page
linePDF.add_page(orientation='P', format='A4')
linePDF.set_fill_color(0, 0, 0)
linePDF.rect(32, 282, 160, 0.5, 'F')

linePDF.add_page()
linePDF.set_fill_color(0, 0, 0)
linePDF.rect(32, 288.5, 160, 0.5, 'F')

# Write the three-page overlay to disk.
linePDF.output('component/line.pdf', 'F')

## Add lines to the downloaded PDF
outputPDF = PdfFileWriter()

# Open both inputs with context managers so the handles are always closed.
# They stay open until the output has been written, because the reader
# objects (and the pages taken from them) are backed by these streams.
with open('./pdf/' + filename, "rb") as sourceStream, \
        open("component/line.pdf", "rb") as lineStream:
    sourcePDF = PdfFileReader(sourceStream)

    # print how many pages sourcePDF has:
    # print (sourcePDF.numPages)
    # NOTE: linePDF is rebound here from the FPDF builder to a reader over
    # the overlay file that builder wrote.
    linePDF = PdfFileReader(lineStream)

    for pageNum in range(sourcePDF.numPages):
        # Pick the overlay page: page 0 gets the summary-table lines,
        # page 2 gets overlay page 1, every other page gets overlay page 2.
        if pageNum == 0:
            linePlace = linePDF.getPage(0)
        elif pageNum == 2:
            linePlace = linePDF.getPage(1)
        else:
            linePlace = linePDF.getPage(2)

        # Only the first page is currently merged and written to the output;
        # all other source pages are left out.
        if pageNum == 0: #or pageNum >= 2:
            currentPage = sourcePDF.getPage(pageNum)
            currentPage.mergePage(linePlace)
            outputPDF.addPage(currentPage)

    ## finally, write "outputPDF" to ./pdf/processed_latest.pdf
    with open('./pdf/processed_latest.pdf', "wb") as outputPDFStream:
        outputPDF.write(outputPDFStream)

print ('PDF preprocess finished')