-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdfTools.py
More file actions
54 lines (48 loc) · 2.87 KB
/
pdfTools.py
File metadata and controls
54 lines (48 loc) · 2.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import re
def displayHelp():
    """Print the command-line usage instructions to standard output.

    Three invocation forms are described: ``/?`` (show this help), the
    five-argument split mode, and ``/d`` (dump the text of one page).
    """
    # One entry per output line, in the order they are shown to the user.
    help_lines = (
        "usage: py pdfSplit.py /?",
        " To display cli instructions.",
        "usage: py pdfSplit.py <path to folder> <path to archive folder> <path to output> <string to be present on first page of bill> <ideal chunk size>",
        " To split a big PDF fill file as close to half based off of a string that should be unique to the first page of each bill.",
        " <path to folder> is a path to a folder that contains the PDF files to split.",
        " <path to archive folder> is the folder where PDF files that are split will be moved to.",
        " <path to output> is a path to a folder where split PDF files will be created.",
        " <string to be present on first page of bill> is a string value that will be used to identify the first page of any bill. This text must exist on all bills and only on the first page of each.",
        " <ideal chunk size> is a string value that indicates the ideal number of pages per output file.",
        "usage: py pdfSplit.py /d <path to pdf> <page number>",
        " To display the contents of a specific page number of each bill in the folder. This can be useful to figure out the <string to be present on first page of bill>",
        " <path to pdf> is a path to the PDF to display",
        " <page number> is the page number of each PDF to display",
    )
    for help_line in help_lines:
        print(help_line)
def findBillBreaks(pdfReader, sizePerChunk, searchString):
    """Return the page numbers at which the PDF should be split.

    The first candidate break is ``sizePerChunk`` pages in. Each candidate
    is pushed forward to the next page that contains ``searchString`` (the
    marker of a bill's first page) so that no bill is cut in two; the next
    candidate is then ``sizePerChunk`` pages after that break.

    Args:
        pdfReader: Reader object exposing ``getNumPages()`` and
            ``getPage(n)`` (PyPDF2-style; page numbers are assumed to be
            zero-based, as in PyPDF2).
        sizePerChunk: Ideal number of pages per output chunk; must be
            greater than zero.
        searchString: Text identifying the first page of a bill.

    Returns:
        A list of page numbers, in increasing order, each strictly less
        than the page count. Empty when the document is no longer than
        one chunk or no suitable break page exists.

    Raises:
        ValueError: If ``sizePerChunk`` is zero or negative.
    """
    if sizePerChunk <= 0:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError("You must specify a chunk size of 1 page or more.")

    numPages = pdfReader.getNumPages()
    chunkList = []
    nextPageToChunkOn = sizePerChunk
    while nextPageToChunkOn < numPages:
        # Bound the search by the real page count. The previous limit of
        # numPages + 1 let the search request page ``numPages``, one past
        # the last valid page, whenever no later page matched.
        breakPage = findNextPageWithSearchString(
            pdfReader, searchString, nextPageToChunkOn, numPages
        )
        if breakPage >= numPages:
            # No further bill start was found: the remainder of the
            # document belongs to the final chunk, so record no break.
            break
        chunkList.append(breakPage)
        nextPageToChunkOn = breakPage + sizePerChunk
    return chunkList
def findNextPageWithSearchString(pdfReader, searchString, pageNum, numPages):
    """Scan forward from ``pageNum`` for a page containing ``searchString``.

    The page at ``pageNum`` itself is always examined first. Scanning
    stops at the first matching page, or as soon as the page counter
    reaches ``numPages``.

    Returns:
        The number of the first matching page, or the counter value at
        which the scan stopped (``>= numPages``) when nothing matched.
    """
    candidate = pageNum
    while not doesPageContainSearchString(pdfReader, searchString, candidate):
        candidate += 1
        if candidate >= numPages:
            # Ran out of pages without finding the marker.
            break
    return candidate
def doesPageContainSearchString(pdfReader, searchString, pageNum):
    """Return True if the text of page ``pageNum`` matches ``searchString``.

    ``searchString`` is applied as a regular expression, not as a literal:
    callers that need a literal match on text containing characters such
    as ``.``, ``(`` or ``$`` should pass ``re.escape(text)``.

    Args:
        pdfReader: Reader object passed through to
            ``getContentsOfPageFromPDF``.
        searchString: Regular-expression pattern to look for.
        pageNum: Number of the page to inspect.

    Returns:
        ``True`` when the pattern is found anywhere in the page text,
        ``False`` otherwise.
    """
    page_text = getContentsOfPageFromPDF(pdfReader, pageNum)
    # re.search looks for the pattern anywhere in the text without gluing
    # ".*" onto both ends; that concatenation mis-grouped patterns with a
    # top-level "|" and moved inline flags such as "(?i)" off the start.
    # The comparison yields the documented boolean, not a Match object.
    return re.search(searchString, page_text) is not None
def getContentsOfPageFromPDF(pdfReader, pageNum):
    """Extract the text of one page with all newline characters removed.

    Args:
        pdfReader: Reader object exposing ``getPage(n)``; the returned page
            must expose ``extractText()``.
        pageNum: Number of the page to read.

    Returns:
        The page's extracted text as a single line (every ``"\\n"`` dropped).
    """
    raw_text = pdfReader.getPage(pageNum).extractText()
    # Collapse the page onto one line so later pattern checks see it whole.
    return re.sub("\n", "", raw_text)