Skip to content

Commit 5efee0d

Browse files
authored
Merge pull request #3 from OpenBookPublishers/develop
Develop
2 parents 3b1e285 + b4b36a7 commit 5efee0d

7 files changed

Lines changed: 46 additions & 8 deletions

File tree

.travis.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
language: python
2+
python:
3+
- "3.8"
4+
install:
5+
- pip install flake8==3.7.7
6+
script:
7+
- flake8 ./src/*

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,4 @@ RUN rm requirements.txt
1616

1717
COPY ./src/ ./
1818

19-
CMD python main.py ./pdf_file.pdf ./output --doi $DOI
19+
CMD python main.py ./pdf_file.pdf ./output --doi $DOI --compress-output

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ Example:
3333

3434
$ `python3 main.py Hobbs-Provincial-Press.pdf /dev/shm --doi 10.11647/OBP.0152`
3535

36+
You may specify `--compress-output` to output a single zip file containing all the curated chapter PDFs (the 'original', metadata-less files are excluded).
37+
3638
## Development
3739
### What works
3840
* Chapter-level DOI discovery

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
requests==2.22.0
22
pagelabels==1.1.1
3+
roman==3.2

src/main.py

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
#!/usr/bin/env python3
22

33
import argparse
4-
from os import path
4+
from os import path, listdir
5+
import tempfile
6+
from zipfile import ZipFile
57
from modules.doi import Doi
68
from modules.pdf import Pdf
79
from modules.metadata import Metadata
@@ -23,6 +25,10 @@ def do_split(m, p, output_dir, doi):
2325
Metadata.write_metadata(doi_metadata, output_file_path)
2426

2527

28+
def get_tmp_dir():
29+
return tempfile.mkdtemp()
30+
31+
2632
def run():
2733
parser = argparse.ArgumentParser(description='chapter-splitter')
2834

@@ -33,25 +39,39 @@ def run():
3339
parser.add_argument('-d', '--doi',
3440
help='The DOI (at book-level) you wish to parse',
3541
required=True)
42+
parser.add_argument('-c', '--compress-output', dest='compress',
43+
action='store_true',
44+
help='If set it will output a single zip file')
3645

3746
args = parser.parse_args()
3847

48+
out_dir = args.output_folder
49+
tmp_dir = out_dir if not args.compress else get_tmp_dir()
50+
3951
# Check parsed arguments
4052
file_checks(args.input_file)
41-
path_checks(args.output_folder)
53+
path_checks(out_dir)
4254

4355
# Check dependencies
4456
dependencies_checks()
4557

4658
# Discover chapter-level DOIs of the supplied --doi value
47-
d = Doi(args.doi)
59+
d = Doi(args.doi.lower())
4860
ch_dois = d.discover_ch_dois()
4961

5062
m = Metadata()
51-
p = Pdf(args.input_file, args.output_folder)
63+
p = Pdf(args.input_file, tmp_dir)
5264

5365
for doi in ch_dois:
54-
do_split(m, p, args.output_folder, doi)
66+
do_split(m, p, tmp_dir, doi)
67+
68+
if args.compress:
69+
out_file = '{}/{}.zip'.format(out_dir, d.book_level_doi_suffix)
70+
suffix = '_original'
71+
files = filter(lambda w: not w.endswith(suffix), listdir(tmp_dir))
72+
with ZipFile(out_file, 'w') as zipfile:
73+
for file in files:
74+
zipfile.write('{}/{}'.format(tmp_dir, file), file)
5575

5676

5777
if __name__ == '__main__':

src/modules/doi.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
class Doi:
88
def __init__(self, book_level_doi):
99
self.book_level_doi = book_level_doi
10+
self.book_level_doi_suffix = book_level_doi.split('/')[1]
1011

1112
config = Config()
1213
self.api_url = config.get_config('metadata', 'api_url')

src/modules/pdf.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from pagelabels import PageLabels
66
from os import path
77
from .config import Config
8+
import roman
89

910

1011
class Pdf:
@@ -40,8 +41,14 @@ def get_page_range(self, page_range):
4041
Returns a list of the effective chapter page range
4142
"""
4243

43-
# Convert the page numbers to int object type
44-
page_range = [int(page) + self.page_one for page in page_range]
44+
# Check if the page range is numeric or roman numeral
45+
if page_range[0].isnumeric() and page_range[1].isnumeric():
46+
# Convert the page numbers to int object type
47+
page_range = [int(page) + self.page_one for page in page_range]
48+
else:
49+
# Convert pages to arabic numeral and add 1 (cover page)
50+
page_range = [roman.fromRoman(page.upper()) + 1
51+
for page in page_range]
4552
return page_range
4653

4754
def merge_pdfs(self, page_range, output_file_name):

0 commit comments

Comments
 (0)