Merge pull request #3 from OpenBookPublishers/develop

ja573 · web-flow · commit 5efee0dd0b75 · 2019-12-12T12:11:57.000Z
Develop
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,7 @@
+language: python
+python:
+  - "3.8"
+install:
+  - pip install flake8==3.7.7
+script:
+  - flake8 ./src/*
diff --git a/Dockerfile b/Dockerfile
@@ -16,4 +16,4 @@ RUN rm requirements.txt
 
 COPY ./src/ ./
 
-CMD python main.py ./pdf_file.pdf ./output --doi $DOI
+CMD python main.py ./pdf_file.pdf ./output --doi $DOI --compress-output
diff --git a/README.md b/README.md
@@ -33,6 +33,8 @@ Example:
 
 $ `python3 main.py Hobbs-Provincial-Press.pdf /dev/shm --doi 10.11647/OBP.0152`
 
+You may specify `--compress-output` to output a single zip file containing all the curated chapter PDFs (the 'original', metadata-less files are excluded).
+
 ## Development
 ### What works
 * Chapter-level DOI discovery
diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,3 @@
 requests==2.22.0
 pagelabels==1.1.1
+roman==3.2
diff --git a/src/main.py b/src/main.py
@@ -1,7 +1,9 @@
 #!/usr/bin/env python3
 
 import argparse
-from os import path
+from os import path, listdir
+import tempfile
+from zipfile import ZipFile
 from modules.doi import Doi
 from modules.pdf import Pdf
 from modules.metadata import Metadata
@@ -23,6 +25,10 @@ def do_split(m, p, output_dir, doi):
     Metadata.write_metadata(doi_metadata, output_file_path)
 
 
+def get_tmp_dir():
+    return tempfile.mkdtemp()
+
+
 def run():
     parser = argparse.ArgumentParser(description='chapter-splitter')
 
@@ -33,25 +39,39 @@ def run():
     parser.add_argument('-d', '--doi',
                         help='The DOI (at book-level) you wish to parse',
                         required=True)
+    parser.add_argument('-c', '--compress-output', dest='compress',
+                        action='store_true',
+                        help='If set it will output a single zip file')
 
     args = parser.parse_args()
 
+    out_dir = args.output_folder
+    tmp_dir = out_dir if not args.compress else get_tmp_dir()
+
     # Check parsed arguments
     file_checks(args.input_file)
-    path_checks(args.output_folder)
+    path_checks(out_dir)
 
     # Check dependencies
     dependencies_checks()
 
     # Discover chapter-level DOIs of the supplied --doi value
-    d = Doi(args.doi)
+    d = Doi(args.doi.lower())
     ch_dois = d.discover_ch_dois()
 
     m = Metadata()
-    p = Pdf(args.input_file, args.output_folder)
+    p = Pdf(args.input_file, tmp_dir)
 
     for doi in ch_dois:
-        do_split(m, p, args.output_folder, doi)
+        do_split(m, p, tmp_dir, doi)
+
+    if args.compress:
+        out_file = '{}/{}.zip'.format(out_dir, d.book_level_doi_suffix)
+        suffix = '_original'
+        files = filter(lambda w: not w.endswith(suffix), listdir(tmp_dir))
+        with ZipFile(out_file, 'w') as zipfile:
+            for file in files:
+                zipfile.write('{}/{}'.format(tmp_dir, file), file)
 
 
 if __name__ == '__main__':
diff --git a/src/modules/doi.py b/src/modules/doi.py
@@ -7,6 +7,7 @@
 class Doi:
     def __init__(self, book_level_doi):
         self.book_level_doi = book_level_doi
+        self.book_level_doi_suffix = book_level_doi.split('/')[1]
 
         config = Config()
         self.api_url = config.get_config('metadata', 'api_url')
diff --git a/src/modules/pdf.py b/src/modules/pdf.py
@@ -5,6 +5,7 @@
 from pagelabels import PageLabels
 from os import path
 from .config import Config
+import roman
 
 
 class Pdf:
@@ -40,8 +41,14 @@ def get_page_range(self, page_range):
         Returns a list of the effective chapter page range
         """
 
-        # Convert the page numbers to int object type
-        page_range = [int(page) + self.page_one for page in page_range]
+        # Check if the page range is numeric or roman numeral
+        if page_range[0].isnumeric() and page_range[1].isnumeric():
+            # Convert the page numbers to int object type
+            page_range = [int(page) + self.page_one for page in page_range]
+        else:
+            # Convert pages to arabic numeral and add 1 (cover page)
+            page_range = [roman.fromRoman(page.upper()) + 1
+                          for page in page_range]
         return page_range
 
     def merge_pdfs(self, page_range, output_file_name):

Original file line number	Diff line number	Diff line change
`@@ -16,4 +16,4 @@ RUN rm requirements.txt`
`16`	`16`
`17`	`17`	`COPY ./src/ ./`
`18`	`18`
`19`		`-CMD python main.py ./pdf_file.pdf ./output --doi $DOI`
	`19`	`+CMD python main.py ./pdf_file.pdf ./output --doi $DOI --compress-output`
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`requests==2.22.0`
`2`	`2`	`pagelabels==1.1.1`
	`3`	`+roman==3.2`