euske · ghmo2789 · Aug 30, 2022 · Aug 30, 2022 · Aug 30, 2022 · Aug 30, 2022
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -0,0 +1,39 @@
+# CI workflow: installs the Python dependencies, lints with flake8 and runs the unit tests on Python 3.9.
+# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: Python application
+
+on:
+  push:
+    branches: ["master"]
+  pull_request:
+    branches: ["master"]
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.9
+      uses: actions/setup-python@v3
+      with:
+        python-version: "3.9"
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install flake8 unittest2
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Lint with flake8
+      run: |
+        # fail the build on any violation of flake8's default rule set
+        flake8 .
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with unittest
+      run: |
+        # the unittest2 package provides no `unittest2` executable, so run stdlib test discovery
+        python -m unittest discover
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,166 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+.vscode/
+
+# macOS Finder metadata and JetBrains IDE settings
+.DS_Store
+.idea/
diff --git a/README.md b/README.md
@@ -12,32 +12,30 @@ For the active project, check out its fork
 
 ## Features:
 
-  * Pure Python (3.6 or above).
-  * Supports PDF-1.7. (well, almost)
-  * Obtains the exact location of text as well as other layout information (fonts, etc.).
-  * Performs automatic layout analysis.
-  * Can convert PDF into other formats (HTML/XML).
-  * Can extract an outline (TOC).
-  * Can extract tagged contents.
-  * Supports basic encryption (RC4 and AES).
-  * Supports various font types (Type1, TrueType, Type3, and CID).
-  * Supports CJK languages and vertical writing scripts.
-  * Has an extensible PDF parser that can be used for other purposes.
-
+- Pure Python (3.6 or above).
+- Supports PDF-1.7. (well, almost)
+- Obtains the exact location of text as well as other layout information (fonts, etc.).
+- Performs automatic layout analysis.
+- Can convert PDF into other formats (HTML/XML).
+- Can extract an outline (TOC).
+- Can extract tagged contents.
+- Supports basic encryption (RC4 and AES).
+- Supports various font types (Type1, TrueType, Type3, and CID).
+- Supports CJK languages and vertical writing scripts.
+- Has an extensible PDF parser that can be used for other purposes.
 
 ## How to Use:
 
-  1. `> pip install pdfminer`
-  1. `> pdf2txt.py samples/simple1.pdf`
-
+1. `pip install -r requirements.txt`
+1. `python3 -m tools.pdf2txt samples/simple1.pdf`
 
 ## Command Line Syntax:
 
 ### pdf2txt.py
 
 pdf2txt.py extracts all the texts that are rendered programmatically.
 It also extracts the corresponding locations, font names, font sizes,
-writing direction (horizontal or vertical) for each text segment.  It
+writing direction (horizontal or vertical) for each text segment. It
 does not recognize text in images. A password needs to be provided for
 restricted PDF documents.
 
@@ -49,26 +47,26 @@ restricted PDF documents.
                  [-F boxes_flow] [-d]
                  input.pdf ...
 
-  * `-P password` : PDF password.
-  * `-o output` : Output file name.
-  * `-t text|html|xml|tag` : Output type. (default: automatically inferred from the output file name.)
-  * `-O output_dir` : Output directory for extracted images.
-  * `-c encoding` : Output encoding. (default: utf-8)
-  * `-s scale` : Output scale.
-  * `-R rotation` : Rotates the page in degree.
-  * `-Y normal|loose|exact` : Specifies the layout mode. (only for HTML output.)
-  * `-p pagenos` : Processes certain pages only.
-  * `-m maxpages` : Limits the number of maximum pages to process.
-  * `-S` : Strips control characters.
-  * `-C` : Disables resource caching.
-  * `-n` : Disables layout analysis.
-  * `-A` : Applies layout analysis for all texts including figures.
-  * `-V` : Automatically detects vertical writing.
-  * `-M char_margin` : Speficies the char margin.
-  * `-W word_margin` : Speficies the word margin.
-  * `-L line_margin` : Speficies the line margin.
-  * `-F boxes_flow` : Speficies the box flow ratio.
-  * `-d` : Turns on Debug output.
+- `-P password` : PDF password.
+- `-o output` : Output file name.
+- `-t text|html|xml|tag` : Output type. (default: automatically inferred from the output file name.)
+- `-O output_dir` : Output directory for extracted images.
+- `-c encoding` : Output encoding. (default: utf-8)
+- `-s scale` : Output scale.
+- `-R rotation` : Rotates the page by the given angle in degrees.
+- `-Y normal|loose|exact` : Specifies the layout mode. (only for HTML output.)
+- `-p pagenos` : Processes certain pages only.
+- `-m maxpages` : Limits the maximum number of pages to process.
+- `-S` : Strips control characters.
+- `-C` : Disables resource caching.
+- `-n` : Disables layout analysis.
+- `-A` : Applies layout analysis for all texts including figures.
+- `-V` : Automatically detects vertical writing.
+- `-M char_margin` : Specifies the char margin.
+- `-W word_margin` : Specifies the word margin.
+- `-L line_margin` : Specifies the line margin.
+- `-F boxes_flow` : Specifies the box flow ratio.
+- `-d` : Turns on Debug output.
 
 ### dumppdf.py
 
@@ -79,31 +77,30 @@ It dumps all the internal contents in pseudo-XML format.
                  [-o output] [-r|-b|-t] [-T] [-O directory] [-d]
                  input.pdf ...
 
-  * `-P password` : PDF password.
-  * `-a` : Extracts all objects.
-  * `-p pageid` : Extracts a Page object.
-  * `-i objid` : Extracts a certain object.
-  * `-o output` : Output file name.
-  * `-r` : Raw mode. Dumps the raw compressed/encoded streams.
-  * `-b` : Binary mode. Dumps the uncompressed/decoded streams.
-  * `-t` : Text mode. Dumps the streams in text format.
-  * `-T` : Tagged mode. Dumps the tagged contents.
-  * `-O output_dir` : Output directory for extracted streams.
+- `-P password` : PDF password.
+- `-a` : Extracts all objects.
+- `-p pageid` : Extracts a Page object.
+- `-i objid` : Extracts a certain object.
+- `-o output` : Output file name.
+- `-r` : Raw mode. Dumps the raw compressed/encoded streams.
+- `-b` : Binary mode. Dumps the uncompressed/decoded streams.
+- `-t` : Text mode. Dumps the streams in text format.
+- `-T` : Tagged mode. Dumps the tagged contents.
+- `-O output_dir` : Output directory for extracted streams.
 
 ## TODO
 
-  * Replace STRICT variable with something better.
-  * Improve the debugging functions.
-  * Use logging module instead of sys.stderr.
-  * Proper test cases.
-  * PEP-8 and PEP-257 conformance.
-  * Better documentation.
-  * Crypto stream filter support.
-
+- Replace STRICT variable with something better.
+- Improve the debugging functions.
+- Use logging module instead of sys.stderr.
+- Proper test cases.
+- PEP-8 and PEP-257 conformance.
+- Better documentation.
+- Crypto stream filter support.
 
 ## Related Projects
 
-  * <a href="http://pybrary.net/pyPdf/">pyPdf</a>
-  * <a href="http://www.foolabs.com/xpdf/">xpdf</a>
-  * <a href="http://pdfbox.apache.org/">pdfbox</a>
-  * <a href="http://mupdf.com/">mupdf</a>
+- <a href="http://pybrary.net/pyPdf/">pyPdf</a>
+- <a href="http://www.foolabs.com/xpdf/">xpdf</a>
+- <a href="http://pdfbox.apache.org/">pdfbox</a>
+- <a href="http://mupdf.com/">mupdf</a>
diff --git a/pdfminer/__init__.py b/pdfminer/__init__.py
@@ -1,5 +0,0 @@
-#!/usr/bin/env python
-__version__ = '20191125'
-
-if __name__ == '__main__':
-    print(__version__)

diff --git a/pdfminer/arcfour.py b/pdfminer/arcfour.py
@@ -7,10 +7,8 @@
 """
 
 
-##  Arcfour
-##
+#  Arcfour
 class Arcfour:
-
     """
     >>> Arcfour(b'Key').process(b'Plaintext').hex()
     'bbf316e8d940af0ad3'
@@ -36,19 +34,21 @@ def process(self, data):
         s = self.s
         r = []
         for c in data:
-            i = (i+1) % 256
-            j = (j+s[i]) % 256
+            i = (i + 1) % 256
+            j = (j + s[i]) % 256
             (s[i], s[j]) = (s[j], s[i])
-            k = s[(s[i]+s[j]) % 256]
+            k = s[(s[i] + s[j]) % 256]
             r.append(c ^ k)
         (self.i, self.j) = (i, j)
         return bytes(r)
 
     encrypt = decrypt = process
 
+
 new = Arcfour
 
 # test
 if __name__ == '__main__':
     import doctest
+
     print('pdfminer.arcfour:', doctest.testmod())