Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 129 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
21 changes: 18 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,22 @@
# waybackMachine

[![Python Version](https://img.shields.io/badge/python-3.6+-green)](https://www.python.org)

Lists all URLs available on [Wayback Machine](https://archive.org/web/). Can be used for listing all the endpoints available.

### Usage

Listing URLs available for example.com on archive.org.
```bash
python3 waybackMachine.py example.com
```

Querying for URLs between certain years.
```bash
python3 waybackMachine.py example.com -f 2020 -t 2021
```

Using the grep command to find certain keywords in the result.
```bash
python3 waybackMachine.py example.com -f 2020 -t 2021 | grep redirect=
```
Empty file added scripts/__init__.py
Empty file.
49 changes: 49 additions & 0 deletions scripts/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python3

import requests
import json
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class WayBackMachine(object):
    """Query the Wayback Machine CDX API for URLs archived under a domain.

    Results are prefix-matched on the domain and optionally bounded by a
    start/stop year.
    """

    def __init__(self,
                 domain: str,
                 start_year: int = None,
                 stop_year: int = None) -> None:
        """Store the query parameters.

        Args:
            domain: Domain whose archived URLs to list (prefix match).
            start_year: Optional lower bound (year) for results.
            stop_year: Optional upper bound (year) for results.
        """
        self.domain = domain
        self.start_year = start_year
        self.stop_year = stop_year

    def get_urls(self) -> list:
        """Fetch archived URL records from the CDX API.

        Returns:
            The decoded JSON payload: a list of rows whose first row is the
            column header (original, mimetype, timestamp, ...).

        Raises:
            requests.RequestException: on network failure.
            json.JSONDecodeError: if the response body is not valid JSON.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0)'
                          ' Gecko/20100101 Firefox/60.0',
        }
        payload = {
            'url': self.domain,
            'from': self.start_year,
            'to': self.stop_year,
            'output': 'json',
            'matchType': 'prefix',
            'collapse': 'urlkey',
            'fl': 'original,mimetype,timestamp,'
                  'endtimestamp,groupcount,uniqcount',
            # Fixed: key was misspelled 'ilter', so the API silently ignored
            # it and 4xx/5xx results were never excluded.
            'filter': '!statuscode:[45]..',
            'limit': 100000,
            '_': 1547318148315,
        }

        webarchive_url = "https://web.archive.org/cdx/search/cdx"

        # NOTE(review): verify=False disables TLS certificate checking
        # (warnings suppressed at module import); kept for compatibility,
        # but consider verify=True.
        res = requests.get(
            url=webarchive_url,
            headers=headers,
            params=payload,
            verify=False
        )
        return json.loads(res.text)
54 changes: 39 additions & 15 deletions waybackMachine.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,43 @@
#!/usr/bin/env python3
"""Command-line entry point: list Wayback Machine URLs for a domain.

Reconstructed from a garbled diff view: lines from the deleted legacy
implementation (waybackMachineClass / sys.argv usage) were interleaved with
the new argparse-based script, leaving the span syntactically invalid.
"""
import argparse

from scripts.main import WayBackMachine

if __name__ == '__main__':
    wayback_parser = argparse.ArgumentParser(
        description='List all the endpoints for given domain',
        usage='python3 waybackMachine.py example.com'
    )

    wayback_parser.add_argument(
        'domain_name',
        type=str,
        help='Domain name'
    )

    wayback_parser.add_argument(
        '-f', '--fyear',
        type=int,
        metavar='',
        help='Results from year'
    )

    wayback_parser.add_argument(
        '-t', '--tyear',
        type=int,
        metavar='',
        help='Results to year'
    )

    args = wayback_parser.parse_args()

    wbm = WayBackMachine(
        domain=args.domain_name,
        start_year=args.fyear,
        stop_year=args.tyear
    )

    url_list = wbm.get_urls()

    # The first row of the CDX JSON response is the column header; skip it
    # and print only the original URL (column 0) of each record.
    for row in url_list[1:]:
        print(row[0])