diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b6e4761 --- /dev/null +++ b/.gitignore @@ -0,0 +1,129 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/README.md b/README.md index 2f68bb1..9a4ebc2 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,22 @@ # waybackMachine -Use wayback Machine data to pull a list of paths. +[![Python Version](https://img.shields.io/badge/python-3.6+-green)](https://www.python.org) -# Usage +Lists all URLs available on the [Wayback Machine](https://archive.org/web/). Can be used to list all the available endpoints. -python waybackMachine.py facebook.com +### Usage + +Listing URLs available for example.com on archive.org. +```bash +python3 waybackMachine.py example.com +``` + +Querying for URLs between certain years. 
class WayBackMachine(object):
    """Query the Wayback Machine CDX search endpoint for archived URLs.

    The search is a prefix match on ``domain``, collapsed by ``urlkey``,
    and can optionally be limited to a range of years.
    """

    def __init__(self,
                 domain: str,
                 start_year: int = None,
                 stop_year: int = None) -> None:
        """Store the search target and the optional year range.

        Args:
            domain: URL prefix to search for (sent as the ``url`` parameter).
            start_year: Sent as the ``from`` parameter; ``None`` leaves the
                lower bound open.
            stop_year: Sent as the ``to`` parameter; ``None`` leaves the
                upper bound open.
        """
        self.domain = domain
        self.start_year = start_year
        self.stop_year = stop_year

    def _build_params(self) -> dict:
        """Return the query-string parameters for the CDX request."""
        return {
            'url': self.domain,
            # requests leaves out keys whose value is None, so an unset
            # year simply does not appear in the query string.
            'from': self.start_year,
            'to': self.stop_year,
            'output': 'json',
            'matchType': 'prefix',
            'collapse': 'urlkey',
            'fl': 'original,mimetype,timestamp,'
                  'endtimestamp,groupcount,uniqcount',
            # Was misspelled 'ilter', so the 4xx/5xx exclusion was never
            # sent under the parameter name the server looks for.
            'filter': '!statuscode:[45]..',
            'limit': 100000,
            # Fixed value carried over from the original hard-coded URL;
            # presumably a cache-busting timestamp.
            '_': 1547318148315,
        }

    def get_urls(self) -> list:
        """Fetch the matching captures and return the decoded JSON rows.

        Returns:
            The parsed JSON response body. Each row holds the fields named
            in ``fl``, in that order.

        Raises:
            requests.exceptions.RequestException: If the HTTP request fails.
            json.JSONDecodeError: If the response body is not valid JSON.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0)'
                          ' Gecko/20100101 Firefox/60.0',
        }

        webarchive_url = "https://web.archive.org/cdx/search/cdx"

        # NOTE(review): verify=False turns off TLS certificate validation.
        # Kept for backward compatibility; consider enabling verification.
        res = requests.get(
            url=webarchive_url,
            headers=headers,
            params=self._build_params(),
            verify=False
        )
        return json.loads(res.text)
if __name__ == '__main__':
    # Command-line entry point: parse the domain and the optional year
    # range, query the Wayback Machine, and print one URL per line.
    cli = argparse.ArgumentParser(
        usage='python3 waybackMachine.py example.com',
        description='List all the endpoints for given domain',
    )
    cli.add_argument('domain_name', type=str, help='Domain name')

    # Both year options have the same shape and differ only in their
    # flags and help text.
    year_options = (
        ('-f', '--fyear', 'Results from year'),
        ('-t', '--tyear', 'Results to year'),
    )
    for short_flag, long_flag, description in year_options:
        cli.add_argument(
            short_flag, long_flag,
            type=int,
            metavar='',
            help=description,
        )

    options = cli.parse_args()
    archive = WayBackMachine(
        domain=options.domain_name,
        start_year=options.fyear,
        stop_year=options.tyear,
    )

    # The first row is skipped (presumably the field-name header of the
    # JSON output); the first field of every other row is printed.
    for record in archive.get_urls()[1:]:
        print(record[0])