Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 129 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
21 changes: 18 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,22 @@
# waybackMachine

[![Python Version](https://img.shields.io/badge/python-3.6+-green)](https://www.python.org)

Lists all URLs available on [Wayback Machine](https://archive.org/web/). Can be used for listing all the endpoints available.

### Usage

Listing URLs available for example.com on archive.org.
```bash
python3 waybackMachine.py example.com
```

Querying for URLs between certain years.
```bash
python3 waybackMachine.py example.com -f 2020 -t 2021
```

Using the grep command to find certain keywords in the result.
```bash
python3 waybackMachine.py example.com -f 2020 -t 2021 | grep redirect=
```
Empty file added scripts/__init__.py
Empty file.
49 changes: 49 additions & 0 deletions scripts/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python3

import requests
import json
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class WayBackMachine(object):
    """Query the Wayback Machine CDX API for URLs archived under a domain.

    Results are prefix-matched on the domain and optionally bounded by a
    start/stop year.
    """

    def __init__(self,
                 domain: str,
                 start_year: int = None,
                 stop_year: int = None) -> None:
        """Store the query parameters.

        Args:
            domain: Domain whose archived URLs to list (prefix match).
            start_year: Optional lower bound (year) for results.
            stop_year: Optional upper bound (year) for results.
        """
        self.domain = domain
        self.start_year = start_year
        self.stop_year = stop_year

    def get_urls(self) -> list:
        """Fetch archived URL records from the CDX API.

        Returns:
            The decoded JSON payload: a list of rows whose first row is the
            column header (original, mimetype, timestamp, ...).

        Raises:
            requests.RequestException: on network failure.
            json.JSONDecodeError: if the response body is not valid JSON.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0)'
                          ' Gecko/20100101 Firefox/60.0',
        }
        payload = {
            'url': self.domain,
            'from': self.start_year,
            'to': self.stop_year,
            'output': 'json',
            'matchType': 'prefix',
            'collapse': 'urlkey',
            'fl': 'original,mimetype,timestamp,'
                  'endtimestamp,groupcount,uniqcount',
            # Fixed: key was misspelled 'ilter', so the API silently ignored
            # it and 4xx/5xx results were never excluded.
            'filter': '!statuscode:[45]..',
            'limit': 100000,
            '_': 1547318148315,
        }

        webarchive_url = "https://web.archive.org/cdx/search/cdx"

        # NOTE(review): verify=False disables TLS certificate checking
        # (warnings suppressed at module import); kept for compatibility,
        # but consider verify=True.
        res = requests.get(
            url=webarchive_url,
            headers=headers,
            params=payload,
            verify=False
        )
        return json.loads(res.text)
54 changes: 39 additions & 15 deletions waybackMachine.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,43 @@
#!/usr/bin/env python3
"""Command-line entry point: list Wayback Machine URLs for a domain.

Reconstructed from a garbled diff view: lines from the deleted legacy
implementation (waybackMachineClass / sys.argv usage) were interleaved with
the new argparse-based script, leaving the span syntactically invalid.
"""
import argparse

from scripts.main import WayBackMachine

if __name__ == '__main__':
    wayback_parser = argparse.ArgumentParser(
        description='List all the endpoints for given domain',
        usage='python3 waybackMachine.py example.com'
    )

    wayback_parser.add_argument(
        'domain_name',
        type=str,
        help='Domain name'
    )

    wayback_parser.add_argument(
        '-f', '--fyear',
        type=int,
        metavar='',
        help='Results from year'
    )

    wayback_parser.add_argument(
        '-t', '--tyear',
        type=int,
        metavar='',
        help='Results to year'
    )

    args = wayback_parser.parse_args()

    wbm = WayBackMachine(
        domain=args.domain_name,
        start_year=args.fyear,
        stop_year=args.tyear
    )

    url_list = wbm.get_urls()

    # The first row of the CDX JSON response is the column header; skip it
    # and print only the original URL (column 0) of each record.
    for row in url_list[1:]:
        print(row[0])