Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore

This file was deleted.

67 changes: 47 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,20 +1,47 @@
# NamUs Scraper
Python scraper for collecting metadata and files for Missing-, Unidentified-, and Unclaimed-person cases from the [National Missing and Unidentified Persons System (NamUs)](https://www.namus.gov) organization. The scraper uses APIs used for internal purposes at NamUs and may therefore change at any point.

To work around the 10,000-case search limit, cases are found by searching on a per-state basis. This may miss some cases if they are entered incorrectly! Compare result counts with those shown on the site.

⚠️ This requests a large amount of data. Please run it responsibly!

## Installation
```
sudo pip3 install requests
sudo pip3 install grequests
sudo pip3 install face_recognition
```

## Scraping
```
python3 scrape-data.py # Downloads all metadata related to the cases.
python3 scrape-files.py # Downloads all files related to the scraped cases.
python3 process-faces.py # Extracts faces from the downloaded images.
```
# NamUs Scraper Enhancements

I've updated the original NamUs Scraper repository by changing the face-extraction method and adding command-line arguments so that users have more control over how they scrape. This is mainly because of the large amount of data involved in this project.

## Changes Made:

### `process-faces.py` Changes:
* Complete rewrite with different face extraction method that processes images individually instead of batch pickle storage
* Argparse implemented (--limit, --input, --model)
* Added 20% padding around the detected faces
* Consistent naming format for the output files using the NamUs ID
* This is done because in our pipeline, we use this ID to match the faces to the correct person
* Implemented automatic NamUs ID extraction from filepath / filename
* Implemented skipping for duplicates

### `scrape-data.py` Changes:
* Argparse implemented (--limit)

### `scrape-files.py` Changes:
* Argparse implemented (--limit)

<br>
<hr>
<br>

The following is the original `README.md` from the NamUs Scraper repository from which the code was forked.

> # NamUs Scraper
> Python scraper for collecting metadata and files for Missing-, Unidentified-, and Unclaimed-person cases from the [National Missing and Unidentified Persons System (NamUs)](https://www.namus.gov) organization. The scraper uses APIs used for internal purposes at NamUs and may therefore change at any point.

> To work around the 10.000 case search limit, cases are found by searching on a per-state basis. This may miss some cases if they are entered incorrectly! Compare result counts with the ones available on the site.

> ⚠️ This requests a large amount of data. Please run it responsibly!

> ## Installation
> ```
> sudo pip3 install requests
> sudo pip3 install grequests
> sudo pip3 install face_recognition
> ```

> ## Scraping
> ```
> python3 scrape-data.py # Downloads all metadata related to the cases.
> python3 scrape-files.py # Downloads all files related to the scraped cases.
> python3 process-faces.py # Extracts faces from the downloaded images.
> ```
224 changes: 137 additions & 87 deletions process-faces.py
Original file line number Diff line number Diff line change
@@ -1,90 +1,140 @@
import os, pickle, face_recognition
from glob import glob
from functools import reduce
import os, re, glob, face_recognition
from PIL import Image

PROCESS_FEEDBACK_INTERVAL = 50

FILE_LOCATIONS = "./output/{type}/files/*/"
IMAGE_EXTENSIONS = ["jpg", "jpeg", "png"]

FACES_DATA = "./output/{type}/FaceEncodings.dat"
FACES_OUTPUT = "./output/{type}/faces/{file}-{index}.{extension}"
CASE_TYPES = {
"MissingPersons": {
"excluded": []
},
"UnidentifiedPersons": {
"excluded": ["/Clothing/", "/Footwear/", "/OfficeLogo/"]
}
}

import argparse
import requests
import time

OUTPUT_BASE = "./output"
FACES_OUTPUT = "./output/faces/namus{namus_id}-face{index}.{extension}"

def parse_args():
    """Parse CLI options for the face-extraction script.

    --limit  cap the number of images processed
    --input  directory to search for images (defaults to the output tree)
    --model  face_recognition detector: "hog" (fast) or "cnn" (accurate)
    --force  overwrite existing face crops instead of skipping them
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--limit', type=int)
    parser.add_argument('--input', default=None)
    parser.add_argument('--model', default='hog')  # hog is faster than CNN.. would use CNN if more time
    # BUGFIX: the script body checks `args.force` before overwriting an
    # existing crop, but no --force option was ever registered, so that
    # check raised AttributeError. Define it with a False default.
    parser.add_argument('--force', action='store_true')
    return parser.parse_args()

def extract_namus_id(filepath):
    """Best-effort extraction of a NamUs case ID from an image path.

    First looks for a digit run delimited by a slash and a slash/hyphen in
    the full path; failing that, falls back to the first digit run in the
    file name. Returns the string "unknown" when no digits are found.
    """
    path_hit = re.search(r'/(\d+)[/-]', filepath)
    if path_hit is not None:
        return path_hit.group(1)

    name_hit = re.search(r'(\d+)', os.path.basename(filepath))
    return name_hit.group(1) if name_hit is not None else "unknown"

def make_request_with_retry(url, headers, max_retries=3):
    """GET *url* with up to *max_retries* attempts.

    Returns the successful ``requests.Response``, or ``None`` when every
    attempt fails (callers must handle the ``None`` case themselves).
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            return response
        except requests.RequestException:
            # BUGFIX: was a blanket `except Exception`, which also swallowed
            # programming errors (TypeError etc.); only network/HTTP
            # failures are retried now, everything else propagates.
            if attempt == max_retries - 1:
                return None
            time.sleep(2)  # brief fixed back-off between attempts
    # Reached only when max_retries <= 0; previously an implicit None.
    return None

def main():
    """Extract faces from every scraped image, one pass per case type.

    For each case type in CASE_TYPES this pickles the face encodings found
    in each image to the FACES_DATA file and saves one cropped face image
    per detected face under FACES_OUTPUT.
    """
    for caseType in CASE_TYPES:
        print("Processing: {type}".format(type=caseType))

        print(" > Fetching image file paths")
        paths = getImageFilesForType(caseType)
        print(" > Found %d files" % len(paths))

        # Create the faces output directory for this case type up front.
        os.makedirs(
            os.path.dirname(
                FACES_OUTPUT.format(type=caseType, file="*", index="*", extension="*")
            ),
            exist_ok=True,
        )
        dataFile = open(FACES_DATA.format(type=caseType), 'wb')

        print(" > Starting face extraction")
        processedFiles, facesFound = 0, 0
        for path in paths:
            try:
                image = face_recognition.load_image_file(path)
                locations = face_recognition.face_locations(image)
                encodings = face_recognition.face_encodings(image, locations)

                # One pickle record per image: {path: [encoding, ...]}.
                if len(encodings):
                    pickle.dump({path: encodings}, dataFile)

                # NOTE(review): assumes the file name contains exactly one
                # "." — a name like "a.b.jpg" would pick "b" as extension.
                pathParts = path.split("/")[-1].split(".")
                fileName, extension = pathParts[0], pathParts[1]

                for index, location in enumerate(locations):
                    outputPath = FACES_OUTPUT.format(
                        type=caseType, file=fileName, index=index, extension=extension
                    )

                    # The location tuple is unpacked as (top, right, bottom, left).
                    top, right, bottom, left = location
                    face = Image.fromarray(image[top:bottom, left:right])
                    face.save(outputPath)
                    facesFound += 1

                processedFiles += 1
                if processedFiles % PROCESS_FEEDBACK_INTERVAL == 0:
                    print(
                        " > Processed {count} files with {faces} faces".format(
                            count=processedFiles, faces=facesFound
                        )
                    )
            except:
                # Bare except: any failure (unreadable image, odd file name)
                # is counted and reported, then processing continues.
                processedFiles += 1
                print(" > Failed parsing path: {path}".format(path=path))

        dataFile.close()


def getImageFilesForType(caseType):
    """Collect image file paths for one case type, honouring its exclusion list."""
    base = FILE_LOCATIONS.format(type=caseType)

    # Gather every matching file, extension by extension.
    filePaths = []
    for extension in IMAGE_EXTENSIONS:
        filePaths.extend(glob(base + "*." + extension))

    # Drop any path containing an excluded directory fragment.
    for excluded in CASE_TYPES[caseType]["excluded"]:
        filePaths = [path for path in filePaths if excluded not in path]

    return filePaths


# Module-level entry point: kicks off the extraction as soon as the
# script is executed (or imported).
main()
args = parse_args()

# Ensure the flat output directory for extracted face crops exists.
faces_dir = os.path.join(OUTPUT_BASE, "faces")
os.makedirs(faces_dir, exist_ok=True)

# Search either the user-supplied directory or the default output tree.
if args.input:
    search_path = args.input
else:
    search_path = OUTPUT_BASE

image_paths = []

# IMPORTANT !!! ensuring we EXCLUDE faces output directory to avoid duplicates
for ext in ['jpg', 'jpeg', 'png']:
    found_paths = glob.glob(f"{search_path}/**/*.{ext}", recursive=True)
    filtered_paths = [p for p in found_paths if '/faces/' not in p]
    image_paths.extend(filtered_paths)

print(f"Found {len(image_paths)} images")

if args.limit:
    image_paths = image_paths[:args.limit]

processed = 0
faces_found = 0
skipped = 0
errors = 0

for path in image_paths:
    try:
        namus_id = extract_namus_id(path)
        if namus_id == "unknown":
            print(f"Couldn't extract NamUs ID from {path}")

        image = face_recognition.load_image_file(path)  # extraction
        # BUGFIX: honour --model instead of the hard-coded "hog"
        # ("cnn" is slower but more accurate).
        face_locations = face_recognition.face_locations(
            image,
            model=args.model
        )

        if len(face_locations) > 0:
            print(f"Found {len(face_locations)} faces")

        for i, face_location in enumerate(face_locations):
            # face_locations yields (top, right, bottom, left) boxes.
            top, right, bottom, left = face_location

            # Pad the crop by 20% of the face box, clamped to the image.
            height = bottom - top
            width = right - left

            padding_v = int(height * 0.2)
            padding_h = int(width * 0.2)

            new_top = max(0, top - padding_v)  # boundaries
            new_bottom = min(image.shape[0], bottom + padding_v)
            new_left = max(0, left - padding_h)
            new_right = min(image.shape[1], right + padding_h)

            face_image = image[new_top:new_bottom, new_left:new_right]
            pil_image = Image.fromarray(face_image)

            # Normalise unknown extensions to jpg for the output name.
            extension = os.path.splitext(path)[1][1:].lower()
            if extension not in ['jpg', 'jpeg', 'png']:
                extension = 'jpg'

            output_path = FACES_OUTPUT.format(
                namus_id=namus_id,
                index=i,
                extension=extension
            )

            # BUGFIX: the parser defines no --force option, so `args.force`
            # raised AttributeError the first time a crop already existed;
            # treat a missing attribute as force disabled.
            if os.path.exists(output_path) and not getattr(args, "force", False):
                skipped += 1
                continue

            pil_image.save(output_path)
            faces_found += 1

        processed += 1
        if processed % 100 == 0:
            print(f"Progress: {processed} images processed; {faces_found} faces extracted")

    except Exception as e:
        # Best-effort: a single unreadable image must not stop the run.
        print(f"Couldn't process {path}: {str(e)}")
        errors += 1

print("\nSummary:")
print(f"Images processed: {processed}")
print(f"Faces extracted: {faces_found}")
print(f"Duplicates: {skipped}")
print(f"Errors: {errors}")

# NOTE(review): the rewritten script runs its processing at module top
# level and defines no `main`; confirm a matching `main` definition still
# exists, otherwise this guard raises NameError when run directly.
if __name__ == "__main__":
    main()


# Typical pipeline order:
# python3 scrape-data.py
# python3 scrape-files.py
# python3 process-faces.py
10 changes: 10 additions & 0 deletions scrape-data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os, json, grequests, requests, functools
import argparse

SEARCH_LIMIT = 10000
REQUEST_BATCH_SIZE = 50
Expand All @@ -19,8 +20,13 @@

completedCases = 0

def parse_args():
    """Parse CLI options; --limit caps how many cases are processed."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--limit', type=int)
    return arg_parser.parse_args()

def main():
args = parse_args()
print("Fetching states\n")
states = requests.get(STATE_ENDPOINT, headers={"User-Agent": USER_AGENT}).json()

Expand Down Expand Up @@ -59,6 +65,10 @@ def main():
[],
)

if args.limit:
cases = cases[:args.limit]
print(f" > Limited to {args.limit} cases")

print(" > Found %d cases" % len(cases))

print(" > Creating output file")
Expand Down
13 changes: 12 additions & 1 deletion scrape-files.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os, re, json, grequests, functools, mimetypes
import argparse

REQUEST_BATCH_SIZE = 10
REQUEST_FEEDBACK_INTERVAL = 50
Expand All @@ -12,12 +13,23 @@

KNOWN_EXTENSIONS = {"image/jpg": ".jpg", "image/pjpeg": ".jpg", "image/x-png": ".png"}

def parse_args():
    """Parse CLI options; --limit caps how many cases are processed."""
    cli = argparse.ArgumentParser()
    cli.add_argument('--limit', type=int)
    return cli.parse_args()

def main():
args = parse_args()

for caseType in CASE_TYPES:
print("Collecting: {type}".format(type=caseType))

cases = json.loads(open(DATA_INPUT.format(type=caseType), "r").read())

if args.limit:
cases = cases[:args.limit]
print(f" > Limited to {args.limit} cases")

print(" > Found %d cases" % len(cases))

files = functools.reduce(
Expand Down Expand Up @@ -119,5 +131,4 @@ def buildFilePath(caseType, case, fileData):
extension=extension or ".unknown",
)


main()