Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore

This file was deleted.

67 changes: 47 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,20 +1,47 @@
# NamUs Scraper
Python scraper for collecting metadata and files for Missing-, Unidentified-, and Unclaimed-person cases from the [National Missing and Unidentified Persons System (NamUs)](https://www.namus.gov) organization. The scraper uses APIs used for internal purposes at NamUs and may therefore change at any point.

To work around the 10,000-case search limit, cases are found by searching on a per-state basis. This may miss some cases if they are entered incorrectly! Compare result counts with those shown on the site.

⚠️ This requests a large amount of data. Please run it responsibly!

## Installation
```
sudo pip3 install requests
sudo pip3 install grequests
sudo pip3 install face_recognition
```

## Scraping
```
python3 scrape-data.py # Downloads all metadata related to the cases.
python3 scrape-files.py # Downloads all files related to the scraped cases.
python3 process-faces.py # Extracts faces from the downloaded images.
```
# NamUs Scraper Enhancements

I've updated the original NamUs Scraper repository by changing the face-extraction method and adding command-line arguments so that users have more control over how they scrape. This is mainly because of the large amount of data involved in this project.

## Changes Made:

### `process-faces.py` Changes:
* Complete rewrite with different face extraction method that processes images individually instead of batch pickle storage
* Argparse implemented (--limit, --input, --model)
* Added 20% padding around the detected faces
* Consistent naming format for the output files using the NamUs ID
* This is done because in our pipeline, we use this ID to match the faces to the correct person
* Implemented automatic NamUs ID extraction from filepath / filename
* Implemented skipping for duplicates

### `scrape-data.py` Changes:
* Argparse implemented (--limit)

### `scrape-files.py` Changes:
* Argparse implemented (--limit)

<br>
<hr>
<br>

The following is the original `README.md` from the NamUs Scraper repository from which the code was forked.

> # NamUs Scraper
> Python scraper for collecting metadata and files for Missing-, Unidentified-, and Unclaimed-person cases from the [National Missing and Unidentified Persons System (NamUs)](https://www.namus.gov) organization. The scraper uses APIs used for internal purposes at NamUs and may therefore change at any point.

> To work around the 10.000 case search limit, cases are found by searching on a per-state basis. This may miss some cases if they are entered incorrectly! Compare result counts with the ones available on the site.

> ⚠️ This requests a large amount of data. Please run it responsibly!

> ## Installation
> ```
> sudo pip3 install requests
> sudo pip3 install grequests
> sudo pip3 install face_recognition
> ```

> ## Scraping
> ```
> python3 scrape-data.py # Downloads all metadata related to the cases.
> python3 scrape-files.py # Downloads all files related to the scraped cases.
> python3 process-faces.py # Extracts faces from the downloaded images.
> ```
224 changes: 137 additions & 87 deletions process-faces.py
Original file line number Diff line number Diff line change
@@ -1,90 +1,140 @@
import os, pickle, face_recognition
from glob import glob
from functools import reduce
import os, re, glob, face_recognition
from PIL import Image

PROCESS_FEEDBACK_INTERVAL = 50

FILE_LOCATIONS = "./output/{type}/files/*/"
IMAGE_EXTENSIONS = ["jpg", "jpeg", "png"]

FACES_DATA = "./output/{type}/FaceEncodings.dat"
FACES_OUTPUT = "./output/{type}/faces/{file}-{index}.{extension}"
CASE_TYPES = {
"MissingPersons": {
"excluded": []
},
"UnidentifiedPersons": {
"excluded": ["/Clothing/", "/Footwear/", "/OfficeLogo/"]
}
}

import argparse
import requests
import time

OUTPUT_BASE = "./output"
FACES_OUTPUT = "./output/faces/namus{namus_id}-face{index}.{extension}"

def parse_args():
    """Parse CLI options for the face-extraction script.

    --limit  cap the number of images processed
    --input  directory to search for images (defaults to the output tree)
    --model  face_recognition detector: "hog" (fast) or "cnn" (accurate)
    --force  overwrite existing face crops instead of skipping them
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--limit', type=int)
    parser.add_argument('--input', default=None)
    parser.add_argument('--model', default='hog')  # hog is faster than CNN.. would use CNN if more time
    # BUGFIX: the script body checks `args.force` before overwriting an
    # existing crop, but no --force option was ever registered, so that
    # check raised AttributeError. Define it with a False default.
    parser.add_argument('--force', action='store_true')
    return parser.parse_args()

def extract_namus_id(filepath):
    """Best-effort extraction of a NamUs case ID from an image path.

    First looks for a digit run delimited by a slash and a slash/hyphen in
    the full path; failing that, falls back to the first digit run in the
    file name. Returns the string "unknown" when no digits are found.
    """
    path_hit = re.search(r'/(\d+)[/-]', filepath)
    if path_hit is not None:
        return path_hit.group(1)

    name_hit = re.search(r'(\d+)', os.path.basename(filepath))
    return name_hit.group(1) if name_hit is not None else "unknown"

def make_request_with_retry(url, headers, max_retries=3):
    """GET *url* with up to *max_retries* attempts.

    Returns the successful ``requests.Response``, or ``None`` when every
    attempt fails (callers must handle the ``None`` case themselves).
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            return response
        except requests.RequestException:
            # BUGFIX: was a blanket `except Exception`, which also swallowed
            # programming errors (TypeError etc.); only network/HTTP
            # failures are retried now, everything else propagates.
            if attempt == max_retries - 1:
                return None
            time.sleep(2)  # brief fixed back-off between attempts
    # Reached only when max_retries <= 0; previously an implicit None.
    return None

def main():
    """Extract faces from every scraped image, one pass per case type.

    For each case type in CASE_TYPES this pickles the face encodings found
    in each image to the FACES_DATA file and saves one cropped face image
    per detected face under FACES_OUTPUT.
    """
    for caseType in CASE_TYPES:
        print("Processing: {type}".format(type=caseType))

        print(" > Fetching image file paths")
        paths = getImageFilesForType(caseType)
        print(" > Found %d files" % len(paths))

        # Create the faces output directory for this case type up front.
        os.makedirs(
            os.path.dirname(
                FACES_OUTPUT.format(type=caseType, file="*", index="*", extension="*")
            ),
            exist_ok=True,
        )
        dataFile = open(FACES_DATA.format(type=caseType), 'wb')

        print(" > Starting face extraction")
        processedFiles, facesFound = 0, 0
        for path in paths:
            try:
                image = face_recognition.load_image_file(path)
                locations = face_recognition.face_locations(image)
                encodings = face_recognition.face_encodings(image, locations)

                # One pickle record per image: {path: [encoding, ...]}.
                if len(encodings):
                    pickle.dump({path: encodings}, dataFile)

                # NOTE(review): assumes the file name contains exactly one
                # "." — a name like "a.b.jpg" would pick "b" as extension.
                pathParts = path.split("/")[-1].split(".")
                fileName, extension = pathParts[0], pathParts[1]

                for index, location in enumerate(locations):
                    outputPath = FACES_OUTPUT.format(
                        type=caseType, file=fileName, index=index, extension=extension
                    )

                    # The location tuple is unpacked as (top, right, bottom, left).
                    top, right, bottom, left = location
                    face = Image.fromarray(image[top:bottom, left:right])
                    face.save(outputPath)
                    facesFound += 1

                processedFiles += 1
                if processedFiles % PROCESS_FEEDBACK_INTERVAL == 0:
                    print(
                        " > Processed {count} files with {faces} faces".format(
                            count=processedFiles, faces=facesFound
                        )
                    )
            except:
                # Bare except: any failure (unreadable image, odd file name)
                # is counted and reported, then processing continues.
                processedFiles += 1
                print(" > Failed parsing path: {path}".format(path=path))

        dataFile.close()


def getImageFilesForType(caseType):
    """Collect image file paths for one case type, honouring its exclusion list."""
    base = FILE_LOCATIONS.format(type=caseType)

    # Gather every matching file, extension by extension.
    filePaths = []
    for extension in IMAGE_EXTENSIONS:
        filePaths.extend(glob(base + "*." + extension))

    # Drop any path containing an excluded directory fragment.
    for excluded in CASE_TYPES[caseType]["excluded"]:
        filePaths = [path for path in filePaths if excluded not in path]

    return filePaths


# Module-level entry point: kicks off the extraction as soon as the
# script is executed (or imported).
main()
args = parse_args()

# Ensure the flat output directory for extracted face crops exists.
faces_dir = os.path.join(OUTPUT_BASE, "faces")
os.makedirs(faces_dir, exist_ok=True)

# Search either the user-supplied directory or the default output tree.
if args.input:
    search_path = args.input
else:
    search_path = OUTPUT_BASE

image_paths = []

# IMPORTANT !!! ensuring we EXCLUDE faces output directory to avoid duplicates
for ext in ['jpg', 'jpeg', 'png']:
    found_paths = glob.glob(f"{search_path}/**/*.{ext}", recursive=True)
    filtered_paths = [p for p in found_paths if '/faces/' not in p]
    image_paths.extend(filtered_paths)

print(f"Found {len(image_paths)} images")

if args.limit:
    image_paths = image_paths[:args.limit]

processed = 0
faces_found = 0
skipped = 0
errors = 0

for path in image_paths:
    try:
        namus_id = extract_namus_id(path)
        if namus_id == "unknown":
            print(f"Couldn't extract NamUs ID from {path}")

        image = face_recognition.load_image_file(path)  # extraction
        # BUGFIX: honour --model instead of the hard-coded "hog"
        # ("cnn" is slower but more accurate).
        face_locations = face_recognition.face_locations(
            image,
            model=args.model
        )

        if len(face_locations) > 0:
            print(f"Found {len(face_locations)} faces")

        for i, face_location in enumerate(face_locations):
            # face_locations yields (top, right, bottom, left) boxes.
            top, right, bottom, left = face_location

            # Pad the crop by 20% of the face box, clamped to the image.
            height = bottom - top
            width = right - left

            padding_v = int(height * 0.2)
            padding_h = int(width * 0.2)

            new_top = max(0, top - padding_v)  # boundaries
            new_bottom = min(image.shape[0], bottom + padding_v)
            new_left = max(0, left - padding_h)
            new_right = min(image.shape[1], right + padding_h)

            face_image = image[new_top:new_bottom, new_left:new_right]
            pil_image = Image.fromarray(face_image)

            # Normalise unknown extensions to jpg for the output name.
            extension = os.path.splitext(path)[1][1:].lower()
            if extension not in ['jpg', 'jpeg', 'png']:
                extension = 'jpg'

            output_path = FACES_OUTPUT.format(
                namus_id=namus_id,
                index=i,
                extension=extension
            )

            # BUGFIX: the parser defines no --force option, so `args.force`
            # raised AttributeError the first time a crop already existed;
            # treat a missing attribute as force disabled.
            if os.path.exists(output_path) and not getattr(args, "force", False):
                skipped += 1
                continue

            pil_image.save(output_path)
            faces_found += 1

        processed += 1
        if processed % 100 == 0:
            print(f"Progress: {processed} images processed; {faces_found} faces extracted")

    except Exception as e:
        # Best-effort: a single unreadable image must not stop the run.
        print(f"Couldn't process {path}: {str(e)}")
        errors += 1

print("\nSummary:")
print(f"Images processed: {processed}")
print(f"Faces extracted: {faces_found}")
print(f"Duplicates: {skipped}")
print(f"Errors: {errors}")

# NOTE(review): the rewritten script runs its processing at module top
# level and defines no `main`; confirm a matching `main` definition still
# exists, otherwise this guard raises NameError when run directly.
if __name__ == "__main__":
    main()


# Typical pipeline order:
# python3 scrape-data.py
# python3 scrape-files.py
# python3 process-faces.py
10 changes: 10 additions & 0 deletions scrape-data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os, json, grequests, requests, functools
import argparse

SEARCH_LIMIT = 10000
REQUEST_BATCH_SIZE = 50
Expand All @@ -19,8 +20,13 @@

completedCases = 0

def parse_args():
    """Parse CLI options; --limit caps how many cases are processed."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--limit', type=int)
    return arg_parser.parse_args()

def main():
args = parse_args()
print("Fetching states\n")
states = requests.get(STATE_ENDPOINT, headers={"User-Agent": USER_AGENT}).json()

Expand Down Expand Up @@ -59,6 +65,10 @@ def main():
[],
)

if args.limit:
cases = cases[:args.limit]
print(f" > Limited to {args.limit} cases")

print(" > Found %d cases" % len(cases))

print(" > Creating output file")
Expand Down
13 changes: 12 additions & 1 deletion scrape-files.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os, re, json, grequests, functools, mimetypes
import argparse

REQUEST_BATCH_SIZE = 10
REQUEST_FEEDBACK_INTERVAL = 50
Expand All @@ -12,12 +13,23 @@

KNOWN_EXTENSIONS = {"image/jpg": ".jpg", "image/pjpeg": ".jpg", "image/x-png": ".png"}

def parse_args():
    """Parse CLI options; --limit caps how many cases are processed."""
    cli = argparse.ArgumentParser()
    cli.add_argument('--limit', type=int)
    return cli.parse_args()

def main():
args = parse_args()

for caseType in CASE_TYPES:
print("Collecting: {type}".format(type=caseType))

cases = json.loads(open(DATA_INPUT.format(type=caseType), "r").read())

if args.limit:
cases = cases[:args.limit]
print(f" > Limited to {args.limit} cases")

print(" > Found %d cases" % len(cases))

files = functools.reduce(
Expand Down Expand Up @@ -119,5 +131,4 @@ def buildFilePath(caseType, case, fileData):
extension=extension or ".unknown",
)


main()