"""Scrape NamUs case data and write one pure-JSON file per case type.

For each case type (MissingPersons, UnidentifiedPersons, UnclaimedPersons)
the script fetches all US states, runs one search per state to collect case
identifiers, downloads every case record concurrently with grequests, and
dumps the cases keyed by case id into ./output/<type>/<type>.json.
"""

import functools
import json
import os

# NOTE: grequests is imported before requests so gevent's monkey-patching
# (done on grequests import) is applied first — TODO confirm this ordering
# requirement against the installed grequests version.
import grequests
import requests

SEARCH_LIMIT = 10000            # max results requested per per-state search
REQUEST_BATCH_SIZE = 50         # concurrent requests per grequests.map batch
REQUEST_FEEDBACK_INTERVAL = 50  # print progress every N completed cases

USER_AGENT = "NamUs Scraper / github.com/prepager/namus-scraper"
API_ENDPOINT = "https://www.namus.gov/api"
STATE_ENDPOINT = API_ENDPOINT + "/CaseSets/NamUs/States"
CASE_ENDPOINT = API_ENDPOINT + "/CaseSets/NamUs/{type}/Cases/{case}"
SEARCH_ENDPOINT = API_ENDPOINT + "/CaseSets/NamUs/{type}/Search"

DATA_OUTPUT = "./output/{type}/{type}.json"

# Per case type: the name of the search predicate field used to filter
# results by state.
CASE_TYPES = {
    "MissingPersons": {"stateField": "stateOfLastContact"},
    "UnidentifiedPersons": {"stateField": "stateOfRecovery"},
    "UnclaimedPersons": {"stateField": "stateFound"},
}

# Running count of downloaded cases, incremented by the response hook
# (requestFeedback) and reset per case type in main().
completedCases = 0


def main():
    """Scrape every configured case type and write its JSON output file."""
    print("Fetching states\n")
    states = requests.get(STATE_ENDPOINT, headers={"User-Agent": USER_AGENT}).json()

    for caseType in CASE_TYPES:
        output_dict = {}

        print("Collecting: {type}".format(type=caseType))

        global completedCases
        completedCases = 0

        print(" > Fetching case identifiers")
        searchRequests = (
            grequests.post(
                SEARCH_ENDPOINT.format(type=caseType),
                headers={"User-Agent": USER_AGENT, "Content-Type": "application/json"},
                data=json.dumps(
                    {
                        "take": SEARCH_LIMIT,
                        "projections": ["namus2Number"],
                        "predicates": [
                            {
                                "field": CASE_TYPES[caseType]["stateField"],
                                "operator": "IsIn",
                                "values": [state["name"]],
                            }
                        ],
                    }
                ),
            )
            for state in states
        )

        searchRequests = grequests.map(searchRequests, size=REQUEST_BATCH_SIZE)

        # Flatten the per-state search results into one list of case ids.
        # grequests.map yields None for a failed request; skip those instead
        # of crashing on element.json() (bug fix: the original reduce had no
        # None guard, unlike the case-download loop below).
        cases = functools.reduce(
            lambda collected, response: collected
            + (response.json()["results"] if response else []),
            searchRequests,
            [],
        )

        print(" > Found %d cases" % len(cases))

        print(" > Creating output file")
        filePath = DATA_OUTPUT.format(type=caseType)
        os.makedirs(os.path.dirname(filePath), exist_ok=True)

        print(" > Starting case processing")
        caseRequests = (
            grequests.get(
                CASE_ENDPOINT.format(type=caseType, case=case["namus2Number"]),
                hooks={"response": requestFeedback},
                headers={"User-Agent": USER_AGENT},
            )
            for case in cases
        )

        caseRequests = grequests.map(caseRequests, size=REQUEST_BATCH_SIZE)
        for index, case in enumerate(caseRequests):
            # A failed request is None; a non-2xx response is falsy too
            # (requests.Response truthiness is Response.ok).
            if not case:
                print(
                    " > Failed parsing case: {case} index {index}".format(
                        case=cases[index], index=index
                    )
                )
                continue

            case_record = json.loads(case.text)
            output_dict[case_record["id"]] = case_record

        print(" > Closing output file")
        with open(filePath, "w") as outfile:
            json.dump(output_dict, outfile)

    print("Scraping completed")


def requestFeedback(response, **kwargs):
    """grequests response hook: count completed cases, print periodic progress."""
    global completedCases
    completedCases = completedCases + 1

    if completedCases % REQUEST_FEEDBACK_INTERVAL == 0:
        print(" > Completed {count} cases".format(count=completedCases))


if __name__ == "__main__":
    # Guarded entry point (fix: the original called main() unconditionally,
    # firing network requests on mere import of the module).
    main()