Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
The diff you're trying to view is too large. We only load the first 3000 changed files.
9 changes: 9 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,12 @@ fastlane/test_output

iOSInjectionProject/
*.playground

GoogleService-Info.plist

backend-scripts/json/*
backend-scripts/minijson/*

.DS_STORE
*.pyc
__pycache__/
18 changes: 18 additions & 0 deletions backend-scripts/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# A collection of scripts for working with the Firebase backend

## Scripts
* firebase\_import.py
* Loads all data from the local JSON files, formats it, and uploads it to a Cloud Firestore database.
* image\_merger.py
  * Functions related to merging groups of images located close to each other, which prevents overlapping annotations in the app.
* json\_loader.py
* Functions related to loading and formatting data from the raw local dataset.

## Subdirectories
* json
* The complete OldNYC dataset. Each file corresponds to a single annotation (prior to merging via image\_merger.py).
* minijson
  * A small subset of the complete dataset, used for testing.
* unmaintained
* Older, undocumented versions of the above scripts.

Empty file added backend-scripts/__init__.py
Empty file.
5 changes: 5 additions & 0 deletions backend-scripts/borough_boundaries.geojson

Large diffs are not rendered by default.

78 changes: 78 additions & 0 deletions backend-scripts/firebase_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import datetime
import os

import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

import image_merger
import json_loader


def load_image_groups(data_directory, borough_poly_path):
    """ Builds the full list of merged image groups from the raw dataset.

    data_directory: directory holding the OldNYC JSON data files
    borough_poly_path: path to the geojson file defining the NYC borough boundaries
    Returns a list of ImageGroup objects with nearby groups already merged.
    """
    borough_polys = json_loader.load_borough_polygons(borough_poly_path)
    groups = list(json_loader.image_group_loader(data_directory))

    # Tag each group with its containing borough before merging
    for group in groups:
        group.json['borough_code'] = group.test_borough(borough_polys)

    return image_merger.merge_nearby_points(groups)


def load_firestore_db():
    """ Connects to Cloud Firestore as an admin and returns the database client.

    To use this script, generate and download a private certificate in the
    firestore control panel, store it locally (outside the repo), and create
    an environment variable pointing to it.

    Raises:
        RuntimeError: if the FIREBASE_CERTIFICATE_PATH environment variable is unset.
    """
    cert_path = os.getenv('FIREBASE_CERTIFICATE_PATH')
    if cert_path is None:
        # Bug fix: the original raised the undefined name `Error`, which would
        # surface as a NameError and hide the intended diagnostic message.
        raise RuntimeError('Failed to find certificate to connect to Firebase as admin. Double-check the FIREBASE_CERTIFICATE_PATH environment variable')
    cred = credentials.Certificate(cert_path)
    firebase_admin.initialize_app(cred)
    db = firestore.client()
    return db


def batch_upload_image_groups(image_groups, database, batch_size=500):
    """ Uploads all image groups (and their photo subcollections) to Firestore
    in batches of limited size.

    image_groups: iterable of ImageGroup objects to upload
    database: a firestore client, as returned by load_firestore_db()
    batch_size: maximum document writes per committed batch (Firestore caps
        batched writes at 500 operations)
    """
    batch = database.batch()
    batch_counter = 1
    doc_counter = 0
    print(f'{datetime.datetime.now()}: Beginning upload of batch {batch_counter}')

    def _record_write():
        # Count one queued write; commit and start a fresh batch once full.
        nonlocal batch, batch_counter, doc_counter
        doc_counter += 1
        if doc_counter >= batch_size:
            batch.commit()
            batch = database.batch()
            doc_counter = 0
            batch_counter += 1
            print(f'{datetime.datetime.now()}: Beginning upload of batch {batch_counter}')

    for image_group in image_groups:
        ig_ref = database.collection(u'nyc-image-groups').document()
        batch.set(ig_ref, image_group.parent_json())
        _record_write()
        photo_coll = ig_ref.collection(u'photos')
        for photo in image_group.full_json()['photos']:
            photo_ref = photo_coll.document(photo['id'])
            batch.set(photo_ref, photo)
            _record_write()

    # Bug fix: the original never committed the final partially-filled batch,
    # silently dropping up to batch_size - 1 documents.
    if doc_counter > 0:
        batch.commit()

    print('Batched upload complete')


if __name__ == '__main__':
    # Connect first so we fail fast if the Firebase certificate is missing
    database = load_firestore_db()
    groups = load_image_groups('json', 'borough_boundaries.geojson')
    batch_upload_image_groups(groups, database, 400)

59 changes: 59 additions & 0 deletions backend-scripts/image_merger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import numpy as np
import pyproj
from scipy.spatial import cKDTree as Tree


def make_groups(paired_points):
    """ Treats the set of pairs generated by SciPy's kdTree query as an equivalence
    relation on the underlying indices and generates the list of equivalence
    classes (i.e. the connected components of the pair graph).

    paired_points: an iterable of (i, j) index pairs
    Returns a list of disjoint sets of indices; every index appearing in some
    pair belongs to exactly one returned set.

    Bug fix: the original added a pair to the FIRST class containing either
    element, so a pair bridging two existing classes never merged them (e.g.
    pairs (1,2),(3,4),(2,3) produced {1,2,3} and {3,4}, with 3 in two classes).
    A union-find structure computes the transitive closure correctly.
    """
    parent = {}

    def _find(x):
        # Find x's root, compressing the path for subsequent lookups
        parent.setdefault(x, x)
        root = x
        while parent[root] != root:
            root = parent[root]
        while parent[x] != root:
            parent[x], x = root, parent[x]
        return root

    for a, b in paired_points:
        root_a, root_b = _find(a), _find(b)
        if root_a != root_b:
            parent[root_a] = root_b

    eq_classes = {}
    for idx in parent:
        eq_classes.setdefault(_find(idx), set()).add(idx)
    return list(eq_classes.values())


def merge_nearby_points(image_groups, thresh_distance=20):
    """ Collapses ImageGroups lying within thresh_distance meters of each other
    into single merged groups.

    image_groups: list of ImageGroup objects
    thresh_distance: merge radius in meters
    Returns a new list containing the untouched groups plus one merged group
    per cluster of nearby points.
    """
    # Project lat/lon (EPSG:4326) into a meter-based system (EPSG:3857) so
    # KD-tree distances are expressed in meters.
    # (Projection approach originally adapted from a stackoverflow post;
    # the original reference link has been lost.)
    to_meters = pyproj.Transformer.from_crs('epsg:4326', 'epsg:3857')
    points = np.array([
        to_meters.transform(g.json['latitude'], g.json['longitude'])
        for g in image_groups])

    clusters = make_groups(Tree(points).query_pairs(thresh_distance))
    # Indices belonging to no cluster pass through unmodified
    unclustered = set(range(len(image_groups))).difference(*clusters)

    result = [image_groups[i] for i in unclustered]
    for cluster in clusters:
        first, *rest = list(cluster)
        merged = image_groups[first].merge_many([image_groups[i] for i in rest])
        result.append(merged)

    return result

152 changes: 152 additions & 0 deletions backend-scripts/json_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import json
import os
from os import path

import geopy.distance
import numpy as np
import pyproj
from scipy.spatial import cKDTree as Tree
from shapely.geometry import Point, shape


""" Loads data from json and does some minor preprocessing to merge groups of
images that are located at the same point or very close to each other.
This results in fewer overlapping image annotations within the app and allows
us to avoid the use of MKClusterAnnotation, which is bugged.
"""


class ImageGroup:
    """ A thin wrapper around an ImageGroup JSON dict providing convenience
    methods for distance math, merging, and borough hit-testing.
    """
    def __init__(self, json_object):
        # The wrapped dict; expected keys: 'latitude', 'longitude', 'photos'
        self.json = json_object

    def distance_to(self, other):
        """ Great-circle distance between two ImageGroups, in meters """
        here = (self.json['latitude'], self.json['longitude'])
        there = (other.json['latitude'], other.json['longitude'])
        return geopy.distance.distance(here, there).m

    def merge(self, other):
        """ Returns a new ImageGroup combining self with other, positioned at
        the midpoint of the two coordinates
        """
        return ImageGroup({
            'latitude': (self.json['latitude'] + other.json['latitude']) / 2,
            'longitude': (self.json['longitude'] + other.json['longitude']) / 2,
            'photos': self.json['photos'] + other.json['photos'],
        })

    def merge_many(self, others):
        """ Returns a new ImageGroup combining self with every group in others,
        positioned at the average of all coordinates
        """
        everyone = [self, *others]
        avg_lat = sum(g.json['latitude'] for g in everyone) / len(everyone)
        avg_lon = sum(g.json['longitude'] for g in everyone) / len(everyone)
        photos = list()
        for group in everyone:
            photos.extend(group.json['photos'])
        return ImageGroup({'latitude': avg_lat, 'longitude': avg_lon, 'photos': photos})

    def nearest_group(self, image_groups):
        """ Finds the ImageGroup in image_groups closest to self.
        Returns (group, index, distance) or None when image_groups is empty
        """
        best = None
        best_idx = 0
        best_dist = 1e10  # sentinel larger than any distance on Earth
        for idx, candidate in enumerate(image_groups):
            separation = self.distance_to(candidate)
            if separation < best_dist:
                best, best_idx, best_dist = candidate, idx, separation
        if best is None:
            return None
        return best, best_idx, best_dist

    def test_borough(self, boroughs):
        """ Hit-tests self against each borough polygon and returns the index
        of the first containing one, or len(boroughs) as a dummy value when
        none contains the point (possible e.g. for photos taken on bridges)
        """
        # Shapely uses a (longitude, latitude) argument order
        location = Point(self.json['longitude'], self.json['latitude'])
        for idx, polygon in enumerate(boroughs):
            if polygon.contains(location):
                return idx
        return len(boroughs)

    def full_json(self):
        """ The complete wrapped JSON object, photos included """
        return self.json

    def parent_json(self):
        """ Only the top-level fields (coordinates, plus borough code when
        present) -- suitable for the parent Firestore document
        """
        parent = {'latitude': self.json['latitude'], 'longitude': self.json['longitude']}
        if 'borough_code' in self.json:
            parent['borough_code'] = self.json['borough_code']
        return parent


def coords_from_fname(fname):
    """ Recovers (latitude, longitude) from a data filename.

    The data files embed the image coordinates in the filename, in the format
    <lat><long>.json. Since all points in NYC have negative longitude, the
    longitude's '-' sign doubles as the separator between the two numbers.
    """
    stem = fname[:-5]  # drop the trailing '.json'
    lat_text, lon_text = stem.split('-')
    # Re-apply the minus sign consumed by the split
    return float(lat_text), -float(lon_text)


def get_entries_from_file(fdir, fname):
    """ Reads one data file and converts it into an ImageGroup object,
    stripping several unused fields from each photo entry.
    """
    with open(path.join(fdir, fname), 'r') as json_file:
        photo_dicts = json.load(json_file)
    latitude, longitude = coords_from_fname(fname)
    for photo_id, photo in photo_dicts.items():
        # The image id originally appears only as the key of the enclosing
        # dictionary; promote it to a property of the photo itself
        photo['id'] = photo_id
        # `years` usually lacks data (and sometimes holds multiple values,
        # which is hard to process); `original_title` and `title` are also
        # sparse -- `folder` is a far more useful label for the ImageGroup
        for unused_key in ('years', 'original_title', 'title'):
            photo.pop(unused_key, None)
    # A single image group: the shared coordinates plus the list of images
    return ImageGroup({
        'latitude': latitude,
        'longitude': longitude,
        'photos': list(photo_dicts.values()),
    })


def image_group_loader(json_directory):
    """ A generator yielding one ImageGroup per .json file in json_directory """
    for fname in os.listdir(json_directory):
        if fname.endswith('.json'):
            yield get_entries_from_file(json_directory, fname)


def load_borough_polygons(geo_filename):
    """ Loads polygons from the NYC borough geojson data for hit-testing.

    Prints the index assigned to each borough as a side effect; these indices
    determine the values given to the Borough enum in Swift.
    """
    with open(geo_filename, 'r') as geo_file:
        geo_data = json.load(geo_file)
    polygons = list()
    for idx, feature in enumerate(geo_data['features']):
        name = feature['properties']['boro_name']
        polygons.append(shape(feature['geometry']))
        print(f'Feature {idx}: {name}')
    return polygons


Loading