Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
The diff you're trying to view is too large. We only load the first 3000 changed files.
9 changes: 9 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,12 @@ fastlane/test_output

iOSInjectionProject/
*.playground

GoogleService-Info.plist

backend-scripts/json/*
backend-scripts/minijson/*

.DS_STORE
*.pyc
__pycache__/
18 changes: 18 additions & 0 deletions backend-scripts/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# A collection of scripts for working with the Firebase backend

## Scripts
* firebase\_import.py
* Loads all data from the local JSON files, formats it, and uploads it to a Cloud Firestore database.
* image\_merger.py
  * Functions related to merging groups of images located close to each other, which prevents overlapping annotations in the app.
* json\_loader.py
* Functions related to loading and formatting data from the raw local dataset.

## Subdirectories
* json
* The complete OldNYC dataset. Each file corresponds to a single annotation (prior to merging via image\_merger.py).
* minijson
  * A small subset of the complete dataset, used for testing.
* unmaintained
* Older, undocumented versions of the above scripts.

Empty file added backend-scripts/__init__.py
Empty file.
5 changes: 5 additions & 0 deletions backend-scripts/borough_boundaries.geojson

Large diffs are not rendered by default.

78 changes: 78 additions & 0 deletions backend-scripts/firebase_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import datetime
import os

import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

import image_merger
import json_loader


def load_image_groups(data_directory, borough_poly_path):
    """ Builds the full list of merged image groups from the raw dataset.

    data_directory: directory holding the OldNYC JSON data files
    borough_poly_path: path to the geojson file defining the NYC borough boundaries
    Returns a list of ImageGroup objects with nearby groups already merged.
    """
    borough_polys = json_loader.load_borough_polygons(borough_poly_path)
    groups = list(json_loader.image_group_loader(data_directory))

    # Tag each group with its containing borough before merging
    for group in groups:
        group.json['borough_code'] = group.test_borough(borough_polys)

    return image_merger.merge_nearby_points(groups)


def load_firestore_db():
    """ Connects to Cloud Firestore as an admin and returns the database client.

    To use this script, generate and download a private certificate in the
    firestore control panel, store it locally (outside the repo), and create
    an environment variable pointing to it.

    Raises:
        RuntimeError: if the FIREBASE_CERTIFICATE_PATH environment variable is unset.
    """
    cert_path = os.getenv('FIREBASE_CERTIFICATE_PATH')
    if cert_path is None:
        # Bug fix: the original raised the undefined name `Error`, which would
        # surface as a NameError and hide the intended diagnostic message.
        raise RuntimeError('Failed to find certificate to connect to Firebase as admin. Double-check the FIREBASE_CERTIFICATE_PATH environment variable')
    cred = credentials.Certificate(cert_path)
    firebase_admin.initialize_app(cred)
    db = firestore.client()
    return db


def batch_upload_image_groups(image_groups, database, batch_size=500):
    """ Uploads all image groups (and their photo subcollections) to Firestore
    in batches of limited size.

    image_groups: iterable of ImageGroup objects to upload
    database: a firestore client, as returned by load_firestore_db()
    batch_size: maximum document writes per committed batch (Firestore caps
        batched writes at 500 operations)
    """
    batch = database.batch()
    batch_counter = 1
    doc_counter = 0
    print(f'{datetime.datetime.now()}: Beginning upload of batch {batch_counter}')

    def _record_write():
        # Count one queued write; commit and start a fresh batch once full.
        nonlocal batch, batch_counter, doc_counter
        doc_counter += 1
        if doc_counter >= batch_size:
            batch.commit()
            batch = database.batch()
            doc_counter = 0
            batch_counter += 1
            print(f'{datetime.datetime.now()}: Beginning upload of batch {batch_counter}')

    for image_group in image_groups:
        ig_ref = database.collection(u'nyc-image-groups').document()
        batch.set(ig_ref, image_group.parent_json())
        _record_write()
        photo_coll = ig_ref.collection(u'photos')
        for photo in image_group.full_json()['photos']:
            photo_ref = photo_coll.document(photo['id'])
            batch.set(photo_ref, photo)
            _record_write()

    # Bug fix: the original never committed the final partially-filled batch,
    # silently dropping up to batch_size - 1 documents.
    if doc_counter > 0:
        batch.commit()

    print('Batched upload complete')


if __name__ == '__main__':
    # Connect first so we fail fast if the Firebase certificate is missing
    database = load_firestore_db()
    groups = load_image_groups('json', 'borough_boundaries.geojson')
    batch_upload_image_groups(groups, database, 400)

59 changes: 59 additions & 0 deletions backend-scripts/image_merger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import numpy as np
import pyproj
from scipy.spatial import cKDTree as Tree


def make_groups(paired_points):
    """ Treats the set of pairs generated by SciPy's kdTree query as an equivalence
    relation on the underlying indices and generates the list of equivalence
    classes (i.e. the connected components of the pair graph).

    paired_points: an iterable of (i, j) index pairs
    Returns a list of disjoint sets of indices; every index appearing in some
    pair belongs to exactly one returned set.

    Bug fix: the original added a pair to the FIRST class containing either
    element, so a pair bridging two existing classes never merged them (e.g.
    pairs (1,2),(3,4),(2,3) produced {1,2,3} and {3,4}, with 3 in two classes).
    A union-find structure computes the transitive closure correctly.
    """
    parent = {}

    def _find(x):
        # Find x's root, compressing the path for subsequent lookups
        parent.setdefault(x, x)
        root = x
        while parent[root] != root:
            root = parent[root]
        while parent[x] != root:
            parent[x], x = root, parent[x]
        return root

    for a, b in paired_points:
        root_a, root_b = _find(a), _find(b)
        if root_a != root_b:
            parent[root_a] = root_b

    eq_classes = {}
    for idx in parent:
        eq_classes.setdefault(_find(idx), set()).add(idx)
    return list(eq_classes.values())


def merge_nearby_points(image_groups, thresh_distance=20):
    """ Collapses ImageGroups lying within thresh_distance meters of each other
    into single merged groups.

    image_groups: list of ImageGroup objects
    thresh_distance: merge radius in meters
    Returns a new list containing the untouched groups plus one merged group
    per cluster of nearby points.
    """
    # Project lat/lon (EPSG:4326) into a meter-based system (EPSG:3857) so
    # KD-tree distances are expressed in meters.
    # (Projection approach originally adapted from a stackoverflow post;
    # the original reference link has been lost.)
    to_meters = pyproj.Transformer.from_crs('epsg:4326', 'epsg:3857')
    points = np.array([
        to_meters.transform(g.json['latitude'], g.json['longitude'])
        for g in image_groups])

    clusters = make_groups(Tree(points).query_pairs(thresh_distance))
    # Indices belonging to no cluster pass through unmodified
    unclustered = set(range(len(image_groups))).difference(*clusters)

    result = [image_groups[i] for i in unclustered]
    for cluster in clusters:
        first, *rest = list(cluster)
        merged = image_groups[first].merge_many([image_groups[i] for i in rest])
        result.append(merged)

    return result

152 changes: 152 additions & 0 deletions backend-scripts/json_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import json
import os
from os import path

import geopy.distance
import numpy as np
import pyproj
from scipy.spatial import cKDTree as Tree
from shapely.geometry import Point, shape


""" Loads data from json and does some minor preprocessing to merge groups of
images that are located at the same point or very close to each other.
This results in fewer overlapping image annotations within the app and allows
us to avoid the use of MKClusterAnnotation, which is bugged.
"""


class ImageGroup:
    """ A thin wrapper around an ImageGroup JSON dict providing convenience
    methods for distance math, merging, and borough hit-testing.
    """
    def __init__(self, json_object):
        # The wrapped dict; expected keys: 'latitude', 'longitude', 'photos'
        self.json = json_object

    def distance_to(self, other):
        """ Great-circle distance between two ImageGroups, in meters """
        here = (self.json['latitude'], self.json['longitude'])
        there = (other.json['latitude'], other.json['longitude'])
        return geopy.distance.distance(here, there).m

    def merge(self, other):
        """ Returns a new ImageGroup combining self with other, positioned at
        the midpoint of the two coordinates
        """
        return ImageGroup({
            'latitude': (self.json['latitude'] + other.json['latitude']) / 2,
            'longitude': (self.json['longitude'] + other.json['longitude']) / 2,
            'photos': self.json['photos'] + other.json['photos'],
        })

    def merge_many(self, others):
        """ Returns a new ImageGroup combining self with every group in others,
        positioned at the average of all coordinates
        """
        everyone = [self, *others]
        avg_lat = sum(g.json['latitude'] for g in everyone) / len(everyone)
        avg_lon = sum(g.json['longitude'] for g in everyone) / len(everyone)
        photos = list()
        for group in everyone:
            photos.extend(group.json['photos'])
        return ImageGroup({'latitude': avg_lat, 'longitude': avg_lon, 'photos': photos})

    def nearest_group(self, image_groups):
        """ Finds the ImageGroup in image_groups closest to self.
        Returns (group, index, distance) or None when image_groups is empty
        """
        best = None
        best_idx = 0
        best_dist = 1e10  # sentinel larger than any distance on Earth
        for idx, candidate in enumerate(image_groups):
            separation = self.distance_to(candidate)
            if separation < best_dist:
                best, best_idx, best_dist = candidate, idx, separation
        if best is None:
            return None
        return best, best_idx, best_dist

    def test_borough(self, boroughs):
        """ Hit-tests self against each borough polygon and returns the index
        of the first containing one, or len(boroughs) as a dummy value when
        none contains the point (possible e.g. for photos taken on bridges)
        """
        # Shapely uses a (longitude, latitude) argument order
        location = Point(self.json['longitude'], self.json['latitude'])
        for idx, polygon in enumerate(boroughs):
            if polygon.contains(location):
                return idx
        return len(boroughs)

    def full_json(self):
        """ The complete wrapped JSON object, photos included """
        return self.json

    def parent_json(self):
        """ Only the top-level fields (coordinates, plus borough code when
        present) -- suitable for the parent Firestore document
        """
        parent = {'latitude': self.json['latitude'], 'longitude': self.json['longitude']}
        if 'borough_code' in self.json:
            parent['borough_code'] = self.json['borough_code']
        return parent


def coords_from_fname(fname):
    """ Recovers (latitude, longitude) from a data filename.

    The data files embed the image coordinates in the filename, in the format
    <lat><long>.json. Since all points in NYC have negative longitude, the
    longitude's '-' sign doubles as the separator between the two numbers.
    """
    stem = fname[:-5]  # drop the trailing '.json'
    lat_text, lon_text = stem.split('-')
    # Re-apply the minus sign consumed by the split
    return float(lat_text), -float(lon_text)


def get_entries_from_file(fdir, fname):
    """ Reads one data file and converts it into an ImageGroup object,
    stripping several unused fields from each photo entry.
    """
    with open(path.join(fdir, fname), 'r') as json_file:
        photo_dicts = json.load(json_file)
    latitude, longitude = coords_from_fname(fname)
    for photo_id, photo in photo_dicts.items():
        # The image id originally appears only as the key of the enclosing
        # dictionary; promote it to a property of the photo itself
        photo['id'] = photo_id
        # `years` usually lacks data (and sometimes holds multiple values,
        # which is hard to process); `original_title` and `title` are also
        # sparse -- `folder` is a far more useful label for the ImageGroup
        for unused_key in ('years', 'original_title', 'title'):
            photo.pop(unused_key, None)
    # A single image group: the shared coordinates plus the list of images
    return ImageGroup({
        'latitude': latitude,
        'longitude': longitude,
        'photos': list(photo_dicts.values()),
    })


def image_group_loader(json_directory):
    """ A generator yielding one ImageGroup per .json file in json_directory """
    for fname in os.listdir(json_directory):
        if fname.endswith('.json'):
            yield get_entries_from_file(json_directory, fname)


def load_borough_polygons(geo_filename):
    """ Loads polygons from the NYC borough geojson data for hit-testing.

    Prints the index assigned to each borough as a side effect; these indices
    determine the values given to the Borough enum in Swift.
    """
    with open(geo_filename, 'r') as geo_file:
        geo_data = json.load(geo_file)
    polygons = list()
    for idx, feature in enumerate(geo_data['features']):
        name = feature['properties']['boro_name']
        polygons.append(shape(feature['geometry']))
        print(f'Feature {idx}: {name}')
    return polygons


Loading