Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Project-specific
*code-workspace
model_dir/
*.egg-info

# Compiled source #
###################
*.com
Expand Down
21 changes: 20 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ You can find the POI type classification dataset in `spacegraph/data_collection/


### Code Usage
This code is implemented in Python 2.7
All codes about the POI type classification task are in `spacegraph/spacegraph_codebase/`.

#### Location Modeling (See Section 5.1.1 and 5.1.2 in [our ICLR 2020 paper](https://openreview.net/forum?id=rJljdh4KDH))
Expand Down Expand Up @@ -78,6 +77,26 @@ Results:
<img src="res_fig/context_modeling.png" alt="context_modeling" width="1000" />
</p>

### Training and testing on own data

To simplify the usage on your own point data, we provide further example data in `geojson` format in the [data_collection](spacegraph/data_collection) folder. To train a model on your own data, it first needs to be converted to the same format as our [example data](spacegraph/data_collection/example_pois.geojson). Specifically, you need a GeoDataFrame with a *projected* geometry, a column named `id`, and a variable number of columns named `poi_type_1`, `poi_type_2`, etc., containing the categories that each POI belongs to. Then, run the following steps:

```
cd spacegraph
python data_collection/prepare_own_data.py -d data_collection/example_pois.geojson
```
Here, you can replace the `-d` argument with the path to your own data. The script preprocesses the data and dumps the preprocessed files into a new folder `data_collection/example_poi_data` (or specify the output directory with the `-o` flag).

Then, use one of the bash scripts for training (global / relative / join), e.g.
```
sh global_train_example_pois.sh
```
Finally, we provide a [script](spacegraph/spacegraph_codebase/test.py) to use the trained model for generating embeddings of new points.
Run the following line with the path to your test data and the path to the directory with your trained model (by default `spacegraph/model_dir/global_example_data`):
```
python spacegraph_codebase/test.py [-h] [-d DATA_PATH] [-m MODEL_PATH]
```

## Geo-Aware Fine-Grained Image Classification Task

`geo_prior/` folder contains codes for recreating the evaluation results of the geo-aware fine-grained image classification task in [our ICLR 2020 paper](https://openreview.net/forum?id=rJljdh4KDH).
Expand Down
10,006 changes: 10,006 additions & 0 deletions spacegraph/data_collection/example_pois.geojson

Large diffs are not rendered by default.

154 changes: 154 additions & 0 deletions spacegraph/data_collection/prepare_own_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import argparse
import os
import json
import pickle
import numpy as np
import geopandas as gpd
from sklearn.metrics import pairwise_distances
from collections import defaultdict

from sklearn.neighbors import BallTree


def get_nearest(src_points, candidates, k_neighbors=10, remove_first=True):
    """Return (indices, distances) of the k nearest candidates per source point.

    When ``remove_first`` is truthy, one extra neighbor is queried and the
    closest hit is dropped — useful when querying a point set against itself,
    where the nearest match is always the point itself at distance zero.
    """
    skip = int(remove_first)
    # Build the search structure once over all candidate points.
    ball_tree = BallTree(candidates, leaf_size=15, metric="euclidean")
    dist, idx = ball_tree.query(src_points, k=k_neighbors + skip)
    return idx[:, skip:], dist[:, skip:]


def get_ordered_unique(arr):
    """Return the unique elements of *arr*, preserving first-seen order."""
    seen = set()
    ordered = []
    for item in arr:
        if item in seen:
            continue
        seen.add(item)
        ordered.append(item)
    return ordered


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-d",
        "--data_path",
        default="data_collection/example_pois.geojson",
        type=str,
        help="Path to the input POI file (GeoDataFrame with projected geometry).",
    )
    parser.add_argument(
        "-p",
        "--positive_samples",
        default=10,
        type=int,
        help="Number of spatial nearest neighbors used as positive samples.",
    )
    parser.add_argument(
        "-o",
        "--out_path",
        default="data_collection/example_poi_data",
        type=str,
        help="Directory where the preprocessed files are written.",
    )
    args = parser.parse_args()

    out_path = args.out_path
    os.makedirs(out_path, exist_ok=True)
    nr_neighbors = args.positive_samples

    # LOAD data: re-index POIs with consecutive integers 0..n-1 and save the
    # mapping back to the original IDs so embeddings can be traced to inputs.
    poi = gpd.read_file(args.data_path)
    # NOTE(review): assumes the original "id" column is integer-convertible
    # (as in the example data) — confirm for other input files.
    mapping_prev_ids = {
        i: int(old_id)
        for i, old_id in enumerate(poi["id"].values)
    }
    with open(os.path.join(out_path, "poi_id_mapping.json"), "w") as outfile:
        json.dump(mapping_prev_ids, outfile)
    print("Saved mapping from old IDs to new IDs")
    poi["id"] = np.arange(len(poi))
    poi.set_index("id", inplace=True)

    # PART 1: POI types
    # Collect every category appearing in any poi_type_* column and assign
    # each a numeric category ID.
    poi_type_cols = [col for col in poi if col.startswith("poi_type_")]
    all_types = set()
    for poi_col in poi_type_cols:
        for elem in poi[poi_col].unique():
            all_types.add(elem)
    poi_id_mapping = {elem: i for i, elem in enumerate(list(all_types))}
    # reversed mapping; JSON object keys must be strings
    id_poi_mapping = {str(i): elem for elem, i in poi_id_mapping.items()}

    # SAVE the poi types
    with open(os.path.join(out_path, "poi_type.json"), "w") as outfile:
        json.dump(id_poi_mapping, outfile)
    print("Saved POI types")

    # PART 2: POI list with categories
    # update table
    for col in poi_type_cols:
        # transfer into numerical category IDs
        poi[col] = poi[col].map(poi_id_mapping)
    # train / validation / test split (80 / 10 / 10)
    rand_perm = np.random.permutation(len(poi))
    train_cutoff = int(len(poi) * 0.8)
    val_cutoff = int(len(poi) * 0.9)
    # Use an object-dtype array: a fixed-width array created from "training"
    # (dtype "<U8") would silently truncate "validation" to "validati".
    split_label_arr = np.full(len(poi), "training", dtype=object)
    split_label_arr[rand_perm[train_cutoff:val_cutoff]] = "validation"
    split_label_arr[rand_perm[val_cutoff:]] = "test"
    poi["split"] = split_label_arr
    # convert table into tuples: (id, (x, y), category IDs, split label)
    my_poi_data = []
    for elem_id, row in poi.iterrows():
        this_tuple = (
            elem_id,
            (row["geometry"].x, row["geometry"].y),
            tuple([row[poi_type] for poi_type in poi_type_cols]),
            row["split"],
        )
        my_poi_data.append(this_tuple)
    # Number of distinct POI types (unpacked as num_poi_type by the loader).
    num_poi_types = len(id_poi_mapping)

    # Save the poi data with the categories
    with open(os.path.join(out_path, "pointset.pkl"), "wb") as outfile:
        pickle.dump((num_poi_types, my_poi_data), outfile)
    print("Saved POI-label data")

    # PART 3: sample the spatially closest points as positive samples
    coord_arr = np.swapaxes(
        np.vstack([poi["geometry"].x.values, poi["geometry"].y.values]), 1, 0
    )
    closest, distance_of_closest = get_nearest(
        coord_arr, coord_arr, k_neighbors=nr_neighbors
    )
    print("Finished positive sampling")

    # convert index
    poi_id_list = list(poi.index)
    poi_id_array = np.array(poi_id_list)
    poi_id_set = set(poi_id_list)

    # Negative sampling: for each POI, draw random POIs that are neither the
    # POI itself nor one of its positive (nearest-neighbor) samples.
    all_tuples = []
    for counter, positive_sampled_index in enumerate(closest):
        elem_id = poi_id_list[counter]
        positive_sampled = poi_id_array[positive_sampled_index]
        leftover = list(poi_id_set - set([elem_id] + list(positive_sampled)))
        # NOTE(review): sampling is with replacement, so negatives may repeat
        # — confirm this matches the training code's expectations.
        negative_sampled = list(np.random.choice(leftover, nr_neighbors))

        mode = poi.loc[elem_id, "split"]
        all_tuples.append(
            (
                elem_id, tuple(positive_sampled), mode, negative_sampled,
                distance_of_closest[counter]
            )
        )
    print("Finished negative sampling")

    # Write one neighbor-graph file per split.
    for mode in ["training", "validation", "test"]:
        out_tuple = [
            the_tuple for the_tuple in all_tuples if the_tuple[2] == mode
        ]
        with open(
            os.path.join(out_path, f"neighborgraphs_{mode}.pkl"), "wb"
        ) as outfile:
            pickle.dump(out_tuple, outfile)
        print("Saved graph data", mode)
54 changes: 54 additions & 0 deletions spacegraph/global_train_example_pois.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Train a model with --model_type "global" on the preprocessed example POI
# data in data_collection/example_poi_data/, writing checkpoints and logs to
# ./model_dir/global_example_data/.
# Run from the spacegraph/ directory after preparing the data with
# data_collection/prepare_own_data.py.
# Uncomment the trailing --load_model flag to resume from a saved model.
python -m spacegraph_codebase.Place2Vec.train \
    --data_dir data_collection/example_poi_data/\
    --model_dir ./model_dir/global_example_data/ \
    --log_dir ./model_dir/global_example_data/ \
    --num_context_sample 10 \
    --embed_dim 32 \
    --dropout 0.5 \
    --enc_agg mean \
    --model_type global \
    --num_rbf_anchor_pts 0 \
    --spa_enc theory \
    --spa_embed_dim 64 \
    --freq 16 \
    --max_radius 10000 \
    --min_radius 100 \
    --spa_f_act sigmoid \
    --freq_init geometric \
    --spa_enc_use_layn F \
    --spa_enc_use_postmat T \
    --g_spa_enc theory \
    --g_spa_embed_dim 64 \
    --g_freq 32 \
    --g_max_radius 40000 \
    --g_min_radius 50 \
    --g_spa_f_act relu \
    --g_freq_init geometric \
    --g_spa_enc_use_layn T \
    --g_spa_enc_use_postmat T \
    --num_hidden_layer 1 \
    --hidden_dim 512 \
    --use_layn T \
    --skip_connection T \
    --use_dec T \
    --init_decoder_atten_type concat \
    --init_decoder_atten_act leakyrelu \
    --init_decoder_atten_f_act sigmoid \
    --init_decoder_atten_num 1 \
    --init_decoder_use_layn T \
    --init_decoder_use_postmat T \
    --decoder_atten_type concat \
    --decoder_atten_act leakyrelu \
    --decoder_atten_f_act sigmoid \
    --decoder_atten_num 1 \
    --decoder_use_layn T \
    --decoder_use_postmat T \
    --join_dec_type max \
    --act sigmoid \
    --opt adam \
    --lr 0.001 \
    --max_iter 5000 \
    --batch_size 512 \
    --log_every 50 \
    --val_every 50 \
    # --load_model
13 changes: 6 additions & 7 deletions spacegraph/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
numpy==1.16.4
matplotlib==2.2.4
torch==1.0.1
sklearn==0.20.3
pyproj==2.2.2
pandas==0.24.2
scipy==1.2.1
numpy>=1.16.4
matplotlib>=2.2.4
torch>=1.0.1
pyproj>=2.2.2
pandas>=0.24.2
scipy>=1.2.1
scikit-learn>=0.20.3
geopandas
4 changes: 2 additions & 2 deletions spacegraph/spacegraph_codebase/Place2Vec/cur_data_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
import cPickle as pickle
import pickle
import torch
from collections import OrderedDict, defaultdict
from multiprocessing import Process
Expand All @@ -18,7 +18,7 @@ def load_pointset(data_dir, point_data_path = "/pointset.pkl", num_feature_sampl
num_feature_sample: each POI have different num of POI Type, we resample a fix number of POI Types for each POI
embed_dim: embedding dimention
'''
num_poi_type, point_list = pickle.load(open(data_dir+point_data_path, "rb"))
num_poi_type, point_list = pickle.load(open(data_dir+point_data_path, "rb"), encoding='latin1')

feature_dim = embed_dim
feature_embedding = torch.nn.Embedding(num_poi_type, embed_dim)
Expand Down
4 changes: 2 additions & 2 deletions spacegraph/spacegraph_codebase/Place2Vec/data_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
import cPickle as pickle
import pickle
import torch
from collections import OrderedDict, defaultdict
from multiprocessing import Process
Expand All @@ -18,7 +18,7 @@ def load_pointset(data_dir, point_data_path = "/pointset.pkl", num_feature_sampl
num_feature_sample: each POI have different num of POI Type, we resample a fix number of POI Types for each POI
embed_dim: embedding dimention
'''
num_poi_type, point_list = pickle.load(open(data_dir+point_data_path, "rb"))
num_poi_type, point_list = pickle.load(open(data_dir+point_data_path, "rb"), encoding="latin-1")

feature_dim = embed_dim
feature_embedding = torch.nn.Embedding(num_poi_type, embed_dim)
Expand Down
15 changes: 12 additions & 3 deletions spacegraph/spacegraph_codebase/Place2Vec/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,17 +38,26 @@
if args.cuda:
pointset.feature_embed_lookup = cudify(feature_embedding)

# make model directory
os.makedirs(args.model_dir, exist_ok=True)

# build NN model
trainer = Trainer(args, pointset, train_ng_list, val_ng_list, test_ng_list, feature_embedding, console = True)

trainer.logger.info("All argusment:")
trainer.logger.info("All arguments:")
for arg in vars(args):
trainer.logger.info("{}: {}".format(arg, getattr(args, arg)))

# load model
if args.load_model:
trainer.load_model()
trainer.logger.info("LOADING MODEL")
trainer.load_model()

# Save parameters
config = vars(args)
with open(os.path.join(args.model_dir, "config.json"), "w") as outfile:
json.dump(config, outfile)
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added this in order to improve model loading, compared to the long file name with all parameters in the file name.


# train NN model
trainer.train()
trainer.train()
# trainer.eval_model()
9 changes: 4 additions & 5 deletions spacegraph/spacegraph_codebase/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import random
import numpy as np
import re
from sets import Set

def _random_sampling(item_tuple, num_sample):
'''
Expand Down Expand Up @@ -129,9 +128,9 @@ def __init__(self, point_list, num_feature_type, feature_embed_lookup,

self.pt_dict = defaultdict()
self.pt_mode = defaultdict()
self.pt_mode["training"] = Set()
self.pt_mode["validation"] = Set()
self.pt_mode["test"] = Set()
self.pt_mode["training"] = set()
self.pt_mode["validation"] = set()
self.pt_mode["test"] = set()

_, _, features, _ = point_list[0]
init_num_feature = len(features)
Expand Down Expand Up @@ -204,7 +203,7 @@ def get_negative_point_sample(self, neighbor_tuple, neg_sample_num):
a list of negative samples id
'''
data_mode = neighbor_tuple[2]
pt_list = list(self.pt_mode[data_mode]-Set([neighbor_tuple[0]]+list(neighbor_tuple[1])))
pt_list = list(self.pt_mode[data_mode]-set([neighbor_tuple[0]]+list(neighbor_tuple[1])))
if len(pt_list) >= neg_sample_num:
return list(np.random.choice(pt_list, neg_sample_num, replace=False))
else:
Expand Down
5 changes: 2 additions & 3 deletions spacegraph/spacegraph_codebase/data_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import cPickle as pickle
import pickle
import torch
from collections import OrderedDict, defaultdict
import random
Expand All @@ -8,9 +8,8 @@

from spacegraph_codebase.data import PointSet, NeighborGraph, Point


def load_ng(data_file):
raw_info = pickle.load(open(data_file, "rb"))
raw_info = pickle.load(open(data_file, "rb"), encoding='latin1')
return [NeighborGraph.deserialize(info) for info in raw_info]


Expand Down
2 changes: 1 addition & 1 deletion spacegraph/spacegraph_codebase/decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,6 @@ def forward(self, context_feature_embeds, spa_feature_embeds):
# so we need to check the result type
aggs = aggs[0]

embeds = self.f_act(self.dropout(self.post_linear(aggs)))
embeds = self.f_act(self.dropout(self.post_linear(aggs.values)))
return embeds

2 changes: 0 additions & 2 deletions spacegraph/spacegraph_codebase/encoder.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from sets import Set

import torch
import torch.nn as nn
from torch.nn import init
Expand Down
Loading