Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Project-specific
*code-workspace
model_dir/
*.egg-info

# Compiled source #
###################
*.com
Expand Down
21 changes: 20 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ You can find the POI type classification dataset in `spacegraph/data_collection/


### Code Usage
This code is implemented in Python 2.7
All codes about the POI type classification task are in `spacegraph/spacegraph_codebase/`.

#### Location Modeling (See Section 5.1.1 and 5.1.2 in [our ICLR 2020 paper](https://openreview.net/forum?id=rJljdh4KDH))
Expand Down Expand Up @@ -78,6 +77,26 @@ Results:
<img src="res_fig/context_modeling.png" alt="context_modeling" width="1000" />
</p>

### Training and testing on own data

To simplify the usage on your own point data, we provide further example data in `geojson` format in the [data_collection](spacegraph/data_collection) folder. To train a model on your own data, it first needs to be converted to the same format as our [example data](spacegraph/data_collection/example_pois.geojson). Specifically, you need a GeoDataFrame with a *projected* geometry, a column named `id`, and a variable number of columns named `poi_type_1`, `poi_type_2`, etc., containing the categories that each POI belongs to. Then, run the following steps:

```
cd spacegraph
python data_collection/prepare_own_data.py -d data_collection/example_pois.geojson
```
Here, you can replace the `-d` argument with the path to your own data. The script preprocesses the data and dumps the preprocessed files into a new folder `data_collection/example_poi_data` (or specify the output directory with the `-o` flag).

Then, use one of the bash scripts for training (global / relative / join), e.g.
```
sh global_train_example_pois.sh
```
Finally, we provide a [script](spacegraph/spacegraph_codebase/test.py) to use the trained model for generating embeddings of new points.
Run the following line with the path to your test data and the path to the directory with your trained model (by default `spacegraph/model_dir/global_example_data`):
```
python spacegraph_codebase/test.py [-h] [-d DATA_PATH] [-m MODEL_PATH]
```

## Geo-Aware Fine-Grained Image Classification Task

`geo_prior/` folder contains codes for recreating the evaluation results of the geo-aware fine-grained image classification task in [our ICLR 2020 paper](https://openreview.net/forum?id=rJljdh4KDH).
Expand Down
10,006 changes: 10,006 additions & 0 deletions spacegraph/data_collection/example_pois.geojson

Large diffs are not rendered by default.

154 changes: 154 additions & 0 deletions spacegraph/data_collection/prepare_own_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import argparse
import os
import json
import pickle
import numpy as np
import geopandas as gpd
from sklearn.metrics import pairwise_distances
from collections import defaultdict

from sklearn.neighbors import BallTree


def get_nearest(src_points, candidates, k_neighbors=10, remove_first=True):
    """Return (indices, distances) of the k nearest candidates per source point.

    When ``remove_first`` is truthy, one extra neighbor is queried and the
    closest hit is dropped — useful when querying a point set against itself,
    where the nearest match is always the point itself at distance zero.
    """
    skip = int(remove_first)
    # Build the search structure once over all candidate points.
    ball_tree = BallTree(candidates, leaf_size=15, metric="euclidean")
    dist, idx = ball_tree.query(src_points, k=k_neighbors + skip)
    return idx[:, skip:], dist[:, skip:]


def get_ordered_unique(arr):
    """Return the unique elements of *arr*, preserving first-seen order."""
    seen = set()
    ordered = []
    for item in arr:
        if item in seen:
            continue
        seen.add(item)
        ordered.append(item)
    return ordered


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-d",
        "--data_path",
        default="data_collection/example_pois.geojson",
        type=str,
        help="Path to the input POI file (GeoDataFrame with projected geometry).",
    )
    parser.add_argument(
        "-p",
        "--positive_samples",
        default=10,
        type=int,
        help="Number of spatial nearest neighbors used as positive samples.",
    )
    parser.add_argument(
        "-o",
        "--out_path",
        default="data_collection/example_poi_data",
        type=str,
        help="Directory where the preprocessed files are written.",
    )
    args = parser.parse_args()

    out_path = args.out_path
    os.makedirs(out_path, exist_ok=True)
    nr_neighbors = args.positive_samples

    # LOAD data: re-index POIs with consecutive integers 0..n-1 and save the
    # mapping back to the original IDs so embeddings can be traced to inputs.
    poi = gpd.read_file(args.data_path)
    # NOTE(review): assumes the original "id" column is integer-convertible
    # (as in the example data) — confirm for other input files.
    mapping_prev_ids = {
        i: int(old_id)
        for i, old_id in enumerate(poi["id"].values)
    }
    with open(os.path.join(out_path, "poi_id_mapping.json"), "w") as outfile:
        json.dump(mapping_prev_ids, outfile)
    print("Saved mapping from old IDs to new IDs")
    poi["id"] = np.arange(len(poi))
    poi.set_index("id", inplace=True)

    # PART 1: POI types
    # Collect every category appearing in any poi_type_* column and assign
    # each a numeric category ID.
    poi_type_cols = [col for col in poi if col.startswith("poi_type_")]
    all_types = set()
    for poi_col in poi_type_cols:
        for elem in poi[poi_col].unique():
            all_types.add(elem)
    poi_id_mapping = {elem: i for i, elem in enumerate(list(all_types))}
    # reversed mapping; JSON object keys must be strings
    id_poi_mapping = {str(i): elem for elem, i in poi_id_mapping.items()}

    # SAVE the poi types
    with open(os.path.join(out_path, "poi_type.json"), "w") as outfile:
        json.dump(id_poi_mapping, outfile)
    print("Saved POI types")

    # PART 2: POI list with categories
    # update table
    for col in poi_type_cols:
        # transfer into numerical category IDs
        poi[col] = poi[col].map(poi_id_mapping)
    # train / validation / test split (80 / 10 / 10)
    rand_perm = np.random.permutation(len(poi))
    train_cutoff = int(len(poi) * 0.8)
    val_cutoff = int(len(poi) * 0.9)
    # Use an object-dtype array: a fixed-width array created from "training"
    # (dtype "<U8") would silently truncate "validation" to "validati".
    split_label_arr = np.full(len(poi), "training", dtype=object)
    split_label_arr[rand_perm[train_cutoff:val_cutoff]] = "validation"
    split_label_arr[rand_perm[val_cutoff:]] = "test"
    poi["split"] = split_label_arr
    # convert table into tuples: (id, (x, y), category IDs, split label)
    my_poi_data = []
    for elem_id, row in poi.iterrows():
        this_tuple = (
            elem_id,
            (row["geometry"].x, row["geometry"].y),
            tuple([row[poi_type] for poi_type in poi_type_cols]),
            row["split"],
        )
        my_poi_data.append(this_tuple)
    # Number of distinct POI types (unpacked as num_poi_type by the loader).
    num_poi_types = len(id_poi_mapping)

    # Save the poi data with the categories
    with open(os.path.join(out_path, "pointset.pkl"), "wb") as outfile:
        pickle.dump((num_poi_types, my_poi_data), outfile)
    print("Saved POI-label data")

    # PART 3: sample the spatially closest points as positive samples
    coord_arr = np.swapaxes(
        np.vstack([poi["geometry"].x.values, poi["geometry"].y.values]), 1, 0
    )
    closest, distance_of_closest = get_nearest(
        coord_arr, coord_arr, k_neighbors=nr_neighbors
    )
    print("Finished positive sampling")

    # convert index
    poi_id_list = list(poi.index)
    poi_id_array = np.array(poi_id_list)
    poi_id_set = set(poi_id_list)

    # Negative sampling: for each POI, draw random POIs that are neither the
    # POI itself nor one of its positive (nearest-neighbor) samples.
    all_tuples = []
    for counter, positive_sampled_index in enumerate(closest):
        elem_id = poi_id_list[counter]
        positive_sampled = poi_id_array[positive_sampled_index]
        leftover = list(poi_id_set - set([elem_id] + list(positive_sampled)))
        # NOTE(review): sampling is with replacement, so negatives may repeat
        # — confirm this matches the training code's expectations.
        negative_sampled = list(np.random.choice(leftover, nr_neighbors))

        mode = poi.loc[elem_id, "split"]
        all_tuples.append(
            (
                elem_id, tuple(positive_sampled), mode, negative_sampled,
                distance_of_closest[counter]
            )
        )
    print("Finished negative sampling")

    # Write one neighbor-graph file per split.
    for mode in ["training", "validation", "test"]:
        out_tuple = [
            the_tuple for the_tuple in all_tuples if the_tuple[2] == mode
        ]
        with open(
            os.path.join(out_path, f"neighborgraphs_{mode}.pkl"), "wb"
        ) as outfile:
            pickle.dump(out_tuple, outfile)
        print("Saved graph data", mode)
54 changes: 54 additions & 0 deletions spacegraph/global_train_example_pois.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Train a model with --model_type "global" on the preprocessed example POI
# data in data_collection/example_poi_data/, writing checkpoints and logs to
# ./model_dir/global_example_data/.
# Run from the spacegraph/ directory after preparing the data with
# data_collection/prepare_own_data.py.
# Uncomment the trailing --load_model flag to resume from a saved model.
python -m spacegraph_codebase.Place2Vec.train \
    --data_dir data_collection/example_poi_data/\
    --model_dir ./model_dir/global_example_data/ \
    --log_dir ./model_dir/global_example_data/ \
    --num_context_sample 10 \
    --embed_dim 32 \
    --dropout 0.5 \
    --enc_agg mean \
    --model_type global \
    --num_rbf_anchor_pts 0 \
    --spa_enc theory \
    --spa_embed_dim 64 \
    --freq 16 \
    --max_radius 10000 \
    --min_radius 100 \
    --spa_f_act sigmoid \
    --freq_init geometric \
    --spa_enc_use_layn F \
    --spa_enc_use_postmat T \
    --g_spa_enc theory \
    --g_spa_embed_dim 64 \
    --g_freq 32 \
    --g_max_radius 40000 \
    --g_min_radius 50 \
    --g_spa_f_act relu \
    --g_freq_init geometric \
    --g_spa_enc_use_layn T \
    --g_spa_enc_use_postmat T \
    --num_hidden_layer 1 \
    --hidden_dim 512 \
    --use_layn T \
    --skip_connection T \
    --use_dec T \
    --init_decoder_atten_type concat \
    --init_decoder_atten_act leakyrelu \
    --init_decoder_atten_f_act sigmoid \
    --init_decoder_atten_num 1 \
    --init_decoder_use_layn T \
    --init_decoder_use_postmat T \
    --decoder_atten_type concat \
    --decoder_atten_act leakyrelu \
    --decoder_atten_f_act sigmoid \
    --decoder_atten_num 1 \
    --decoder_use_layn T \
    --decoder_use_postmat T \
    --join_dec_type max \
    --act sigmoid \
    --opt adam \
    --lr 0.001 \
    --max_iter 5000 \
    --batch_size 512 \
    --log_every 50 \
    --val_every 50 \
    # --load_model
13 changes: 6 additions & 7 deletions spacegraph/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
numpy==1.16.4
matplotlib==2.2.4
torch==1.0.1
sklearn==0.20.3
pyproj==2.2.2
pandas==0.24.2
scipy==1.2.1
numpy>=1.16.4
matplotlib>=2.2.4
torch>=1.0.1
pyproj>=2.2.2
pandas>=0.24.2
scipy>=1.2.1
scikit-learn>=0.20.3
geopandas
4 changes: 2 additions & 2 deletions spacegraph/spacegraph_codebase/Place2Vec/cur_data_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
import cPickle as pickle
import pickle
import torch
from collections import OrderedDict, defaultdict
from multiprocessing import Process
Expand All @@ -18,7 +18,7 @@ def load_pointset(data_dir, point_data_path = "/pointset.pkl", num_feature_sampl
num_feature_sample: each POI have different num of POI Type, we resample a fix number of POI Types for each POI
embed_dim: embedding dimention
'''
num_poi_type, point_list = pickle.load(open(data_dir+point_data_path, "rb"))
num_poi_type, point_list = pickle.load(open(data_dir+point_data_path, "rb"), encoding='latin1')

feature_dim = embed_dim
feature_embedding = torch.nn.Embedding(num_poi_type, embed_dim)
Expand Down
4 changes: 2 additions & 2 deletions spacegraph/spacegraph_codebase/Place2Vec/data_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
import cPickle as pickle
import pickle
import torch
from collections import OrderedDict, defaultdict
from multiprocessing import Process
Expand All @@ -18,7 +18,7 @@ def load_pointset(data_dir, point_data_path = "/pointset.pkl", num_feature_sampl
num_feature_sample: each POI have different num of POI Type, we resample a fix number of POI Types for each POI
embed_dim: embedding dimention
'''
num_poi_type, point_list = pickle.load(open(data_dir+point_data_path, "rb"))
num_poi_type, point_list = pickle.load(open(data_dir+point_data_path, "rb"), encoding="latin-1")

feature_dim = embed_dim
feature_embedding = torch.nn.Embedding(num_poi_type, embed_dim)
Expand Down
15 changes: 12 additions & 3 deletions spacegraph/spacegraph_codebase/Place2Vec/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,17 +38,26 @@
if args.cuda:
pointset.feature_embed_lookup = cudify(feature_embedding)

# make model directory
os.makedirs(args.model_dir, exist_ok=True)

# build NN model
trainer = Trainer(args, pointset, train_ng_list, val_ng_list, test_ng_list, feature_embedding, console = True)

trainer.logger.info("All argusment:")
trainer.logger.info("All arguments:")
for arg in vars(args):
trainer.logger.info("{}: {}".format(arg, getattr(args, arg)))

# load model
if args.load_model:
trainer.load_model()
trainer.logger.info("LOADING MODEL")
trainer.load_model()

# Save parameters
config = vars(args)
with open(os.path.join(args.model_dir, "config.json"), "w") as outfile:
json.dump(config, outfile)
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added this in order to improve model loading, compared to the long file name with all parameters in the file name.


# train NN model
trainer.train()
trainer.train()
# trainer.eval_model()
9 changes: 4 additions & 5 deletions spacegraph/spacegraph_codebase/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import random
import numpy as np
import re
from sets import Set

def _random_sampling(item_tuple, num_sample):
'''
Expand Down Expand Up @@ -129,9 +128,9 @@ def __init__(self, point_list, num_feature_type, feature_embed_lookup,

self.pt_dict = defaultdict()
self.pt_mode = defaultdict()
self.pt_mode["training"] = Set()
self.pt_mode["validation"] = Set()
self.pt_mode["test"] = Set()
self.pt_mode["training"] = set()
self.pt_mode["validation"] = set()
self.pt_mode["test"] = set()

_, _, features, _ = point_list[0]
init_num_feature = len(features)
Expand Down Expand Up @@ -204,7 +203,7 @@ def get_negative_point_sample(self, neighbor_tuple, neg_sample_num):
a list of negative samples id
'''
data_mode = neighbor_tuple[2]
pt_list = list(self.pt_mode[data_mode]-Set([neighbor_tuple[0]]+list(neighbor_tuple[1])))
pt_list = list(self.pt_mode[data_mode]-set([neighbor_tuple[0]]+list(neighbor_tuple[1])))
if len(pt_list) >= neg_sample_num:
return list(np.random.choice(pt_list, neg_sample_num, replace=False))
else:
Expand Down
5 changes: 2 additions & 3 deletions spacegraph/spacegraph_codebase/data_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import cPickle as pickle
import pickle
import torch
from collections import OrderedDict, defaultdict
import random
Expand All @@ -8,9 +8,8 @@

from spacegraph_codebase.data import PointSet, NeighborGraph, Point


def load_ng(data_file):
raw_info = pickle.load(open(data_file, "rb"))
raw_info = pickle.load(open(data_file, "rb"), encoding='latin1')
return [NeighborGraph.deserialize(info) for info in raw_info]


Expand Down
2 changes: 1 addition & 1 deletion spacegraph/spacegraph_codebase/decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,6 @@ def forward(self, context_feature_embeds, spa_feature_embeds):
# so we need to check the result type
aggs = aggs[0]

embeds = self.f_act(self.dropout(self.post_linear(aggs)))
embeds = self.f_act(self.dropout(self.post_linear(aggs.values)))
return embeds

2 changes: 0 additions & 2 deletions spacegraph/spacegraph_codebase/encoder.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from sets import Set

import torch
import torch.nn as nn
from torch.nn import init
Expand Down
Loading