From 4861210e6a5d08f3b04c12fe879e80a149dae437 Mon Sep 17 00:00:00 2001 From: floschne Date: Tue, 8 Dec 2020 18:13:53 +0100 Subject: [PATCH 01/21] minimal environment with up2date packages --- environment_min.yml | 98 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 environment_min.yml diff --git a/environment_min.yml b/environment_min.yml new file mode 100644 index 0000000..504ce86 --- /dev/null +++ b/environment_min.yml @@ -0,0 +1,98 @@ +name: teran +channels: + - pytorch + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _pytorch_select=0.1=cpu_0 + - arrow=0.17.0=py36h9f0ad1d_1 + - binaryornot=0.4.4=py_1 + - blas=1.0=mkl + - brotlipy=0.7.0=py36he6145b8_1001 + - ca-certificates=2020.10.14=0 + - certifi=2020.12.5=py36h06a4308_0 + - cffi=1.14.0=py36h2e261b9_0 + - chardet=3.0.4=py36h9880bd3_1008 + - click=7.1.2=py_0 + - cookiecutter=1.7.2=pyh9f0ad1d_0 + - cryptography=3.2.1=py36h6ec43e4_0 + - cudatoolkit=10.1.243=h6bb024c_0 + - cycler=0.10.0=py_2 + - cython=0.29.21=py36ha357f81_1 + - dataclasses=0.7=pyhe4b4509_6 + - filelock=3.0.12=pyh9f0ad1d_0 + - freetype=2.10.4=h5ab3b9f_0 + - gperftools=2.7=h767d802_2 + - idna=2.10=pyh9f0ad1d_0 + - intel-openmp=2020.2=254 + - jinja2=2.11.2=pyh9f0ad1d_0 + - jinja2-time=0.2.0=py_2 + - joblib=0.17.0=py_0 + - jpeg=9b=h024ee3a_2 + - kiwisolver=1.3.1=py36h51d7077_0 + - lcms2=2.11=h396b838_0 + - libedit=3.1.20191231=h14c3975_1 + - libffi=3.2.1=hf484d3e_1007 + - libgcc-ng=9.1.0=hdf63c60_0 + - libpng=1.6.37=hbc83047_0 + - libstdcxx-ng=9.1.0=hdf63c60_0 + - libtiff=4.1.0=h2733197_1 + - libuv=1.40.0=h7b6447c_0 + - lz4-c=1.9.2=heb0550a_3 + - markupsafe=1.1.1=py36he6145b8_2 + - matplotlib-base=3.3.3=py36he12231b_0 + - mkl=2020.2=256 + - mkl-service=2.3.0=py36he8ac12f_0 + - mkl_fft=1.2.0=py36h23d657b_0 + - mkl_random=1.1.1=py36h0573a6f_0 + - ncurses=6.2=he6710b0_1 + - ninja=1.10.2=py36hff7bd54_0 + - nltk=3.5=py_0 + - numpy=1.19.2=py36h54aff64_0 + - 
numpy-base=1.19.2=py36hfa32c7d_0 + - olefile=0.46=py36_0 + - openssl=1.1.1h=h7b6447c_0 + - packaging=20.7=pyhd3deb0d_0 + - perl=5.32.0=h36c2ea0_0 + - pillow=8.0.1=py36he98fc37_0 + - pip=20.3.1=py36h06a4308_0 + - poyo=0.5.0=py_0 + - protobuf=3.4.1=py36_0 + - pycocotools=2.0.2=py36h8c4c3a4_1 + - pycparser=2.20=py_2 + - pyopenssl=20.0.0=pyhd8ed1ab_0 + - pyparsing=2.4.7=pyh9f0ad1d_0 + - pysocks=1.7.1=py36h9880bd3_2 + - python=3.6.9=h265db76_0 + - python-dateutil=2.8.1=py_0 + - python-slugify=4.0.1=pyh9f0ad1d_0 + - python_abi=3.6=1_cp36m + - pytorch=1.7.0=py3.6_cuda10.1.243_cudnn7.6.3_0 + - readline=7.0=h7b6447c_5 + - regex=2020.11.13=py36h27cfd23_0 + - requests=2.25.0=pyhd3deb0d_0 + - sacremoses=0.0.43=pyh9f0ad1d_0 + - sentencepiece=0.1.92=py36hdb11119_0 + - setuptools=51.0.0=py36h06a4308_2 + - six=1.15.0=py36h06a4308_0 + - sqlite=3.33.0=h62c20be_0 + - text-unidecode=1.3=py_0 + - tk=8.6.10=hbc83047_0 + - tokenizers=0.9.4=py36h2bc52f9_1 + - torchvision=0.8.1=py36_cu101 + - tornado=6.1=py36h1d69622_0 + - tqdm=4.54.1=pyhd3eb1b0_0 + - transformers=4.0.0=pyhd8ed1ab_0 + - typing_extensions=3.7.4.3=py_0 + - unidecode=1.1.1=py_0 + - urllib3=1.25.11=py_0 + - wheel=0.36.1=pyhd3eb1b0_0 + - whichcraft=0.6.1=py_0 + - xz=5.2.5=h7b6447c_0 + - yaml=0.2.5=h7b6447c_0 + - zlib=1.2.11=h7b6447c_3 + - zstd=1.4.5=h9ceee32_0 + - pip: + - pyyaml==5.3.1 +prefix: /home/p0w3r/bin/miniconda3/envs/teran From 5c6eeb79997e64c30bfa269e5ae52d43ac821589 Mon Sep 17 00:00:00 2001 From: floschne Date: Tue, 8 Dec 2020 18:50:15 +0100 Subject: [PATCH 02/21] updated readme --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index cb3837f..cedb9d8 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,12 @@ conda activate teran export PYTHONPATH=. ``` +2.1 Setup minimal python environment for CUDA 10.1 using conda: +``` +conda env create --file environment_min.yml +conda activate teran +export PYTHONPATH=. +``` ## Get the data 1. 
Download and extract the data folder, containing annotations, the splits by Karpathy et al. and ROUGEL - SPICE precomputed relevances for both COCO and Flickr30K datasets: From fbbdb815e8b91723f8bda9eb3e469bf6c408f014 Mon Sep 17 00:00:00 2001 From: floschne Date: Wed, 9 Dec 2020 11:29:33 +0100 Subject: [PATCH 03/21] ignored data and models --- .gitignore | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index a9a721b..8843461 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,10 @@ *.ipynb_checkpoints *.json *.pth.tar + + +.idea +data +pretrained_models +*.tar + From 646f2b3e7e66af5bfc10c8dd43d0e96ab6aff766 Mon Sep 17 00:00:00 2001 From: floschne Date: Wed, 9 Dec 2020 11:48:59 +0100 Subject: [PATCH 04/21] ignored jupyer stuff --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 8843461..4bc9eeb 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,4 @@ data pretrained_models *.tar - +*.ipynb \ No newline at end of file From 3a0fd5f66f37c228d222a9d51f0a0ba9451f8997 Mon Sep 17 00:00:00 2001 From: floschne Date: Thu, 10 Dec 2020 15:48:08 +0100 Subject: [PATCH 05/21] - commented out ndcg and i2t stuff - started impl of inference.py --- configs/teran_coco_MrSw.yaml | 2 +- configs/teran_inf_coco_MrSw.yaml | 59 +++++++++++++++++++++++++++++ evaluation.py | 23 +++++------ inference.py | 65 ++++++++++++++++++++++++++++++++ models/teran.py | 4 +- test.py | 11 ++++-- 6 files changed, 147 insertions(+), 17 deletions(-) create mode 100644 configs/teran_inf_coco_MrSw.yaml create mode 100644 inference.py diff --git a/configs/teran_coco_MrSw.yaml b/configs/teran_coco_MrSw.yaml index d3458aa..5e8a440 100644 --- a/configs/teran_coco_MrSw.yaml +++ b/configs/teran_coco_MrSw.yaml @@ -51,7 +51,7 @@ training: alignment-mode: 'MrSw' measure: 'dot' margin: 0.2 - bs: 40 + bs: 100 scheduler: 'steplr' gamma: 0.1 step-size: 20 diff --git a/configs/teran_inf_coco_MrSw.yaml b/configs/teran_inf_coco_MrSw.yaml 
new file mode 100644 index 0000000..d3458aa --- /dev/null +++ b/configs/teran_inf_coco_MrSw.yaml @@ -0,0 +1,59 @@ +dataset: + name: 'coco' + images-path: 'data/coco/images' # not needed if using pre-extracted bottom-up features + data: 'data' + restval: True + pre-extracted-features: False + +text-model: + name: 'bert' + pretrain: 'bert-base-uncased' + word-dim: 768 + extraction-hidden-layer: 6 + fine-tune: True + pre-extracted: False + layers: 0 + dropout: 0.1 + +#text-model: +# name: 'gru' +# word-dim: 300 +# fine-tune: True +# pre-extracted: False +# layers: 1 + +image-model: + name: 'bottomup' + pre-extracted-features-root: 'data/coco/features_36' + transformer-layers: 4 + dropout: 0.1 + pos-encoding: 'concat-and-process' + crop-size: 224 # not used + fine-tune: False + feat-dim: 2048 + norm: True + +model: + name: 'teran' + embed-size: 1024 + text-aggregation: 'first' + image-aggregation: 'first' + layers: 2 + exclude-stopwords: False + shared-transformer: False + dropout: 0.1 + +training: + lr: 0.00001 # 0.000006 + grad-clip: 2.0 + max-violation: True + loss-type: 'alignment' + alignment-mode: 'MrSw' + measure: 'dot' + margin: 0.2 + bs: 40 + scheduler: 'steplr' + gamma: 0.1 + step-size: 20 + warmup: null + warmup-period: 1000 diff --git a/evaluation.py b/evaluation.py index 5cf4569..e4a5d68 100644 --- a/evaluation.py +++ b/evaluation.py @@ -72,7 +72,8 @@ def tb_log(self, tb_logger, prefix='', step=None): def encode_data(model, data_loader, log_step=10, logging=print): - """Encode all images and captions loadable by `data_loader` + """ + Encode all images and captions loadable by `data_loader` """ batch_time = AverageMeter() val_logger = LogCollector() @@ -195,14 +196,14 @@ def evalrank(config, checkpoint, split='dev', fold5=False): if not fold5: # no cross-validation, full evaluation - r, rt = i2t(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, cap_batches=5) + # r, rt = i2t(img_embs, 
cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, cap_batches=5) ri, rti = t2i(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, im_batches=5) - ar = (r[0] + r[1] + r[2]) / 3 + # ar = (r[0] + r[1] + r[2]) / 3 ari = (ri[0] + ri[1] + ri[2]) / 3 - rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2] - print("rsum: %.1f" % rsum) - print("Average i2t Recall: %.1f" % ar) - print("Image to text: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % r) + #rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2] + #print("rsum: %.1f" % rsum) + # print("Average i2t Recall: %.1f" % ar) + # print("Image to text: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % r) print("Average t2i Recall: %.1f" % ari) print("Text to image: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % ri) else: @@ -391,10 +392,10 @@ def t2i(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=Fals 0] # in che posizione e' l'immagine (index) che ha questa caption (5*index + i) top50[5 * index + i] = inds[i][0:50] # calculate ndcg - if ndcg_scorer is not None: - rougel_ndcgs[5 * index + i], spice_ndcgs[5 * index + i] = \ - ndcg_scorer.compute_ndcg(npts, 5 * index + i, inds[i].astype(int), - fold_index=fold_index, retrieval='image').values() + # if ndcg_scorer is not None: + # rougel_ndcgs[5 * index + i], spice_ndcgs[5 * index + i] = \ + # ndcg_scorer.compute_ndcg(npts, 5 * index + i, inds[i].astype(int), + # fold_index=fold_index, retrieval='image').values() # Compute metrics r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks) diff --git a/inference.py b/inference.py new file mode 100644 index 0000000..6bb8dd5 --- /dev/null +++ b/inference.py @@ -0,0 +1,65 @@ +import argparse +from typing import List +from data import get_inference_loader +import torch +import yaml + +from models.teran import TERAN + + +def 
image_retrieval(checkpoint, opts, config) -> List[str]: + # load model and options + # checkpoint = torch.load(model_path) + data_path = config['dataset']['data'] + measure = config['training']['measure'] + + # construct model + model = TERAN(config) + + # load model state + model.load_state_dict(checkpoint['model'], strict=False) + + print('Loading dataset') + dataloader = get_inference_loader(config, opts, workers=4) + + return ["1", "2"] + + +def main(opts, current_config) -> List[str]: + checkpoint = torch.load(opts.checkpoint, map_location=torch.device(opts.device)) + + print('Checkpoint loaded from {}'.format(opts.checkpoint)) + loaded_config = checkpoint['config'] + + # Override some mandatory things in the configuration (paths) + if current_config is not None: + loaded_config['dataset']['images-path'] = current_config['dataset']['images-path'] + loaded_config['dataset']['data'] = current_config['dataset']['data'] + loaded_config['image-model']['pre-extracted-features-root'] = current_config['image-model'][ + 'pre-extracted-features-root'] + + top_k_results = image_retrieval(checkpoint, opts, loaded_config) + return top_k_results + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--model', type=str, help="Model (checkpoint) to load. E.g. pretrained_models/coco_MrSw.pth.tar" + , required=True) + parser.add_argument('--query', type=str, required=True) + parser.add_argument('--device', type=str, choices=['cpu', 'gpu'], default='cpu') + parser.add_argument('--num_images', type=int, default=1000) + parser.add_argument('--top_k', type=int, default=10) + parser.add_argument('--dataset', type=str, choices=['coco', 'flickr30k'], default='coco') + parser.add_argument('--config', type=str, default=None, help="Which configuration to use for overriding the " + "checkpoint configuration. 
See into 'config' folder") + + opts = parser.parse_args() + if opts.config is not None: + with open(opts.config, 'r') as yml_file: + config = yaml.load(yml_file) + else: + config = None + top_k_results = main(opts, config) + print(f"######## TOP {opts.tok_k} RESULTS ########") + print(top_k_results) diff --git a/models/teran.py b/models/teran.py index 1eeb524..f74e45a 100644 --- a/models/teran.py +++ b/models/teran.py @@ -16,7 +16,7 @@ class JointTextImageTransformerEncoder(nn.Module): """ This is a bert caption encoder - transformer image encoder (using bottomup features). - If process the encoder outputs through a transformer, like VilBERT and outputs two different graph embeddings + It process the encoder outputs through a transformer, like VilBERT and outputs two different graph embeddings """ def __init__(self, config): super().__init__() @@ -233,7 +233,7 @@ def forward_loss(self, img_emb, cap_emb, img_emb_set, cap_emb_seq, img_lengths, # bs = img_emb.shape[0] losses = {} - if 'matching' in self.config['training']['loss-type']: + if 'matching' in self.config['training']['loss-type']: matching_loss = self.matching_criterion(img_emb, cap_emb) losses.update({'matching-loss': matching_loss}) self.logger.update('matching_loss', matching_loss.item(), img_emb.size(0)) diff --git a/test.py b/test.py index 9c38df3..e3fa3f0 100644 --- a/test.py +++ b/test.py @@ -1,3 +1,4 @@ +import os import argparse import evaluation @@ -7,7 +8,7 @@ def main(opt, current_config): model_checkpoint = opt.checkpoint - checkpoint = torch.load(model_checkpoint) + checkpoint = torch.load(model_checkpoint)#, map_location=torch.device("cpu")) print('Checkpoint loaded from {}'.format(model_checkpoint)) loaded_config = checkpoint['config'] @@ -23,14 +24,18 @@ def main(opt, current_config): loaded_config['dataset']['images-path'] = current_config['dataset']['images-path'] loaded_config['dataset']['data'] = current_config['dataset']['data'] 
loaded_config['image-model']['pre-extracted-features-root'] = current_config['image-model']['pre-extracted-features-root'] + loaded_config['training']['bs'] = current_config['training']['bs'] - evaluation.evalrank(loaded_config, checkpoint, split="test", fold5=fold5) + evaluation.evalrank(loaded_config, checkpoint, split="test", fold5=False) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('checkpoint', type=str, help="Checkpoint to load") parser.add_argument('--size', type=str, choices=['1k', '5k'], default='1k') - parser.add_argument('--config', type=str, default=None, help="Which configuration to use for overriding the checkpoint configuration. See into 'config' folder") + parser.add_argument('--config', type=str, default=None, help="Which configuration to use for overriding the " + "checkpoint configuration. See into 'config' folder") + + print("CUDA_VISIBLE_DEVICES: " + os.getenv("CUDA_VISIBLE_DEVICES", "")) opt = parser.parse_args() if opt.config is not None: From 121bf5497407f31058de8d9fbd85d73d61d35b00 Mon Sep 17 00:00:00 2001 From: floschne Date: Mon, 14 Dec 2020 11:53:10 +0100 Subject: [PATCH 06/21] added flags for i2t t2i and gpu --- evaluation.py | 116 +++++++++++++++++++++++++++++++++++--------------- test.py | 22 +++++++--- 2 files changed, 98 insertions(+), 40 deletions(-) diff --git a/evaluation.py b/evaluation.py index e4a5d68..3c0854f 100644 --- a/evaluation.py +++ b/evaluation.py @@ -150,7 +150,7 @@ def encode_data(model, data_loader, log_step=10, logging=print): return img_embs, cap_embs, img_lengths, cap_lengths -def evalrank(config, checkpoint, split='dev', fold5=False): +def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i2t=False): """ Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold cross-validation is done (only for MSCOCO). 
Otherwise, the full data is @@ -196,48 +196,94 @@ def evalrank(config, checkpoint, split='dev', fold5=False): if not fold5: # no cross-validation, full evaluation - # r, rt = i2t(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, cap_batches=5) - ri, rti = t2i(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, im_batches=5) - # ar = (r[0] + r[1] + r[2]) / 3 - ari = (ri[0] + ri[1] + ri[2]) / 3 - #rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2] - #print("rsum: %.1f" % rsum) - # print("Average i2t Recall: %.1f" % ar) - # print("Image to text: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % r) - print("Average t2i Recall: %.1f" % ari) - print("Text to image: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % ri) + if eval_i2t: + r, rt = i2t(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, cap_batches=5) + ar = (r[0] + r[1] + r[2]) / 3 + print("Average i2t Recall: %.1f" % ar) + print("Image to text: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % r) + + if eval_t2i: + ri, rti = t2i(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, im_batches=5) + ari = (ri[0] + ri[1] + ri[2]) / 3 + print("Average t2i Recall: %.1f" % ari) + print("Text to image: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % ri) + + if eval_i2t and eval_t2i: + rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2] + print("rsum: %.1f" % rsum) + + + else: # 5fold cross-validation, only for MSCOCO results = [] for i in range(5): - r, rt0 = i2t(img_embs[i * 5000:(i + 1) * 5000], cap_embs[i * 5000:(i + 1) * 5000], - img_lenghts[i * 5000:(i + 1) * 5000], cap_lenghts[i * 5000:(i + 1) * 5000], - return_ranks=True, ndcg_scorer=ndcg_val_scorer, fold_index=i, 
sim_function=sim_matrix_fn, cap_batches=1) - print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f ndcg_spice=%.4f" % r) - ri, rti0 = t2i(img_embs[i * 5000:(i + 1) * 5000], cap_embs[i * 5000:(i + 1) * 5000], - img_lenghts[i * 5000:(i + 1) * 5000], cap_lenghts[i * 5000:(i + 1) * 5000], - return_ranks=True, ndcg_scorer=ndcg_val_scorer, fold_index=i, sim_function=sim_matrix_fn, im_batches=1) - if i == 0: - rt, rti = rt0, rti0 - print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % ri) - ar = (r[0] + r[1] + r[2]) / 3 - ari = (ri[0] + ri[1] + ri[2]) / 3 - rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2] - print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari)) - results += [list(r) + list(ri) + [ar, ari, rsum]] + if eval_i2t: + r, rt0 = i2t(img_embs[i * 5000:(i + 1) * 5000], cap_embs[i * 5000:(i + 1) * 5000], + img_lenghts[i * 5000:(i + 1) * 5000], cap_lenghts[i * 5000:(i + 1) * 5000], + return_ranks=True, ndcg_scorer=ndcg_val_scorer, fold_index=i, sim_function=sim_matrix_fn, cap_batches=1) + print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f ndcg_spice=%.4f" % r) + if i == 0: + rt = rt0 + ar = (r[0] + r[1] + r[2]) / 3 + if eval_t2i: + ri, rti0 = t2i(img_embs[i * 5000:(i + 1) * 5000], cap_embs[i * 5000:(i + 1) * 5000], + img_lenghts[i * 5000:(i + 1) * 5000], cap_lenghts[i * 5000:(i + 1) * 5000], + return_ranks=True, ndcg_scorer=ndcg_val_scorer, fold_index=i, sim_function=sim_matrix_fn, im_batches=1) + print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % ri) + if i == 0: + rti = rti0 + ari = (ri[0] + ri[1] + ri[2]) / 3 + + + if eval_t2i and eval_i2t: + rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2] + print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari)) + elif eval_t2i: + print("ari: %.1f" % (ari,)) + elif eval_i2t: + print("ar: %.1f" % (ar,)) + + + if eval_t2i and eval_i2t: + results += [list(r) + list(ri) + [ar, ari, rsum]] # 7 + 7 + 3 = 17 elements + elif 
eval_t2i: + results += [list(ri) + [ari]] # 7 + 1 = 8 elements + elif eval_i2t: + results += [list(r) + [ar]] # 7 + 1 = 8 elements + + print("-----------------------------------") print("Mean metrics: ") mean_metrics = tuple(np.array(results).mean(axis=0).flatten()) - print("rsum: %.1f" % (mean_metrics[16] * 6)) - print("Average i2t Recall: %.1f" % mean_metrics[14]) - print("Image to text: %.1f %.1f %.1f %.1f %.1f ndcg_rouge=%.4f ndcg_spice=%.4f" % - mean_metrics[:7]) - print("Average t2i Recall: %.1f" % mean_metrics[15]) - print("Text to image: %.1f %.1f %.1f %.1f %.1f ndcg_rouge=%.4f ndcg_spice=%.4f" % - mean_metrics[7:14]) - - torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar') + if eval_t2i and eval_i2t: + print("rsum: %.1f" % (mean_metrics[16] * 6)) + print("Average i2t Recall: %.1f" % mean_metrics[14]) + print("Image to text: %.1f %.1f %.1f %.1f %.1f ndcg_rouge=%.4f ndcg_spice=%.4f" % + mean_metrics[:7]) + print("Average t2i Recall: %.1f" % mean_metrics[15]) + print("Text to image: %.1f %.1f %.1f %.1f %.1f ndcg_rouge=%.4f ndcg_spice=%.4f" % + mean_metrics[7:14]) + elif eval_t2i: + print("Average t2i Recall: %.1f" % mean_metrics[7]) + print("Text to image: %.1f %.1f %.1f %.1f %.1f ndcg_rouge=%.4f ndcg_spice=%.4f" % + mean_metrics[:7]) + elif eval_i2t: + print("Average i2t Recall: %.1f" % mean_metrics[7]) + print("Image to text: %.1f %.1f %.1f %.1f %.1f ndcg_rouge=%.4f ndcg_spice=%.4f" % + mean_metrics[:7]) + + + + + if eval_t2i and eval_i2t: + torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar') + elif eval_t2i: + torch.save({'rti': rti}, 'ranks.pth.tar') + elif eval_i2t: + torch.save({'rt': rt}, 'ranks.pth.tar') + def i2t(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=False, ndcg_scorer=None, fold_index=0, measure='dot', sim_function=None, cap_batches=1): diff --git a/test.py b/test.py index e3fa3f0..9586784 100644 --- a/test.py +++ b/test.py @@ -1,14 +1,20 @@ -import os import argparse +import os -import evaluation -import yaml import 
torch +import yaml + +import evaluation + def main(opt, current_config): model_checkpoint = opt.checkpoint - checkpoint = torch.load(model_checkpoint)#, map_location=torch.device("cpu")) + if opt.gpu: + checkpoint = torch.load(model_checkpoint) # , map_location=torch.device("cpu")) + else: + checkpoint = torch.load(model_checkpoint, map_location=torch.device("cpu")) + print('Checkpoint loaded from {}'.format(model_checkpoint)) loaded_config = checkpoint['config'] @@ -26,12 +32,18 @@ def main(opt, current_config): loaded_config['image-model']['pre-extracted-features-root'] = current_config['image-model']['pre-extracted-features-root'] loaded_config['training']['bs'] = current_config['training']['bs'] - evaluation.evalrank(loaded_config, checkpoint, split="test", fold5=False) + evaluation.evalrank(loaded_config, checkpoint, split="test", fold5=False, eval_t2i=opt.t2i, eval_i2t=opt.i2t) + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('checkpoint', type=str, help="Checkpoint to load") parser.add_argument('--size', type=str, choices=['1k', '5k'], default='1k') + parser.add_argument('--gpu', type=bool, default=True, help="If false, CPU is used for computations; GPU otherwise.") + parser.add_argument('--t2i', type=bool, default=True, help="If true text-to-image (image retrieval) evaluation " + "will be executed.") + parser.add_argument('--i2t', type=bool, default=False, help="If true image-to-text (image captioning) evaluation " + "will be executed.") parser.add_argument('--config', type=str, default=None, help="Which configuration to use for overriding the " "checkpoint configuration. 
See into 'config' folder") From 0ed07303e89e95565427560c3c5665fa82da4a67 Mon Sep 17 00:00:00 2001 From: floschne Date: Mon, 14 Dec 2020 11:54:30 +0100 Subject: [PATCH 07/21] added timing outputs for evaluation --- configs/teran_coco_MrSw.yaml | 2 +- evaluation.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/configs/teran_coco_MrSw.yaml b/configs/teran_coco_MrSw.yaml index 5e8a440..d3458aa 100644 --- a/configs/teran_coco_MrSw.yaml +++ b/configs/teran_coco_MrSw.yaml @@ -51,7 +51,7 @@ training: alignment-mode: 'MrSw' measure: 'dot' margin: 0.2 - bs: 100 + bs: 40 scheduler: 'steplr' gamma: 0.1 step-size: 20 diff --git a/evaluation.py b/evaluation.py index 3c0854f..f66c47b 100644 --- a/evaluation.py +++ b/evaluation.py @@ -156,6 +156,8 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i cross-validation is done (only for MSCOCO). Otherwise, the full data is used for evaluation. """ + evalrank_start_time = time.time(); + # load model and options # checkpoint = torch.load(model_path) data_path = config['dataset']['data'] @@ -177,7 +179,10 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i sim_matrix_fn = AlignmentContrastiveLoss(aggregation=config['training']['alignment-mode'], return_similarity_mat=True) if config['training']['loss-type'] == 'alignment' else None print('Computing results...') + encode_data_start_time = time.time() img_embs, cap_embs, img_lenghts, cap_lenghts = encode_data(model, data_loader) + print(f"Time elapsed for encode_data: {time.time() - encode_data_start_time} seconds." 
) + torch.cuda.empty_cache() # if checkpoint2 is not None: @@ -197,17 +202,25 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i if not fold5: # no cross-validation, full evaluation if eval_i2t: + eval_i2t_start_time = time.time() + r, rt = i2t(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, cap_batches=5) ar = (r[0] + r[1] + r[2]) / 3 print("Average i2t Recall: %.1f" % ar) print("Image to text: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % r) + print(f"Time elapsed for i2t evaluation without 5-fold CV: {time.time() - eval_i2t_start_time} seconds." ) + if eval_t2i: + eval_t2i_start_time = time.time() + ri, rti = t2i(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, im_batches=5) ari = (ri[0] + ri[1] + ri[2]) / 3 print("Average t2i Recall: %.1f" % ari) print("Text to image: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % ri) + print(f"Time elapsed for i2t evaluation without 5-fold CV: {time.time() - eval_t2i_start_time} seconds.") + if eval_i2t and eval_t2i: rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2] print("rsum: %.1f" % rsum) @@ -284,6 +297,8 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i elif eval_i2t: torch.save({'rt': rt}, 'ranks.pth.tar') + print(f"Time elapsed for evalrank(): {time.time() - evalrank_start_time} seconds.") + def i2t(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=False, ndcg_scorer=None, fold_index=0, measure='dot', sim_function=None, cap_batches=1): From cc9f94df55f7f76dd4ae11e29108fa6965be31e8 Mon Sep 17 00:00:00 2001 From: floschne Date: Mon, 14 Dec 2020 16:01:27 +0100 Subject: [PATCH 08/21] fixed flags for i2t and t2i --- evaluation.py | 2 +- test.py | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/evaluation.py b/evaluation.py index 
f66c47b..e2b89e6 100644 --- a/evaluation.py +++ b/evaluation.py @@ -156,7 +156,7 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i cross-validation is done (only for MSCOCO). Otherwise, the full data is used for evaluation. """ - evalrank_start_time = time.time(); + evalrank_start_time = time.time() # load model and options # checkpoint = torch.load(model_path) diff --git a/test.py b/test.py index 9586784..123ef5c 100644 --- a/test.py +++ b/test.py @@ -29,7 +29,8 @@ def main(opt, current_config): if current_config is not None: loaded_config['dataset']['images-path'] = current_config['dataset']['images-path'] loaded_config['dataset']['data'] = current_config['dataset']['data'] - loaded_config['image-model']['pre-extracted-features-root'] = current_config['image-model']['pre-extracted-features-root'] + loaded_config['image-model']['pre-extracted-features-root'] = current_config['image-model'][ + 'pre-extracted-features-root'] loaded_config['training']['bs'] = current_config['training']['bs'] evaluation.evalrank(loaded_config, checkpoint, split="test", fold5=False, eval_t2i=opt.t2i, eval_i2t=opt.i2t) @@ -40,10 +41,10 @@ def main(opt, current_config): parser.add_argument('checkpoint', type=str, help="Checkpoint to load") parser.add_argument('--size', type=str, choices=['1k', '5k'], default='1k') parser.add_argument('--gpu', type=bool, default=True, help="If false, CPU is used for computations; GPU otherwise.") - parser.add_argument('--t2i', type=bool, default=True, help="If true text-to-image (image retrieval) evaluation " - "will be executed.") - parser.add_argument('--i2t', type=bool, default=False, help="If true image-to-text (image captioning) evaluation " - "will be executed.") + parser.add_argument('--t2i', action='store_true', default=True, + help="If set text-to-image (image retrieval) evaluation will be executed.") + parser.add_argument('--i2t', action='store_true', default=False, + help="If set image-to-text (image 
captioning) evaluation will be executed.") parser.add_argument('--config', type=str, default=None, help="Which configuration to use for overriding the " "checkpoint configuration. See into 'config' folder") @@ -55,4 +56,4 @@ def main(opt, current_config): config = yaml.load(ymlfile) else: config = None - main(opt, config) \ No newline at end of file + main(opt, config) From 6f562b1e56997d53917ee42f3985e08a15842c38 Mon Sep 17 00:00:00 2001 From: floschne Date: Mon, 28 Dec 2020 12:21:40 +0100 Subject: [PATCH 09/21] improved code readability by renaming some variable names and adding some comments --- data.py | 88 ++++++++++++++--------------- evaluate_utils/compute_relevance.py | 2 +- evaluation.py | 79 ++++++++++++++++---------- 3 files changed, 93 insertions(+), 76 deletions(-) diff --git a/data.py b/data.py index 9b988fd..2fbecbf 100644 --- a/data.py +++ b/data.py @@ -1,17 +1,15 @@ +import json as jsonmod +import os +import pickle + +import numpy as np import torch import torch.utils.data as data import torchvision.transforms as transforms -import os -import nltk +import tqdm from PIL import Image from pycocotools.coco import COCO -import numpy as np -import json as jsonmod -from collections.abc import Sequence -import shelve from transformers import BertTokenizer -import pickle -import tqdm from features import HuggingFaceTransformerExtractor @@ -82,21 +80,21 @@ def get_paths(config): class CocoDataset(data.Dataset): """COCO Custom Dataset compatible with torch.utils.data.DataLoader.""" - def __init__(self, root, json, transform=None, ids=None, get_images=True): + def __init__(self, imgs_root, captions_json, transform=None, ids=None, get_images=True): """ Args: - root: image directory. - json: coco annotation file path. + imgs_root: image directory. + captions_json: coco annotation file path. transform: transformer for image. 
""" - self.root = root + self.root = imgs_root self.get_images = get_images # when using `restval`, two json files are needed - if isinstance(json, tuple): - self.coco = (COCO(json[0]), COCO(json[1])) + if isinstance(captions_json, tuple): + self.coco = (COCO(captions_json[0]), COCO(captions_json[1])) else: - self.coco = (COCO(json),) - self.root = (root,) + self.coco = (COCO(captions_json),) + self.root = (imgs_root,) # if ids provided by get_paths, use split-specific ids if ids is None: self.ids = list(self.coco.anns.keys()) @@ -123,7 +121,7 @@ def __getitem__(self, index): return image, target, index, img_id def get_raw_item(self, index, load_image=True): - if index < self.bp: + if index < self.bp: # bp -> breakpoint to stop after N samples coco = self.coco[0] root = self.root[0] else: @@ -132,8 +130,8 @@ def get_raw_item(self, index, load_image=True): ann_id = self.ids[index] caption = coco.anns[ann_id]['caption'] img_id = coco.anns[ann_id]['image_id'] - img = coco.imgs[img_id] - img_size = np.array([img['width'], img['height']]) + img_metadata = coco.imgs[img_id] + img_size = np.array([img_metadata['width'], img_metadata['height']]) if load_image: path = coco.loadImgs(img_id)[0]['file_name'] image = Image.open(os.path.join(root, path)).convert('RGB') @@ -147,14 +145,14 @@ def __len__(self): class BottomUpFeaturesDataset: - def __init__(self, root, json, features_path, split, ids=None, **kwargs): + def __init__(self, imgs_root, captions_json, features_path, split, ids=None, **kwargs): # which dataset? 
- r = root[0] if type(root) == tuple else root + r = imgs_root[0] if type(imgs_root) == tuple else imgs_root r = r.lower() if 'coco' in r: - self.underlying_dataset = CocoDataset(root, json, ids=ids) + self.underlying_dataset = CocoDataset(imgs_root, captions_json, ids=ids) elif 'f30k' in r or 'flickr30k' in r: - self.underlying_dataset = FlickrDataset(root, json, split) + self.underlying_dataset = FlickrDataset(imgs_root, captions_json, split) # data_path = config['image-model']['data-path'] self.feats_data_path = os.path.join(features_path, 'bu_att') @@ -191,7 +189,7 @@ def __getitem__(self, index): else: target = caption # image = (img_feat, img_boxes) - return img_feat, img_boxes, target, index, img_id + return img_feat, img_boxes, target, index, img_id # target is the actual caption sentence def __len__(self): return len(self.underlying_dataset) @@ -348,31 +346,31 @@ def __call__(self, data): return img_features, targets, feat_lengths, cap_lengths, out_boxes, ids -def get_loader_single(data_name, split, root, json, transform, preextracted_root=None, +def get_loader_single(data_name, split, imgs_root, captions_json, transform, pre_extracted_root=None, batch_size=100, shuffle=True, num_workers=2, ids=None, collate_fn=None, **kwargs): """Returns torch.utils.data.DataLoader for custom coco dataset.""" if 'coco' in data_name: - if preextracted_root is not None: - dataset = BottomUpFeaturesDataset(root=root, - json=json, - features_path=preextracted_root, split=split, + if pre_extracted_root is not None: + dataset = BottomUpFeaturesDataset(imgs_root=imgs_root, + captions_json=captions_json, + features_path=pre_extracted_root, split=split, ids=ids, **kwargs) else: # COCO custom dataset - dataset = CocoDataset(root=root, - json=json, + dataset = CocoDataset(imgs_root=imgs_root, + captions_json=captions_json, transform=transform, ids=ids) elif 'f8k' in data_name or 'f30k' in data_name: - if preextracted_root is not None: - dataset = BottomUpFeaturesDataset(root=root, - 
json=json, - features_path=preextracted_root, split=split, + if pre_extracted_root is not None: + dataset = BottomUpFeaturesDataset(imgs_root=imgs_root, + captions_json=captions_json, + features_path=pre_extracted_root, split=split, ids=ids, **kwargs) else: - dataset = FlickrDataset(root=root, + dataset = FlickrDataset(root=imgs_root, split=split, - json=json, + json=captions_json, transform=transform) # Data loader @@ -385,7 +383,7 @@ def get_loader_single(data_name, split, root, json, transform, preextracted_root return data_loader -def get_transform(data_name, split_name, config): +def get_transform(data_name=None, split_name=None, config=None): normalizer = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) t_list = [] @@ -417,7 +415,7 @@ def get_loaders(config, workers, batch_size=None): roots['train']['img'], roots['train']['cap'], transform, ids=ids['train'], - preextracted_root=preextracted_root, + pre_extracted_root=preextracted_root, batch_size=batch_size, shuffle=True, num_workers=workers, collate_fn=collate_fn, config=config) @@ -427,7 +425,7 @@ def get_loaders(config, workers, batch_size=None): roots['val']['img'], roots['val']['cap'], transform, ids=ids['val'], - preextracted_root=preextracted_root, + pre_extracted_root=preextracted_root, batch_size=batch_size, shuffle=False, num_workers=workers, collate_fn=collate_fn, config=config) @@ -443,15 +441,15 @@ def get_test_loader(config, workers, split_name='test', batch_size=None): # Build Dataset Loader roots, ids = get_paths(config) - preextracted_root = config['image-model']['pre-extracted-features-root'] \ + pre_extracted_root = config['image-model']['pre-extracted-features-root'] \ if 'pre-extracted-features-root' in config['image-model'] else None transform = get_transform(data_name, split_name, config) test_loader = get_loader_single(data_name, split_name, - roots[split_name]['img'], - roots[split_name]['cap'], - transform, ids=ids[split_name], - 
preextracted_root=preextracted_root, + imgs_root=roots[split_name]['img'], + captions_json=roots[split_name]['cap'], + transform=transform, ids=ids[split_name], + pre_extracted_root=pre_extracted_root, batch_size=batch_size, shuffle=False, num_workers=workers, collate_fn=collate_fn, config=config) diff --git a/evaluate_utils/compute_relevance.py b/evaluate_utils/compute_relevance.py index aa67bf8..07c34cd 100644 --- a/evaluate_utils/compute_relevance.py +++ b/evaluate_utils/compute_relevance.py @@ -58,7 +58,7 @@ def get_dataset(config, split): data_name = config['dataset']['name'] if 'coco' in data_name: # COCO custom dataset - dataset = data.CocoDataset(root=roots[split]['img'], json=roots[split]['cap'], ids=ids[split], get_images=False) + dataset = data.CocoDataset(imgs_root=roots[split]['img'], captions_json=roots[split]['cap'], ids=ids[split], get_images=False) elif 'f8k' in data_name or 'f30k' in data_name: dataset = data.FlickrDataset(root=roots[split]['img'], split=split, json=roots[split]['cap'], get_images=False) return dataset diff --git a/evaluation.py b/evaluation.py index e2b89e6..8232174 100644 --- a/evaluation.py +++ b/evaluation.py @@ -1,16 +1,17 @@ from __future__ import print_function -import numpy - -from data import get_test_loader import time +from collections import OrderedDict + +import numpy import numpy as np import torch import tqdm -from collections import OrderedDict -from utils import dot_sim, get_model + from evaluate_utils.dcg import DCG from models.loss import order_sim, AlignmentContrastiveLoss +from utils import get_model +from data import get_test_loader class AverageMeter(object): @@ -176,12 +177,14 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i ndcg_val_scorer = DCG(config, len(data_loader.dataset), split, rank=25, relevance_methods=['rougeL', 'spice']) # initialize similarity matrix evaluator - sim_matrix_fn = AlignmentContrastiveLoss(aggregation=config['training']['alignment-mode'], 
return_similarity_mat=True) if config['training']['loss-type'] == 'alignment' else None + sim_matrix_fn = AlignmentContrastiveLoss(aggregation=config['training']['alignment-mode'], + return_similarity_mat=True) if config['training'][ + 'loss-type'] == 'alignment' else None print('Computing results...') encode_data_start_time = time.time() img_embs, cap_embs, img_lenghts, cap_lenghts = encode_data(model, data_loader) - print(f"Time elapsed for encode_data: {time.time() - encode_data_start_time} seconds." ) + print(f"Time elapsed for encode_data: {time.time() - encode_data_start_time} seconds.") torch.cuda.empty_cache() @@ -204,17 +207,32 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i if eval_i2t: eval_i2t_start_time = time.time() - r, rt = i2t(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, cap_batches=5) + r, rt = i2t(img_embs, + cap_embs, + img_lenghts, + cap_lenghts, + return_ranks=True, + ndcg_scorer=ndcg_val_scorer, + sim_function=sim_matrix_fn, + cap_batches=5) ar = (r[0] + r[1] + r[2]) / 3 print("Average i2t Recall: %.1f" % ar) print("Image to text: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % r) - print(f"Time elapsed for i2t evaluation without 5-fold CV: {time.time() - eval_i2t_start_time} seconds." 
) + print(f"Time elapsed for i2t evaluation without 5-fold CV: {time.time() - eval_i2t_start_time} seconds.") if eval_t2i: eval_t2i_start_time = time.time() - ri, rti = t2i(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, im_batches=5) + ri, rti = t2i(img_embs, + cap_embs, + img_lenghts, + cap_lenghts, + return_ranks=True, + ndcg_scorer=ndcg_val_scorer, + sim_function=sim_matrix_fn, + im_batches=5) + ari = (ri[0] + ri[1] + ri[2]) / 3 print("Average t2i Recall: %.1f" % ari) print("Text to image: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % ri) @@ -234,7 +252,8 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i if eval_i2t: r, rt0 = i2t(img_embs[i * 5000:(i + 1) * 5000], cap_embs[i * 5000:(i + 1) * 5000], img_lenghts[i * 5000:(i + 1) * 5000], cap_lenghts[i * 5000:(i + 1) * 5000], - return_ranks=True, ndcg_scorer=ndcg_val_scorer, fold_index=i, sim_function=sim_matrix_fn, cap_batches=1) + return_ranks=True, ndcg_scorer=ndcg_val_scorer, fold_index=i, sim_function=sim_matrix_fn, + cap_batches=1) print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f ndcg_spice=%.4f" % r) if i == 0: rt = rt0 @@ -242,7 +261,8 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i if eval_t2i: ri, rti0 = t2i(img_embs[i * 5000:(i + 1) * 5000], cap_embs[i * 5000:(i + 1) * 5000], img_lenghts[i * 5000:(i + 1) * 5000], cap_lenghts[i * 5000:(i + 1) * 5000], - return_ranks=True, ndcg_scorer=ndcg_val_scorer, fold_index=i, sim_function=sim_matrix_fn, im_batches=1) + return_ranks=True, ndcg_scorer=ndcg_val_scorer, fold_index=i, sim_function=sim_matrix_fn, + im_batches=1) print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % ri) if i == 0: rti = rti0 @@ -257,15 +277,12 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i elif eval_i2t: print("ar: %.1f" % (ar,)) - if eval_t2i 
and eval_i2t: - results += [list(r) + list(ri) + [ar, ari, rsum]] # 7 + 7 + 3 = 17 elements + results += [list(r) + list(ri) + [ar, ari, rsum]] # 7 + 7 + 3 = 17 elements elif eval_t2i: - results += [list(ri) + [ari]] # 7 + 1 = 8 elements + results += [list(ri) + [ari]] # 7 + 1 = 8 elements elif eval_i2t: - results += [list(r) + [ar]] # 7 + 1 = 8 elements - - + results += [list(r) + [ar]] # 7 + 1 = 8 elements print("-----------------------------------") print("Mean metrics: ") @@ -343,8 +360,8 @@ def i2t(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=Fals d = d.cpu().numpy().flatten() else: for i in range(cap_batches): - captions_now = captions[i*captions_per_batch:(i+1)*captions_per_batch] - cap_lenghts_now = cap_lenghts[i*captions_per_batch:(i+1)*captions_per_batch] + captions_now = captions[i * captions_per_batch:(i + 1) * captions_per_batch] + cap_lenghts_now = cap_lenghts[i * captions_per_batch:(i + 1) * captions_per_batch] captions_now = captions_now.cuda() d_align = sim_function(im, captions_now, im_len, cap_lenghts_now) @@ -352,7 +369,7 @@ def i2t(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=Fals # d_matching = torch.mm(im[:, 0, :], captions[:, 0, :].t()) # d_matching = d_matching.cpu().numpy().flatten() if d is None: - d = d_align # + d_matching + d = d_align # + d_matching else: d = numpy.concatenate([d, d_align], axis=0) @@ -432,31 +449,33 @@ def t2i(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=Fals d = d.cpu().numpy() else: for i in range(im_batches): - ims_now = ims[i * images_per_batch:(i+1) * images_per_batch] - ims_len_now = ims_len[i * images_per_batch:(i+1) * images_per_batch] + ims_now = ims[i * images_per_batch:(i + 1) * images_per_batch] + ims_len_now = ims_len[i * images_per_batch:(i + 1) * images_per_batch] ims_now = ims_now.cuda() # d = numpy.dot(queries, ims.T) + # d_align is the (MrSw) aggregated/pooled similarity matrix A in the paper d_align = sim_function(ims_now, 
queries, ims_len_now, queries_len).t() d_align = d_align.cpu().numpy() # d_matching = torch.mm(queries[:, 0, :], ims[:, 0, :].t()) # d_matching = d_matching.cpu().numpy() if d is None: - d = d_align # + d_matching + d = d_align # + d_matching else: d = numpy.concatenate([d, d_align], axis=1) + # d contains all aggregated/pooled similarity matrices for all query-image pairs in the test set inds = numpy.zeros(d.shape) for i in range(len(inds)): inds[i] = numpy.argsort(d[i])[::-1] - ranks[5 * index + i] = numpy.where(inds[i] == index)[0][ - 0] # in che posizione e' l'immagine (index) che ha questa caption (5*index + i) + # in che posizione e' l'immagine (index) che ha questa caption (5*index + i) + ranks[5 * index + i] = numpy.where(inds[i] == index)[0][0] top50[5 * index + i] = inds[i][0:50] # calculate ndcg - # if ndcg_scorer is not None: - # rougel_ndcgs[5 * index + i], spice_ndcgs[5 * index + i] = \ - # ndcg_scorer.compute_ndcg(npts, 5 * index + i, inds[i].astype(int), - # fold_index=fold_index, retrieval='image').values() + if ndcg_scorer is not None: + rougel_ndcgs[5 * index + i], spice_ndcgs[5 * index + i] = \ + ndcg_scorer.compute_ndcg(npts, 5 * index + i, inds[i].astype(int), + fold_index=fold_index, retrieval='image').values() # Compute metrics r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks) From d0a175d713f6d899e746f090419188c7b42abf00 Mon Sep 17 00:00:00 2001 From: floschne Date: Mon, 28 Dec 2020 14:17:12 +0100 Subject: [PATCH 10/21] exiting program if CUDA_VISIBLE_DEVICES is not set --- test.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test.py b/test.py index 123ef5c..3effe87 100644 --- a/test.py +++ b/test.py @@ -1,5 +1,6 @@ import argparse import os +import sys import torch import yaml @@ -48,7 +49,9 @@ def main(opt, current_config): parser.add_argument('--config', type=str, default=None, help="Which configuration to use for overriding the " "checkpoint configuration. 
See into 'config' folder") - print("CUDA_VISIBLE_DEVICES: " + os.getenv("CUDA_VISIBLE_DEVICES", "")) + print("CUDA_VISIBLE_DEVICES: " + os.getenv("CUDA_VISIBLE_DEVICES", "NOT SET - ABORTING")) + if os.getenv("CUDA_VISIBLE_DEVICES", None) is None: + sys.exit(1) opt = parser.parse_args() if opt.config is not None: From fb63f919403f6a11f790d8a61623f95e6552cabd Mon Sep 17 00:00:00 2001 From: floschne Date: Wed, 30 Dec 2020 14:05:32 +0100 Subject: [PATCH 11/21] improved code readability by renaming some variable names and adding some comments --- data.py | 86 ++++++++++++++++------------- evaluate_utils/compute_relevance.py | 2 +- evaluation.py | 74 +++---------------------- models/text.py | 9 ++- utils.py | 58 +++++++++++++++++++ 5 files changed, 122 insertions(+), 107 deletions(-) diff --git a/data.py b/data.py index 2fbecbf..9aa4fde 100644 --- a/data.py +++ b/data.py @@ -15,23 +15,25 @@ def get_paths(config): + # noinspection PyIncorrectDocstring + # noinspection PyUnresolvedReferences """ - Returns paths to images and annotations for the given datasets. For MSCOCO - indices are also returned to control the data split being used. - The indices are extracted from the Karpathy et al. splits using this - snippet: - - >>> import json - >>> dataset=json.load(open('dataset_coco.json','r')) - >>> A=[] - >>> for i in range(len(D['images'])): - ... if D['images'][i]['split'] == 'val': - ... A+=D['images'][i]['sentids'][:5] - ... - - :param name: Dataset names - :param use_restval: If True, the the `restval` data is included in train. - """ + Returns paths to images and annotations for the given datasets. For MSCOCO + indices are also returned to control the data split being used. + The indices are extracted from the Karpathy et al. splits using this + snippet: + + >>> import json + >>> dataset=json.load(open('dataset_coco.json','r')) + >>> A=[] + >>> for i in range(len(D['images'])): + ... if D['images'][i]['split'] == 'val': + ... A+=D['images'][i]['sentids'][:5] + ... 
+ + :param name: Dataset names + :param use_restval: If True, the the `restval` data is included in train. + """ name = config['dataset']['name'] annotations_path = os.path.join(config['dataset']['data'], name, 'annotations') use_restval = config['dataset']['restval'] @@ -62,7 +64,8 @@ def get_paths(config): ids['test'] = np.load(os.path.join(annotations_path, 'coco_test_ids.npy')) ids['trainrestval'] = ( ids['train'], - np.load(os.path.join(annotations_path, 'coco_restval_ids.npy'))) + np.load(os.path.join(annotations_path, 'coco_restval_ids.npy')) + ) if use_restval: roots['train'] = roots['trainrestval'] ids['train'] = ids['trainrestval'] @@ -80,7 +83,7 @@ def get_paths(config): class CocoDataset(data.Dataset): """COCO Custom Dataset compatible with torch.utils.data.DataLoader.""" - def __init__(self, imgs_root, captions_json, transform=None, ids=None, get_images=True): + def __init__(self, imgs_root, captions_json, transform=None, coco_annotation_ids=None, get_images=True): """ Args: imgs_root: image directory. 
@@ -96,17 +99,17 @@ def __init__(self, imgs_root, captions_json, transform=None, ids=None, get_image self.coco = (COCO(captions_json),) self.root = (imgs_root,) # if ids provided by get_paths, use split-specific ids - if ids is None: - self.ids = list(self.coco.anns.keys()) + if coco_annotation_ids is None: + self.annotation_ids = list(self.coco[0].anns.keys()) else: - self.ids = ids + self.annotation_ids = coco_annotation_ids # if `restval` data is to be used, record the break point for ids - if isinstance(self.ids, tuple): - self.bp = len(self.ids[0]) - self.ids = list(self.ids[0]) + list(self.ids[1]) + if isinstance(self.annotation_ids, tuple): + self.bp = len(self.annotation_ids[0]) + self.annotation_ids = list(self.annotation_ids[0]) + list(self.annotation_ids[1]) else: - self.bp = len(self.ids) + self.bp = len(self.annotation_ids) self.transform = transform def __getitem__(self, index): @@ -127,7 +130,7 @@ def get_raw_item(self, index, load_image=True): else: coco = self.coco[1] root = self.root[1] - ann_id = self.ids[index] + ann_id = self.annotation_ids[index] caption = coco.anns[ann_id]['caption'] img_id = coco.anns[ann_id]['image_id'] img_metadata = coco.imgs[img_id] @@ -141,7 +144,7 @@ def get_raw_item(self, index, load_image=True): return root, caption, img_id, None, None, img_size def __len__(self): - return len(self.ids) + return len(self.annotation_ids) class BottomUpFeaturesDataset: @@ -150,7 +153,7 @@ def __init__(self, imgs_root, captions_json, features_path, split, ids=None, **k r = imgs_root[0] if type(imgs_root) == tuple else imgs_root r = r.lower() if 'coco' in r: - self.underlying_dataset = CocoDataset(imgs_root, captions_json, ids=ids) + self.underlying_dataset = CocoDataset(imgs_root, captions_json, coco_annotation_ids=ids) elif 'f30k' in r or 'flickr30k' in r: self.underlying_dataset = FlickrDataset(imgs_root, captions_json, split) @@ -275,12 +278,12 @@ def __call__(self, data): Returns: images: torch tensor of shape (batch_size, 3, 256, 
256). - targets: torch tensor of shape (batch_size, padded_length). + targets: torch tensor of shape (batch_size, padded_length). -> the textual tokens lengths: list; valid length for each padded caption. """ # Sort a data list by caption length # data.sort(key=lambda x: len(x[1]), reverse=True) - if len(data[0]) == 5: # TODO: find a better way to distinguish the two + if len(data[0]) == 5: # TODO: find a better way to distinguish the two images, boxes, captions, ids, img_ids = zip(*data) elif len(data[0]) == 4: images, captions, ids, img_ids = zip(*data) @@ -294,14 +297,17 @@ def __call__(self, data): cap_features = [torch.FloatTensor(f) for f in cap_features] wembeddings = [torch.FloatTensor(w) for w in wembeddings] else: - if self.vocab_type == 'bert': + if self.vocab_type == 'bert': cap_lengths = [len(self.tokenizer.tokenize(c)) + 2 for c in - captions] # + 2 in order to account for begin and end tokens + captions] # + 2 in order to account for begin and end tokens max_len = max(cap_lengths) - captions_ids = [torch.LongTensor(self.tokenizer.encode(c, max_length=max_len, pad_to_max_length=True)) - for c in captions] + captions_token_ids = [torch.LongTensor(self.tokenizer.encode(c, + max_length=max_len, + padding='max_length', + truncation=True)) + for c in captions] - captions = captions_ids + captions = captions_token_ids # caption_ids are the token ids from bert tokenizer # Merge images (convert tuple of 3D tensor to 4D tensor) preextracted_images = not (images[0].shape[0] == 3) if not preextracted_images: @@ -337,12 +343,18 @@ def __call__(self, data): targets = torch.zeros(len(captions), max(cap_lengths)).long() for i, cap in enumerate(captions): end = cap_lengths[i] - targets[i, :end] = cap[:end] + targets[i, :end] = cap[:end] #caption token ids if not preextracted_images: return images, targets, None, cap_lengths, None, ids else: # features = features.permute(0, 2, 1) + # img_features -> from FRCNN >> B x 2048 + # targets -> padded caption token ids from 
BERT >> B x max_len(cap_lengths) or(queries) + # feat_lengths -> num of regions in the image (fixed to 36 + 1) >> B x 37 + # cap_lengths -> true length of the non-padded captions or queries >> B x 1 (list of len B) + # out_boxes -> spatial information of the region boxes >> B x 37 x 4 + # ids -> dataset indices wich are in this batch >> 1 x B (tuple of len B) return img_features, targets, feat_lengths, cap_lengths, out_boxes, ids @@ -360,7 +372,7 @@ def get_loader_single(data_name, split, imgs_root, captions_json, transform, pre # COCO custom dataset dataset = CocoDataset(imgs_root=imgs_root, captions_json=captions_json, - transform=transform, ids=ids) + transform=transform, coco_annotation_ids=ids) elif 'f8k' in data_name or 'f30k' in data_name: if pre_extracted_root is not None: dataset = BottomUpFeaturesDataset(imgs_root=imgs_root, diff --git a/evaluate_utils/compute_relevance.py b/evaluate_utils/compute_relevance.py index 07c34cd..ff2de4b 100644 --- a/evaluate_utils/compute_relevance.py +++ b/evaluate_utils/compute_relevance.py @@ -58,7 +58,7 @@ def get_dataset(config, split): data_name = config['dataset']['name'] if 'coco' in data_name: # COCO custom dataset - dataset = data.CocoDataset(imgs_root=roots[split]['img'], captions_json=roots[split]['cap'], ids=ids[split], get_images=False) + dataset = data.CocoDataset(imgs_root=roots[split]['img'], captions_json=roots[split]['cap'], coco_annotation_ids=ids[split], get_images=False) elif 'f8k' in data_name or 'f30k' in data_name: dataset = data.FlickrDataset(root=roots[split]['img'], split=split, json=roots[split]['cap'], get_images=False) return dataset diff --git a/evaluation.py b/evaluation.py index 8232174..63b5313 100644 --- a/evaluation.py +++ b/evaluation.py @@ -10,66 +10,8 @@ from evaluate_utils.dcg import DCG from models.loss import order_sim, AlignmentContrastiveLoss -from utils import get_model -from data import get_test_loader - - -class AverageMeter(object): - """Computes and stores the average and 
current value""" - - def __init__(self): - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=0): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / (.0001 + self.count) - - def __str__(self): - """String representation for logging - """ - # for values that should be recorded exactly e.g. iteration number - if self.count == 0: - return str(self.val) - # for stats - return '%.4f (%.4f)' % (self.val, self.avg) - - -class LogCollector(object): - """A collection of logging objects that can change from train to val""" - - def __init__(self): - # to keep the order of logged variables deterministic - self.meters = OrderedDict() - - def update(self, k, v, n=0): - # create a new meter if previously not recorded - if k not in self.meters: - self.meters[k] = AverageMeter() - self.meters[k].update(v, n) - - def __str__(self): - """Concatenate the meters in one log line - """ - s = '' - for i, (k, v) in enumerate(self.meters.items()): - if i > 0: - s += ' ' - s += k + ' ' + str(v) - return s - - def tb_log(self, tb_logger, prefix='', step=None): - """Log using tensorboard - """ - for k, v in self.meters.items(): - tb_logger.add_scalar(prefix + k, v.val, global_step=step) +from utils import get_model, AverageMeter, LogCollector +from data import get_coco_image_retrieval_data_loader, get_test_loader def encode_data(model, data_loader, log_step=10, logging=print): @@ -108,14 +50,13 @@ def encode_data(model, data_loader, log_step=10, logging=print): else: text = targets captions = targets - wembeddings = model.img_txt_enc.txt_enc.word_embeddings(captions.cuda() if torch.cuda.is_available() else captions) # compute the embeddings with torch.no_grad(): _, _, img_emb, cap_emb, cap_length = model.forward_emb(images, text, img_length, cap_length, boxes) # initialize the numpy arrays given the size of the embeddings - if img_embs is None: + if img_embs is None: # N x max_len x 1024 
img_embs = torch.zeros((len(data_loader.dataset), max_img_len, img_emb.size(2))) cap_embs = torch.zeros((len(data_loader.dataset), max_cap_len, cap_emb.size(2))) @@ -237,7 +178,7 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i print("Average t2i Recall: %.1f" % ari) print("Text to image: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % ri) - print(f"Time elapsed for i2t evaluation without 5-fold CV: {time.time() - eval_t2i_start_time} seconds.") + print(f"Time elapsed for t2i evaluation without 5-fold CV: {time.time() - eval_t2i_start_time} seconds.") if eval_i2t and eval_t2i: rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2] @@ -317,8 +258,8 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i print(f"Time elapsed for evalrank(): {time.time() - evalrank_start_time} seconds.") - -def i2t(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=False, ndcg_scorer=None, fold_index=0, measure='dot', sim_function=None, cap_batches=1): +def i2t(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=False, ndcg_scorer=None, fold_index=0, + measure='dot', sim_function=None, cap_batches=1): """ Images->Text (Image Annotation) Images: (5N, K) matrix of images @@ -404,7 +345,8 @@ def i2t(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=Fals return (r1, r5, r10, medr, meanr, mean_rougel_ndcg, mean_spice_ndcg) -def t2i(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=False, ndcg_scorer=None, fold_index=0, measure='dot', sim_function=None, im_batches=1): +def t2i(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=False, ndcg_scorer=None, fold_index=0, + measure='dot', sim_function=None, im_batches=1): """ Text->Images (Image Search) Images: (5N, K) matrix of images diff --git a/models/text.py b/models/text.py index 0dac895..10d23b0 100644 --- a/models/text.py +++ b/models/text.py @@ -58,7 +58,7 @@ def forward(self, x, 
lengths): # Reshape *final* output to (batch_size, hidden_size) padded = pad_packed_sequence(out, batch_first=True) I = torch.LongTensor(lengths).view(-1, 1, 1) - I = (I.expand(x.size(0), 1, self.embed_size)-1).to(x.device) + I = (I.expand(x.size(0), 1, self.embed_size) - 1).to(x.device) out = torch.gather(padded[0], 1, I).squeeze(1) # normalization in the joint embedding space @@ -105,6 +105,8 @@ def forward(self, x, lengths): lengths: tensor of lengths (LongTensor) of size B ''' if not self.preextracted or self.post_transformer_layers > 0: + # this code builds the attention_mask so that its 1 for every valid token and pads 0 for the max len + # attention_mask is a kinda padding max_len = max(lengths) attention_mask = torch.ones(x.shape[0], max_len) for e, l in zip(attention_mask, lengths): @@ -115,7 +117,8 @@ def forward(self, x, lengths): outputs = x else: outputs = self.bert_model(x, attention_mask=attention_mask) - outputs = outputs[2][-1] + # https://huggingface.co/transformers/model_doc/bert.html#bertmodel + outputs = outputs[2][-1] # -> hidden_states[-1] if self.post_transformer_layers > 0: outputs = outputs.permute(1, 0, 2) @@ -124,7 +127,7 @@ def forward(self, x, lengths): if self.mean: x = outputs.mean(dim=1) else: - x = outputs[:, 0, :] # from the last layer take only the first word + x = outputs[:, 0, :] # from the last layer take only the first word out = self.map(x) diff --git a/utils.py b/utils.py index 1f2cb6a..6b46f17 100644 --- a/utils.py +++ b/utils.py @@ -16,3 +16,61 @@ def cosine_sim(x, y): x = x / numpy.expand_dims(numpy.linalg.norm(x, axis=1), 1) y = y / numpy.expand_dims(numpy.linalg.norm(y, axis=1), 1) return numpy.dot(x, y.T) + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=0): + self.val = val + self.sum += val * n + self.count += n + self.avg = 
self.sum / (.0001 + self.count) + + def __str__(self): + """String representation for logging + """ + # for values that should be recorded exactly e.g. iteration number + if self.count == 0: + return str(self.val) + # for stats + return '%.4f (%.4f)' % (self.val, self.avg) + + +class LogCollector(object): + """A collection of logging objects that can change from train to val""" + + def __init__(self): + # to keep the order of logged variables deterministic + self.meters = OrderedDict() + + def update(self, k, v, n=0): + # create a new meter if previously not recorded + if k not in self.meters: + self.meters[k] = AverageMeter() + self.meters[k].update(v, n) + + def __str__(self): + """Concatenate the meters in one log line + """ + s = '' + for i, (k, v) in enumerate(self.meters.items()): + if i > 0: + s += ' ' + s += k + ' ' + str(v) + return s + + def tb_log(self, tb_logger, prefix='', step=None): + """Log using tensorboard + """ + for k, v in self.meters.items(): + tb_logger.add_scalar(prefix + k, v.val, global_step=step) From 42378158f329d7959507f06a7a0191484ab86849 Mon Sep 17 00:00:00 2001 From: floschne Date: Wed, 30 Dec 2020 14:09:25 +0100 Subject: [PATCH 12/21] first working (but not fully optimized) IR Inference --- __init__.py | 1 + configs/teran_coco_MrSw_IR.yaml | 63 ++++++++++ data.py | 204 ++++++++++++++++++++++++++++++- evaluation.py | 1 - inference.py | 210 ++++++++++++++++++++++++++------ utils.py | 2 + 6 files changed, 444 insertions(+), 37 deletions(-) create mode 100644 __init__.py create mode 100644 configs/teran_coco_MrSw_IR.yaml diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..705e854 --- /dev/null +++ b/__init__.py @@ -0,0 +1 @@ +from .data import d \ No newline at end of file diff --git a/configs/teran_coco_MrSw_IR.yaml b/configs/teran_coco_MrSw_IR.yaml new file mode 100644 index 0000000..6d650c7 --- /dev/null +++ b/configs/teran_coco_MrSw_IR.yaml @@ -0,0 +1,63 @@ +dataset: + name: 'coco' + images-path: 
'data/coco/images' # not needed if using pre-extracted bottom-up features + data: 'data' + restval: True + pre-extracted-features: False + +image-retrieval: + dataset: 'coco' # for now only coco support + split: 'test' # we can remove this in later versions + num_imgs: 5000 + batch_size: 100 + pre-extracted-img-features-root: 'data/coco/features_36' + create_query_batch: True + alignment_mode: 'MrSw' + + + +text-model: + name: 'bert' + pretrain: 'bert-base-uncased' + word-dim: 768 + extraction-hidden-layer: 6 + fine-tune: True + pre-extracted: False + layers: 0 + dropout: 0.1 + +image-model: + name: 'bottomup' + pre-extracted-features-root: 'data/coco/features_36' + transformer-layers: 4 + dropout: 0.1 + pos-encoding: 'concat-and-process' + crop-size: 224 # not used + fine-tune: False + feat-dim: 2048 + norm: True + +model: + name: 'teran' + embed-size: 1024 + text-aggregation: 'first' + image-aggregation: 'first' + layers: 2 + exclude-stopwords: False + shared-transformer: False + dropout: 0.1 + +training: + lr: 0.00001 # 0.000006 + grad-clip: 2.0 + max-violation: True + loss-type: 'alignment' + alignment-mode: 'MrSw' + measure: 'dot' + margin: 0.2 + bs: 40 + scheduler: 'steplr' + gamma: 0.1 + step-size: 20 + warmup: null + warmup-period: 1000 diff --git a/data.py b/data.py index 9aa4fde..2a2505c 100644 --- a/data.py +++ b/data.py @@ -147,6 +147,65 @@ def __len__(self): return len(self.annotation_ids) +class CocoImageRetrievalDataset: + """ + Custom COCO Dataset that uses only the images together with a user query. + Compatible with torch.utils.data.DataLoader. 
+ """ + + def __init__(self, imgs_root, img_features_path, captions_json, coco_annotation_ids, query, num_imgs): + self.query = query + self.num_imgs = num_imgs + self.feats_data_path = os.path.join(img_features_path, 'bu_att') + self.box_data_path = os.path.join(img_features_path, 'bu_box') + self.imgs_root = imgs_root + + self.coco = COCO(captions_json) + self.anno_ids = coco_annotation_ids + + def __getitem__(self, idx): + """ + This function returns a tuple that is further passed to collate_fn + """ + img_id, img_size = self.get_raw_item(idx) + + img_feat_path = os.path.join(self.feats_data_path, '{}.npz'.format(img_id)) + img_box_path = os.path.join(self.box_data_path, '{}.npy'.format(img_id)) + + img_feat = np.load(img_feat_path)['feat'] + img_feat_box = np.load(img_box_path) + + # normalize box + img_feat_box = img_feat_box / np.tile(img_size, 2) + + img_feat = torch.Tensor(img_feat) + img_feat_box = torch.Tensor(img_feat_box) + + # we always return the query here since we want to compute the similarity of each image with the query + # this output is the input of the CollateFn + return img_feat, img_feat_box, img_id, self.query, idx + + def get_raw_item(self, idx): + next_img_idx = idx * 5 # in the coco dataset there are 5 captions for every image + ann_id = self.anno_ids[next_img_idx] + img_id = self.coco.anns[ann_id]['image_id'] + img_metadata = self.coco.imgs[img_id] + img_size = np.array([img_metadata['width'], img_metadata['height']]) + + return img_id, img_size + + def get_image_metadata(self, idx): + # TODO can't we just get coco.imgs[idx'] somehow? 
+ next_img_idx = idx * 5 # in the coco dataset there are 5 captions for every image + ann_id = self.anno_ids[next_img_idx] + img_id = self.coco.anns[ann_id]['image_id'] + img_metadata = self.coco.imgs[img_id] + return img_metadata + + def __len__(self): + return self.num_imgs + + class BottomUpFeaturesDataset: def __init__(self, imgs_root, captions_json, features_path, split, ids=None, **kwargs): # which dataset? @@ -257,12 +316,112 @@ def get_raw_item(self, index, load_image=True): else: return root, caption, img_id, None, None, img_size - - def __len__(self): return len(self.ids) +class InferenceCollate(object): + def __new__(cls, *args, **kwargs): + # we only need to compute this once so it gets stored in a static class variable + cls.query_token_ids = None + cls.query_length = None + cls.img_feat_length = None + cls.img_feat_dim = None + cls.bboxes_length = None + cls.bboxes_dim = None + + return super(InferenceCollate, cls).__new__(cls) + + def __init__(self, config): + self.vocab_type = str(config['text-model']['name']).lower() + self.create_query_batch = bool(config['image-retrieval']['create_query_batch']) + if self.vocab_type == 'bert': + self.tokenizer = BertTokenizer.from_pretrained(config['text-model']['pretrain']) + else: + raise ValueError("Currently only BERT Tokenizer is supported!") + + @classmethod + def set_query_token_ids(cls, query_token_ids): + cls.query_token_ids = query_token_ids + cls.query_length = len(query_token_ids) + + @classmethod + def set_img_feat_length_and_dimension(cls, img_feat): + # +1 because the first region feature is reserved as CLS + cls.img_feat_length = img_feat.shape[0] + 1 + cls.img_feat_dim = img_feat.shape[1] + + @classmethod + def set_bboxes_length_and_dimension(cls, bbox): + # +1 because the first region feature is reserved as CLS + cls.bboxes_length = bbox.shape[0] + 1 + cls.bboxes_dim = bbox.shape[1] + + def __call__(self, data): + img_feats, img_feat_bboxes, img_ids, queries, dataset_indices = zip(*data) + """ + 
Build batch tensors from a list of (img_feats, img_feat_boxes, img_ids, queries, dataset_indices) tuples. + Args: + - img_feats: + - img_feat_bboxes: + - img_ids: + - queries: + - dataset_indices: + + Returns: + - img_feature_batch: batch of image features + - img_feat_bboxes_batch: batch of bounding boxes of the image features + - img_feat_length: length of the image features and bounding boxes (all of same size) + - query_token_ids: bert token ids of the tokenized query + - query_length: length of the query + - dataset_indices: indices of the elements of the datasets inside the batch. + """ + + # encode (tokenize) the query + if self.query_token_ids is None: + # we don't need to pad or truncate since we only have a single query + # TODO actually we don't even need the tokenizer twice so we could just use a local variable + query_token_ids = torch.LongTensor(self.tokenizer.encode(queries[0])) + self.set_query_token_ids(query_token_ids) + + # prepare image features + if self.img_feat_length is None: + self.set_img_feat_length_and_dimension(img_feats[0]) + + # prepare bounding boxes + if self.bboxes_length is None: + self.set_bboxes_length_and_dimension(img_feat_bboxes[0]) + + assert self.bboxes_length == self.img_feat_length + + # create the image feature batch + batch_size = len(img_feats) + img_feature_batch = torch.zeros(batch_size, self.img_feat_length, self.img_feat_dim) + for i, f in enumerate(img_feats): + # reserve the first token as CLS + img_feature_batch[i, 1:] = f + + # create the image features bounding boxes batch + img_feat_bboxes_batch = torch.zeros(batch_size, self.bboxes_length, self.bboxes_dim) + for i, box in enumerate(img_feat_bboxes): + img_feat_bboxes_batch[i, 1:] = box + + if self.create_query_batch: + # create the query batch + # since the token id is a scalar, the dim is 1 and whe don't need to add it to the batch + # for the BERT embeddings the ids have to be Long + query_batch = torch.zeros(batch_size, self.query_length).long() + for i 
in range(len(queries)): + query_batch[i] = self.query_token_ids + + query_lengths = [self.query_length for _ in range(batch_size)] + img_feat_lengths = [self.img_feat_length for _ in range(batch_size)] + + return img_feature_batch, img_feat_bboxes_batch, img_feat_lengths, query_batch, query_lengths, dataset_indices + else: + return img_feature_batch, img_feat_bboxes_batch, self.img_feat_length, self.query_token_ids, self.query_length, dataset_indices + + class Collate: def __init__(self, config): self.vocab_type = config['text-model']['name'] @@ -445,6 +604,47 @@ def get_loaders(config, workers, batch_size=None): return train_loader, val_loader +def get_coco_image_retrieval_data_loader(config, workers, query): + # create the dataset + loader + # 1) load / create a Coco Dataset to get meta info about images (we could also do this by hand) + # 2) choose (the first) N images and create a dataset with N samples where each sample consists of the n-th image + # and the query (gets repeated N times) # TODO maybe this is not necessary + + # get the directories that contain the coco json files and coco annotation ids (which we may not need, I think) + roots, coco_annotation_ids = get_paths(config) + + dataset_name = config['image-retrieval']['dataset'] + batch_size = config['image-retrieval']['batch_size'] + split_name = config['image-retrieval']['split'] + + imgs_root = roots[split_name]['img'] + + # for images we use pre-extracted features (not for text) + pre_extracted_img_features_root = config['image-retrieval']['pre-extracted-img-features-root'] + + captions_json = roots[split_name]['cap'] + coco_annotation_ids = coco_annotation_ids[split_name] + num_imgs = config['image-retrieval']['num_imgs'] + + dataset = CocoImageRetrievalDataset(imgs_root=imgs_root, + img_features_path=pre_extracted_img_features_root, + captions_json=captions_json, + coco_annotation_ids=coco_annotation_ids, + query=query, + num_imgs=num_imgs) + + # basically this creates the mini-batches which 
get passed to the model + collate_fn = InferenceCollate(config) + data_loader = torch.utils.data.DataLoader(dataset=dataset, + batch_size=batch_size, + shuffle=False, + pin_memory=True, + num_workers=workers, + collate_fn=collate_fn) + + return data_loader + + def get_test_loader(config, workers, split_name='test', batch_size=None): data_name = config['dataset']['name'] if batch_size is None: diff --git a/evaluation.py b/evaluation.py index 63b5313..f164cf0 100644 --- a/evaluation.py +++ b/evaluation.py @@ -1,7 +1,6 @@ from __future__ import print_function import time -from collections import OrderedDict import numpy import numpy as np diff --git a/inference.py b/inference.py index 6bb8dd5..035dc05 100644 --- a/inference.py +++ b/inference.py @@ -1,13 +1,139 @@ import argparse -from typing import List -from data import get_inference_loader +import os +import sys +import time +from typing import List, Any, Dict + +import numpy as np import torch +import tqdm import yaml +from data import get_coco_image_retrieval_data_loader +from models.loss import AlignmentContrastiveLoss from models.teran import TERAN +from utils import AverageMeter, LogCollector + + +def encode_data_for_inference(model: TERAN, data_loader, log_step=10, logging=print): + # compute the embedding vectors v_i, s_j (paper) for each image region and word respectively + # -> forwarding the data through the respective TE stacks + print('Computing image and query embeddings...') + encode_data_start_time = time.time() + + batch_time = AverageMeter() + val_logger = LogCollector() + + # we don't need autograd for inference + model.eval() + + # array to keep all the embeddings + # TODO maybe we can store those embeddings in an index and load it instead of computing each time for each query + query_embs = None + num_query_feats = None + num_img_feats = None # all images have a fixed size of pre-extracted features of 36 + 1 regions + img_embs = None + + start_time = time.time() + for i, (img_feature_batch, 
img_feat_bboxes_batch, img_feat_lengths, query_token_ids, query_lengths, + dataset_indices) in enumerate(data_loader): + + # make sure val logger is used + model.logger = val_logger + + # TODO + # in the first version just stack the query_token_ids, img_feat_length and query_length + # so that it has shape B x ? x ?, where B is len(img_feature_batch) (should be equal to bs set in the config) + # + # in the second version adapt model.forward_emb so that the embeddings get only computed once and then stacked + # to the same size as the img_embs + + # make sure val logger is used + model.logger = val_logger + + # compute the embeddings + with torch.no_grad(): + # TODO inside model.forward_emb we have to adapt the code for only a single query so that it doesn't get + # computed each time + _, _, img_emb, query_emb, _ = model.forward_emb(img_feature_batch, + query_token_ids, + img_feat_lengths, + query_lengths, + img_feat_bboxes_batch) + + # initialize the arrays given the size of the embeddings + if img_embs is None: + num_img_feats = img_feat_lengths[0] if isinstance(img_feat_lengths, list) else img_feat_lengths + num_query_feats = query_lengths[0] if isinstance(query_lengths, list) else query_lengths + img_feat_dim = img_emb.size(2) + query_feat_dim = query_emb.size(2) + img_embs = torch.zeros((len(data_loader.dataset), num_img_feats, img_feat_dim)) + query_embs = torch.zeros((len(data_loader.dataset), num_query_feats, query_feat_dim)) + + # preserve the embeddings by copying from gpu and converting to numpy + img_embs[dataset_indices, :, :] = img_emb.cpu().permute(1, 0, 2) + query_embs[dataset_indices, :, :] = query_emb.cpu().permute(1, 0, 2) + + # measure elapsed time per batch + batch_time.update(time.time() - start_time) + start_time = time.time() + + if i % log_step == 0: + logging( + f"Batch: [{i}/{len(data_loader)}]\t{str(model.logger)}\tTime {batch_time.val:.3f} ({batch_time.avg:.3f})") + del img_feature_batch, query_token_ids + + print(f"Time elapsed to 
encode data: {time.time() - encode_data_start_time} seconds.") + return img_embs, query_embs, num_img_feats, num_query_feats + +def compute_distance_sorted_indices(img_embs, query_embs, img_lengths, query_lengths, config): + # initialize similarity matrix evaluator + sim_matrix_fn = AlignmentContrastiveLoss(aggregation=config['image-retrieval']['alignment_mode'], + return_similarity_mat=True) + start_time = time.time() + img_embs_per_batch = 1000 # TODO config variable + img_emb_batches = 5 # TODO config / calc -def image_retrieval(checkpoint, opts, config) -> List[str]: + num_img_embs = img_embs.shape[0] + + # distances storage + distances = None + + # since its always the same query we can reuse the batch + # (TODO maybe we can even just use a batch of size 1?! -> check the sim_matrix_fn) + query_emb_batch = query_embs[:1] + query_length_batch = [query_lengths[0] if isinstance(query_lengths, list) else query_lengths for _ in range(1)] + query_emb_batch.cuda() + + # batch-wise compute the alignment distance between the images and the query + for i in tqdm.trange(img_emb_batches): + # create the current batch + img_embs_batch = img_embs[i * img_embs_per_batch:(i+1) * img_embs_per_batch] + img_embs_length_batch = [img_lengths for _ in range(img_embs_per_batch)] + img_embs_batch.cuda() + + # compute and pool the similarity matrices to get the global distance between the image and the query + alignment_distance = sim_matrix_fn(img_embs_batch, query_emb_batch, img_embs_length_batch, query_length_batch) + alignment_distance = alignment_distance.t().cpu().numpy() + + # store the distances + if distances is None: + distances = alignment_distance + else: + distances = np.concatenate([distances, alignment_distance], axis=1) + + # get the img indices descended sorted by the distance matrix + sorted_distance_indices = np.argsort(distances.squeeze())[::-1] + print(f"Time elapsed to compute and pool the similarity matrices: {time.time() - start_time} seconds.") + return 
sorted_distance_indices + + +def get_image_names(top_k_indices, data_loader) -> List[str]: + return [data_loader.dataset.get_image_metadata(idx)['file_name'] for idx in top_k_indices] + + +def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: # load model and options # checkpoint = torch.load(model_path) data_path = config['dataset']['data'] @@ -20,46 +146,62 @@ def image_retrieval(checkpoint, opts, config) -> List[str]: model.load_state_dict(checkpoint['model'], strict=False) print('Loading dataset') - dataloader = get_inference_loader(config, opts, workers=4) + data_loader = get_coco_image_retrieval_data_loader(config, + query=opts.query, + workers=opts.num_data_workers) + + # encode the data (i.e. compute the embeddings / TE outputs for the images and query) + img_embs, cap_embs, img_lengths, cap_lengths = encode_data_for_inference(model, data_loader) - return ["1", "2"] + torch.cuda.empty_cache() + print(f"Images: {img_embs.shape[0]}, Captions: {cap_embs.shape[0]}") + # compute the matching scores + distance_sorted_indices = compute_distance_sorted_indices(img_embs, cap_embs, img_lengths, cap_lengths, config) + top_k_indices = distance_sorted_indices[:opts.top_k] -def main(opts, current_config) -> List[str]: - checkpoint = torch.load(opts.checkpoint, map_location=torch.device(opts.device)) + # get the image names + top_k_images = get_image_names(top_k_indices, data_loader) + return top_k_images - print('Checkpoint loaded from {}'.format(opts.checkpoint)) - loaded_config = checkpoint['config'] - # Override some mandatory things in the configuration (paths) - if current_config is not None: - loaded_config['dataset']['images-path'] = current_config['dataset']['images-path'] - loaded_config['dataset']['data'] = current_config['dataset']['data'] - loaded_config['image-model']['pre-extracted-features-root'] = current_config['image-model'][ - 'pre-extracted-features-root'] +def prepare_model_checkpoint_and_config(opts): + checkpoint = 
torch.load(opts.model, map_location=torch.device(opts.device)) + print('Checkpoint loaded from {}'.format(opts.model)) + model_checkpoint_config = checkpoint['config'] - top_k_results = image_retrieval(checkpoint, opts, loaded_config) - return top_k_results + with open(opts.config, 'r') as yml_file: + loaded_config = yaml.load(yml_file) + # Override some mandatory things in the configuration + model_checkpoint_config['dataset']['images-path'] = loaded_config['dataset']['images-path'] + model_checkpoint_config['dataset']['data'] = loaded_config['dataset']['data'] + model_checkpoint_config['image-retrieval'] = loaded_config['image-retrieval'] + + return model_checkpoint_config, checkpoint if __name__ == '__main__': + print("CUDA_VISIBLE_DEVICES: " + os.getenv("CUDA_VISIBLE_DEVICES", "NOT SET - ABORTING")) + if os.getenv("CUDA_VISIBLE_DEVICES", None) is None: + sys.exit(1) + parser = argparse.ArgumentParser() - parser.add_argument('--model', type=str, help="Model (checkpoint) to load. E.g. pretrained_models/coco_MrSw.pth.tar" - , required=True) + parser.add_argument('--model', type=str, + help="Model (checkpoint) to load. E.g. pretrained_models/coco_MrSw.pth.tar", required=True) parser.add_argument('--query', type=str, required=True) - parser.add_argument('--device', type=str, choices=['cpu', 'gpu'], default='cpu') - parser.add_argument('--num_images', type=int, default=1000) - parser.add_argument('--top_k', type=int, default=10) - parser.add_argument('--dataset', type=str, choices=['coco', 'flickr30k'], default='coco') - parser.add_argument('--config', type=str, default=None, help="Which configuration to use for overriding the " - "checkpoint configuration. 
See into 'config' folder") - + parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], default='cuda') # cpu is only for local test runs + parser.add_argument('--num_data_workers', type=int, default=8) + parser.add_argument('--num_images', type=int, default=5000) + parser.add_argument('--top_k', type=int, default=100) + parser.add_argument('--dataset', type=str, choices=['coco'], default='coco') # TODO support other datasets + parser.add_argument('--config', type=str, default='configs/teran_coco_MrSw_IR.yaml', + help="Which configuration to use for overriding the checkpoint configuration. See into " + "'config' folder") opts = parser.parse_args() - if opts.config is not None: - with open(opts.config, 'r') as yml_file: - config = yaml.load(yml_file) - else: - config = None - top_k_results = main(opts, config) - print(f"######## TOP {opts.tok_k} RESULTS ########") - print(top_k_results) + + model_config, model_checkpoint = prepare_model_checkpoint_and_config(opts) + + top_k_matches = top_k_image_retrieval(opts, model_config, model_checkpoint) + + print(f"######## TOP {opts.top_k} RESULTS ########") + print(top_k_matches) diff --git a/utils.py b/utils.py index 6b46f17..822e782 100644 --- a/utils.py +++ b/utils.py @@ -1,3 +1,5 @@ +from collections import OrderedDict + import numpy from models.teran import TERAN From cbbc32767b22207ee8cc7714dcc34d4e6bafb01b Mon Sep 17 00:00:00 2001 From: floschne Date: Wed, 30 Dec 2020 17:27:10 +0100 Subject: [PATCH 13/21] further optimized computation time by only computing the query embedding once at IR inference --- configs/teran_coco_MrSw_IR.yaml | 6 +- data.py | 19 +++--- inference.py | 48 +++++++-------- models/teran.py | 101 ++++++++++++++++++-------------- models/utils.py | 3 +- 5 files changed, 95 insertions(+), 82 deletions(-) diff --git a/configs/teran_coco_MrSw_IR.yaml b/configs/teran_coco_MrSw_IR.yaml index 6d650c7..3b27218 100644 --- a/configs/teran_coco_MrSw_IR.yaml +++ b/configs/teran_coco_MrSw_IR.yaml @@ -9,9 
+9,9 @@ image-retrieval: dataset: 'coco' # for now only coco support split: 'test' # we can remove this in later versions num_imgs: 5000 - batch_size: 100 - pre-extracted-img-features-root: 'data/coco/features_36' - create_query_batch: True + batch_size: 100 # 100 takes ~10s; 1000 takes ~14s to encode the data (compute the TE outputs) + pre_extracted_img_features_root: 'data/coco/features_36' + create_query_batch: False alignment_mode: 'MrSw' diff --git a/data.py b/data.py index 2a2505c..971273c 100644 --- a/data.py +++ b/data.py @@ -402,24 +402,25 @@ def __call__(self, data): img_feature_batch[i, 1:] = f # create the image features bounding boxes batch + img_feat_lengths = [self.img_feat_length for _ in range(batch_size)] img_feat_bboxes_batch = torch.zeros(batch_size, self.bboxes_length, self.bboxes_dim) for i, box in enumerate(img_feat_bboxes): img_feat_bboxes_batch[i, 1:] = box if self.create_query_batch: - # create the query batch + # create the full query batch of size B x |Q| # since the token id is a scalar, the dim is 1 and whe don't need to add it to the batch # for the BERT embeddings the ids have to be Long - query_batch = torch.zeros(batch_size, self.query_length).long() + query_token_ids_batch = torch.zeros(batch_size, self.query_length).long() for i in range(len(queries)): - query_batch[i] = self.query_token_ids - + query_token_ids_batch[i] = self.query_token_ids query_lengths = [self.query_length for _ in range(batch_size)] - img_feat_lengths = [self.img_feat_length for _ in range(batch_size)] - - return img_feature_batch, img_feat_bboxes_batch, img_feat_lengths, query_batch, query_lengths, dataset_indices else: - return img_feature_batch, img_feat_bboxes_batch, self.img_feat_length, self.query_token_ids, self.query_length, dataset_indices + # create a pseudo query batch with only one element of size 1 x |Q| + query_token_ids_batch = self.query_token_ids.unsqueeze(dim=0) + query_lengths = [self.query_length] + + return img_feature_batch, 
img_feat_bboxes_batch, img_feat_lengths, query_token_ids_batch, query_lengths, dataset_indices class Collate: @@ -620,7 +621,7 @@ def get_coco_image_retrieval_data_loader(config, workers, query): imgs_root = roots[split_name]['img'] # for images we use pre-extracted features (not for text) - pre_extracted_img_features_root = config['image-retrieval']['pre-extracted-img-features-root'] + pre_extracted_img_features_root = config['image-retrieval']['pre_extracted_img_features_root'] captions_json = roots[split_name]['cap'] coco_annotation_ids = coco_annotation_ids[split_name] diff --git a/inference.py b/inference.py index 035dc05..5442a84 100644 --- a/inference.py +++ b/inference.py @@ -2,7 +2,7 @@ import os import sys import time -from typing import List, Any, Dict +from typing import List import numpy as np import torch @@ -34,45 +34,42 @@ def encode_data_for_inference(model: TERAN, data_loader, log_step=10, logging=pr num_img_feats = None # all images have a fixed size of pre-extracted features of 36 + 1 regions img_embs = None + # make sure val logger is used + model.logger = val_logger + start_time = time.time() - for i, (img_feature_batch, img_feat_bboxes_batch, img_feat_lengths, query_token_ids, query_lengths, + for i, (img_feature_batch, img_feat_bboxes_batch, img_feat_lengths, query_token_id_batch, query_lengths_batch, dataset_indices) in enumerate(data_loader): - # make sure val logger is used - model.logger = val_logger - - # TODO - # in the first version just stack the query_token_ids, img_feat_length and query_length - # so that it has shape B x ? 
x ?, where B is len(img_feature_batch) (should be equal to bs set in the config) - # - # in the second version adapt model.forward_emb so that the embeddings get only computed once and then stacked - # to the same size as the img_embs - - # make sure val logger is used - model.logger = val_logger + if query_embs is not None: + # set the query batch to None so it doesn't get forwarded by TERAN again (to safe computation) + query_token_id_batch = None + query_lengths_batch = None # compute the embeddings with torch.no_grad(): # TODO inside model.forward_emb we have to adapt the code for only a single query so that it doesn't get # computed each time _, _, img_emb, query_emb, _ = model.forward_emb(img_feature_batch, - query_token_ids, + query_token_id_batch, img_feat_lengths, - query_lengths, + query_lengths_batch, img_feat_bboxes_batch) # initialize the arrays given the size of the embeddings if img_embs is None: num_img_feats = img_feat_lengths[0] if isinstance(img_feat_lengths, list) else img_feat_lengths - num_query_feats = query_lengths[0] if isinstance(query_lengths, list) else query_lengths + num_query_feats = query_lengths_batch[0] if isinstance(query_lengths_batch, + list) else query_lengths_batch img_feat_dim = img_emb.size(2) query_feat_dim = query_emb.size(2) img_embs = torch.zeros((len(data_loader.dataset), num_img_feats, img_feat_dim)) - query_embs = torch.zeros((len(data_loader.dataset), num_query_feats, query_feat_dim)) + query_embs = torch.zeros((1, num_query_feats, query_feat_dim)) + query_embs[0, :, :] = query_emb.cpu().permute(1, 0, 2) # preserve the embeddings by copying from gpu and converting to numpy + # TODO we could persist them on the disk to further save time img_embs[dataset_indices, :, :] = img_emb.cpu().permute(1, 0, 2) - query_embs[dataset_indices, :, :] = query_emb.cpu().permute(1, 0, 2) # measure elapsed time per batch batch_time.update(time.time() - start_time) @@ -81,7 +78,7 @@ def encode_data_for_inference(model: TERAN, 
data_loader, log_step=10, logging=pr if i % log_step == 0: logging( f"Batch: [{i}/{len(data_loader)}]\t{str(model.logger)}\tTime {batch_time.val:.3f} ({batch_time.avg:.3f})") - del img_feature_batch, query_token_ids + del img_feature_batch, query_token_id_batch print(f"Time elapsed to encode data: {time.time() - encode_data_start_time} seconds.") return img_embs, query_embs, num_img_feats, num_query_feats @@ -92,10 +89,8 @@ def compute_distance_sorted_indices(img_embs, query_embs, img_lengths, query_len sim_matrix_fn = AlignmentContrastiveLoss(aggregation=config['image-retrieval']['alignment_mode'], return_similarity_mat=True) start_time = time.time() - img_embs_per_batch = 1000 # TODO config variable - img_emb_batches = 5 # TODO config / calc - - num_img_embs = img_embs.shape[0] + img_emb_batches = 1 # TODO config / calc + img_embs_per_batch = img_embs.size(0) // img_emb_batches # TODO config variable # distances storage distances = None @@ -109,7 +104,7 @@ def compute_distance_sorted_indices(img_embs, query_embs, img_lengths, query_len # batch-wise compute the alignment distance between the images and the query for i in tqdm.trange(img_emb_batches): # create the current batch - img_embs_batch = img_embs[i * img_embs_per_batch:(i+1) * img_embs_per_batch] + img_embs_batch = img_embs[i * img_embs_per_batch:(i + 1) * img_embs_per_batch] img_embs_length_batch = [img_lengths for _ in range(img_embs_per_batch)] img_embs_batch.cuda() @@ -189,7 +184,8 @@ def prepare_model_checkpoint_and_config(opts): parser.add_argument('--model', type=str, help="Model (checkpoint) to load. E.g. 
pretrained_models/coco_MrSw.pth.tar", required=True) parser.add_argument('--query', type=str, required=True) - parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], default='cuda') # cpu is only for local test runs + parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], + default='cuda') # cpu is only for local test runs parser.add_argument('--num_data_workers', type=int, default=8) parser.add_argument('--num_images', type=int, default=5000) parser.add_argument('--top_k', type=int, default=100) diff --git a/models/teran.py b/models/teran.py index f74e45a..8a638ab 100644 --- a/models/teran.py +++ b/models/teran.py @@ -1,16 +1,15 @@ import torch -import torch.nn.init +import torch.backends.cudnn as cudnn import torch.nn as nn import torch.nn.functional as F -import torch.backends.cudnn as cudnn +import torch.nn.init +from nltk.corpus import stopwords from transformers import BertTokenizer -from models.loss import ContrastiveLoss, PermInvMatchingLoss, AlignmentContrastiveLoss -from models.text import EncoderTextBERT, EncoderText -from models.visual import TransformerPostProcessing, EncoderImage - -from .utils import l2norm, PositionalEncodingImageBoxes, PositionalEncodingText, Aggregator, generate_square_subsequent_mask -from nltk.corpus import stopwords, words as nltk_words +from models.loss import ContrastiveLoss, AlignmentContrastiveLoss +from models.text import EncoderText +from models.visual import EncoderImage +from .utils import l2norm, Aggregator class JointTextImageTransformerEncoder(nn.Module): @@ -18,6 +17,7 @@ class JointTextImageTransformerEncoder(nn.Module): This is a bert caption encoder - transformer image encoder (using bottomup features). 
It process the encoder outputs through a transformer, like VilBERT and outputs two different graph embeddings """ + def __init__(self, config): super().__init__() self.txt_enc = EncoderText(config) @@ -36,8 +36,8 @@ def __init__(self, config): self.shared_transformer = config['model']['shared-transformer'] transformer_layer_1 = nn.TransformerEncoderLayer(d_model=embed_size, nhead=4, - dim_feedforward=2048, - dropout=dropout, activation='relu') + dim_feedforward=2048, + dropout=dropout, activation='relu') self.transformer_encoder_1 = nn.TransformerEncoder(transformer_layer_1, num_layers=layers) if not self.shared_transformer: @@ -52,15 +52,16 @@ def __init__(self, config): self.img_aggregation_type = config['model']['image-aggregation'] def forward(self, features, captions, feat_len, cap_len, boxes): - # process captions by using bert - full_cap_emb_aggr, c_emb = self.txt_enc(captions, cap_len) # B x S x cap_dim + if captions is not None: + # process captions by using bert + full_cap_emb_aggr, c_emb = self.txt_enc(captions, cap_len) # B x S x cap_dim + else: + full_cap_emb_aggr, full_cap_emb = None, None # process image regions using a two-layer transformer - full_img_emb_aggr, i_emb = self.img_enc(features, feat_len, boxes) # B x S x vis_dim + full_img_emb_aggr, i_emb = self.img_enc(features, feat_len, boxes) # B x S x vis_dim # i_emb = i_emb.permute(1, 0, 2) # B x S x vis_dim - bs = features.shape[0] - # if False: # # concatenate the embeddings together # max_summed_lengths = max([x + y for x, y in zip(feat_len, cap_len)]) @@ -84,44 +85,53 @@ def forward(self, features, captions, feat_len, cap_len, boxes): # forward the captions if self.text_aggregation_type is not None: - c_emb = self.cap_proj(c_emb) - - mask = torch.zeros(bs, max(cap_len)).bool() - mask = mask.to(features.device) - for m, c_len in zip(mask, cap_len): - m[c_len:] = True - full_cap_emb = self.transformer_encoder_1(c_emb.permute(1, 0, 2), src_key_padding_mask=mask) # S_txt x B x dim - 
full_cap_emb_aggr = self.text_aggregation(full_cap_emb, cap_len, mask) + if captions is not None: + c_emb = self.cap_proj(c_emb) + + cap_bs = captions.shape[0] + mask = torch.zeros(cap_bs, max(cap_len)).bool() + mask = mask.to(features.device) + for m, c_len in zip(mask, cap_len): + m[c_len:] = True + full_cap_emb = self.transformer_encoder_1(c_emb.permute(1, 0, 2), + src_key_padding_mask=mask) # S_txt x B x dim + full_cap_emb_aggr = self.text_aggregation(full_cap_emb, cap_len, mask) + + full_cap_emb_aggr = l2norm(full_cap_emb_aggr) + + # normalize even every vector of the set + full_cap_emb = F.normalize(full_cap_emb, p=2, dim=2) # else use the embedding output by the txt model - else: + elif self.text_aggregation_type is None: full_cap_emb = None # forward the regions if self.img_aggregation_type is not None: i_emb = self.img_proj(i_emb) - mask = torch.zeros(bs, max(feat_len)).bool() + feat_bs = features.shape[0] + mask = torch.zeros(feat_bs, max(feat_len)).bool() mask = mask.to(features.device) for m, v_len in zip(mask, feat_len): m[v_len:] = True if self.shared_transformer: - full_img_emb = self.transformer_encoder_1(i_emb.permute(1, 0, 2), src_key_padding_mask=mask) # S_txt x B x dim + full_img_emb = self.transformer_encoder_1(i_emb.permute(1, 0, 2), + src_key_padding_mask=mask) # S_txt x B x dim else: - full_img_emb = self.transformer_encoder_2(i_emb.permute(1, 0, 2), src_key_padding_mask=mask) # S_txt x B x dim + full_img_emb = self.transformer_encoder_2(i_emb.permute(1, 0, 2), + src_key_padding_mask=mask) # S_txt x B x dim full_img_emb_aggr = self.image_aggregation(full_img_emb, feat_len, mask) + full_img_emb_aggr = l2norm(full_img_emb_aggr) + # normalize even every vector of the set + full_img_emb = F.normalize(full_img_emb, p=2, dim=2) else: full_img_emb = None - full_cap_emb_aggr = l2norm(full_cap_emb_aggr) - full_img_emb_aggr = l2norm(full_img_emb_aggr) - - # normalize even every vector of the set - full_img_emb = F.normalize(full_img_emb, p=2, dim=2) - 
full_cap_emb = F.normalize(full_cap_emb, p=2, dim=2) - if self.order_embeddings: - full_cap_emb_aggr = torch.abs(full_cap_emb_aggr) + if captions is not None: + full_cap_emb_aggr = torch.abs(full_cap_emb_aggr) full_img_emb_aggr = torch.abs(full_img_emb_aggr) + return full_img_emb_aggr, full_cap_emb_aggr, full_img_emb, full_cap_emb @@ -145,7 +155,8 @@ def __init__(self, config): if 'alignment' in loss_type: self.alignment_criterion = AlignmentContrastiveLoss(margin=config['training']['margin'], measure=config['training']['measure'], - max_violation=config['training']['max-violation'], aggregation=config['training']['alignment-mode']) + max_violation=config['training']['max-violation'], + aggregation=config['training']['alignment-mode']) if 'matching' in loss_type: self.matching_criterion = ContrastiveLoss(margin=config['training']['margin'], measure=config['training']['measure'], @@ -181,18 +192,20 @@ def __init__(self, config): # self.txt_enc.eval() def forward_emb(self, images, captions, img_len, cap_len, boxes): - """Compute the image and caption embeddings + """ + Compute the image and caption embeddings """ # Set mini-batch dataset if torch.cuda.is_available(): images = images.cuda() - captions = captions.cuda() boxes = boxes.cuda() + if captions is not None: + captions = captions.cuda() # Forward img_emb_aggr, cap_emb_aggr, img_feats, cap_feats = self.img_txt_enc(images, captions, img_len, cap_len, boxes) - if self.tokenizer is not None: + if self.tokenizer is not None and captions is not None: # remove stopwords # keep only word indexes that are not stopwords good_word_indexes = [[i for i, (tok, w) in enumerate(zip(self.tokenizer.convert_ids_to_tokens(ids), ids)) if @@ -200,8 +213,8 @@ def forward_emb(self, images, captions, img_len, cap_len, boxes): cap_len = [len(w) - (cap_feats.shape[0] - orig_len) for w, orig_len in zip(good_word_indexes, cap_len)] min_cut_len = min([len(w) for w in good_word_indexes]) good_word_indexes = [words[:min_cut_len] for words in 
good_word_indexes] - good_word_indexes = torch.LongTensor(good_word_indexes).to(cap_feats.device) # B x S - good_word_indexes = good_word_indexes.t().unsqueeze(2).expand(-1, -1, cap_feats.shape[2]) # S x B x dim + good_word_indexes = torch.LongTensor(good_word_indexes).to(cap_feats.device) # B x S + good_word_indexes = good_word_indexes.t().unsqueeze(2).expand(-1, -1, cap_feats.shape[2]) # S x B x dim cap_feats = cap_feats.gather(dim=0, index=good_word_indexes) return img_emb_aggr, cap_emb_aggr, img_feats, cap_feats, cap_len @@ -262,10 +275,12 @@ def forward(self, images, targets, img_lengths, cap_lengths, boxes=None, ids=Non else: text = targets captions = targets - wembeddings = self.img_txt_enc.txt_enc.word_embeddings(captions.cuda() if torch.cuda.is_available() else captions) + wembeddings = self.img_txt_enc.txt_enc.word_embeddings( + captions.cuda() if torch.cuda.is_available() else captions) # compute the embeddings - img_emb_aggr, cap_emb_aggr, img_feats, cap_feats, cap_lengths = self.forward_emb(images, text, img_lengths, cap_lengths, boxes) + img_emb_aggr, cap_emb_aggr, img_feats, cap_feats, cap_lengths = self.forward_emb(images, text, img_lengths, + cap_lengths, boxes) # NOTE: img_feats and cap_feats are S x B x dim loss_dict = self.forward_loss(img_emb_aggr, cap_emb_aggr, img_feats, cap_feats, img_lengths, cap_lengths) diff --git a/models/utils.py b/models/utils.py index 4f32bd4..3f0bac3 100644 --- a/models/utils.py +++ b/models/utils.py @@ -87,7 +87,8 @@ def forward(self, x, boxes): # x is seq_len x B x dim def l2norm(X): - """L2-normalize columns of X + """ + L2-normalize columns of X """ norm = torch.pow(X, 2).sum(dim=1, keepdim=True).sqrt() X = torch.div(X, norm) From 5798289cb16f216d93ec356c3b399cc3d2432e80 Mon Sep 17 00:00:00 2001 From: floschne Date: Wed, 30 Dec 2020 17:59:12 +0100 Subject: [PATCH 14/21] CocoImageRetrievalDataset is now inheriting from torch.data.Dataset --- data.py | 4 ++-- inference.py | 5 ----- 2 files changed, 2 insertions(+), 
7 deletions(-) diff --git a/data.py b/data.py index 971273c..5792382 100644 --- a/data.py +++ b/data.py @@ -147,7 +147,7 @@ def __len__(self): return len(self.annotation_ids) -class CocoImageRetrievalDataset: +class CocoImageRetrievalDataset(data.Dataset): """ Custom COCO Dataset that uses only the images together with a user query. Compatible with torch.utils.data.DataLoader. @@ -634,7 +634,7 @@ def get_coco_image_retrieval_data_loader(config, workers, query): query=query, num_imgs=num_imgs) - # basically this creates the mini-batches which get passed to the model + # this creates the batches which get passed to the model (inside the query gets repeated or not based on the config) collate_fn = InferenceCollate(config) data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, diff --git a/inference.py b/inference.py index 5442a84..5ebca8e 100644 --- a/inference.py +++ b/inference.py @@ -129,11 +129,6 @@ def get_image_names(top_k_indices, data_loader) -> List[str]: def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: - # load model and options - # checkpoint = torch.load(model_path) - data_path = config['dataset']['data'] - measure = config['training']['measure'] - # construct model model = TERAN(config) From 372f5888e1975d19f1bdf11bf1d7fc9512ac9207 Mon Sep 17 00:00:00 2001 From: floschne Date: Thu, 31 Dec 2020 10:36:36 +0100 Subject: [PATCH 15/21] Splitted computation of img and txt embeddings in TERAN --- models/teran.py | 150 ++++++++++++++++++++++++++++-------------------- 1 file changed, 87 insertions(+), 63 deletions(-) diff --git a/models/teran.py b/models/teran.py index 8a638ab..cf48e23 100644 --- a/models/teran.py +++ b/models/teran.py @@ -51,61 +51,41 @@ def __init__(self, config): self.text_aggregation_type = config['model']['text-aggregation'] self.img_aggregation_type = config['model']['image-aggregation'] - def forward(self, features, captions, feat_len, cap_len, boxes): - if captions is not None: - # process 
captions by using bert - full_cap_emb_aggr, c_emb = self.txt_enc(captions, cap_len) # B x S x cap_dim - else: - full_cap_emb_aggr, full_cap_emb = None, None - - # process image regions using a two-layer transformer - full_img_emb_aggr, i_emb = self.img_enc(features, feat_len, boxes) # B x S x vis_dim - # i_emb = i_emb.permute(1, 0, 2) # B x S x vis_dim - - # if False: - # # concatenate the embeddings together - # max_summed_lengths = max([x + y for x, y in zip(feat_len, cap_len)]) - # i_c_emb = torch.zeros(bs, max_summed_lengths, self.embed_size) - # i_c_emb = i_c_emb.to(features.device) - # mask = torch.zeros(bs, max_summed_lengths).bool() - # mask = mask.to(features.device) - # for i_c, m, i, c, i_len, c_len in zip(i_c_emb, mask, i_emb, c_emb, feat_len, cap_len): - # i_c[:c_len] = c[:c_len] - # i_c[c_len:c_len + i_len] = i[:i_len] - # m[c_len + i_len:] = True - # - # i_c_emb = i_c_emb.permute(1, 0, 2) # S_vis + S_txt x B x dim - # out = self.transformer_encoder(i_c_emb, src_key_padding_mask=mask) # S_vis + S_txt x B x dim - # - # full_cap_emb = out[0, :, :] - # I = torch.LongTensor(cap_len).view(1, -1, 1) - # I = I.expand(1, bs, self.embed_size).to(features.device) - # full_img_emb = torch.gather(out, dim=0, index=I).squeeze(0) - # else: + def forward_txt(self, captions, cap_len): + # process captions by using bert + full_cap_emb_aggr, c_emb = self.txt_enc(captions, cap_len) # B x S x cap_dim # forward the captions if self.text_aggregation_type is not None: - if captions is not None: - c_emb = self.cap_proj(c_emb) + c_emb = self.cap_proj(c_emb) - cap_bs = captions.shape[0] - mask = torch.zeros(cap_bs, max(cap_len)).bool() - mask = mask.to(features.device) - for m, c_len in zip(mask, cap_len): - m[c_len:] = True - full_cap_emb = self.transformer_encoder_1(c_emb.permute(1, 0, 2), - src_key_padding_mask=mask) # S_txt x B x dim - full_cap_emb_aggr = self.text_aggregation(full_cap_emb, cap_len, mask) + cap_bs = captions.shape[0] + mask = torch.zeros(cap_bs, 
max(cap_len)).bool() + mask = mask.to(captions.device) + for m, c_len in zip(mask, cap_len): + m[c_len:] = True + full_cap_emb = self.transformer_encoder_1(c_emb.permute(1, 0, 2), + src_key_padding_mask=mask) # S_txt x B x dim + full_cap_emb_aggr = self.text_aggregation(full_cap_emb, cap_len, mask) - full_cap_emb_aggr = l2norm(full_cap_emb_aggr) + full_cap_emb_aggr = l2norm(full_cap_emb_aggr) - # normalize even every vector of the set - full_cap_emb = F.normalize(full_cap_emb, p=2, dim=2) + # normalize even every vector of the set + full_cap_emb = F.normalize(full_cap_emb, p=2, dim=2) # else use the embedding output by the txt model - elif self.text_aggregation_type is None: + else: full_cap_emb = None + if self.order_embeddings: + full_cap_emb_aggr = torch.abs(full_cap_emb_aggr) + + return full_cap_emb_aggr, full_cap_emb + + def forward_img(self, features, feat_len, boxes): + # process image regions using a two-layer transformer + full_img_emb_aggr, i_emb = self.img_enc(features, feat_len, boxes) # B x S x vis_dim # forward the regions + if self.img_aggregation_type is not None: i_emb = self.img_proj(i_emb) @@ -116,11 +96,12 @@ def forward(self, features, captions, feat_len, cap_len, boxes): m[v_len:] = True if self.shared_transformer: full_img_emb = self.transformer_encoder_1(i_emb.permute(1, 0, 2), - src_key_padding_mask=mask) # S_txt x B x dim + src_key_padding_mask=mask) # S_img x B x dim else: full_img_emb = self.transformer_encoder_2(i_emb.permute(1, 0, 2), - src_key_padding_mask=mask) # S_txt x B x dim + src_key_padding_mask=mask) # S_img x B x dim full_img_emb_aggr = self.image_aggregation(full_img_emb, feat_len, mask) + full_img_emb_aggr = l2norm(full_img_emb_aggr) # normalize even every vector of the set full_img_emb = F.normalize(full_img_emb, p=2, dim=2) @@ -128,10 +109,23 @@ def forward(self, features, captions, feat_len, cap_len, boxes): full_img_emb = None if self.order_embeddings: - if captions is not None: - full_cap_emb_aggr = 
torch.abs(full_cap_emb_aggr) full_img_emb_aggr = torch.abs(full_img_emb_aggr) + return full_img_emb_aggr, full_img_emb + + def forward(self, features, captions, feat_len, cap_len, boxes): + if captions is not None: + # process captions + full_cap_emb_aggr, full_cap_emb = self.forward_txt(captions, cap_len) + else: + full_cap_emb_aggr, full_cap_emb = None, None + + if features is not None: + # process image regions + full_img_emb_aggr, full_img_emb = self.forward_img(features, feat_len, boxes) + else: + full_img_emb_aggr, full_img_emb = None, None + return full_img_emb_aggr, full_cap_emb_aggr, full_img_emb, full_cap_emb @@ -191,14 +185,29 @@ def __init__(self, config): # self.img_enc.eval() # self.txt_enc.eval() + def remove_stopwords(self, captions, cap_feats, cap_len): + # remove stopwords + # keep only word indexes that are not stopwords + good_word_indexes = [[i for i, (tok, w) in enumerate(zip(self.tokenizer.convert_ids_to_tokens(ids), ids)) if + tok not in self.en_stops or w == 0] for ids in captions] # keeps the padding + cap_len = [len(w) - (cap_feats.shape[0] - orig_len) for w, orig_len in zip(good_word_indexes, cap_len)] + min_cut_len = min([len(w) for w in good_word_indexes]) + good_word_indexes = [words[:min_cut_len] for words in good_word_indexes] + good_word_indexes = torch.LongTensor(good_word_indexes).to(cap_feats.device) # B x S + good_word_indexes = good_word_indexes.t().unsqueeze(2).expand(-1, -1, cap_feats.shape[2]) # S x B x dim + cap_feats = cap_feats.gather(dim=0, index=good_word_indexes) + + return cap_feats, cap_len + def forward_emb(self, images, captions, img_len, cap_len, boxes): """ Compute the image and caption embeddings """ # Set mini-batch dataset if torch.cuda.is_available(): - images = images.cuda() - boxes = boxes.cuda() + if images is not None and boxes is not None: + images = images.cuda() + boxes = boxes.cuda() if captions is not None: captions = captions.cuda() @@ -206,19 +215,31 @@ def forward_emb(self, images, captions, 
img_len, cap_len, boxes): img_emb_aggr, cap_emb_aggr, img_feats, cap_feats = self.img_txt_enc(images, captions, img_len, cap_len, boxes) if self.tokenizer is not None and captions is not None: - # remove stopwords - # keep only word indexes that are not stopwords - good_word_indexes = [[i for i, (tok, w) in enumerate(zip(self.tokenizer.convert_ids_to_tokens(ids), ids)) if - tok not in self.en_stops or w == 0] for ids in captions] # keeps the padding - cap_len = [len(w) - (cap_feats.shape[0] - orig_len) for w, orig_len in zip(good_word_indexes, cap_len)] - min_cut_len = min([len(w) for w in good_word_indexes]) - good_word_indexes = [words[:min_cut_len] for words in good_word_indexes] - good_word_indexes = torch.LongTensor(good_word_indexes).to(cap_feats.device) # B x S - good_word_indexes = good_word_indexes.t().unsqueeze(2).expand(-1, -1, cap_feats.shape[2]) # S x B x dim - cap_feats = cap_feats.gather(dim=0, index=good_word_indexes) + cap_feats, cap_len = self.remove_stopwords(captions, cap_feats, cap_len) return img_emb_aggr, cap_emb_aggr, img_feats, cap_feats, cap_len + def forward_txt_emb(self, captions, cap_len): + """ + compute txt embeddings only + """ + if torch.cuda.is_available(): + captions = captions.cuda() + cap_emb_aggr, cap_feats = self.img_txt_enc.forward_txt(captions, cap_len) + if self.tokenizer is not None and captions is not None: + cap_feats, cap_len = self.remove_stopwords(captions, cap_feats, cap_len) + return cap_emb_aggr, cap_feats, cap_len + + def forward_img_emb(self, images, img_len, boxes): + """ + compute img embeddings only + """ + if torch.cuda.is_available(): + images = images.cuda() + boxes = boxes.cuda() + img_emb_aggr, img_feats = self.img_txt_enc.forward_img(images, img_len, boxes) + return img_emb_aggr, img_feats + def get_parameters(self): lr_multiplier = 1.0 if self.config['text-model']['fine-tune'] else 0.0 @@ -279,8 +300,11 @@ def forward(self, images, targets, img_lengths, cap_lengths, boxes=None, ids=Non captions.cuda() 
if torch.cuda.is_available() else captions) # compute the embeddings - img_emb_aggr, cap_emb_aggr, img_feats, cap_feats, cap_lengths = self.forward_emb(images, text, img_lengths, - cap_lengths, boxes) + img_emb_aggr, cap_emb_aggr, img_feats, cap_feats, cap_lengths = self.forward_emb(images, + text, + img_lengths, + cap_lengths, + boxes) # NOTE: img_feats and cap_feats are S x B x dim loss_dict = self.forward_loss(img_emb_aggr, cap_emb_aggr, img_feats, cap_feats, img_lengths, cap_lengths) From deb7cd0a6810d7800b0ceea42a9dd8b9ce81231e Mon Sep 17 00:00:00 2001 From: floschne Date: Thu, 31 Dec 2020 10:39:34 +0100 Subject: [PATCH 16/21] implemented pre-computation of img embeddings --- configs/teran_coco_MrSw_IR.yaml | 4 +- data.py | 21 +++-- inference.py | 142 +++++++++++++++++++++----------- 3 files changed, 108 insertions(+), 59 deletions(-) diff --git a/configs/teran_coco_MrSw_IR.yaml b/configs/teran_coco_MrSw_IR.yaml index 3b27218..0c79a1f 100644 --- a/configs/teran_coco_MrSw_IR.yaml +++ b/configs/teran_coco_MrSw_IR.yaml @@ -13,8 +13,8 @@ image-retrieval: pre_extracted_img_features_root: 'data/coco/features_36' create_query_batch: False alignment_mode: 'MrSw' - - + use_precomputed_img_embeddings: False + pre_computed_img_embeddings_root: 'data/coco/pre_computed_embeddings' text-model: name: 'bert' diff --git a/data.py b/data.py index 5792382..27e73b5 100644 --- a/data.py +++ b/data.py @@ -332,12 +332,13 @@ def __new__(cls, *args, **kwargs): return super(InferenceCollate, cls).__new__(cls) - def __init__(self, config): + def __init__(self, config, pre_compute_img_embs): self.vocab_type = str(config['text-model']['name']).lower() self.create_query_batch = bool(config['image-retrieval']['create_query_batch']) - if self.vocab_type == 'bert': + self.pre_compute_img_embs = pre_compute_img_embs + if self.vocab_type == 'bert' and not pre_compute_img_embs: self.tokenizer = BertTokenizer.from_pretrained(config['text-model']['pretrain']) - else: + elif self.vocab_type != 
'bert': raise ValueError("Currently only BERT Tokenizer is supported!") @classmethod @@ -378,7 +379,7 @@ def __call__(self, data): """ # encode (tokenize) the query - if self.query_token_ids is None: + if self.query_token_ids is None and not self.pre_compute_img_embs: # we don't need to pad or truncate since we only have a single query # TODO actually we don't even need the tokenizer twice so we could just use a local variable query_token_ids = torch.LongTensor(self.tokenizer.encode(queries[0])) @@ -407,7 +408,7 @@ def __call__(self, data): for i, box in enumerate(img_feat_bboxes): img_feat_bboxes_batch[i, 1:] = box - if self.create_query_batch: + if self.create_query_batch and not self.pre_compute_img_embs: # create the full query batch of size B x |Q| # since the token id is a scalar, the dim is 1 and whe don't need to add it to the batch # for the BERT embeddings the ids have to be Long @@ -415,10 +416,14 @@ def __call__(self, data): for i in range(len(queries)): query_token_ids_batch[i] = self.query_token_ids query_lengths = [self.query_length for _ in range(batch_size)] - else: + elif not self.create_query_batch and not self.pre_compute_img_embs: # create a pseudo query batch with only one element of size 1 x |Q| query_token_ids_batch = self.query_token_ids.unsqueeze(dim=0) query_lengths = [self.query_length] + else: # self.pre_compute_img_embs == True + # when pre-computing the image embeddings, we don't need (and have) information about the query + query_token_ids_batch = None + query_lengths = None return img_feature_batch, img_feat_bboxes_batch, img_feat_lengths, query_token_ids_batch, query_lengths, dataset_indices @@ -605,7 +610,7 @@ def get_loaders(config, workers, batch_size=None): return train_loader, val_loader -def get_coco_image_retrieval_data_loader(config, workers, query): +def get_coco_image_retrieval_data_loader(config, workers, query, pre_compute_img_embs=False): # create the dataset + loader # 1) load / create a Coco Dataset to get meta info 
about images (we could also do this by hand) # 2) choose (the first) N images and create a dataset with N samples where each sample consists of the n-th image @@ -635,7 +640,7 @@ def get_coco_image_retrieval_data_loader(config, workers, query): num_imgs=num_imgs) # this creates the batches which get passed to the model (inside the query gets repeated or not based on the config) - collate_fn = InferenceCollate(config) + collate_fn = InferenceCollate(config, pre_compute_img_embs) data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False, diff --git a/inference.py b/inference.py index 5ebca8e..45f5806 100644 --- a/inference.py +++ b/inference.py @@ -2,6 +2,7 @@ import os import sys import time +from pathlib import Path from typing import List import numpy as np @@ -15,14 +16,26 @@ from utils import AverageMeter, LogCollector -def encode_data_for_inference(model: TERAN, data_loader, log_step=10, logging=print): +def persist_img_embs(config, data_loader, dataset_indices, numpy_img_emb): + dst_root = Path(os.getcwd()).joinpath(config['image-retrieval']['pre_computed_img_embeddings_root']) + if not dst_root.exists(): + dst_root.mkdir(parents=True, exist_ok=True) + + assert len(dataset_indices) == len(numpy_img_emb) + img_names = get_image_names(dataset_indices, data_loader) + # TODO do we want to store them in one big npz? 
+ for idx in range(len(img_names)): + dst = dst_root.joinpath(img_names[idx] + '.npz') + if dst.exists(): + continue + np.savez_compressed(str(dst), img_emb=numpy_img_emb[idx]) + + +def encode_data_for_inference(model: TERAN, data_loader, log_step=10, logging=print, pre_compute_img_embs=False): # compute the embedding vectors v_i, s_j (paper) for each image region and word respectively # -> forwarding the data through the respective TE stacks - print('Computing image and query embeddings...') - encode_data_start_time = time.time() - - batch_time = AverageMeter() - val_logger = LogCollector() + print( + f'{"Pre-" if pre_compute_img_embs else ""}Computing image {"" if pre_compute_img_embs else "and query "}embeddings...') # we don't need autograd for inference model.eval() @@ -35,56 +48,61 @@ def encode_data_for_inference(model: TERAN, data_loader, log_step=10, logging=pr img_embs = None # make sure val logger is used + batch_time = AverageMeter() + val_logger = LogCollector() model.logger = val_logger start_time = time.time() - for i, (img_feature_batch, img_feat_bboxes_batch, img_feat_lengths, query_token_id_batch, query_lengths_batch, + for i, (img_feature_batch, img_feat_bboxes_batch, img_feat_len_batch, query_token_batch, query_len_batch, dataset_indices) in enumerate(data_loader): + batch_start_time = time.time() + """ + the data loader returns None values for the respective batches if the only query was already loaded + -> query_token_batch, query_len_batch = None, None + """ - if query_embs is not None: - # set the query batch to None so it doesn't get forwarded by TERAN again (to safe computation) - query_token_id_batch = None - query_lengths_batch = None - - # compute the embeddings with torch.no_grad(): - # TODO inside model.forward_emb we have to adapt the code for only a single query so that it doesn't get - # computed each time - _, _, img_emb, query_emb, _ = model.forward_emb(img_feature_batch, - query_token_id_batch, - img_feat_lengths, - 
query_lengths_batch, - img_feat_bboxes_batch) - - # initialize the arrays given the size of the embeddings - if img_embs is None: - num_img_feats = img_feat_lengths[0] if isinstance(img_feat_lengths, list) else img_feat_lengths - num_query_feats = query_lengths_batch[0] if isinstance(query_lengths_batch, - list) else query_lengths_batch - img_feat_dim = img_emb.size(2) + # compute the query embedding only in the first iteration (also because there is only 1 query in IR) + if query_embs is None and not pre_compute_img_embs: + # TODO maybe we can get the most matching roi from query_emb_aggr? + query_emb_aggr, query_emb, _ = model.forward_txt_emb(query_token_batch, query_len_batch) + + # store results as np arrays for further processing or persisting + num_query_feats = query_len_batch[0] if isinstance(query_len_batch, list) else query_len_batch query_feat_dim = query_emb.size(2) - img_embs = torch.zeros((len(data_loader.dataset), num_img_feats, img_feat_dim)) query_embs = torch.zeros((1, num_query_feats, query_feat_dim)) query_embs[0, :, :] = query_emb.cpu().permute(1, 0, 2) - # preserve the embeddings by copying from gpu and converting to numpy - # TODO we could persist them on the disk to further save time - img_embs[dataset_indices, :, :] = img_emb.cpu().permute(1, 0, 2) + # compute every image embedding in the dataset + img_emb_aggr, img_emb = model.forward_img_emb(img_feature_batch, img_feat_len_batch, img_feat_bboxes_batch) + + # init array to store results for further processing or persisting + if img_embs is None: + num_img_feats = img_feat_len_batch[0] if isinstance(img_feat_len_batch, + list) else img_feat_len_batch + img_feat_dim = img_emb.size(2) + img_embs = torch.zeros((len(data_loader.dataset), num_img_feats, img_feat_dim)) + + numpy_img_emb = img_emb.cpu().permute(1, 0, 2) # why are we permuting here? 
-> TERAN + img_embs[dataset_indices, :, :] = numpy_img_emb + if pre_compute_img_embs: + # if we are in a pre-compute run, persist the arrays + persist_img_embs(model_config, data_loader, dataset_indices, numpy_img_emb) # measure elapsed time per batch - batch_time.update(time.time() - start_time) - start_time = time.time() + batch_time.update(time.time() - batch_start_time) if i % log_step == 0: logging( f"Batch: [{i}/{len(data_loader)}]\t{str(model.logger)}\tTime {batch_time.val:.3f} ({batch_time.avg:.3f})") - del img_feature_batch, query_token_id_batch + del img_feature_batch, query_token_batch - print(f"Time elapsed to encode data: {time.time() - encode_data_start_time} seconds.") + print( + f"Time elapsed to {'encode' if not pre_compute_img_embs else 'encode and persist'} data: {time.time() - start_time} seconds.") return img_embs, query_embs, num_img_feats, num_query_feats -def compute_distance_sorted_indices(img_embs, query_embs, img_lengths, query_lengths, config): +def compute_distances(img_embs, query_embs, img_lengths, query_lengths, config): # initialize similarity matrix evaluator sim_matrix_fn = AlignmentContrastiveLoss(aggregation=config['image-retrieval']['alignment_mode'], return_similarity_mat=True) @@ -124,8 +142,8 @@ def compute_distance_sorted_indices(img_embs, query_embs, img_lengths, query_len return sorted_distance_indices -def get_image_names(top_k_indices, data_loader) -> List[str]: - return [data_loader.dataset.get_image_metadata(idx)['file_name'] for idx in top_k_indices] +def get_image_names(dataset_indices, data_loader) -> List[str]: + return [data_loader.dataset.get_image_metadata(idx)['file_name'] for idx in dataset_indices] def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: @@ -147,7 +165,7 @@ def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: print(f"Images: {img_embs.shape[0]}, Captions: {cap_embs.shape[0]}") # compute the matching scores - distance_sorted_indices = 
compute_distance_sorted_indices(img_embs, cap_embs, img_lengths, cap_lengths, config) + distance_sorted_indices = compute_distances(img_embs, cap_embs, img_lengths, cap_lengths, config) top_k_indices = distance_sorted_indices[:opts.top_k] # get the image names @@ -170,6 +188,24 @@ def prepare_model_checkpoint_and_config(opts): return model_checkpoint_config, checkpoint +def pre_compute_img_embeddings(opts, config, checkpoint): + # construct model + model = TERAN(config) + + # load model state + + model.load_state_dict(checkpoint['model'], strict=False) + + print('Loading dataset') + data_loader = get_coco_image_retrieval_data_loader(config, + query=opts.query, + workers=opts.num_data_workers, + pre_compute_img_embs=True) + + # encode the data (i.e. compute the embeddings / TE outputs for the images and query) + encode_data_for_inference(model, data_loader, pre_compute_img_embs=True) + + if __name__ == '__main__': print("CUDA_VISIBLE_DEVICES: " + os.getenv("CUDA_VISIBLE_DEVICES", "NOT SET - ABORTING")) if os.getenv("CUDA_VISIBLE_DEVICES", None) is None: @@ -178,21 +214,29 @@ def prepare_model_checkpoint_and_config(opts): parser = argparse.ArgumentParser() parser.add_argument('--model', type=str, help="Model (checkpoint) to load. E.g. 
pretrained_models/coco_MrSw.pth.tar", required=True) - parser.add_argument('--query', type=str, required=True) - parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], - default='cuda') # cpu is only for local test runs + parser.add_argument('--pre_compute_img_embeddings', action='store_true', help="If set or true, the image " + "embeddings get precomputed and " + "persisted at the directory " + "specified in the config.") + parser.add_argument('--query', type=str, required='--pre_compute_img_embeddings' not in sys.argv) parser.add_argument('--num_data_workers', type=int, default=8) parser.add_argument('--num_images', type=int, default=5000) parser.add_argument('--top_k', type=int, default=100) parser.add_argument('--dataset', type=str, choices=['coco'], default='coco') # TODO support other datasets - parser.add_argument('--config', type=str, default='configs/teran_coco_MrSw_IR.yaml', - help="Which configuration to use for overriding the checkpoint configuration. See into " - "'config' folder") + parser.add_argument('--config', type=str, default='configs/teran_coco_MrSw_IR.yaml', help="Which configuration to " + "use for overriding the" + " checkpoint " + "configuration. 
See " + "into 'config' folder") + # cpu is only for local test runs + parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], default='cuda') opts = parser.parse_args() model_config, model_checkpoint = prepare_model_checkpoint_and_config(opts) - top_k_matches = top_k_image_retrieval(opts, model_config, model_checkpoint) - - print(f"######## TOP {opts.top_k} RESULTS ########") - print(top_k_matches) + if not opts.pre_compute_img_embeddings: + top_k_matches = top_k_image_retrieval(opts, model_config, model_checkpoint) + print(f"######## TOP {opts.top_k} RESULTS ########") + print(top_k_matches) + else: + pre_compute_img_embeddings(opts, model_config, model_checkpoint) From 8b9e3ff34d51545e7137bf1cd1cf9a0e855c4be0 Mon Sep 17 00:00:00 2001 From: floschne Date: Thu, 31 Dec 2020 13:19:20 +0100 Subject: [PATCH 17/21] using pre-computed image embeddings --- configs/teran_coco_MrSw_IR_PreComp.yaml | 63 ++++++++++ data.py | 148 ++++++++++++++++-------- evaluation.py | 2 +- inference.py | 67 +++++++---- models/teran.py | 4 +- 5 files changed, 214 insertions(+), 70 deletions(-) create mode 100644 configs/teran_coco_MrSw_IR_PreComp.yaml diff --git a/configs/teran_coco_MrSw_IR_PreComp.yaml b/configs/teran_coco_MrSw_IR_PreComp.yaml new file mode 100644 index 0000000..7be7a6e --- /dev/null +++ b/configs/teran_coco_MrSw_IR_PreComp.yaml @@ -0,0 +1,63 @@ +dataset: + name: 'coco' + images-path: 'data/coco/images' # not needed if using pre-extracted bottom-up features + data: 'data' + restval: True + pre-extracted-features: False + +image-retrieval: + dataset: 'coco' # for now only coco support + split: 'test' # we can remove this in later versions + num_imgs: 5000 + batch_size: 100 # 100 takes ~10s; 1000 takes ~14s to encode the data (compute the TE outputs) + pre_extracted_img_features_root: 'data/coco/features_36' + create_query_batch: False + alignment_mode: 'MrSw' + use_precomputed_img_embeddings: True + pre_computed_img_embeddings_root: 
'data/coco/pre_computed_embeddings' + +text-model: + name: 'bert' + pretrain: 'bert-base-uncased' + word-dim: 768 + extraction-hidden-layer: 6 + fine-tune: True + pre-extracted: False + layers: 0 + dropout: 0.1 + +image-model: + name: 'bottomup' + pre-extracted-features-root: 'data/coco/features_36' + transformer-layers: 4 + dropout: 0.1 + pos-encoding: 'concat-and-process' + crop-size: 224 # not used + fine-tune: False + feat-dim: 2048 + norm: True + +model: + name: 'teran' + embed-size: 1024 + text-aggregation: 'first' + image-aggregation: 'first' + layers: 2 + exclude-stopwords: False + shared-transformer: False + dropout: 0.1 + +training: + lr: 0.00001 # 0.000006 + grad-clip: 2.0 + max-violation: True + loss-type: 'alignment' + alignment-mode: 'MrSw' + measure: 'dot' + margin: 0.2 + bs: 40 + scheduler: 'steplr' + gamma: 0.1 + step-size: 20 + warmup: null + warmup-period: 1000 diff --git a/data.py b/data.py index 27e73b5..109e7b8 100644 --- a/data.py +++ b/data.py @@ -147,22 +147,88 @@ def __len__(self): return len(self.annotation_ids) -class CocoImageRetrievalDataset(data.Dataset): +class CocoImageRetrievalDatasetBase: + def __init__(self, captions_json, coco_annotation_ids, query, num_imgs): + self.query = query + self.num_imgs = num_imgs + + self.coco = COCO(captions_json) + self.anno_ids = coco_annotation_ids + + def get_raw_item(self, idx): + next_img_idx = idx * 5 # in the coco dataset there are 5 captions for every image + ann_id = self.anno_ids[next_img_idx] + coco_img_id = self.coco.anns[ann_id]['image_id'] + img_metadata = self.coco.imgs[coco_img_id] + img_size = np.array([img_metadata['width'], img_metadata['height']]) + + return coco_img_id, img_size + + def get_image_metadata(self, idx): + # TODO can't we just get coco.imgs[idx'] somehow? 
+ next_img_idx = idx * 5 # in the coco dataset there are 5 captions for every image + ann_id = self.anno_ids[next_img_idx] + coco_img_id = self.coco.anns[ann_id]['image_id'] + img_metadata = self.coco.imgs[coco_img_id] + return img_metadata, coco_img_id + + +class PreComputedCocoEmbeddingsDataset(CocoImageRetrievalDatasetBase): + """ + Custom COCO Dataset that uses pre-computed image embedding + """ + + def __init__(self, captions_json, coco_annotation_ids, query, num_imgs, config): + CocoImageRetrievalDatasetBase.__init__(self, captions_json, coco_annotation_ids, query, num_imgs) + + pre_computed_img_embeddings_root = config['image-retrieval']['pre_computed_img_embeddings_root'] + self.pre_computed_img_embeddings_root = pre_computed_img_embeddings_root + + self.img_embs = {idx: self.__load_img_emb(idx) for idx in range(num_imgs)} + + self.vocab_type = str(config['text-model']['name']).lower() + if self.vocab_type == 'bert': + self.tokenizer = BertTokenizer.from_pretrained(config['text-model']['pretrain']) + elif self.vocab_type != 'bert': + raise ValueError("Currently only BERT Tokenizer is supported!") + + def __load_img_emb(self, idx): + # just return the query and the img embedding + img_metadata, coco_img_id = self.get_image_metadata(idx) + file_name = img_metadata['file_name'] + npz = np.load(os.path.join(self.pre_computed_img_embeddings_root, file_name + '.npz')) + img_emd = npz.get('img_emb') + + return img_emd + + def get_img_embs_and_lens(self): + return self.img_embs + + def get_query_pseudo_batch(self): + # tokenize and encode the query + query_token_ids = torch.LongTensor(self.tokenizer.encode(self.query)) + # create a pseudo batch suitable for TERAN + query_token_pseudo_batch = query_token_ids.unsqueeze(dim=0) + query_lengths = [len(query_token_ids)] + return query_token_pseudo_batch, query_lengths + + def __len__(self): + return self.num_imgs + + +class PreComputedCocoFeaturesDataset(CocoImageRetrievalDatasetBase, data.Dataset): """ Custom COCO 
Dataset that uses only the images together with a user query. Compatible with torch.utils.data.DataLoader. """ def __init__(self, imgs_root, img_features_path, captions_json, coco_annotation_ids, query, num_imgs): - self.query = query - self.num_imgs = num_imgs + CocoImageRetrievalDatasetBase.__init__(self, captions_json, coco_annotation_ids, query, num_imgs) + self.feats_data_path = os.path.join(img_features_path, 'bu_att') self.box_data_path = os.path.join(img_features_path, 'bu_box') self.imgs_root = imgs_root - self.coco = COCO(captions_json) - self.anno_ids = coco_annotation_ids - def __getitem__(self, idx): """ This function returns a tuple that is further passed to collate_fn @@ -185,23 +251,6 @@ def __getitem__(self, idx): # this output is the input of the CollateFn return img_feat, img_feat_box, img_id, self.query, idx - def get_raw_item(self, idx): - next_img_idx = idx * 5 # in the coco dataset there are 5 captions for every image - ann_id = self.anno_ids[next_img_idx] - img_id = self.coco.anns[ann_id]['image_id'] - img_metadata = self.coco.imgs[img_id] - img_size = np.array([img_metadata['width'], img_metadata['height']]) - - return img_id, img_size - - def get_image_metadata(self, idx): - # TODO can't we just get coco.imgs[idx'] somehow? 
- next_img_idx = idx * 5 # in the coco dataset there are 5 captions for every image - ann_id = self.anno_ids[next_img_idx] - img_id = self.coco.anns[ann_id]['image_id'] - img_metadata = self.coco.imgs[img_id] - return img_metadata - def __len__(self): return self.num_imgs @@ -333,9 +382,9 @@ def __new__(cls, *args, **kwargs): return super(InferenceCollate, cls).__new__(cls) def __init__(self, config, pre_compute_img_embs): - self.vocab_type = str(config['text-model']['name']).lower() self.create_query_batch = bool(config['image-retrieval']['create_query_batch']) self.pre_compute_img_embs = pre_compute_img_embs + self.vocab_type = str(config['text-model']['name']).lower() if self.vocab_type == 'bert' and not pre_compute_img_embs: self.tokenizer = BertTokenizer.from_pretrained(config['text-model']['pretrain']) elif self.vocab_type != 'bert': @@ -362,6 +411,7 @@ def __call__(self, data): img_feats, img_feat_bboxes, img_ids, queries, dataset_indices = zip(*data) """ Build batch tensors from a list of (img_feats, img_feat_boxes, img_ids, queries, dataset_indices) tuples. 
+ This data comes from the dataset Args: - img_feats: - img_feat_bboxes: @@ -462,7 +512,7 @@ def __call__(self, data): cap_features = [torch.FloatTensor(f) for f in cap_features] wembeddings = [torch.FloatTensor(w) for w in wembeddings] else: - if self.vocab_type == 'bert': + if self.vocab_type == 'bert': cap_lengths = [len(self.tokenizer.tokenize(c)) + 2 for c in captions] # + 2 in order to account for begin and end tokens max_len = max(cap_lengths) @@ -508,7 +558,7 @@ def __call__(self, data): targets = torch.zeros(len(captions), max(cap_lengths)).long() for i, cap in enumerate(captions): end = cap_lengths[i] - targets[i, :end] = cap[:end] #caption token ids + targets[i, :end] = cap[:end] # caption token ids if not preextracted_images: return images, targets, None, cap_lengths, None, ids @@ -610,12 +660,7 @@ def get_loaders(config, workers, batch_size=None): return train_loader, val_loader -def get_coco_image_retrieval_data_loader(config, workers, query, pre_compute_img_embs=False): - # create the dataset + loader - # 1) load / create a Coco Dataset to get meta info about images (we could also do this by hand) - # 2) choose (the first) N images and create a dataset with N samples where each sample consists of the n-th image - # and the query (gets repeated N times) # TODO maybe this is not necessary - +def get_coco_image_retrieval_data(config, query, workers=None, pre_compute_img_embs=False): # get the directories that contain the coco json files and coco annotation ids (which we may not need, I think) roots, coco_annotation_ids = get_paths(config) @@ -625,28 +670,37 @@ def get_coco_image_retrieval_data_loader(config, workers, query, pre_compute_img imgs_root = roots[split_name]['img'] - # for images we use pre-extracted features (not for text) - pre_extracted_img_features_root = config['image-retrieval']['pre_extracted_img_features_root'] - captions_json = roots[split_name]['cap'] coco_annotation_ids = coco_annotation_ids[split_name] num_imgs = 
config['image-retrieval']['num_imgs'] + pre_extracted_img_features_root = config['image-retrieval']['pre_extracted_img_features_root'] + + use_precomputed_img_embeddings = config['image-retrieval']['use_precomputed_img_embeddings'] + if use_precomputed_img_embeddings: + dataset = PreComputedCocoEmbeddingsDataset(captions_json=captions_json, + coco_annotation_ids=coco_annotation_ids, + query=query, + num_imgs=num_imgs, + config=config) - dataset = CocoImageRetrievalDataset(imgs_root=imgs_root, - img_features_path=pre_extracted_img_features_root, - captions_json=captions_json, - coco_annotation_ids=coco_annotation_ids, - query=query, - num_imgs=num_imgs) + return dataset + + dataset = PreComputedCocoFeaturesDataset(imgs_root=imgs_root, + img_features_path=pre_extracted_img_features_root, + captions_json=captions_json, + coco_annotation_ids=coco_annotation_ids, + query=query, + num_imgs=num_imgs) # this creates the batches which get passed to the model (inside the query gets repeated or not based on the config) collate_fn = InferenceCollate(config, pre_compute_img_embs) - data_loader = torch.utils.data.DataLoader(dataset=dataset, - batch_size=batch_size, - shuffle=False, - pin_memory=True, - num_workers=workers, - collate_fn=collate_fn) + + data_loader = data.DataLoader(dataset=dataset, + batch_size=batch_size, + shuffle=False, + pin_memory=True, + num_workers=workers, + collate_fn=collate_fn) return data_loader diff --git a/evaluation.py b/evaluation.py index f164cf0..c815606 100644 --- a/evaluation.py +++ b/evaluation.py @@ -10,7 +10,7 @@ from evaluate_utils.dcg import DCG from models.loss import order_sim, AlignmentContrastiveLoss from utils import get_model, AverageMeter, LogCollector -from data import get_coco_image_retrieval_data_loader, get_test_loader +from data import get_coco_image_retrieval_data, get_test_loader def encode_data(model, data_loader, log_step=10, logging=print): diff --git a/inference.py b/inference.py index 45f5806..acb4b26 100644 --- 
a/inference.py +++ b/inference.py @@ -10,7 +10,7 @@ import tqdm import yaml -from data import get_coco_image_retrieval_data_loader +from data import get_coco_image_retrieval_data from models.loss import AlignmentContrastiveLoss from models.teran import TERAN from utils import AverageMeter, LogCollector @@ -65,7 +65,7 @@ def encode_data_for_inference(model: TERAN, data_loader, log_step=10, logging=pr # compute the query embedding only in the first iteration (also because there is only 1 query in IR) if query_embs is None and not pre_compute_img_embs: # TODO maybe we can get the most matching roi from query_emb_aggr? - query_emb_aggr, query_emb, _ = model.forward_txt_emb(query_token_batch, query_len_batch) + query_emb_aggr, query_emb, _ = model.forward_txt(query_token_batch, query_len_batch) # store results as np arrays for further processing or persisting num_query_feats = query_len_batch[0] if isinstance(query_len_batch, list) else query_len_batch @@ -74,7 +74,7 @@ def encode_data_for_inference(model: TERAN, data_loader, log_step=10, logging=pr query_embs[0, :, :] = query_emb.cpu().permute(1, 0, 2) # compute every image embedding in the dataset - img_emb_aggr, img_emb = model.forward_img_emb(img_feature_batch, img_feat_len_batch, img_feat_bboxes_batch) + img_emb_aggr, img_emb = model.forward_img(img_feature_batch, img_feat_len_batch, img_feat_bboxes_batch) # init array to store results for further processing or persisting if img_embs is None: @@ -142,8 +142,32 @@ def compute_distances(img_embs, query_embs, img_lengths, query_lengths, config): return sorted_distance_indices -def get_image_names(dataset_indices, data_loader) -> List[str]: - return [data_loader.dataset.get_image_metadata(idx)['file_name'] for idx in dataset_indices] +def get_image_names(dataset_indices, dataset) -> List[str]: + return [dataset.get_image_metadata(idx)[0]['file_name'] for idx in dataset_indices] + + +def get_precomputed_embeddings(config, opts, model): + print("Loading pre-computed 
image embeddings...") + start = time.time() + # returns a PreComputedCocoEmbeddingsDataset + dataset = get_coco_image_retrieval_data(config, query=opts.query) + + # compute the query embedding + with torch.no_grad(): + query_token_pseudo_batch, query_lengths = dataset.get_query_pseudo_batch() + query_emb_aggr, query_emb, _ = model.forward_txt(query_token_pseudo_batch, query_lengths) + + # store results as np arrays for further processing or persisting + query_feat_dim = query_emb.size(2) + query_embs = torch.zeros((1, query_lengths[0], query_feat_dim), requires_grad=False) + query_embs[0, :, :] = query_emb.cpu().permute(1, 0, 2) + + # get the img embeddings and convert them to Tensors + np_img_embs = list(dataset.img_embs.values()) + img_embs = torch.Tensor(np_img_embs) + img_length = len(np_img_embs[0]) + print(f"Time elapsed to load pre-computed embeddings and compute query embedding: {time.time() - start} seconds!") + return img_embs, query_embs, img_length, query_lengths, dataset def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: @@ -153,23 +177,27 @@ def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: # load model state model.load_state_dict(checkpoint['model'], strict=False) - print('Loading dataset') - data_loader = get_coco_image_retrieval_data_loader(config, - query=opts.query, - workers=opts.num_data_workers) - - # encode the data (i.e. 
compute the embeddings / TE outputs for the images and query) - img_embs, cap_embs, img_lengths, cap_lengths = encode_data_for_inference(model, data_loader) + use_precomputed_img_embeddings = config['image-retrieval']['use_precomputed_img_embeddings'] + if use_precomputed_img_embeddings: + img_embs, query_embs, img_lengths, query_lengths, dataset = get_precomputed_embeddings(config, opts, model) + else: + # returns a Dataloader of a PreComputedCocoFeaturesDataset + data_loader = get_coco_image_retrieval_data(config, + query=opts.query, + workers=opts.num_data_workers) + dataset = data_loader.dataset + # encode the data (i.e. compute the embeddings / TE outputs for the images and query) + img_embs, query_embs, img_lengths, query_lengths = encode_data_for_inference(model, data_loader) torch.cuda.empty_cache() - print(f"Images: {img_embs.shape[0]}, Captions: {cap_embs.shape[0]}") + print(f"Images Embeddings: {img_embs.shape[0]}, Query Embeddings: {query_embs.shape[0]}") # compute the matching scores - distance_sorted_indices = compute_distances(img_embs, cap_embs, img_lengths, cap_lengths, config) + distance_sorted_indices = compute_distances(img_embs, query_embs, img_lengths, query_lengths, config) top_k_indices = distance_sorted_indices[:opts.top_k] # get the image names - top_k_images = get_image_names(top_k_indices, data_loader) + top_k_images = get_image_names(top_k_indices, dataset) return top_k_images @@ -193,14 +221,13 @@ def pre_compute_img_embeddings(opts, config, checkpoint): model = TERAN(config) # load model state - model.load_state_dict(checkpoint['model'], strict=False) print('Loading dataset') - data_loader = get_coco_image_retrieval_data_loader(config, - query=opts.query, - workers=opts.num_data_workers, - pre_compute_img_embs=True) + data_loader = get_coco_image_retrieval_data(config, + query=opts.query, + workers=opts.num_data_workers, + pre_compute_img_embs=True) # encode the data (i.e. 
compute the embeddings / TE outputs for the images and query)
     encode_data_for_inference(model, data_loader, pre_compute_img_embs=True)

diff --git a/models/teran.py b/models/teran.py
index cf48e23..b57be52 100644
--- a/models/teran.py
+++ b/models/teran.py
@@ -219,7 +219,7 @@ def forward_emb(self, images, captions, img_len, cap_len, boxes):
 
         return img_emb_aggr, cap_emb_aggr, img_feats, cap_feats, cap_len
 
-    def forward_txt_emb(self, captions, cap_len):
+    def forward_txt(self, captions, cap_len):
         """
         compute txt embeddings only
         """
@@ -230,7 +230,7 @@ def forward_txt_emb(self, captions, cap_len):
             cap_feats, cap_len = self.remove_stopwords(captions, cap_feats, cap_len)
         return cap_emb_aggr, cap_feats, cap_len
 
-    def forward_img_emb(self, images, img_len, boxes):
+    def forward_img(self, images, img_len, boxes):
         """
         compute img embeddings only
         """

From 6c697dca385f671fc78de49c6fab4c47c8ffc0ae Mon Sep 17 00:00:00 2001
From: floschne
Date: Thu, 31 Dec 2020 15:06:52 +0100
Subject: [PATCH 18/21] optimized loading of pre-computed embeddings

---
 data.py      | 49 ++++++++++++++++++++++++++++++++-----------------
 inference.py | 11 +++++++++--
 2 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/data.py b/data.py
index 109e7b8..5758aad 100644
--- a/data.py
+++ b/data.py
@@ -1,6 +1,9 @@
 import json as jsonmod
 import os
 import pickle
+import time
+from collections import OrderedDict
+from multiprocessing import Pool
 
 import numpy as np
 import torch
@@ -173,18 +176,26 @@ def get_image_metadata(self, idx):
         return img_metadata, coco_img_id
 
 
+# This has to be outside any class so that it can be pickled for multiproc
+def load_img_emb(args):
+    # just return the query and the img embedding
+    idx, file_name = args
+    npz = np.load(file_name)
+    img_emd = npz.get('img_emb')
+    return idx, img_emd
+
+
 class PreComputedCocoEmbeddingsDataset(CocoImageRetrievalDatasetBase):
     """
     Custom COCO Dataset that uses pre-computed image embedding
     """
 
-    def __init__(self, captions_json, 
coco_annotation_ids, query, num_imgs, config): + def __init__(self, captions_json, coco_annotation_ids, query, num_imgs, config, num_workers=32): CocoImageRetrievalDatasetBase.__init__(self, captions_json, coco_annotation_ids, query, num_imgs) pre_computed_img_embeddings_root = config['image-retrieval']['pre_computed_img_embeddings_root'] self.pre_computed_img_embeddings_root = pre_computed_img_embeddings_root - - self.img_embs = {idx: self.__load_img_emb(idx) for idx in range(num_imgs)} + self.num_workers = num_workers self.vocab_type = str(config['text-model']['name']).lower() if self.vocab_type == 'bert': @@ -192,17 +203,20 @@ def __init__(self, captions_json, coco_annotation_ids, query, num_imgs, config): elif self.vocab_type != 'bert': raise ValueError("Currently only BERT Tokenizer is supported!") - def __load_img_emb(self, idx): - # just return the query and the img embedding - img_metadata, coco_img_id = self.get_image_metadata(idx) - file_name = img_metadata['file_name'] - npz = np.load(os.path.join(self.pre_computed_img_embeddings_root, file_name + '.npz')) - img_emd = npz.get('img_emb') - - return img_emd - - def get_img_embs_and_lens(self): - return self.img_embs + self.img_embs = self.__load_img_embs() + + def __load_img_embs(self): + start = time.time() + print('Parellel loading of pre-computed image embeddings started...') + file_names = list(map(lambda m: os.path.join(self.pre_computed_img_embeddings_root, m[0]['file_name'] + '.npz'), + [self.get_image_metadata(i) for i in range(self.num_imgs)])) + # parallel loading of all image embeddings + with Pool(self.num_workers) as pool: + res = pool.map(load_img_emb, enumerate(file_names)) + pool.join() + res = OrderedDict(res) + print(f'Time elapsed to load pre-computed image embeddings: {time.time() - start} seconds') + return res def get_query_pseudo_batch(self): # tokenize and encode the query @@ -660,7 +674,7 @@ def get_loaders(config, workers, batch_size=None): return train_loader, val_loader -def 
get_coco_image_retrieval_data(config, query, workers=None, pre_compute_img_embs=False): +def get_coco_image_retrieval_data(config, query, num_workers=32, pre_compute_img_embs=False): # get the directories that contain the coco json files and coco annotation ids (which we may not need, I think) roots, coco_annotation_ids = get_paths(config) @@ -681,7 +695,8 @@ def get_coco_image_retrieval_data(config, query, workers=None, pre_compute_img_e coco_annotation_ids=coco_annotation_ids, query=query, num_imgs=num_imgs, - config=config) + config=config, + num_workers=num_workers) return dataset @@ -699,7 +714,7 @@ def get_coco_image_retrieval_data(config, query, workers=None, pre_compute_img_e batch_size=batch_size, shuffle=False, pin_memory=True, - num_workers=workers, + num_workers=num_workers, collate_fn=collate_fn) return data_loader diff --git a/inference.py b/inference.py index acb4b26..f246a1e 100644 --- a/inference.py +++ b/inference.py @@ -154,17 +154,24 @@ def get_precomputed_embeddings(config, opts, model): # compute the query embedding with torch.no_grad(): + start_query_batch = time.time() query_token_pseudo_batch, query_lengths = dataset.get_query_pseudo_batch() + print(f'Time to get query pseudo batch: {time.time() - start_query_batch}') + + start_query_enc = time.time() query_emb_aggr, query_emb, _ = model.forward_txt(query_token_pseudo_batch, query_lengths) + print(f'Time to compute query embedding: {time.time() - start_query_enc}') + # store results as np arrays for further processing or persisting query_feat_dim = query_emb.size(2) query_embs = torch.zeros((1, query_lengths[0], query_feat_dim), requires_grad=False) query_embs[0, :, :] = query_emb.cpu().permute(1, 0, 2) + # get the img embeddings and convert them to Tensors - np_img_embs = list(dataset.img_embs.values()) - img_embs = torch.Tensor(np_img_embs) + np_img_embs = np.array(list(dataset.img_embs.values())) + img_embs = torch.Tensor(np_img_embs) # here is the bottleneck img_length = 
len(np_img_embs[0]) print(f"Time elapsed to load pre-computed embeddings and compute query embedding: {time.time() - start} seconds!") return img_embs, query_embs, img_length, query_lengths, dataset From 3bf88b5dd16796082ed979c0a7c5751115ef52b6 Mon Sep 17 00:00:00 2001 From: floschne Date: Thu, 31 Dec 2020 15:11:49 +0100 Subject: [PATCH 19/21] minor bugfix --- inference.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/inference.py b/inference.py index f246a1e..f835947 100644 --- a/inference.py +++ b/inference.py @@ -162,13 +162,11 @@ def get_precomputed_embeddings(config, opts, model): query_emb_aggr, query_emb, _ = model.forward_txt(query_token_pseudo_batch, query_lengths) print(f'Time to compute query embedding: {time.time() - start_query_enc}') - # store results as np arrays for further processing or persisting query_feat_dim = query_emb.size(2) query_embs = torch.zeros((1, query_lengths[0], query_feat_dim), requires_grad=False) query_embs[0, :, :] = query_emb.cpu().permute(1, 0, 2) - # get the img embeddings and convert them to Tensors np_img_embs = np.array(list(dataset.img_embs.values())) img_embs = torch.Tensor(np_img_embs) # here is the bottleneck @@ -191,7 +189,7 @@ def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: # returns a Dataloader of a PreComputedCocoFeaturesDataset data_loader = get_coco_image_retrieval_data(config, query=opts.query, - workers=opts.num_data_workers) + num_workers=opts.num_data_workers) dataset = data_loader.dataset # encode the data (i.e. compute the embeddings / TE outputs for the images and query) img_embs, query_embs, img_lengths, query_lengths = encode_data_for_inference(model, data_loader) @@ -233,7 +231,7 @@ def pre_compute_img_embeddings(opts, config, checkpoint): print('Loading dataset') data_loader = get_coco_image_retrieval_data(config, query=opts.query, - workers=opts.num_data_workers, + num_workers=opts.num_data_workers, pre_compute_img_embs=True) # encode the data (i.e. 
compute the embeddings / TE outputs for the images and query) @@ -270,6 +268,8 @@ def pre_compute_img_embeddings(opts, config, checkpoint): if not opts.pre_compute_img_embeddings: top_k_matches = top_k_image_retrieval(opts, model_config, model_checkpoint) + print(f"##########################################") + print(f"QUERY: {opts.query}") print(f"######## TOP {opts.top_k} RESULTS ########") print(top_k_matches) else: From 9f17dfe30d906442cb6bd580fbf8f279a7bbfaf1 Mon Sep 17 00:00:00 2001 From: floschne Date: Sun, 3 Jan 2021 15:25:07 +0100 Subject: [PATCH 20/21] modularized code a bit --- data.py | 88 +++++++++++++++++++++++++++++----------------------- inference.py | 40 ++++++++++-------------- 2 files changed, 66 insertions(+), 62 deletions(-) diff --git a/data.py b/data.py index 5758aad..15f83b0 100644 --- a/data.py +++ b/data.py @@ -151,29 +151,18 @@ def __len__(self): class CocoImageRetrievalDatasetBase: - def __init__(self, captions_json, coco_annotation_ids, query, num_imgs): - self.query = query + def __init__(self, captions_json, coco_annotation_ids, num_imgs): self.num_imgs = num_imgs self.coco = COCO(captions_json) self.anno_ids = coco_annotation_ids - def get_raw_item(self, idx): - next_img_idx = idx * 5 # in the coco dataset there are 5 captions for every image - ann_id = self.anno_ids[next_img_idx] - coco_img_id = self.coco.anns[ann_id]['image_id'] - img_metadata = self.coco.imgs[coco_img_id] - img_size = np.array([img_metadata['width'], img_metadata['height']]) - - return coco_img_id, img_size - def get_image_metadata(self, idx): - # TODO can't we just get coco.imgs[idx'] somehow? 
next_img_idx = idx * 5 # in the coco dataset there are 5 captions for every image ann_id = self.anno_ids[next_img_idx] coco_img_id = self.coco.anns[ann_id]['image_id'] img_metadata = self.coco.imgs[coco_img_id] - return img_metadata, coco_img_id + return coco_img_id, img_metadata # This has to be outside any class so that it can be pickled for multiproc @@ -185,30 +174,24 @@ def load_img_emb(args): return idx, img_emd -class PreComputedCocoEmbeddingsDataset(CocoImageRetrievalDatasetBase): +class PreComputedCocoImageEmbeddingsDataset(CocoImageRetrievalDatasetBase): """ Custom COCO Dataset that uses pre-computed image embedding """ - def __init__(self, captions_json, coco_annotation_ids, query, num_imgs, config, num_workers=32): - CocoImageRetrievalDatasetBase.__init__(self, captions_json, coco_annotation_ids, query, num_imgs) + def __init__(self, captions_json, coco_annotation_ids, num_imgs, config, num_workers=32): + CocoImageRetrievalDatasetBase.__init__(self, captions_json, coco_annotation_ids, num_imgs) pre_computed_img_embeddings_root = config['image-retrieval']['pre_computed_img_embeddings_root'] self.pre_computed_img_embeddings_root = pre_computed_img_embeddings_root self.num_workers = num_workers - self.vocab_type = str(config['text-model']['name']).lower() - if self.vocab_type == 'bert': - self.tokenizer = BertTokenizer.from_pretrained(config['text-model']['pretrain']) - elif self.vocab_type != 'bert': - raise ValueError("Currently only BERT Tokenizer is supported!") - self.img_embs = self.__load_img_embs() def __load_img_embs(self): start = time.time() - print('Parellel loading of pre-computed image embeddings started...') - file_names = list(map(lambda m: os.path.join(self.pre_computed_img_embeddings_root, m[0]['file_name'] + '.npz'), + print('Parallel loading of pre-computed image embeddings started...') + file_names = list(map(lambda m: os.path.join(self.pre_computed_img_embeddings_root, m[1]['file_name'] + '.npz'), [self.get_image_metadata(i) for i in 
range(self.num_imgs)])) # parallel loading of all image embeddings with Pool(self.num_workers) as pool: @@ -218,16 +201,45 @@ def __load_img_embs(self): print(f'Time elapsed to load pre-computed image embeddings: {time.time() - start} seconds') return res - def get_query_pseudo_batch(self): + def __len__(self): + return self.num_imgs + + +class QueryEncoder: + def __init__(self, config, model): + self.vocab_type = str(config['text-model']['name']).lower() + if self.vocab_type == 'bert': + self.tokenizer = BertTokenizer.from_pretrained(config['text-model']['pretrain']) + elif self.vocab_type != 'bert': + raise ValueError("Currently only BERT Tokenizer is supported!") + + self.model = model + + def _get_query_pseudo_batch(self, query: str): # tokenize and encode the query - query_token_ids = torch.LongTensor(self.tokenizer.encode(self.query)) + query_token_ids = torch.LongTensor(self.tokenizer.encode(query)) # create a pseudo batch suitable for TERAN query_token_pseudo_batch = query_token_ids.unsqueeze(dim=0) query_lengths = [len(query_token_ids)] return query_token_pseudo_batch, query_lengths - def __len__(self): - return self.num_imgs + def compute_query_embedding(self, query): + # compute the query embedding + with torch.no_grad(): + start_query_batch = time.time() + query_token_pseudo_batch, query_lengths = self._get_query_pseudo_batch(query) + print(f'Time to get query pseudo batch: {time.time() - start_query_batch}') + + start_query_enc = time.time() + query_emb_aggr, query_emb, _ = self.model.forward_txt(query_token_pseudo_batch, query_lengths) + print(f'Time to compute query embedding: {time.time() - start_query_enc}') + + # store results as np arrays for further processing or persisting + query_feat_dim = query_emb.size(2) + query_embs = torch.zeros((1, query_lengths[0], query_feat_dim), requires_grad=False) + query_embs[0, :, :] = query_emb.cpu().permute(1, 0, 2) + + return query_embs, query_lengths class 
PreComputedCocoFeaturesDataset(CocoImageRetrievalDatasetBase, data.Dataset): @@ -237,17 +249,19 @@ class PreComputedCocoFeaturesDataset(CocoImageRetrievalDatasetBase, data.Dataset """ def __init__(self, imgs_root, img_features_path, captions_json, coco_annotation_ids, query, num_imgs): - CocoImageRetrievalDatasetBase.__init__(self, captions_json, coco_annotation_ids, query, num_imgs) + CocoImageRetrievalDatasetBase.__init__(self, captions_json, coco_annotation_ids, num_imgs) self.feats_data_path = os.path.join(img_features_path, 'bu_att') self.box_data_path = os.path.join(img_features_path, 'bu_box') self.imgs_root = imgs_root + self.query = query def __getitem__(self, idx): """ This function returns a tuple that is further passed to collate_fn """ - img_id, img_size = self.get_raw_item(idx) + img_id, img_metadata = self.get_image_metadata(idx) + img_size = np.array([img_metadata['width'], img_metadata['height']]) img_feat_path = os.path.join(self.feats_data_path, '{}.npz'.format(img_id)) img_box_path = os.path.join(self.box_data_path, '{}.npy'.format(img_id)) @@ -674,7 +688,7 @@ def get_loaders(config, workers, batch_size=None): return train_loader, val_loader -def get_coco_image_retrieval_data(config, query, num_workers=32, pre_compute_img_embs=False): +def get_coco_image_retrieval_data(config, query=None, num_workers=32, pre_compute_img_embs=False): # get the directories that contain the coco json files and coco annotation ids (which we may not need, I think) roots, coco_annotation_ids = get_paths(config) @@ -691,13 +705,11 @@ def get_coco_image_retrieval_data(config, query, num_workers=32, pre_compute_img use_precomputed_img_embeddings = config['image-retrieval']['use_precomputed_img_embeddings'] if use_precomputed_img_embeddings: - dataset = PreComputedCocoEmbeddingsDataset(captions_json=captions_json, - coco_annotation_ids=coco_annotation_ids, - query=query, - num_imgs=num_imgs, - config=config, - num_workers=num_workers) - + dataset = 
PreComputedCocoImageEmbeddingsDataset(captions_json=captions_json, + coco_annotation_ids=coco_annotation_ids, + num_imgs=num_imgs, + config=config, + num_workers=num_workers) return dataset dataset = PreComputedCocoFeaturesDataset(imgs_root=imgs_root, diff --git a/inference.py b/inference.py index f835947..2949061 100644 --- a/inference.py +++ b/inference.py @@ -10,7 +10,7 @@ import tqdm import yaml -from data import get_coco_image_retrieval_data +from data import get_coco_image_retrieval_data, QueryEncoder from models.loss import AlignmentContrastiveLoss from models.teran import TERAN from utils import AverageMeter, LogCollector @@ -143,36 +143,21 @@ def compute_distances(img_embs, query_embs, img_lengths, query_lengths, config): def get_image_names(dataset_indices, dataset) -> List[str]: - return [dataset.get_image_metadata(idx)[0]['file_name'] for idx in dataset_indices] + return [dataset.get_image_metadata(idx)[1]['file_name'] for idx in dataset_indices] -def get_precomputed_embeddings(config, opts, model): +def load_precomputed_image_embeddings(config): print("Loading pre-computed image embeddings...") start = time.time() - # returns a PreComputedCocoEmbeddingsDataset - dataset = get_coco_image_retrieval_data(config, query=opts.query) - - # compute the query embedding - with torch.no_grad(): - start_query_batch = time.time() - query_token_pseudo_batch, query_lengths = dataset.get_query_pseudo_batch() - print(f'Time to get query pseudo batch: {time.time() - start_query_batch}') - - start_query_enc = time.time() - query_emb_aggr, query_emb, _ = model.forward_txt(query_token_pseudo_batch, query_lengths) - print(f'Time to compute query embedding: {time.time() - start_query_enc}') - - # store results as np arrays for further processing or persisting - query_feat_dim = query_emb.size(2) - query_embs = torch.zeros((1, query_lengths[0], query_feat_dim), requires_grad=False) - query_embs[0, :, :] = query_emb.cpu().permute(1, 0, 2) + # returns a 
PreComputedCocoImageEmbeddingsDataset + dataset = get_coco_image_retrieval_data(config) # get the img embeddings and convert them to Tensors np_img_embs = np.array(list(dataset.img_embs.values())) img_embs = torch.Tensor(np_img_embs) # here is the bottleneck - img_length = len(np_img_embs[0]) + img_lengths = len(np_img_embs[0]) print(f"Time elapsed to load pre-computed embeddings and compute query embedding: {time.time() - start} seconds!") - return img_embs, query_embs, img_length, query_lengths, dataset + return img_embs, img_lengths, dataset def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: @@ -184,7 +169,12 @@ def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: use_precomputed_img_embeddings = config['image-retrieval']['use_precomputed_img_embeddings'] if use_precomputed_img_embeddings: - img_embs, query_embs, img_lengths, query_lengths, dataset = get_precomputed_embeddings(config, opts, model) + # load pre computed img embs + img_embs, img_lengths, dataset = load_precomputed_image_embeddings(config) + # compute query emb + query_encoder = QueryEncoder(config, model) + query_embs, query_lengths = query_encoder.compute_query_embedding(opts.query) + else: # returns a Dataloader of a PreComputedCocoFeaturesDataset data_loader = get_coco_image_retrieval_data(config, @@ -194,7 +184,9 @@ def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: # encode the data (i.e. 
compute the embeddings / TE outputs for the images and query) img_embs, query_embs, img_lengths, query_lengths = encode_data_for_inference(model, data_loader) - torch.cuda.empty_cache() + if opts.device == "cuda": + torch.cuda.empty_cache() + print(f"Images Embeddings: {img_embs.shape[0]}, Query Embeddings: {query_embs.shape[0]}") # compute the matching scores From 915dcbd0542437adb82bdc6052626deb1a419022 Mon Sep 17 00:00:00 2001 From: floschne Date: Sun, 3 Jan 2021 15:43:39 +0100 Subject: [PATCH 21/21] own fn to load teran --- inference.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/inference.py b/inference.py index 2949061..56e026d 100644 --- a/inference.py +++ b/inference.py @@ -146,31 +146,35 @@ def get_image_names(dataset_indices, dataset) -> List[str]: return [dataset.get_image_metadata(idx)[1]['file_name'] for idx in dataset_indices] -def load_precomputed_image_embeddings(config): +def load_precomputed_image_embeddings(config, num_workers): print("Loading pre-computed image embeddings...") start = time.time() # returns a PreComputedCocoImageEmbeddingsDataset - dataset = get_coco_image_retrieval_data(config) + dataset = get_coco_image_retrieval_data(config, num_workers=num_workers) # get the img embeddings and convert them to Tensors np_img_embs = np.array(list(dataset.img_embs.values())) - img_embs = torch.Tensor(np_img_embs) # here is the bottleneck + img_embs = torch.Tensor(np_img_embs) img_lengths = len(np_img_embs[0]) print(f"Time elapsed to load pre-computed embeddings and compute query embedding: {time.time() - start} seconds!") return img_embs, img_lengths, dataset -def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: +def load_teran(config, checkpoint): # construct model model = TERAN(config) - # load model state model.load_state_dict(checkpoint['model'], strict=False) + return model + + +def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: + model = load_teran(config, checkpoint) 
use_precomputed_img_embeddings = config['image-retrieval']['use_precomputed_img_embeddings'] if use_precomputed_img_embeddings: # load pre computed img embs - img_embs, img_lengths, dataset = load_precomputed_image_embeddings(config) + img_embs, img_lengths, dataset = load_precomputed_image_embeddings(config, num_workers=opts.num_data_workers) # compute query emb query_encoder = QueryEncoder(config, model) query_embs, query_lengths = query_encoder.compute_query_embedding(opts.query)