From 4861210e6a5d08f3b04c12fe879e80a149dae437 Mon Sep 17 00:00:00 2001 From: floschne Date: Tue, 8 Dec 2020 18:13:53 +0100 Subject: [PATCH 01/21] minimal environment with up2date packages --- environment_min.yml | 98 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 environment_min.yml diff --git a/environment_min.yml b/environment_min.yml new file mode 100644 index 0000000..504ce86 --- /dev/null +++ b/environment_min.yml @@ -0,0 +1,98 @@ +name: teran +channels: + - pytorch + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _pytorch_select=0.1=cpu_0 + - arrow=0.17.0=py36h9f0ad1d_1 + - binaryornot=0.4.4=py_1 + - blas=1.0=mkl + - brotlipy=0.7.0=py36he6145b8_1001 + - ca-certificates=2020.10.14=0 + - certifi=2020.12.5=py36h06a4308_0 + - cffi=1.14.0=py36h2e261b9_0 + - chardet=3.0.4=py36h9880bd3_1008 + - click=7.1.2=py_0 + - cookiecutter=1.7.2=pyh9f0ad1d_0 + - cryptography=3.2.1=py36h6ec43e4_0 + - cudatoolkit=10.1.243=h6bb024c_0 + - cycler=0.10.0=py_2 + - cython=0.29.21=py36ha357f81_1 + - dataclasses=0.7=pyhe4b4509_6 + - filelock=3.0.12=pyh9f0ad1d_0 + - freetype=2.10.4=h5ab3b9f_0 + - gperftools=2.7=h767d802_2 + - idna=2.10=pyh9f0ad1d_0 + - intel-openmp=2020.2=254 + - jinja2=2.11.2=pyh9f0ad1d_0 + - jinja2-time=0.2.0=py_2 + - joblib=0.17.0=py_0 + - jpeg=9b=h024ee3a_2 + - kiwisolver=1.3.1=py36h51d7077_0 + - lcms2=2.11=h396b838_0 + - libedit=3.1.20191231=h14c3975_1 + - libffi=3.2.1=hf484d3e_1007 + - libgcc-ng=9.1.0=hdf63c60_0 + - libpng=1.6.37=hbc83047_0 + - libstdcxx-ng=9.1.0=hdf63c60_0 + - libtiff=4.1.0=h2733197_1 + - libuv=1.40.0=h7b6447c_0 + - lz4-c=1.9.2=heb0550a_3 + - markupsafe=1.1.1=py36he6145b8_2 + - matplotlib-base=3.3.3=py36he12231b_0 + - mkl=2020.2=256 + - mkl-service=2.3.0=py36he8ac12f_0 + - mkl_fft=1.2.0=py36h23d657b_0 + - mkl_random=1.1.1=py36h0573a6f_0 + - ncurses=6.2=he6710b0_1 + - ninja=1.10.2=py36hff7bd54_0 + - nltk=3.5=py_0 + - numpy=1.19.2=py36h54aff64_0 + - 
numpy-base=1.19.2=py36hfa32c7d_0 + - olefile=0.46=py36_0 + - openssl=1.1.1h=h7b6447c_0 + - packaging=20.7=pyhd3deb0d_0 + - perl=5.32.0=h36c2ea0_0 + - pillow=8.0.1=py36he98fc37_0 + - pip=20.3.1=py36h06a4308_0 + - poyo=0.5.0=py_0 + - protobuf=3.4.1=py36_0 + - pycocotools=2.0.2=py36h8c4c3a4_1 + - pycparser=2.20=py_2 + - pyopenssl=20.0.0=pyhd8ed1ab_0 + - pyparsing=2.4.7=pyh9f0ad1d_0 + - pysocks=1.7.1=py36h9880bd3_2 + - python=3.6.9=h265db76_0 + - python-dateutil=2.8.1=py_0 + - python-slugify=4.0.1=pyh9f0ad1d_0 + - python_abi=3.6=1_cp36m + - pytorch=1.7.0=py3.6_cuda10.1.243_cudnn7.6.3_0 + - readline=7.0=h7b6447c_5 + - regex=2020.11.13=py36h27cfd23_0 + - requests=2.25.0=pyhd3deb0d_0 + - sacremoses=0.0.43=pyh9f0ad1d_0 + - sentencepiece=0.1.92=py36hdb11119_0 + - setuptools=51.0.0=py36h06a4308_2 + - six=1.15.0=py36h06a4308_0 + - sqlite=3.33.0=h62c20be_0 + - text-unidecode=1.3=py_0 + - tk=8.6.10=hbc83047_0 + - tokenizers=0.9.4=py36h2bc52f9_1 + - torchvision=0.8.1=py36_cu101 + - tornado=6.1=py36h1d69622_0 + - tqdm=4.54.1=pyhd3eb1b0_0 + - transformers=4.0.0=pyhd8ed1ab_0 + - typing_extensions=3.7.4.3=py_0 + - unidecode=1.1.1=py_0 + - urllib3=1.25.11=py_0 + - wheel=0.36.1=pyhd3eb1b0_0 + - whichcraft=0.6.1=py_0 + - xz=5.2.5=h7b6447c_0 + - yaml=0.2.5=h7b6447c_0 + - zlib=1.2.11=h7b6447c_3 + - zstd=1.4.5=h9ceee32_0 + - pip: + - pyyaml==5.3.1 +prefix: /home/p0w3r/bin/miniconda3/envs/teran From 5c6eeb79997e64c30bfa269e5ae52d43ac821589 Mon Sep 17 00:00:00 2001 From: floschne Date: Tue, 8 Dec 2020 18:50:15 +0100 Subject: [PATCH 02/21] updated readme --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index cb3837f..cedb9d8 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,12 @@ conda activate teran export PYTHONPATH=. ``` +2.1 Setup minimal python environment for CUDA 10.1 using conda: +``` +conda env create --file environment_min.yml +conda activate teran +export PYTHONPATH=. +``` ## Get the data 1. 
Download and extract the data folder, containing annotations, the splits by Karpathy et al. and ROUGEL - SPICE precomputed relevances for both COCO and Flickr30K datasets: From fbbdb815e8b91723f8bda9eb3e469bf6c408f014 Mon Sep 17 00:00:00 2001 From: floschne Date: Wed, 9 Dec 2020 11:29:33 +0100 Subject: [PATCH 03/21] ignored data and models --- .gitignore | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index a9a721b..8843461 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,10 @@ *.ipynb_checkpoints *.json *.pth.tar + + +.idea +data +pretrained_models +*.tar + From 646f2b3e7e66af5bfc10c8dd43d0e96ab6aff766 Mon Sep 17 00:00:00 2001 From: floschne Date: Wed, 9 Dec 2020 11:48:59 +0100 Subject: [PATCH 04/21] ignored jupyer stuff --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 8843461..4bc9eeb 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,4 @@ data pretrained_models *.tar - +*.ipynb \ No newline at end of file From 3a0fd5f66f37c228d222a9d51f0a0ba9451f8997 Mon Sep 17 00:00:00 2001 From: floschne Date: Thu, 10 Dec 2020 15:48:08 +0100 Subject: [PATCH 05/21] - commented out ndcg and i2t stuff - started impl of inference.py --- configs/teran_coco_MrSw.yaml | 2 +- configs/teran_inf_coco_MrSw.yaml | 59 +++++++++++++++++++++++++++++ evaluation.py | 23 +++++------ inference.py | 65 ++++++++++++++++++++++++++++++++ models/teran.py | 4 +- test.py | 11 ++++-- 6 files changed, 147 insertions(+), 17 deletions(-) create mode 100644 configs/teran_inf_coco_MrSw.yaml create mode 100644 inference.py diff --git a/configs/teran_coco_MrSw.yaml b/configs/teran_coco_MrSw.yaml index d3458aa..5e8a440 100644 --- a/configs/teran_coco_MrSw.yaml +++ b/configs/teran_coco_MrSw.yaml @@ -51,7 +51,7 @@ training: alignment-mode: 'MrSw' measure: 'dot' margin: 0.2 - bs: 40 + bs: 100 scheduler: 'steplr' gamma: 0.1 step-size: 20 diff --git a/configs/teran_inf_coco_MrSw.yaml b/configs/teran_inf_coco_MrSw.yaml 
new file mode 100644 index 0000000..d3458aa --- /dev/null +++ b/configs/teran_inf_coco_MrSw.yaml @@ -0,0 +1,59 @@ +dataset: + name: 'coco' + images-path: 'data/coco/images' # not needed if using pre-extracted bottom-up features + data: 'data' + restval: True + pre-extracted-features: False + +text-model: + name: 'bert' + pretrain: 'bert-base-uncased' + word-dim: 768 + extraction-hidden-layer: 6 + fine-tune: True + pre-extracted: False + layers: 0 + dropout: 0.1 + +#text-model: +# name: 'gru' +# word-dim: 300 +# fine-tune: True +# pre-extracted: False +# layers: 1 + +image-model: + name: 'bottomup' + pre-extracted-features-root: 'data/coco/features_36' + transformer-layers: 4 + dropout: 0.1 + pos-encoding: 'concat-and-process' + crop-size: 224 # not used + fine-tune: False + feat-dim: 2048 + norm: True + +model: + name: 'teran' + embed-size: 1024 + text-aggregation: 'first' + image-aggregation: 'first' + layers: 2 + exclude-stopwords: False + shared-transformer: False + dropout: 0.1 + +training: + lr: 0.00001 # 0.000006 + grad-clip: 2.0 + max-violation: True + loss-type: 'alignment' + alignment-mode: 'MrSw' + measure: 'dot' + margin: 0.2 + bs: 40 + scheduler: 'steplr' + gamma: 0.1 + step-size: 20 + warmup: null + warmup-period: 1000 diff --git a/evaluation.py b/evaluation.py index 5cf4569..e4a5d68 100644 --- a/evaluation.py +++ b/evaluation.py @@ -72,7 +72,8 @@ def tb_log(self, tb_logger, prefix='', step=None): def encode_data(model, data_loader, log_step=10, logging=print): - """Encode all images and captions loadable by `data_loader` + """ + Encode all images and captions loadable by `data_loader` """ batch_time = AverageMeter() val_logger = LogCollector() @@ -195,14 +196,14 @@ def evalrank(config, checkpoint, split='dev', fold5=False): if not fold5: # no cross-validation, full evaluation - r, rt = i2t(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, cap_batches=5) + # r, rt = i2t(img_embs, 
cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, cap_batches=5) ri, rti = t2i(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, im_batches=5) - ar = (r[0] + r[1] + r[2]) / 3 + # ar = (r[0] + r[1] + r[2]) / 3 ari = (ri[0] + ri[1] + ri[2]) / 3 - rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2] - print("rsum: %.1f" % rsum) - print("Average i2t Recall: %.1f" % ar) - print("Image to text: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % r) + #rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2] + #print("rsum: %.1f" % rsum) + # print("Average i2t Recall: %.1f" % ar) + # print("Image to text: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % r) print("Average t2i Recall: %.1f" % ari) print("Text to image: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % ri) else: @@ -391,10 +392,10 @@ def t2i(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=Fals 0] # in che posizione e' l'immagine (index) che ha questa caption (5*index + i) top50[5 * index + i] = inds[i][0:50] # calculate ndcg - if ndcg_scorer is not None: - rougel_ndcgs[5 * index + i], spice_ndcgs[5 * index + i] = \ - ndcg_scorer.compute_ndcg(npts, 5 * index + i, inds[i].astype(int), - fold_index=fold_index, retrieval='image').values() + # if ndcg_scorer is not None: + # rougel_ndcgs[5 * index + i], spice_ndcgs[5 * index + i] = \ + # ndcg_scorer.compute_ndcg(npts, 5 * index + i, inds[i].astype(int), + # fold_index=fold_index, retrieval='image').values() # Compute metrics r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks) diff --git a/inference.py b/inference.py new file mode 100644 index 0000000..6bb8dd5 --- /dev/null +++ b/inference.py @@ -0,0 +1,65 @@ +import argparse +from typing import List +from data import get_inference_loader +import torch +import yaml + +from models.teran import TERAN + + +def 
image_retrieval(checkpoint, opts, config) -> List[str]: + # load model and options + # checkpoint = torch.load(model_path) + data_path = config['dataset']['data'] + measure = config['training']['measure'] + + # construct model + model = TERAN(config) + + # load model state + model.load_state_dict(checkpoint['model'], strict=False) + + print('Loading dataset') + dataloader = get_inference_loader(config, opts, workers=4) + + return ["1", "2"] + + +def main(opts, current_config) -> List[str]: + checkpoint = torch.load(opts.checkpoint, map_location=torch.device(opts.device)) + + print('Checkpoint loaded from {}'.format(opts.checkpoint)) + loaded_config = checkpoint['config'] + + # Override some mandatory things in the configuration (paths) + if current_config is not None: + loaded_config['dataset']['images-path'] = current_config['dataset']['images-path'] + loaded_config['dataset']['data'] = current_config['dataset']['data'] + loaded_config['image-model']['pre-extracted-features-root'] = current_config['image-model'][ + 'pre-extracted-features-root'] + + top_k_results = image_retrieval(checkpoint, opts, loaded_config) + return top_k_results + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--model', type=str, help="Model (checkpoint) to load. E.g. pretrained_models/coco_MrSw.pth.tar" + , required=True) + parser.add_argument('--query', type=str, required=True) + parser.add_argument('--device', type=str, choices=['cpu', 'gpu'], default='cpu') + parser.add_argument('--num_images', type=int, default=1000) + parser.add_argument('--top_k', type=int, default=10) + parser.add_argument('--dataset', type=str, choices=['coco', 'flickr30k'], default='coco') + parser.add_argument('--config', type=str, default=None, help="Which configuration to use for overriding the " + "checkpoint configuration. 
See into 'config' folder") + + opts = parser.parse_args() + if opts.config is not None: + with open(opts.config, 'r') as yml_file: + config = yaml.load(yml_file) + else: + config = None + top_k_results = main(opts, config) + print(f"######## TOP {opts.tok_k} RESULTS ########") + print(top_k_results) diff --git a/models/teran.py b/models/teran.py index 1eeb524..f74e45a 100644 --- a/models/teran.py +++ b/models/teran.py @@ -16,7 +16,7 @@ class JointTextImageTransformerEncoder(nn.Module): """ This is a bert caption encoder - transformer image encoder (using bottomup features). - If process the encoder outputs through a transformer, like VilBERT and outputs two different graph embeddings + It process the encoder outputs through a transformer, like VilBERT and outputs two different graph embeddings """ def __init__(self, config): super().__init__() @@ -233,7 +233,7 @@ def forward_loss(self, img_emb, cap_emb, img_emb_set, cap_emb_seq, img_lengths, # bs = img_emb.shape[0] losses = {} - if 'matching' in self.config['training']['loss-type']: + if 'matching' in self.config['training']['loss-type']: matching_loss = self.matching_criterion(img_emb, cap_emb) losses.update({'matching-loss': matching_loss}) self.logger.update('matching_loss', matching_loss.item(), img_emb.size(0)) diff --git a/test.py b/test.py index 9c38df3..e3fa3f0 100644 --- a/test.py +++ b/test.py @@ -1,3 +1,4 @@ +import os import argparse import evaluation @@ -7,7 +8,7 @@ def main(opt, current_config): model_checkpoint = opt.checkpoint - checkpoint = torch.load(model_checkpoint) + checkpoint = torch.load(model_checkpoint)#, map_location=torch.device("cpu")) print('Checkpoint loaded from {}'.format(model_checkpoint)) loaded_config = checkpoint['config'] @@ -23,14 +24,18 @@ def main(opt, current_config): loaded_config['dataset']['images-path'] = current_config['dataset']['images-path'] loaded_config['dataset']['data'] = current_config['dataset']['data'] 
loaded_config['image-model']['pre-extracted-features-root'] = current_config['image-model']['pre-extracted-features-root'] + loaded_config['training']['bs'] = current_config['training']['bs'] - evaluation.evalrank(loaded_config, checkpoint, split="test", fold5=fold5) + evaluation.evalrank(loaded_config, checkpoint, split="test", fold5=False) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('checkpoint', type=str, help="Checkpoint to load") parser.add_argument('--size', type=str, choices=['1k', '5k'], default='1k') - parser.add_argument('--config', type=str, default=None, help="Which configuration to use for overriding the checkpoint configuration. See into 'config' folder") + parser.add_argument('--config', type=str, default=None, help="Which configuration to use for overriding the " + "checkpoint configuration. See into 'config' folder") + + print("CUDA_VISIBLE_DEVICES: " + os.getenv("CUDA_VISIBLE_DEVICES", "")) opt = parser.parse_args() if opt.config is not None: From 121bf5497407f31058de8d9fbd85d73d61d35b00 Mon Sep 17 00:00:00 2001 From: floschne Date: Mon, 14 Dec 2020 11:53:10 +0100 Subject: [PATCH 06/21] added flags for i2t t2i and gpu --- evaluation.py | 116 +++++++++++++++++++++++++++++++++++--------------- test.py | 22 +++++++--- 2 files changed, 98 insertions(+), 40 deletions(-) diff --git a/evaluation.py b/evaluation.py index e4a5d68..3c0854f 100644 --- a/evaluation.py +++ b/evaluation.py @@ -150,7 +150,7 @@ def encode_data(model, data_loader, log_step=10, logging=print): return img_embs, cap_embs, img_lengths, cap_lengths -def evalrank(config, checkpoint, split='dev', fold5=False): +def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i2t=False): """ Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold cross-validation is done (only for MSCOCO). 
Otherwise, the full data is @@ -196,48 +196,94 @@ def evalrank(config, checkpoint, split='dev', fold5=False): if not fold5: # no cross-validation, full evaluation - # r, rt = i2t(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, cap_batches=5) - ri, rti = t2i(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, im_batches=5) - # ar = (r[0] + r[1] + r[2]) / 3 - ari = (ri[0] + ri[1] + ri[2]) / 3 - #rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2] - #print("rsum: %.1f" % rsum) - # print("Average i2t Recall: %.1f" % ar) - # print("Image to text: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % r) - print("Average t2i Recall: %.1f" % ari) - print("Text to image: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % ri) + if eval_i2t: + r, rt = i2t(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, cap_batches=5) + ar = (r[0] + r[1] + r[2]) / 3 + print("Average i2t Recall: %.1f" % ar) + print("Image to text: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % r) + + if eval_t2i: + ri, rti = t2i(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, im_batches=5) + ari = (ri[0] + ri[1] + ri[2]) / 3 + print("Average t2i Recall: %.1f" % ari) + print("Text to image: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % ri) + + if eval_i2t and eval_t2i: + rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2] + print("rsum: %.1f" % rsum) + + + else: # 5fold cross-validation, only for MSCOCO results = [] for i in range(5): - r, rt0 = i2t(img_embs[i * 5000:(i + 1) * 5000], cap_embs[i * 5000:(i + 1) * 5000], - img_lenghts[i * 5000:(i + 1) * 5000], cap_lenghts[i * 5000:(i + 1) * 5000], - return_ranks=True, ndcg_scorer=ndcg_val_scorer, fold_index=i, 
sim_function=sim_matrix_fn, cap_batches=1) - print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f ndcg_spice=%.4f" % r) - ri, rti0 = t2i(img_embs[i * 5000:(i + 1) * 5000], cap_embs[i * 5000:(i + 1) * 5000], - img_lenghts[i * 5000:(i + 1) * 5000], cap_lenghts[i * 5000:(i + 1) * 5000], - return_ranks=True, ndcg_scorer=ndcg_val_scorer, fold_index=i, sim_function=sim_matrix_fn, im_batches=1) - if i == 0: - rt, rti = rt0, rti0 - print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % ri) - ar = (r[0] + r[1] + r[2]) / 3 - ari = (ri[0] + ri[1] + ri[2]) / 3 - rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2] - print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari)) - results += [list(r) + list(ri) + [ar, ari, rsum]] + if eval_i2t: + r, rt0 = i2t(img_embs[i * 5000:(i + 1) * 5000], cap_embs[i * 5000:(i + 1) * 5000], + img_lenghts[i * 5000:(i + 1) * 5000], cap_lenghts[i * 5000:(i + 1) * 5000], + return_ranks=True, ndcg_scorer=ndcg_val_scorer, fold_index=i, sim_function=sim_matrix_fn, cap_batches=1) + print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f ndcg_spice=%.4f" % r) + if i == 0: + rt = rt0 + ar = (r[0] + r[1] + r[2]) / 3 + if eval_t2i: + ri, rti0 = t2i(img_embs[i * 5000:(i + 1) * 5000], cap_embs[i * 5000:(i + 1) * 5000], + img_lenghts[i * 5000:(i + 1) * 5000], cap_lenghts[i * 5000:(i + 1) * 5000], + return_ranks=True, ndcg_scorer=ndcg_val_scorer, fold_index=i, sim_function=sim_matrix_fn, im_batches=1) + print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % ri) + if i == 0: + rti = rti0 + ari = (ri[0] + ri[1] + ri[2]) / 3 + + + if eval_t2i and eval_i2t: + rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2] + print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari)) + elif eval_t2i: + print("ari: %.1f" % (ari,)) + elif eval_i2t: + print("ar: %.1f" % (ar,)) + + + if eval_t2i and eval_i2t: + results += [list(r) + list(ri) + [ar, ari, rsum]] # 7 + 7 + 3 = 17 elements + elif 
eval_t2i: + results += [list(ri) + [ari]] # 7 + 1 = 8 elements + elif eval_i2t: + results += [list(r) + [ar]] # 7 + 1 = 8 elements + + print("-----------------------------------") print("Mean metrics: ") mean_metrics = tuple(np.array(results).mean(axis=0).flatten()) - print("rsum: %.1f" % (mean_metrics[16] * 6)) - print("Average i2t Recall: %.1f" % mean_metrics[14]) - print("Image to text: %.1f %.1f %.1f %.1f %.1f ndcg_rouge=%.4f ndcg_spice=%.4f" % - mean_metrics[:7]) - print("Average t2i Recall: %.1f" % mean_metrics[15]) - print("Text to image: %.1f %.1f %.1f %.1f %.1f ndcg_rouge=%.4f ndcg_spice=%.4f" % - mean_metrics[7:14]) - - torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar') + if eval_t2i and eval_i2t: + print("rsum: %.1f" % (mean_metrics[16] * 6)) + print("Average i2t Recall: %.1f" % mean_metrics[14]) + print("Image to text: %.1f %.1f %.1f %.1f %.1f ndcg_rouge=%.4f ndcg_spice=%.4f" % + mean_metrics[:7]) + print("Average t2i Recall: %.1f" % mean_metrics[15]) + print("Text to image: %.1f %.1f %.1f %.1f %.1f ndcg_rouge=%.4f ndcg_spice=%.4f" % + mean_metrics[7:14]) + elif eval_t2i: + print("Average t2i Recall: %.1f" % mean_metrics[7]) + print("Text to image: %.1f %.1f %.1f %.1f %.1f ndcg_rouge=%.4f ndcg_spice=%.4f" % + mean_metrics[:7]) + elif eval_i2t: + print("Average i2t Recall: %.1f" % mean_metrics[7]) + print("Image to text: %.1f %.1f %.1f %.1f %.1f ndcg_rouge=%.4f ndcg_spice=%.4f" % + mean_metrics[:7]) + + + + + if eval_t2i and eval_i2t: + torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar') + elif eval_t2i: + torch.save({'rti': rti}, 'ranks.pth.tar') + elif eval_i2t: + torch.save({'rt': rt}, 'ranks.pth.tar') + def i2t(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=False, ndcg_scorer=None, fold_index=0, measure='dot', sim_function=None, cap_batches=1): diff --git a/test.py b/test.py index e3fa3f0..9586784 100644 --- a/test.py +++ b/test.py @@ -1,14 +1,20 @@ -import os import argparse +import os -import evaluation -import yaml import 
torch +import yaml + +import evaluation + def main(opt, current_config): model_checkpoint = opt.checkpoint - checkpoint = torch.load(model_checkpoint)#, map_location=torch.device("cpu")) + if opt.gpu: + checkpoint = torch.load(model_checkpoint) # , map_location=torch.device("cpu")) + else: + checkpoint = torch.load(model_checkpoint, map_location=torch.device("cpu")) + print('Checkpoint loaded from {}'.format(model_checkpoint)) loaded_config = checkpoint['config'] @@ -26,12 +32,18 @@ def main(opt, current_config): loaded_config['image-model']['pre-extracted-features-root'] = current_config['image-model']['pre-extracted-features-root'] loaded_config['training']['bs'] = current_config['training']['bs'] - evaluation.evalrank(loaded_config, checkpoint, split="test", fold5=False) + evaluation.evalrank(loaded_config, checkpoint, split="test", fold5=False, eval_t2i=opt.t2i, eval_i2t=opt.i2t) + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('checkpoint', type=str, help="Checkpoint to load") parser.add_argument('--size', type=str, choices=['1k', '5k'], default='1k') + parser.add_argument('--gpu', type=bool, default=True, help="If false, CPU is used for computations; GPU otherwise.") + parser.add_argument('--t2i', type=bool, default=True, help="If true text-to-image (image retrieval) evaluation " + "will be executed.") + parser.add_argument('--i2t', type=bool, default=False, help="If true image-to-text (image captioning) evaluation " + "will be executed.") parser.add_argument('--config', type=str, default=None, help="Which configuration to use for overriding the " "checkpoint configuration. 
See into 'config' folder") From 0ed07303e89e95565427560c3c5665fa82da4a67 Mon Sep 17 00:00:00 2001 From: floschne Date: Mon, 14 Dec 2020 11:54:30 +0100 Subject: [PATCH 07/21] added timing outputs for evaluation --- configs/teran_coco_MrSw.yaml | 2 +- evaluation.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/configs/teran_coco_MrSw.yaml b/configs/teran_coco_MrSw.yaml index 5e8a440..d3458aa 100644 --- a/configs/teran_coco_MrSw.yaml +++ b/configs/teran_coco_MrSw.yaml @@ -51,7 +51,7 @@ training: alignment-mode: 'MrSw' measure: 'dot' margin: 0.2 - bs: 100 + bs: 40 scheduler: 'steplr' gamma: 0.1 step-size: 20 diff --git a/evaluation.py b/evaluation.py index 3c0854f..f66c47b 100644 --- a/evaluation.py +++ b/evaluation.py @@ -156,6 +156,8 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i cross-validation is done (only for MSCOCO). Otherwise, the full data is used for evaluation. """ + evalrank_start_time = time.time(); + # load model and options # checkpoint = torch.load(model_path) data_path = config['dataset']['data'] @@ -177,7 +179,10 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i sim_matrix_fn = AlignmentContrastiveLoss(aggregation=config['training']['alignment-mode'], return_similarity_mat=True) if config['training']['loss-type'] == 'alignment' else None print('Computing results...') + encode_data_start_time = time.time() img_embs, cap_embs, img_lenghts, cap_lenghts = encode_data(model, data_loader) + print(f"Time elapsed for encode_data: {time.time() - encode_data_start_time} seconds." 
) + torch.cuda.empty_cache() # if checkpoint2 is not None: @@ -197,17 +202,25 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i if not fold5: # no cross-validation, full evaluation if eval_i2t: + eval_i2t_start_time = time.time() + r, rt = i2t(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, cap_batches=5) ar = (r[0] + r[1] + r[2]) / 3 print("Average i2t Recall: %.1f" % ar) print("Image to text: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % r) + print(f"Time elapsed for i2t evaluation without 5-fold CV: {time.time() - eval_i2t_start_time} seconds." ) + if eval_t2i: + eval_t2i_start_time = time.time() + ri, rti = t2i(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, im_batches=5) ari = (ri[0] + ri[1] + ri[2]) / 3 print("Average t2i Recall: %.1f" % ari) print("Text to image: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % ri) + print(f"Time elapsed for i2t evaluation without 5-fold CV: {time.time() - eval_t2i_start_time} seconds.") + if eval_i2t and eval_t2i: rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2] print("rsum: %.1f" % rsum) @@ -284,6 +297,8 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i elif eval_i2t: torch.save({'rt': rt}, 'ranks.pth.tar') + print(f"Time elapsed for evalrank(): {time.time() - evalrank_start_time} seconds.") + def i2t(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=False, ndcg_scorer=None, fold_index=0, measure='dot', sim_function=None, cap_batches=1): From cc9f94df55f7f76dd4ae11e29108fa6965be31e8 Mon Sep 17 00:00:00 2001 From: floschne Date: Mon, 14 Dec 2020 16:01:27 +0100 Subject: [PATCH 08/21] fixed flags for i2t and t2i --- evaluation.py | 2 +- test.py | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/evaluation.py b/evaluation.py index 
f66c47b..e2b89e6 100644 --- a/evaluation.py +++ b/evaluation.py @@ -156,7 +156,7 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i cross-validation is done (only for MSCOCO). Otherwise, the full data is used for evaluation. """ - evalrank_start_time = time.time(); + evalrank_start_time = time.time() # load model and options # checkpoint = torch.load(model_path) diff --git a/test.py b/test.py index 9586784..123ef5c 100644 --- a/test.py +++ b/test.py @@ -29,7 +29,8 @@ def main(opt, current_config): if current_config is not None: loaded_config['dataset']['images-path'] = current_config['dataset']['images-path'] loaded_config['dataset']['data'] = current_config['dataset']['data'] - loaded_config['image-model']['pre-extracted-features-root'] = current_config['image-model']['pre-extracted-features-root'] + loaded_config['image-model']['pre-extracted-features-root'] = current_config['image-model'][ + 'pre-extracted-features-root'] loaded_config['training']['bs'] = current_config['training']['bs'] evaluation.evalrank(loaded_config, checkpoint, split="test", fold5=False, eval_t2i=opt.t2i, eval_i2t=opt.i2t) @@ -40,10 +41,10 @@ def main(opt, current_config): parser.add_argument('checkpoint', type=str, help="Checkpoint to load") parser.add_argument('--size', type=str, choices=['1k', '5k'], default='1k') parser.add_argument('--gpu', type=bool, default=True, help="If false, CPU is used for computations; GPU otherwise.") - parser.add_argument('--t2i', type=bool, default=True, help="If true text-to-image (image retrieval) evaluation " - "will be executed.") - parser.add_argument('--i2t', type=bool, default=False, help="If true image-to-text (image captioning) evaluation " - "will be executed.") + parser.add_argument('--t2i', action='store_true', default=True, + help="If set text-to-image (image retrieval) evaluation will be executed.") + parser.add_argument('--i2t', action='store_true', default=False, + help="If set image-to-text (image 
captioning) evaluation will be executed.") parser.add_argument('--config', type=str, default=None, help="Which configuration to use for overriding the " "checkpoint configuration. See into 'config' folder") @@ -55,4 +56,4 @@ def main(opt, current_config): config = yaml.load(ymlfile) else: config = None - main(opt, config) \ No newline at end of file + main(opt, config) From 6f562b1e56997d53917ee42f3985e08a15842c38 Mon Sep 17 00:00:00 2001 From: floschne Date: Mon, 28 Dec 2020 12:21:40 +0100 Subject: [PATCH 09/21] improved code readability by renaming some variable names and adding some comments --- data.py | 88 ++++++++++++++--------------- evaluate_utils/compute_relevance.py | 2 +- evaluation.py | 79 ++++++++++++++++---------- 3 files changed, 93 insertions(+), 76 deletions(-) diff --git a/data.py b/data.py index 9b988fd..2fbecbf 100644 --- a/data.py +++ b/data.py @@ -1,17 +1,15 @@ +import json as jsonmod +import os +import pickle + +import numpy as np import torch import torch.utils.data as data import torchvision.transforms as transforms -import os -import nltk +import tqdm from PIL import Image from pycocotools.coco import COCO -import numpy as np -import json as jsonmod -from collections.abc import Sequence -import shelve from transformers import BertTokenizer -import pickle -import tqdm from features import HuggingFaceTransformerExtractor @@ -82,21 +80,21 @@ def get_paths(config): class CocoDataset(data.Dataset): """COCO Custom Dataset compatible with torch.utils.data.DataLoader.""" - def __init__(self, root, json, transform=None, ids=None, get_images=True): + def __init__(self, imgs_root, captions_json, transform=None, ids=None, get_images=True): """ Args: - root: image directory. - json: coco annotation file path. + imgs_root: image directory. + captions_json: coco annotation file path. transform: transformer for image. 
""" - self.root = root + self.root = imgs_root self.get_images = get_images # when using `restval`, two json files are needed - if isinstance(json, tuple): - self.coco = (COCO(json[0]), COCO(json[1])) + if isinstance(captions_json, tuple): + self.coco = (COCO(captions_json[0]), COCO(captions_json[1])) else: - self.coco = (COCO(json),) - self.root = (root,) + self.coco = (COCO(captions_json),) + self.root = (imgs_root,) # if ids provided by get_paths, use split-specific ids if ids is None: self.ids = list(self.coco.anns.keys()) @@ -123,7 +121,7 @@ def __getitem__(self, index): return image, target, index, img_id def get_raw_item(self, index, load_image=True): - if index < self.bp: + if index < self.bp: # bp -> breakpoint to stop after N samples coco = self.coco[0] root = self.root[0] else: @@ -132,8 +130,8 @@ def get_raw_item(self, index, load_image=True): ann_id = self.ids[index] caption = coco.anns[ann_id]['caption'] img_id = coco.anns[ann_id]['image_id'] - img = coco.imgs[img_id] - img_size = np.array([img['width'], img['height']]) + img_metadata = coco.imgs[img_id] + img_size = np.array([img_metadata['width'], img_metadata['height']]) if load_image: path = coco.loadImgs(img_id)[0]['file_name'] image = Image.open(os.path.join(root, path)).convert('RGB') @@ -147,14 +145,14 @@ def __len__(self): class BottomUpFeaturesDataset: - def __init__(self, root, json, features_path, split, ids=None, **kwargs): + def __init__(self, imgs_root, captions_json, features_path, split, ids=None, **kwargs): # which dataset? 
- r = root[0] if type(root) == tuple else root + r = imgs_root[0] if type(imgs_root) == tuple else imgs_root r = r.lower() if 'coco' in r: - self.underlying_dataset = CocoDataset(root, json, ids=ids) + self.underlying_dataset = CocoDataset(imgs_root, captions_json, ids=ids) elif 'f30k' in r or 'flickr30k' in r: - self.underlying_dataset = FlickrDataset(root, json, split) + self.underlying_dataset = FlickrDataset(imgs_root, captions_json, split) # data_path = config['image-model']['data-path'] self.feats_data_path = os.path.join(features_path, 'bu_att') @@ -191,7 +189,7 @@ def __getitem__(self, index): else: target = caption # image = (img_feat, img_boxes) - return img_feat, img_boxes, target, index, img_id + return img_feat, img_boxes, target, index, img_id # target is the actual caption sentence def __len__(self): return len(self.underlying_dataset) @@ -348,31 +346,31 @@ def __call__(self, data): return img_features, targets, feat_lengths, cap_lengths, out_boxes, ids -def get_loader_single(data_name, split, root, json, transform, preextracted_root=None, +def get_loader_single(data_name, split, imgs_root, captions_json, transform, pre_extracted_root=None, batch_size=100, shuffle=True, num_workers=2, ids=None, collate_fn=None, **kwargs): """Returns torch.utils.data.DataLoader for custom coco dataset.""" if 'coco' in data_name: - if preextracted_root is not None: - dataset = BottomUpFeaturesDataset(root=root, - json=json, - features_path=preextracted_root, split=split, + if pre_extracted_root is not None: + dataset = BottomUpFeaturesDataset(imgs_root=imgs_root, + captions_json=captions_json, + features_path=pre_extracted_root, split=split, ids=ids, **kwargs) else: # COCO custom dataset - dataset = CocoDataset(root=root, - json=json, + dataset = CocoDataset(imgs_root=imgs_root, + captions_json=captions_json, transform=transform, ids=ids) elif 'f8k' in data_name or 'f30k' in data_name: - if preextracted_root is not None: - dataset = BottomUpFeaturesDataset(root=root, - 
json=json, - features_path=preextracted_root, split=split, + if pre_extracted_root is not None: + dataset = BottomUpFeaturesDataset(imgs_root=imgs_root, + captions_json=captions_json, + features_path=pre_extracted_root, split=split, ids=ids, **kwargs) else: - dataset = FlickrDataset(root=root, + dataset = FlickrDataset(root=imgs_root, split=split, - json=json, + json=captions_json, transform=transform) # Data loader @@ -385,7 +383,7 @@ def get_loader_single(data_name, split, root, json, transform, preextracted_root return data_loader -def get_transform(data_name, split_name, config): +def get_transform(data_name=None, split_name=None, config=None): normalizer = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) t_list = [] @@ -417,7 +415,7 @@ def get_loaders(config, workers, batch_size=None): roots['train']['img'], roots['train']['cap'], transform, ids=ids['train'], - preextracted_root=preextracted_root, + pre_extracted_root=preextracted_root, batch_size=batch_size, shuffle=True, num_workers=workers, collate_fn=collate_fn, config=config) @@ -427,7 +425,7 @@ def get_loaders(config, workers, batch_size=None): roots['val']['img'], roots['val']['cap'], transform, ids=ids['val'], - preextracted_root=preextracted_root, + pre_extracted_root=preextracted_root, batch_size=batch_size, shuffle=False, num_workers=workers, collate_fn=collate_fn, config=config) @@ -443,15 +441,15 @@ def get_test_loader(config, workers, split_name='test', batch_size=None): # Build Dataset Loader roots, ids = get_paths(config) - preextracted_root = config['image-model']['pre-extracted-features-root'] \ + pre_extracted_root = config['image-model']['pre-extracted-features-root'] \ if 'pre-extracted-features-root' in config['image-model'] else None transform = get_transform(data_name, split_name, config) test_loader = get_loader_single(data_name, split_name, - roots[split_name]['img'], - roots[split_name]['cap'], - transform, ids=ids[split_name], - 
preextracted_root=preextracted_root, + imgs_root=roots[split_name]['img'], + captions_json=roots[split_name]['cap'], + transform=transform, ids=ids[split_name], + pre_extracted_root=pre_extracted_root, batch_size=batch_size, shuffle=False, num_workers=workers, collate_fn=collate_fn, config=config) diff --git a/evaluate_utils/compute_relevance.py b/evaluate_utils/compute_relevance.py index aa67bf8..07c34cd 100644 --- a/evaluate_utils/compute_relevance.py +++ b/evaluate_utils/compute_relevance.py @@ -58,7 +58,7 @@ def get_dataset(config, split): data_name = config['dataset']['name'] if 'coco' in data_name: # COCO custom dataset - dataset = data.CocoDataset(root=roots[split]['img'], json=roots[split]['cap'], ids=ids[split], get_images=False) + dataset = data.CocoDataset(imgs_root=roots[split]['img'], captions_json=roots[split]['cap'], ids=ids[split], get_images=False) elif 'f8k' in data_name or 'f30k' in data_name: dataset = data.FlickrDataset(root=roots[split]['img'], split=split, json=roots[split]['cap'], get_images=False) return dataset diff --git a/evaluation.py b/evaluation.py index e2b89e6..8232174 100644 --- a/evaluation.py +++ b/evaluation.py @@ -1,16 +1,17 @@ from __future__ import print_function -import numpy - -from data import get_test_loader import time +from collections import OrderedDict + +import numpy import numpy as np import torch import tqdm -from collections import OrderedDict -from utils import dot_sim, get_model + from evaluate_utils.dcg import DCG from models.loss import order_sim, AlignmentContrastiveLoss +from utils import get_model +from data import get_test_loader class AverageMeter(object): @@ -176,12 +177,14 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i ndcg_val_scorer = DCG(config, len(data_loader.dataset), split, rank=25, relevance_methods=['rougeL', 'spice']) # initialize similarity matrix evaluator - sim_matrix_fn = AlignmentContrastiveLoss(aggregation=config['training']['alignment-mode'], 
return_similarity_mat=True) if config['training']['loss-type'] == 'alignment' else None + sim_matrix_fn = AlignmentContrastiveLoss(aggregation=config['training']['alignment-mode'], + return_similarity_mat=True) if config['training'][ + 'loss-type'] == 'alignment' else None print('Computing results...') encode_data_start_time = time.time() img_embs, cap_embs, img_lenghts, cap_lenghts = encode_data(model, data_loader) - print(f"Time elapsed for encode_data: {time.time() - encode_data_start_time} seconds." ) + print(f"Time elapsed for encode_data: {time.time() - encode_data_start_time} seconds.") torch.cuda.empty_cache() @@ -204,17 +207,32 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i if eval_i2t: eval_i2t_start_time = time.time() - r, rt = i2t(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, cap_batches=5) + r, rt = i2t(img_embs, + cap_embs, + img_lenghts, + cap_lenghts, + return_ranks=True, + ndcg_scorer=ndcg_val_scorer, + sim_function=sim_matrix_fn, + cap_batches=5) ar = (r[0] + r[1] + r[2]) / 3 print("Average i2t Recall: %.1f" % ar) print("Image to text: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % r) - print(f"Time elapsed for i2t evaluation without 5-fold CV: {time.time() - eval_i2t_start_time} seconds." 
) + print(f"Time elapsed for i2t evaluation without 5-fold CV: {time.time() - eval_i2t_start_time} seconds.") if eval_t2i: eval_t2i_start_time = time.time() - ri, rti = t2i(img_embs, cap_embs, img_lenghts, cap_lenghts, return_ranks=True, ndcg_scorer=ndcg_val_scorer, sim_function=sim_matrix_fn, im_batches=5) + ri, rti = t2i(img_embs, + cap_embs, + img_lenghts, + cap_lenghts, + return_ranks=True, + ndcg_scorer=ndcg_val_scorer, + sim_function=sim_matrix_fn, + im_batches=5) + ari = (ri[0] + ri[1] + ri[2]) / 3 print("Average t2i Recall: %.1f" % ari) print("Text to image: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % ri) @@ -234,7 +252,8 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i if eval_i2t: r, rt0 = i2t(img_embs[i * 5000:(i + 1) * 5000], cap_embs[i * 5000:(i + 1) * 5000], img_lenghts[i * 5000:(i + 1) * 5000], cap_lenghts[i * 5000:(i + 1) * 5000], - return_ranks=True, ndcg_scorer=ndcg_val_scorer, fold_index=i, sim_function=sim_matrix_fn, cap_batches=1) + return_ranks=True, ndcg_scorer=ndcg_val_scorer, fold_index=i, sim_function=sim_matrix_fn, + cap_batches=1) print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f ndcg_spice=%.4f" % r) if i == 0: rt = rt0 @@ -242,7 +261,8 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i if eval_t2i: ri, rti0 = t2i(img_embs[i * 5000:(i + 1) * 5000], cap_embs[i * 5000:(i + 1) * 5000], img_lenghts[i * 5000:(i + 1) * 5000], cap_lenghts[i * 5000:(i + 1) * 5000], - return_ranks=True, ndcg_scorer=ndcg_val_scorer, fold_index=i, sim_function=sim_matrix_fn, im_batches=1) + return_ranks=True, ndcg_scorer=ndcg_val_scorer, fold_index=i, sim_function=sim_matrix_fn, + im_batches=1) print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % ri) if i == 0: rti = rti0 @@ -257,15 +277,12 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i elif eval_i2t: print("ar: %.1f" % (ar,)) - if eval_t2i 
and eval_i2t: - results += [list(r) + list(ri) + [ar, ari, rsum]] # 7 + 7 + 3 = 17 elements + results += [list(r) + list(ri) + [ar, ari, rsum]] # 7 + 7 + 3 = 17 elements elif eval_t2i: - results += [list(ri) + [ari]] # 7 + 1 = 8 elements + results += [list(ri) + [ari]] # 7 + 1 = 8 elements elif eval_i2t: - results += [list(r) + [ar]] # 7 + 1 = 8 elements - - + results += [list(r) + [ar]] # 7 + 1 = 8 elements print("-----------------------------------") print("Mean metrics: ") @@ -343,8 +360,8 @@ def i2t(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=Fals d = d.cpu().numpy().flatten() else: for i in range(cap_batches): - captions_now = captions[i*captions_per_batch:(i+1)*captions_per_batch] - cap_lenghts_now = cap_lenghts[i*captions_per_batch:(i+1)*captions_per_batch] + captions_now = captions[i * captions_per_batch:(i + 1) * captions_per_batch] + cap_lenghts_now = cap_lenghts[i * captions_per_batch:(i + 1) * captions_per_batch] captions_now = captions_now.cuda() d_align = sim_function(im, captions_now, im_len, cap_lenghts_now) @@ -352,7 +369,7 @@ def i2t(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=Fals # d_matching = torch.mm(im[:, 0, :], captions[:, 0, :].t()) # d_matching = d_matching.cpu().numpy().flatten() if d is None: - d = d_align # + d_matching + d = d_align # + d_matching else: d = numpy.concatenate([d, d_align], axis=0) @@ -432,31 +449,33 @@ def t2i(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=Fals d = d.cpu().numpy() else: for i in range(im_batches): - ims_now = ims[i * images_per_batch:(i+1) * images_per_batch] - ims_len_now = ims_len[i * images_per_batch:(i+1) * images_per_batch] + ims_now = ims[i * images_per_batch:(i + 1) * images_per_batch] + ims_len_now = ims_len[i * images_per_batch:(i + 1) * images_per_batch] ims_now = ims_now.cuda() # d = numpy.dot(queries, ims.T) + # d_align is the (MrSw) aggregated/pooled similarity matrix A in the paper d_align = sim_function(ims_now, 
queries, ims_len_now, queries_len).t() d_align = d_align.cpu().numpy() # d_matching = torch.mm(queries[:, 0, :], ims[:, 0, :].t()) # d_matching = d_matching.cpu().numpy() if d is None: - d = d_align # + d_matching + d = d_align # + d_matching else: d = numpy.concatenate([d, d_align], axis=1) + # d contains all aggregated/pooled similarity matrices for all query-image pairs in the test set inds = numpy.zeros(d.shape) for i in range(len(inds)): inds[i] = numpy.argsort(d[i])[::-1] - ranks[5 * index + i] = numpy.where(inds[i] == index)[0][ - 0] # in che posizione e' l'immagine (index) che ha questa caption (5*index + i) + # in che posizione e' l'immagine (index) che ha questa caption (5*index + i) + ranks[5 * index + i] = numpy.where(inds[i] == index)[0][0] top50[5 * index + i] = inds[i][0:50] # calculate ndcg - # if ndcg_scorer is not None: - # rougel_ndcgs[5 * index + i], spice_ndcgs[5 * index + i] = \ - # ndcg_scorer.compute_ndcg(npts, 5 * index + i, inds[i].astype(int), - # fold_index=fold_index, retrieval='image').values() + if ndcg_scorer is not None: + rougel_ndcgs[5 * index + i], spice_ndcgs[5 * index + i] = \ + ndcg_scorer.compute_ndcg(npts, 5 * index + i, inds[i].astype(int), + fold_index=fold_index, retrieval='image').values() # Compute metrics r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks) From d0a175d713f6d899e746f090419188c7b42abf00 Mon Sep 17 00:00:00 2001 From: floschne Date: Mon, 28 Dec 2020 14:17:12 +0100 Subject: [PATCH 10/21] exiting program if CUDA_VISIBLE_DEVICES is not set --- test.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test.py b/test.py index 123ef5c..3effe87 100644 --- a/test.py +++ b/test.py @@ -1,5 +1,6 @@ import argparse import os +import sys import torch import yaml @@ -48,7 +49,9 @@ def main(opt, current_config): parser.add_argument('--config', type=str, default=None, help="Which configuration to use for overriding the " "checkpoint configuration. 
See into 'config' folder") - print("CUDA_VISIBLE_DEVICES: " + os.getenv("CUDA_VISIBLE_DEVICES", "")) + print("CUDA_VISIBLE_DEVICES: " + os.getenv("CUDA_VISIBLE_DEVICES", "NOT SET - ABORTING")) + if os.getenv("CUDA_VISIBLE_DEVICES", None) is None: + sys.exit(1) opt = parser.parse_args() if opt.config is not None: From fb63f919403f6a11f790d8a61623f95e6552cabd Mon Sep 17 00:00:00 2001 From: floschne Date: Wed, 30 Dec 2020 14:05:32 +0100 Subject: [PATCH 11/21] improved code readability by renaming some variable names and adding some comments --- data.py | 86 ++++++++++++++++------------- evaluate_utils/compute_relevance.py | 2 +- evaluation.py | 74 +++---------------------- models/text.py | 9 ++- utils.py | 58 +++++++++++++++++++ 5 files changed, 122 insertions(+), 107 deletions(-) diff --git a/data.py b/data.py index 2fbecbf..9aa4fde 100644 --- a/data.py +++ b/data.py @@ -15,23 +15,25 @@ def get_paths(config): + # noinspection PyIncorrectDocstring + # noinspection PyUnresolvedReferences """ - Returns paths to images and annotations for the given datasets. For MSCOCO - indices are also returned to control the data split being used. - The indices are extracted from the Karpathy et al. splits using this - snippet: - - >>> import json - >>> dataset=json.load(open('dataset_coco.json','r')) - >>> A=[] - >>> for i in range(len(D['images'])): - ... if D['images'][i]['split'] == 'val': - ... A+=D['images'][i]['sentids'][:5] - ... - - :param name: Dataset names - :param use_restval: If True, the the `restval` data is included in train. - """ + Returns paths to images and annotations for the given datasets. For MSCOCO + indices are also returned to control the data split being used. + The indices are extracted from the Karpathy et al. splits using this + snippet: + + >>> import json + >>> dataset=json.load(open('dataset_coco.json','r')) + >>> A=[] + >>> for i in range(len(D['images'])): + ... if D['images'][i]['split'] == 'val': + ... A+=D['images'][i]['sentids'][:5] + ... 
+ + :param name: Dataset names + :param use_restval: If True, the the `restval` data is included in train. + """ name = config['dataset']['name'] annotations_path = os.path.join(config['dataset']['data'], name, 'annotations') use_restval = config['dataset']['restval'] @@ -62,7 +64,8 @@ def get_paths(config): ids['test'] = np.load(os.path.join(annotations_path, 'coco_test_ids.npy')) ids['trainrestval'] = ( ids['train'], - np.load(os.path.join(annotations_path, 'coco_restval_ids.npy'))) + np.load(os.path.join(annotations_path, 'coco_restval_ids.npy')) + ) if use_restval: roots['train'] = roots['trainrestval'] ids['train'] = ids['trainrestval'] @@ -80,7 +83,7 @@ def get_paths(config): class CocoDataset(data.Dataset): """COCO Custom Dataset compatible with torch.utils.data.DataLoader.""" - def __init__(self, imgs_root, captions_json, transform=None, ids=None, get_images=True): + def __init__(self, imgs_root, captions_json, transform=None, coco_annotation_ids=None, get_images=True): """ Args: imgs_root: image directory. 
@@ -96,17 +99,17 @@ def __init__(self, imgs_root, captions_json, transform=None, ids=None, get_image self.coco = (COCO(captions_json),) self.root = (imgs_root,) # if ids provided by get_paths, use split-specific ids - if ids is None: - self.ids = list(self.coco.anns.keys()) + if coco_annotation_ids is None: + self.annotation_ids = list(self.coco[0].anns.keys()) else: - self.ids = ids + self.annotation_ids = coco_annotation_ids # if `restval` data is to be used, record the break point for ids - if isinstance(self.ids, tuple): - self.bp = len(self.ids[0]) - self.ids = list(self.ids[0]) + list(self.ids[1]) + if isinstance(self.annotation_ids, tuple): + self.bp = len(self.annotation_ids[0]) + self.annotation_ids = list(self.annotation_ids[0]) + list(self.annotation_ids[1]) else: - self.bp = len(self.ids) + self.bp = len(self.annotation_ids) self.transform = transform def __getitem__(self, index): @@ -127,7 +130,7 @@ def get_raw_item(self, index, load_image=True): else: coco = self.coco[1] root = self.root[1] - ann_id = self.ids[index] + ann_id = self.annotation_ids[index] caption = coco.anns[ann_id]['caption'] img_id = coco.anns[ann_id]['image_id'] img_metadata = coco.imgs[img_id] @@ -141,7 +144,7 @@ def get_raw_item(self, index, load_image=True): return root, caption, img_id, None, None, img_size def __len__(self): - return len(self.ids) + return len(self.annotation_ids) class BottomUpFeaturesDataset: @@ -150,7 +153,7 @@ def __init__(self, imgs_root, captions_json, features_path, split, ids=None, **k r = imgs_root[0] if type(imgs_root) == tuple else imgs_root r = r.lower() if 'coco' in r: - self.underlying_dataset = CocoDataset(imgs_root, captions_json, ids=ids) + self.underlying_dataset = CocoDataset(imgs_root, captions_json, coco_annotation_ids=ids) elif 'f30k' in r or 'flickr30k' in r: self.underlying_dataset = FlickrDataset(imgs_root, captions_json, split) @@ -275,12 +278,12 @@ def __call__(self, data): Returns: images: torch tensor of shape (batch_size, 3, 256, 
256). - targets: torch tensor of shape (batch_size, padded_length). + targets: torch tensor of shape (batch_size, padded_length). -> the textual tokens lengths: list; valid length for each padded caption. """ # Sort a data list by caption length # data.sort(key=lambda x: len(x[1]), reverse=True) - if len(data[0]) == 5: # TODO: find a better way to distinguish the two + if len(data[0]) == 5: # TODO: find a better way to distinguish the two images, boxes, captions, ids, img_ids = zip(*data) elif len(data[0]) == 4: images, captions, ids, img_ids = zip(*data) @@ -294,14 +297,17 @@ def __call__(self, data): cap_features = [torch.FloatTensor(f) for f in cap_features] wembeddings = [torch.FloatTensor(w) for w in wembeddings] else: - if self.vocab_type == 'bert': + if self.vocab_type == 'bert': cap_lengths = [len(self.tokenizer.tokenize(c)) + 2 for c in - captions] # + 2 in order to account for begin and end tokens + captions] # + 2 in order to account for begin and end tokens max_len = max(cap_lengths) - captions_ids = [torch.LongTensor(self.tokenizer.encode(c, max_length=max_len, pad_to_max_length=True)) - for c in captions] + captions_token_ids = [torch.LongTensor(self.tokenizer.encode(c, + max_length=max_len, + padding='max_length', + truncation=True)) + for c in captions] - captions = captions_ids + captions = captions_token_ids # caption_ids are the token ids from bert tokenizer # Merge images (convert tuple of 3D tensor to 4D tensor) preextracted_images = not (images[0].shape[0] == 3) if not preextracted_images: @@ -337,12 +343,18 @@ def __call__(self, data): targets = torch.zeros(len(captions), max(cap_lengths)).long() for i, cap in enumerate(captions): end = cap_lengths[i] - targets[i, :end] = cap[:end] + targets[i, :end] = cap[:end] #caption token ids if not preextracted_images: return images, targets, None, cap_lengths, None, ids else: # features = features.permute(0, 2, 1) + # img_features -> from FRCNN >> B x 2048 + # targets -> padded caption token ids from 
BERT >> B x max_len(cap_lengths) or(queries) + # feat_lengths -> num of regions in the image (fixed to 36 + 1) >> B x 37 + # cap_lengths -> true length of the non-padded captions or queries >> B x 1 (list of len B) + # out_boxes -> spatial information of the region boxes >> B x 37 x 4 + # ids -> dataset indices wich are in this batch >> 1 x B (tuple of len B) return img_features, targets, feat_lengths, cap_lengths, out_boxes, ids @@ -360,7 +372,7 @@ def get_loader_single(data_name, split, imgs_root, captions_json, transform, pre # COCO custom dataset dataset = CocoDataset(imgs_root=imgs_root, captions_json=captions_json, - transform=transform, ids=ids) + transform=transform, coco_annotation_ids=ids) elif 'f8k' in data_name or 'f30k' in data_name: if pre_extracted_root is not None: dataset = BottomUpFeaturesDataset(imgs_root=imgs_root, diff --git a/evaluate_utils/compute_relevance.py b/evaluate_utils/compute_relevance.py index 07c34cd..ff2de4b 100644 --- a/evaluate_utils/compute_relevance.py +++ b/evaluate_utils/compute_relevance.py @@ -58,7 +58,7 @@ def get_dataset(config, split): data_name = config['dataset']['name'] if 'coco' in data_name: # COCO custom dataset - dataset = data.CocoDataset(imgs_root=roots[split]['img'], captions_json=roots[split]['cap'], ids=ids[split], get_images=False) + dataset = data.CocoDataset(imgs_root=roots[split]['img'], captions_json=roots[split]['cap'], coco_annotation_ids=ids[split], get_images=False) elif 'f8k' in data_name or 'f30k' in data_name: dataset = data.FlickrDataset(root=roots[split]['img'], split=split, json=roots[split]['cap'], get_images=False) return dataset diff --git a/evaluation.py b/evaluation.py index 8232174..63b5313 100644 --- a/evaluation.py +++ b/evaluation.py @@ -10,66 +10,8 @@ from evaluate_utils.dcg import DCG from models.loss import order_sim, AlignmentContrastiveLoss -from utils import get_model -from data import get_test_loader - - -class AverageMeter(object): - """Computes and stores the average and 
current value""" - - def __init__(self): - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=0): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / (.0001 + self.count) - - def __str__(self): - """String representation for logging - """ - # for values that should be recorded exactly e.g. iteration number - if self.count == 0: - return str(self.val) - # for stats - return '%.4f (%.4f)' % (self.val, self.avg) - - -class LogCollector(object): - """A collection of logging objects that can change from train to val""" - - def __init__(self): - # to keep the order of logged variables deterministic - self.meters = OrderedDict() - - def update(self, k, v, n=0): - # create a new meter if previously not recorded - if k not in self.meters: - self.meters[k] = AverageMeter() - self.meters[k].update(v, n) - - def __str__(self): - """Concatenate the meters in one log line - """ - s = '' - for i, (k, v) in enumerate(self.meters.items()): - if i > 0: - s += ' ' - s += k + ' ' + str(v) - return s - - def tb_log(self, tb_logger, prefix='', step=None): - """Log using tensorboard - """ - for k, v in self.meters.items(): - tb_logger.add_scalar(prefix + k, v.val, global_step=step) +from utils import get_model, AverageMeter, LogCollector +from data import get_coco_image_retrieval_data_loader, get_test_loader def encode_data(model, data_loader, log_step=10, logging=print): @@ -108,14 +50,13 @@ def encode_data(model, data_loader, log_step=10, logging=print): else: text = targets captions = targets - wembeddings = model.img_txt_enc.txt_enc.word_embeddings(captions.cuda() if torch.cuda.is_available() else captions) # compute the embeddings with torch.no_grad(): _, _, img_emb, cap_emb, cap_length = model.forward_emb(images, text, img_length, cap_length, boxes) # initialize the numpy arrays given the size of the embeddings - if img_embs is None: + if img_embs is None: # N x max_len x 1024 
img_embs = torch.zeros((len(data_loader.dataset), max_img_len, img_emb.size(2))) cap_embs = torch.zeros((len(data_loader.dataset), max_cap_len, cap_emb.size(2))) @@ -237,7 +178,7 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i print("Average t2i Recall: %.1f" % ari) print("Text to image: %.1f %.1f %.1f %.1f %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % ri) - print(f"Time elapsed for i2t evaluation without 5-fold CV: {time.time() - eval_t2i_start_time} seconds.") + print(f"Time elapsed for t2i evaluation without 5-fold CV: {time.time() - eval_t2i_start_time} seconds.") if eval_i2t and eval_t2i: rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2] @@ -317,8 +258,8 @@ def evalrank(config, checkpoint, split='dev', fold5=False, eval_t2i=True, eval_i print(f"Time elapsed for evalrank(): {time.time() - evalrank_start_time} seconds.") - -def i2t(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=False, ndcg_scorer=None, fold_index=0, measure='dot', sim_function=None, cap_batches=1): +def i2t(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=False, ndcg_scorer=None, fold_index=0, + measure='dot', sim_function=None, cap_batches=1): """ Images->Text (Image Annotation) Images: (5N, K) matrix of images @@ -404,7 +345,8 @@ def i2t(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=Fals return (r1, r5, r10, medr, meanr, mean_rougel_ndcg, mean_spice_ndcg) -def t2i(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=False, ndcg_scorer=None, fold_index=0, measure='dot', sim_function=None, im_batches=1): +def t2i(images, captions, img_lenghts, cap_lenghts, npts=None, return_ranks=False, ndcg_scorer=None, fold_index=0, + measure='dot', sim_function=None, im_batches=1): """ Text->Images (Image Search) Images: (5N, K) matrix of images diff --git a/models/text.py b/models/text.py index 0dac895..10d23b0 100644 --- a/models/text.py +++ b/models/text.py @@ -58,7 +58,7 @@ def forward(self, x, 
lengths): # Reshape *final* output to (batch_size, hidden_size) padded = pad_packed_sequence(out, batch_first=True) I = torch.LongTensor(lengths).view(-1, 1, 1) - I = (I.expand(x.size(0), 1, self.embed_size)-1).to(x.device) + I = (I.expand(x.size(0), 1, self.embed_size) - 1).to(x.device) out = torch.gather(padded[0], 1, I).squeeze(1) # normalization in the joint embedding space @@ -105,6 +105,8 @@ def forward(self, x, lengths): lengths: tensor of lengths (LongTensor) of size B ''' if not self.preextracted or self.post_transformer_layers > 0: + # this code builds the attention_mask so that its 1 for every valid token and pads 0 for the max len + # attention_mask is a kinda padding max_len = max(lengths) attention_mask = torch.ones(x.shape[0], max_len) for e, l in zip(attention_mask, lengths): @@ -115,7 +117,8 @@ def forward(self, x, lengths): outputs = x else: outputs = self.bert_model(x, attention_mask=attention_mask) - outputs = outputs[2][-1] + # https://huggingface.co/transformers/model_doc/bert.html#bertmodel + outputs = outputs[2][-1] # -> hidden_states[-1] if self.post_transformer_layers > 0: outputs = outputs.permute(1, 0, 2) @@ -124,7 +127,7 @@ def forward(self, x, lengths): if self.mean: x = outputs.mean(dim=1) else: - x = outputs[:, 0, :] # from the last layer take only the first word + x = outputs[:, 0, :] # from the last layer take only the first word out = self.map(x) diff --git a/utils.py b/utils.py index 1f2cb6a..6b46f17 100644 --- a/utils.py +++ b/utils.py @@ -16,3 +16,61 @@ def cosine_sim(x, y): x = x / numpy.expand_dims(numpy.linalg.norm(x, axis=1), 1) y = y / numpy.expand_dims(numpy.linalg.norm(y, axis=1), 1) return numpy.dot(x, y.T) + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=0): + self.val = val + self.sum += val * n + self.count += n + self.avg = 
self.sum / (.0001 + self.count) + + def __str__(self): + """String representation for logging + """ + # for values that should be recorded exactly e.g. iteration number + if self.count == 0: + return str(self.val) + # for stats + return '%.4f (%.4f)' % (self.val, self.avg) + + +class LogCollector(object): + """A collection of logging objects that can change from train to val""" + + def __init__(self): + # to keep the order of logged variables deterministic + self.meters = OrderedDict() + + def update(self, k, v, n=0): + # create a new meter if previously not recorded + if k not in self.meters: + self.meters[k] = AverageMeter() + self.meters[k].update(v, n) + + def __str__(self): + """Concatenate the meters in one log line + """ + s = '' + for i, (k, v) in enumerate(self.meters.items()): + if i > 0: + s += ' ' + s += k + ' ' + str(v) + return s + + def tb_log(self, tb_logger, prefix='', step=None): + """Log using tensorboard + """ + for k, v in self.meters.items(): + tb_logger.add_scalar(prefix + k, v.val, global_step=step) From 42378158f329d7959507f06a7a0191484ab86849 Mon Sep 17 00:00:00 2001 From: floschne Date: Wed, 30 Dec 2020 14:09:25 +0100 Subject: [PATCH 12/21] first working (but not fully optimized) IR Inference --- __init__.py | 1 + configs/teran_coco_MrSw_IR.yaml | 63 ++++++++++ data.py | 204 ++++++++++++++++++++++++++++++- evaluation.py | 1 - inference.py | 210 ++++++++++++++++++++++++++------ utils.py | 2 + 6 files changed, 444 insertions(+), 37 deletions(-) create mode 100644 __init__.py create mode 100644 configs/teran_coco_MrSw_IR.yaml diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..705e854 --- /dev/null +++ b/__init__.py @@ -0,0 +1 @@ +from .data import d \ No newline at end of file diff --git a/configs/teran_coco_MrSw_IR.yaml b/configs/teran_coco_MrSw_IR.yaml new file mode 100644 index 0000000..6d650c7 --- /dev/null +++ b/configs/teran_coco_MrSw_IR.yaml @@ -0,0 +1,63 @@ +dataset: + name: 'coco' + images-path: 
'data/coco/images' # not needed if using pre-extracted bottom-up features + data: 'data' + restval: True + pre-extracted-features: False + +image-retrieval: + dataset: 'coco' # for now only coco support + split: 'test' # we can remove this in later versions + num_imgs: 5000 + batch_size: 100 + pre-extracted-img-features-root: 'data/coco/features_36' + create_query_batch: True + alignment_mode: 'MrSw' + + + +text-model: + name: 'bert' + pretrain: 'bert-base-uncased' + word-dim: 768 + extraction-hidden-layer: 6 + fine-tune: True + pre-extracted: False + layers: 0 + dropout: 0.1 + +image-model: + name: 'bottomup' + pre-extracted-features-root: 'data/coco/features_36' + transformer-layers: 4 + dropout: 0.1 + pos-encoding: 'concat-and-process' + crop-size: 224 # not used + fine-tune: False + feat-dim: 2048 + norm: True + +model: + name: 'teran' + embed-size: 1024 + text-aggregation: 'first' + image-aggregation: 'first' + layers: 2 + exclude-stopwords: False + shared-transformer: False + dropout: 0.1 + +training: + lr: 0.00001 # 0.000006 + grad-clip: 2.0 + max-violation: True + loss-type: 'alignment' + alignment-mode: 'MrSw' + measure: 'dot' + margin: 0.2 + bs: 40 + scheduler: 'steplr' + gamma: 0.1 + step-size: 20 + warmup: null + warmup-period: 1000 diff --git a/data.py b/data.py index 9aa4fde..2a2505c 100644 --- a/data.py +++ b/data.py @@ -147,6 +147,65 @@ def __len__(self): return len(self.annotation_ids) +class CocoImageRetrievalDataset: + """ + Custom COCO Dataset that uses only the images together with a user query. + Compatible with torch.utils.data.DataLoader. 
+ """ + + def __init__(self, imgs_root, img_features_path, captions_json, coco_annotation_ids, query, num_imgs): + self.query = query + self.num_imgs = num_imgs + self.feats_data_path = os.path.join(img_features_path, 'bu_att') + self.box_data_path = os.path.join(img_features_path, 'bu_box') + self.imgs_root = imgs_root + + self.coco = COCO(captions_json) + self.anno_ids = coco_annotation_ids + + def __getitem__(self, idx): + """ + This function returns a tuple that is further passed to collate_fn + """ + img_id, img_size = self.get_raw_item(idx) + + img_feat_path = os.path.join(self.feats_data_path, '{}.npz'.format(img_id)) + img_box_path = os.path.join(self.box_data_path, '{}.npy'.format(img_id)) + + img_feat = np.load(img_feat_path)['feat'] + img_feat_box = np.load(img_box_path) + + # normalize box + img_feat_box = img_feat_box / np.tile(img_size, 2) + + img_feat = torch.Tensor(img_feat) + img_feat_box = torch.Tensor(img_feat_box) + + # we always return the query here since we want to compute the similarity of each image with the query + # this output is the input of the CollateFn + return img_feat, img_feat_box, img_id, self.query, idx + + def get_raw_item(self, idx): + next_img_idx = idx * 5 # in the coco dataset there are 5 captions for every image + ann_id = self.anno_ids[next_img_idx] + img_id = self.coco.anns[ann_id]['image_id'] + img_metadata = self.coco.imgs[img_id] + img_size = np.array([img_metadata['width'], img_metadata['height']]) + + return img_id, img_size + + def get_image_metadata(self, idx): + # TODO can't we just get coco.imgs[idx'] somehow? 
+ next_img_idx = idx * 5 # in the coco dataset there are 5 captions for every image + ann_id = self.anno_ids[next_img_idx] + img_id = self.coco.anns[ann_id]['image_id'] + img_metadata = self.coco.imgs[img_id] + return img_metadata + + def __len__(self): + return self.num_imgs + + class BottomUpFeaturesDataset: def __init__(self, imgs_root, captions_json, features_path, split, ids=None, **kwargs): # which dataset? @@ -257,12 +316,112 @@ def get_raw_item(self, index, load_image=True): else: return root, caption, img_id, None, None, img_size - - def __len__(self): return len(self.ids) +class InferenceCollate(object): + def __new__(cls, *args, **kwargs): + # we only need to compute this once so it gets stored in a static class variable + cls.query_token_ids = None + cls.query_length = None + cls.img_feat_length = None + cls.img_feat_dim = None + cls.bboxes_length = None + cls.bboxes_dim = None + + return super(InferenceCollate, cls).__new__(cls) + + def __init__(self, config): + self.vocab_type = str(config['text-model']['name']).lower() + self.create_query_batch = bool(config['image-retrieval']['create_query_batch']) + if self.vocab_type == 'bert': + self.tokenizer = BertTokenizer.from_pretrained(config['text-model']['pretrain']) + else: + raise ValueError("Currently only BERT Tokenizer is supported!") + + @classmethod + def set_query_token_ids(cls, query_token_ids): + cls.query_token_ids = query_token_ids + cls.query_length = len(query_token_ids) + + @classmethod + def set_img_feat_length_and_dimension(cls, img_feat): + # +1 because the first region feature is reserved as CLS + cls.img_feat_length = img_feat.shape[0] + 1 + cls.img_feat_dim = img_feat.shape[1] + + @classmethod + def set_bboxes_length_and_dimension(cls, bbox): + # +1 because the first region feature is reserved as CLS + cls.bboxes_length = bbox.shape[0] + 1 + cls.bboxes_dim = bbox.shape[1] + + def __call__(self, data): + img_feats, img_feat_bboxes, img_ids, queries, dataset_indices = zip(*data) + """ + 
Build batch tensors from a list of (img_feats, img_feat_boxes, img_ids, queries, dataset_indices) tuples. + Args: + - img_feats: + - img_feat_bboxes: + - img_ids: + - queries: + - dataset_indices: + + Returns: + - img_feature_batch: batch of image features + - img_feat_bboxes_batch: batch of bounding boxes of the image features + - img_feat_length: length of the image features and bounding boxes (all of same size) + - query_token_ids: bert token ids of the tokenized query + - query_length: length of the query + - dataset_indices: indices of the elements of the datasets inside the batch. + """ + + # encode (tokenize) the query + if self.query_token_ids is None: + # we don't need to pad or truncate since we only have a single query + # TODO actually we don't even need the tokenizer twice so we could just use a local variable + query_token_ids = torch.LongTensor(self.tokenizer.encode(queries[0])) + self.set_query_token_ids(query_token_ids) + + # prepare image features + if self.img_feat_length is None: + self.set_img_feat_length_and_dimension(img_feats[0]) + + # prepare bounding boxes + if self.bboxes_length is None: + self.set_bboxes_length_and_dimension(img_feat_bboxes[0]) + + assert self.bboxes_length == self.img_feat_length + + # create the image feature batch + batch_size = len(img_feats) + img_feature_batch = torch.zeros(batch_size, self.img_feat_length, self.img_feat_dim) + for i, f in enumerate(img_feats): + # reserve the first token as CLS + img_feature_batch[i, 1:] = f + + # create the image features bounding boxes batch + img_feat_bboxes_batch = torch.zeros(batch_size, self.bboxes_length, self.bboxes_dim) + for i, box in enumerate(img_feat_bboxes): + img_feat_bboxes_batch[i, 1:] = box + + if self.create_query_batch: + # create the query batch + # since the token id is a scalar, the dim is 1 and whe don't need to add it to the batch + # for the BERT embeddings the ids have to be Long + query_batch = torch.zeros(batch_size, self.query_length).long() + for i 
in range(len(queries)): + query_batch[i] = self.query_token_ids + + query_lengths = [self.query_length for _ in range(batch_size)] + img_feat_lengths = [self.img_feat_length for _ in range(batch_size)] + + return img_feature_batch, img_feat_bboxes_batch, img_feat_lengths, query_batch, query_lengths, dataset_indices + else: + return img_feature_batch, img_feat_bboxes_batch, self.img_feat_length, self.query_token_ids, self.query_length, dataset_indices + + class Collate: def __init__(self, config): self.vocab_type = config['text-model']['name'] @@ -445,6 +604,47 @@ def get_loaders(config, workers, batch_size=None): return train_loader, val_loader +def get_coco_image_retrieval_data_loader(config, workers, query): + # create the dataset + loader + # 1) load / create a Coco Dataset to get meta info about images (we could also do this by hand) + # 2) choose (the first) N images and create a dataset with N samples where each sample consists of the n-th image + # and the query (gets repeated N times) # TODO maybe this is not necessary + + # get the directories that contain the coco json files and coco annotation ids (which we may not need, I think) + roots, coco_annotation_ids = get_paths(config) + + dataset_name = config['image-retrieval']['dataset'] + batch_size = config['image-retrieval']['batch_size'] + split_name = config['image-retrieval']['split'] + + imgs_root = roots[split_name]['img'] + + # for images we use pre-extracted features (not for text) + pre_extracted_img_features_root = config['image-retrieval']['pre-extracted-img-features-root'] + + captions_json = roots[split_name]['cap'] + coco_annotation_ids = coco_annotation_ids[split_name] + num_imgs = config['image-retrieval']['num_imgs'] + + dataset = CocoImageRetrievalDataset(imgs_root=imgs_root, + img_features_path=pre_extracted_img_features_root, + captions_json=captions_json, + coco_annotation_ids=coco_annotation_ids, + query=query, + num_imgs=num_imgs) + + # basically this creates the mini-batches which 
get passed to the model + collate_fn = InferenceCollate(config) + data_loader = torch.utils.data.DataLoader(dataset=dataset, + batch_size=batch_size, + shuffle=False, + pin_memory=True, + num_workers=workers, + collate_fn=collate_fn) + + return data_loader + + def get_test_loader(config, workers, split_name='test', batch_size=None): data_name = config['dataset']['name'] if batch_size is None: diff --git a/evaluation.py b/evaluation.py index 63b5313..f164cf0 100644 --- a/evaluation.py +++ b/evaluation.py @@ -1,7 +1,6 @@ from __future__ import print_function import time -from collections import OrderedDict import numpy import numpy as np diff --git a/inference.py b/inference.py index 6bb8dd5..035dc05 100644 --- a/inference.py +++ b/inference.py @@ -1,13 +1,139 @@ import argparse -from typing import List -from data import get_inference_loader +import os +import sys +import time +from typing import List, Any, Dict + +import numpy as np import torch +import tqdm import yaml +from data import get_coco_image_retrieval_data_loader +from models.loss import AlignmentContrastiveLoss from models.teran import TERAN +from utils import AverageMeter, LogCollector + + +def encode_data_for_inference(model: TERAN, data_loader, log_step=10, logging=print): + # compute the embedding vectors v_i, s_j (paper) for each image region and word respectively + # -> forwarding the data through the respective TE stacks + print('Computing image and query embeddings...') + encode_data_start_time = time.time() + + batch_time = AverageMeter() + val_logger = LogCollector() + + # we don't need autograd for inference + model.eval() + + # array to keep all the embeddings + # TODO maybe we can store those embeddings in an index and load it instead of computing each time for each query + query_embs = None + num_query_feats = None + num_img_feats = None # all images have a fixed size of pre-extracted features of 36 + 1 regions + img_embs = None + + start_time = time.time() + for i, (img_feature_batch, 
img_feat_bboxes_batch, img_feat_lengths, query_token_ids, query_lengths, + dataset_indices) in enumerate(data_loader): + + # make sure val logger is used + model.logger = val_logger + + # TODO + # in the first version just stack the query_token_ids, img_feat_length and query_length + # so that it has shape B x ? x ?, where B is len(img_feature_batch) (should be equal to bs set in the config) + # + # in the second version adapt model.forward_emb so that the embeddings get only computed once and then stacked + # to the same size as the img_embs + + # make sure val logger is used + model.logger = val_logger + + # compute the embeddings + with torch.no_grad(): + # TODO inside model.forward_emb we have to adapt the code for only a single query so that it doesn't get + # computed each time + _, _, img_emb, query_emb, _ = model.forward_emb(img_feature_batch, + query_token_ids, + img_feat_lengths, + query_lengths, + img_feat_bboxes_batch) + + # initialize the arrays given the size of the embeddings + if img_embs is None: + num_img_feats = img_feat_lengths[0] if isinstance(img_feat_lengths, list) else img_feat_lengths + num_query_feats = query_lengths[0] if isinstance(query_lengths, list) else query_lengths + img_feat_dim = img_emb.size(2) + query_feat_dim = query_emb.size(2) + img_embs = torch.zeros((len(data_loader.dataset), num_img_feats, img_feat_dim)) + query_embs = torch.zeros((len(data_loader.dataset), num_query_feats, query_feat_dim)) + + # preserve the embeddings by copying from gpu and converting to numpy + img_embs[dataset_indices, :, :] = img_emb.cpu().permute(1, 0, 2) + query_embs[dataset_indices, :, :] = query_emb.cpu().permute(1, 0, 2) + + # measure elapsed time per batch + batch_time.update(time.time() - start_time) + start_time = time.time() + + if i % log_step == 0: + logging( + f"Batch: [{i}/{len(data_loader)}]\t{str(model.logger)}\tTime {batch_time.val:.3f} ({batch_time.avg:.3f})") + del img_feature_batch, query_token_ids + + print(f"Time elapsed to 
encode data: {time.time() - encode_data_start_time} seconds.") + return img_embs, query_embs, num_img_feats, num_query_feats + +def compute_distance_sorted_indices(img_embs, query_embs, img_lengths, query_lengths, config): + # initialize similarity matrix evaluator + sim_matrix_fn = AlignmentContrastiveLoss(aggregation=config['image-retrieval']['alignment_mode'], + return_similarity_mat=True) + start_time = time.time() + img_embs_per_batch = 1000 # TODO config variable + img_emb_batches = 5 # TODO config / calc -def image_retrieval(checkpoint, opts, config) -> List[str]: + num_img_embs = img_embs.shape[0] + + # distances storage + distances = None + + # since its always the same query we can reuse the batch + # (TODO maybe we can even just use a batch of size 1?! -> check the sim_matrix_fn) + query_emb_batch = query_embs[:1] + query_length_batch = [query_lengths[0] if isinstance(query_lengths, list) else query_lengths for _ in range(1)] + query_emb_batch.cuda() + + # batch-wise compute the alignment distance between the images and the query + for i in tqdm.trange(img_emb_batches): + # create the current batch + img_embs_batch = img_embs[i * img_embs_per_batch:(i+1) * img_embs_per_batch] + img_embs_length_batch = [img_lengths for _ in range(img_embs_per_batch)] + img_embs_batch.cuda() + + # compute and pool the similarity matrices to get the global distance between the image and the query + alignment_distance = sim_matrix_fn(img_embs_batch, query_emb_batch, img_embs_length_batch, query_length_batch) + alignment_distance = alignment_distance.t().cpu().numpy() + + # store the distances + if distances is None: + distances = alignment_distance + else: + distances = np.concatenate([distances, alignment_distance], axis=1) + + # get the img indices descended sorted by the distance matrix + sorted_distance_indices = np.argsort(distances.squeeze())[::-1] + print(f"Time elapsed to compute and pool the similarity matrices: {time.time() - start_time} seconds.") + return 
sorted_distance_indices + + +def get_image_names(top_k_indices, data_loader) -> List[str]: + return [data_loader.dataset.get_image_metadata(idx)['file_name'] for idx in top_k_indices] + + +def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: # load model and options # checkpoint = torch.load(model_path) data_path = config['dataset']['data'] @@ -20,46 +146,62 @@ def image_retrieval(checkpoint, opts, config) -> List[str]: model.load_state_dict(checkpoint['model'], strict=False) print('Loading dataset') - dataloader = get_inference_loader(config, opts, workers=4) + data_loader = get_coco_image_retrieval_data_loader(config, + query=opts.query, + workers=opts.num_data_workers) + + # encode the data (i.e. compute the embeddings / TE outputs for the images and query) + img_embs, cap_embs, img_lengths, cap_lengths = encode_data_for_inference(model, data_loader) - return ["1", "2"] + torch.cuda.empty_cache() + print(f"Images: {img_embs.shape[0]}, Captions: {cap_embs.shape[0]}") + # compute the matching scores + distance_sorted_indices = compute_distance_sorted_indices(img_embs, cap_embs, img_lengths, cap_lengths, config) + top_k_indices = distance_sorted_indices[:opts.top_k] -def main(opts, current_config) -> List[str]: - checkpoint = torch.load(opts.checkpoint, map_location=torch.device(opts.device)) + # get the image names + top_k_images = get_image_names(top_k_indices, data_loader) + return top_k_images - print('Checkpoint loaded from {}'.format(opts.checkpoint)) - loaded_config = checkpoint['config'] - # Override some mandatory things in the configuration (paths) - if current_config is not None: - loaded_config['dataset']['images-path'] = current_config['dataset']['images-path'] - loaded_config['dataset']['data'] = current_config['dataset']['data'] - loaded_config['image-model']['pre-extracted-features-root'] = current_config['image-model'][ - 'pre-extracted-features-root'] +def prepare_model_checkpoint_and_config(opts): + checkpoint = 
torch.load(opts.model, map_location=torch.device(opts.device)) + print('Checkpoint loaded from {}'.format(opts.model)) + model_checkpoint_config = checkpoint['config'] - top_k_results = image_retrieval(checkpoint, opts, loaded_config) - return top_k_results + with open(opts.config, 'r') as yml_file: + loaded_config = yaml.load(yml_file) + # Override some mandatory things in the configuration + model_checkpoint_config['dataset']['images-path'] = loaded_config['dataset']['images-path'] + model_checkpoint_config['dataset']['data'] = loaded_config['dataset']['data'] + model_checkpoint_config['image-retrieval'] = loaded_config['image-retrieval'] + + return model_checkpoint_config, checkpoint if __name__ == '__main__': + print("CUDA_VISIBLE_DEVICES: " + os.getenv("CUDA_VISIBLE_DEVICES", "NOT SET - ABORTING")) + if os.getenv("CUDA_VISIBLE_DEVICES", None) is None: + sys.exit(1) + parser = argparse.ArgumentParser() - parser.add_argument('--model', type=str, help="Model (checkpoint) to load. E.g. pretrained_models/coco_MrSw.pth.tar" - , required=True) + parser.add_argument('--model', type=str, + help="Model (checkpoint) to load. E.g. pretrained_models/coco_MrSw.pth.tar", required=True) parser.add_argument('--query', type=str, required=True) - parser.add_argument('--device', type=str, choices=['cpu', 'gpu'], default='cpu') - parser.add_argument('--num_images', type=int, default=1000) - parser.add_argument('--top_k', type=int, default=10) - parser.add_argument('--dataset', type=str, choices=['coco', 'flickr30k'], default='coco') - parser.add_argument('--config', type=str, default=None, help="Which configuration to use for overriding the " - "checkpoint configuration. 
See into 'config' folder") - + parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], default='cuda') # cpu is only for local test runs + parser.add_argument('--num_data_workers', type=int, default=8) + parser.add_argument('--num_images', type=int, default=5000) + parser.add_argument('--top_k', type=int, default=100) + parser.add_argument('--dataset', type=str, choices=['coco'], default='coco') # TODO support other datasets + parser.add_argument('--config', type=str, default='configs/teran_coco_MrSw_IR.yaml', + help="Which configuration to use for overriding the checkpoint configuration. See into " + "'config' folder") opts = parser.parse_args() - if opts.config is not None: - with open(opts.config, 'r') as yml_file: - config = yaml.load(yml_file) - else: - config = None - top_k_results = main(opts, config) - print(f"######## TOP {opts.tok_k} RESULTS ########") - print(top_k_results) + + model_config, model_checkpoint = prepare_model_checkpoint_and_config(opts) + + top_k_matches = top_k_image_retrieval(opts, model_config, model_checkpoint) + + print(f"######## TOP {opts.top_k} RESULTS ########") + print(top_k_matches) diff --git a/utils.py b/utils.py index 6b46f17..822e782 100644 --- a/utils.py +++ b/utils.py @@ -1,3 +1,5 @@ +from collections import OrderedDict + import numpy from models.teran import TERAN From cbbc32767b22207ee8cc7714dcc34d4e6bafb01b Mon Sep 17 00:00:00 2001 From: floschne Date: Wed, 30 Dec 2020 17:27:10 +0100 Subject: [PATCH 13/21] further optimized computation time by only computing the query embedding once at IR inference --- configs/teran_coco_MrSw_IR.yaml | 6 +- data.py | 19 +++--- inference.py | 48 +++++++-------- models/teran.py | 101 ++++++++++++++++++-------------- models/utils.py | 3 +- 5 files changed, 95 insertions(+), 82 deletions(-) diff --git a/configs/teran_coco_MrSw_IR.yaml b/configs/teran_coco_MrSw_IR.yaml index 6d650c7..3b27218 100644 --- a/configs/teran_coco_MrSw_IR.yaml +++ b/configs/teran_coco_MrSw_IR.yaml @@ -9,9 
+9,9 @@ image-retrieval: dataset: 'coco' # for now only coco support split: 'test' # we can remove this in later versions num_imgs: 5000 - batch_size: 100 - pre-extracted-img-features-root: 'data/coco/features_36' - create_query_batch: True + batch_size: 100 # 100 takes ~10s; 1000 takes ~14s to encode the data (compute the TE outputs) + pre_extracted_img_features_root: 'data/coco/features_36' + create_query_batch: False alignment_mode: 'MrSw' diff --git a/data.py b/data.py index 2a2505c..971273c 100644 --- a/data.py +++ b/data.py @@ -402,24 +402,25 @@ def __call__(self, data): img_feature_batch[i, 1:] = f # create the image features bounding boxes batch + img_feat_lengths = [self.img_feat_length for _ in range(batch_size)] img_feat_bboxes_batch = torch.zeros(batch_size, self.bboxes_length, self.bboxes_dim) for i, box in enumerate(img_feat_bboxes): img_feat_bboxes_batch[i, 1:] = box if self.create_query_batch: - # create the query batch + # create the full query batch of size B x |Q| # since the token id is a scalar, the dim is 1 and whe don't need to add it to the batch # for the BERT embeddings the ids have to be Long - query_batch = torch.zeros(batch_size, self.query_length).long() + query_token_ids_batch = torch.zeros(batch_size, self.query_length).long() for i in range(len(queries)): - query_batch[i] = self.query_token_ids - + query_token_ids_batch[i] = self.query_token_ids query_lengths = [self.query_length for _ in range(batch_size)] - img_feat_lengths = [self.img_feat_length for _ in range(batch_size)] - - return img_feature_batch, img_feat_bboxes_batch, img_feat_lengths, query_batch, query_lengths, dataset_indices else: - return img_feature_batch, img_feat_bboxes_batch, self.img_feat_length, self.query_token_ids, self.query_length, dataset_indices + # create a pseudo query batch with only one element of size 1 x |Q| + query_token_ids_batch = self.query_token_ids.unsqueeze(dim=0) + query_lengths = [self.query_length] + + return img_feature_batch, 
img_feat_bboxes_batch, img_feat_lengths, query_token_ids_batch, query_lengths, dataset_indices class Collate: @@ -620,7 +621,7 @@ def get_coco_image_retrieval_data_loader(config, workers, query): imgs_root = roots[split_name]['img'] # for images we use pre-extracted features (not for text) - pre_extracted_img_features_root = config['image-retrieval']['pre-extracted-img-features-root'] + pre_extracted_img_features_root = config['image-retrieval']['pre_extracted_img_features_root'] captions_json = roots[split_name]['cap'] coco_annotation_ids = coco_annotation_ids[split_name] diff --git a/inference.py b/inference.py index 035dc05..5442a84 100644 --- a/inference.py +++ b/inference.py @@ -2,7 +2,7 @@ import os import sys import time -from typing import List, Any, Dict +from typing import List import numpy as np import torch @@ -34,45 +34,42 @@ def encode_data_for_inference(model: TERAN, data_loader, log_step=10, logging=pr num_img_feats = None # all images have a fixed size of pre-extracted features of 36 + 1 regions img_embs = None + # make sure val logger is used + model.logger = val_logger + start_time = time.time() - for i, (img_feature_batch, img_feat_bboxes_batch, img_feat_lengths, query_token_ids, query_lengths, + for i, (img_feature_batch, img_feat_bboxes_batch, img_feat_lengths, query_token_id_batch, query_lengths_batch, dataset_indices) in enumerate(data_loader): - # make sure val logger is used - model.logger = val_logger - - # TODO - # in the first version just stack the query_token_ids, img_feat_length and query_length - # so that it has shape B x ? 
x ?, where B is len(img_feature_batch) (should be equal to bs set in the config) - # - # in the second version adapt model.forward_emb so that the embeddings get only computed once and then stacked - # to the same size as the img_embs - - # make sure val logger is used - model.logger = val_logger + if query_embs is not None: + # set the query batch to None so it doesn't get forwarded by TERAN again (to safe computation) + query_token_id_batch = None + query_lengths_batch = None # compute the embeddings with torch.no_grad(): # TODO inside model.forward_emb we have to adapt the code for only a single query so that it doesn't get # computed each time _, _, img_emb, query_emb, _ = model.forward_emb(img_feature_batch, - query_token_ids, + query_token_id_batch, img_feat_lengths, - query_lengths, + query_lengths_batch, img_feat_bboxes_batch) # initialize the arrays given the size of the embeddings if img_embs is None: num_img_feats = img_feat_lengths[0] if isinstance(img_feat_lengths, list) else img_feat_lengths - num_query_feats = query_lengths[0] if isinstance(query_lengths, list) else query_lengths + num_query_feats = query_lengths_batch[0] if isinstance(query_lengths_batch, + list) else query_lengths_batch img_feat_dim = img_emb.size(2) query_feat_dim = query_emb.size(2) img_embs = torch.zeros((len(data_loader.dataset), num_img_feats, img_feat_dim)) - query_embs = torch.zeros((len(data_loader.dataset), num_query_feats, query_feat_dim)) + query_embs = torch.zeros((1, num_query_feats, query_feat_dim)) + query_embs[0, :, :] = query_emb.cpu().permute(1, 0, 2) # preserve the embeddings by copying from gpu and converting to numpy + # TODO we could persist them on the disk to further save time img_embs[dataset_indices, :, :] = img_emb.cpu().permute(1, 0, 2) - query_embs[dataset_indices, :, :] = query_emb.cpu().permute(1, 0, 2) # measure elapsed time per batch batch_time.update(time.time() - start_time) @@ -81,7 +78,7 @@ def encode_data_for_inference(model: TERAN, 
data_loader, log_step=10, logging=pr if i % log_step == 0: logging( f"Batch: [{i}/{len(data_loader)}]\t{str(model.logger)}\tTime {batch_time.val:.3f} ({batch_time.avg:.3f})") - del img_feature_batch, query_token_ids + del img_feature_batch, query_token_id_batch print(f"Time elapsed to encode data: {time.time() - encode_data_start_time} seconds.") return img_embs, query_embs, num_img_feats, num_query_feats @@ -92,10 +89,8 @@ def compute_distance_sorted_indices(img_embs, query_embs, img_lengths, query_len sim_matrix_fn = AlignmentContrastiveLoss(aggregation=config['image-retrieval']['alignment_mode'], return_similarity_mat=True) start_time = time.time() - img_embs_per_batch = 1000 # TODO config variable - img_emb_batches = 5 # TODO config / calc - - num_img_embs = img_embs.shape[0] + img_emb_batches = 1 # TODO config / calc + img_embs_per_batch = img_embs.size(0) // img_emb_batches # TODO config variable # distances storage distances = None @@ -109,7 +104,7 @@ def compute_distance_sorted_indices(img_embs, query_embs, img_lengths, query_len # batch-wise compute the alignment distance between the images and the query for i in tqdm.trange(img_emb_batches): # create the current batch - img_embs_batch = img_embs[i * img_embs_per_batch:(i+1) * img_embs_per_batch] + img_embs_batch = img_embs[i * img_embs_per_batch:(i + 1) * img_embs_per_batch] img_embs_length_batch = [img_lengths for _ in range(img_embs_per_batch)] img_embs_batch.cuda() @@ -189,7 +184,8 @@ def prepare_model_checkpoint_and_config(opts): parser.add_argument('--model', type=str, help="Model (checkpoint) to load. E.g. 
pretrained_models/coco_MrSw.pth.tar", required=True) parser.add_argument('--query', type=str, required=True) - parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], default='cuda') # cpu is only for local test runs + parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], + default='cuda') # cpu is only for local test runs parser.add_argument('--num_data_workers', type=int, default=8) parser.add_argument('--num_images', type=int, default=5000) parser.add_argument('--top_k', type=int, default=100) diff --git a/models/teran.py b/models/teran.py index f74e45a..8a638ab 100644 --- a/models/teran.py +++ b/models/teran.py @@ -1,16 +1,15 @@ import torch -import torch.nn.init +import torch.backends.cudnn as cudnn import torch.nn as nn import torch.nn.functional as F -import torch.backends.cudnn as cudnn +import torch.nn.init +from nltk.corpus import stopwords from transformers import BertTokenizer -from models.loss import ContrastiveLoss, PermInvMatchingLoss, AlignmentContrastiveLoss -from models.text import EncoderTextBERT, EncoderText -from models.visual import TransformerPostProcessing, EncoderImage - -from .utils import l2norm, PositionalEncodingImageBoxes, PositionalEncodingText, Aggregator, generate_square_subsequent_mask -from nltk.corpus import stopwords, words as nltk_words +from models.loss import ContrastiveLoss, AlignmentContrastiveLoss +from models.text import EncoderText +from models.visual import EncoderImage +from .utils import l2norm, Aggregator class JointTextImageTransformerEncoder(nn.Module): @@ -18,6 +17,7 @@ class JointTextImageTransformerEncoder(nn.Module): This is a bert caption encoder - transformer image encoder (using bottomup features). 
It process the encoder outputs through a transformer, like VilBERT and outputs two different graph embeddings """ + def __init__(self, config): super().__init__() self.txt_enc = EncoderText(config) @@ -36,8 +36,8 @@ def __init__(self, config): self.shared_transformer = config['model']['shared-transformer'] transformer_layer_1 = nn.TransformerEncoderLayer(d_model=embed_size, nhead=4, - dim_feedforward=2048, - dropout=dropout, activation='relu') + dim_feedforward=2048, + dropout=dropout, activation='relu') self.transformer_encoder_1 = nn.TransformerEncoder(transformer_layer_1, num_layers=layers) if not self.shared_transformer: @@ -52,15 +52,16 @@ def __init__(self, config): self.img_aggregation_type = config['model']['image-aggregation'] def forward(self, features, captions, feat_len, cap_len, boxes): - # process captions by using bert - full_cap_emb_aggr, c_emb = self.txt_enc(captions, cap_len) # B x S x cap_dim + if captions is not None: + # process captions by using bert + full_cap_emb_aggr, c_emb = self.txt_enc(captions, cap_len) # B x S x cap_dim + else: + full_cap_emb_aggr, full_cap_emb = None, None # process image regions using a two-layer transformer - full_img_emb_aggr, i_emb = self.img_enc(features, feat_len, boxes) # B x S x vis_dim + full_img_emb_aggr, i_emb = self.img_enc(features, feat_len, boxes) # B x S x vis_dim # i_emb = i_emb.permute(1, 0, 2) # B x S x vis_dim - bs = features.shape[0] - # if False: # # concatenate the embeddings together # max_summed_lengths = max([x + y for x, y in zip(feat_len, cap_len)]) @@ -84,44 +85,53 @@ def forward(self, features, captions, feat_len, cap_len, boxes): # forward the captions if self.text_aggregation_type is not None: - c_emb = self.cap_proj(c_emb) - - mask = torch.zeros(bs, max(cap_len)).bool() - mask = mask.to(features.device) - for m, c_len in zip(mask, cap_len): - m[c_len:] = True - full_cap_emb = self.transformer_encoder_1(c_emb.permute(1, 0, 2), src_key_padding_mask=mask) # S_txt x B x dim - 
full_cap_emb_aggr = self.text_aggregation(full_cap_emb, cap_len, mask) + if captions is not None: + c_emb = self.cap_proj(c_emb) + + cap_bs = captions.shape[0] + mask = torch.zeros(cap_bs, max(cap_len)).bool() + mask = mask.to(features.device) + for m, c_len in zip(mask, cap_len): + m[c_len:] = True + full_cap_emb = self.transformer_encoder_1(c_emb.permute(1, 0, 2), + src_key_padding_mask=mask) # S_txt x B x dim + full_cap_emb_aggr = self.text_aggregation(full_cap_emb, cap_len, mask) + + full_cap_emb_aggr = l2norm(full_cap_emb_aggr) + + # normalize even every vector of the set + full_cap_emb = F.normalize(full_cap_emb, p=2, dim=2) # else use the embedding output by the txt model - else: + elif self.text_aggregation_type is None: full_cap_emb = None # forward the regions if self.img_aggregation_type is not None: i_emb = self.img_proj(i_emb) - mask = torch.zeros(bs, max(feat_len)).bool() + feat_bs = features.shape[0] + mask = torch.zeros(feat_bs, max(feat_len)).bool() mask = mask.to(features.device) for m, v_len in zip(mask, feat_len): m[v_len:] = True if self.shared_transformer: - full_img_emb = self.transformer_encoder_1(i_emb.permute(1, 0, 2), src_key_padding_mask=mask) # S_txt x B x dim + full_img_emb = self.transformer_encoder_1(i_emb.permute(1, 0, 2), + src_key_padding_mask=mask) # S_txt x B x dim else: - full_img_emb = self.transformer_encoder_2(i_emb.permute(1, 0, 2), src_key_padding_mask=mask) # S_txt x B x dim + full_img_emb = self.transformer_encoder_2(i_emb.permute(1, 0, 2), + src_key_padding_mask=mask) # S_txt x B x dim full_img_emb_aggr = self.image_aggregation(full_img_emb, feat_len, mask) + full_img_emb_aggr = l2norm(full_img_emb_aggr) + # normalize even every vector of the set + full_img_emb = F.normalize(full_img_emb, p=2, dim=2) else: full_img_emb = None - full_cap_emb_aggr = l2norm(full_cap_emb_aggr) - full_img_emb_aggr = l2norm(full_img_emb_aggr) - - # normalize even every vector of the set - full_img_emb = F.normalize(full_img_emb, p=2, dim=2) - 
full_cap_emb = F.normalize(full_cap_emb, p=2, dim=2) - if self.order_embeddings: - full_cap_emb_aggr = torch.abs(full_cap_emb_aggr) + if captions is not None: + full_cap_emb_aggr = torch.abs(full_cap_emb_aggr) full_img_emb_aggr = torch.abs(full_img_emb_aggr) + return full_img_emb_aggr, full_cap_emb_aggr, full_img_emb, full_cap_emb @@ -145,7 +155,8 @@ def __init__(self, config): if 'alignment' in loss_type: self.alignment_criterion = AlignmentContrastiveLoss(margin=config['training']['margin'], measure=config['training']['measure'], - max_violation=config['training']['max-violation'], aggregation=config['training']['alignment-mode']) + max_violation=config['training']['max-violation'], + aggregation=config['training']['alignment-mode']) if 'matching' in loss_type: self.matching_criterion = ContrastiveLoss(margin=config['training']['margin'], measure=config['training']['measure'], @@ -181,18 +192,20 @@ def __init__(self, config): # self.txt_enc.eval() def forward_emb(self, images, captions, img_len, cap_len, boxes): - """Compute the image and caption embeddings + """ + Compute the image and caption embeddings """ # Set mini-batch dataset if torch.cuda.is_available(): images = images.cuda() - captions = captions.cuda() boxes = boxes.cuda() + if captions is not None: + captions = captions.cuda() # Forward img_emb_aggr, cap_emb_aggr, img_feats, cap_feats = self.img_txt_enc(images, captions, img_len, cap_len, boxes) - if self.tokenizer is not None: + if self.tokenizer is not None and captions is not None: # remove stopwords # keep only word indexes that are not stopwords good_word_indexes = [[i for i, (tok, w) in enumerate(zip(self.tokenizer.convert_ids_to_tokens(ids), ids)) if @@ -200,8 +213,8 @@ def forward_emb(self, images, captions, img_len, cap_len, boxes): cap_len = [len(w) - (cap_feats.shape[0] - orig_len) for w, orig_len in zip(good_word_indexes, cap_len)] min_cut_len = min([len(w) for w in good_word_indexes]) good_word_indexes = [words[:min_cut_len] for words in 
good_word_indexes] - good_word_indexes = torch.LongTensor(good_word_indexes).to(cap_feats.device) # B x S - good_word_indexes = good_word_indexes.t().unsqueeze(2).expand(-1, -1, cap_feats.shape[2]) # S x B x dim + good_word_indexes = torch.LongTensor(good_word_indexes).to(cap_feats.device) # B x S + good_word_indexes = good_word_indexes.t().unsqueeze(2).expand(-1, -1, cap_feats.shape[2]) # S x B x dim cap_feats = cap_feats.gather(dim=0, index=good_word_indexes) return img_emb_aggr, cap_emb_aggr, img_feats, cap_feats, cap_len @@ -262,10 +275,12 @@ def forward(self, images, targets, img_lengths, cap_lengths, boxes=None, ids=Non else: text = targets captions = targets - wembeddings = self.img_txt_enc.txt_enc.word_embeddings(captions.cuda() if torch.cuda.is_available() else captions) + wembeddings = self.img_txt_enc.txt_enc.word_embeddings( + captions.cuda() if torch.cuda.is_available() else captions) # compute the embeddings - img_emb_aggr, cap_emb_aggr, img_feats, cap_feats, cap_lengths = self.forward_emb(images, text, img_lengths, cap_lengths, boxes) + img_emb_aggr, cap_emb_aggr, img_feats, cap_feats, cap_lengths = self.forward_emb(images, text, img_lengths, + cap_lengths, boxes) # NOTE: img_feats and cap_feats are S x B x dim loss_dict = self.forward_loss(img_emb_aggr, cap_emb_aggr, img_feats, cap_feats, img_lengths, cap_lengths) diff --git a/models/utils.py b/models/utils.py index 4f32bd4..3f0bac3 100644 --- a/models/utils.py +++ b/models/utils.py @@ -87,7 +87,8 @@ def forward(self, x, boxes): # x is seq_len x B x dim def l2norm(X): - """L2-normalize columns of X + """ + L2-normalize columns of X """ norm = torch.pow(X, 2).sum(dim=1, keepdim=True).sqrt() X = torch.div(X, norm) From 5798289cb16f216d93ec356c3b399cc3d2432e80 Mon Sep 17 00:00:00 2001 From: floschne Date: Wed, 30 Dec 2020 17:59:12 +0100 Subject: [PATCH 14/21] CocoImageRetrievalDataset is now inheriting from torch.data.Dataset --- data.py | 4 ++-- inference.py | 5 ----- 2 files changed, 2 insertions(+), 
7 deletions(-) diff --git a/data.py b/data.py index 971273c..5792382 100644 --- a/data.py +++ b/data.py @@ -147,7 +147,7 @@ def __len__(self): return len(self.annotation_ids) -class CocoImageRetrievalDataset: +class CocoImageRetrievalDataset(data.Dataset): """ Custom COCO Dataset that uses only the images together with a user query. Compatible with torch.utils.data.DataLoader. @@ -634,7 +634,7 @@ def get_coco_image_retrieval_data_loader(config, workers, query): query=query, num_imgs=num_imgs) - # basically this creates the mini-batches which get passed to the model + # this creates the batches which get passed to the model (inside the query gets repeated or not based on the config) collate_fn = InferenceCollate(config) data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, diff --git a/inference.py b/inference.py index 5442a84..5ebca8e 100644 --- a/inference.py +++ b/inference.py @@ -129,11 +129,6 @@ def get_image_names(top_k_indices, data_loader) -> List[str]: def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: - # load model and options - # checkpoint = torch.load(model_path) - data_path = config['dataset']['data'] - measure = config['training']['measure'] - # construct model model = TERAN(config) From 372f5888e1975d19f1bdf11bf1d7fc9512ac9207 Mon Sep 17 00:00:00 2001 From: floschne Date: Thu, 31 Dec 2020 10:36:36 +0100 Subject: [PATCH 15/21] Splitted computation of img and txt embeddings in TERAN --- models/teran.py | 150 ++++++++++++++++++++++++++++-------------------- 1 file changed, 87 insertions(+), 63 deletions(-) diff --git a/models/teran.py b/models/teran.py index 8a638ab..cf48e23 100644 --- a/models/teran.py +++ b/models/teran.py @@ -51,61 +51,41 @@ def __init__(self, config): self.text_aggregation_type = config['model']['text-aggregation'] self.img_aggregation_type = config['model']['image-aggregation'] - def forward(self, features, captions, feat_len, cap_len, boxes): - if captions is not None: - # process 
captions by using bert - full_cap_emb_aggr, c_emb = self.txt_enc(captions, cap_len) # B x S x cap_dim - else: - full_cap_emb_aggr, full_cap_emb = None, None - - # process image regions using a two-layer transformer - full_img_emb_aggr, i_emb = self.img_enc(features, feat_len, boxes) # B x S x vis_dim - # i_emb = i_emb.permute(1, 0, 2) # B x S x vis_dim - - # if False: - # # concatenate the embeddings together - # max_summed_lengths = max([x + y for x, y in zip(feat_len, cap_len)]) - # i_c_emb = torch.zeros(bs, max_summed_lengths, self.embed_size) - # i_c_emb = i_c_emb.to(features.device) - # mask = torch.zeros(bs, max_summed_lengths).bool() - # mask = mask.to(features.device) - # for i_c, m, i, c, i_len, c_len in zip(i_c_emb, mask, i_emb, c_emb, feat_len, cap_len): - # i_c[:c_len] = c[:c_len] - # i_c[c_len:c_len + i_len] = i[:i_len] - # m[c_len + i_len:] = True - # - # i_c_emb = i_c_emb.permute(1, 0, 2) # S_vis + S_txt x B x dim - # out = self.transformer_encoder(i_c_emb, src_key_padding_mask=mask) # S_vis + S_txt x B x dim - # - # full_cap_emb = out[0, :, :] - # I = torch.LongTensor(cap_len).view(1, -1, 1) - # I = I.expand(1, bs, self.embed_size).to(features.device) - # full_img_emb = torch.gather(out, dim=0, index=I).squeeze(0) - # else: + def forward_txt(self, captions, cap_len): + # process captions by using bert + full_cap_emb_aggr, c_emb = self.txt_enc(captions, cap_len) # B x S x cap_dim # forward the captions if self.text_aggregation_type is not None: - if captions is not None: - c_emb = self.cap_proj(c_emb) + c_emb = self.cap_proj(c_emb) - cap_bs = captions.shape[0] - mask = torch.zeros(cap_bs, max(cap_len)).bool() - mask = mask.to(features.device) - for m, c_len in zip(mask, cap_len): - m[c_len:] = True - full_cap_emb = self.transformer_encoder_1(c_emb.permute(1, 0, 2), - src_key_padding_mask=mask) # S_txt x B x dim - full_cap_emb_aggr = self.text_aggregation(full_cap_emb, cap_len, mask) + cap_bs = captions.shape[0] + mask = torch.zeros(cap_bs, 
max(cap_len)).bool() + mask = mask.to(captions.device) + for m, c_len in zip(mask, cap_len): + m[c_len:] = True + full_cap_emb = self.transformer_encoder_1(c_emb.permute(1, 0, 2), + src_key_padding_mask=mask) # S_txt x B x dim + full_cap_emb_aggr = self.text_aggregation(full_cap_emb, cap_len, mask) - full_cap_emb_aggr = l2norm(full_cap_emb_aggr) + full_cap_emb_aggr = l2norm(full_cap_emb_aggr) - # normalize even every vector of the set - full_cap_emb = F.normalize(full_cap_emb, p=2, dim=2) + # normalize even every vector of the set + full_cap_emb = F.normalize(full_cap_emb, p=2, dim=2) # else use the embedding output by the txt model - elif self.text_aggregation_type is None: + else: full_cap_emb = None + if self.order_embeddings: + full_cap_emb_aggr = torch.abs(full_cap_emb_aggr) + + return full_cap_emb_aggr, full_cap_emb + + def forward_img(self, features, feat_len, boxes): + # process image regions using a two-layer transformer + full_img_emb_aggr, i_emb = self.img_enc(features, feat_len, boxes) # B x S x vis_dim # forward the regions + if self.img_aggregation_type is not None: i_emb = self.img_proj(i_emb) @@ -116,11 +96,12 @@ def forward(self, features, captions, feat_len, cap_len, boxes): m[v_len:] = True if self.shared_transformer: full_img_emb = self.transformer_encoder_1(i_emb.permute(1, 0, 2), - src_key_padding_mask=mask) # S_txt x B x dim + src_key_padding_mask=mask) # S_img x B x dim else: full_img_emb = self.transformer_encoder_2(i_emb.permute(1, 0, 2), - src_key_padding_mask=mask) # S_txt x B x dim + src_key_padding_mask=mask) # S_img x B x dim full_img_emb_aggr = self.image_aggregation(full_img_emb, feat_len, mask) + full_img_emb_aggr = l2norm(full_img_emb_aggr) # normalize even every vector of the set full_img_emb = F.normalize(full_img_emb, p=2, dim=2) @@ -128,10 +109,23 @@ def forward(self, features, captions, feat_len, cap_len, boxes): full_img_emb = None if self.order_embeddings: - if captions is not None: - full_cap_emb_aggr = 
torch.abs(full_cap_emb_aggr) full_img_emb_aggr = torch.abs(full_img_emb_aggr) + return full_img_emb_aggr, full_img_emb + + def forward(self, features, captions, feat_len, cap_len, boxes): + if captions is not None: + # process captions + full_cap_emb_aggr, full_cap_emb = self.forward_txt(captions, cap_len) + else: + full_cap_emb_aggr, full_cap_emb = None, None + + if features is not None: + # process image regions + full_img_emb_aggr, full_img_emb = self.forward_img(features, feat_len, boxes) + else: + full_img_emb_aggr, full_img_emb = None, None + return full_img_emb_aggr, full_cap_emb_aggr, full_img_emb, full_cap_emb @@ -191,14 +185,29 @@ def __init__(self, config): # self.img_enc.eval() # self.txt_enc.eval() + def remove_stopwords(self, captions, cap_feats, cap_len): + # remove stopwords + # keep only word indexes that are not stopwords + good_word_indexes = [[i for i, (tok, w) in enumerate(zip(self.tokenizer.convert_ids_to_tokens(ids), ids)) if + tok not in self.en_stops or w == 0] for ids in captions] # keeps the padding + cap_len = [len(w) - (cap_feats.shape[0] - orig_len) for w, orig_len in zip(good_word_indexes, cap_len)] + min_cut_len = min([len(w) for w in good_word_indexes]) + good_word_indexes = [words[:min_cut_len] for words in good_word_indexes] + good_word_indexes = torch.LongTensor(good_word_indexes).to(cap_feats.device) # B x S + good_word_indexes = good_word_indexes.t().unsqueeze(2).expand(-1, -1, cap_feats.shape[2]) # S x B x dim + cap_feats = cap_feats.gather(dim=0, index=good_word_indexes) + + return cap_feats, cap_len + def forward_emb(self, images, captions, img_len, cap_len, boxes): """ Compute the image and caption embeddings """ # Set mini-batch dataset if torch.cuda.is_available(): - images = images.cuda() - boxes = boxes.cuda() + if images is not None and boxes is not None: + images = images.cuda() + boxes = boxes.cuda() if captions is not None: captions = captions.cuda() @@ -206,19 +215,31 @@ def forward_emb(self, images, captions, 
img_len, cap_len, boxes): img_emb_aggr, cap_emb_aggr, img_feats, cap_feats = self.img_txt_enc(images, captions, img_len, cap_len, boxes) if self.tokenizer is not None and captions is not None: - # remove stopwords - # keep only word indexes that are not stopwords - good_word_indexes = [[i for i, (tok, w) in enumerate(zip(self.tokenizer.convert_ids_to_tokens(ids), ids)) if - tok not in self.en_stops or w == 0] for ids in captions] # keeps the padding - cap_len = [len(w) - (cap_feats.shape[0] - orig_len) for w, orig_len in zip(good_word_indexes, cap_len)] - min_cut_len = min([len(w) for w in good_word_indexes]) - good_word_indexes = [words[:min_cut_len] for words in good_word_indexes] - good_word_indexes = torch.LongTensor(good_word_indexes).to(cap_feats.device) # B x S - good_word_indexes = good_word_indexes.t().unsqueeze(2).expand(-1, -1, cap_feats.shape[2]) # S x B x dim - cap_feats = cap_feats.gather(dim=0, index=good_word_indexes) + cap_feats, cap_len = self.remove_stopwords(captions, cap_feats, cap_len) return img_emb_aggr, cap_emb_aggr, img_feats, cap_feats, cap_len + def forward_txt_emb(self, captions, cap_len): + """ + compute txt embeddings only + """ + if torch.cuda.is_available(): + captions = captions.cuda() + cap_emb_aggr, cap_feats = self.img_txt_enc.forward_txt(captions, cap_len) + if self.tokenizer is not None and captions is not None: + cap_feats, cap_len = self.remove_stopwords(captions, cap_feats, cap_len) + return cap_emb_aggr, cap_feats, cap_len + + def forward_img_emb(self, images, img_len, boxes): + """ + compute img embeddings only + """ + if torch.cuda.is_available(): + images = images.cuda() + boxes = boxes.cuda() + img_emb_aggr, img_feats = self.img_txt_enc.forward_img(images, img_len, boxes) + return img_emb_aggr, img_feats + def get_parameters(self): lr_multiplier = 1.0 if self.config['text-model']['fine-tune'] else 0.0 @@ -279,8 +300,11 @@ def forward(self, images, targets, img_lengths, cap_lengths, boxes=None, ids=Non captions.cuda() 
if torch.cuda.is_available() else captions) # compute the embeddings - img_emb_aggr, cap_emb_aggr, img_feats, cap_feats, cap_lengths = self.forward_emb(images, text, img_lengths, - cap_lengths, boxes) + img_emb_aggr, cap_emb_aggr, img_feats, cap_feats, cap_lengths = self.forward_emb(images, + text, + img_lengths, + cap_lengths, + boxes) # NOTE: img_feats and cap_feats are S x B x dim loss_dict = self.forward_loss(img_emb_aggr, cap_emb_aggr, img_feats, cap_feats, img_lengths, cap_lengths) From deb7cd0a6810d7800b0ceea42a9dd8b9ce81231e Mon Sep 17 00:00:00 2001 From: floschne Date: Thu, 31 Dec 2020 10:39:34 +0100 Subject: [PATCH 16/21] implemented pre-computation of img embeddings --- configs/teran_coco_MrSw_IR.yaml | 4 +- data.py | 21 +++-- inference.py | 142 +++++++++++++++++++++----------- 3 files changed, 108 insertions(+), 59 deletions(-) diff --git a/configs/teran_coco_MrSw_IR.yaml b/configs/teran_coco_MrSw_IR.yaml index 3b27218..0c79a1f 100644 --- a/configs/teran_coco_MrSw_IR.yaml +++ b/configs/teran_coco_MrSw_IR.yaml @@ -13,8 +13,8 @@ image-retrieval: pre_extracted_img_features_root: 'data/coco/features_36' create_query_batch: False alignment_mode: 'MrSw' - - + use_precomputed_img_embeddings: False + pre_computed_img_embeddings_root: 'data/coco/pre_computed_embeddings' text-model: name: 'bert' diff --git a/data.py b/data.py index 5792382..27e73b5 100644 --- a/data.py +++ b/data.py @@ -332,12 +332,13 @@ def __new__(cls, *args, **kwargs): return super(InferenceCollate, cls).__new__(cls) - def __init__(self, config): + def __init__(self, config, pre_compute_img_embs): self.vocab_type = str(config['text-model']['name']).lower() self.create_query_batch = bool(config['image-retrieval']['create_query_batch']) - if self.vocab_type == 'bert': + self.pre_compute_img_embs = pre_compute_img_embs + if self.vocab_type == 'bert' and not pre_compute_img_embs: self.tokenizer = BertTokenizer.from_pretrained(config['text-model']['pretrain']) - else: + elif self.vocab_type != 
'bert': raise ValueError("Currently only BERT Tokenizer is supported!") @classmethod @@ -378,7 +379,7 @@ def __call__(self, data): """ # encode (tokenize) the query - if self.query_token_ids is None: + if self.query_token_ids is None and not self.pre_compute_img_embs: # we don't need to pad or truncate since we only have a single query # TODO actually we don't even need the tokenizer twice so we could just use a local variable query_token_ids = torch.LongTensor(self.tokenizer.encode(queries[0])) @@ -407,7 +408,7 @@ def __call__(self, data): for i, box in enumerate(img_feat_bboxes): img_feat_bboxes_batch[i, 1:] = box - if self.create_query_batch: + if self.create_query_batch and not self.pre_compute_img_embs: # create the full query batch of size B x |Q| # since the token id is a scalar, the dim is 1 and whe don't need to add it to the batch # for the BERT embeddings the ids have to be Long @@ -415,10 +416,14 @@ def __call__(self, data): for i in range(len(queries)): query_token_ids_batch[i] = self.query_token_ids query_lengths = [self.query_length for _ in range(batch_size)] - else: + elif not self.create_query_batch and not self.pre_compute_img_embs: # create a pseudo query batch with only one element of size 1 x |Q| query_token_ids_batch = self.query_token_ids.unsqueeze(dim=0) query_lengths = [self.query_length] + else: # self.pre_compute_img_embs == True + # when pre-computing the image embeddings, we don't need (and have) information about the query + query_token_ids_batch = None + query_lengths = None return img_feature_batch, img_feat_bboxes_batch, img_feat_lengths, query_token_ids_batch, query_lengths, dataset_indices @@ -605,7 +610,7 @@ def get_loaders(config, workers, batch_size=None): return train_loader, val_loader -def get_coco_image_retrieval_data_loader(config, workers, query): +def get_coco_image_retrieval_data_loader(config, workers, query, pre_compute_img_embs=False): # create the dataset + loader # 1) load / create a Coco Dataset to get meta info 
about images (we could also do this by hand) # 2) choose (the first) N images and create a dataset with N samples where each sample consists of the n-th image @@ -635,7 +640,7 @@ def get_coco_image_retrieval_data_loader(config, workers, query): num_imgs=num_imgs) # this creates the batches which get passed to the model (inside the query gets repeated or not based on the config) - collate_fn = InferenceCollate(config) + collate_fn = InferenceCollate(config, pre_compute_img_embs) data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False, diff --git a/inference.py b/inference.py index 5ebca8e..45f5806 100644 --- a/inference.py +++ b/inference.py @@ -2,6 +2,7 @@ import os import sys import time +from pathlib import Path from typing import List import numpy as np @@ -15,14 +16,26 @@ from utils import AverageMeter, LogCollector -def encode_data_for_inference(model: TERAN, data_loader, log_step=10, logging=print): +def persist_img_embs(config, data_loader, dataset_indices, numpy_img_emb): + dst_root = Path(os.getcwd()).joinpath(config['image-retrieval']['pre_computed_img_embeddings_root']) + if not dst_root.exists(): + dst_root.mkdir(parents=True, exist_ok=True) + + assert len(dataset_indices) == len(numpy_img_emb) + img_names = get_image_names(dataset_indices, data_loader) + # TODO do we want to store them in one big npz? 
+ for idx in range(len(img_names)): + dst = dst_root.joinpath(img_names[idx] + '.npz') + if dst.exists(): + continue + np.savez_compressed(str(dst), img_emb=numpy_img_emb[idx]) + + +def encode_data_for_inference(model: TERAN, data_loader, log_step=10, logging=print, pre_compute_img_embs=False): # compute the embedding vectors v_i, s_j (paper) for each image region and word respectively # -> forwarding the data through the respective TE stacks - print('Computing image and query embeddings...') - encode_data_start_time = time.time() - - batch_time = AverageMeter() - val_logger = LogCollector() + print( + f'{"Pre-" if pre_compute_img_embs else ""}Computing image {"" if pre_compute_img_embs else "and query "}embeddings...') # we don't need autograd for inference model.eval() @@ -35,56 +48,61 @@ def encode_data_for_inference(model: TERAN, data_loader, log_step=10, logging=pr img_embs = None # make sure val logger is used + batch_time = AverageMeter() + val_logger = LogCollector() model.logger = val_logger start_time = time.time() - for i, (img_feature_batch, img_feat_bboxes_batch, img_feat_lengths, query_token_id_batch, query_lengths_batch, + for i, (img_feature_batch, img_feat_bboxes_batch, img_feat_len_batch, query_token_batch, query_len_batch, dataset_indices) in enumerate(data_loader): + batch_start_time = time.time() + """ + the data loader returns None values for the respective batches if the only query was already loaded + -> query_token_batch, query_len_batch = None, None + """ - if query_embs is not None: - # set the query batch to None so it doesn't get forwarded by TERAN again (to safe computation) - query_token_id_batch = None - query_lengths_batch = None - - # compute the embeddings with torch.no_grad(): - # TODO inside model.forward_emb we have to adapt the code for only a single query so that it doesn't get - # computed each time - _, _, img_emb, query_emb, _ = model.forward_emb(img_feature_batch, - query_token_id_batch, - img_feat_lengths, - 
query_lengths_batch, - img_feat_bboxes_batch) - - # initialize the arrays given the size of the embeddings - if img_embs is None: - num_img_feats = img_feat_lengths[0] if isinstance(img_feat_lengths, list) else img_feat_lengths - num_query_feats = query_lengths_batch[0] if isinstance(query_lengths_batch, - list) else query_lengths_batch - img_feat_dim = img_emb.size(2) + # compute the query embedding only in the first iteration (also because there is only 1 query in IR) + if query_embs is None and not pre_compute_img_embs: + # TODO maybe we can get the most matching roi from query_emb_aggr? + query_emb_aggr, query_emb, _ = model.forward_txt_emb(query_token_batch, query_len_batch) + + # store results as np arrays for further processing or persisting + num_query_feats = query_len_batch[0] if isinstance(query_len_batch, list) else query_len_batch query_feat_dim = query_emb.size(2) - img_embs = torch.zeros((len(data_loader.dataset), num_img_feats, img_feat_dim)) query_embs = torch.zeros((1, num_query_feats, query_feat_dim)) query_embs[0, :, :] = query_emb.cpu().permute(1, 0, 2) - # preserve the embeddings by copying from gpu and converting to numpy - # TODO we could persist them on the disk to further save time - img_embs[dataset_indices, :, :] = img_emb.cpu().permute(1, 0, 2) + # compute every image embedding in the dataset + img_emb_aggr, img_emb = model.forward_img_emb(img_feature_batch, img_feat_len_batch, img_feat_bboxes_batch) + + # init array to store results for further processing or persisting + if img_embs is None: + num_img_feats = img_feat_len_batch[0] if isinstance(img_feat_len_batch, + list) else img_feat_len_batch + img_feat_dim = img_emb.size(2) + img_embs = torch.zeros((len(data_loader.dataset), num_img_feats, img_feat_dim)) + + numpy_img_emb = img_emb.cpu().permute(1, 0, 2) # why are we permuting here? 
-> TERAN + img_embs[dataset_indices, :, :] = numpy_img_emb + if pre_compute_img_embs: + # if we are in a pre-compute run, persist the arrays + persist_img_embs(model_config, data_loader, dataset_indices, numpy_img_emb) # measure elapsed time per batch - batch_time.update(time.time() - start_time) - start_time = time.time() + batch_time.update(time.time() - batch_start_time) if i % log_step == 0: logging( f"Batch: [{i}/{len(data_loader)}]\t{str(model.logger)}\tTime {batch_time.val:.3f} ({batch_time.avg:.3f})") - del img_feature_batch, query_token_id_batch + del img_feature_batch, query_token_batch - print(f"Time elapsed to encode data: {time.time() - encode_data_start_time} seconds.") + print( + f"Time elapsed to {'encode' if not pre_compute_img_embs else 'encode and persist'} data: {time.time() - start_time} seconds.") return img_embs, query_embs, num_img_feats, num_query_feats -def compute_distance_sorted_indices(img_embs, query_embs, img_lengths, query_lengths, config): +def compute_distances(img_embs, query_embs, img_lengths, query_lengths, config): # initialize similarity matrix evaluator sim_matrix_fn = AlignmentContrastiveLoss(aggregation=config['image-retrieval']['alignment_mode'], return_similarity_mat=True) @@ -124,8 +142,8 @@ def compute_distance_sorted_indices(img_embs, query_embs, img_lengths, query_len return sorted_distance_indices -def get_image_names(top_k_indices, data_loader) -> List[str]: - return [data_loader.dataset.get_image_metadata(idx)['file_name'] for idx in top_k_indices] +def get_image_names(dataset_indices, data_loader) -> List[str]: + return [data_loader.dataset.get_image_metadata(idx)['file_name'] for idx in dataset_indices] def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: @@ -147,7 +165,7 @@ def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: print(f"Images: {img_embs.shape[0]}, Captions: {cap_embs.shape[0]}") # compute the matching scores - distance_sorted_indices = 
compute_distance_sorted_indices(img_embs, cap_embs, img_lengths, cap_lengths, config) + distance_sorted_indices = compute_distances(img_embs, cap_embs, img_lengths, cap_lengths, config) top_k_indices = distance_sorted_indices[:opts.top_k] # get the image names @@ -170,6 +188,24 @@ def prepare_model_checkpoint_and_config(opts): return model_checkpoint_config, checkpoint +def pre_compute_img_embeddings(opts, config, checkpoint): + # construct model + model = TERAN(config) + + # load model state + + model.load_state_dict(checkpoint['model'], strict=False) + + print('Loading dataset') + data_loader = get_coco_image_retrieval_data_loader(config, + query=opts.query, + workers=opts.num_data_workers, + pre_compute_img_embs=True) + + # encode the data (i.e. compute the embeddings / TE outputs for the images and query) + encode_data_for_inference(model, data_loader, pre_compute_img_embs=True) + + if __name__ == '__main__': print("CUDA_VISIBLE_DEVICES: " + os.getenv("CUDA_VISIBLE_DEVICES", "NOT SET - ABORTING")) if os.getenv("CUDA_VISIBLE_DEVICES", None) is None: @@ -178,21 +214,29 @@ def prepare_model_checkpoint_and_config(opts): parser = argparse.ArgumentParser() parser.add_argument('--model', type=str, help="Model (checkpoint) to load. E.g. 
pretrained_models/coco_MrSw.pth.tar", required=True) - parser.add_argument('--query', type=str, required=True) - parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], - default='cuda') # cpu is only for local test runs + parser.add_argument('--pre_compute_img_embeddings', action='store_true', help="If set or true, the image " + "embeddings get precomputed and " + "persisted at the directory " + "specified in the config.") + parser.add_argument('--query', type=str, required='--pre_compute_img_embeddings' not in sys.argv) parser.add_argument('--num_data_workers', type=int, default=8) parser.add_argument('--num_images', type=int, default=5000) parser.add_argument('--top_k', type=int, default=100) parser.add_argument('--dataset', type=str, choices=['coco'], default='coco') # TODO support other datasets - parser.add_argument('--config', type=str, default='configs/teran_coco_MrSw_IR.yaml', - help="Which configuration to use for overriding the checkpoint configuration. See into " - "'config' folder") + parser.add_argument('--config', type=str, default='configs/teran_coco_MrSw_IR.yaml', help="Which configuration to " + "use for overriding the" + " checkpoint " + "configuration. 
See " + "into 'config' folder") + # cpu is only for local test runs + parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], default='cuda') opts = parser.parse_args() model_config, model_checkpoint = prepare_model_checkpoint_and_config(opts) - top_k_matches = top_k_image_retrieval(opts, model_config, model_checkpoint) - - print(f"######## TOP {opts.top_k} RESULTS ########") - print(top_k_matches) + if not opts.pre_compute_img_embeddings: + top_k_matches = top_k_image_retrieval(opts, model_config, model_checkpoint) + print(f"######## TOP {opts.top_k} RESULTS ########") + print(top_k_matches) + else: + pre_compute_img_embeddings(opts, model_config, model_checkpoint) From 8b9e3ff34d51545e7137bf1cd1cf9a0e855c4be0 Mon Sep 17 00:00:00 2001 From: floschne Date: Thu, 31 Dec 2020 13:19:20 +0100 Subject: [PATCH 17/21] using pre-computed image embeddings --- configs/teran_coco_MrSw_IR_PreComp.yaml | 63 ++++++++++ data.py | 148 ++++++++++++++++-------- evaluation.py | 2 +- inference.py | 67 +++++++---- models/teran.py | 4 +- 5 files changed, 214 insertions(+), 70 deletions(-) create mode 100644 configs/teran_coco_MrSw_IR_PreComp.yaml diff --git a/configs/teran_coco_MrSw_IR_PreComp.yaml b/configs/teran_coco_MrSw_IR_PreComp.yaml new file mode 100644 index 0000000..7be7a6e --- /dev/null +++ b/configs/teran_coco_MrSw_IR_PreComp.yaml @@ -0,0 +1,63 @@ +dataset: + name: 'coco' + images-path: 'data/coco/images' # not needed if using pre-extracted bottom-up features + data: 'data' + restval: True + pre-extracted-features: False + +image-retrieval: + dataset: 'coco' # for now only coco support + split: 'test' # we can remove this in later versions + num_imgs: 5000 + batch_size: 100 # 100 takes ~10s; 1000 takes ~14s to encode the data (compute the TE outputs) + pre_extracted_img_features_root: 'data/coco/features_36' + create_query_batch: False + alignment_mode: 'MrSw' + use_precomputed_img_embeddings: True + pre_computed_img_embeddings_root: 
'data/coco/pre_computed_embeddings' + +text-model: + name: 'bert' + pretrain: 'bert-base-uncased' + word-dim: 768 + extraction-hidden-layer: 6 + fine-tune: True + pre-extracted: False + layers: 0 + dropout: 0.1 + +image-model: + name: 'bottomup' + pre-extracted-features-root: 'data/coco/features_36' + transformer-layers: 4 + dropout: 0.1 + pos-encoding: 'concat-and-process' + crop-size: 224 # not used + fine-tune: False + feat-dim: 2048 + norm: True + +model: + name: 'teran' + embed-size: 1024 + text-aggregation: 'first' + image-aggregation: 'first' + layers: 2 + exclude-stopwords: False + shared-transformer: False + dropout: 0.1 + +training: + lr: 0.00001 # 0.000006 + grad-clip: 2.0 + max-violation: True + loss-type: 'alignment' + alignment-mode: 'MrSw' + measure: 'dot' + margin: 0.2 + bs: 40 + scheduler: 'steplr' + gamma: 0.1 + step-size: 20 + warmup: null + warmup-period: 1000 diff --git a/data.py b/data.py index 27e73b5..109e7b8 100644 --- a/data.py +++ b/data.py @@ -147,22 +147,88 @@ def __len__(self): return len(self.annotation_ids) -class CocoImageRetrievalDataset(data.Dataset): +class CocoImageRetrievalDatasetBase: + def __init__(self, captions_json, coco_annotation_ids, query, num_imgs): + self.query = query + self.num_imgs = num_imgs + + self.coco = COCO(captions_json) + self.anno_ids = coco_annotation_ids + + def get_raw_item(self, idx): + next_img_idx = idx * 5 # in the coco dataset there are 5 captions for every image + ann_id = self.anno_ids[next_img_idx] + coco_img_id = self.coco.anns[ann_id]['image_id'] + img_metadata = self.coco.imgs[coco_img_id] + img_size = np.array([img_metadata['width'], img_metadata['height']]) + + return coco_img_id, img_size + + def get_image_metadata(self, idx): + # TODO can't we just get coco.imgs[idx'] somehow? 
+ next_img_idx = idx * 5 # in the coco dataset there are 5 captions for every image + ann_id = self.anno_ids[next_img_idx] + coco_img_id = self.coco.anns[ann_id]['image_id'] + img_metadata = self.coco.imgs[coco_img_id] + return img_metadata, coco_img_id + + +class PreComputedCocoEmbeddingsDataset(CocoImageRetrievalDatasetBase): + """ + Custom COCO Dataset that uses pre-computed image embedding + """ + + def __init__(self, captions_json, coco_annotation_ids, query, num_imgs, config): + CocoImageRetrievalDatasetBase.__init__(self, captions_json, coco_annotation_ids, query, num_imgs) + + pre_computed_img_embeddings_root = config['image-retrieval']['pre_computed_img_embeddings_root'] + self.pre_computed_img_embeddings_root = pre_computed_img_embeddings_root + + self.img_embs = {idx: self.__load_img_emb(idx) for idx in range(num_imgs)} + + self.vocab_type = str(config['text-model']['name']).lower() + if self.vocab_type == 'bert': + self.tokenizer = BertTokenizer.from_pretrained(config['text-model']['pretrain']) + elif self.vocab_type != 'bert': + raise ValueError("Currently only BERT Tokenizer is supported!") + + def __load_img_emb(self, idx): + # just return the query and the img embedding + img_metadata, coco_img_id = self.get_image_metadata(idx) + file_name = img_metadata['file_name'] + npz = np.load(os.path.join(self.pre_computed_img_embeddings_root, file_name + '.npz')) + img_emd = npz.get('img_emb') + + return img_emd + + def get_img_embs_and_lens(self): + return self.img_embs + + def get_query_pseudo_batch(self): + # tokenize and encode the query + query_token_ids = torch.LongTensor(self.tokenizer.encode(self.query)) + # create a pseudo batch suitable for TERAN + query_token_pseudo_batch = query_token_ids.unsqueeze(dim=0) + query_lengths = [len(query_token_ids)] + return query_token_pseudo_batch, query_lengths + + def __len__(self): + return self.num_imgs + + +class PreComputedCocoFeaturesDataset(CocoImageRetrievalDatasetBase, data.Dataset): """ Custom COCO 
Dataset that uses only the images together with a user query. Compatible with torch.utils.data.DataLoader. """ def __init__(self, imgs_root, img_features_path, captions_json, coco_annotation_ids, query, num_imgs): - self.query = query - self.num_imgs = num_imgs + CocoImageRetrievalDatasetBase.__init__(self, captions_json, coco_annotation_ids, query, num_imgs) + self.feats_data_path = os.path.join(img_features_path, 'bu_att') self.box_data_path = os.path.join(img_features_path, 'bu_box') self.imgs_root = imgs_root - self.coco = COCO(captions_json) - self.anno_ids = coco_annotation_ids - def __getitem__(self, idx): """ This function returns a tuple that is further passed to collate_fn @@ -185,23 +251,6 @@ def __getitem__(self, idx): # this output is the input of the CollateFn return img_feat, img_feat_box, img_id, self.query, idx - def get_raw_item(self, idx): - next_img_idx = idx * 5 # in the coco dataset there are 5 captions for every image - ann_id = self.anno_ids[next_img_idx] - img_id = self.coco.anns[ann_id]['image_id'] - img_metadata = self.coco.imgs[img_id] - img_size = np.array([img_metadata['width'], img_metadata['height']]) - - return img_id, img_size - - def get_image_metadata(self, idx): - # TODO can't we just get coco.imgs[idx'] somehow? 
- next_img_idx = idx * 5 # in the coco dataset there are 5 captions for every image - ann_id = self.anno_ids[next_img_idx] - img_id = self.coco.anns[ann_id]['image_id'] - img_metadata = self.coco.imgs[img_id] - return img_metadata - def __len__(self): return self.num_imgs @@ -333,9 +382,9 @@ def __new__(cls, *args, **kwargs): return super(InferenceCollate, cls).__new__(cls) def __init__(self, config, pre_compute_img_embs): - self.vocab_type = str(config['text-model']['name']).lower() self.create_query_batch = bool(config['image-retrieval']['create_query_batch']) self.pre_compute_img_embs = pre_compute_img_embs + self.vocab_type = str(config['text-model']['name']).lower() if self.vocab_type == 'bert' and not pre_compute_img_embs: self.tokenizer = BertTokenizer.from_pretrained(config['text-model']['pretrain']) elif self.vocab_type != 'bert': @@ -362,6 +411,7 @@ def __call__(self, data): img_feats, img_feat_bboxes, img_ids, queries, dataset_indices = zip(*data) """ Build batch tensors from a list of (img_feats, img_feat_boxes, img_ids, queries, dataset_indices) tuples. 
+ This data comes from the dataset Args: - img_feats: - img_feat_bboxes: @@ -462,7 +512,7 @@ def __call__(self, data): cap_features = [torch.FloatTensor(f) for f in cap_features] wembeddings = [torch.FloatTensor(w) for w in wembeddings] else: - if self.vocab_type == 'bert': + if self.vocab_type == 'bert': cap_lengths = [len(self.tokenizer.tokenize(c)) + 2 for c in captions] # + 2 in order to account for begin and end tokens max_len = max(cap_lengths) @@ -508,7 +558,7 @@ def __call__(self, data): targets = torch.zeros(len(captions), max(cap_lengths)).long() for i, cap in enumerate(captions): end = cap_lengths[i] - targets[i, :end] = cap[:end] #caption token ids + targets[i, :end] = cap[:end] # caption token ids if not preextracted_images: return images, targets, None, cap_lengths, None, ids @@ -610,12 +660,7 @@ def get_loaders(config, workers, batch_size=None): return train_loader, val_loader -def get_coco_image_retrieval_data_loader(config, workers, query, pre_compute_img_embs=False): - # create the dataset + loader - # 1) load / create a Coco Dataset to get meta info about images (we could also do this by hand) - # 2) choose (the first) N images and create a dataset with N samples where each sample consists of the n-th image - # and the query (gets repeated N times) # TODO maybe this is not necessary - +def get_coco_image_retrieval_data(config, query, workers=None, pre_compute_img_embs=False): # get the directories that contain the coco json files and coco annotation ids (which we may not need, I think) roots, coco_annotation_ids = get_paths(config) @@ -625,28 +670,37 @@ def get_coco_image_retrieval_data_loader(config, workers, query, pre_compute_img imgs_root = roots[split_name]['img'] - # for images we use pre-extracted features (not for text) - pre_extracted_img_features_root = config['image-retrieval']['pre_extracted_img_features_root'] - captions_json = roots[split_name]['cap'] coco_annotation_ids = coco_annotation_ids[split_name] num_imgs = 
config['image-retrieval']['num_imgs'] + pre_extracted_img_features_root = config['image-retrieval']['pre_extracted_img_features_root'] + + use_precomputed_img_embeddings = config['image-retrieval']['use_precomputed_img_embeddings'] + if use_precomputed_img_embeddings: + dataset = PreComputedCocoEmbeddingsDataset(captions_json=captions_json, + coco_annotation_ids=coco_annotation_ids, + query=query, + num_imgs=num_imgs, + config=config) - dataset = CocoImageRetrievalDataset(imgs_root=imgs_root, - img_features_path=pre_extracted_img_features_root, - captions_json=captions_json, - coco_annotation_ids=coco_annotation_ids, - query=query, - num_imgs=num_imgs) + return dataset + + dataset = PreComputedCocoFeaturesDataset(imgs_root=imgs_root, + img_features_path=pre_extracted_img_features_root, + captions_json=captions_json, + coco_annotation_ids=coco_annotation_ids, + query=query, + num_imgs=num_imgs) # this creates the batches which get passed to the model (inside the query gets repeated or not based on the config) collate_fn = InferenceCollate(config, pre_compute_img_embs) - data_loader = torch.utils.data.DataLoader(dataset=dataset, - batch_size=batch_size, - shuffle=False, - pin_memory=True, - num_workers=workers, - collate_fn=collate_fn) + + data_loader = data.DataLoader(dataset=dataset, + batch_size=batch_size, + shuffle=False, + pin_memory=True, + num_workers=workers, + collate_fn=collate_fn) return data_loader diff --git a/evaluation.py b/evaluation.py index f164cf0..c815606 100644 --- a/evaluation.py +++ b/evaluation.py @@ -10,7 +10,7 @@ from evaluate_utils.dcg import DCG from models.loss import order_sim, AlignmentContrastiveLoss from utils import get_model, AverageMeter, LogCollector -from data import get_coco_image_retrieval_data_loader, get_test_loader +from data import get_coco_image_retrieval_data, get_test_loader def encode_data(model, data_loader, log_step=10, logging=print): diff --git a/inference.py b/inference.py index 45f5806..acb4b26 100644 --- 
a/inference.py +++ b/inference.py @@ -10,7 +10,7 @@ import tqdm import yaml -from data import get_coco_image_retrieval_data_loader +from data import get_coco_image_retrieval_data from models.loss import AlignmentContrastiveLoss from models.teran import TERAN from utils import AverageMeter, LogCollector @@ -65,7 +65,7 @@ def encode_data_for_inference(model: TERAN, data_loader, log_step=10, logging=pr # compute the query embedding only in the first iteration (also because there is only 1 query in IR) if query_embs is None and not pre_compute_img_embs: # TODO maybe we can get the most matching roi from query_emb_aggr? - query_emb_aggr, query_emb, _ = model.forward_txt_emb(query_token_batch, query_len_batch) + query_emb_aggr, query_emb, _ = model.forward_txt(query_token_batch, query_len_batch) # store results as np arrays for further processing or persisting num_query_feats = query_len_batch[0] if isinstance(query_len_batch, list) else query_len_batch @@ -74,7 +74,7 @@ def encode_data_for_inference(model: TERAN, data_loader, log_step=10, logging=pr query_embs[0, :, :] = query_emb.cpu().permute(1, 0, 2) # compute every image embedding in the dataset - img_emb_aggr, img_emb = model.forward_img_emb(img_feature_batch, img_feat_len_batch, img_feat_bboxes_batch) + img_emb_aggr, img_emb = model.forward_img(img_feature_batch, img_feat_len_batch, img_feat_bboxes_batch) # init array to store results for further processing or persisting if img_embs is None: @@ -142,8 +142,32 @@ def compute_distances(img_embs, query_embs, img_lengths, query_lengths, config): return sorted_distance_indices -def get_image_names(dataset_indices, data_loader) -> List[str]: - return [data_loader.dataset.get_image_metadata(idx)['file_name'] for idx in dataset_indices] +def get_image_names(dataset_indices, dataset) -> List[str]: + return [dataset.get_image_metadata(idx)[0]['file_name'] for idx in dataset_indices] + + +def get_precomputed_embeddings(config, opts, model): + print("Loading pre-computed 
image embeddings...") + start = time.time() + # returns a PreComputedCocoEmbeddingsDataset + dataset = get_coco_image_retrieval_data(config, query=opts.query) + + # compute the query embedding + with torch.no_grad(): + query_token_pseudo_batch, query_lengths = dataset.get_query_pseudo_batch() + query_emb_aggr, query_emb, _ = model.forward_txt(query_token_pseudo_batch, query_lengths) + + # store results as np arrays for further processing or persisting + query_feat_dim = query_emb.size(2) + query_embs = torch.zeros((1, query_lengths[0], query_feat_dim), requires_grad=False) + query_embs[0, :, :] = query_emb.cpu().permute(1, 0, 2) + + # get the img embeddings and convert them to Tensors + np_img_embs = list(dataset.img_embs.values()) + img_embs = torch.Tensor(np_img_embs) + img_length = len(np_img_embs[0]) + print(f"Time elapsed to load pre-computed embeddings and compute query embedding: {time.time() - start} seconds!") + return img_embs, query_embs, img_length, query_lengths, dataset def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: @@ -153,23 +177,27 @@ def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: # load model state model.load_state_dict(checkpoint['model'], strict=False) - print('Loading dataset') - data_loader = get_coco_image_retrieval_data_loader(config, - query=opts.query, - workers=opts.num_data_workers) - - # encode the data (i.e. 
compute the embeddings / TE outputs for the images and query) - img_embs, cap_embs, img_lengths, cap_lengths = encode_data_for_inference(model, data_loader) + use_precomputed_img_embeddings = config['image-retrieval']['use_precomputed_img_embeddings'] + if use_precomputed_img_embeddings: + img_embs, query_embs, img_lengths, query_lengths, dataset = get_precomputed_embeddings(config, opts, model) + else: + # returns a Dataloader of a PreComputedCocoFeaturesDataset + data_loader = get_coco_image_retrieval_data(config, + query=opts.query, + workers=opts.num_data_workers) + dataset = data_loader.dataset + # encode the data (i.e. compute the embeddings / TE outputs for the images and query) + img_embs, query_embs, img_lengths, query_lengths = encode_data_for_inference(model, data_loader) torch.cuda.empty_cache() - print(f"Images: {img_embs.shape[0]}, Captions: {cap_embs.shape[0]}") + print(f"Images Embeddings: {img_embs.shape[0]}, Query Embeddings: {query_embs.shape[0]}") # compute the matching scores - distance_sorted_indices = compute_distances(img_embs, cap_embs, img_lengths, cap_lengths, config) + distance_sorted_indices = compute_distances(img_embs, query_embs, img_lengths, query_lengths, config) top_k_indices = distance_sorted_indices[:opts.top_k] # get the image names - top_k_images = get_image_names(top_k_indices, data_loader) + top_k_images = get_image_names(top_k_indices, dataset) return top_k_images @@ -193,14 +221,13 @@ def pre_compute_img_embeddings(opts, config, checkpoint): model = TERAN(config) # load model state - model.load_state_dict(checkpoint['model'], strict=False) print('Loading dataset') - data_loader = get_coco_image_retrieval_data_loader(config, - query=opts.query, - workers=opts.num_data_workers, - pre_compute_img_embs=True) + data_loader = get_coco_image_retrieval_data(config, + query=opts.query, + workers=opts.num_data_workers, + pre_compute_img_embs=True) # encode the data (i.e. 
compute the embeddings / TE outputs for the images and query)
     encode_data_for_inference(model, data_loader, pre_compute_img_embs=True)

diff --git a/models/teran.py b/models/teran.py
index cf48e23..b57be52 100644
--- a/models/teran.py
+++ b/models/teran.py
@@ -219,7 +219,7 @@ def forward_emb(self, images, captions, img_len, cap_len, boxes):
 
         return img_emb_aggr, cap_emb_aggr, img_feats, cap_feats, cap_len
 
-    def forward_txt_emb(self, captions, cap_len):
+    def forward_txt(self, captions, cap_len):
         """
         compute txt embeddings only
         """
@@ -230,7 +230,7 @@ def forward_txt_emb(self, captions, cap_len):
             cap_feats, cap_len = self.remove_stopwords(captions, cap_feats, cap_len)
         return cap_emb_aggr, cap_feats, cap_len
 
-    def forward_img_emb(self, images, img_len, boxes):
+    def forward_img(self, images, img_len, boxes):
         """
         compute img embeddings only
         """

From 6c697dca385f671fc78de49c6fab4c47c8ffc0ae Mon Sep 17 00:00:00 2001
From: floschne
Date: Thu, 31 Dec 2020 15:06:52 +0100
Subject: [PATCH 18/21] optimized loading of pre-computed embeddings

---
 data.py      | 49 ++++++++++++++++++++++++++++++++-----------------
 inference.py | 11 +++++++++--
 2 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/data.py b/data.py
index 109e7b8..5758aad 100644
--- a/data.py
+++ b/data.py
@@ -1,6 +1,9 @@
 import json as jsonmod
 import os
 import pickle
+import time
+from collections import OrderedDict
+from multiprocessing import Pool
 
 import numpy as np
 import torch
@@ -173,18 +176,26 @@ def get_image_metadata(self, idx):
         return img_metadata, coco_img_id
 
 
+# This has to be outside any class so that it can be pickled for multiproc
+def load_img_emb(args):
+    # just return the query and the img embedding
+    idx, file_name = args
+    npz = np.load(file_name)
+    img_emd = npz.get('img_emb')
+    return idx, img_emd
+
+
 class PreComputedCocoEmbeddingsDataset(CocoImageRetrievalDatasetBase):
     """
     Custom COCO Dataset that uses pre-computed image embedding
     """
 
-    def __init__(self, captions_json, 
coco_annotation_ids, query, num_imgs, config): + def __init__(self, captions_json, coco_annotation_ids, query, num_imgs, config, num_workers=32): CocoImageRetrievalDatasetBase.__init__(self, captions_json, coco_annotation_ids, query, num_imgs) pre_computed_img_embeddings_root = config['image-retrieval']['pre_computed_img_embeddings_root'] self.pre_computed_img_embeddings_root = pre_computed_img_embeddings_root - - self.img_embs = {idx: self.__load_img_emb(idx) for idx in range(num_imgs)} + self.num_workers = num_workers self.vocab_type = str(config['text-model']['name']).lower() if self.vocab_type == 'bert': @@ -192,17 +203,20 @@ def __init__(self, captions_json, coco_annotation_ids, query, num_imgs, config): elif self.vocab_type != 'bert': raise ValueError("Currently only BERT Tokenizer is supported!") - def __load_img_emb(self, idx): - # just return the query and the img embedding - img_metadata, coco_img_id = self.get_image_metadata(idx) - file_name = img_metadata['file_name'] - npz = np.load(os.path.join(self.pre_computed_img_embeddings_root, file_name + '.npz')) - img_emd = npz.get('img_emb') - - return img_emd - - def get_img_embs_and_lens(self): - return self.img_embs + self.img_embs = self.__load_img_embs() + + def __load_img_embs(self): + start = time.time() + print('Parellel loading of pre-computed image embeddings started...') + file_names = list(map(lambda m: os.path.join(self.pre_computed_img_embeddings_root, m[0]['file_name'] + '.npz'), + [self.get_image_metadata(i) for i in range(self.num_imgs)])) + # parallel loading of all image embeddings + with Pool(self.num_workers) as pool: + res = pool.map(load_img_emb, enumerate(file_names)) + pool.join() + res = OrderedDict(res) + print(f'Time elapsed to load pre-computed image embeddings: {time.time() - start} seconds') + return res def get_query_pseudo_batch(self): # tokenize and encode the query @@ -660,7 +674,7 @@ def get_loaders(config, workers, batch_size=None): return train_loader, val_loader -def 
get_coco_image_retrieval_data(config, query, workers=None, pre_compute_img_embs=False): +def get_coco_image_retrieval_data(config, query, num_workers=32, pre_compute_img_embs=False): # get the directories that contain the coco json files and coco annotation ids (which we may not need, I think) roots, coco_annotation_ids = get_paths(config) @@ -681,7 +695,8 @@ def get_coco_image_retrieval_data(config, query, workers=None, pre_compute_img_e coco_annotation_ids=coco_annotation_ids, query=query, num_imgs=num_imgs, - config=config) + config=config, + num_workers=num_workers) return dataset @@ -699,7 +714,7 @@ def get_coco_image_retrieval_data(config, query, workers=None, pre_compute_img_e batch_size=batch_size, shuffle=False, pin_memory=True, - num_workers=workers, + num_workers=num_workers, collate_fn=collate_fn) return data_loader diff --git a/inference.py b/inference.py index acb4b26..f246a1e 100644 --- a/inference.py +++ b/inference.py @@ -154,17 +154,24 @@ def get_precomputed_embeddings(config, opts, model): # compute the query embedding with torch.no_grad(): + start_query_batch = time.time() query_token_pseudo_batch, query_lengths = dataset.get_query_pseudo_batch() + print(f'Time to get query pseudo batch: {time.time() - start_query_batch}') + + start_query_enc = time.time() query_emb_aggr, query_emb, _ = model.forward_txt(query_token_pseudo_batch, query_lengths) + print(f'Time to compute query embedding: {time.time() - start_query_enc}') + # store results as np arrays for further processing or persisting query_feat_dim = query_emb.size(2) query_embs = torch.zeros((1, query_lengths[0], query_feat_dim), requires_grad=False) query_embs[0, :, :] = query_emb.cpu().permute(1, 0, 2) + # get the img embeddings and convert them to Tensors - np_img_embs = list(dataset.img_embs.values()) - img_embs = torch.Tensor(np_img_embs) + np_img_embs = np.array(list(dataset.img_embs.values())) + img_embs = torch.Tensor(np_img_embs) # here is the bottleneck img_length = 
len(np_img_embs[0]) print(f"Time elapsed to load pre-computed embeddings and compute query embedding: {time.time() - start} seconds!") return img_embs, query_embs, img_length, query_lengths, dataset From 3bf88b5dd16796082ed979c0a7c5751115ef52b6 Mon Sep 17 00:00:00 2001 From: floschne Date: Thu, 31 Dec 2020 15:11:49 +0100 Subject: [PATCH 19/21] minor bugfix --- inference.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/inference.py b/inference.py index f246a1e..f835947 100644 --- a/inference.py +++ b/inference.py @@ -162,13 +162,11 @@ def get_precomputed_embeddings(config, opts, model): query_emb_aggr, query_emb, _ = model.forward_txt(query_token_pseudo_batch, query_lengths) print(f'Time to compute query embedding: {time.time() - start_query_enc}') - # store results as np arrays for further processing or persisting query_feat_dim = query_emb.size(2) query_embs = torch.zeros((1, query_lengths[0], query_feat_dim), requires_grad=False) query_embs[0, :, :] = query_emb.cpu().permute(1, 0, 2) - # get the img embeddings and convert them to Tensors np_img_embs = np.array(list(dataset.img_embs.values())) img_embs = torch.Tensor(np_img_embs) # here is the bottleneck @@ -191,7 +189,7 @@ def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: # returns a Dataloader of a PreComputedCocoFeaturesDataset data_loader = get_coco_image_retrieval_data(config, query=opts.query, - workers=opts.num_data_workers) + num_workers=opts.num_data_workers) dataset = data_loader.dataset # encode the data (i.e. compute the embeddings / TE outputs for the images and query) img_embs, query_embs, img_lengths, query_lengths = encode_data_for_inference(model, data_loader) @@ -233,7 +231,7 @@ def pre_compute_img_embeddings(opts, config, checkpoint): print('Loading dataset') data_loader = get_coco_image_retrieval_data(config, query=opts.query, - workers=opts.num_data_workers, + num_workers=opts.num_data_workers, pre_compute_img_embs=True) # encode the data (i.e. 
compute the embeddings / TE outputs for the images and query) @@ -270,6 +268,8 @@ def pre_compute_img_embeddings(opts, config, checkpoint): if not opts.pre_compute_img_embeddings: top_k_matches = top_k_image_retrieval(opts, model_config, model_checkpoint) + print(f"##########################################") + print(f"QUERY: {opts.query}") print(f"######## TOP {opts.top_k} RESULTS ########") print(top_k_matches) else: From 9f17dfe30d906442cb6bd580fbf8f279a7bbfaf1 Mon Sep 17 00:00:00 2001 From: floschne Date: Sun, 3 Jan 2021 15:25:07 +0100 Subject: [PATCH 20/21] modularized code a bit --- data.py | 88 +++++++++++++++++++++++++++++----------------------- inference.py | 40 ++++++++++-------------- 2 files changed, 66 insertions(+), 62 deletions(-) diff --git a/data.py b/data.py index 5758aad..15f83b0 100644 --- a/data.py +++ b/data.py @@ -151,29 +151,18 @@ def __len__(self): class CocoImageRetrievalDatasetBase: - def __init__(self, captions_json, coco_annotation_ids, query, num_imgs): - self.query = query + def __init__(self, captions_json, coco_annotation_ids, num_imgs): self.num_imgs = num_imgs self.coco = COCO(captions_json) self.anno_ids = coco_annotation_ids - def get_raw_item(self, idx): - next_img_idx = idx * 5 # in the coco dataset there are 5 captions for every image - ann_id = self.anno_ids[next_img_idx] - coco_img_id = self.coco.anns[ann_id]['image_id'] - img_metadata = self.coco.imgs[coco_img_id] - img_size = np.array([img_metadata['width'], img_metadata['height']]) - - return coco_img_id, img_size - def get_image_metadata(self, idx): - # TODO can't we just get coco.imgs[idx'] somehow? 
next_img_idx = idx * 5 # in the coco dataset there are 5 captions for every image ann_id = self.anno_ids[next_img_idx] coco_img_id = self.coco.anns[ann_id]['image_id'] img_metadata = self.coco.imgs[coco_img_id] - return img_metadata, coco_img_id + return coco_img_id, img_metadata # This has to be outside any class so that it can be pickled for multiproc @@ -185,30 +174,24 @@ def load_img_emb(args): return idx, img_emd -class PreComputedCocoEmbeddingsDataset(CocoImageRetrievalDatasetBase): +class PreComputedCocoImageEmbeddingsDataset(CocoImageRetrievalDatasetBase): """ Custom COCO Dataset that uses pre-computed image embedding """ - def __init__(self, captions_json, coco_annotation_ids, query, num_imgs, config, num_workers=32): - CocoImageRetrievalDatasetBase.__init__(self, captions_json, coco_annotation_ids, query, num_imgs) + def __init__(self, captions_json, coco_annotation_ids, num_imgs, config, num_workers=32): + CocoImageRetrievalDatasetBase.__init__(self, captions_json, coco_annotation_ids, num_imgs) pre_computed_img_embeddings_root = config['image-retrieval']['pre_computed_img_embeddings_root'] self.pre_computed_img_embeddings_root = pre_computed_img_embeddings_root self.num_workers = num_workers - self.vocab_type = str(config['text-model']['name']).lower() - if self.vocab_type == 'bert': - self.tokenizer = BertTokenizer.from_pretrained(config['text-model']['pretrain']) - elif self.vocab_type != 'bert': - raise ValueError("Currently only BERT Tokenizer is supported!") - self.img_embs = self.__load_img_embs() def __load_img_embs(self): start = time.time() - print('Parellel loading of pre-computed image embeddings started...') - file_names = list(map(lambda m: os.path.join(self.pre_computed_img_embeddings_root, m[0]['file_name'] + '.npz'), + print('Parallel loading of pre-computed image embeddings started...') + file_names = list(map(lambda m: os.path.join(self.pre_computed_img_embeddings_root, m[1]['file_name'] + '.npz'), [self.get_image_metadata(i) for i in 
range(self.num_imgs)])) # parallel loading of all image embeddings with Pool(self.num_workers) as pool: @@ -218,16 +201,45 @@ def __load_img_embs(self): print(f'Time elapsed to load pre-computed image embeddings: {time.time() - start} seconds') return res - def get_query_pseudo_batch(self): + def __len__(self): + return self.num_imgs + + +class QueryEncoder: + def __init__(self, config, model): + self.vocab_type = str(config['text-model']['name']).lower() + if self.vocab_type == 'bert': + self.tokenizer = BertTokenizer.from_pretrained(config['text-model']['pretrain']) + elif self.vocab_type != 'bert': + raise ValueError("Currently only BERT Tokenizer is supported!") + + self.model = model + + def _get_query_pseudo_batch(self, query: str): # tokenize and encode the query - query_token_ids = torch.LongTensor(self.tokenizer.encode(self.query)) + query_token_ids = torch.LongTensor(self.tokenizer.encode(query)) # create a pseudo batch suitable for TERAN query_token_pseudo_batch = query_token_ids.unsqueeze(dim=0) query_lengths = [len(query_token_ids)] return query_token_pseudo_batch, query_lengths - def __len__(self): - return self.num_imgs + def compute_query_embedding(self, query): + # compute the query embedding + with torch.no_grad(): + start_query_batch = time.time() + query_token_pseudo_batch, query_lengths = self._get_query_pseudo_batch(query) + print(f'Time to get query pseudo batch: {time.time() - start_query_batch}') + + start_query_enc = time.time() + query_emb_aggr, query_emb, _ = self.model.forward_txt(query_token_pseudo_batch, query_lengths) + print(f'Time to compute query embedding: {time.time() - start_query_enc}') + + # store results as np arrays for further processing or persisting + query_feat_dim = query_emb.size(2) + query_embs = torch.zeros((1, query_lengths[0], query_feat_dim), requires_grad=False) + query_embs[0, :, :] = query_emb.cpu().permute(1, 0, 2) + + return query_embs, query_lengths class 
PreComputedCocoFeaturesDataset(CocoImageRetrievalDatasetBase, data.Dataset): @@ -237,17 +249,19 @@ class PreComputedCocoFeaturesDataset(CocoImageRetrievalDatasetBase, data.Dataset """ def __init__(self, imgs_root, img_features_path, captions_json, coco_annotation_ids, query, num_imgs): - CocoImageRetrievalDatasetBase.__init__(self, captions_json, coco_annotation_ids, query, num_imgs) + CocoImageRetrievalDatasetBase.__init__(self, captions_json, coco_annotation_ids, num_imgs) self.feats_data_path = os.path.join(img_features_path, 'bu_att') self.box_data_path = os.path.join(img_features_path, 'bu_box') self.imgs_root = imgs_root + self.query = query def __getitem__(self, idx): """ This function returns a tuple that is further passed to collate_fn """ - img_id, img_size = self.get_raw_item(idx) + img_id, img_metadata = self.get_image_metadata(idx) + img_size = np.array([img_metadata['width'], img_metadata['height']]) img_feat_path = os.path.join(self.feats_data_path, '{}.npz'.format(img_id)) img_box_path = os.path.join(self.box_data_path, '{}.npy'.format(img_id)) @@ -674,7 +688,7 @@ def get_loaders(config, workers, batch_size=None): return train_loader, val_loader -def get_coco_image_retrieval_data(config, query, num_workers=32, pre_compute_img_embs=False): +def get_coco_image_retrieval_data(config, query=None, num_workers=32, pre_compute_img_embs=False): # get the directories that contain the coco json files and coco annotation ids (which we may not need, I think) roots, coco_annotation_ids = get_paths(config) @@ -691,13 +705,11 @@ def get_coco_image_retrieval_data(config, query, num_workers=32, pre_compute_img use_precomputed_img_embeddings = config['image-retrieval']['use_precomputed_img_embeddings'] if use_precomputed_img_embeddings: - dataset = PreComputedCocoEmbeddingsDataset(captions_json=captions_json, - coco_annotation_ids=coco_annotation_ids, - query=query, - num_imgs=num_imgs, - config=config, - num_workers=num_workers) - + dataset = 
PreComputedCocoImageEmbeddingsDataset(captions_json=captions_json, + coco_annotation_ids=coco_annotation_ids, + num_imgs=num_imgs, + config=config, + num_workers=num_workers) return dataset dataset = PreComputedCocoFeaturesDataset(imgs_root=imgs_root, diff --git a/inference.py b/inference.py index f835947..2949061 100644 --- a/inference.py +++ b/inference.py @@ -10,7 +10,7 @@ import tqdm import yaml -from data import get_coco_image_retrieval_data +from data import get_coco_image_retrieval_data, QueryEncoder from models.loss import AlignmentContrastiveLoss from models.teran import TERAN from utils import AverageMeter, LogCollector @@ -143,36 +143,21 @@ def compute_distances(img_embs, query_embs, img_lengths, query_lengths, config): def get_image_names(dataset_indices, dataset) -> List[str]: - return [dataset.get_image_metadata(idx)[0]['file_name'] for idx in dataset_indices] + return [dataset.get_image_metadata(idx)[1]['file_name'] for idx in dataset_indices] -def get_precomputed_embeddings(config, opts, model): +def load_precomputed_image_embeddings(config): print("Loading pre-computed image embeddings...") start = time.time() - # returns a PreComputedCocoEmbeddingsDataset - dataset = get_coco_image_retrieval_data(config, query=opts.query) - - # compute the query embedding - with torch.no_grad(): - start_query_batch = time.time() - query_token_pseudo_batch, query_lengths = dataset.get_query_pseudo_batch() - print(f'Time to get query pseudo batch: {time.time() - start_query_batch}') - - start_query_enc = time.time() - query_emb_aggr, query_emb, _ = model.forward_txt(query_token_pseudo_batch, query_lengths) - print(f'Time to compute query embedding: {time.time() - start_query_enc}') - - # store results as np arrays for further processing or persisting - query_feat_dim = query_emb.size(2) - query_embs = torch.zeros((1, query_lengths[0], query_feat_dim), requires_grad=False) - query_embs[0, :, :] = query_emb.cpu().permute(1, 0, 2) + # returns a 
PreComputedCocoImageEmbeddingsDataset + dataset = get_coco_image_retrieval_data(config) # get the img embeddings and convert them to Tensors np_img_embs = np.array(list(dataset.img_embs.values())) img_embs = torch.Tensor(np_img_embs) # here is the bottleneck - img_length = len(np_img_embs[0]) + img_lengths = len(np_img_embs[0]) print(f"Time elapsed to load pre-computed embeddings and compute query embedding: {time.time() - start} seconds!") - return img_embs, query_embs, img_length, query_lengths, dataset + return img_embs, img_lengths, dataset def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: @@ -184,7 +169,12 @@ def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: use_precomputed_img_embeddings = config['image-retrieval']['use_precomputed_img_embeddings'] if use_precomputed_img_embeddings: - img_embs, query_embs, img_lengths, query_lengths, dataset = get_precomputed_embeddings(config, opts, model) + # load pre computed img embs + img_embs, img_lengths, dataset = load_precomputed_image_embeddings(config) + # compute query emb + query_encoder = QueryEncoder(config, model) + query_embs, query_lengths = query_encoder.compute_query_embedding(opts.query) + else: # returns a Dataloader of a PreComputedCocoFeaturesDataset data_loader = get_coco_image_retrieval_data(config, @@ -194,7 +184,9 @@ def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: # encode the data (i.e. 
compute the embeddings / TE outputs for the images and query) img_embs, query_embs, img_lengths, query_lengths = encode_data_for_inference(model, data_loader) - torch.cuda.empty_cache() + if opts.device == "cuda": + torch.cuda.empty_cache() + print(f"Images Embeddings: {img_embs.shape[0]}, Query Embeddings: {query_embs.shape[0]}") # compute the matching scores From 915dcbd0542437adb82bdc6052626deb1a419022 Mon Sep 17 00:00:00 2001 From: floschne Date: Sun, 3 Jan 2021 15:43:39 +0100 Subject: [PATCH 21/21] own fn to load teran --- inference.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/inference.py b/inference.py index 2949061..56e026d 100644 --- a/inference.py +++ b/inference.py @@ -146,31 +146,35 @@ def get_image_names(dataset_indices, dataset) -> List[str]: return [dataset.get_image_metadata(idx)[1]['file_name'] for idx in dataset_indices] -def load_precomputed_image_embeddings(config): +def load_precomputed_image_embeddings(config, num_workers): print("Loading pre-computed image embeddings...") start = time.time() # returns a PreComputedCocoImageEmbeddingsDataset - dataset = get_coco_image_retrieval_data(config) + dataset = get_coco_image_retrieval_data(config, num_workers=num_workers) # get the img embeddings and convert them to Tensors np_img_embs = np.array(list(dataset.img_embs.values())) - img_embs = torch.Tensor(np_img_embs) # here is the bottleneck + img_embs = torch.Tensor(np_img_embs) img_lengths = len(np_img_embs[0]) print(f"Time elapsed to load pre-computed embeddings and compute query embedding: {time.time() - start} seconds!") return img_embs, img_lengths, dataset -def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: +def load_teran(config, checkpoint): # construct model model = TERAN(config) - # load model state model.load_state_dict(checkpoint['model'], strict=False) + return model + + +def top_k_image_retrieval(opts, config, checkpoint) -> List[str]: + model = load_teran(config, checkpoint) 
use_precomputed_img_embeddings = config['image-retrieval']['use_precomputed_img_embeddings'] if use_precomputed_img_embeddings: # load pre computed img embs - img_embs, img_lengths, dataset = load_precomputed_image_embeddings(config) + img_embs, img_lengths, dataset = load_precomputed_image_embeddings(config, num_workers=opts.num_data_workers) # compute query emb query_encoder = QueryEncoder(config, model) query_embs, query_lengths = query_encoder.compute_query_embedding(opts.query)