mesnico · floschne · Dec 8, 2020 · Dec 8, 2020 · Dec 9, 2020 · Dec 9, 2020
diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,10 @@
 *.ipynb_checkpoints
 *.json
 *.pth.tar
+
+
+.idea
+data
+pretrained_models
+*.tar
+*.ipynb
diff --git a/README.md b/README.md
@@ -32,6 +32,12 @@ conda activate teran
 export PYTHONPATH=.
 ```
 
+2.1 Set up a minimal Python environment for CUDA 10.1 using conda:
+```
+conda env create --file environment_min.yml
+conda activate teran
+export PYTHONPATH=.
+```
 ## Get the data
 1. Download and extract the data folder, containing annotations, the splits by Karpathy et al. and ROUGEL - SPICE precomputed relevances for both COCO and Flickr30K datasets:
 

diff --git a/__init__.py b/__init__.py
@@ -0,0 +1 @@
+from .data import d
diff --git a/configs/teran_coco_MrSw_IR.yaml b/configs/teran_coco_MrSw_IR.yaml
@@ -0,0 +1,63 @@
+dataset:
+  name: 'coco'
+  images-path: 'data/coco/images'  # not needed if using pre-extracted bottom-up features
+  data: 'data'
+  restval: True
+  pre-extracted-features: False
+
+image-retrieval:
+  dataset: 'coco' # only COCO is supported for now
+  split: 'test' # we can remove this in later versions
+  num_imgs: 5000
+  batch_size: 100 # 100 takes ~10s; 1000 takes ~14s to encode the data (compute the TE outputs)
+  pre_extracted_img_features_root: 'data/coco/features_36'
+  create_query_batch: False
+  alignment_mode: 'MrSw'
+  use_precomputed_img_embeddings: False
+  pre_computed_img_embeddings_root: 'data/coco/pre_computed_embeddings'
+
+text-model:
+  name: 'bert'
+  pretrain: 'bert-base-uncased'
+  word-dim: 768
+  extraction-hidden-layer: 6
+  fine-tune: True
+  pre-extracted: False
+  layers: 0
+  dropout: 0.1
+
+image-model:
+  name: 'bottomup'
+  pre-extracted-features-root: 'data/coco/features_36'
+  transformer-layers: 4
+  dropout: 0.1
+  pos-encoding: 'concat-and-process'
+  crop-size: 224  # not used
+  fine-tune: False
+  feat-dim: 2048
+  norm: True
+
+model:
+  name: 'teran'
+  embed-size: 1024
+  text-aggregation: 'first'
+  image-aggregation: 'first'
+  layers: 2
+  exclude-stopwords: False
+  shared-transformer: False
+  dropout: 0.1
+
+training:
+  lr: 0.00001  # 0.000006
+  grad-clip: 2.0
+  max-violation: True
+  loss-type: 'alignment'
+  alignment-mode: 'MrSw'
+  measure: 'dot'
+  margin: 0.2
+  bs: 40
+  scheduler: 'steplr'
+  gamma: 0.1
+  step-size: 20
+  warmup: null
+  warmup-period: 1000
diff --git a/configs/teran_coco_MrSw_IR_PreComp.yaml b/configs/teran_coco_MrSw_IR_PreComp.yaml
@@ -0,0 +1,63 @@
+dataset:
+  name: 'coco'
+  images-path: 'data/coco/images'  # not needed if using pre-extracted bottom-up features
+  data: 'data'
+  restval: True
+  pre-extracted-features: False
+
+image-retrieval:
+  dataset: 'coco' # only COCO is supported for now
+  split: 'test' # we can remove this in later versions
+  num_imgs: 5000
+  batch_size: 100 # 100 takes ~10s; 1000 takes ~14s to encode the data (compute the TE outputs)
+  pre_extracted_img_features_root: 'data/coco/features_36'
+  create_query_batch: False
+  alignment_mode: 'MrSw'
+  use_precomputed_img_embeddings: True
+  pre_computed_img_embeddings_root: 'data/coco/pre_computed_embeddings'
+
+text-model:
+  name: 'bert'
+  pretrain: 'bert-base-uncased'
+  word-dim: 768
+  extraction-hidden-layer: 6
+  fine-tune: True
+  pre-extracted: False
+  layers: 0
+  dropout: 0.1
+
+image-model:
+  name: 'bottomup'
+  pre-extracted-features-root: 'data/coco/features_36'
+  transformer-layers: 4
+  dropout: 0.1
+  pos-encoding: 'concat-and-process'
+  crop-size: 224  # not used
+  fine-tune: False
+  feat-dim: 2048
+  norm: True
+
+model:
+  name: 'teran'
+  embed-size: 1024
+  text-aggregation: 'first'
+  image-aggregation: 'first'
+  layers: 2
+  exclude-stopwords: False
+  shared-transformer: False
+  dropout: 0.1
+
+training:
+  lr: 0.00001  # 0.000006
+  grad-clip: 2.0
+  max-violation: True
+  loss-type: 'alignment'
+  alignment-mode: 'MrSw'
+  measure: 'dot'
+  margin: 0.2
+  bs: 40
+  scheduler: 'steplr'
+  gamma: 0.1
+  step-size: 20
+  warmup: null
+  warmup-period: 1000
diff --git a/configs/teran_inf_coco_MrSw.yaml b/configs/teran_inf_coco_MrSw.yaml
@@ -0,0 +1,59 @@
+dataset:
+  name: 'coco'
+  images-path: 'data/coco/images'  # not needed if using pre-extracted bottom-up features
+  data: 'data'
+  restval: True
+  pre-extracted-features: False
+
+text-model:
+  name: 'bert'
+  pretrain: 'bert-base-uncased'
+  word-dim: 768
+  extraction-hidden-layer: 6
+  fine-tune: True
+  pre-extracted: False
+  layers: 0
+  dropout: 0.1
+
+#text-model:
+#  name: 'gru'
+#  word-dim: 300
+#  fine-tune: True
+#  pre-extracted: False
+#  layers: 1
+
+image-model:
+  name: 'bottomup'
+  pre-extracted-features-root: 'data/coco/features_36'
+  transformer-layers: 4
+  dropout: 0.1
+  pos-encoding: 'concat-and-process'
+  crop-size: 224  # not used
+  fine-tune: False
+  feat-dim: 2048
+  norm: True
+
+model:
+  name: 'teran'
+  embed-size: 1024
+  text-aggregation: 'first'
+  image-aggregation: 'first'
+  layers: 2
+  exclude-stopwords: False
+  shared-transformer: False
+  dropout: 0.1
+
+training:
+  lr: 0.00001  # 0.000006
+  grad-clip: 2.0
+  max-violation: True
+  loss-type: 'alignment'
+  alignment-mode: 'MrSw'
+  measure: 'dot'
+  margin: 0.2
+  bs: 40
+  scheduler: 'steplr'
+  gamma: 0.1
+  step-size: 20
+  warmup: null
+  warmup-period: 1000