From c5c7cec911dc65f8e4b2d0605cbd4400ea660b7f Mon Sep 17 00:00:00 2001
From: Andreas Irmler <andreas.irmler@tuwien.ac.at>
Date: Fri, 13 Nov 2020 09:58:07 +0100
Subject: [PATCH 01/19] remedy segfault with openmpi/intelmpi in MPI_Reduce

---
 src/contraction/ctr_2d_general.cxx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/contraction/ctr_2d_general.cxx b/src/contraction/ctr_2d_general.cxx
index 117704be..3f02c6d4 100755
--- a/src/contraction/ctr_2d_general.cxx
+++ b/src/contraction/ctr_2d_general.cxx
@@ -431,7 +431,7 @@ namespace CTF_int {
         if (cdt_C->rank == owner_C)
           cdt_C->red(MPI_IN_PLACE, op_C, s_C, sr_C->mdtype(), sr_C->addmop(), owner_C);
         else
-          cdt_C->red(op_C, NULL, s_C, sr_C->mdtype(), sr_C->addmop(), owner_C);
+          cdt_C->red(op_C, op_C, s_C, sr_C->mdtype(), sr_C->addmop(), owner_C);
         if (rank_C == owner_C){
           sr_C->copy(ctr_sub_lda_C, ctr_lda_C,
                      op_C, ctr_sub_lda_C, sr_C->mulid(),

From 594ecc8decdfd91f38094c9792a06e5806cf124a Mon Sep 17 00:00:00 2001
From: Andreas Irmler <andreas.irmler@tuwien.ac.at>
Date: Fri, 19 Feb 2021 18:33:30 +0100
Subject: [PATCH 02/19] 1.) contraction time estimate returns value in sec 2.)
 dgemm takes heed of 'rectangularism' of contraction 3.) use realistic model
 values for vsc4

---
 src/contraction/contraction.cxx | 232 ++++++++++++++++----------------
 src/contraction/contraction.h   |   2 +-
 src/contraction/ctr_tsr.cxx     |  14 +-
 src/shared/init_models.cxx      |  16 +--
 src/tensor/untyped_tensor.cxx   |  25 +++-
 src/tensor/untyped_tensor.h     |   6 +-
 6 files changed, 160 insertions(+), 135 deletions(-)

diff --git a/src/contraction/contraction.cxx b/src/contraction/contraction.cxx
index f5dd14dc..8421fae9 100644
--- a/src/contraction/contraction.cxx
+++ b/src/contraction/contraction.cxx
@@ -70,7 +70,7 @@ namespace CTF_int {
     func = func_;
     alpha = alpha_;
     beta  = beta_;
-   
+
     idx_A = (int*)alloc(sizeof(int)*A->order);
     idx_B = (int*)alloc(sizeof(int)*B->order);
     idx_C = (int*)alloc(sizeof(int)*C->order);
@@ -96,7 +96,7 @@ namespace CTF_int {
     func = func_;
     alpha = alpha_;
     beta  = beta_;
-   
+
     conv_idx(A->order, cidx_A, &idx_A, B->order, cidx_B, &idx_B, C->order, cidx_C, &idx_C);
   }
 
@@ -111,7 +111,7 @@ namespace CTF_int {
     //if (A->wrld->cdt.cm == MPI_COMM_WORLD){
 //      update_all_models(A->wrld->cdt.cm);
     //}
-   
+
     int stat = home_contract();
     if (stat != SUCCESS){
       printf("CTF ERROR: Failed to perform contraction\n");
@@ -120,7 +120,7 @@ namespace CTF_int {
 #endif
     }
   }
- 
+
   template<typename ptype>
   void get_perm(int     perm_order,
                 ptype   A,
@@ -166,7 +166,7 @@ namespace CTF_int {
         break;
     }
   }
-  
+
   void contraction::set_output_nnz_frac(double nnz_frac){
     //assert(nnz_frac >= 0. && nnz_frac <= 1.);
     this->output_nnz_frac = nnz_frac;
@@ -273,17 +273,17 @@ namespace CTF_int {
   }
 
   double contraction::estimate_time(){
-    int np = std::max(A->wrld->np,B->wrld->np);
-    double flop_rate = 1.E9*np;
-    double bw_rate = 1.E5*np;
-    return this->estimate_num_flops()/flop_rate + this->estimate_bw()/bw_rate;
+    ctr * ctrf = NULL; // only needed to satisfy map(); tensors are not remapped (do_remap=false)
+    double est_time = 0.0; // stays 0.0 if map() never writes it (e.g. a cached mapping is reused)
+    map(&ctrf, est_time, false);
+    return est_time;
   }
 
   int contraction::is_equal(contraction const & os){
     if (this->A != os.A) return 0;
     if (this->B != os.B) return 0;
     if (this->C != os.C) return 0;
-   
+
     for (int i=0; i<A->order; i++){
       if (idx_A[i] != os.idx_A[i]) return 0;
     }
@@ -323,7 +323,7 @@ namespace CTF_int {
                      iparam *       inner_prm){
     int i, num_tot, num_ctr, num_no_ctr_A, num_no_ctr_B, num_weigh;
     int * idx_arr;
-     
+
     inv_idx(A->order, idx_A,
             B->order, idx_B,
             C->order, idx_C,
@@ -407,7 +407,7 @@ namespace CTF_int {
               A->sym[inA] != C->sym[inC]){
             broken = 1;
           }
-        } else { 
+        } else {
           if (((inA>=0) + (inB>=0) + (inC>=0) != 3) ||
               ((inB == -1) ^ (iB == -1)) ||
               ((inC == -1) ^ (iC == -1)) ||
@@ -429,7 +429,7 @@ namespace CTF_int {
         }
       }
     }
-   
+
     for (iC=0; iC<C->order; iC++){
       i = idx_C[iC];
       iA = idx_arr[3*i+0];
@@ -457,7 +457,7 @@ namespace CTF_int {
         }
       }
     }
-   
+
     for (iB=0; iB<B->order; iB++){
       i = idx_B[iB];
       iC = idx_arr[3*i+2];
@@ -577,7 +577,7 @@ namespace CTF_int {
     int idx_self_C, idx_self_A, idx_self_B;
     int num_self_C, num_self_A, num_self_B;
     int * ordering_A, * ordering_B, * ordering_C, * idx_arr;
-   
+
     CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&ordering_A);
     CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&ordering_B);
     CTF_int::alloc_ptr(sizeof(int)*C->order, (void**)&ordering_C);
@@ -645,13 +645,13 @@ namespace CTF_int {
     *new_ordering_A = ordering_A;
     *new_ordering_B = ordering_B;
     *new_ordering_C = ordering_C;
-   
+
     //iparam iprm;
     //calc_fold_nmk(A, B, C, idx_A, idx_B, idx_C, *new_ordering_A, *new_ordering_B, &iprm);
     //return iprm;
   }
 
- 
+
   void contraction::get_fold_ctr(contraction *& fold_ctr,
                                  int &          all_fdim_A,
                                  int &          all_fdim_B,
@@ -786,7 +786,7 @@ namespace CTF_int {
       permute_target(tfA->order, tfnew_ord_A, tAiord);
       permute_target(tfB->order, tfnew_ord_B, tBiord);
       permute_target(tfC->order, tfnew_ord_C, tCiord);
-   
+
       double time_est = 0.0;
       if (tA->is_sparse)
         time_est += tA->nnz_tot/(((double)tA->size)*tA->calc_npe())*tA->calc_nvirt()*est_time_transp(tall_fdim_A, tAiord, tall_flen_A, 1, tA->sr);
@@ -940,7 +940,7 @@ namespace CTF_int {
             }
           }
         }
-    
+
         A->spmatricize(iprm.m, iprm.k, nrow_idx, all_fdim_A, all_flen_A, csr_or_coo, use_ccsr);
       }
       if (!B->is_sparse){
@@ -975,7 +975,7 @@ namespace CTF_int {
         C->spmatricize(iprm.m, iprm.n, nrow_idx, all_fdim_C, all_flen_C, csr_or_coo, use_ccsr);
         C->sr->dealloc(C->data);
       }
-   
+
     }
 
     CTF_int::cdealloc(fnew_ord_A);
@@ -1022,9 +1022,9 @@ namespace CTF_int {
     int i, num_tot, iA, iB, iC;
     int * idx_arr;
     tensor * nA, * nB, * nC;
-  
+
     contraction * nctr;
-   
+
     if (new_contraction != NULL){
       nA = new tensor(A, 0, 0);
       nB = new tensor(B, 0, 0);
@@ -1088,7 +1088,7 @@ namespace CTF_int {
             }
             CTF_int::cdealloc(idx_arr);
             return 3*i;
-          }      
+          }
         }
         if (idx_arr[3*iA+2] != -1){
           if (C->sym[idx_arr[3*iA+2]] != A->sym[i] ||
@@ -1108,12 +1108,12 @@ namespace CTF_int {
             }
             CTF_int::cdealloc(idx_arr);
             return 3*i;
-          }      
+          }
         }
       }
     }
 
-  
+
     int nB_sym[B->order];
     if (new_contraction != NULL)
       memcpy(nB_sym, nB->sym, sizeof(int)*nB->order);
@@ -1138,7 +1138,7 @@ namespace CTF_int {
             }
             CTF_int::cdealloc(idx_arr);
             return 3*i+1;
-          }      
+          }
         }
         if (idx_arr[3*iB+2] != -1){
           if (C->sym[idx_arr[3*iB+2]] != B->sym[i] ||
@@ -1158,7 +1158,7 @@ namespace CTF_int {
             }
             CTF_int::cdealloc(idx_arr);
             return 3*i+1;
-          }      
+          }
         }
       }
     }
@@ -1205,7 +1205,7 @@ namespace CTF_int {
             }
             CTF_int::cdealloc(idx_arr);
             return 3*i+2;
-          }      
+          }
           if (idx_arr[3*iC+0] != -1){
             if (A->sym[idx_arr[3*iC+0]] != C->sym[i] ||
                 idx_C[i+1] != idx_A[idx_arr[3*iC+0]+1]){
@@ -1224,7 +1224,7 @@ namespace CTF_int {
               }
               CTF_int::cdealloc(idx_arr);
               return 3*i+2;
-            }      
+            }
           }
         }
       }
@@ -1238,7 +1238,7 @@ namespace CTF_int {
     int64_t len;
     int iA, iB, iC;
     int * idx_arr;
-      
+
     inv_idx(A->order, idx_A,
             B->order, idx_B,
             C->order, idx_C,
@@ -1287,7 +1287,7 @@ namespace CTF_int {
     return true;
   }
 
-   
+
   int contraction::check_mapping(){
 
     int num_tot, i, ph_A, ph_B, iA, iB, iC, pass, order, topo_order;
@@ -1302,11 +1302,11 @@ namespace CTF_int {
     if (B->is_mapped == 0) pass = 0;
     if (C->is_mapped == 0) pass = 0;
     ASSERT(pass==1);
-   
+
     if (A->is_folded == 1) pass = 0;
     if (B->is_folded == 1) pass = 0;
     if (C->is_folded == 1) pass = 0;
-   
+
     if (pass==0){
       DPRINTF(3,"failed confirmation here\n");
       return 0;
@@ -1331,7 +1331,7 @@ namespace CTF_int {
             B->order, idx_B,
             C->order, idx_C,
             &num_tot, &idx_arr);
-   
+
     if (!check_self_mapping(A, idx_A))
       pass = 0;
     if (!check_self_mapping(B, idx_B))
@@ -1552,7 +1552,7 @@ namespace CTF_int {
 
     tsr_order = num_weigh;
 
-   
+
     for (i=0; i<num_weigh; i++){
       iweigh = idx_weigh[i];
       iA = idx_arr[iweigh*3+0];
@@ -1563,7 +1563,7 @@ namespace CTF_int {
           B->edge_map[iB].type == PHYSICAL_MAP ||
           C->edge_map[iC].type == PHYSICAL_MAP)
         return NEGATIVE;
-    } 
+    }
     CTF_int::alloc_ptr(tsr_order*sizeof(int),           (void**)&restricted);
     CTF_int::alloc_ptr(tsr_order*sizeof(int64_t),       (void**)&tsr_edge_len);
     CTF_int::alloc_ptr(tsr_order*tsr_order*sizeof(int), (void**)&tsr_sym_table);
@@ -1586,7 +1586,7 @@ namespace CTF_int {
       iB = idx_arr[iweigh*3+1];
       iC = idx_arr[iweigh*3+2];
 
-     
+
       weigh_map[i].np = lcm(weigh_map[i].np,A->edge_map[iA].np);
       weigh_map[i].np = lcm(weigh_map[i].np,B->edge_map[iB].np);
       weigh_map[i].np = lcm(weigh_map[i].np,C->edge_map[iC].np);
@@ -1636,7 +1636,7 @@ namespace CTF_int {
 
     if (stat == ERROR)
       return ERROR;
-   
+
     /* define mapping of tensors A and B according to the mapping of ctr dims */
     if (stat == SUCCESS){
       for (i=0; i<num_weigh; i++){
@@ -1719,7 +1719,7 @@ namespace CTF_int {
     extract_free_comms(topo, A->order, A->edge_map,
                              B->order, B->edge_map,
                        num_sub_phys_dims, &sub_phys_comm, &comm_idx);
-   
+
 
     /* Map a tensor of dimension 2*num_ctr, with symmetries among each pair.
      * Set the edge lengths and symmetries according to those in ctr dims of A and B.
@@ -1784,7 +1784,7 @@ namespace CTF_int {
     //}
     if (stat == ERROR)
       return ERROR;
-   
+
     /* define mapping of tensors A and B according to the mapping of ctr dims */
     if (stat == SUCCESS){
       for (i=0; i<num_ctr; i++){
@@ -1840,7 +1840,7 @@ namespace CTF_int {
       iB = idx_arr[3*inoctr+1];
       iC = idx_arr[3*inoctr+2];
 
-     
+
       if (iC != -1 && iA != -1){
         copy_mapping(1, C->edge_map + iC, A->edge_map + iA);
       }
@@ -1861,7 +1861,7 @@ namespace CTF_int {
       iB = idx_arr[3*inoctr+1];
       iC = idx_arr[3*inoctr+2];
 
-     
+
       if (iA != -1 && iC != -1){
         copy_mapping(1, A->edge_map + iA, C->edge_map + iC);
       }
@@ -1879,7 +1879,7 @@ namespace CTF_int {
       iB = idx_arr[3*inoctr+1];
       iC = idx_arr[3*inoctr+2];
 
-     
+
       if (iA != -1 && iC != -1){
         copy_mapping(1, C->edge_map + iC, A->edge_map + iA);
       }
@@ -1972,7 +1972,7 @@ namespace CTF_int {
     nAB=0;
     nAC=0;
     nBC=0;
- 
+
     for (int i=0; i<num_tot; i++){
       if (idx_arr[3*i+0] != -1 && idx_arr[3*i+1] != -1 && idx_arr[3*i+2] == -1)
         nAB++;
@@ -1992,7 +1992,7 @@ namespace CTF_int {
     TAU_FSTOP(get_num_map_vars);
     return nv;
   }
- 
+
   bool contraction::switch_topo_perm(){
     ASSERT(A->topo == B->topo && B->topo == C->topo);
     topology const * topo = A->topo;
@@ -2096,7 +2096,7 @@ namespace CTF_int {
   bool contraction::
       exh_map_to_topo(topology const * topo,
                       int              variant){
-  
+
     int num_tot;
     int * idx_arr;
     inv_idx(A->order, idx_A,
@@ -2144,7 +2144,7 @@ namespace CTF_int {
         v = v/choose(nAC,nctr_2d);
         get_choice(nBC,nctr_2d,v%choose(nBC,nctr_2d),iBC);
         v = v/choose(nBC,nctr_2d);
-       
+
         for (int i=0; i<nctr_2d; i++){
          // printf("iAB[%d] = %d iAC[%d] = %d iBC[%d] = %d ord[%d] = %d\n", i, iAB[i], i, iAC[i], i, iBC[i], i, ord[i]);
           int iiAB=0;
@@ -2251,8 +2251,8 @@ namespace CTF_int {
         else if (jC != -1) C->edge_map[jC].aug_phys(topo, idim);
       }
     }*/
-   
-   
+
+
     //A->order*B->order*C->order+A->order*B->order+A->order*C->order+B->order*C->order+A->order+B->order+C->order+1;
 /*    int nv = variant;
     for (int idim=0; idim<topo->order; idim++){
@@ -2355,7 +2355,7 @@ namespace CTF_int {
     tensor * tA, * tB, * tC;
     get_perm<tensor*>(order, A, B, C, tA, tB, tC);
     get_perm<const int*>(order, idx_A, idx_B, idx_C, tidx_A, tidx_B, tidx_C);
-  
+
     inv_idx(tA->order, tidx_A,
             tB->order, tidx_B,
             tC->order, tidx_C,
@@ -2373,7 +2373,7 @@ namespace CTF_int {
       } else if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1){
         idx_ctr[num_ctr] = i;
         num_ctr++;
-      } else if (idx_arr[3*i+2] != -1 && 
+      } else if (idx_arr[3*i+2] != -1 &&
                   ((idx_arr[3*i+0] != -1) || (idx_arr[3*i+1] != -1))){
         idx_no_ctr[num_no_ctr] = i;
         num_no_ctr++;
@@ -2385,7 +2385,7 @@ namespace CTF_int {
     tA->topo = topo;
     tB->topo = topo;
     tC->topo = topo;
-   
+
     /* Map the weigh indices of A, B, and C*/
 
 
@@ -2401,7 +2401,7 @@ namespace CTF_int {
         break;
       }
 
-     
+
       /* Map the contraction indices of A and B */
       ret = map_ctr_indices(idx_arr, idx_ctr, num_tot, num_ctr, topo, tA, tB);
       if (ret == NEGATIVE) {
@@ -2497,7 +2497,7 @@ namespace CTF_int {
       if (ret!=SUCCESS) return ret;
       ret = map_symtsr(tC->order, tC->sym_table, tC->edge_map);
       if (ret!=SUCCESS) return ret;
-     
+
 
       stat = SUCCESS;
     } while(0);
@@ -2510,7 +2510,7 @@ namespace CTF_int {
     topology * tA, * tB, * tC;
     int ret;
     tensor * tsr_keep, * tsr_change_A, * tsr_change_B;
-   
+
     tA = A->topo;
     tB = B->topo;
     tC = C->topo;
@@ -2540,7 +2540,7 @@ namespace CTF_int {
         tsr_change_B = B;
       }
     }
-   
+
     tA = tsr_change_A->topo;
     tB = tsr_change_B->topo;
     tC = tsr_keep->topo;
@@ -2555,7 +2555,7 @@ namespace CTF_int {
       if (!ret)
         return NEGATIVE;
     }
-   
+
     if (tA != tC){
       morph_topo(tC, tA,
                  tsr_change_A->order, tsr_change_A->edge_map);
@@ -2599,7 +2599,7 @@ namespace CTF_int {
     if (can_fold()){
       est_time = est_time_fold();
       iparam prm = map_fold(false);
-    
+
       sctr = construct_ctr(1, &prm);
       if (this->is_sparse())
         est_time = ((spctr*)sctr)->est_time_rec(sctr->num_lyr, A->calc_nvirt(), B->calc_nvirt(), C->calc_nvirt(), nnz_frac_A, nnz_frac_B, nnz_frac_C);
@@ -2657,7 +2657,7 @@ namespace CTF_int {
         mem_fold += mem_fold_C;
         mem_fold_tmp = std::max(mem_fold_tmp, mem_fold);
         mem_fold_tmp = std::max(mem_fold_tmp, mem_fold_C + mem_fold_tmp_C + (int64_t)(nnz_frac_C*C->size*C->sr->pair_size()));
-        //printf("mem_fold_C is %E mem_fold is %E mem_fold_tmp_C is %E\n",(double)mem_fold_C,(double)mem_fold, (double)(mem_fold_C + mem_fold_tmp_C + (int64_t)(nnz_frac_C*C->size*C->sr->pair_size()))); 
+        //printf("mem_fold_C is %E mem_fold is %E mem_fold_tmp_C is %E\n",(double)mem_fold_C,(double)mem_fold, (double)(mem_fold_C + mem_fold_tmp_C + (int64_t)(nnz_frac_C*C->size*C->sr->pair_size())));
       } else {
         mem_fold += C->size*C->sr->el_size;
       }
@@ -2763,7 +2763,7 @@ namespace CTF_int {
         A->set_padding();
         B->set_padding();
         C->set_padding();
-     
+
         topology * topo_i = NULL;
         if (t < 8){
           if ((t & 1) > 0){
@@ -2790,21 +2790,21 @@ namespace CTF_int {
           }
         } else topo_i = wrld->topovec[t-8];
         ASSERT(topo_i != NULL);
-     
+
         ret = map_to_topology(topo_i, j);
 
         if (ret == NEGATIVE){
           //printf("map_to_topology returned negative\n");
           continue;
         }
-   
+
         A->is_mapped = 1;
         B->is_mapped = 1;
         C->is_mapped = 1;
         A->topo = topo_i;
         B->topo = topo_i;
         C->topo = topo_i;
-       
+
         if (check_mapping() == 0){
           continue;
         }
@@ -2846,7 +2846,7 @@ namespace CTF_int {
           //bmemuse = memuse;
           DPRINTF(1,"[SEL] Found new best contraction memuse = %E, est_time = %E\n",(double)memuse,best_time);
           btopo = 6*t+j;
-        } 
+        }
       }
     }
     TAU_FSTOP(evaluate_mappings)
@@ -2907,7 +2907,7 @@ namespace CTF_int {
         A->topo = topo_i;
         B->topo = topo_i;
         C->topo = topo_i;
-       
+
         br = switch_topo_perm();
         if (!br){ DPRINTF(3,"switch topo perm returned false\n"); }
         if (!br) continue;
@@ -2915,7 +2915,7 @@ namespace CTF_int {
           continue;
         }
         valid_mappings++;
-       
+
         A->set_padding();
         B->set_padding();
         C->set_padding();
@@ -2953,7 +2953,7 @@ namespace CTF_int {
           //bmemuse = memuse;
           btopo = old_off+j;
           DPRINTF(1,"[EXH] Found new best contraction i %d btopo %ld old_off %ld j %d memuse = %E, est_time = %E\n",i,btopo,old_off,j,(double)memuse,best_time);
-        } 
+        }
       }
     }
 #if DEBUG >= 2
@@ -2973,7 +2973,7 @@ namespace CTF_int {
     time=gbest_time;
   }
 
-  int contraction::map(ctr ** ctrf, bool do_remap){
+  int contraction::map(ctr ** ctrf, double &gbest_time_sel, bool do_remap){
     int ret, j, need_remap, d;
     int * old_phase_A, * old_phase_B, * old_phase_C;
     topology * old_topo_A, * old_topo_B, * old_topo_C;
@@ -2991,14 +2991,14 @@ namespace CTF_int {
     ASSERT(A->wrld->comm == B->wrld->comm && B->wrld->comm == C->wrld->comm);
     World * wrld = A->wrld;
     CommData global_comm = wrld->cdt;
-   
+
 //    TAU_FSTART(init_select_ctr_map);
   #if BEST_VOL
     CTF_int::alloc_ptr(sizeof(int64_t)*A->order, (void**)&virt_blk_len_A);
     CTF_int::alloc_ptr(sizeof(int64_t)*B->order, (void**)&virt_blk_len_B);
     CTF_int::alloc_ptr(sizeof(int64_t)*C->order, (void**)&virt_blk_len_C);
   #endif
-   
+
     ASSERT(A->is_mapped);
     ASSERT(B->is_mapped);
     ASSERT(C->is_mapped);
@@ -3012,7 +3012,7 @@ namespace CTF_int {
     #endif
     }
 
-    // must calculate nnz_frac in initial layout 
+    // must calculate nnz_frac in initial layout
     double nnz_frac_A, nnz_frac_B, nnz_frac_C;
     this->calc_nnz_frac(nnz_frac_A, nnz_frac_B, nnz_frac_C);
   #if VERBOSE >= 1
@@ -3058,14 +3058,14 @@ namespace CTF_int {
     TAU_FSTOP(ctr_sig_map_find);
     topology * topo_g = NULL;
     int j_g;
-    int64_t ttopo; 
+    int64_t ttopo;
     bool is_exh;
     if (search_sig != ctr_sig_map.end()){
       ttopo = search_sig->second.ttopo;
       is_exh = search_sig->second.is_exh;
     } else {
       int64_t ttopo_sel, ttopo_exh;
-      double gbest_time_sel, gbest_time_exh;
+      double gbest_time_exh;
       TAU_FSTART(get_best_sel_map);
       get_best_sel_map(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, ttopo_sel, gbest_time_sel);
       TAU_FSTOP(get_best_sel_map);
@@ -3227,7 +3227,7 @@ namespace CTF_int {
 
     MPI_Barrier(global_comm.cm);
 #endif
-    
+
 
     if (A->is_cyclic == 0 &&
         B->is_cyclic == 0 &&
@@ -3272,18 +3272,18 @@ namespace CTF_int {
       need_remap = 1;
     if (need_remap)
       C->redistribute(*dC);
-                  
+
     TAU_FSTOP(redistribute_for_contraction);
-   
+
     CTF_int::cdealloc( old_phase_A );
     CTF_int::cdealloc( old_phase_B );
     CTF_int::cdealloc( old_phase_C );
-   
+
     delete [] old_map_A;
     delete [] old_map_B;
     delete [] old_map_C;
 
-   
+
     delete dA;
     delete dB;
     delete dC;
@@ -3575,7 +3575,7 @@ namespace CTF_int {
       } else if (sC && i_C != -1){
         nvirt = virt_dim[i]/str_C->strip_dim[i_C];
       }*/
-     
+
       nvirt = nvirt * virt_dim[i];
     }
     if (nvirt_all != NULL)
@@ -3746,7 +3746,7 @@ namespace CTF_int {
         }
         rec_ctr = &skctr->rec_ctr;
       }
- 
+
       if (B->is_sparse && B->wrld->np > 1){
         spctr_pin_keys * skctr = new spctr_pin_keys(this, 1);
         if (is_top){
@@ -3757,7 +3757,7 @@ namespace CTF_int {
         }
         rec_ctr = &skctr->rec_ctr;
       }
- 
+
       if (C->is_sparse && C->wrld->np > 1){
         spctr_pin_keys * skctr = new spctr_pin_keys(this, 2);
         if (is_top){
@@ -4007,7 +4007,7 @@ namespace CTF_int {
       } else if (sC && i_C != -1){
         nvirt = virt_dim[i]/str_C->strip_dim[i_C];
       }*/
-     
+
       nvirt = nvirt * virt_dim[i];
     }
 
@@ -4113,7 +4113,7 @@ namespace CTF_int {
     int * phys_mapped;
 
     int nphys_dim = A->topo->order;
- 
+
     CTF_int::alloc_ptr(sizeof(int)*nphys_dim*3, (void**)&phys_mapped);
     memset(phys_mapped, 0, sizeof(int)*nphys_dim*3);
 
@@ -4267,7 +4267,7 @@ namespace CTF_int {
         CBA.contract();
         return SUCCESS;
       }
-     
+
     }*/
 
 
@@ -4311,7 +4311,7 @@ namespace CTF_int {
       fftsr.is_offloadable = 0;
   #endif
     }*/
-
+  double time_est;
   #ifdef PROFILE
     TAU_FSTART(pre_map_barrier);
     MPI_Barrier(global_comm.cm);
@@ -4323,7 +4323,8 @@ namespace CTF_int {
   #endif
   #if REDIST
     //stat = map_tensors(type, fftsr, felm, alpha, beta, &ctrf);
-    stat = map(&ctrf);
+
+    stat = map(&ctrf, time_est);
     if (stat == ERROR) {
       printf("Failed to map tensors to physical grid\n");
       return ERROR;
@@ -4345,7 +4346,7 @@ namespace CTF_int {
       }
   #endif
     }
-    stat = map(&ctrf);
+    stat = map(&ctrf, time_est);
     if (stat == ERROR) {
       printf("Failed to map tensors to physical grid\n");
       return ERROR;
@@ -4499,8 +4500,10 @@ namespace CTF_int {
         if (size_blk_B != NULL) cdealloc(size_blk_B);
         if (size_blk_C != NULL) cdealloc(size_blk_C);
       }
-    } else
+    } else{
+      if (global_comm.rank == 0) ctrf->print();
       ctrf->run(A->data, B->data, C->data);
+    }
   #ifdef PROFILE_MEMORY
     if (C->wrld->rank == 0){
       printf("Finished contraction  computation\n");
@@ -4643,12 +4646,12 @@ namespace CTF_int {
     char const * dbeta;
     ctr * ctrf;
     tensor * tnsr_A, * tnsr_B, * tnsr_C;
- 
+
     bool is_cons = this->check_consistency();
     if (!is_cons) return ERROR;
- 
+
     CommData global_comm = A->wrld->cdt;
- 
+
     A->unfold();
     B->unfold();
     C->unfold();
@@ -4716,7 +4719,7 @@ namespace CTF_int {
     tnsr_A = A;
     tnsr_B = B;
     tnsr_C = C;
-   
+
     tensor * new_tsr;
     while (tnsr_A->extract_diag(map_A, 1, new_tsr, &new_idx) == SUCCESS){
       if (tnsr_A != A) delete tnsr_A;
@@ -4800,7 +4803,7 @@ namespace CTF_int {
       if (ocfact != 1){
         if (ocfact != 1){
           tnsr_C->sr->safecopy(oc_align_alpha, tnsr_C->sr->addid());
-         
+
           for (int i=0; i<ocfact; i++){
             tnsr_C->sr->add(oc_align_alpha, align_alpha, oc_align_alpha);
           }
@@ -4818,7 +4821,8 @@ namespace CTF_int {
 
         contraction * unfold_ctr;
         new_ctr.unfold_broken_sym(&unfold_ctr);
-        if (unfold_ctr->map(&ctrf, 0) == SUCCESS){
+        double time_est;
+        if (unfold_ctr->map(&ctrf, time_est, 0) == SUCCESS){
 /*  #else
         int sy = 0;
         for (i=0; i<A->order; i++){
@@ -4938,9 +4942,9 @@ namespace CTF_int {
       s.execute();
       delete C_buf;
       return SUCCESS;
-     
+
     }
-   
+
     if (A->has_zero_edge_len ||
         B->has_zero_edge_len ||
         C->has_zero_edge_len){
@@ -5165,7 +5169,7 @@ namespace CTF_int {
           nc->set_output_nnz_frac(this->output_nnz_frac);
           nc->idx_A[iA] = num_tot;
         }
-        nc->execute(); 
+        nc->execute();
         delete nc;
         delete X2;
         cdealloc(symX);
@@ -5198,7 +5202,7 @@ namespace CTF_int {
         CTF_int::alloc_ptr(new_ctr.A->calc_nvirt()*sizeof(int64_t), (void**)&new_ctr.A->nnz_blk);
         new_ctr.A->set_new_nnz_glb(A->nnz_blk);
       }
-    }    
+    }
     if (was_home_B){
       if (A == B){ //stype->tid_A == stype->tid_B){
         new_ctr.B = new_ctr.A; //tensors[ntype.tid_B];
@@ -5369,7 +5373,7 @@ namespace CTF_int {
             ) )
           npres++;
       }
-     
+
       if (npres > 1){
         cdealloc(idx_arr);
         return true;
@@ -5448,7 +5452,7 @@ namespace CTF_int {
             ) )
           npres++;
       }
-     
+
       if (npres > 1){
         int sym_mask[T->order];
         std::fill(sym_mask, sym_mask+T->order, 0);
@@ -5456,7 +5460,7 @@ namespace CTF_int {
         /*for (int k=0; k<T->order; k++){
           printf("sym_mask[%d]=%d\n",k,sym_mask[k]);
         }*/
-       
+
         if (T->is_home){
           if (T->wrld->cdt.rank == 0)
             DPRINTF(2,"Tensor %s leaving home\n", T->name);
@@ -5744,7 +5748,7 @@ namespace CTF_int {
     delete [] edge_map_B;
     delete [] edge_map_C;
   }
-    
+
   bool contraction_signature::operator<(contraction_signature const & other) const{
     if (order_A > other.order_A) return true;
     if (order_A < other.order_A) return false;
@@ -5754,39 +5758,39 @@ namespace CTF_int {
     if (order_C < other.order_C) return false;
     for (int i=0; i<order_A; i++){
       if (lens_A[i] > other.lens_A[i]) return true;
-      if (lens_A[i] < other.lens_A[i]) return false; 
+      if (lens_A[i] < other.lens_A[i]) return false;
     }
     for (int i=0; i<order_B; i++){
       if (lens_B[i] > other.lens_B[i]) return true;
-      if (lens_B[i] < other.lens_B[i]) return false; 
+      if (lens_B[i] < other.lens_B[i]) return false;
     }
     for (int i=0; i<order_C; i++){
       if (lens_C[i] > other.lens_C[i]) return true;
-      if (lens_C[i] < other.lens_C[i]) return false; 
+      if (lens_C[i] < other.lens_C[i]) return false;
     }
     for (int i=0; i<order_A; i++){
       if (idx_A[i] > other.idx_A[i]) return true;
-      if (idx_A[i] < other.idx_A[i]) return false; 
+      if (idx_A[i] < other.idx_A[i]) return false;
     }
     for (int i=0; i<order_B; i++){
       if (idx_B[i] > other.idx_B[i]) return true;
-      if (idx_B[i] < other.idx_B[i]) return false; 
+      if (idx_B[i] < other.idx_B[i]) return false;
     }
     for (int i=0; i<order_C; i++){
       if (idx_C[i] > other.idx_C[i]) return true;
-      if (idx_C[i] < other.idx_C[i]) return false; 
+      if (idx_C[i] < other.idx_C[i]) return false;
     }
     for (int i=0; i<order_A; i++){
       if (sym_A[i] > other.sym_A[i]) return true;
-      if (sym_A[i] < other.sym_A[i]) return false; 
+      if (sym_A[i] < other.sym_A[i]) return false;
     }
     for (int i=0; i<order_B; i++){
       if (sym_B[i] > other.sym_B[i]) return true;
-      if (sym_B[i] < other.sym_B[i]) return false; 
+      if (sym_B[i] < other.sym_B[i]) return false;
     }
     for (int i=0; i<order_C; i++){
       if (sym_C[i] > other.sym_C[i]) return true;
-      if (sym_C[i] < other.sym_C[i]) return false; 
+      if (sym_C[i] < other.sym_C[i]) return false;
     }
     if (is_sparse_A > other.is_sparse_A) return true;
     if (is_sparse_A < other.is_sparse_A) return false;
@@ -5808,15 +5812,15 @@ namespace CTF_int {
     if (topo_C->order < other.topo_C->order) return false;
     for (int i=0; i<topo_A->order; i++){
       if (topo_A->lens[i] > other.topo_A->lens[i]) return true;
-      if (topo_A->lens[i] < other.topo_A->lens[i]) return false; 
+      if (topo_A->lens[i] < other.topo_A->lens[i]) return false;
     }
     for (int i=0; i<topo_B->order; i++){
       if (topo_B->lens[i] > other.topo_B->lens[i]) return true;
-      if (topo_B->lens[i] < other.topo_B->lens[i]) return false; 
+      if (topo_B->lens[i] < other.topo_B->lens[i]) return false;
     }
     for (int i=0; i<topo_C->order; i++){
       if (topo_C->lens[i] > other.topo_C->lens[i]) return true;
-      if (topo_C->lens[i] < other.topo_C->lens[i]) return false; 
+      if (topo_C->lens[i] < other.topo_C->lens[i]) return false;
     }
     for (int i=0; i<order_A; i++){
       if (rank_dim_map(edge_map_A+i,other.edge_map_A+i) == 1) return true;
diff --git a/src/contraction/contraction.h b/src/contraction/contraction.h
index 90ca2211..a9fe3640 100644
--- a/src/contraction/contraction.h
+++ b/src/contraction/contraction.h
@@ -304,7 +304,7 @@ namespace CTF_int {
        * \param[in] do_remap whether to redistribute tensors
        * \return SUCCESS if valid mapping found, ERROR if not enough memory or another issue
        */
-      int map(ctr ** ctrf, bool do_remap=1);
+      int map(ctr ** ctrf, double &time_estimate, bool do_remap=1);
  
       /**
         * \brief contracts tensors alpha*A*B+beta*C -> C.
diff --git a/src/contraction/ctr_tsr.cxx b/src/contraction/ctr_tsr.cxx
index 45cc994a..d548b752 100755
--- a/src/contraction/ctr_tsr.cxx
+++ b/src/contraction/ctr_tsr.cxx
@@ -347,8 +347,8 @@ namespace CTF_int {
       printf("edge_len_C[%d]=%ld\n",i,edge_len_C[i]);
     }
     printf("is inner = %d\n", is_inner);
-    if (is_inner) printf("inner n = %ld m= %ld k = %ld l = %ld\n",
-                          inner_params.n, inner_params.m, inner_params.k, inner_params.l);
+    if (is_inner) printf("inner n = %ld m= %ld k = %ld l = %ld, ta = %c, tb =%c, tc = %c\n",
+                          inner_params.n, inner_params.m, inner_params.k, inner_params.l, inner_params.tA, inner_params.tB, inner_params.tC);
   }
 
   seq_tsr_ctr::seq_tsr_ctr(ctr * other) : ctr(other) {
@@ -436,7 +436,13 @@ namespace CTF_int {
   double seq_tsr_ctr::est_time_fp(int nlyr){
     //return COST_MEMBW*(size_A+size_B+size_C)+COST_FLOP*flops;
     double ps[] = {1.0, (double)est_membw(), est_fp()};
-//    printf("time estimate is %lf\n", seq_tsr_ctr_mdl.est_time(ps));
+    // incorporating the fact that dgemm with small k-edges is less efficient
+    double k(inner_params.k);
+    double fac;
+    fac = std::max(1.0, 70/(k*0.3+5.0));
+
+//    printf("%d %d %d:time estimate is %lf\n",
+//           inner_params.m, inner_params.n, inner_params.k,  seq_tsr_ctr_mdl_inr.est_time(ps)*fac);
     if (is_custom && !is_inner){
       return seq_tsr_ctr_mdl_cst.est_time(ps);
     } else if (is_inner){
@@ -449,7 +455,7 @@ namespace CTF_int {
         if (inner_params.offload)
           return seq_tsr_ctr_mdl_off.est_time(ps);
         else
-          return seq_tsr_ctr_mdl_inr.est_time(ps);
+          return seq_tsr_ctr_mdl_inr.est_time(ps)*fac;
       }
     } else
       return seq_tsr_ctr_mdl_ref.est_time(ps);
diff --git a/src/shared/init_models.cxx b/src/shared/init_models.cxx
index cb4cbbd7..94e1c5e3 100644
--- a/src/shared/init_models.cxx
+++ b/src/shared/init_models.cxx
@@ -3,19 +3,19 @@ double csrred_mdl_init[] = {3.0689E-03, 2.2385E-03, 4.4815E-07};
 double csrred_mdl_cst_init[] = {-1.8323E-04, 1.3076E-04, 2.8732E-09};
 double alltoall_mdl_init[] = {1.0000E-06, 1.0000E-06, 5.0000E-10};
 double alltoallv_mdl_init[] = {7.3164E-23, 1.0404E-04, 2.5827E-07};
-double red_mdl_init[] = {1.7255E-12, 1.2558E-11, 3.7127E-10};
+double red_mdl_init[] = {4.5530E-11, 3.0466E-17, 5.0877E-10};
 double red_mdl_cst_init[] = {1.2881E-04, 1.4093E-16, 8.3976E-10};
 double allred_mdl_init[] = {4.7939E-14, 7.4715E-13, 2.0949E-06};
 double allred_mdl_cst_init[] = {-3.3754E-04, 2.1343E-04, 3.0801E-09};
-double bcast_mdl_init[] = {1.1722E-82, 3.0112E-05, 8.6197E-09};
+double bcast_mdl_init[] = {1.1115E-16, 1.0754E-01, 4.1995E-10};
 double seq_tsr_ctr_mdl_cst_init[] = {7.8076E-13, 6.9558E-08, 1.3923E-08};
 double seq_tsr_ctr_mdl_ref_init[] = {4.9138E-08, 5.8290E-10, 4.8575E-11};
-double seq_tsr_ctr_mdl_inr_init[] = {1.0689E-05, 9.4660E-10, 2.1921E-10};
+double seq_tsr_ctr_mdl_inr_init[] = {6.0166E-21, 2.3443E-13, 2.0967E-11};
 double seq_tsr_ctr_mdl_off_init[] = {6.2925E-05, 1.7449E-11, 1.7211E-12};
-double seq_tsr_ctr_mdl_cst_inr_init[] = {1.3863E-04, 2.0119E-10, 9.8820E-09};
+double seq_tsr_ctr_mdl_cst_inr_init[] = {0.0, 0.0, 1.6E-11};
 double seq_tsr_ctr_mdl_cst_off_init[] = {8.4844E-04, 5.9246E-11, 3.5247E-10};
-double long_contig_transp_mdl_init[] = {1.5117E-04, 1.9091E-09};
-double shrt_contig_transp_mdl_init[] = {7.7643E-05, 6.4347E-12};
+double long_contig_transp_mdl_init[] = {0.0, 1.25E-08};
+double shrt_contig_transp_mdl_init[] = {0.0, 1.25E-08};
 double non_contig_transp_mdl_init[] = {2.6680E-05, 4.6247E-06};
 double seq_tsr_spctr_cst_off_k0_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
 double seq_tsr_spctr_cst_off_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
@@ -37,5 +37,5 @@ double seq_tsr_spctr_k4_init[] = {2.0404E-04, 8.2989E-09, 6.0431E-11};
 double seq_tsr_spctr_k5_init[] = {6.9073E-15, 4.0130E-09, 2.2669E-13};
 double pin_keys_mdl_init[] = {4.0261E-05, 7.2443E-07};
 double spredist_mdl_init[] = {6.8713E-23, 7.8867E-04, 6.9422E-11};
-double dgtog_res_mdl_init[] = {4.3225E-22, 6.3127E-03, 3.6107E-07};
-double blres_mdl_init[] = {1.1782E-05, 6.7690E-10};}
+double dgtog_res_mdl_init[] = {0.0, 0.0, 5.0E-10};
+double blres_mdl_init[] = {0.0, 3E-8};}
diff --git a/src/tensor/untyped_tensor.cxx b/src/tensor/untyped_tensor.cxx
index bced3ac2..f3ae76ba 100644
--- a/src/tensor/untyped_tensor.cxx
+++ b/src/tensor/untyped_tensor.cxx
@@ -19,6 +19,7 @@
 
 using namespace CTF;
 
+
 namespace CTF_int {
 
   LinModel<3> spredist_mdl(spredist_mdl_init,"spredist_mdl");
@@ -686,6 +687,7 @@ namespace CTF_int {
         memset(this->nnz_blk, 0, sizeof(int64_t)*calc_nvirt());
         this->set_new_nnz_glb(this->nnz_blk);
       } else {
+        if (!is_dry)
         sr->set(this->data, sr->addid(), this->size);
       }
     } else {
@@ -726,15 +728,19 @@ namespace CTF_int {
           //this->has_home = 0;
     /*      if (wrld->rank == 0)
             DPRINTF(3,"Initial size of tensor %d is " PRId64 ",",tensor_id,this->size);*/
-          this->home_buffer = sr->alloc(this->home_size);
-          if (wrld->rank == 0) DPRINTF(2,"Creating home of %s\n",name);
-          register_size(this->size*sr->el_size);
-          this->data = this->home_buffer;
+          if (!is_dry) {
+            this->home_buffer = sr->alloc(this->home_size);
+            if (wrld->rank == 0) DPRINTF(2,"Creating home of %s\n",name);
+            register_size(this->size*sr->el_size);
+            this->data = this->home_buffer;
+          }
         } else {
-          this->data = sr->alloc(this->size);
+          if (!is_dry)
+            this->data = sr->alloc(this->size);
         }
         #else
-        this->data = sr->alloc(this->size);
+        if (!is_dry)
+          this->data = sr->alloc(this->size);
         //CTF_int::alloc_ptr(this->size*sr->el_size, (void**)&this->data);
         #endif
         #if DEBUG >= 1
@@ -743,7 +749,8 @@ namespace CTF_int {
         this->print_lens();
         this->print_map(stdout);
         #endif
-        sr->init(this->size, this->data);
+        if (!is_dry)
+          sr->init(this->size, this->data);
       }
     }
     TAU_FSTOP(set_zero_tsr);
@@ -4125,5 +4132,9 @@ namespace CTF_int {
     return subtsrs;
   }
 
+  bool tensor::is_dry = false;
+  void tensor::set_dry_run(){
+    is_dry = true;
+  }
 }
 
diff --git a/src/tensor/untyped_tensor.h b/src/tensor/untyped_tensor.h
index 6a24b6a5..cc4f3c85 100644
--- a/src/tensor/untyped_tensor.h
+++ b/src/tensor/untyped_tensor.h
@@ -139,6 +139,8 @@ namespace CTF_int {
       int64_t nnz_tot;
       /** \brief nonzero elements in each block owned locally */
       int64_t * nnz_blk;
+      /** \brief dry run does not allocate any tensor data */
+      static bool is_dry;
 
       /**
        * \brief associated an index map with the tensor for future operation
@@ -372,7 +374,7 @@ namespace CTF_int {
        * \param[out] size number of elements in data
        */
       void get_raw_data(char ** data, int64_t * size) const;
-      
+
       /**
        * \brief query mapping to processor grid and intra-processor blocking, which may be used to define a tensor with the same initial distribution
        * \param[out] idx array of this->order chars describing this processor modes mapping on processor grid dimensions tarting from 'a'
@@ -1063,6 +1065,8 @@ namespace CTF_int {
        * \return tensor with same data point as this one but no edge lengths of size 1
        */
       tensor * get_no_unit_len_alias();
+
+      static void set_dry_run();
   };
 }
 #endif// __UNTYPED_TENSOR_H__

From bf6ecd3708b9cca48cb14f2495fb0ab64372643c Mon Sep 17 00:00:00 2001
From: Andreas Irmler <andreas.irmler@tuwien.ac.at>
Date: Wed, 24 Feb 2021 08:41:26 +0100
Subject: [PATCH 03/19] comment out one print statement

---
 src/contraction/contraction.cxx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/contraction/contraction.cxx b/src/contraction/contraction.cxx
index 8421fae9..5261fbb4 100644
--- a/src/contraction/contraction.cxx
+++ b/src/contraction/contraction.cxx
@@ -4501,7 +4501,7 @@ namespace CTF_int {
         if (size_blk_C != NULL) cdealloc(size_blk_C);
       }
     } else{
-      if (global_comm.rank == 0) ctrf->print();
+//      if (global_comm.rank == 0) ctrf->print();
       ctrf->run(A->data, B->data, C->data);
     }
   #ifdef PROFILE_MEMORY

From e40c5f341738a2e3c6b927526664a22023ced30b Mon Sep 17 00:00:00 2001
From: Andreas Irmler <andreas.irmler@tuwien.ac.at>
Date: Wed, 12 May 2021 11:16:35 +0200
Subject: [PATCH 04/19] undo changes related to time estimates

---
 src/contraction/contraction.cxx | 232 ++++++++++++++++----------------
 src/contraction/contraction.h   |   2 +-
 2 files changed, 115 insertions(+), 119 deletions(-)

diff --git a/src/contraction/contraction.cxx b/src/contraction/contraction.cxx
index 5261fbb4..f5dd14dc 100644
--- a/src/contraction/contraction.cxx
+++ b/src/contraction/contraction.cxx
@@ -70,7 +70,7 @@ namespace CTF_int {
     func = func_;
     alpha = alpha_;
     beta  = beta_;
-
+   
     idx_A = (int*)alloc(sizeof(int)*A->order);
     idx_B = (int*)alloc(sizeof(int)*B->order);
     idx_C = (int*)alloc(sizeof(int)*C->order);
@@ -96,7 +96,7 @@ namespace CTF_int {
     func = func_;
     alpha = alpha_;
     beta  = beta_;
-
+   
     conv_idx(A->order, cidx_A, &idx_A, B->order, cidx_B, &idx_B, C->order, cidx_C, &idx_C);
   }
 
@@ -111,7 +111,7 @@ namespace CTF_int {
     //if (A->wrld->cdt.cm == MPI_COMM_WORLD){
 //      update_all_models(A->wrld->cdt.cm);
     //}
-
+   
     int stat = home_contract();
     if (stat != SUCCESS){
       printf("CTF ERROR: Failed to perform contraction\n");
@@ -120,7 +120,7 @@ namespace CTF_int {
 #endif
     }
   }
-
+ 
   template<typename ptype>
   void get_perm(int     perm_order,
                 ptype   A,
@@ -166,7 +166,7 @@ namespace CTF_int {
         break;
     }
   }
-
+  
   void contraction::set_output_nnz_frac(double nnz_frac){
     //assert(nnz_frac >= 0. && nnz_frac <= 1.);
     this->output_nnz_frac = nnz_frac;
@@ -273,17 +273,17 @@ namespace CTF_int {
   }
 
   double contraction::estimate_time(){
-    ctr * ctrf;
-    double est_time;
-    int stat = map(&ctrf, est_time, false);
-    return est_time;
+    int np = std::max(A->wrld->np,B->wrld->np);
+    double flop_rate = 1.E9*np;
+    double bw_rate = 1.E5*np;
+    return this->estimate_num_flops()/flop_rate + this->estimate_bw()/bw_rate;
   }
 
   int contraction::is_equal(contraction const & os){
     if (this->A != os.A) return 0;
     if (this->B != os.B) return 0;
     if (this->C != os.C) return 0;
-
+   
     for (int i=0; i<A->order; i++){
       if (idx_A[i] != os.idx_A[i]) return 0;
     }
@@ -323,7 +323,7 @@ namespace CTF_int {
                      iparam *       inner_prm){
     int i, num_tot, num_ctr, num_no_ctr_A, num_no_ctr_B, num_weigh;
     int * idx_arr;
-
+     
     inv_idx(A->order, idx_A,
             B->order, idx_B,
             C->order, idx_C,
@@ -407,7 +407,7 @@ namespace CTF_int {
               A->sym[inA] != C->sym[inC]){
             broken = 1;
           }
-        } else {
+        } else { 
           if (((inA>=0) + (inB>=0) + (inC>=0) != 3) ||
               ((inB == -1) ^ (iB == -1)) ||
               ((inC == -1) ^ (iC == -1)) ||
@@ -429,7 +429,7 @@ namespace CTF_int {
         }
       }
     }
-
+   
     for (iC=0; iC<C->order; iC++){
       i = idx_C[iC];
       iA = idx_arr[3*i+0];
@@ -457,7 +457,7 @@ namespace CTF_int {
         }
       }
     }
-
+   
     for (iB=0; iB<B->order; iB++){
       i = idx_B[iB];
       iC = idx_arr[3*i+2];
@@ -577,7 +577,7 @@ namespace CTF_int {
     int idx_self_C, idx_self_A, idx_self_B;
     int num_self_C, num_self_A, num_self_B;
     int * ordering_A, * ordering_B, * ordering_C, * idx_arr;
-
+   
     CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&ordering_A);
     CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&ordering_B);
     CTF_int::alloc_ptr(sizeof(int)*C->order, (void**)&ordering_C);
@@ -645,13 +645,13 @@ namespace CTF_int {
     *new_ordering_A = ordering_A;
     *new_ordering_B = ordering_B;
     *new_ordering_C = ordering_C;
-
+   
     //iparam iprm;
     //calc_fold_nmk(A, B, C, idx_A, idx_B, idx_C, *new_ordering_A, *new_ordering_B, &iprm);
     //return iprm;
   }
 
-
+ 
   void contraction::get_fold_ctr(contraction *& fold_ctr,
                                  int &          all_fdim_A,
                                  int &          all_fdim_B,
@@ -786,7 +786,7 @@ namespace CTF_int {
       permute_target(tfA->order, tfnew_ord_A, tAiord);
       permute_target(tfB->order, tfnew_ord_B, tBiord);
       permute_target(tfC->order, tfnew_ord_C, tCiord);
-
+   
       double time_est = 0.0;
       if (tA->is_sparse)
         time_est += tA->nnz_tot/(((double)tA->size)*tA->calc_npe())*tA->calc_nvirt()*est_time_transp(tall_fdim_A, tAiord, tall_flen_A, 1, tA->sr);
@@ -940,7 +940,7 @@ namespace CTF_int {
             }
           }
         }
-
+    
         A->spmatricize(iprm.m, iprm.k, nrow_idx, all_fdim_A, all_flen_A, csr_or_coo, use_ccsr);
       }
       if (!B->is_sparse){
@@ -975,7 +975,7 @@ namespace CTF_int {
         C->spmatricize(iprm.m, iprm.n, nrow_idx, all_fdim_C, all_flen_C, csr_or_coo, use_ccsr);
         C->sr->dealloc(C->data);
       }
-
+   
     }
 
     CTF_int::cdealloc(fnew_ord_A);
@@ -1022,9 +1022,9 @@ namespace CTF_int {
     int i, num_tot, iA, iB, iC;
     int * idx_arr;
     tensor * nA, * nB, * nC;
-
+  
     contraction * nctr;
-
+   
     if (new_contraction != NULL){
       nA = new tensor(A, 0, 0);
       nB = new tensor(B, 0, 0);
@@ -1088,7 +1088,7 @@ namespace CTF_int {
             }
             CTF_int::cdealloc(idx_arr);
             return 3*i;
-          }
+          }      
         }
         if (idx_arr[3*iA+2] != -1){
           if (C->sym[idx_arr[3*iA+2]] != A->sym[i] ||
@@ -1108,12 +1108,12 @@ namespace CTF_int {
             }
             CTF_int::cdealloc(idx_arr);
             return 3*i;
-          }
+          }      
         }
       }
     }
 
-
+  
     int nB_sym[B->order];
     if (new_contraction != NULL)
       memcpy(nB_sym, nB->sym, sizeof(int)*nB->order);
@@ -1138,7 +1138,7 @@ namespace CTF_int {
             }
             CTF_int::cdealloc(idx_arr);
             return 3*i+1;
-          }
+          }      
         }
         if (idx_arr[3*iB+2] != -1){
           if (C->sym[idx_arr[3*iB+2]] != B->sym[i] ||
@@ -1158,7 +1158,7 @@ namespace CTF_int {
             }
             CTF_int::cdealloc(idx_arr);
             return 3*i+1;
-          }
+          }      
         }
       }
     }
@@ -1205,7 +1205,7 @@ namespace CTF_int {
             }
             CTF_int::cdealloc(idx_arr);
             return 3*i+2;
-          }
+          }      
           if (idx_arr[3*iC+0] != -1){
             if (A->sym[idx_arr[3*iC+0]] != C->sym[i] ||
                 idx_C[i+1] != idx_A[idx_arr[3*iC+0]+1]){
@@ -1224,7 +1224,7 @@ namespace CTF_int {
               }
               CTF_int::cdealloc(idx_arr);
               return 3*i+2;
-            }
+            }      
           }
         }
       }
@@ -1238,7 +1238,7 @@ namespace CTF_int {
     int64_t len;
     int iA, iB, iC;
     int * idx_arr;
-
+      
     inv_idx(A->order, idx_A,
             B->order, idx_B,
             C->order, idx_C,
@@ -1287,7 +1287,7 @@ namespace CTF_int {
     return true;
   }
 
-
+   
   int contraction::check_mapping(){
 
     int num_tot, i, ph_A, ph_B, iA, iB, iC, pass, order, topo_order;
@@ -1302,11 +1302,11 @@ namespace CTF_int {
     if (B->is_mapped == 0) pass = 0;
     if (C->is_mapped == 0) pass = 0;
     ASSERT(pass==1);
-
+   
     if (A->is_folded == 1) pass = 0;
     if (B->is_folded == 1) pass = 0;
     if (C->is_folded == 1) pass = 0;
-
+   
     if (pass==0){
       DPRINTF(3,"failed confirmation here\n");
       return 0;
@@ -1331,7 +1331,7 @@ namespace CTF_int {
             B->order, idx_B,
             C->order, idx_C,
             &num_tot, &idx_arr);
-
+   
     if (!check_self_mapping(A, idx_A))
       pass = 0;
     if (!check_self_mapping(B, idx_B))
@@ -1552,7 +1552,7 @@ namespace CTF_int {
 
     tsr_order = num_weigh;
 
-
+   
     for (i=0; i<num_weigh; i++){
       iweigh = idx_weigh[i];
       iA = idx_arr[iweigh*3+0];
@@ -1563,7 +1563,7 @@ namespace CTF_int {
           B->edge_map[iB].type == PHYSICAL_MAP ||
           C->edge_map[iC].type == PHYSICAL_MAP)
         return NEGATIVE;
-    }
+    } 
     CTF_int::alloc_ptr(tsr_order*sizeof(int),           (void**)&restricted);
     CTF_int::alloc_ptr(tsr_order*sizeof(int64_t),       (void**)&tsr_edge_len);
     CTF_int::alloc_ptr(tsr_order*tsr_order*sizeof(int), (void**)&tsr_sym_table);
@@ -1586,7 +1586,7 @@ namespace CTF_int {
       iB = idx_arr[iweigh*3+1];
       iC = idx_arr[iweigh*3+2];
 
-
+     
       weigh_map[i].np = lcm(weigh_map[i].np,A->edge_map[iA].np);
       weigh_map[i].np = lcm(weigh_map[i].np,B->edge_map[iB].np);
       weigh_map[i].np = lcm(weigh_map[i].np,C->edge_map[iC].np);
@@ -1636,7 +1636,7 @@ namespace CTF_int {
 
     if (stat == ERROR)
       return ERROR;
-
+   
     /* define mapping of tensors A and B according to the mapping of ctr dims */
     if (stat == SUCCESS){
       for (i=0; i<num_weigh; i++){
@@ -1719,7 +1719,7 @@ namespace CTF_int {
     extract_free_comms(topo, A->order, A->edge_map,
                              B->order, B->edge_map,
                        num_sub_phys_dims, &sub_phys_comm, &comm_idx);
-
+   
 
     /* Map a tensor of dimension 2*num_ctr, with symmetries among each pair.
      * Set the edge lengths and symmetries according to those in ctr dims of A and B.
@@ -1784,7 +1784,7 @@ namespace CTF_int {
     //}
     if (stat == ERROR)
       return ERROR;
-
+   
     /* define mapping of tensors A and B according to the mapping of ctr dims */
     if (stat == SUCCESS){
       for (i=0; i<num_ctr; i++){
@@ -1840,7 +1840,7 @@ namespace CTF_int {
       iB = idx_arr[3*inoctr+1];
       iC = idx_arr[3*inoctr+2];
 
-
+     
       if (iC != -1 && iA != -1){
         copy_mapping(1, C->edge_map + iC, A->edge_map + iA);
       }
@@ -1861,7 +1861,7 @@ namespace CTF_int {
       iB = idx_arr[3*inoctr+1];
       iC = idx_arr[3*inoctr+2];
 
-
+     
       if (iA != -1 && iC != -1){
         copy_mapping(1, A->edge_map + iA, C->edge_map + iC);
       }
@@ -1879,7 +1879,7 @@ namespace CTF_int {
       iB = idx_arr[3*inoctr+1];
       iC = idx_arr[3*inoctr+2];
 
-
+     
       if (iA != -1 && iC != -1){
         copy_mapping(1, C->edge_map + iC, A->edge_map + iA);
       }
@@ -1972,7 +1972,7 @@ namespace CTF_int {
     nAB=0;
     nAC=0;
     nBC=0;
-
+ 
     for (int i=0; i<num_tot; i++){
       if (idx_arr[3*i+0] != -1 && idx_arr[3*i+1] != -1 && idx_arr[3*i+2] == -1)
         nAB++;
@@ -1992,7 +1992,7 @@ namespace CTF_int {
     TAU_FSTOP(get_num_map_vars);
     return nv;
   }
-
+ 
   bool contraction::switch_topo_perm(){
     ASSERT(A->topo == B->topo && B->topo == C->topo);
     topology const * topo = A->topo;
@@ -2096,7 +2096,7 @@ namespace CTF_int {
   bool contraction::
       exh_map_to_topo(topology const * topo,
                       int              variant){
-
+  
     int num_tot;
     int * idx_arr;
     inv_idx(A->order, idx_A,
@@ -2144,7 +2144,7 @@ namespace CTF_int {
         v = v/choose(nAC,nctr_2d);
         get_choice(nBC,nctr_2d,v%choose(nBC,nctr_2d),iBC);
         v = v/choose(nBC,nctr_2d);
-
+       
         for (int i=0; i<nctr_2d; i++){
          // printf("iAB[%d] = %d iAC[%d] = %d iBC[%d] = %d ord[%d] = %d\n", i, iAB[i], i, iAC[i], i, iBC[i], i, ord[i]);
           int iiAB=0;
@@ -2251,8 +2251,8 @@ namespace CTF_int {
         else if (jC != -1) C->edge_map[jC].aug_phys(topo, idim);
       }
     }*/
-
-
+   
+   
     //A->order*B->order*C->order+A->order*B->order+A->order*C->order+B->order*C->order+A->order+B->order+C->order+1;
 /*    int nv = variant;
     for (int idim=0; idim<topo->order; idim++){
@@ -2355,7 +2355,7 @@ namespace CTF_int {
     tensor * tA, * tB, * tC;
     get_perm<tensor*>(order, A, B, C, tA, tB, tC);
     get_perm<const int*>(order, idx_A, idx_B, idx_C, tidx_A, tidx_B, tidx_C);
-
+  
     inv_idx(tA->order, tidx_A,
             tB->order, tidx_B,
             tC->order, tidx_C,
@@ -2373,7 +2373,7 @@ namespace CTF_int {
       } else if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1){
         idx_ctr[num_ctr] = i;
         num_ctr++;
-      } else if (idx_arr[3*i+2] != -1 &&
+      } else if (idx_arr[3*i+2] != -1 && 
                   ((idx_arr[3*i+0] != -1) || (idx_arr[3*i+1] != -1))){
         idx_no_ctr[num_no_ctr] = i;
         num_no_ctr++;
@@ -2385,7 +2385,7 @@ namespace CTF_int {
     tA->topo = topo;
     tB->topo = topo;
     tC->topo = topo;
-
+   
     /* Map the weigh indices of A, B, and C*/
 
 
@@ -2401,7 +2401,7 @@ namespace CTF_int {
         break;
       }
 
-
+     
       /* Map the contraction indices of A and B */
       ret = map_ctr_indices(idx_arr, idx_ctr, num_tot, num_ctr, topo, tA, tB);
       if (ret == NEGATIVE) {
@@ -2497,7 +2497,7 @@ namespace CTF_int {
       if (ret!=SUCCESS) return ret;
       ret = map_symtsr(tC->order, tC->sym_table, tC->edge_map);
       if (ret!=SUCCESS) return ret;
-
+     
 
       stat = SUCCESS;
     } while(0);
@@ -2510,7 +2510,7 @@ namespace CTF_int {
     topology * tA, * tB, * tC;
     int ret;
     tensor * tsr_keep, * tsr_change_A, * tsr_change_B;
-
+   
     tA = A->topo;
     tB = B->topo;
     tC = C->topo;
@@ -2540,7 +2540,7 @@ namespace CTF_int {
         tsr_change_B = B;
       }
     }
-
+   
     tA = tsr_change_A->topo;
     tB = tsr_change_B->topo;
     tC = tsr_keep->topo;
@@ -2555,7 +2555,7 @@ namespace CTF_int {
       if (!ret)
         return NEGATIVE;
     }
-
+   
     if (tA != tC){
       morph_topo(tC, tA,
                  tsr_change_A->order, tsr_change_A->edge_map);
@@ -2599,7 +2599,7 @@ namespace CTF_int {
     if (can_fold()){
       est_time = est_time_fold();
       iparam prm = map_fold(false);
-
+    
       sctr = construct_ctr(1, &prm);
       if (this->is_sparse())
         est_time = ((spctr*)sctr)->est_time_rec(sctr->num_lyr, A->calc_nvirt(), B->calc_nvirt(), C->calc_nvirt(), nnz_frac_A, nnz_frac_B, nnz_frac_C);
@@ -2657,7 +2657,7 @@ namespace CTF_int {
         mem_fold += mem_fold_C;
         mem_fold_tmp = std::max(mem_fold_tmp, mem_fold);
         mem_fold_tmp = std::max(mem_fold_tmp, mem_fold_C + mem_fold_tmp_C + (int64_t)(nnz_frac_C*C->size*C->sr->pair_size()));
-        //printf("mem_fold_C is %E mem_fold is %E mem_fold_tmp_C is %E\n",(double)mem_fold_C,(double)mem_fold, (double)(mem_fold_C + mem_fold_tmp_C + (int64_t)(nnz_frac_C*C->size*C->sr->pair_size())));
+        //printf("mem_fold_C is %E mem_fold is %E mem_fold_tmp_C is %E\n",(double)mem_fold_C,(double)mem_fold, (double)(mem_fold_C + mem_fold_tmp_C + (int64_t)(nnz_frac_C*C->size*C->sr->pair_size()))); 
       } else {
         mem_fold += C->size*C->sr->el_size;
       }
@@ -2763,7 +2763,7 @@ namespace CTF_int {
         A->set_padding();
         B->set_padding();
         C->set_padding();
-
+     
         topology * topo_i = NULL;
         if (t < 8){
           if ((t & 1) > 0){
@@ -2790,21 +2790,21 @@ namespace CTF_int {
           }
         } else topo_i = wrld->topovec[t-8];
         ASSERT(topo_i != NULL);
-
+     
         ret = map_to_topology(topo_i, j);
 
         if (ret == NEGATIVE){
           //printf("map_to_topology returned negative\n");
           continue;
         }
-
+   
         A->is_mapped = 1;
         B->is_mapped = 1;
         C->is_mapped = 1;
         A->topo = topo_i;
         B->topo = topo_i;
         C->topo = topo_i;
-
+       
         if (check_mapping() == 0){
           continue;
         }
@@ -2846,7 +2846,7 @@ namespace CTF_int {
           //bmemuse = memuse;
           DPRINTF(1,"[SEL] Found new best contraction memuse = %E, est_time = %E\n",(double)memuse,best_time);
           btopo = 6*t+j;
-        }
+        } 
       }
     }
     TAU_FSTOP(evaluate_mappings)
@@ -2907,7 +2907,7 @@ namespace CTF_int {
         A->topo = topo_i;
         B->topo = topo_i;
         C->topo = topo_i;
-
+       
         br = switch_topo_perm();
         if (!br){ DPRINTF(3,"switch topo perm returned false\n"); }
         if (!br) continue;
@@ -2915,7 +2915,7 @@ namespace CTF_int {
           continue;
         }
         valid_mappings++;
-
+       
         A->set_padding();
         B->set_padding();
         C->set_padding();
@@ -2953,7 +2953,7 @@ namespace CTF_int {
           //bmemuse = memuse;
           btopo = old_off+j;
           DPRINTF(1,"[EXH] Found new best contraction i %d btopo %ld old_off %ld j %d memuse = %E, est_time = %E\n",i,btopo,old_off,j,(double)memuse,best_time);
-        }
+        } 
       }
     }
 #if DEBUG >= 2
@@ -2973,7 +2973,7 @@ namespace CTF_int {
     time=gbest_time;
   }
 
-  int contraction::map(ctr ** ctrf, double &gbest_time_sel, bool do_remap){
+  int contraction::map(ctr ** ctrf, bool do_remap){
     int ret, j, need_remap, d;
     int * old_phase_A, * old_phase_B, * old_phase_C;
     topology * old_topo_A, * old_topo_B, * old_topo_C;
@@ -2991,14 +2991,14 @@ namespace CTF_int {
     ASSERT(A->wrld->comm == B->wrld->comm && B->wrld->comm == C->wrld->comm);
     World * wrld = A->wrld;
     CommData global_comm = wrld->cdt;
-
+   
 //    TAU_FSTART(init_select_ctr_map);
   #if BEST_VOL
     CTF_int::alloc_ptr(sizeof(int64_t)*A->order, (void**)&virt_blk_len_A);
     CTF_int::alloc_ptr(sizeof(int64_t)*B->order, (void**)&virt_blk_len_B);
     CTF_int::alloc_ptr(sizeof(int64_t)*C->order, (void**)&virt_blk_len_C);
   #endif
-
+   
     ASSERT(A->is_mapped);
     ASSERT(B->is_mapped);
     ASSERT(C->is_mapped);
@@ -3012,7 +3012,7 @@ namespace CTF_int {
     #endif
     }
 
-    // must calculate nnz_frac in initial layout
+    // must calculate nnz_frac in initial layout 
     double nnz_frac_A, nnz_frac_B, nnz_frac_C;
     this->calc_nnz_frac(nnz_frac_A, nnz_frac_B, nnz_frac_C);
   #if VERBOSE >= 1
@@ -3058,14 +3058,14 @@ namespace CTF_int {
     TAU_FSTOP(ctr_sig_map_find);
     topology * topo_g = NULL;
     int j_g;
-    int64_t ttopo;
+    int64_t ttopo; 
     bool is_exh;
     if (search_sig != ctr_sig_map.end()){
       ttopo = search_sig->second.ttopo;
       is_exh = search_sig->second.is_exh;
     } else {
       int64_t ttopo_sel, ttopo_exh;
-      double gbest_time_exh;
+      double gbest_time_sel, gbest_time_exh;
       TAU_FSTART(get_best_sel_map);
       get_best_sel_map(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, ttopo_sel, gbest_time_sel);
       TAU_FSTOP(get_best_sel_map);
@@ -3227,7 +3227,7 @@ namespace CTF_int {
 
     MPI_Barrier(global_comm.cm);
 #endif
-
+    
 
     if (A->is_cyclic == 0 &&
         B->is_cyclic == 0 &&
@@ -3272,18 +3272,18 @@ namespace CTF_int {
       need_remap = 1;
     if (need_remap)
       C->redistribute(*dC);
-
+                  
     TAU_FSTOP(redistribute_for_contraction);
-
+   
     CTF_int::cdealloc( old_phase_A );
     CTF_int::cdealloc( old_phase_B );
     CTF_int::cdealloc( old_phase_C );
-
+   
     delete [] old_map_A;
     delete [] old_map_B;
     delete [] old_map_C;
 
-
+   
     delete dA;
     delete dB;
     delete dC;
@@ -3575,7 +3575,7 @@ namespace CTF_int {
       } else if (sC && i_C != -1){
         nvirt = virt_dim[i]/str_C->strip_dim[i_C];
       }*/
-
+     
       nvirt = nvirt * virt_dim[i];
     }
     if (nvirt_all != NULL)
@@ -3746,7 +3746,7 @@ namespace CTF_int {
         }
         rec_ctr = &skctr->rec_ctr;
       }
-
+ 
       if (B->is_sparse && B->wrld->np > 1){
         spctr_pin_keys * skctr = new spctr_pin_keys(this, 1);
         if (is_top){
@@ -3757,7 +3757,7 @@ namespace CTF_int {
         }
         rec_ctr = &skctr->rec_ctr;
       }
-
+ 
       if (C->is_sparse && C->wrld->np > 1){
         spctr_pin_keys * skctr = new spctr_pin_keys(this, 2);
         if (is_top){
@@ -4007,7 +4007,7 @@ namespace CTF_int {
       } else if (sC && i_C != -1){
         nvirt = virt_dim[i]/str_C->strip_dim[i_C];
       }*/
-
+     
       nvirt = nvirt * virt_dim[i];
     }
 
@@ -4113,7 +4113,7 @@ namespace CTF_int {
     int * phys_mapped;
 
     int nphys_dim = A->topo->order;
-
+ 
     CTF_int::alloc_ptr(sizeof(int)*nphys_dim*3, (void**)&phys_mapped);
     memset(phys_mapped, 0, sizeof(int)*nphys_dim*3);
 
@@ -4267,7 +4267,7 @@ namespace CTF_int {
         CBA.contract();
         return SUCCESS;
       }
-
+     
     }*/
 
 
@@ -4311,7 +4311,7 @@ namespace CTF_int {
       fftsr.is_offloadable = 0;
   #endif
     }*/
-  double time_est;
+
   #ifdef PROFILE
     TAU_FSTART(pre_map_barrier);
     MPI_Barrier(global_comm.cm);
@@ -4323,8 +4323,7 @@ namespace CTF_int {
   #endif
   #if REDIST
     //stat = map_tensors(type, fftsr, felm, alpha, beta, &ctrf);
-
-    stat = map(&ctrf, time_est);
+    stat = map(&ctrf);
     if (stat == ERROR) {
       printf("Failed to map tensors to physical grid\n");
       return ERROR;
@@ -4346,7 +4345,7 @@ namespace CTF_int {
       }
   #endif
     }
-    stat = map(&ctrf, time_est);
+    stat = map(&ctrf);
     if (stat == ERROR) {
       printf("Failed to map tensors to physical grid\n");
       return ERROR;
@@ -4500,10 +4499,8 @@ namespace CTF_int {
         if (size_blk_B != NULL) cdealloc(size_blk_B);
         if (size_blk_C != NULL) cdealloc(size_blk_C);
       }
-    } else{
-//      if (global_comm.rank == 0) ctrf->print();
+    } else
       ctrf->run(A->data, B->data, C->data);
-    }
   #ifdef PROFILE_MEMORY
     if (C->wrld->rank == 0){
       printf("Finished contraction  computation\n");
@@ -4646,12 +4643,12 @@ namespace CTF_int {
     char const * dbeta;
     ctr * ctrf;
     tensor * tnsr_A, * tnsr_B, * tnsr_C;
-
+ 
     bool is_cons = this->check_consistency();
     if (!is_cons) return ERROR;
-
+ 
     CommData global_comm = A->wrld->cdt;
-
+ 
     A->unfold();
     B->unfold();
     C->unfold();
@@ -4719,7 +4716,7 @@ namespace CTF_int {
     tnsr_A = A;
     tnsr_B = B;
     tnsr_C = C;
-
+   
     tensor * new_tsr;
     while (tnsr_A->extract_diag(map_A, 1, new_tsr, &new_idx) == SUCCESS){
       if (tnsr_A != A) delete tnsr_A;
@@ -4803,7 +4800,7 @@ namespace CTF_int {
       if (ocfact != 1){
         if (ocfact != 1){
           tnsr_C->sr->safecopy(oc_align_alpha, tnsr_C->sr->addid());
-
+         
           for (int i=0; i<ocfact; i++){
             tnsr_C->sr->add(oc_align_alpha, align_alpha, oc_align_alpha);
           }
@@ -4821,8 +4818,7 @@ namespace CTF_int {
 
         contraction * unfold_ctr;
         new_ctr.unfold_broken_sym(&unfold_ctr);
-        double time_est;
-        if (unfold_ctr->map(&ctrf, time_est, 0) == SUCCESS){
+        if (unfold_ctr->map(&ctrf, 0) == SUCCESS){
 /*  #else
         int sy = 0;
         for (i=0; i<A->order; i++){
@@ -4942,9 +4938,9 @@ namespace CTF_int {
       s.execute();
       delete C_buf;
       return SUCCESS;
-
+     
     }
-
+   
     if (A->has_zero_edge_len ||
         B->has_zero_edge_len ||
         C->has_zero_edge_len){
@@ -5169,7 +5165,7 @@ namespace CTF_int {
           nc->set_output_nnz_frac(this->output_nnz_frac);
           nc->idx_A[iA] = num_tot;
         }
-        nc->execute();
+        nc->execute(); 
         delete nc;
         delete X2;
         cdealloc(symX);
@@ -5202,7 +5198,7 @@ namespace CTF_int {
         CTF_int::alloc_ptr(new_ctr.A->calc_nvirt()*sizeof(int64_t), (void**)&new_ctr.A->nnz_blk);
         new_ctr.A->set_new_nnz_glb(A->nnz_blk);
       }
-    }
+    }    
     if (was_home_B){
       if (A == B){ //stype->tid_A == stype->tid_B){
         new_ctr.B = new_ctr.A; //tensors[ntype.tid_B];
@@ -5373,7 +5369,7 @@ namespace CTF_int {
             ) )
           npres++;
       }
-
+     
       if (npres > 1){
         cdealloc(idx_arr);
         return true;
@@ -5452,7 +5448,7 @@ namespace CTF_int {
             ) )
           npres++;
       }
-
+     
       if (npres > 1){
         int sym_mask[T->order];
         std::fill(sym_mask, sym_mask+T->order, 0);
@@ -5460,7 +5456,7 @@ namespace CTF_int {
         /*for (int k=0; k<T->order; k++){
           printf("sym_mask[%d]=%d\n",k,sym_mask[k]);
         }*/
-
+       
         if (T->is_home){
           if (T->wrld->cdt.rank == 0)
             DPRINTF(2,"Tensor %s leaving home\n", T->name);
@@ -5748,7 +5744,7 @@ namespace CTF_int {
     delete [] edge_map_B;
     delete [] edge_map_C;
   }
-
+    
   bool contraction_signature::operator<(contraction_signature const & other) const{
     if (order_A > other.order_A) return true;
     if (order_A < other.order_A) return false;
@@ -5758,39 +5754,39 @@ namespace CTF_int {
     if (order_C < other.order_C) return false;
     for (int i=0; i<order_A; i++){
       if (lens_A[i] > other.lens_A[i]) return true;
-      if (lens_A[i] < other.lens_A[i]) return false;
+      if (lens_A[i] < other.lens_A[i]) return false; 
     }
     for (int i=0; i<order_B; i++){
       if (lens_B[i] > other.lens_B[i]) return true;
-      if (lens_B[i] < other.lens_B[i]) return false;
+      if (lens_B[i] < other.lens_B[i]) return false; 
     }
     for (int i=0; i<order_C; i++){
       if (lens_C[i] > other.lens_C[i]) return true;
-      if (lens_C[i] < other.lens_C[i]) return false;
+      if (lens_C[i] < other.lens_C[i]) return false; 
     }
     for (int i=0; i<order_A; i++){
       if (idx_A[i] > other.idx_A[i]) return true;
-      if (idx_A[i] < other.idx_A[i]) return false;
+      if (idx_A[i] < other.idx_A[i]) return false; 
     }
     for (int i=0; i<order_B; i++){
       if (idx_B[i] > other.idx_B[i]) return true;
-      if (idx_B[i] < other.idx_B[i]) return false;
+      if (idx_B[i] < other.idx_B[i]) return false; 
     }
     for (int i=0; i<order_C; i++){
       if (idx_C[i] > other.idx_C[i]) return true;
-      if (idx_C[i] < other.idx_C[i]) return false;
+      if (idx_C[i] < other.idx_C[i]) return false; 
     }
     for (int i=0; i<order_A; i++){
       if (sym_A[i] > other.sym_A[i]) return true;
-      if (sym_A[i] < other.sym_A[i]) return false;
+      if (sym_A[i] < other.sym_A[i]) return false; 
     }
     for (int i=0; i<order_B; i++){
       if (sym_B[i] > other.sym_B[i]) return true;
-      if (sym_B[i] < other.sym_B[i]) return false;
+      if (sym_B[i] < other.sym_B[i]) return false; 
     }
     for (int i=0; i<order_C; i++){
       if (sym_C[i] > other.sym_C[i]) return true;
-      if (sym_C[i] < other.sym_C[i]) return false;
+      if (sym_C[i] < other.sym_C[i]) return false; 
     }
     if (is_sparse_A > other.is_sparse_A) return true;
     if (is_sparse_A < other.is_sparse_A) return false;
@@ -5812,15 +5808,15 @@ namespace CTF_int {
     if (topo_C->order < other.topo_C->order) return false;
     for (int i=0; i<topo_A->order; i++){
       if (topo_A->lens[i] > other.topo_A->lens[i]) return true;
-      if (topo_A->lens[i] < other.topo_A->lens[i]) return false;
+      if (topo_A->lens[i] < other.topo_A->lens[i]) return false; 
     }
     for (int i=0; i<topo_B->order; i++){
       if (topo_B->lens[i] > other.topo_B->lens[i]) return true;
-      if (topo_B->lens[i] < other.topo_B->lens[i]) return false;
+      if (topo_B->lens[i] < other.topo_B->lens[i]) return false; 
     }
     for (int i=0; i<topo_C->order; i++){
       if (topo_C->lens[i] > other.topo_C->lens[i]) return true;
-      if (topo_C->lens[i] < other.topo_C->lens[i]) return false;
+      if (topo_C->lens[i] < other.topo_C->lens[i]) return false; 
     }
     for (int i=0; i<order_A; i++){
       if (rank_dim_map(edge_map_A+i,other.edge_map_A+i) == 1) return true;
diff --git a/src/contraction/contraction.h b/src/contraction/contraction.h
index a9fe3640..90ca2211 100644
--- a/src/contraction/contraction.h
+++ b/src/contraction/contraction.h
@@ -304,7 +304,7 @@ namespace CTF_int {
        * \param[in] do_remap whether to redistribute tensors
        * \return SUCCESS if valid mapping found, ERROR if not enough memory or another issue
        */
-      int map(ctr ** ctrf, double &time_estimate, bool do_remap=1);
+      int map(ctr ** ctrf, bool do_remap=1);
  
       /**
         * \brief contracts tensors alpha*A*B+beta*C -> C.

From e06635fc3c308f78e6504f19f89ca1e3d0d9cc68 Mon Sep 17 00:00:00 2001
From: Andreas Irmler <andreas.irmler@tuwien.ac.at>
Date: Wed, 12 May 2021 12:14:09 +0200
Subject: [PATCH 05/19] add dryRun feature. without any output

---
 src/contraction/contraction.cxx | 11 ++++++++---
 src/interface/tensor.cxx        |  1 +
 src/interface/world.cxx         |  9 +++++++++
 src/interface/world.h           |  9 +++++++++
 src/tensor/untyped_tensor.cxx   | 14 +++++---------
 src/tensor/untyped_tensor.h     |  3 ---
 6 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/src/contraction/contraction.cxx b/src/contraction/contraction.cxx
index f5dd14dc..55b08476 100644
--- a/src/contraction/contraction.cxx
+++ b/src/contraction/contraction.cxx
@@ -3250,7 +3250,7 @@ namespace CTF_int {
       }
     } else
       need_remap = 1;
-    if (need_remap)
+    if (need_remap && !wrld->dryRanks)
       A->redistribute(*dA);
     need_remap = 0;
     if (B->topo == old_topo_B){
@@ -3260,7 +3260,7 @@ namespace CTF_int {
       }
     } else
       need_remap = 1;
-    if (need_remap)
+    if (need_remap && !wrld->dryRanks)
       B->redistribute(*dB);
     need_remap = 0;
     if (C->topo == old_topo_C){
@@ -3270,7 +3270,7 @@ namespace CTF_int {
       }
     } else
       need_remap = 1;
-    if (need_remap)
+    if (need_remap && !wrld->dryRanks)
       C->redistribute(*dC);
                   
     TAU_FSTOP(redistribute_for_contraction);
@@ -4368,6 +4368,11 @@ namespace CTF_int {
       C->print_map();
     }
 #endif
+  if (A->wrld->dryRanks){
+    delete ctrf;
+    TAU_FSTOP(contract);
+    return SUCCESS;
+  }
 
   #ifdef PROFILE
     TAU_FSTART(pre_fold_barrier);
diff --git a/src/interface/tensor.cxx b/src/interface/tensor.cxx
index 3ae791f5..aba43748 100644
--- a/src/interface/tensor.cxx
+++ b/src/interface/tensor.cxx
@@ -1563,6 +1563,7 @@ NORM_INFTY_INST(double)
       IASSERT(0);
       return;
     }
+    if (T.wrld->dryRanks) return;
     for (int64_t i=0; i<T.size; i++){
       ((dtype*)T.data)[i] = ((dtype)((rtype)CTF_int::get_rand48()*(rmax-rmin)))+rmin;
     }
diff --git a/src/interface/world.cxx b/src/interface/world.cxx
index b81b8736..5e0c37e5 100644
--- a/src/interface/world.cxx
+++ b/src/interface/world.cxx
@@ -85,6 +85,13 @@ namespace CTF {
 #endif
   }
 
+  World::World(std::string print, int dryRanks_){
+    comm = MPI_COMM_WORLD;
+    dryRanks = dryRanks_;
+
+    this->init(comm, TOPOLOGY_GENERIC);
+  }
+
 
   World::World(int             order, 
                int const *     lens, 
@@ -164,6 +171,8 @@ namespace CTF {
                   int             argc,
                   const char * const *  argv){
     cdt = CommData(comm);
+    if (dryRanks) cdt.np = dryRanks;
+
     if (mach == TOPOLOGY_GENERIC)
       phys_topology = NULL;
     else
diff --git a/src/interface/world.h b/src/interface/world.h
index 622b27d4..1a729ce2 100644
--- a/src/interface/world.h
+++ b/src/interface/world.h
@@ -24,6 +24,8 @@ namespace CTF {
       int rank;
       /** \brief number of processors */
       int np;
+      /** \brief number of ranks to model in a dry run; 0 (default) disables dry-run mode */
+      int dryRanks = 0;
       /** \brief derived topologies */
       std::vector< CTF_int::topology* > topovec;
       /** \brief whether the world has been initialized */
@@ -86,6 +88,13 @@ namespace CTF {
        */
       World(char const * emptystring);
 
+      /**
+       * \brief constructor for a dry world
+       * \param[in] print determines how to handle output
+       * \param[in] dryRanks number of dry ranks
+      */
+
+      World(std::string print, int dryRanks);
 
       /**
        * \brief frees CTF library
diff --git a/src/tensor/untyped_tensor.cxx b/src/tensor/untyped_tensor.cxx
index f3ae76ba..9ea6d19f 100644
--- a/src/tensor/untyped_tensor.cxx
+++ b/src/tensor/untyped_tensor.cxx
@@ -687,7 +687,7 @@ namespace CTF_int {
         memset(this->nnz_blk, 0, sizeof(int64_t)*calc_nvirt());
         this->set_new_nnz_glb(this->nnz_blk);
       } else {
-        if (!is_dry)
+        if (!wrld->dryRanks)
         sr->set(this->data, sr->addid(), this->size);
       }
     } else {
@@ -728,18 +728,18 @@ namespace CTF_int {
           //this->has_home = 0;
     /*      if (wrld->rank == 0)
             DPRINTF(3,"Initial size of tensor %d is " PRId64 ",",tensor_id,this->size);*/
-          if (!is_dry) {
+          if (!wrld->dryRanks) {
             this->home_buffer = sr->alloc(this->home_size);
             if (wrld->rank == 0) DPRINTF(2,"Creating home of %s\n",name);
             register_size(this->size*sr->el_size);
             this->data = this->home_buffer;
           }
         } else {
-          if (!is_dry)
+          if (!wrld->dryRanks)
             this->data = sr->alloc(this->size);
         }
         #else
-        if (!is_dry)
+        if (!wrld->dryRanks)
           this->data = sr->alloc(this->size);
         //CTF_int::alloc_ptr(this->size*sr->el_size, (void**)&this->data);
         #endif
@@ -749,7 +749,7 @@ namespace CTF_int {
         this->print_lens();
         this->print_map(stdout);
         #endif
-        if (!is_dry)
+        if (!wrld->dryRanks)
           sr->init(this->size, this->data);
       }
     }
@@ -4132,9 +4132,5 @@ namespace CTF_int {
     return subtsrs;
   }
 
-  bool tensor::is_dry = false;
-  void tensor::set_dry_run(){
-    is_dry = true;
-  }
 }
 
diff --git a/src/tensor/untyped_tensor.h b/src/tensor/untyped_tensor.h
index cc4f3c85..0ebbd631 100644
--- a/src/tensor/untyped_tensor.h
+++ b/src/tensor/untyped_tensor.h
@@ -139,8 +139,6 @@ namespace CTF_int {
       int64_t nnz_tot;
       /** \brief nonzero elements in each block owned locally */
       int64_t * nnz_blk;
-      /** \brief dry run does not allocate any tensor data */
-      static bool is_dry;
 
       /**
        * \brief associated an index map with the tensor for future operation
@@ -1066,7 +1064,6 @@ namespace CTF_int {
        */
       tensor * get_no_unit_len_alias();
 
-      static void set_dry_run();
   };
 }
 #endif// __UNTYPED_TENSOR_H__

From 19f8385b67b4822c5f89fe9de195488a1268b6d3 Mon Sep 17 00:00:00 2001
From: Andreas Irmler <andreas.irmler@tuwien.ac.at>
Date: Mon, 17 May 2021 12:01:34 +0200
Subject: [PATCH 06/19] change output for dryRun

---
 src/contraction/contraction.cxx    | 32 +++++++++++++++++++++++-------
 src/contraction/ctr_2d_general.cxx | 15 ++++++++------
 src/contraction/ctr_tsr.cxx        |  9 ++++++---
 src/tensor/untyped_tensor.cxx      | 22 ++++++++++++--------
 4 files changed, 54 insertions(+), 24 deletions(-)

diff --git a/src/contraction/contraction.cxx b/src/contraction/contraction.cxx
index 55b08476..84181cb1 100644
--- a/src/contraction/contraction.cxx
+++ b/src/contraction/contraction.cxx
@@ -3205,6 +3205,15 @@ namespace CTF_int {
 //    assert(est_time == std::min(gbest_time_sel,gbest_time_exh));
 //#endif
 #endif
+    if (A->wrld->dryRanks){
+      int64_t memuse;
+      double est_time;
+      detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time);
+      printf( "Contraction will use %f GB per rank and take %f seconds\n"
+            , memuse/1024.0/1024./1024, est_time);
+    }
+
+
 
     if (can_fold()){
       iparam prm = map_fold(false);
@@ -4169,6 +4178,8 @@ namespace CTF_int {
     ctr * ctrf;
     CommData global_comm = C->wrld->cdt;
 
+
+
     if (A->has_zero_edge_len || B->has_zero_edge_len
         || C->has_zero_edge_len){
       if (!C->sr->isequal(beta,C->sr->mulid()) && !C->has_zero_edge_len){
@@ -4368,12 +4379,6 @@ namespace CTF_int {
       C->print_map();
     }
 #endif
-  if (A->wrld->dryRanks){
-    delete ctrf;
-    TAU_FSTOP(contract);
-    return SUCCESS;
-  }
-
   #ifdef PROFILE
     TAU_FSTART(pre_fold_barrier);
     MPI_Barrier(global_comm.cm);
@@ -4388,12 +4393,24 @@ namespace CTF_int {
     if (is_inner){
       iparam prm;
       TAU_FSTART(map_fold);
-      prm = map_fold();
+      prm = map_fold(!A->wrld->dryRanks);
       TAU_FSTOP(map_fold);
       delete ctrf;
       ctrf = construct_ctr(1, &prm);
     }
   #endif
+
+
+  if (A->wrld->dryRanks){
+    A->print_map();
+    B->print_map();
+    C->print_map();
+    ctrf->print();
+    delete ctrf;
+    TAU_FSTOP(contract);
+    return SUCCESS;
+  }
+
   #if (VERBOSE >= 1 || DEBUG >= 1)
   if (global_comm.rank == 0){
     ctrf->print();
@@ -5251,6 +5268,7 @@ namespace CTF_int {
 
     ret = new_ctr.sym_contract();//&ntype, ftsr, felm, alpha, beta);
     if (ret!= SUCCESS) return ret;
+    if (C->wrld->dryRanks) return SUCCESS;
     if (was_home_C) new_ctr.C->unfold();
 
     if (was_home_C && !new_ctr.C->is_home){
diff --git a/src/contraction/ctr_2d_general.cxx b/src/contraction/ctr_2d_general.cxx
index 3f02c6d4..ee3c6055 100755
--- a/src/contraction/ctr_2d_general.cxx
+++ b/src/contraction/ctr_2d_general.cxx
@@ -199,15 +199,18 @@ namespace CTF_int {
 
   void ctr_2d_general::print() {
     printf("ctr_2d_general: edge_len = %ld\n", edge_len);
-    printf("move_A = %d, ctr_lda_A = %ld, ctr_sub_lda_A = %ld\n",
+    printf("move_A = %d, ctr_lda_A = %ld, ctr_sub_lda_A = %ld",
             move_A, ctr_lda_A, ctr_sub_lda_A);
-    if (move_A) printf("cdt_A length = %d\n",cdt_A->np);
-    printf("move_B = %d, ctr_lda_B = %ld, ctr_sub_lda_B = %ld\n",
+    if (move_A) printf(", cdt_A length = %d",cdt_A->np);
+    printf("\n");
+    printf("move_B = %d, ctr_lda_B = %ld, ctr_sub_lda_B = %ld",
             move_B, ctr_lda_B, ctr_sub_lda_B);
-    if (move_B) printf("cdt_B length = %d\n",cdt_B->np);
-    printf("move_C = %d, ctr_lda_C = %ld, ctr_sub_lda_C = %ld\n",
+    if (move_B) printf(", cdt_B length = %d",cdt_B->np);
+    printf("\n");
+    printf("move_C = %d, ctr_lda_C = %ld, ctr_sub_lda_C = %ld",
             move_C, ctr_lda_C, ctr_sub_lda_C);
-    if (move_C) printf("cdt_C length = %d\n",cdt_C->np);
+    if (move_C) printf(", cdt_C length = %d",cdt_C->np);
+    printf("\n");
 #ifdef OFFLOAD
     if (alloc_host_buf)
       printf("alloc_host_buf is true\n");
diff --git a/src/contraction/ctr_tsr.cxx b/src/contraction/ctr_tsr.cxx
index d548b752..cb449e64 100755
--- a/src/contraction/ctr_tsr.cxx
+++ b/src/contraction/ctr_tsr.cxx
@@ -338,14 +338,17 @@ namespace CTF_int {
     int i;
     printf("seq_tsr_ctr:\n");
     for (i=0; i<order_A; i++){
-      printf("edge_len_A[%d]=%ld\n",i,edge_len_A[i]);
+      printf("edge_len_A[%d]=%ld ",i,edge_len_A[i]);
     }
+    printf("\n");
     for (i=0; i<order_B; i++){
-      printf("edge_len_B[%d]=%ld\n",i,edge_len_B[i]);
+      printf("edge_len_B[%d]=%ld ",i,edge_len_B[i]);
     }
+    printf("\n");
     for (i=0; i<order_C; i++){
-      printf("edge_len_C[%d]=%ld\n",i,edge_len_C[i]);
+      printf("edge_len_C[%d]=%ld ",i,edge_len_C[i]);
     }
+    printf("\n");
     printf("is inner = %d\n", is_inner);
     if (is_inner) printf("inner n = %ld m= %ld k = %ld l = %ld, ta = %c, tb =%c, tc = %c\n",
                           inner_params.n, inner_params.m, inner_params.k, inner_params.l, inner_params.tA, inner_params.tB, inner_params.tC);
diff --git a/src/tensor/untyped_tensor.cxx b/src/tensor/untyped_tensor.cxx
index 9ea6d19f..95ae5531 100644
--- a/src/tensor/untyped_tensor.cxx
+++ b/src/tensor/untyped_tensor.cxx
@@ -677,7 +677,6 @@ namespace CTF_int {
     int * restricted;
     int btopo;
     int64_t bmemuse;
-
     if (this->is_mapped){
       if (is_sparse){
         sr->pair_dealloc(this->data);
@@ -759,17 +758,24 @@ namespace CTF_int {
 
   void tensor::print_map(FILE * stream, bool allcall) const {
     if (!allcall || wrld->rank == 0){
-      if (is_sparse)
-        printf("printing mapping of sparse tensor %s\n",name);
-      else
-        printf("printing mapping of dense tensor %s\n",name);
+//      if (is_sparse)
+//        printf("printing mapping of sparse tensor %s\n",name);
+//      else
+//        printf("printing mapping of dense tensor %s\n",name);
+//      if (topo != NULL){
+//        printf("CTF: %s mapped to order %d topology with dims:",name,topo->order);
+//        for (int dim=0; dim<topo->order; dim++){
+//          printf(" %d ",topo->lens[dim]);
+//        }
+//      }
+//      printf("\n");
       if (topo != NULL){
-        printf("CTF: %s mapped to order %d topology with dims:",name,topo->order);
+        printf("%s topo (",name);
         for (int dim=0; dim<topo->order; dim++){
-          printf(" %d ",topo->lens[dim]);
+          printf(", %d",topo->lens[dim]);
         }
+        printf("); ");
       }
-      printf("\n");
       char tname[200];
       tname[0] = '\0';
       sprintf(tname, "%s[", name);

From fc734f17e7f42b8ad71629f16fcdbf91d66c6a74 Mon Sep 17 00:00:00 2001
From: Andreas Irmler <andreas.irmler@tuwien.ac.at>
Date: Fri, 28 May 2021 20:54:34 +0200
Subject: [PATCH 07/19] change of model + very lengthy output in dryRun

---
 src/contraction/contraction.cxx | 51 ++++++++++++++-------
 src/contraction/contraction.h   |  2 +-
 src/shared/init_models.cxx      | 81 +++++++++++++++++----------------
 3 files changed, 76 insertions(+), 58 deletions(-)

diff --git a/src/contraction/contraction.cxx b/src/contraction/contraction.cxx
index 84181cb1..f90c2994 100644
--- a/src/contraction/contraction.cxx
+++ b/src/contraction/contraction.cxx
@@ -2582,11 +2582,12 @@ namespace CTF_int {
     assert(nnz_frac_C>=0.);
   }
 
-  void contraction::detail_estimate_mem_and_time(distribution const * dA, distribution const * dB, distribution const * dC, topology * old_topo_A, topology * old_topo_B, topology * old_topo_C, mapping const * old_map_A, mapping const * old_map_B, mapping const * old_map_C, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C, int64_t & memuse, double & est_time){
+  void contraction::detail_estimate_mem_and_time(distribution const * dA, distribution const * dB, distribution const * dC, topology * old_topo_A, topology * old_topo_B, topology * old_topo_C, mapping const * old_map_A, mapping const * old_map_B, mapping const * old_map_C, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C, int64_t & memuse, double & est_time, double &redist_time, double &contr_time, double &fold_time){
     TAU_FSTART(detail_estimate_mem_and_time);
     ctr * sctr;
     est_time = 0.;
     memuse = 0;
+    fold_time = 0.0;
     topology * topo_i = A->topo;
     bool csr_or_coo = B->is_sparse || C->is_sparse || is_custom || !A->sr->has_coo_ker;
     bool use_ccsr =  csr_or_coo && A->is_sparse && C->is_sparse && !B->is_sparse;
@@ -2598,13 +2599,14 @@ namespace CTF_int {
 #if FOLD_TSR
     if (can_fold()){
       est_time = est_time_fold();
+      fold_time = est_time;
       iparam prm = map_fold(false);
     
       sctr = construct_ctr(1, &prm);
       if (this->is_sparse())
-        est_time = ((spctr*)sctr)->est_time_rec(sctr->num_lyr, A->calc_nvirt(), B->calc_nvirt(), C->calc_nvirt(), nnz_frac_A, nnz_frac_B, nnz_frac_C);
+        est_time += ((spctr*)sctr)->est_time_rec(sctr->num_lyr, A->calc_nvirt(), B->calc_nvirt(), C->calc_nvirt(), nnz_frac_A, nnz_frac_B, nnz_frac_C);
       else
-        est_time = sctr->est_time_rec(sctr->num_lyr);
+        est_time += sctr->est_time_rec(sctr->num_lyr);
       A->remove_fold();
       B->remove_fold();
       C->remove_fold();
@@ -2672,6 +2674,7 @@ namespace CTF_int {
       }
 
     }
+    contr_time = est_time - fold_time;
 #if DEBUG >= 4
     printf("mapping passed contr est_time = %E sec %d %ld %ld %ld %E %E %E\n", est_time, sctr->num_lyr, A->calc_nvirt(), B->calc_nvirt(), C->calc_nvirt(), nnz_frac_A, nnz_frac_B, nnz_frac_C);
 #endif
@@ -2724,6 +2727,7 @@ namespace CTF_int {
       mem_redist_tmp += C->get_redist_mem(*dC, nnz_frac_C);
       //mem_redist += (int64_t)(nnz_frac_C*C->size*C->sr->pair_size()) +C->get_redist_mem(*dC, nnz_frac_C);
     }
+    redist_time = est_time - contr_time - fold_time;
     assert(mem_fold_tmp >= 0);
     assert(mem_fold >= 0);
     assert(mem_redist >= 0);
@@ -2755,7 +2759,9 @@ namespace CTF_int {
   #if DEBUG > 4
       for (int t=1; t<(int)wrld->topovec.size()+8; t++){
   #else
-      for (int64_t t=global_comm.rank+1; t<(int)wrld->topovec.size()+8; t+=global_comm.np){
+      int64_t incr(global_comm.np);
+      if (A->wrld->dryRanks) incr = 1;
+      for (int64_t t=global_comm.rank+1; t<(int)wrld->topovec.size()+8; t+=incr){
   #endif
         A->clear_mapping();
         B->clear_mapping();
@@ -2794,7 +2800,7 @@ namespace CTF_int {
         ret = map_to_topology(topo_i, j);
 
         if (ret == NEGATIVE){
-          //printf("map_to_topology returned negative\n");
+//          printf("map_to_topology returned negative %d %d\n", t, j);
           continue;
         }
    
@@ -2806,6 +2812,7 @@ namespace CTF_int {
         C->topo = topo_i;
        
         if (check_mapping() == 0){
+//          printf("check mapping is zero %d %d\n", t, j);
           continue;
         }
         A->set_padding();
@@ -2825,11 +2832,17 @@ namespace CTF_int {
           continue;
         }
         int64_t memuse;//, bmemuse;
-        double est_time;
-        detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time);
+        double est_time, redist_time, contr_time, fold_time;
+        detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time, redist_time, contr_time, fold_time);
 #ifdef MIN_MEMORY
         est_time = memuse;
 #endif
+
+        if (A->wrld->dryRanks) printf( "t %d j %d will use %f GB per rank and take %f s, %f %f %f\n"
+                                     , t, j, memuse/1024.0/1024./1024
+                                     , est_time, redist_time, contr_time, fold_time);
+
+
         ASSERT(est_time >= 0.0);
         if ((int64_t)memuse >= max_memuse){
           if (global_comm.rank == 0)
@@ -2889,7 +2902,7 @@ namespace CTF_int {
       int64_t old_off = choice_offset;
       choice_offset += tnum_choices;
       for (int j=0; j<tnum_choices; j++){
-        if ((old_off + j)%global_comm.np != global_comm.rank)
+        if (!A->wrld->dryRanks && (old_off + j)%global_comm.np != global_comm.rank)
           continue;
         A->clear_mapping();
         B->clear_mapping();
@@ -2931,12 +2944,15 @@ namespace CTF_int {
           continue;
         }
         int64_t memuse;//, bmemuse;
-        double est_time;
-        detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time);
+        double est_time, redist_time, contr_time, fold_time;
+        detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time, redist_time, contr_time, fold_time);
 #ifdef MIN_MEMORY
         est_time = memuse;
 #endif
         ASSERT(est_time >= 0.0);
+        if (A->wrld->dryRanks) printf( "topo %d order %d will use %f GB per rank and take %f s, %f %f %f\n"
+                                     , i, j, memuse/1024.0/1024./1024, est_time, redist_time, contr_time, fold_time);
+
 
         if ((int64_t)memuse >= max_memuse){
           DPRINTF(3,"[EXH] Not enough memory available for topo %d with order %d memory %ld/%ld\n", i,j,memuse,max_memuse);
@@ -3076,7 +3092,7 @@ namespace CTF_int {
       A->set_padding();
       B->set_padding();
       C->set_padding();
-      if (gbest_time_sel < 100.){
+      if (gbest_time_sel < 1e-100){
         gbest_time_exh = gbest_time_sel+1.;
         ttopo_exh = ttopo_sel;
       } else {
@@ -3107,6 +3123,7 @@ namespace CTF_int {
       ctr_sig_map.insert(std::pair<contraction_signature,topo_info>(sig,ti));
       TAU_FSTOP(ctr_sig_map_insert);
     }
+
     if (!do_remap || ttopo == INT64_MAX || ttopo == -1){
       CTF_int::cdealloc(old_phase_A);
       CTF_int::cdealloc(old_phase_B);
@@ -3191,9 +3208,9 @@ namespace CTF_int {
 #if (VERBOSE >= 1 || DEBUG >= 1 || PROFILE_MEMORY >= 1)
 
     int64_t memuse;
-    double est_time;
+    double est_time, redist_time, contr_time, fold_time;
 
-    detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time);
+    detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time, redist_time, contr_time, fold_time);
     if (global_comm.rank == 0){
       printf("Contraction will use %E bytes per processor out of %E available memory (already used %E) and take an estimated of %E sec\n",
               (double)memuse,(double)proc_bytes_available(),(double)proc_bytes_used(),est_time);
@@ -3207,10 +3224,10 @@ namespace CTF_int {
 #endif
     if (A->wrld->dryRanks){
       int64_t memuse;
-      double est_time;
-      detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time);
-      printf( "Contraction will use %f GB per rank and take %f seconds\n"
-            , memuse/1024.0/1024./1024, est_time);
+      double est_time, redist_time, contr_time, fold_time;
+      detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time, redist_time, contr_time, fold_time);
+      printf( "Contraction will use %f GB per rank and take %f s, %f %f %f\n"
+            , memuse/1024.0/1024./1024, est_time, redist_time, contr_time, fold_time);
     }
 
 
diff --git a/src/contraction/contraction.h b/src/contraction/contraction.h
index 90ca2211..8395983c 100644
--- a/src/contraction/contraction.h
+++ b/src/contraction/contraction.h
@@ -292,7 +292,7 @@ namespace CTF_int {
 
       void calc_nnz_frac(double & nnz_frac_A, double & nnz_frac_B, double & nnz_frac_C);
 
-      void detail_estimate_mem_and_time(distribution const * dA, distribution const * dB, distribution const * dC, topology * old_topo_A, topology * old_topo_B, topology * old_topo_C, mapping const * old_map_A, mapping const * old_map_B, mapping const * old_map_C, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C, int64_t & memuse, double & est_time);
+      void detail_estimate_mem_and_time(distribution const * dA, distribution const * dB, distribution const * dC, topology * old_topo_A, topology * old_topo_B, topology * old_topo_C, mapping const * old_map_A, mapping const * old_map_B, mapping const * old_map_C, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C, int64_t & memuse, double & est_time, double &redist_time, double &contr_time, double &fold_time);
 
       void get_best_sel_map(distribution const * dA, distribution const * dB, distribution const * dC, topology * old_topo_A, topology * old_topo_B, topology * old_topo_C, mapping const * old_map_A, mapping const * old_map_B, mapping const * old_map_C, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C, int64_t & idx, double & time);
 
diff --git a/src/shared/init_models.cxx b/src/shared/init_models.cxx
index 94e1c5e3..14a29a37 100644
--- a/src/shared/init_models.cxx
+++ b/src/shared/init_models.cxx
@@ -1,41 +1,42 @@
 namespace CTF_int{
-double csrred_mdl_init[] = {3.0689E-03, 2.2385E-03, 4.4815E-07};
-double csrred_mdl_cst_init[] = {-1.8323E-04, 1.3076E-04, 2.8732E-09};
-double alltoall_mdl_init[] = {1.0000E-06, 1.0000E-06, 5.0000E-10};
-double alltoallv_mdl_init[] = {7.3164E-23, 1.0404E-04, 2.5827E-07};
-double red_mdl_init[] = {4.5530E-11, 3.0466E-17, 5.0877E-10};
-double red_mdl_cst_init[] = {1.2881E-04, 1.4093E-16, 8.3976E-10};
-double allred_mdl_init[] = {4.7939E-14, 7.4715E-13, 2.0949E-06};
-double allred_mdl_cst_init[] = {-3.3754E-04, 2.1343E-04, 3.0801E-09};
-double bcast_mdl_init[] = {1.1115E-16, 1.0754E-01, 4.1995E-10};
-double seq_tsr_ctr_mdl_cst_init[] = {7.8076E-13, 6.9558E-08, 1.3923E-08};
-double seq_tsr_ctr_mdl_ref_init[] = {4.9138E-08, 5.8290E-10, 4.8575E-11};
-double seq_tsr_ctr_mdl_inr_init[] = {6.0166E-21, 2.3443E-13, 2.0967E-11};
-double seq_tsr_ctr_mdl_off_init[] = {6.2925E-05, 1.7449E-11, 1.7211E-12};
-double seq_tsr_ctr_mdl_cst_inr_init[] = {0.0, 0.0, 1.6E-11};
-double seq_tsr_ctr_mdl_cst_off_init[] = {8.4844E-04, 5.9246E-11, 3.5247E-10};
-double long_contig_transp_mdl_init[] = {0.0, 1.25E-08};
-double shrt_contig_transp_mdl_init[] = {0.0, 1.25E-08};
-double non_contig_transp_mdl_init[] = {2.6680E-05, 4.6247E-06};
-double seq_tsr_spctr_cst_off_k0_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
-double seq_tsr_spctr_cst_off_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
-double seq_tsr_spctr_cst_off_k2_init[] = {2.1996E-04, 3.1883E-09, 3.8743E-11};
-double seq_tsr_spctr_off_k0_init[] = {8.6970E-06, 4.5598E-11, 1.1544E-09};
-double seq_tsr_spctr_off_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
-double seq_tsr_spctr_off_k2_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
-double seq_tsr_spctr_cst_k0_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
-double seq_tsr_spctr_cst_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
-double seq_tsr_spctr_cst_k2_init[] = {2.1303E-74, 5.7379E-09, 4.1887E-11};
-double seq_tsr_spctr_cst_k3_init[] = {1.4917E-05, 2.5510E-10, 5.4110E-12};
-double seq_tsr_spctr_cst_k4_init[] = {5.6408E-06, 1.8318E-09, 5.2399E-80};
-double seq_tsr_spctr_cst_k5_init[] = {2.8218E-05, 3.0049E-09, 5.2399E-11};
-double seq_tsr_spctr_k0_init[] = {3.9315E-05, 2.2285E-08, 6.1958E-08};
-double seq_tsr_spctr_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
-double seq_tsr_spctr_k2_init[] = {5.9868E-14, 1.4877E-09, 5.3514E-12};
-double seq_tsr_spctr_k3_init[] = {1.3994E-15, 2.5071E-09, 2.7323E-11};
-double seq_tsr_spctr_k4_init[] = {2.0404E-04, 8.2989E-09, 6.0431E-11};
-double seq_tsr_spctr_k5_init[] = {6.9073E-15, 4.0130E-09, 2.2669E-13};
-double pin_keys_mdl_init[] = {4.0261E-05, 7.2443E-07};
-double spredist_mdl_init[] = {6.8713E-23, 7.8867E-04, 6.9422E-11};
-double dgtog_res_mdl_init[] = {0.0, 0.0, 5.0E-10};
-double blres_mdl_init[] = {0.0, 3E-8};}
+  double csrred_mdl_init[] = {3.0689E-03, 2.2385E-03, 4.4815E-07};          // not used I guess (at least not in dense)
+  double csrred_mdl_cst_init[] = {-1.8323E-04, 1.3076E-04, 2.8732E-09};     // not used I guess (at least not in dense)
+  double alltoall_mdl_init[] = {1.0000E-06, 1.0000E-06, 5.0000E-10};        // not used I guess
+  double alltoallv_mdl_init[] = {7.3164E-23, 1.0404E-04, 2.5827E-07};       // not used I guess
+  double red_mdl_init[] = {4.5530E-11, 3.0466E-17, 2.5E-9}; // mpi_reduce, used for summa for moving C 
+  double red_mdl_cst_init[] = {1.2881E-04, 1.4093E-16, 8.3976E-10};         // not used I guess
+  double allred_mdl_init[] = {4.7939E-14, 7.4715E-13, 2.0949E-06}; // de-facto not used I guess
+  double allred_mdl_cst_init[] = {-3.3754E-04, 2.1343E-04, 3.0801E-09};     // not used I guess
+  double bcast_mdl_init[] = {1.1115E-16, 1.0754E-16, 1.32E-9}; //mpi_bcast, used for summa for bcasting A,B, 3rd parameter is around 0.7 GB/s
+  double seq_tsr_ctr_mdl_cst_init[] = {7.8076E-13, 6.9558E-08, 1.3923E-08}; // not used I guess
+  double seq_tsr_ctr_mdl_ref_init[] = {4.9138E-08, 5.8290E-10, 4.8575E-11}; // not used I guess
+  double seq_tsr_ctr_mdl_inr_init[] = {6.0166E-21, 2.3443E-13, 1.4286E-11}; // our model, 2nd parameter negligible for large matrices, 3rd parameter fixed to 70 GFLOP/s per core
+  double seq_tsr_ctr_mdl_off_init[] = {6.2925E-05, 1.7449E-11, 1.7211E-12}; // not used I guess
+  double seq_tsr_ctr_mdl_cst_inr_init[] = {0.0, 0.0, 1.6E-11};
+  double seq_tsr_ctr_mdl_cst_off_init[] = {8.4844E-04, 5.9246E-11, 3.5247E-10};
+  double long_contig_transp_mdl_init[] = {0.0, 1.25E-08};
+  double shrt_contig_transp_mdl_init[] = {0.0, 1.25E-08};
+  double non_contig_transp_mdl_init[] = {2.6680E-05, 4.6247E-06};
+  double seq_tsr_spctr_cst_off_k0_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
+  double seq_tsr_spctr_cst_off_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
+  double seq_tsr_spctr_cst_off_k2_init[] = {2.1996E-04, 3.1883E-09, 3.8743E-11};
+  double seq_tsr_spctr_off_k0_init[] = {8.6970E-06, 4.5598E-11, 1.1544E-09};
+  double seq_tsr_spctr_off_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
+  double seq_tsr_spctr_off_k2_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
+  double seq_tsr_spctr_cst_k0_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
+  double seq_tsr_spctr_cst_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
+  double seq_tsr_spctr_cst_k2_init[] = {2.1303E-74, 5.7379E-09, 4.1887E-11};
+  double seq_tsr_spctr_cst_k3_init[] = {1.4917E-05, 2.5510E-10, 5.4110E-12};
+  double seq_tsr_spctr_cst_k4_init[] = {5.6408E-06, 1.8318E-09, 5.2399E-80};
+  double seq_tsr_spctr_cst_k5_init[] = {2.8218E-05, 3.0049E-09, 5.2399E-11};
+  double seq_tsr_spctr_k0_init[] = {3.9315E-05, 2.2285E-08, 6.1958E-08};
+  double seq_tsr_spctr_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
+  double seq_tsr_spctr_k2_init[] = {5.9868E-14, 1.4877E-09, 5.3514E-12};
+  double seq_tsr_spctr_k3_init[] = {1.3994E-15, 2.5071E-09, 2.7323E-11};
+  double seq_tsr_spctr_k4_init[] = {2.0404E-04, 8.2989E-09, 6.0431E-11};
+  double seq_tsr_spctr_k5_init[] = {6.9073E-15, 4.0130E-09, 2.2669E-13};
+  double pin_keys_mdl_init[] = {4.0261E-05, 7.2443E-07};
+  double spredist_mdl_init[] = {6.8713E-23, 7.8867E-04, 6.9422E-11};
+  double dgtog_res_mdl_init[] = {0.0, 0.0, 7.25E-10};  // elementwise reshuffling of distribution
+  double blres_mdl_init[] = {0.0, 1E-10};  // blockwise reshuffling of distribution 
+}

From f260af2372367d4c7b408a1264cb68c4efa6ba9e Mon Sep 17 00:00:00 2001
From: Andreas Irmler <andreas.irmler@tuwien.ac.at>
Date: Fri, 4 Jun 2021 12:23:43 +0200
Subject: [PATCH 08/19] minor changes on dryRun output

---
 src/contraction/contraction.cxx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/contraction/contraction.cxx b/src/contraction/contraction.cxx
index f90c2994..690590e5 100644
--- a/src/contraction/contraction.cxx
+++ b/src/contraction/contraction.cxx
@@ -2838,10 +2838,10 @@ namespace CTF_int {
         est_time = memuse;
 #endif
 
-        if (A->wrld->dryRanks) printf( "t %d j %d will use %f GB per rank and take %f s, %f %f %f\n"
+        if (A->wrld->dryRanks) printf( "t %d j %d will use %f GB per rank and take %f s, %f %f %f"
                                      , t, j, memuse/1024.0/1024./1024
                                      , est_time, redist_time, contr_time, fold_time);
-
+        if (A->wrld->dryRanks) C->print_map();
 
         ASSERT(est_time >= 0.0);
         if ((int64_t)memuse >= max_memuse){
@@ -3092,7 +3092,7 @@ namespace CTF_int {
       A->set_padding();
       B->set_padding();
       C->set_padding();
-      if (gbest_time_sel < 1e-100){
+      if (gbest_time_sel < 1e100){
         gbest_time_exh = gbest_time_sel+1.;
         ttopo_exh = ttopo_sel;
       } else {
@@ -3298,7 +3298,7 @@ namespace CTF_int {
       need_remap = 1;
     if (need_remap && !wrld->dryRanks)
       C->redistribute(*dC);
-                  
+
     TAU_FSTOP(redistribute_for_contraction);
    
     CTF_int::cdealloc( old_phase_A );

From 67b7e0b367a4d71fa4d77e44db4c7975de381d8e Mon Sep 17 00:00:00 2001
From: Andreas Irmler <andreas.irmler@tuwien.ac.at>
Date: Mon, 19 Jul 2021 15:21:33 +0200
Subject: [PATCH 09/19] solomonik fix: disable fast slice when tensors are not
 mapped to all processors

---
 src/tensor/untyped_tensor.cxx | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/tensor/untyped_tensor.cxx b/src/tensor/untyped_tensor.cxx
index 95ae5531..6e81d1f7 100644
--- a/src/tensor/untyped_tensor.cxx
+++ b/src/tensor/untyped_tensor.cxx
@@ -1041,7 +1041,16 @@ namespace CTF_int {
       bool tsr_has_sym = false;
       bool tsr_has_virt = false;
 
+      int topo_dims_A = tsr_A->topo->order;
+      int topo_dims_B = tsr_B->topo->order;
+
       for (int i=0; i<this->order; i++){
+        if (tsr_A->edge_map[i].type == PHYSICAL_MAP){
+          topo_dims_A--;
+        }
+        if (tsr_B->edge_map[i].type == PHYSICAL_MAP){
+          topo_dims_B--;
+        }
         if (A->sym[i] != NS || this->sym[i] != NS)
           tsr_has_sym = true;
         if (A->edge_map[i].type == VIRTUAL_MAP || (A->edge_map[i].has_child && A->edge_map[i].child->type == VIRTUAL_MAP)){
@@ -1053,7 +1062,7 @@ namespace CTF_int {
       }
       int nvirt_A = tsr_A->calc_nvirt();
       int nvirt_B = tsr_B->calc_nvirt();
-      if (tsr_B->wrld->np == tsr_A->wrld->np && !tsr_has_sym && !this->is_sparse && !A->is_sparse && nvirt_A == 1 && nvirt_B == 1 && !tsr_has_virt){
+      if (tsr_B->wrld->np == tsr_A->wrld->np && !tsr_has_sym && !this->is_sparse && !A->is_sparse && nvirt_A == 1 && nvirt_B == 1 && !tsr_has_virt && topo_dims_A ==0 && topo_dims_B == 0){
         push_slice(this, offsets_B, ends_B, beta, A, offsets_A, ends_A, alpha);
         TAU_FSTOP(slice);
         return;

From 48766764bb4c8e2a9839bc7d6d55e40e925c98b8 Mon Sep 17 00:00:00 2001
From: Andreas Irmler <andreas.irmler@tuwien.ac.at>
Date: Tue, 21 Dec 2021 21:32:48 +0100
Subject: [PATCH 10/19] simple test

---
 tst | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 tst

diff --git a/tst b/tst
new file mode 100644
index 00000000..aff60f88
--- /dev/null
+++ b/tst
@@ -0,0 +1 @@
+fiaf:

From 32dfea25c50b6e81418f3181cf7eacbc5ae21f9a Mon Sep 17 00:00:00 2001
From: Andreas Irmler <andreas.irmler@tuwien.ac.at>
Date: Tue, 28 Dec 2021 16:35:14 +0100
Subject: [PATCH 11/19] blocked communicator for summa2d and matrices only

---
 src/contraction/contraction.cxx    |  13 +++-
 src/contraction/ctr_2d_general.cxx | 108 +++++++++++++++++++++++++++++
 src/contraction/ctr_2d_general.h   |  15 ++++
 src/contraction/ctr_comm.h         |   4 +-
 src/interface/common.cxx           | 104 +++++++++++++++++++++++++++
 src/interface/common.h             |  19 +++++
 tst                                |   1 -
 7 files changed, 261 insertions(+), 3 deletions(-)
 delete mode 100644 tst

diff --git a/src/contraction/contraction.cxx b/src/contraction/contraction.cxx
index 690590e5..e43cd29d 100644
--- a/src/contraction/contraction.cxx
+++ b/src/contraction/contraction.cxx
@@ -4463,6 +4463,12 @@ namespace CTF_int {
   #endif
     TAU_FSTART(ctr_func);
     /* Invoke the contraction algorithm */
+    TAU_FSTART(blockComm);
+    std::vector<int> swap;
+    ctrf->blockComm( A->topo->lens, A->data, B->data, C->data
+		   , A->size, B->size, C->size, global_comm, swap);
+    MPI_Barrier(global_comm.cm);
+    TAU_FSTOP(blockComm);
     A->topo->activate();
 
   #ifdef PROFILE_MEMORY
@@ -4547,7 +4553,12 @@ namespace CTF_int {
   #endif
 
 
-    A->topo->deactivate();
+//    A->topo->deactivate();
+    TAU_FSTART(blockComm);
+    MPI_Barrier(global_comm.cm);
+    ctrf->blockComm( A->topo->lens, A->data, B->data, C->data
+                   , A->size, B->size, C->size, global_comm, swap);
+    TAU_FSTOP(blockComm);
 
   #ifdef PROFILE
     TAU_FSTART(post_ctr_func_barrier);
diff --git a/src/contraction/ctr_2d_general.cxx b/src/contraction/ctr_2d_general.cxx
index ee3c6055..31fb0a60 100755
--- a/src/contraction/ctr_2d_general.cxx
+++ b/src/contraction/ctr_2d_general.cxx
@@ -472,5 +472,113 @@ namespace CTF_int {
     }
     TAU_FSTOP(ctr_2d_general);
   }
+
+  void ctr_2d_general::blockComm( int const * rgrid, char *A, char *B, char *C
+                                , size_t sizeA, size_t sizeB, size_t sizeC
+                                , CommData glb_comm, std::vector<int> &swap
+  ){ // exchanges A/B/C and the cdt_A/cdt_B rank+color with partner ranks of glb_comm
+    int rank = glb_comm.rank;
+    int np = glb_comm.np;
+    int src, dst;
+    // empty swap: determine the partners and fill swap; filled swap: use the inverse mapping
+    if (swap.empty()) {
+      ipair nr(getNumNodes(glb_comm.cm));
+      // rGrid is the rankGrid of the given tensor topology
+      CommGrid grid({rgrid[0], rgrid[1]}, nr.first);
+      ipair nGrid = grid.nGrid;
+      ipair iGrid = grid.iGrid;
+      // rr is the key/color pair for the original rank distribution of dim_comm[0]
+      std::vector<ipair> rr(np);
+      std::vector< std::pair<ipair, int> > perm(np);
+      for (int r(0); r < np; r++) rr[r] = { r % rgrid[0], r / rgrid[0] };
+      // the desired distribution is nGrid.first x nGrid.second blocks with the same color
+      for (int r(0); r < np; r++){
+        // the color is the jth column and kth row in the nodeGrid
+        int clr = (rr[r].second/iGrid.second)*nGrid.first + rr[r].first/iGrid.first;
+        int key = (rr[r].second%iGrid.second)*iGrid.first + rr[r].first%iGrid.first;
+        // (color, key) comes first so that std::sort orders ranks by color, then key
+        perm[r] = { { clr, key }, r};
+      }
+      std::sort(perm.begin(), perm.end());
+      for (auto p: perm) swap.push_back(p.second);
+
+      src = swap[rank];
+      auto it( std::find(swap.begin(), swap.end(), rank) );
+      dst = std::distance(swap.begin(), it);
+    }
+    else {
+      dst = swap[rank];
+      auto it( std::find(swap.begin(), swap.end(), rank) );
+      src = std::distance(swap.begin(), it);
+    }
+
+    MPI_Barrier(glb_comm.cm);
+    MPI_Status s;
+    MPI_Sendrecv_replace(&cdt_A->color, 1, MPI_INT, dst, 0, src, 0, glb_comm.cm, &s);
+    MPI_Sendrecv_replace(&cdt_B->color, 1, MPI_INT, dst, 0, src, 0, glb_comm.cm, &s);
+    MPI_Sendrecv_replace(&cdt_A->rank,  1, MPI_INT, dst, 0, src, 0, glb_comm.cm, &s);
+    MPI_Sendrecv_replace(&cdt_B->rank,  1, MPI_INT, dst, 0, src, 0, glb_comm.cm, &s);
+
+    MPI_Barrier(glb_comm.cm);
+
+    size_t bytes(std::max<size_t>(sizeA*sr_A->el_size, sizeB*sr_B->el_size));
+    bytes = std::max<size_t>(bytes, sizeC*sr_C->el_size);
+    std::vector<char> buf(bytes); // receive buffer sized for the largest operand; freed on return
+    // Do the A job
+    MPI_Request sreq, rreq;
+    MPI_Irecv(buf.data(), sizeA, sr_A->mdtype(), src, 0, glb_comm.cm, &rreq);
+    MPI_Isend(A,          sizeA, sr_A->mdtype(), dst, 0, glb_comm.cm, &sreq);
+    MPI_Wait(&rreq, MPI_STATUS_IGNORE);
+    MPI_Wait(&sreq, MPI_STATUS_IGNORE);
+    memcpy(A, buf.data(), sizeA*sr_A->el_size);
+
+    // Do the B job (B's own datatype and element size)
+    MPI_Irecv(buf.data(), sizeB, sr_B->mdtype(), src, 0, glb_comm.cm, &rreq);
+    MPI_Isend(B,          sizeB, sr_B->mdtype(), dst, 0, glb_comm.cm, &sreq);
+    MPI_Wait(&rreq, MPI_STATUS_IGNORE);
+    MPI_Wait(&sreq, MPI_STATUS_IGNORE);
+    memcpy(B, buf.data(), sizeB*sr_B->el_size);
+
+    // Do the C job (C's own datatype and element size)
+    MPI_Irecv(buf.data(), sizeC, sr_C->mdtype(), src, 0, glb_comm.cm, &rreq);
+    MPI_Isend(C,          sizeC, sr_C->mdtype(), dst, 0, glb_comm.cm, &sreq);
+    MPI_Wait(&rreq, MPI_STATUS_IGNORE);
+    MPI_Wait(&sreq, MPI_STATUS_IGNORE);
+    memcpy(C, buf.data(), sizeC*sr_C->el_size);
+    MPI_Barrier(glb_comm.cm);
+  }
+
+  ipair ctr_2d_general::getNumNodes(MPI_Comm comm){
+    int rank, np;
+    MPI_Comm_rank(comm, &rank);
+    MPI_Comm_size(comm, &np);
+    // returns {number of distinct processor names in comm, np / that number}
+    std::vector<std::string> nodeList(np);
+    char nodeName[MPI_MAX_PROCESSOR_NAME];
+    std::vector<char> nodeNames(static_cast<size_t>(np)*MPI_MAX_PROCESSOR_NAME); // heap, not a VLA
+    std::vector<int> nameLengths(np);
+    std::vector<int> off(np);
+    int nameLength;
+    MPI_Get_processor_name(nodeName, &nameLength);
+    MPI_Allgather(
+      &nameLength, 1, MPI_INT, nameLengths.data(), 1, MPI_INT, comm
+    );
+    for (int i(1); i < np; i++) off[i] = off[i-1] + nameLengths[i-1];
+    MPI_Allgatherv(
+      nodeName, nameLengths[rank], MPI_BYTE, nodeNames.data(),
+      nameLengths.data(), off.data(), MPI_BYTE, comm
+    );
+    for (int i(0); i < np; i++) {
+      // names are packed back to back without terminators; off[i] marks each start
+      nodeList[i].assign(nodeNames.data() + off[i], nameLengths[i]);
+    }
+    std::sort(nodeList.begin(), nodeList.end());
+    std::vector<std::string>::iterator it(
+      std::unique(nodeList.begin(), nodeList.end())
+    );
+    int nNodes(std::distance(nodeList.begin(), it)); // int: avoids narrowing in the braced return
+    return {nNodes, np/nNodes};
+  }
+
 }
 
diff --git a/src/contraction/ctr_2d_general.h b/src/contraction/ctr_2d_general.h
index 3df75d3d..fd85d12f 100644
--- a/src/contraction/ctr_2d_general.h
+++ b/src/contraction/ctr_2d_general.h
@@ -68,6 +68,8 @@ namespace CTF_int{
       CommData * cdt_A;
       CommData * cdt_B;
       CommData * cdt_C;
+
+
       /* Class to be called on sub-blocks */
       ctr * rec_ctr;
       
@@ -81,6 +83,19 @@ namespace CTF_int{
        *  where b is the smallest blocking factor among A and B or A and C or B and C. 
        */
       void run(char * A, char * B, char * C);
+      /**
+       * \brief interchanges processors in the communicator -> permuting
+       *  the data such that each communicator has adjacent global ranks
+       */
+      void blockComm( int const *rgrid, char *A, char *B, char *C
+                    , size_t sizeA, size_t sizeB, size_t sizeC
+                    , CommData globalComm, std::vector<int> &swap);
+
+      /**
+       * \brief returns the number of nodes & number of ranks per node
+       *        note: only trustworthy if ranks per node is the same for all nodes!!
+       */
+      ipair getNumNodes(MPI_Comm comm);
       /**
        * \brief returns the number of bytes of buffer space
        *  we need 
diff --git a/src/contraction/ctr_comm.h b/src/contraction/ctr_comm.h
index 0f4670df..cd2c6935 100644
--- a/src/contraction/ctr_comm.h
+++ b/src/contraction/ctr_comm.h
@@ -200,7 +200,9 @@ namespace CTF_int{
       virtual double est_time_fp(int nlyr) { return 0; };
       virtual double est_time_rec(int nlyr) { return est_time_fp(nlyr); };
       virtual ctr * clone() { return NULL; };
-      
+      virtual void blockComm( int const *rgrid, char *A, char *B, char *C
+                            , size_t sizeA, size_t sizeB, size_t sizeC
+                            , CommData globalComm, std::vector<int> &swap) {}; // default implementation: does nothing
       /**
        * \brief deallocates generic ctr object
        */
diff --git a/src/interface/common.cxx b/src/interface/common.cxx
index ed7ffc07..b8873add 100644
--- a/src/interface/common.cxx
+++ b/src/interface/common.cxx
@@ -571,6 +571,110 @@ namespace CTF_int {
     alltoallv_mdl.observe(tps);
   }
 
+
+  CommGrid::CommGrid(ipair _rGrid, int _nNodes){
+    rGrid  = _rGrid;
+    nRanks = rGrid.first*rGrid.second;
+    colorKey.resize(nRanks);
+    nGrid  = getNodeGrid(_nNodes, rGrid);  // reads nRanks, which is set above
+    // intra-node grid: per-dimension quotient of the rank grid and the node grid
+    iGrid  = ipair(rGrid.first / nGrid.first, rGrid.second / nGrid.second);
+    assert(colorKey.size() == iGrid.first*iGrid.second*_nNodes);
+  }
+
+  ipair CommGrid::getNodeGrid(int nNodes, ipair rGrid){ // splits the factors of nNodes into a 2D node grid
+    ipair nGrid({1, 1});
+    std::vector<int> facNodes(CommGrid::factorize(nNodes));
+    std::vector<int> facrgf(CommGrid::factorize(rGrid.first));
+    std::vector<int> facrgs(CommGrid::factorize(rGrid.second));
+    std::vector<int> diff;
+
+    // We select all prime factors of #nodes
+    // which do not occur in the prime factors of a grid edge;
+    // we remove these factors and assign them to the opposite grid edge
+    // (std::set_difference requires sorted ranges; presumably factorize() returns ascending factors — confirm)
+    std::set_difference( facNodes.begin(), facNodes.end()
+                       , facrgf.begin(), facrgf.end()
+                       , std::back_inserter(diff)
+                       );
+    // drop one occurrence from facNodes for every factor that was moved to diff
+   for (auto d: diff)
+      facNodes.erase(std::find(facNodes.begin(), facNodes.end(), d));
+    // factors that do not fit rGrid.first go to the second node-grid edge
+    nGrid.second =
+      std::accumulate(diff.begin(), diff.end(), 1, std::multiplies<int>());
+    diff.resize(0);
+    // same procedure against rGrid.second, using only the factors that are left
+    std::set_difference( facNodes.begin(), facNodes.end()
+                       , facrgs.begin(), facrgs.end()
+                       , std::back_inserter(diff)
+                       );
+    for (auto d: diff)
+      facNodes.erase(std::find(facNodes.begin(), facNodes.end(), d));
+    // factors that do not fit rGrid.second go to the first node-grid edge
+    nGrid.first =
+      std::accumulate(diff.begin(), diff.end(), 1, std::multiplies<int>());
+
+    // if there is no element left, all prime factors are distributed
+    if (!facNodes.size()) return nGrid;
+    // assign the remaining prime factors such that the grid on every
+    // node is as close as possible to a square
+    double minVal(DBL_MAX);
+    ipair bestPair;
+    for (int i(0); i < pow(2, facNodes.size()); i++){ // i is a bit mask over facNodes
+      ipair edges(CommGrid::getSquare(i, facNodes));
+      // build igrid.first / igrid.second and take the one with
+      // a ratio closest to one.
+      // Not every node-grid candidate divides the rank count:
+      // we allow only those that do
+      int first(edges.first*nGrid.first);
+      int second(edges.second*nGrid.second);
+      if ( (nRanks/first)*first != nRanks) continue;
+      if ( (nRanks/second)*second != nRanks) continue;
+      // keep the candidate with the smallest 1/first + 1/second
+      double val(1.0/(double)first  + 1.0/(double)second);
+      if ( minVal > val ){
+        minVal = val;
+        bestPair = {edges.first, edges.second};
+      }
+    }
+    nGrid.first  *= bestPair.first;  // NOTE(review): if every candidate was skipped, bestPair is still (0,0) here — confirm this cannot happen
+    nGrid.second *= bestPair.second;
+    return nGrid;
+  }
+
+  std::vector<int> CommGrid::factorize(int number){
+    // Returns the prime factors of `number` in ascending order, with multiplicity.
+    std::vector<int> factors;
+    int n(number);
+    // values below 4 (including 0, 1 and negatives) are returned as a single entry
+    if (n < 4) return std::vector<int>(1, n);
+    for (int d(2); d <= n/d; d++){  // d <= n/d is d*d <= n without integer overflow
+      while (!(n%d)){
+        factors.push_back(d);
+        n /= d;
+      }
+    }
+    if (n > 1) factors.push_back(n);  // the remaining cofactor is prime
+    return factors;
+  }
+
+  ipair CommGrid::getSquare(int id, std::vector<int> factors) {
+    // Splits the product of `factors` into two edges: if bit `pos` of `id` is set,
+    // factors[pos] is multiplied into .first; .second is the remaining quotient.
+    ipair result(1, std::accumulate(factors.begin(), factors.end(), 1, std::multiplies<int>()));
+    // integer shifts instead of floating-point pow(); never index past `factors`
+    for (size_t pos(0); pos < factors.size() && pos < 31; pos++) {
+      int bit(1 << pos);
+      if (bit > id) break;
+      if (id & bit) result.first *= factors[pos];
+    }
+    result.second /= result.first;
+    return result;
+  }
+
+
+
   char * get_default_inds(int order, int start_index){
     char * inds = (char*)CTF_int::alloc(order*sizeof(char));
     for (int i=0; i<order; i++){
diff --git a/src/interface/common.h b/src/interface/common.h
index e46d5f70..8fa1c43a 100644
--- a/src/interface/common.h
+++ b/src/interface/common.h
@@ -14,6 +14,7 @@
 #include <iostream>
 #include <limits.h>
 #include <random>
+#include <cfloat>
 
 #include "../shared/model.h"
 
@@ -241,6 +242,24 @@ namespace CTF_int {
 
   };
 
+  using ipair = std::pair<int,int>;
+  struct CommGrid { // maps a 2D rank grid onto a grid of nodes and an intra-node grid
+    CommGrid(){};
+    ~CommGrid(){};
+    CommGrid(ipair _rGrid, int _nNodes); // fills every member below from the rank grid and node count
+
+    int nRanks; // product of the two rank-grid edges
+    std::vector<ipair> colorKey; // resized to nRanks by the two-argument constructor
+    ipair rGrid; // RankGrid: given by the user
+    ipair nGrid; // NodeGrid: output, grid of nodes
+    ipair iGrid; // intraNodeGrid: the ranks of one node possess this grid
+
+    ipair getNodeGrid(int nNodes, ipair rGrid); // computes nGrid; reads nRanks
+    std::vector<int> factorize(int number); // prime factors of number
+    ipair getSquare(int id, std::vector<int> factors); // splits factors into two edges, selected by the bits of id
+  };
+
+
   int  alloc_ptr(int64_t len, void ** const ptr);
   int  mst_alloc_ptr(int64_t len, void ** const ptr);
   void * alloc(int64_t len);
diff --git a/tst b/tst
deleted file mode 100644
index aff60f88..00000000
--- a/tst
+++ /dev/null
@@ -1 +0,0 @@
-fiaf:

From e4f1a90fce27e21c7ed1db4b03c9018f53bccd95 Mon Sep 17 00:00:00 2001
From: Andreas Irmler <andreas.irmler@tuwien.ac.at>
Date: Tue, 28 Dec 2021 17:40:33 +0100
Subject: [PATCH 12/19] profile bcast! temp. as it needs an extra barrier

---
 src/interface/common.cxx | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/interface/common.cxx b/src/interface/common.cxx
index b8873add..3aa5b3fc 100644
--- a/src/interface/common.cxx
+++ b/src/interface/common.cxx
@@ -387,7 +387,10 @@ namespace CTF_int {
 #ifdef TUNE
     double st_time = MPI_Wtime();
 #endif
+    TAU_FSTART(bcast);
     MPI_Bcast(buf, count, mdtype, root, cm);
+    MPI_Barrier(cm);
+    TAU_FSTART(bcast);
 #ifdef TUNE
     MPI_Barrier(cm);
     double exe_time = MPI_Wtime()-st_time;

From 88c101348b00a87def0393f962d3f87b1007110a Mon Sep 17 00:00:00 2001
From: Andreas Irmler <andreas.irmler@tuwien.ac.at>
Date: Tue, 28 Dec 2021 18:13:38 +0100
Subject: [PATCH 13/19] fix typo in last commit

---
 src/interface/common.cxx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/interface/common.cxx b/src/interface/common.cxx
index 3aa5b3fc..c8d250da 100644
--- a/src/interface/common.cxx
+++ b/src/interface/common.cxx
@@ -390,7 +390,7 @@ namespace CTF_int {
     TAU_FSTART(bcast);
     MPI_Bcast(buf, count, mdtype, root, cm);
     MPI_Barrier(cm);
-    TAU_FSTART(bcast);
+    TAU_FSTOP(bcast);
 #ifdef TUNE
     MPI_Barrier(cm);
     double exe_time = MPI_Wtime()-st_time;

From 53ae5daad851bf3b198ebe1fa761c13b12291116 Mon Sep 17 00:00:00 2001
From: Andreas Irmler <andreas.irmler@tuwien.ac.at>
Date: Mon, 28 Mar 2022 11:14:32 +0200
Subject: [PATCH 14/19] aG:Add --without-scalapack to the configure script

---
 configure | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/configure b/configure
index 5e5627b1..145548e6 100755
--- a/configure
+++ b/configure
@@ -14,6 +14,7 @@ function usage
   echo -e '\t--with-lapack     Tells CTF build to enable LAPACK functionality regardless of whether LAPACK libs have been given.'
   echo
   echo -e '\t--with-scalapack  Tells CTF build to enable ScaLAPACK functionality regardless of whether ScaLAPACK libs have been given.'
+  echo -e '\t--without-scalapack  Tells CTF build to disable ScaLAPACK functionality regardless of whether ScaLAPACK libs have been given.'
   echo
   echo -e '\t--build-scalapack Tells CTF to download and build ScaLAPACK library.'
   echo
@@ -498,6 +499,7 @@ depstype=normal
 WITH_CUDA=0
 WITH_LAPACK=0
 WITH_SCALAPACK=0
+WITHOUT_SCALAPACK=0
 WITH_STATIC=1
 WITH_DYNAMIC=1
 BUILD_SCALAPACK=0
@@ -518,6 +520,9 @@ while [ "x$1" != "x" ]; do
     --with-scalapack)
       WITH_SCALAPACK=1
       ;;
+    --without-scalapack)
+      WITHOUT_SCALAPACK=1
+      ;;
     --build-scalapack)
       BUILD_SCALAPACK=1
       ;;
@@ -1035,7 +1040,7 @@ if [ $BUILD_SCALAPACK = 1 ]; then
 fi
 
 USING_SCALA=0
-if [ $WITH_STATIC = 1 ]; then
+if [[ $WITH_STATIC = 1 && $WITHOUT_SCALAPACK = 0 ]]; then
   echo -n 'Checking for static ScaLAPACK... '
   if testlink "$LIB_PATH $LIBS" $PDGEMM $VERBOSE; then
     echo 'static SCALAPACK found.'
@@ -1069,7 +1074,7 @@ if [ $WITH_STATIC = 1 ]; then
   fi
 fi
 
-if [ $WITH_DYNAMIC = 1 ]; then
+if [[ $WITH_DYNAMIC = 1 && $WITHOUT_SCALAPACK = 0 ]]; then
   echo -n 'Checking for dynamic ScaLAPACK... '
   if testldlink "$LD_LIB_PATH" "$LD_LIBS" $PDGEMM $VERBOSE; then
     echo 'dynamic SCALAPACK found.'

From 1df94ea7fe72a9488f1b93069ba3b6360f3bbce1 Mon Sep 17 00:00:00 2001
From: Andreas Irmler <andreas.irmler@tuwien.ac.at>
Date: Mon, 2 May 2022 07:49:58 +0200
Subject: [PATCH 15/19] add node-awareness. still segfaulting in dryRun

---
 src/contraction/contraction.cxx    |  85 +++++++++++-
 src/contraction/ctr_2d_general.cxx |  20 +++
 src/contraction/ctr_2d_general.h   |  13 ++
 src/contraction/ctr_comm.cxx       |  15 +++
 src/contraction/ctr_comm.h         |   6 +
 src/interface/common.cxx           |  17 ++-
 src/interface/common.h             |  14 +-
 src/interface/world.cxx            |  20 ++-
 src/interface/world.h              |   5 +-
 src/mapping/Makefile               |   4 +-
 src/mapping/node_aware_dist.cxx    | 210 +++++++++++++++++++++++++++++
 src/mapping/node_aware_dist.h      |  16 +++
 src/mapping/topology.cxx           | 200 ++++++++++++++++++++-------
 src/mapping/topology.h             |  62 +++++++--
 14 files changed, 612 insertions(+), 75 deletions(-)
 create mode 100644 src/mapping/node_aware_dist.cxx
 create mode 100644 src/mapping/node_aware_dist.h

diff --git a/src/contraction/contraction.cxx b/src/contraction/contraction.cxx
index 690590e5..ac91dd99 100644
--- a/src/contraction/contraction.cxx
+++ b/src/contraction/contraction.cxx
@@ -3,6 +3,7 @@
 #include "../scaling/strp_tsr.h"
 #include "../mapping/mapping.h"
 #include "../mapping/distribution.h"
+#include "../mapping/node_aware_dist.h"
 #include "../tensor/untyped_tensor.h"
 #include "../shared/util.h"
 #include "../shared/memcontrol.h"
@@ -2838,7 +2839,7 @@ namespace CTF_int {
         est_time = memuse;
 #endif
 
-        if (A->wrld->dryRanks) printf( "t %d j %d will use %f GB per rank and take %f s, %f %f %f"
+        if (A->wrld->dryRanks) printf( "t %ld j %d will use %f GB per rank and take %f s, %f %f %f"
                                      , t, j, memuse/1024.0/1024./1024
                                      , est_time, redist_time, contr_time, fold_time);
         if (A->wrld->dryRanks) C->print_map();
@@ -4461,6 +4462,65 @@ namespace CTF_int {
     MPI_Barrier(global_comm.cm);
     TAU_FSTOP(pre_ctr_func_barrier);
   #endif
+
+
+#define NODE_AWARE 1
+#ifdef NODE_AWARE
+    TAU_FSTART(node_aware_remapping);
+    /* reorder processor grid to account for node-awareness */
+    topology orig_topo = *(C->topo);
+    int64_t node_aware_send_to_rank(0);
+    int64_t node_aware_recv_from_rank(0);
+    // FIXME: support sparsity
+    if (C->wrld->ppn != 1 && !is_sparse()){
+      std::vector<int> pe_grid(orig_topo.lens, orig_topo.lens + orig_topo.order);
+      std::vector<std::vector<int> > inter_node_grids = CTF_int::get_inter_node_grids(pe_grid, C->wrld->np/C->wrld->ppn);
+      //std::vector< std::vector<int> > intra_node_grids = CTF_int::get_all_shapes(C->wrld->ppn()){
+      int * intra_node_lens = (int*)CTF_int::alloc(orig_topo.order*sizeof(int));
+      int64_t best_topo_index(0);
+      double best_comm_vol = DBL_MAX;
+      for (size_t i=0; i<inter_node_grids.size(); i++){
+        for (int j=0; j<orig_topo.order; j++){
+          intra_node_lens[j] = orig_topo.lens[j] / inter_node_grids[i][j];
+        }
+        topology na_topo_i(orig_topo.order, orig_topo.lens, orig_topo.glb_comm, 0, intra_node_lens);
+        // overwrite topology object in a way that also changes information in CommData objects pointed to ctrf
+        C->topo->morph_to(na_topo_i);
+
+        double comm_vol_i = ctrf->est_internode_comm_vol_rec(ctrf->num_lyr);
+        if (comm_vol_i < best_comm_vol){
+          best_topo_index = i;
+          best_comm_vol = comm_vol_i;
+        }
+        C->topo->morph_to(orig_topo);
+      }
+      for (int j=0; j<orig_topo.order; j++){
+        intra_node_lens[j] = orig_topo.lens[j] / inter_node_grids[best_topo_index][j];
+      }
+      topology node_aware_topo(orig_topo.order, orig_topo.lens, orig_topo.glb_comm, 0, intra_node_lens);
+      // overwrite topology object in a way that also changes information in CommData objects pointed to ctrf
+      C->topo->morph_to(node_aware_topo);
+      node_aware_send_to_rank = get_inv_topo_reorder_rank(node_aware_topo.order, node_aware_topo.lens, intra_node_lens, orig_topo.glb_comm.rank);
+      node_aware_recv_from_rank = get_topo_reorder_rank(node_aware_topo.order, node_aware_topo.lens, node_aware_topo.lda, intra_node_lens, orig_topo.glb_comm.rank);
+      if (orig_topo.glb_comm.rank != node_aware_send_to_rank){
+        IASSERT(orig_topo.glb_comm.rank != node_aware_recv_from_rank);
+        TAU_FSTART(redistribute_for_node_aware);
+        // FIXME: to support sparsity need to also communicate nnz information here
+        MPI_Status stat;
+        MPI_Sendrecv_replace(A->data, A->size, A->sr->mdtype(), node_aware_send_to_rank, 1322, node_aware_recv_from_rank, 1322, orig_topo.glb_comm.cm, &stat);
+        MPI_Sendrecv_replace(B->data, B->size, B->sr->mdtype(), node_aware_send_to_rank, 1323, node_aware_recv_from_rank, 1323, orig_topo.glb_comm.cm, &stat);
+        MPI_Sendrecv_replace(C->data, C->size, C->sr->mdtype(), node_aware_send_to_rank, 1324, node_aware_recv_from_rank, 1324, orig_topo.glb_comm.cm, &stat);
+        TAU_FSTOP(redistribute_for_node_aware);
+      }
+      cdealloc(intra_node_lens);
+    }
+    TAU_FSTOP(node_aware_remapping);
+#endif
+
+
+
+
+
     TAU_FSTART(ctr_func);
     /* Invoke the contraction algorithm */
     A->topo->activate();
@@ -4545,6 +4605,29 @@ namespace CTF_int {
       printf("Finished contraction  computation\n");
     }
   #endif
+#ifdef NODE_AWARE
+    TAU_FSTART(node_aware_backmapping);
+    /* reorder processor grid to account for node-awareness */
+    // FIXME: support sparsity
+    if (C->wrld->ppn != 1 && !is_sparse() && orig_topo.glb_comm.rank != node_aware_send_to_rank){
+      TAU_FSTART(redistribute_for_node_aware);
+      // FIXME: to support sparsity need to also communicate nnz information here
+      MPI_Status stat;
+      if (A->is_home) {
+        MPI_Sendrecv_replace(A->data, A->size, A->sr->mdtype(), node_aware_recv_from_rank, 1325, node_aware_send_to_rank, 1325, orig_topo.glb_comm.cm, &stat);
+      }
+      if (B->is_home) {
+        MPI_Sendrecv_replace(B->data, B->size, B->sr->mdtype(), node_aware_recv_from_rank, 1326, node_aware_send_to_rank, 1326, orig_topo.glb_comm.cm, &stat);
+      }
+      MPI_Sendrecv_replace(C->data, C->size, C->sr->mdtype(), node_aware_recv_from_rank, 1327, node_aware_send_to_rank, 1327, orig_topo.glb_comm.cm, &stat);
+      TAU_FSTOP(redistribute_for_node_aware);
+    }
+    if (C->wrld->ppn != 1 && !is_sparse()) {
+      C->topo->morph_to(orig_topo);
+    }
+    TAU_FSTOP(node_aware_backmapping);
+#endif
+
 
 
     A->topo->deactivate();
diff --git a/src/contraction/ctr_2d_general.cxx b/src/contraction/ctr_2d_general.cxx
index ee3c6055..844f7310 100755
--- a/src/contraction/ctr_2d_general.cxx
+++ b/src/contraction/ctr_2d_general.cxx
@@ -265,6 +265,26 @@ namespace CTF_int {
     return rec_ctr->est_time_rec(1)*(double)edge_len/MIN(nlyr,edge_len) + est_time_fp(nlyr);
   }
 
+
+  double ctr_2d_general::est_internode_collective_comm_vol(int nlyr) {
+    // per moved operand: block bytes times (np / intra_node_np - 1) of its communicator
+    int64_t blk_A, blk_B, blk_C, len_A, len_B, len_C, aux;
+    find_bsizes(blk_A, blk_B, blk_C, len_A, len_B, len_C, aux);
+    double vol = 0.0;
+    if (move_A) vol += (sr_A->el_size*len_A) * ((cdt_A->np / cdt_A->intra_node_np) - 1);
+    if (move_B) vol += (sr_B->el_size*len_B) * ((cdt_B->np / cdt_B->intra_node_np) - 1);
+    if (move_C) vol += (sr_C->el_size*len_C) * ((cdt_C->np / cdt_C->intra_node_np) - 1);
+    // scaled by edge_len / min(nlyr, edge_len)
+    return (vol*(double)edge_len)/MIN(nlyr,edge_len);
+  }
+
+  double ctr_2d_general::est_internode_comm_vol_rec(int nlyr) {
+    double const rec_vol = rec_ctr->est_internode_comm_vol_rec(1)*(double)edge_len/MIN(nlyr,edge_len);
+    return rec_vol + est_internode_collective_comm_vol(nlyr);  // nested kernel's volume + this kernel's
+  }
+
+
+
   int64_t ctr_2d_general::mem_fp() {
     int64_t b_A, b_B, b_C, s_A, s_B, s_C, aux_size;
     find_bsizes(b_A, b_B, b_C, s_A, s_B, s_C, aux_size);
diff --git a/src/contraction/ctr_2d_general.h b/src/contraction/ctr_2d_general.h
index 3df75d3d..6a20f088 100644
--- a/src/contraction/ctr_2d_general.h
+++ b/src/contraction/ctr_2d_general.h
@@ -102,6 +102,19 @@ namespace CTF_int{
        * \return bytes needed for recursive contraction
        */
       double est_time_rec(int nlyr);
+
+      /**
+       * \brief estimate the inter-node communication volume of this kernel
+       * \return volume in bytes, represented as floating point
+       */
+      double est_internode_collective_comm_vol(int nlyr);
+
+      /**
+       * \brief estimate the inter-node communication volume of the algorithm recursively
+       * \return volume in bytes, represented as floating point
+       */
+      double est_internode_comm_vol_rec(int nlyr);
+
       ctr * clone();
 
       /**
diff --git a/src/contraction/ctr_comm.cxx b/src/contraction/ctr_comm.cxx
index de7a3e20..693cebe2 100755
--- a/src/contraction/ctr_comm.cxx
+++ b/src/contraction/ctr_comm.cxx
@@ -193,6 +193,21 @@ namespace CTF_int {
     return rec_ctr->est_time_rec(nlyr) + est_time_fp(nlyr);
   }
 
+  double ctr_replicate::est_internode_comm_vol_rec(int nlyr) {
+    // for every communicator of each operand: operand bytes times
+    // (np / intra_node_np - 1) of that communicator
+    double vol = 0.;
+    for (int a = 0; a < ncdt_A; a++)
+      vol += (size_A*sr_A->el_size) * ((cdt_A[a]->np / cdt_A[a]->intra_node_np) - 1);
+    for (int b = 0; b < ncdt_B; b++)
+      vol += (size_B*sr_B->el_size) * ((cdt_B[b]->np / cdt_B[b]->intra_node_np) - 1);
+    for (int c = 0; c < ncdt_C; c++)
+      vol += (size_C*sr_C->el_size) * ((cdt_C[c]->np / cdt_C[c]->intra_node_np) - 1);
+    // add the volume reported by the nested kernel
+    double const rec_vol = rec_ctr->est_internode_comm_vol_rec(nlyr);
+    return rec_vol + vol;
+  }
+
   int64_t ctr_replicate::mem_fp(){
     return 0;
   }
diff --git a/src/contraction/ctr_comm.h b/src/contraction/ctr_comm.h
index 0f4670df..c7164181 100644
--- a/src/contraction/ctr_comm.h
+++ b/src/contraction/ctr_comm.h
@@ -199,6 +199,7 @@ namespace CTF_int{
       virtual int64_t mem_rec() { return mem_fp(); };
       virtual double est_time_fp(int nlyr) { return 0; };
       virtual double est_time_rec(int nlyr) { return est_time_fp(nlyr); };
+      virtual double est_internode_comm_vol_rec(int nlyr) { return 0; };
       virtual ctr * clone() { return NULL; };
       
       /**
@@ -254,6 +255,11 @@ namespace CTF_int{
        * \return time in sec
        */
       double est_time_rec(int nlyr);
+      /**
+       * \brief estimate the inter-node communication volume of the algorithm
+       * \return volume in bytes, represented as floating point
+       */
+      double est_internode_comm_vol_rec(int nlyr);
       void print();
       ctr * clone();
 
diff --git a/src/interface/common.cxx b/src/interface/common.cxx
index ed7ffc07..df88ba85 100644
--- a/src/interface/common.cxx
+++ b/src/interface/common.cxx
@@ -270,6 +270,7 @@ namespace CTF_int {
     rank    = other.rank;
     np      = other.np;
     color   = other.color;
+    intra_node_np = other.intra_node_np;
     created = 0;
   }
 
@@ -279,6 +280,7 @@ namespace CTF_int {
     rank    = other.rank;
     np      = other.np;
     color   = other.color;
+    intra_node_np = other.intra_node_np;
     created = 0;
     return *this;
   }
@@ -288,16 +290,18 @@ namespace CTF_int {
     cm = cm_;
     MPI_Comm_rank(cm, &rank);
     MPI_Comm_size(cm, &np);
+    intra_node_np = 1;
     alive = 1;
     created = 0;
   }
 
-  CommData::CommData(int rank_, int color_, int np_){
-    rank    = rank_;
-    color   = color_;
-    np      = np_;
-    alive   = 0;
-    created = 0;
+  CommData::CommData(int rank_, int color_, int np_, int intra_node_np_){
+    rank          = rank_;
+    color         = color_;
+    np            = np_;
+    intra_node_np = intra_node_np_;
+    alive         = 0;
+    created       = 0;
   }
 
   CommData::CommData(int rank_, int color_, CommData parent){
@@ -306,6 +310,7 @@ namespace CTF_int {
     ASSERT(parent.alive);
     MPI_Comm_split(parent.cm, color, rank_, &cm);
     MPI_Comm_size(cm, &np);
+    intra_node_np = 1;
     alive   = 1;
     created = 1;
   }
diff --git a/src/interface/common.h b/src/interface/common.h
index e46d5f70..d60cbcd9 100644
--- a/src/interface/common.h
+++ b/src/interface/common.h
@@ -134,20 +134,31 @@ namespace CTF_int {
   // accumulates computed flops (targeted for internal use)
   void add_computed_flops(int64_t n);
 
+  void set_save_glb_comm(MPI_Comm gcm);
+
   // get computed flops
   int64_t get_computed_flops();
 
   // accumulates computed flops (targeted for internal use)
   void add_estimated_flops(int64_t n);
 
+  // wrapper of MPI communicator
   class CommData {
     public:
+      // MPI communicator
       MPI_Comm cm;
+      // number of processors
       int np;
+      // rank of processor
       int rank;
+      // color of subcommunicator cm relative to some parent communicator, if provided
       int color;
+      // 1 if this communicator is active (MPI_Comm is created and not finalized)
       int alive;
+      // 1 if this object created a communicator that needs to be finalized (as opposed to being an alias to a different communicator object)
       int created;
+      // intra_node_np, number of processes per node (intra-node grid dimension) corresponding to this communicator, if provided, 1 otherwise
+      int intra_node_np;
   
       CommData();
       ~CommData();
@@ -167,8 +178,9 @@ namespace CTF_int {
        * \param[in] rank rank within this comm
        * \param[in] color identifier of comm within parent
        * \param[in] np number of processors within this comm
+       * \param[in] intra_node_np number of processors per physical node
        */
-      CommData(int rank, int color, int np);
+      CommData(int rank, int color, int np, int intra_node_np=1);
 
       /**
        * \brief create active subcomm from parent comm which must be active
diff --git a/src/interface/world.cxx b/src/interface/world.cxx
index 5e0c37e5..844e5f0f 100644
--- a/src/interface/world.cxx
+++ b/src/interface/world.cxx
@@ -58,6 +58,7 @@ namespace CTF {
   World::World(int            argc,
                char * const * argv){
     comm = MPI_COMM_WORLD;
+    ppn = 1;
 #ifdef BGQ
     this->init(comm, TOPOLOGY_BGQ, argc, argv);
 #else
@@ -71,9 +72,11 @@ namespace CTF {
 
 
   World::World(MPI_Comm       comm_,
+               int            ppn_,
                int            argc,
                char * const * argv){
     comm = comm_;
+    ppn  = ppn_;
 #ifdef BGQ
     this->init(comm, TOPOLOGY_BGQ, argc, argv);
 #else
@@ -85,9 +88,10 @@ namespace CTF {
 #endif
   }
 
-  World::World(std::string print, int dryRanks_){
+  World::World(std::string print, int dryRanks_, int ppn_){
     comm = MPI_COMM_WORLD;
     dryRanks = dryRanks_;
+    ppn = ppn_;
 
     this->init(comm, TOPOLOGY_GENERIC);
   }
@@ -99,11 +103,13 @@ namespace CTF {
                int             argc,
                char * const *  argv){
     comm = comm_;
+    ppn = 1;
     this->init(comm, order, lens, argc, argv);
   }
 
   World::World(World const & other){
     comm        = other.comm;
+    ppn         = other.ppn;
 #if DEBUG >= 1
     if (other.rank == 0){
       printf("CTF WARNING: Creating copy of World, which is not free or useful, pass original World by reference instead if possible.\n");
@@ -196,7 +202,7 @@ namespace CTF {
 
   int World::initialize(int                   argc,
                         const char * const *  argv){
-    char * mem_size, * ppn;
+    char * mem_size, * cppn;
     if (comm == MPI_COMM_WORLD && universe_exists){
       delete phys_topology;
       *this = universe;
@@ -271,16 +277,16 @@ namespace CTF {
                     imem_size);
         CTF_int::set_mem_size(imem_size);
       }
-      ppn = getenv("CTF_PPN");
-      if (ppn != NULL){
+      cppn = getenv("CTF_PPN");
+      if (cppn != NULL){
         if (rank == 0)
           printf("Assuming %d processes per node due to CTF_PPN environment variable\n",
-                    atoi(ppn));
-        ASSERT(atoi(ppn)>=1);
+                    atoi(cppn));
+        ASSERT(atoi(cppn)>=1);
   #ifdef BGQ
         CTF_int::set_memcap(.75);
   #else
-        CTF_int::set_memcap(.75/atof(ppn));
+        CTF_int::set_memcap(.75/atof(cppn));
   #endif
       }
       if (rank == 0)
diff --git a/src/interface/world.h b/src/interface/world.h
index 1a729ce2..07743efb 100644
--- a/src/interface/world.h
+++ b/src/interface/world.h
@@ -24,6 +24,8 @@ namespace CTF {
       int rank;
       /** \brief number of processors */
       int np;
+      /** \brief number of processors per node (optional / can be 1)*/
+      int ppn;
       /** \brief set dryRun */
       int dryRanks = 0;
       /** \brief derived topologies */
@@ -65,6 +67,7 @@ namespace CTF {
        * \param[in] argv main arguments 
        */
       World(MPI_Comm       comm = MPI_COMM_WORLD,
+            int            ppn  = 1,
             int            argc = 0,
             char * const * argv = NULL);
 
@@ -94,7 +97,7 @@ namespace CTF {
        * \param[in] dryRanks number of dry ranks
       */
 
-      World(std::string print, int dryRanks);
+      World(std::string print, int dryRanks, int ppn = 1);
 
       /**
        * \brief frees CTF library
diff --git a/src/mapping/Makefile b/src/mapping/Makefile
index d5c66a28..a609849c 100644
--- a/src/mapping/Makefile
+++ b/src/mapping/Makefile
@@ -1,10 +1,10 @@
-LOBJS = mapping.o distribution.o topology.o
+LOBJS = mapping.o distribution.o topology.o node_aware_dist.o
 OBJS = $(addprefix $(ODIR)/, $(LOBJS))
 
 ctf: $(OBJS) 
 
 #%d | r ! grep -ho "\.\..*\.h" *.cxx *.h | sort | uniq
-HDRS = ../../Makefile $(BDIR)/config.mk  ../interface/common.h ../mapping/mapping.h ../shared/util.h ../summation/sum_tsr.h ../tensor/untyped_tensor.h 
+HDRS = ../../Makefile $(BDIR)/config.mk  ../interface/common.h ../mapping/mapping.h ../mapping/node_aware_dist.h ../shared/util.h ../summation/sum_tsr.h ../tensor/untyped_tensor.h
 
 $(OBJS): $(ODIR)/%.o: %.cxx *.h  $(HDRS)
 	$(FCXX) -c $< -o $@
diff --git a/src/mapping/node_aware_dist.cxx b/src/mapping/node_aware_dist.cxx
new file mode 100644
index 00000000..f67e22b3
--- /dev/null
+++ b/src/mapping/node_aware_dist.cxx
@@ -0,0 +1,210 @@
+/* The code in this file has been written by Andreas Irmler. */
+
+#include "../tensor/untyped_tensor.h"
+#include "../shared/util.h"
+#include "node_aware_dist.h"
+using ivec  = std::vector<int>;
+using vivec = std::vector<ivec>;
+
+
+namespace CTF_int {
+
+
+  // node of the search tree: one partial assignment of node-count prime factors to the edges of the grid
+  struct Tree {
+
+    // Copy: member-wise, so the compiler-generated version is sufficient
+    Tree(Tree const &other) = default;
+
+    // Constructor 1: tree root with the given settled and open factors
+    Tree(int _order, vivec _sgf, vivec _ogf)
+      : order(_order), sgf(_sgf), ogf(_ogf) {}
+
+    // Constructor 2: child of t in which factor el is moved from the open
+    // to the settled factors of edge pos
+    Tree(Tree const &t, int pos, int el)
+      : order(t.order + 1), sgf(t.sgf), ogf(t.ogf) {
+      assert(pos >= 0);
+      size_t const p(pos);
+      assert(sgf.size() > p);
+      assert(ogf.size() > p);
+      sgf[p].push_back(el);
+      // sorted factors make equal assignments compare equal
+      std::sort(sgf[p].begin(), sgf[p].end());
+      auto it = std::find(ogf[p].begin(), ogf[p].end(), el);
+      assert(it != ogf[p].end());
+      ogf[p].erase(it);
+    }
+
+    // true if factor el is still open (unassigned) on edge pos
+    bool find(int pos, int el) const {
+      if (pos < 0 || ogf.size() <= static_cast<size_t>(pos)) {
+        printf("Find problem! order %d, size: %zu, pos: %d, el: %d\n"
+              , order, ogf.size(), pos, el);
+        assert(0);
+        return false; // do not index out of range when asserts are disabled
+      }
+      auto const &open(ogf[pos]);
+      return std::find(open.begin(), open.end(), el) != open.end();
+    }
+
+    // depth in the tree: number of factors assigned via Constructor 2
+    int order;
+    // settled grid factors, i.e. factors which are already assigned
+    vivec sgf;
+    // open grid factors, i.e. factors which can still be assigned
+    vivec ogf;
+  };
+
+
+  /**
+   * \brief prime factorization of number by trial division
+   * \param[in] number integer to factorize
+   * \return prime factors in ascending order, repeated by multiplicity;
+   *         a number below 4 is returned unchanged as the only entry
+   */
+  ivec iv_factorize(int number){
+    ivec factors;
+    int n(number);
+    // d <= n/d is d*d <= n without the risk of signed overflow
+    for (int d(2); d <= n/d; d++){
+      for (; !(n%d); n /= d) factors.push_back(d);
+    }
+    if (n > 1 || factors.empty()) factors.push_back(n);
+    return factors;
+  }
+
+  // split a comma-separated line into the integers it contains
+  ivec lineToVint(std::string line) {
+    ivec values;
+    size_t start(0);
+    for (size_t comma(line.find(",")); comma != std::string::npos;
+         comma = line.find(",", start)) {
+      values.push_back(std::stoi(line.substr(start, comma - start)));
+      start = comma + 1;
+    }
+    values.push_back(std::stoi(line.substr(start)));
+    return values;
+  }
+
+  // enumerate the distinct inter-node grids obtained by distributing the prime factors of nodes over the edges of rGrid
+  std::vector< std::vector<int> > get_inter_node_grids(std::vector<int> rGrid, int nodes){
+    int ranks(std::accumulate(rGrid.begin(), rGrid.end(), 1, std::multiplies<int>()));
+    int ranksPerNode(ranks/nodes);
+    IASSERT (ranksPerNode*nodes == ranks );
+
+	  vivec nodeGrid; // node factors forced onto each edge (root of the search tree)
+    const ivec nodeFactors(iv_factorize(nodes));
+    const ivec rankFactors(iv_factorize(ranks));
+    vivec gridFactors; // the tensor grid expressed in prime factors
+    ivec assignedFactors; // node factors which are already assigned to an edge
+    ivec openFactors; // node factors not yet assigned to an edge
+    for (auto r: rGrid) {
+      gridFactors.push_back(iv_factorize(r));
+    }
+    vivec openGridFactors; // per edge: prime factors of the edge still free to host a node factor
+
+    for (auto gf: gridFactors){
+
+      ivec others, diff;
+      // all prime factors which are not at the given edge
+      std::set_difference( rankFactors.begin()
+                        , rankFactors.end()
+                        , gf.begin()
+                        , gf.end()
+                        , std::back_inserter(others)
+                        );
+      // (std::set_difference requires sorted input ranges; the factor
+      //  lists qualify because iv_factorize emits factors in ascending
+      //  order)
+      //
+      // others is a multiset difference: a prime that occurs more often
+      // in ranks than on this edge is still contained in others
+      // is there a node factor which lives only on a given edge?
+      // if so assign this factor to this edge
+      std::set_difference( nodeFactors.begin()
+                        , nodeFactors.end()
+                        , others.begin()
+                        , others.end()
+                        , std::back_inserter(diff)
+                        );
+      assignedFactors.insert(assignedFactors.end(), diff.begin(), diff.end());
+
+      openGridFactors.resize(openGridFactors.size()+1);
+      std::set_difference( gf.begin()
+                        , gf.end()
+                        , diff.begin()
+                        , diff.end()
+                        , std::back_inserter(openGridFactors.back())
+                        );
+      if (!diff.size()) diff.push_back(1);
+      nodeGrid.push_back(diff);
+
+    }
+
+    std::sort(assignedFactors.begin(), assignedFactors.end());
+    std::set_difference( nodeFactors.begin()
+                      , nodeFactors.end()
+                      , assignedFactors.begin()
+                      , assignedFactors.end()
+                      , std::back_inserter(openFactors)
+                      );
+    // The algorithm works as follows:
+    // 1.) we pick the last element of the list, remove it from the list,
+    //     then open N branches where N is the number of possible positions
+    //     for that element in the rank grid
+    // 2.) we remove identical branches
+    // 3.) we go to step 1
+
+    size_t b(0); // NOTE(review): unused; shadowed by the b declared inside the loop below
+    size_t n(rGrid.size());
+    std::vector<Tree> treeVec;
+    treeVec.emplace_back(0, nodeGrid, openGridFactors);
+    // we loop over all still unassigned prime factors of the number of nodes
+    while (openFactors.size()){
+      // take the last element of the list and remove it from the list
+      auto f(openFactors.back());
+      openFactors.pop_back();
+
+      // we work only in the last layer of the tree
+      // we have to find the begin/end in the whole vector
+      auto o(treeVec.back().order);
+      auto b(std::distance( treeVec.begin()
+                          , std::find_if( treeVec.begin()
+                                        , treeVec.end()
+                                        , [o] (const Tree &a)
+                                          { return a.order == o;}
+                                        )
+                          ));
+
+      auto e(treeVec.size());
+      // loop over the last layer of the tree and distribute the
+      // element to all possible positions
+      // however: if a potential element is already in the list,
+      //          do not add it
+      for (size_t t(b); t < e; t++){
+        for (auto i(0); i < n; i++)
+        if ( treeVec[t].find(i, f) ){
+          bool distinct(true);
+          auto cand = Tree(treeVec[t], i, f);
+          for (size_t n(e); n < treeVec.size(); n++){ // this n shadows the outer n; it indexes the newly added layer
+            if (cand.sgf == treeVec[n].sgf) distinct = false;
+          }
+          if (distinct) treeVec.push_back(cand);
+        }
+      }
+    }
+
+    std::vector< std::vector<int> > inter_node_grids;
+    for (auto tv: treeVec) {
+      if (treeVec.back().order == tv.order) {
+        std::vector<int> sgf;
+        for (auto s: tv.sgf) {
+          sgf.push_back(std::accumulate(s.begin(), s.end(), 1, std::multiplies<int>()));
+        }
+        inter_node_grids.push_back(sgf);
+      }
+    }
+    return inter_node_grids;
+  }
+}
diff --git a/src/mapping/node_aware_dist.h b/src/mapping/node_aware_dist.h
new file mode 100644
index 00000000..41f09006
--- /dev/null
+++ b/src/mapping/node_aware_dist.h
@@ -0,0 +1,16 @@
+/*Copyright (c) 2022, Edgar Solomonik, all rights reserved.*/
+
+#ifndef __INT_NODE_DISTRIBUTION_H__
+#define __INT_NODE_DISTRIBUTION_H__
+#include <vector>
+
+namespace CTF_int {
+  /**
+   * \brief returns all possible valid choices of inter-node grids, given an overall processor grid and a number of nodes
+   * \param[in] rGrid overall processor grid
+   * \param[in] nodes number of nodes
+   * \return vector of inter node processor grids of total size equal to the number of nodes and of same dimension as rGrid, where each dimension divides into the respective dimension of rGrid
+   */
+  std::vector<std::vector<int> > get_inter_node_grids(std::vector<int> rGrid, int nodes);
+}
+#endif
diff --git a/src/mapping/topology.cxx b/src/mapping/topology.cxx
index 02b6eae1..b7980699 100644
--- a/src/mapping/topology.cxx
+++ b/src/mapping/topology.cxx
@@ -3,6 +3,7 @@
 #include "topology.h"
 #include "../shared/util.h"
 #include "../mapping/mapping.h"
+#include <vector>
 
 #ifdef BGQ
 #include "mpix.h"
@@ -17,7 +18,41 @@ namespace CTF_int {
     is_activated = false;
     dim_comm     = NULL;
   }*/
-  
+  // map a rank in the reordered (torus) numbering back to the node-by-node numbering; looks like the inverse of get_topo_reorder_rank when lens[i] is a multiple of intra_node_lens[i]
+  int get_inv_topo_reorder_rank(int order, int const * lens, int const * intra_node_lens, int new_rank){
+    int irank = new_rank; // part of new_rank not yet decomposed
+    int intra_node_rank = 0;
+    int node_rank = 0;
+    int lda_node_rank = 1;
+    int lda_intra_node_rank = 1;
+    for (int i=0; i<order; i++){
+      intra_node_rank += (irank%intra_node_lens[i])*lda_intra_node_rank; // position inside the node along dimension i
+      node_rank += ((irank%lens[i])/intra_node_lens[i])*lda_node_rank; // index of the node along dimension i
+      irank = irank / lens[i];
+      lda_node_rank = lda_node_rank*(lens[i]/intra_node_lens[i]);
+      lda_intra_node_rank = lda_intra_node_rank*intra_node_lens[i];
+    }
+    return intra_node_rank + lda_intra_node_rank*node_rank; // lda_intra_node_rank is now the product of all intra_node_lens
+  }
+  // map a rank in node-by-node numbering to its rank in the torus numbering with strides lda
+  int get_topo_reorder_rank(int order, int const * lens, int const * lda, int const * intra_node_lens, int rank){
+    int num_intra_node = 1; // product of intra_node_lens, i.e. processes per node
+    for (int i=0; i<order; i++){
+      num_intra_node *= intra_node_lens[i];
+    }
+    int intra_node_rank = rank % num_intra_node;
+    int node_rank = rank / num_intra_node;
+    int new_rank = 0;
+    for (int i=0; i<order; i++){
+      int i_node_rank = node_rank % (lens[i]/intra_node_lens[i]);
+      node_rank = node_rank / (lens[i]/intra_node_lens[i]);
+      int i_intra_node_rank = intra_node_rank % intra_node_lens[i];
+      intra_node_rank = intra_node_rank / intra_node_lens[i];
+      new_rank += (i_node_rank*intra_node_lens[i] + i_intra_node_rank)*lda[i]; // coordinate along dimension i times its stride
+    }
+    return new_rank;
+  }
+
   topology::~topology(){
     deactivate();
     CTF_int::cdealloc(lens);
@@ -25,7 +60,7 @@ namespace CTF_int {
     CTF_int::cdealloc(dim_comm);
   }
 
-  topology::topology(topology const & other) : glb_comm(other.glb_comm) {
+  topology::topology(topology const & other) : glb_comm(other.glb_comm), unord_glb_comm(other.unord_glb_comm) {
     order        = other.order;
 
     lens         = (int*)CTF_int::alloc(order*sizeof(int));
@@ -40,32 +75,69 @@ namespace CTF_int {
     }
 
     is_activated = other.is_activated;
+    is_reordered = other.is_reordered;
+  }
+
+  void topology::morph_to(topology const & other){ // take over the grid and communicators of other in place
+    ASSERT(order == other.order);
+    ASSERT(!is_reordered || !other.is_reordered); // at most one of the two topologies may be reordered
+    memcpy(lens, other.lens, order*sizeof(int));
+    memcpy(lda, other.lda, order*sizeof(int));
+
+    // overwrite the communicators element by element; the lens, lda and dim_comm arrays are reused, not reallocated
+    for (int i=0; i<order; i++){
+      dim_comm[i] = CommData(other.dim_comm[i]);
+    }
+
+    is_activated = other.is_activated;
+    is_reordered = other.is_reordered;
+    glb_comm = other.glb_comm;
+    unord_glb_comm = other.unord_glb_comm;
   }
 
   topology::topology(int         order_,
                      int const * lens_,
                      CommData    cdt,
-                     bool        activate) : glb_comm(cdt) {
+                     bool        activate,
+                     int const * intra_node_lens) : unord_glb_comm(cdt), glb_comm(cdt) {
     order        = order_;
     lens         = (int*)CTF_int::alloc(order_*sizeof(int));
     lda          = (int*)CTF_int::alloc(order_*sizeof(int));
     dim_comm     = (CommData*)CTF_int::alloc(order_*sizeof(CommData));
     is_activated = false;
-   
+
     memcpy(lens, lens_, order_*sizeof(int));
     //reverse FIXME: this is assumed somewhere...
 //    for (int i=0; i<order; i++){
 //      lens[i] = lens_[order-i-1];
 //    }
- 
+
+    lda[0] = 1;
+    for (int i = 1; i < order; i++) {
+      lda[i] = lda[i-1] * lens[i-1];
+    }
+
+    if (intra_node_lens == NULL){
+      is_reordered = false;
+      //glb_comm = cdt;
+    } else {
+      int new_rank = get_topo_reorder_rank(order, lens, lda, intra_node_lens, cdt.rank);
+      is_reordered = true;
+      glb_comm = CommData(new_rank, 0, cdt.np);
+    }
     int stride = 1, cut = 0;
     int rank = glb_comm.rank;
     for (int i=0; i<order; i++){
       lda[i] = stride;
-      dim_comm[i] = CommData(((rank/stride)%lens[i]),
-                             (((rank/(stride*lens[i]))*stride)+cut),
-                             lens[i]);
-//      SETUP_SUB_COMM_SHELL(cdt, dim_comm[i],
+      if (intra_node_lens == NULL)
+        dim_comm[i] = CommData(((rank/stride)%lens[i]),
+                               (((rank/(stride*lens[i]))*stride)+cut),
+                               lens[i]);
+      else
+        dim_comm[i] = CommData(((rank/stride)%lens[i]),
+                               (((rank/(stride*lens[i]))*stride)+cut),
+                               lens[i],
+                               intra_node_lens[i]);
       stride*=lens[i];
       cut = (rank - (rank/stride)*stride);
     }
@@ -75,10 +147,11 @@ namespace CTF_int {
 
   void topology::activate(){
     if (!is_activated){
+      if (is_reordered) glb_comm.activate(unord_glb_comm.cm);
       for (int i=0; i<order; i++){
         dim_comm[i].activate(glb_comm.cm);
       }
-    } 
+    }
     is_activated = true;
   }
 
@@ -87,7 +160,8 @@ namespace CTF_int {
       for (int i=0; i<order; i++){
         dim_comm[i].deactivate();
       }
-    } 
+      if (is_reordered) glb_comm.deactivate();
+    }
     is_activated = false;
   }
 
@@ -141,7 +215,7 @@ namespace CTF_int {
         topo = new topology(dim, topo_dims, glb_comm, 1);
         CTF_int::cdealloc(topo_dims);
         return topo;
-      } else 
+      } else
       #endif
       {
         int order;
@@ -397,34 +471,36 @@ namespace CTF_int {
     }
   }
 
-  /** 
+  /**
    * \brief computes all unique factorizations into non-primes each yielding a topology, prepending additional factors as specified
    * \param[in] cdt global communicator
    * \param[in] n_uf number of unique prime factors
    * \param[in] uniq_fact list of prime factors
    * \param[in] n_prepend number of factors to prepend
-   * \param[in] mults ? 
+   * \param[in] mults multiplicities of each factor
    * \param[in] prelens factors to prepend
    * \return lens vector of factorizations
    */
-  std::vector< topology* > get_all_topos(CommData cdt, int n_uf, int const * uniq_fact, int const * mults, int n_prepend, int const * prelens){
-    std::vector<topology*> topos;
+  std::vector< std::vector<int>* > get_all_shapes_rec(int n_uf, int const * uniq_fact, int const * mults, int n_prepend, int const * prelens){
+    std::vector< std::vector<int>* > shapes;
 
+    // enumerate the number of different possible numbers (including 1) that divide (with remainder 0) the number of processors
     int num_divisors = 1;
     for (int i=0; i<n_uf; i++){
       num_divisors *= (1+mults[i]);
       ASSERT(num_divisors < 1E6);
     }
-    
+
     if (num_divisors == 1){
-      topos.push_back(new topology(n_prepend, prelens, cdt));
-      return topos;
+      shapes.push_back(new std::vector<int>(prelens,prelens+n_prepend));
+      return shapes;
     }
     int sub_mults[n_uf];
     int new_prelens[n_prepend+1];
     memcpy(new_prelens, prelens, n_prepend*sizeof(int));
     //FIXME: load may be highly imbalanced
     //for (int div=cdt.rank; div<num_divisors; div+=cdt.np)
+    //iterate through all possible divisors
     for (int div=1; div<num_divisors; div++){
       //memcpy(sub_mults, mults, n_uf*sizeof(int));
       int dmults[n_uf];
@@ -437,37 +513,40 @@ namespace CTF_int {
         len0 *= std::pow(uniq_fact[i], dmults[i]);
       }
       new_prelens[n_prepend] = len0;
-      std::vector< topology* > new_topos = get_all_topos(cdt, n_uf, uniq_fact, sub_mults, n_prepend+1, new_prelens);
+      std::vector< std::vector<int>* > new_shapes = get_all_shapes_rec(n_uf, uniq_fact, sub_mults, n_prepend+1, new_prelens);
       //FIXME call some append function?
-      for (unsigned i=0; i<new_topos.size(); i++){
-        topos.push_back(new_topos[i]);
+      for (unsigned i=0; i<new_shapes.size(); i++){
+        shapes.push_back(new_shapes[i]);
       }
     }
-    return topos;
+    return shapes;
   }
 
-  std::vector< topology* > get_generic_topovec(CommData cdt){
-    std::vector<topology*> topovec;
-
+  /**
+   * \brief generate all possible factorizations of size into divisors
+   * \param[in] size number that the factors should multiply to
+   * \return all possible collections of natural numbers that multiply to size (excluding 1s)
+   */
+  std::vector< std::vector<int>* > get_all_shapes(int size){
     int nfact, * factors;
-    factorize(cdt.np, &nfact, &factors);
+    factorize(size, &nfact, &factors);
     if (nfact <= 1){
-      topovec.push_back(new topology(nfact, factors, cdt));
-      if (cdt.np >= 7 && cdt.rank == 0) 
-        DPRINTF(1,"CTF WARNING: using a world with a prime number of processors may lead to very bad performance\n");
+      std::vector<std::vector<int>*> shapes;
+      shapes.push_back(new std::vector<int>(factors, factors+nfact));
       if (nfact > 0) cdealloc(factors);
-      return topovec;
+      return shapes;
     }
     std::sort(factors,factors+nfact);
+    //compute number of unique factors
     int n_uf = 1;
     assert(factors[0] != 1);
     for (int i=1; i<nfact; i++){
       if (factors[i] != factors[i-1]) n_uf++;
     }
-    if (n_uf >= 3){
-      if (cdt.rank == 0) 
-        DPRINTF(1,"CTF WARNING: using a world with a number of processors that contains 3 or more unique prime factors may lead to suboptimal performance, when possible use p=2^k3^l processors for some k,l\n");
-    }
+    //if (n_uf >= 3){
+    //  if (cdt.rank == 0)
+    //    DPRINTF(1,"CTF WARNING: using a world with a number of processors that contains 3 or more unique prime factors may lead to suboptimal performance, when possible use p=2^k3^l processors for some k,l\n");
+    //}
     int uniq_fact[n_uf];
     int mults[n_uf];
     int i_uf = 0;
@@ -481,7 +560,30 @@ namespace CTF_int {
       } else mults[i_uf]++;
     }
     cdealloc(factors);
-    return get_all_topos(cdt, n_uf, uniq_fact, mults, 0, NULL);
+    std::vector< std::vector<int> * > shapes = get_all_shapes_rec(n_uf, uniq_fact, mults, 0, NULL);
+    return shapes;
+  }
+
+
+  // one new topology on cdt per shape; data() (unlike operator[](0)) is well defined for an empty shape (nfact == 0 in get_all_shapes)
+  std::vector< topology* > create_topos_from_shapes(std::vector< std::vector<int>* > shapes, CommData cdt){
+    std::vector< topology* > topos;
+    for (std::vector<int> const * shape : shapes)
+      topos.push_back(new topology(static_cast<int>(shape->size()), shape->data(), cdt));
+    return topos;
+  }
+
+  std::vector< topology* > get_generic_topovec(CommData cdt){ // one topology per shape returned by get_all_shapes(cdt.np)
+    std::vector< std::vector<int> * > shapes = get_all_shapes(cdt.np);
+    std::vector< topology* > topos = create_topos_from_shapes(shapes, cdt);
+    for (int i=0; i<(int)shapes.size(); i++){
+      delete shapes[i]; // each shape was allocated with new
+    }
+
+    if (shapes.size() == 1 && cdt.np >= 7 && cdt.rank == 0) // presumably a single shape means np is prime; only the vector size is read after the deletes
+      DPRINTF(1,"CTF WARNING: using a world with a prime number of processors may lead to very bad performance\n");
+    return topos;
+
   }
 
 
@@ -493,7 +595,7 @@ namespace CTF_int {
     bool changed;
     /*int i=0;
     do {
-      for (int j=0; j< perm_vec[i]->order; 
+      for (int j=0; j< perm_vec[i]->order;
     } while(i<perm_vec.size();*/
     do {
 //      printf("HERE %d %d %d %d\n",perm_vec[0]->order, perm_vec.size(), perm_vec[0]->lens[0], perm_vec[0]->lens[1]);
@@ -538,9 +640,9 @@ namespace CTF_int {
                                       CommData         glb_comm){
     std::vector< topology* > topos;
     topos.push_back(new topology(*topo));
-    
+
     if (topo->order <= 1) return topos;
-    
+
     int * new_lens = (int*)alloc(sizeof(int)*topo->order-1);
 
     for (int i=0; i<topo->order-1; i++){
@@ -567,12 +669,12 @@ namespace CTF_int {
     }
     return topos;
   }
-    
+
   int find_topology(topology const *           topo,
                     std::vector< topology* > & topovec){
     int i, j, found;
     std::vector< topology* >::iterator iter;
-    
+
     found = -1;
     for (j=0, iter=topovec.begin(); iter!=topovec.end(); iter++, j++){
       if ((*iter)->order == topo->order){
@@ -585,7 +687,7 @@ namespace CTF_int {
       }
       if (found != -1) return found;
     }
-    return -1;  
+    return -1;
   }
 
   int get_best_topo(int64_t  nvirt,
@@ -638,8 +740,8 @@ namespace CTF_int {
     CommData *   sub_phys_comm;
     int * comm_idx;
     mapping const * map;
-    memset(phys_mapped, 0, topo->order*sizeof(int));  
-    
+    memset(phys_mapped, 0, topo->order*sizeof(int));
+
     num_sub_phys_dims = 0;
 
     for (i=0; i<order_A; i++){
@@ -648,7 +750,7 @@ namespace CTF_int {
         phys_mapped[map->cdt] = 1;
         if (map->has_child) map = map->child;
         else break;
-      } 
+      }
     }
     for (i=0; i<order_B; i++){
       map = &edge_map_B[i];
@@ -656,7 +758,7 @@ namespace CTF_int {
         phys_mapped[map->cdt] = 1;
         if (map->has_child) map = map->child;
         else break;
-      } 
+      }
     }
 
     num_sub_phys_dims = 0;
@@ -680,7 +782,7 @@ namespace CTF_int {
 
   }
 
-  int can_morph(topology const * topo_keep, 
+  int can_morph(topology const * topo_keep,
                 topology const * topo_change){
     int i, j, lda;
     lda = 1;
@@ -715,7 +817,7 @@ namespace CTF_int {
           do {
             for (j=0; j<new_topo->order; j++){
               if (new_topo->lda[j] == old_lda) break;
-            } 
+            }
             ASSERT(j!=new_topo->order);
             new_rec_map->type   = PHYSICAL_MAP;
             new_rec_map->cdt    = j;
@@ -749,7 +851,7 @@ namespace CTF_int {
             break;
           }
         }
-        edge_map[i].clear();      
+        edge_map[i].clear();
         edge_map[i] = *new_map;
         CTF_int::cdealloc(new_map);
       }
diff --git a/src/mapping/topology.h b/src/mapping/topology.h
index 6194f690..d6f034ec 100644
--- a/src/mapping/topology.h
+++ b/src/mapping/topology.h
@@ -13,41 +13,78 @@ namespace CTF_int {
   /* \brief mesh/torus topology configuration */
   class topology {
     public:
+      // number of dimensions in torus
       int        order;
+      // lengths of dimensions
       int *      lens;
+      // lda[i] = lens[i-1] * ... * lens[0]
       int *      lda;
+      // global communicator is reordered if intra-node grid is provided
+      int        is_reordered;
+      // whether dim_comm communicators have been activated
       bool       is_activated;
+
+      // list of communicators along fibers of each dimension of torus
       CommData * dim_comm;
+      // global communicator, ordered as in torus given by dim_comm
       CommData   glb_comm;
+      // global communicator, ordered as given, assuming processors are ordered as [processes in node 1], [processes in node 2], etc.
+      CommData   unord_glb_comm;
 
       //topology();
       ~topology();
 
-      /** 
+      /**
        * \brief copy constructor
        * \param[in] other topology to copy
        */
       topology(topology const & other);
 
       /**
-       * \brief constructs torus topology 
+       * \brief overwrite this topology with communicators of another, without reallocating CommData objects, allowing to 'hot-swap' this topology for another, propagating the change through already created ctr objects
+       * \param[in] other topology to copy
+       */
+      void morph_to(topology const & other);
+
+
+      /**
+       * \brief constructs torus topology; if intra_node_lens is NULL, the p processors are folded into a torus, otherwise each set of prod(intra_node_lens) processors is mapped to different modes of the processor grid, e.g., if lens_ = [6,4] and intra_node_lens=[3,2] (6 processes per node), the processors are assigned as
+       * [[ 0  1  2  6  7  8 ],
+       *  [ 3  4  5  9  10 11],
+       *  [ 12 13 14 18 19 20],
+       *  [ 15 16 17 21 22 23]]
        * \param[in] order_ number of torus dimensions
        * \param[in] lens_ lengths of torus dimensions
-       * \param[in] cdt communicator for whole torus 
+       * \param[in] cdt communicator for whole torus
        * \param[in] activate whether to create MPI_Comms
+       * \param[in] intra_node_lens lengths of intra-node processor grid
        */
       topology(int         order_,
                int const * lens_,
                CommData    cdt,
-               bool        activate=false);
-     
-      /* \brief create (split off) MPI communicators, re-entrant */ 
+               bool        activate=false,
+               int const * intra_node_lens=NULL);
+
+      /* \brief create (split off) MPI communicators, re-entrant */
       void activate();
 
       /* \breif free MPI communicators, re-entrant */
       void deactivate();
   };
 
+  /**
+   * \brief determine this processor's rank in the global communicator given by reordering nodes so that they adhere to the assignment described in the constructor of the topology() object, assuming initial order is node by node
+   * \param[in] order number of torus dimensions
+   * \param[in] lens lengths of torus dimensions
+   * \param[in] lda prefix product of lengths of torus dimensions
+   * \param[in] intra_node_lens lengths of intra-node processor grid
+   * \param[in] rank rank of this processor in the initial (node by node) order
+   */
+  int get_topo_reorder_rank(int order, int const * lens, int const * lda, int const * intra_node_lens, int rank);
+
+  int get_inv_topo_reorder_rank(int order, int const * lens, int const * intra_node_lens, int new_rank);
+
+
   /**
    * \brief get dimension and torus lengths of specified topology
    *
@@ -57,6 +94,15 @@ namespace CTF_int {
   topology * get_phys_topo(CommData glb_comm,
                            TOPOLOGY mach);
 
+
+  /**
+   * \brief generate all possible factorizations of size into divisors
+   * \param[in] size number that the factors should multiply to
+   * \return all possible collections of natural numbers that multiply to size (excluding 1s)
+   */
+  std::vector< std::vector<int>* > get_all_shapes(int size);
+
+
   /**
    * \brief computes all topology configurations given undelying physical topology information
    * \param[in] cdt global communicator
@@ -87,7 +133,7 @@ namespace CTF_int {
   int find_topology(topology const *           topo,
                     std::vector< topology* > & topovec);
 
- 
+
   /**
    * \brief get the best topologoes (least nvirt) over all procs
    * \param[in] nvirt best virtualization achieved by this proc
@@ -102,7 +148,7 @@ namespace CTF_int {
                      CommData global_comm,
                      int64_t  bcomm_vol=0,
                      int64_t  bmemuse=0);
-   
+
 
   /**
    * \brief extracts the set of physical dimensions still available for mapping

From 5094ae9f4099dad9a381779bc713d07f0035d832 Mon Sep 17 00:00:00 2001
From: Andreas Irmler <andreas.irmler@tuwien.ac.at>
Date: Mon, 2 May 2022 08:31:45 +0200
Subject: [PATCH 16/19] fix segfaults in dryRun

---
 src/tensor/untyped_tensor.cxx | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/tensor/untyped_tensor.cxx b/src/tensor/untyped_tensor.cxx
index 6e81d1f7..b4cdbf4d 100644
--- a/src/tensor/untyped_tensor.cxx
+++ b/src/tensor/untyped_tensor.cxx
@@ -42,7 +42,7 @@ namespace CTF_int {
   void tensor::free_self(){
     if (order > -1){
       if (wrld->rank == 0) DPRINTF(3,"Deleted order %d tensor %s\n",order,name);
-      if (is_folded) unfold();
+      if (is_folded && !wrld->dryRanks) unfold();
       //if (is_folded) unfold(0,1);
       cdealloc(sym);
       cdealloc(lens);
@@ -308,7 +308,7 @@ namespace CTF_int {
         }*/
         this->home_size = other->home_size;
         register_size(this->home_size*sr->el_size);
-        this->home_buffer = sr->alloc(other->home_size);
+        if (!wrld->dryRanks) this->home_buffer = sr->alloc(other->home_size);
         if (other->is_home){
           this->is_home = 1;
           this->data = this->home_buffer;
@@ -316,14 +316,16 @@ namespace CTF_int {
           /*if (this->is_home || this->home_size != other->home_size){
           }*/
           this->is_home = 0;
-          sr->copy(this->home_buffer, other->home_buffer, other->home_size);
-          //CTF_int::alloc_ptr(other->size*sr->el_size, (void**)&this->data);
-          this->data = sr->alloc(other->size);
+          if (!wrld->dryRanks){
+            sr->copy(this->home_buffer, other->home_buffer, other->home_size);
+            //CTF_int::alloc_ptr(other->size*sr->el_size, (void**)&this->data);
+            this->data = sr->alloc(other->size);
+          }
         }
         this->has_home = 1;
       } else {
         //CTF_int::alloc_ptr(other->size*sr->el_size, (void**)&this->data);
-        this->data = sr->alloc(other->size);
+        if (!wrld->dryRanks) this->data = sr->alloc(other->size);
 /*          if (this->has_home && !this->is_home){
           CTF_int::cdealloc(this->home_buffer);
         }*/
@@ -332,9 +334,9 @@ namespace CTF_int {
       }
   #else
       //CTF_int::alloc_ptr(other->size*sr->el_size, (void**)&this->data);
-      this->data = sr->alloc(other->size);
+      if (!wrld->dryRanks) this->data = sr->alloc(other->size);
   #endif
-      sr->copy(this->data, other->data, other->size);
+      if (!wrld->dryRanks) sr->copy(this->data, other->data, other->size);
     } else {
       ASSERT(this->is_sparse);
       has_home = other->has_home;

From 63bbf1809af6da28759d3cdc6a905519a3bbd78a Mon Sep 17 00:00:00 2001
From: Andreas Irmler <andreas.irmler@tuwien.ac.at>
Date: Mon, 9 May 2022 08:05:39 +0200
Subject: [PATCH 17/19] add node awareness to dryRun branch

---
 src/contraction/contraction.cxx    | 41 ++++++++++++++++++++++------
 src/contraction/ctr_2d_general.cxx |  6 ++--
 src/contraction/ctr_comm.cxx       | 15 ++++------
 src/interface/common.cxx           | 44 ++++++++++++++++++++----------
 src/interface/common.h             | 10 +++++--
 src/interface/world.cxx            |  3 +-
 src/interface/world.h              |  4 ++-
 src/mapping/topology.cxx           | 42 ++++++++++++++++++++++++++--
 8 files changed, 123 insertions(+), 42 deletions(-)

diff --git a/src/contraction/contraction.cxx b/src/contraction/contraction.cxx
index ac91dd99..1f527378 100644
--- a/src/contraction/contraction.cxx
+++ b/src/contraction/contraction.cxx
@@ -2839,10 +2839,11 @@ namespace CTF_int {
         est_time = memuse;
 #endif
 
-        if (A->wrld->dryRanks) printf( "t %ld j %d will use %f GB per rank and take %f s, %f %f %f"
-                                     , t, j, memuse/1024.0/1024./1024
-                                     , est_time, redist_time, contr_time, fold_time);
-        if (A->wrld->dryRanks) C->print_map();
+        if (A->wrld->dryRanks && A->wrld->verbose == 2)
+          printf( "t %ld j %d will use %f GB per rank and take %f s, %f %f %f"
+                , t, j, memuse/1024.0/1024./1024
+                , est_time, redist_time, contr_time, fold_time);
+        if (A->wrld->dryRanks && A->wrld->verbose == 2) C->print_map();
 
         ASSERT(est_time >= 0.0);
         if ((int64_t)memuse >= max_memuse){
@@ -4424,11 +4425,36 @@ namespace CTF_int {
     B->print_map();
     C->print_map();
     ctrf->print();
+#define NODE_AWARE 1
+#ifdef NODE_AWARE
+    if (C->wrld->ppn){
+      topology orig_topo = *(C->topo);
+      std::vector<int> pe_grid(orig_topo.lens, orig_topo.lens + orig_topo.order);
+      std::vector<std::vector<int> > inter_node_grids =
+        CTF_int::get_inter_node_grids(pe_grid, C->wrld->dryRanks/C->wrld->ppn);
+      int * intra_node_lens = (int*)CTF_int::alloc(orig_topo.order*sizeof(int));
+      double comm_vol_ref = ctrf->est_internode_comm_vol_rec(ctrf->num_lyr);
+      printf("Ref: %f\n", comm_vol_ref/1024.0/1024.0/1024.0);
+      for (size_t i=0; i<inter_node_grids.size(); i++){
+        for (int j=0; j<orig_topo.order; j++)
+          intra_node_lens[j] = orig_topo.lens[j] / inter_node_grids[i][j];
+        topology na_topo_i(orig_topo.order, orig_topo.lens, orig_topo.glb_comm, 0, intra_node_lens);
+        C->topo->morph_to(na_topo_i);
+        double comm_vol_i = ctrf->est_internode_comm_vol_rec(ctrf->num_lyr);
+        for (int j=0; j < orig_topo.order; j++) printf("%d ", inter_node_grids[i][j]);
+        printf("-> %f\n", comm_vol_i/1024.0/1024.0/1024.0);
+
+        C->topo->morph_to(orig_topo);
+      }
+      cdealloc(intra_node_lens);
+    }
+#endif
     delete ctrf;
     TAU_FSTOP(contract);
     return SUCCESS;
   }
 
+
   #if (VERBOSE >= 1 || DEBUG >= 1)
   if (global_comm.rank == 0){
     ctrf->print();
@@ -4464,7 +4490,6 @@ namespace CTF_int {
   #endif
 
 
-#define NODE_AWARE 1
 #ifdef NODE_AWARE
     TAU_FSTART(node_aware_remapping);
     /* reorder processor grid to account for node-awareness */
@@ -4472,7 +4497,7 @@ namespace CTF_int {
     int64_t node_aware_send_to_rank(0);
     int64_t node_aware_recv_from_rank(0);
     // FIXME: support sparsity
-    if (C->wrld->ppn != 1 && !is_sparse()){
+    if (C->wrld->ppn && !is_sparse()){
       std::vector<int> pe_grid(orig_topo.lens, orig_topo.lens + orig_topo.order);
       std::vector<std::vector<int> > inter_node_grids = CTF_int::get_inter_node_grids(pe_grid, C->wrld->np/C->wrld->ppn);
       //std::vector< std::vector<int> > intra_node_grids = CTF_int::get_all_shapes(C->wrld->ppn()){
@@ -4609,7 +4634,7 @@ namespace CTF_int {
     TAU_FSTART(node_aware_backmapping);
     /* reorder processor grid to account for node-awareness */
     // FIXME: support sparsity
-    if (C->wrld->ppn != 1 && !is_sparse() && orig_topo.glb_comm.rank != node_aware_send_to_rank){
+    if (C->wrld->ppn && !is_sparse() && orig_topo.glb_comm.rank != node_aware_send_to_rank){
       TAU_FSTART(redistribute_for_node_aware);
       // FIXME: to support sparsity need to also communicate nnz information here
       MPI_Status stat;
@@ -4622,7 +4647,7 @@ namespace CTF_int {
       MPI_Sendrecv_replace(C->data, C->size, C->sr->mdtype(), node_aware_recv_from_rank, 1327, node_aware_send_to_rank, 1327, orig_topo.glb_comm.cm, &stat);
       TAU_FSTOP(redistribute_for_node_aware);
     }
-    if (C->wrld->ppn != 1 && !is_sparse()) {
+    if (C->wrld->ppn  && !is_sparse()) {
       C->topo->morph_to(orig_topo);
     }
     TAU_FSTOP(node_aware_backmapping);
diff --git a/src/contraction/ctr_2d_general.cxx b/src/contraction/ctr_2d_general.cxx
index 844f7310..0a7f0b7d 100755
--- a/src/contraction/ctr_2d_general.cxx
+++ b/src/contraction/ctr_2d_general.cxx
@@ -271,11 +271,11 @@ namespace CTF_int {
     find_bsizes(b_A, b_B, b_C, s_A, s_B, s_C, aux_size);
     double sz = 0.0;
     if (move_A)
-      sz += (sr_A->el_size*s_A) * ((cdt_A->np / cdt_A->intra_node_np) - 1);
+      sz += (sr_A->el_size*s_A) * (cdt_A->num_nodes - 1);// ((cdt_A->np / cdt_A->intra_node_np) - 1);
     if (move_B)
-      sz += (sr_B->el_size*s_B) * ((cdt_B->np / cdt_B->intra_node_np) - 1);
+      sz += (sr_B->el_size*s_B) * (cdt_B->num_nodes - 1);// ((cdt_B->np / cdt_B->intra_node_np) - 1);
     if (move_C)
-      sz += (sr_C->el_size*s_C) * ((cdt_C->np / cdt_C->intra_node_np) - 1);
+      sz += (sr_C->el_size*s_C) * (cdt_C->num_nodes - 1); //((cdt_C->np / cdt_C->intra_node_np) - 1);
     return (sz*(double)edge_len)/MIN(nlyr,edge_len);
   }
 
diff --git a/src/contraction/ctr_comm.cxx b/src/contraction/ctr_comm.cxx
index 693cebe2..63f659f4 100755
--- a/src/contraction/ctr_comm.cxx
+++ b/src/contraction/ctr_comm.cxx
@@ -196,15 +196,12 @@ namespace CTF_int {
   double ctr_replicate::est_internode_comm_vol_rec(int nlyr) {
     int i;
     double sz = 0.;
-    for (i = 0; i < ncdt_A; i++) {
-      sz += (size_A*sr_A->el_size) * ((cdt_A[i]->np / cdt_A[i]->intra_node_np) - 1);
-    }
-    for (i = 0; i < ncdt_B; i++) {
-      sz += (size_B*sr_B->el_size) * ((cdt_B[i]->np / cdt_B[i]->intra_node_np) - 1);
-    }
-    for (i = 0; i < ncdt_C; i++) {
-      sz += (size_C*sr_C->el_size) * ((cdt_C[i]->np / cdt_C[i]->intra_node_np) - 1);
-    }
+    for (i = 0; i < ncdt_A; i++)
+      sz += (size_A*sr_A->el_size) * (cdt_A[i]->num_nodes - 1);
+    for (i = 0; i < ncdt_B; i++)
+      sz += (size_B*sr_B->el_size) * (cdt_B[i]->num_nodes - 1);
+    for (i = 0; i < ncdt_C; i++)
+      sz += (size_C*sr_C->el_size) * (cdt_C[i]->num_nodes - 1);
     return rec_ctr->est_internode_comm_vol_rec(nlyr) + sz;
   }
 
diff --git a/src/interface/common.cxx b/src/interface/common.cxx
index df88ba85..a56c4a81 100644
--- a/src/interface/common.cxx
+++ b/src/interface/common.cxx
@@ -265,23 +265,29 @@ namespace CTF_int {
   }
 
   CommData::CommData(CommData const & other){
-    cm      = other.cm;
-    alive   = other.alive;
-    rank    = other.rank;
-    np      = other.np;
-    color   = other.color;
+    cm            = other.cm;
+    alive         = other.alive;
+    rank          = other.rank;
+    np            = other.np;
+    color         = other.color;
+    global_rank   = other.global_rank;
+    node_id       = other.node_id;
+    num_nodes     = other.num_nodes;
     intra_node_np = other.intra_node_np;
-    created = 0;
+    created       = 0;
   }
 
   CommData& CommData::operator=(CommData const & other){
-    cm      = other.cm;
-    alive   = other.alive;
-    rank    = other.rank;
-    np      = other.np;
-    color   = other.color;
+    cm            = other.cm;
+    alive         = other.alive;
+    rank          = other.rank;
+    np            = other.np;
+    color         = other.color;
+    global_rank   = other.global_rank;
+    node_id       = other.node_id;
+    num_nodes     = other.num_nodes;
     intra_node_np = other.intra_node_np;
-    created = 0;
+    created       = 0;
     return *this;
   }
 
@@ -290,15 +296,20 @@ namespace CTF_int {
     cm = cm_;
     MPI_Comm_rank(cm, &rank);
     MPI_Comm_size(cm, &np);
-    intra_node_np = 1;
+    MPI_Comm_rank(MPI_COMM_WORLD,&global_rank);
+    intra_node_np = 0;
     alive = 1;
     created = 0;
   }
 
-  CommData::CommData(int rank_, int color_, int np_, int intra_node_np_){
+  CommData::CommData(
+    int rank_, int color_, int np_, int num_nodes_, int global_rank_, int intra_node_np_
+  ){
     rank          = rank_;
     color         = color_;
     np            = np_;
+    num_nodes     = num_nodes_;
+    global_rank   = global_rank_;
     intra_node_np = intra_node_np_;
     alive         = 0;
     created       = 0;
@@ -310,7 +321,8 @@ namespace CTF_int {
     ASSERT(parent.alive);
     MPI_Comm_split(parent.cm, color, rank_, &cm);
     MPI_Comm_size(cm, &np);
-    intra_node_np = 1;
+    global_rank = parent.global_rank;
+    intra_node_np = 0;
     alive   = 1;
     created = 1;
   }
@@ -616,6 +628,7 @@ namespace CTF_int {
       lda *= lens[i];
     }
   }
+
 /*
 #define USE_CUST_DBL_CMPLX 0
 
@@ -663,4 +676,5 @@ namespace CTF_int {
     return is_new;
   }
 
+
 }
diff --git a/src/interface/common.h b/src/interface/common.h
index d60cbcd9..cde870b9 100644
--- a/src/interface/common.h
+++ b/src/interface/common.h
@@ -159,7 +159,13 @@ namespace CTF_int {
       int created;
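-      // intra_node_np, number of processes per node (intra-node grid dimension) corresponding to this communicator, if provided, 1 otherwise
+      // intra_node_np, number of processes per node (intra-node grid dimension) corresponding to this communicator, if provided, 0 otherwise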
       int intra_node_np;
-  
+      // global rank
+      int global_rank;
+      // node id
+      int node_id;
+      // number of distinct nodes in the communicator
+      int num_nodes;
+
       CommData();
       ~CommData();
 
@@ -180,7 +186,7 @@ namespace CTF_int {
        * \param[in] np number of processors within this comm
        * \param[in] intra_node_np number of processors per physical node
        */
-      CommData(int rank, int color, int np, int intra_node_np=1);
+      CommData(int rank, int color, int np, int num_nodes, int glbRank,  int intra_node_np=0);
 
       /**
        * \brief create active subcomm from parent comm which must be active
diff --git a/src/interface/world.cxx b/src/interface/world.cxx
index 844e5f0f..bc052c63 100644
--- a/src/interface/world.cxx
+++ b/src/interface/world.cxx
@@ -58,7 +58,6 @@ namespace CTF {
   World::World(int            argc,
                char * const * argv){
     comm = MPI_COMM_WORLD;
-    ppn = 1;
 #ifdef BGQ
     this->init(comm, TOPOLOGY_BGQ, argc, argv);
 #else
@@ -92,6 +91,7 @@ namespace CTF {
     comm = MPI_COMM_WORLD;
     dryRanks = dryRanks_;
     ppn = ppn_;
+    if (print == "high") verbose = 2;
 
     this->init(comm, TOPOLOGY_GENERIC);
   }
@@ -103,7 +103,6 @@ namespace CTF {
                int             argc,
                char * const *  argv){
     comm = comm_;
-    ppn = 1;
     this->init(comm, order, lens, argc, argv);
   }
 
diff --git a/src/interface/world.h b/src/interface/world.h
index 07743efb..39fef3db 100644
--- a/src/interface/world.h
+++ b/src/interface/world.h
@@ -25,9 +25,11 @@ namespace CTF {
       /** \brief number of processors */
       int np;
       /** \brief number of processors per node (optional / can be 1)*/
-      int ppn;
+      int ppn = 0;
       /** \brief set dryRun */
       int dryRanks = 0;
+      /** \brief verbosity of dryRun */
+      int verbose = 1;
       /** \brief derived topologies */
       std::vector< CTF_int::topology* > topovec;
       /** \brief whether the world has been initialized */
diff --git a/src/mapping/topology.cxx b/src/mapping/topology.cxx
index b7980699..9a87418e 100644
--- a/src/mapping/topology.cxx
+++ b/src/mapping/topology.cxx
@@ -9,6 +9,8 @@
 #include "mpix.h"
 #endif
 
+using ipair = std::pair<int, int>;
+
 namespace CTF_int {
 /*
   topology::topology(){
@@ -123,20 +125,56 @@ namespace CTF_int {
     } else {
       int new_rank = get_topo_reorder_rank(order, lens, lda, intra_node_lens, cdt.rank);
       is_reordered = true;
-      glb_comm = CommData(new_rank, 0, cdt.np);
+      glb_comm = CommData(new_rank, 0, cdt.np, cdt.num_nodes, cdt.global_rank);
     }
     int stride = 1, cut = 0;
     int rank = glb_comm.rank;
+    std::vector<int> num_nodes(order);
+    std::vector< std::vector<ipair> > como(order, std::vector<ipair> (cdt.np));
+    for (int r(0); r < cdt.np; r++){
+      int stride =1, cut = 0;
+      for (size_t i=0; i<order; i++){
+        como[i][r] = {(((r/(stride*lens[i]))*stride)+cut), r/72};
+        stride*=lens[i];
+        cut = (r - (r/stride)*stride);
+      }
+    }
+    // sort for the same color
+    for (auto &c: como) std::sort(c.begin(), c.end());
+    for (int i=0; i< order; i++){
+      std::vector<ipair> sameColor;
+      std::copy_if( como[i].begin()
+                  , como[i].end()
+                  , std::back_inserter(sameColor)
+                  , [](ipair &a){ return a.first == 0;}
+                  );
+      std::sort( sameColor.begin()
+               , sameColor.end()
+               , [](ipair &a, ipair &b){return a.second < b.second;}
+               );
+      num_nodes[i] = std::distance( sameColor.begin()
+                                  , std::unique( sameColor.begin()
+                                               , sameColor.end()
+                                               , [](ipair &a, ipair &b)
+                                                 { return a.second == b.second;}
+                                               )
+                                  );
+    }
+
     for (int i=0; i<order; i++){
       lda[i] = stride;
       if (intra_node_lens == NULL)
         dim_comm[i] = CommData(((rank/stride)%lens[i]),
                                (((rank/(stride*lens[i]))*stride)+cut),
-                               lens[i]);
+                               lens[i],
+                               num_nodes[i],
+                               rank);
       else
         dim_comm[i] = CommData(((rank/stride)%lens[i]),
                                (((rank/(stride*lens[i]))*stride)+cut),
                                lens[i],
+                               lens[i] / intra_node_lens[i],
+                               rank,
                                intra_node_lens[i]);
       stride*=lens[i];
       cut = (rank - (rank/stride)*stride);

From 64246ca1d77763bdca8af977b6f109113e277188 Mon Sep 17 00:00:00 2001
From: Andreas Irmler <andreas.irmler@tuwien.ac.at>
Date: Mon, 5 Aug 2024 23:20:38 +0200
Subject: [PATCH 18/19] Add cc4s-model-trainer and change dump of TUNE data: -
 minor changes in the output of dry-run - switch on exhaustive in (almost)
 every contraction - don't consider 'shape' in gemm contractions - new feature:
 only add model parameters which were actually tuned - change: dump only the
 slowest measurement from all ranks

---
 Makefile                        |   2 +-
 bench/model_trainer_cc4s.cxx    | 151 ++++++++++++++++++++++++++++++++
 src/contraction/contraction.cxx |  20 ++---
 src/contraction/ctr_tsr.cxx     |   3 +-
 src/shared/init_models.cxx      |   2 +-
 src/shared/model.cxx            |  58 +++++++++++-
 src/shared/model.h              |   8 +-
 7 files changed, 225 insertions(+), 19 deletions(-)
 create mode 100644 bench/model_trainer_cc4s.cxx

diff --git a/Makefile b/Makefile
index 85c8f2f1..875f6f1c 100644
--- a/Makefile
+++ b/Makefile
@@ -39,7 +39,7 @@ uninstall:
 EXAMPLES = algebraic_multigrid apsp bitonic_sort btwn_central ccsd checkpoint dft_3D fft force_integration force_integration_sparse jacobi matmul neural_network particle_interaction qinformatics recursive_matmul scan sparse_mp3 sparse_permuted_slice spectral_element spmv sssp strassen trace mis mis2 ao_mo_transf block_sparse checkpoint_sparse hosvd mttkrp fft_with_idx_partition
 TESTS = bivar_function bivar_transform ccsdt_map_test ccsdt_t3_to_t2 dft diag_ctr diag_sym endomorphism_cust endomorphism_cust_sp endomorphism gemm_4D multi_tsr_sym permute_multiworld readall_test readwrite_test repack scalar speye sptensor_sum subworld_gemm sy_times_ns test_suite univar_function weigh_4D  reduce_bcast
 
-BENCHMARKS = bench_contraction bench_nosym_transp bench_redistribution model_trainer
+BENCHMARKS = model_trainer_cc4s model_trainer bench_contraction bench_nosym_transp
 
 SCALAPACK_TESTS = qr svd eigh
 
diff --git a/bench/model_trainer_cc4s.cxx b/bench/model_trainer_cc4s.cxx
new file mode 100644
index 00000000..11c4ad09
--- /dev/null
+++ b/bench/model_trainer_cc4s.cxx
@@ -0,0 +1,151 @@
+/** Copyright (c) 2011, Edgar Solomonik, all rights reserved.
+  * \addtogroup benchmarks
+  * @{
+  * \addtogroup model_trainer
+  * @{
+  * \brief Executes a set of different contractions on different processor counts to train model parameters
+  */
+
+#include <ctf.hpp>
+#define TEST_SUITE
+#include "../examples/ccsd.cxx"
+#include "../examples/sparse_mp3.cxx"
+#undef TEST_SUITE
+using namespace CTF;
+
+namespace CTF_int{
+  void update_all_models(MPI_Comm comm);
+}
+
+struct Ccsd_dimensions {
+  int64_t No;
+  int64_t Nv;
+  int64_t Nx;
+  int64_t Ng;
+};
+
+Ccsd_dimensions get_ccsd_dimensions(double mem_per_core, int64_t nvfac, World &dw) {
+  int np;
+  MPI_Comm_size(dw.comm, &np);
+  int64_t No(10); 
+  while ( No*No*No*No*nvfac*nvfac*8./np/1024/1024 < mem_per_core) No++;
+  return Ccsd_dimensions({No, No*nvfac, No, (int64_t) No*nvfac*2.5});
+}
+
+void ph_contraction(int64_t No, int64_t Nv, World &dw) {
+  int64_t vvoo[] = {Nv, Nv, No, No};
+  int syms[] = {NS, NS, NS, NS};
+  CTF::Tensor< double > T(4, vvoo, syms, dw, "T");
+  CTF::Tensor< double > V(4, vvoo, syms, dw, "V");
+  CTF::Tensor< double > R(4, vvoo, syms, dw, "R");
+  V.fill_random(0, 1);
+  T.fill_random(0, 1);
+  R["abij"] = T["acik"] * V["cbkj"];
+}
+
+void ggv_contraction(int64_t Nv, int64_t Nx, int64_t Ng, World &dw) {
+  int64_t gxv[] = {Ng, Nx, Nv};
+  int64_t vvxx[] = {Nv, Nv, Nx, Nx};
+  int syms[] = {NS, NS, NS, NS};
+  CTF::Tensor< double > G(3,  gxv, syms, dw, "B");
+  CTF::Tensor< double > V(4, vvxx, syms, dw, "C");
+  G.fill_random(0, 1);
+  V["cdxy"] = G["Gxc"] * G["Gyd"];
+}
+
+void rvt_contraction(int64_t No, int64_t Nv, int64_t Nx, World &dw) {
+  int64_t vvoo[] = {Nv, Nv, No, No};
+  int64_t xxoo[] = {Nx, Nx, No, No};
+  int64_t vvxx[] = {Nv, Nv, Nx, Nx};
+  int syms[] = {NS, NS, NS, NS};
+  CTF::Tensor< double > T(4, vvoo, syms, dw, "T");
+  CTF::Tensor< double > V(4, vvxx, syms, dw, "V");
+  CTF::Tensor< double > R(4, xxoo, syms, dw, "R");
+  V.fill_random(0, 1);
+  T.fill_random(0, 1);
+  R["abij"] = V["xyab"] * T["xyij"];
+}
+
+void train_ccsd(World & dw, double mem_per_core, int64_t nvfac, int c_id){
+  auto dim = get_ccsd_dimensions(mem_per_core, nvfac, dw);
+  if (c_id & 1) ph_contraction(dim.No, dim.Nv, dw);
+  if (c_id & 2) ggv_contraction(dim.Nv, dim.Nx, dim.Ng, dw); 
+  if (c_id & 4) rvt_contraction(dim.No, dim.Nv, dim.Nx, dw); 
+}
+
+
+
+void train_all(std::string dump_file){
+  World dw(MPI_COMM_WORLD);
+  int rank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+
+  // number of iterations for training
+  int num_iterations = 2, rounds = 2;
+
+  for (int i=0; i<num_iterations; i++){
+    if (rank == 0){
+      printf("Starting iteration %d/%d\n", i+1,num_iterations);
+    }
+    for (int j(0); j < rounds; j++) {
+      train_ccsd(dw, 10.,  8, 7);
+      train_ccsd(dw, 10., 12, 7);
+      train_ccsd(dw, 25.,  8, 7);
+      train_ccsd(dw, 25., 12, 7);
+      train_ccsd(dw, 25., 16, 7);
+      CTF_int::update_all_models(dw.comm);
+      if (rank == 0) printf("Completed training round %d/%d\n", j, rounds);
+    }
+  }
+
+
+//  CTF_int::write_all_models(coeff_file);
+  if (rank == 0) CTF_int::print_all_models();
+
+  if (dump_file.size()) CTF_int::dump_touched_models(dump_file);
+
+}
+
+char* getCmdOption(char ** begin,
+                   char ** end,
+                   const   std::string & option){
+  char ** itr = std::find(begin, end, option);
+  if (itr != end && ++itr != end){
+    return *itr;
+  }
+  return 0;
+}
+
+
+int main(int argc, char ** argv){
+  int rank, np;
+  int const in_num = argc;
+  char ** input_str = argv;
+
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &np);
+
+
+  // Boolean expression that are used to pass command line argument to function train_all
+  std::string dump_file = "./data";
+
+  {
+    World dw(MPI_COMM_WORLD, argc, argv);
+
+    if (rank == 0){
+      printf("we train\n");
+    }
+    train_all(dump_file);
+  }
+
+
+  MPI_Finalize();
+  return 0;
+}
+
+/**
+ * @}
+ * @}
+ */
diff --git a/src/contraction/contraction.cxx b/src/contraction/contraction.cxx
index 690590e5..3769b996 100644
--- a/src/contraction/contraction.cxx
+++ b/src/contraction/contraction.cxx
@@ -111,7 +111,6 @@ namespace CTF_int {
     //if (A->wrld->cdt.cm == MPI_COMM_WORLD){
 //      update_all_models(A->wrld->cdt.cm);
     //}
-   
     int stat = home_contract();
     if (stat != SUCCESS){
       printf("CTF ERROR: Failed to perform contraction\n");
@@ -2838,10 +2837,10 @@ namespace CTF_int {
         est_time = memuse;
 #endif
 
-        if (A->wrld->dryRanks) printf( "t %d j %d will use %f GB per rank and take %f s, %f %f %f"
-                                     , t, j, memuse/1024.0/1024./1024
-                                     , est_time, redist_time, contr_time, fold_time);
-        if (A->wrld->dryRanks) C->print_map();
+//        if (A->wrld->dryRanks) printf( "t %d j %d will use %f GB per rank and take %f s, %f %f %f"
+//                                     , t, j, memuse/1024.0/1024./1024
+//                                     , est_time, redist_time, contr_time, fold_time);
+//        if (A->wrld->dryRanks) C->print_map();
 
         ASSERT(est_time >= 0.0);
         if ((int64_t)memuse >= max_memuse){
@@ -2950,7 +2949,7 @@ namespace CTF_int {
         est_time = memuse;
 #endif
         ASSERT(est_time >= 0.0);
-        if (A->wrld->dryRanks) printf( "topo %d order %d will use %f GB per rank and take %f s, %f %f %f\n"
+        if (A->wrld->dryRanks) printf( "topo %d order %d will use %f GB per rank and take %f s (%f %f %f, redist/contraction/folding)\n"
                                      , i, j, memuse/1024.0/1024./1024, est_time, redist_time, contr_time, fold_time);
 
 
@@ -3092,7 +3091,7 @@ namespace CTF_int {
       A->set_padding();
       B->set_padding();
       C->set_padding();
-      if (gbest_time_sel < 1e100){
+      if (gbest_time_sel > 1e100){
         gbest_time_exh = gbest_time_sel+1.;
         ttopo_exh = ttopo_sel;
       } else {
@@ -3226,7 +3225,7 @@ namespace CTF_int {
       int64_t memuse;
       double est_time, redist_time, contr_time, fold_time;
       detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time, redist_time, contr_time, fold_time);
-      printf( "Contraction will use %f GB per rank and take %f s, %f %f %f\n"
+      printf( "Contraction will use %f GB per rank and take %f s (%f %f %f, redist/contraction/folding)\n"
             , memuse/1024.0/1024./1024, est_time, redist_time, contr_time, fold_time);
     }
 
@@ -4419,10 +4418,11 @@ namespace CTF_int {
 
 
   if (A->wrld->dryRanks){
+// iran: this is the silent version
     A->print_map();
     B->print_map();
     C->print_map();
-    ctrf->print();
+    //ctrf->print();
     delete ctrf;
     TAU_FSTOP(contract);
     return SUCCESS;
@@ -5215,7 +5215,6 @@ namespace CTF_int {
         return SUCCESS;
       }
     }
-
     contraction new_ctr = contraction(*this);
 
     was_home_A = A->is_home;
@@ -5284,6 +5283,7 @@ namespace CTF_int {
     }
 
     ret = new_ctr.sym_contract();//&ntype, ftsr, felm, alpha, beta);
+
     if (ret!= SUCCESS) return ret;
     if (C->wrld->dryRanks) return SUCCESS;
     if (was_home_C) new_ctr.C->unfold();
diff --git a/src/contraction/ctr_tsr.cxx b/src/contraction/ctr_tsr.cxx
index cb449e64..bd398ec1 100755
--- a/src/contraction/ctr_tsr.cxx
+++ b/src/contraction/ctr_tsr.cxx
@@ -458,7 +458,7 @@ namespace CTF_int {
         if (inner_params.offload)
           return seq_tsr_ctr_mdl_off.est_time(ps);
         else
-          return seq_tsr_ctr_mdl_inr.est_time(ps)*fac;
+          return seq_tsr_ctr_mdl_inr.est_time(ps);
       }
     } else
       return seq_tsr_ctr_mdl_ref.est_time(ps);
@@ -501,6 +501,7 @@ namespace CTF_int {
 
     if (!sr) return;
 #endif
+
     if (is_custom && !is_inner){
       double st_time = MPI_Wtime();
       ASSERT(is_inner == 0);
diff --git a/src/shared/init_models.cxx b/src/shared/init_models.cxx
index 14a29a37..9c1d0153 100644
--- a/src/shared/init_models.cxx
+++ b/src/shared/init_models.cxx
@@ -16,7 +16,7 @@ namespace CTF_int{
   double seq_tsr_ctr_mdl_cst_off_init[] = {8.4844E-04, 5.9246E-11, 3.5247E-10};
   double long_contig_transp_mdl_init[] = {0.0, 1.25E-08};
   double shrt_contig_transp_mdl_init[] = {0.0, 1.25E-08};
-  double non_contig_transp_mdl_init[] = {2.6680E-05, 4.6247E-06};
+  double non_contig_transp_mdl_init[] = {2.6680E-05, 8.6247E-08};
   double seq_tsr_spctr_cst_off_k0_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
   double seq_tsr_spctr_cst_off_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
   double seq_tsr_spctr_cst_off_k2_init[] = {2.1996E-04, 3.1883E-09, 3.8743E-11};
diff --git a/src/shared/model.cxx b/src/shared/model.cxx
index ef17ebf4..f8a31e0d 100644
--- a/src/shared/model.cxx
+++ b/src/shared/model.cxx
@@ -53,6 +53,14 @@ namespace CTF_int {
 #endif
   }
 
+  void dump_touched_models(std::string path){
+#ifdef TUNE
+    for (int i=0; i<(int)get_all_models().size(); i++){
+      get_all_models()[i]->dump_data(path, true);
+    }
+#endif
+  }
+
 #define SPLINE_CHUNK_SZ = 8
 
   double cddot(int n,       const double *dX,
@@ -245,7 +253,7 @@ namespace CTF_int {
     //if (nobs % tune_interval == 0){
 
     //define the number of cols in the matrix to be the min of the number of observations and
-    //the number we are willing to store (hist_size)
+    //the number we are willing to store (hist_size)
     int nrcol = std::min(nobs,(int64_t)hist_size);
     //max of the number of local observations and nparam (will usually be the former)
     int ncol = std::max(nrcol, nparam);
@@ -697,11 +705,12 @@ namespace CTF_int {
   }
 
   template <int nparam>
-  void LinModel<nparam>::dump_data(std::string path){
+  void LinModel<nparam>::dump_data(std::string path, bool dump_only_touched){
     int rank = 0;
     int np, my_rank;
     MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
     MPI_Comm_size(MPI_COMM_WORLD, &np);
+/*
     while(rank < np){
         if (rank == my_rank){
         // Open the file
@@ -721,6 +730,7 @@ namespace CTF_int {
         int num_records = std::min(nobs, (int64_t)hist_size);
         for(int i=0; i<num_records; i++){
             std::string instance = "";
+           ofs << i << " ";
            for(int j=0; j<mat_lda; j++){
              ofs<<time_param_mat[i*mat_lda+j]<<" ";
            }
@@ -731,6 +741,48 @@ namespace CTF_int {
       rank++;
       MPI_Barrier(MPI_COMM_WORLD);
     }
+*/
+    int num_records = std::min(nobs, (int64_t)hist_size);
+    bool dump = true;
+    if (dump_only_touched) dump = (bool) num_records;
+    std::vector<double> local_times(num_records), max_times(local_times);
+    int min_records(0), max_records(0);
+    for (int i=0; i < num_records; i++) {
+      local_times[i] = time_param_mat[i*mat_lda];
+    }
+    MPI_Allreduce(&num_records, &max_records, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
+    MPI_Allreduce(&num_records, &min_records, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+    assert(max_records == min_records);
+    if (max_records > 0 && max_records == min_records) {
+      MPI_Reduce(local_times.data(), max_times.data(), num_records, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+      if (!my_rank && dump) {
+        // Open the file
+        std::ofstream ofs;
+        std::string model_name = std::string(name);
+        ofs.open(path+"/"+model_name, std::ofstream::out | std::ofstream::app);
+
+        // Dump the model coeffs
+        ofs << "Coeff: ";
+        for(int i=0; i<nparam; i++){
+          ofs << coeff_guess[i] << " ";
+        }
+        ofs << std::endl;
+
+
+        // Dump the training data
+        int num_records = std::min(nobs, (int64_t)hist_size);
+        for(int i=0; i<num_records; i++){
+          std::string instance = "";
+          ofs << max_times[i];
+          for(int j=1; j<mat_lda; j++){
+            ofs << " " << time_param_mat[i*mat_lda+j];
+          }
+          ofs<<"\n";
+        }
+        ofs.close();
+      }
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
   }
 
 
@@ -832,7 +884,7 @@ namespace CTF_int {
   }
 
   template <int nparam>
-  void CubicModel<nparam>::dump_data(std::string path){
+  void CubicModel<nparam>::dump_data(std::string path, bool dump_only_touched){
-    lmdl.dump_data(path);
+    lmdl.dump_data(path, dump_only_touched);
   }
 
diff --git a/src/shared/model.h b/src/shared/model.h
index e9b3a7d8..206f6bda 100644
--- a/src/shared/model.h
+++ b/src/shared/model.h
@@ -20,7 +20,7 @@ namespace CTF_int {
       virtual void print_uo(){};
       virtual void load_coeff(std::string file_name){};
       virtual void write_coeff(std::string file_name){};
-      virtual void dump_data(std::string path){};
+      virtual void dump_data(std::string path, bool dump_only_touched = false){};
   };
 
   void update_all_models(MPI_Comm cm);
@@ -28,6 +28,7 @@ namespace CTF_int {
   void load_all_models(std::string file_name);
   void write_all_models(std::string file_name);
   void dump_all_models(std::string path);
+  void dump_touched_models(std::string path);
 
   /**
    * \brief Linear performance models, which given measurements, provides new model guess
@@ -138,7 +139,8 @@ namespace CTF_int {
       /**
        * \brief dump model data to a file
        */
-      void dump_data(std::string path);
+      void dump_data(std::string path, bool dump_only_touched = false);
+
   };
 
   /**
@@ -216,7 +218,8 @@ namespace CTF_int {
-       * \brief write model coefficients to file
+       * \brief dump model data to a file
        * \param[in] path the path that we wish to dump all files to
+       * \param[in] dump_only_touched NOTE(review): presumably restricts the dump to models that were touched during this run -- confirm against the implementation
        */
-      void dump_data(std::string path);
+      void dump_data(std::string path, bool dump_only_touched = false);
 
   };
 

From dcdcf234ec1b03ccd5007db9cde5f4e6d82fc49e Mon Sep 17 00:00:00 2001
From: Andreas Irmler <andreas.irmler@tuwien.ac.at>
Date: Thu, 8 Aug 2024 10:02:03 +0200
Subject: [PATCH 19/19] minor changes in cc4s_model_trainer

---
 bench/model_trainer_cc4s.cxx | 71 +++++++++++++++++++++++-------------
 1 file changed, 45 insertions(+), 26 deletions(-)

diff --git a/bench/model_trainer_cc4s.cxx b/bench/model_trainer_cc4s.cxx
index 11c4ad09..85201a47 100644
--- a/bench/model_trainer_cc4s.cxx
+++ b/bench/model_trainer_cc4s.cxx
@@ -6,6 +6,8 @@
   * \brief Executes a set of different contractions on different processor counts to train model parameters
   */
 
+#include <sys/types.h>
+#include <sys/stat.h>
 #include <ctf.hpp>
 #define TEST_SUITE
 #include "../examples/ccsd.cxx"
@@ -27,12 +29,24 @@ struct Ccsd_dimensions {
+/**
+ * \brief picks tensor extents so that a size estimate reaches a requested threshold
+ *
+ * Starting from No = 10, No is incremented until
+ * No^4 * nvfac^2 * 8 / np / 1024^2 is no longer below mem_per_core,
+ * where np is the size of dw.comm.
+ * \param[in] mem_per_core threshold for the estimate above (presumably MB per process -- confirm)
+ * \param[in] nvfac factor between the first and the second returned extent
+ * \param[in] dw world whose communicator size enters the estimate
+ * \return Ccsd_dimensions initialized with {No, No*nvfac, No, 2.5*No*nvfac truncated to int64_t}
+ */
 Ccsd_dimensions get_ccsd_dimensions(double mem_per_core, int64_t nvfac, World &dw) {
   int np;
   MPI_Comm_size(dw.comm, &np);
-  int64_t No(10); 
+  int64_t No(10);
   while ( No*No*No*No*nvfac*nvfac*8./np/1024/1024 < mem_per_core) No++;
-  return Ccsd_dimensions({No, No*nvfac, No, (int64_t) No*nvfac*2.5});
+  // Convert the whole product: a C-style cast in front of it binds to No alone,
+  // which leaves a double inside the braced initializer.
+  return Ccsd_dimensions({No, No*nvfac, No, static_cast<int64_t>(No*nvfac*2.5)});
 }
 
-void ph_contraction(int64_t No, int64_t Nv, World &dw) {
+void ph1_contraction(int64_t No, int64_t Nv, World &dw) {
   int64_t vvoo[] = {Nv, Nv, No, No};
   int syms[] = {NS, NS, NS, NS};
   CTF::Tensor< double > T(4, vvoo, syms, dw, "T");
@@ -43,6 +45,23 @@ void ph_contraction(int64_t No, int64_t Nv, World &dw) {
   R["abij"] = T["acik"] * V["cbkj"];
 }
 
+/**
+ * \brief contracts T["acik"] with V["kbcj"] into R["abij"] on randomly filled tensors
+ * \param[in] No,Nv extents: T and R are Nv x Nv x No x No, V is No x Nv x Nv x No
+ * \param[in] dw world the three tensors are created on
+ */
+void ph2_contraction(int64_t No, int64_t Nv, World &dw) {
+  int sym_flags[] = {NS, NS, NS, NS};
+  int64_t len_vvoo[] = {Nv, Nv, No, No};
+  int64_t len_ovvo[] = {No, Nv, Nv, No};
+  CTF::Tensor< double > tsr_T(4, len_vvoo, sym_flags, dw, "T");
+  CTF::Tensor< double > tsr_V(4, len_ovvo, sym_flags, dw, "V");
+  CTF::Tensor< double > tsr_R(4, len_vvoo, sym_flags, dw, "R");
+  tsr_V.fill_random(0, 1);
+  tsr_T.fill_random(0, 1);
+  tsr_R["abij"] = tsr_T["acik"] * tsr_V["kbcj"];
+}
+
 void ggv_contraction(int64_t Nv, int64_t Nx, int64_t Ng, World &dw) {
   int64_t gxv[] = {Ng, Nx, Nv};
   int64_t vvxx[] = {Nv, Nv, Nx, Nx};
@@ -68,34 +82,43 @@ void rvt_contraction(int64_t No, int64_t Nv, int64_t Nx, World &dw) {
 
+/**
+ * \brief runs the contractions selected by c_id on tensors sized via get_ccsd_dimensions
+ * \param[in] dw world handed to every contraction
+ * \param[in] mem_per_core passed to get_ccsd_dimensions
+ * \param[in] nvfac passed to get_ccsd_dimensions
+ * \param[in] c_id bit mask: 1 = ph1, 2 = ph2, 4 = ggv, 8 = rvt
+ */
 void train_ccsd(World & dw, double mem_per_core, int64_t nvfac, int c_id){
-  auto dim = get_ccsd_dimensions(mem_per_core, nvfac, dw);
-  if (c_id & 1) ph_contraction(dim.No, dim.Nv, dw);
-  if (c_id & 2) ggv_contraction(dim.Nv, dim.Nx, dim.Ng, dw); 
-  if (c_id & 4) rvt_contraction(dim.No, dim.Nv, dim.Nx, dw); 
+  const auto sizes = get_ccsd_dimensions(mem_per_core, nvfac, dw);
+  const bool want_ph1 = (c_id & 1) != 0;
+  const bool want_ph2 = (c_id & 2) != 0;
+  const bool want_ggv = (c_id & 4) != 0;
+  const bool want_rvt = (c_id & 8) != 0;
+  if (want_ph1) { ph1_contraction(sizes.No, sizes.Nv, dw); }
+  if (want_ph2) { ph2_contraction(sizes.No, sizes.Nv, dw); }
+  if (want_ggv) { ggv_contraction(sizes.Nv, sizes.Nx, sizes.Ng, dw); }
+  if (want_rvt) { rvt_contraction(sizes.No, sizes.Nv, sizes.Nx, dw); }
 }
 
 
 
-void train_all(std::string dump_file){
-  World dw(MPI_COMM_WORLD);
+void train_all(std::string dump_path, int num_iterations, int rounds, int ppn){
+  World dw("hallo", 0, ppn);
   int rank;
   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 
 
-  // number of iterations for training
-  int num_iterations = 2, rounds = 2;
-
   for (int i=0; i<num_iterations; i++){
     if (rank == 0){
       printf("Starting iteration %d/%d\n", i+1,num_iterations);
     }
     for (int j(0); j < rounds; j++) {
-      train_ccsd(dw, 10.,  8, 7);
-      train_ccsd(dw, 10., 12, 7);
-      train_ccsd(dw, 25.,  8, 7);
-      train_ccsd(dw, 25., 12, 7);
-      train_ccsd(dw, 25., 16, 7);
+      train_ccsd(dw, 10.,  8, 15);
+      train_ccsd(dw, 10., 12, 15);
+      train_ccsd(dw, 25.,  8, 15);
+      train_ccsd(dw, 25., 12, 15);
+      train_ccsd(dw, 25., 16, 15);
       CTF_int::update_all_models(dw.comm);
-      if (rank == 0) printf("Completed training round %d/%d\n", j, rounds);
+      if (rank == 0) printf("Completed training round %d/%d\n", j+1, rounds);
     }
   }
 
@@ -103,7 +115,7 @@ void train_all(std::string dump_file){
 //  CTF_int::write_all_models(coeff_file);
   if (rank == 0) CTF_int::print_all_models();
 
-  if (dump_file.size()) CTF_int::dump_touched_models(dump_file);
+  if (dump_path.size()) CTF_int::dump_touched_models(dump_path);
 
 }
 
@@ -127,18 +139,31 @@ int main(int argc, char ** argv){
   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
   MPI_Comm_size(MPI_COMM_WORLD, &np);
 
+  std::string dump_path("./data");
+  int iterations(3), rounds(3), ppn(0);
+  if (getCmdOption(input_str, input_str+in_num, "-write")){
+    dump_path = getCmdOption(input_str, input_str+in_num, "-write");
+  }
+  if (getCmdOption(input_str, input_str+in_num, "-ppn")){
+    ppn = atoi(getCmdOption(input_str, input_str+in_num, "-ppn"));
+  }
 
-  // Boolean expression that are used to pass command line argument to function train_all
-  std::string dump_file = "./data";
-
-  {
-    World dw(MPI_COMM_WORLD, argc, argv);
-
-    if (rank == 0){
-      printf("we train\n");
+  // Only rank 0 prepares the dump directory; problems are reported but do not abort the run.
+  struct stat info;
+  if (!rank) {
+    if (stat(dump_path.c_str(), &info) == 0) {
+      if (S_ISDIR(info.st_mode)) {
+        printf( "Warning: dumping data into existing directory %s.\n", dump_path.c_str() );
+      } else {
+        printf( "Warning: %s exists but is not a directory.\n", dump_path.c_str() );
+      }
+    } else if (mkdir(dump_path.c_str(), 0777) != 0) {
+      printf( "Warning: could not create directory %s.\n", dump_path.c_str() );
     }
-    train_all(dump_file);
+    printf("we train\n");
   }
+  train_all(dump_path, iterations, rounds, ppn);
 
 
   MPI_Finalize();