diff --git a/Makefile b/Makefile
index 85c8f2f1..875f6f1c 100644
--- a/Makefile
+++ b/Makefile
@@ -39,7 +39,7 @@ uninstall:
 EXAMPLES = algebraic_multigrid apsp bitonic_sort btwn_central ccsd checkpoint dft_3D fft force_integration force_integration_sparse jacobi matmul neural_network particle_interaction qinformatics recursive_matmul scan sparse_mp3 sparse_permuted_slice spectral_element spmv sssp strassen trace mis mis2 ao_mo_transf block_sparse checkpoint_sparse hosvd mttkrp fft_with_idx_partition
 TESTS = bivar_function bivar_transform ccsdt_map_test ccsdt_t3_to_t2 dft diag_ctr diag_sym endomorphism_cust endomorphism_cust_sp endomorphism gemm_4D multi_tsr_sym permute_multiworld readall_test readwrite_test repack scalar speye sptensor_sum subworld_gemm sy_times_ns test_suite univar_function weigh_4D  reduce_bcast
 
-BENCHMARKS = bench_contraction bench_nosym_transp bench_redistribution model_trainer
+BENCHMARKS = model_trainer_cc4s model_trainer bench_contraction bench_nosym_transp
 
 SCALAPACK_TESTS = qr svd eigh
 
diff --git a/bench/model_trainer_cc4s.cxx b/bench/model_trainer_cc4s.cxx
new file mode 100644
index 00000000..85201a47
--- /dev/null
+++ b/bench/model_trainer_cc4s.cxx
@@ -0,0 +1,170 @@
+/** Copyright (c) 2011, Edgar Solomonik, all rights reserved.
+  * \addtogroup benchmarks
+  * @{
+  * \addtogroup model_trainer
+  * @{
+  * \brief Executes a set of different contractions on different processor counts to train model parameters
+  */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <ctf.hpp>
+#define TEST_SUITE
+#include "../examples/ccsd.cxx"
+#include "../examples/sparse_mp3.cxx"
+#undef TEST_SUITE
+using namespace CTF;
+
+namespace CTF_int{
+  void update_all_models(MPI_Comm comm);
+}
+
+/**
+ * \brief Edge lengths used to size the tensors of the benchmark contractions.
+ *
+ * The members are aggregate-initialised in the order No, Nv, Nx, Ng by
+ * get_ccsd_dimensions().
+ */
+struct Ccsd_dimensions {
+  int64_t No, Nv;  // No is found by search; Nv is set to No*nvfac
+  int64_t Nx, Ng;  // Nx is set to No; Ng is set to about 2.5*No*nvfac
+};
+
+/**
+ * \brief Chooses benchmark dimensions that reach a given amount of data per rank.
+ *
+ * Starting from No = 10, No is incremented until No^4 * nvfac^2 eight-byte
+ * elements, divided by the number of ranks in dw's communicator, amount to at
+ * least mem_per_core (measured in units of 2^20 bytes).
+ *
+ * \param[in] mem_per_core target amount of data per rank, in MiB
+ * \param[in] nvfac factor between Nv and No; with nvfac == 0 and a positive
+ *                  mem_per_core the search never terminates
+ * \param[in] dw world whose communicator size is used as the rank count
+ * \return {No, Nv = No*nvfac, Nx = No, Ng = 2.5*No*nvfac truncated to an integer}
+ */
+Ccsd_dimensions get_ccsd_dimensions(double mem_per_core, int64_t nvfac, World &dw) {
+  int np;
+  MPI_Comm_size(dw.comm, &np);
+  int64_t No(10);
+  while ( No*No*No*No*nvfac*nvfac*8./np/1024/1024 < mem_per_core) No++;
+  // Convert the whole product: `(int64_t) No*nvfac*2.5` casts only No and leaves
+  // a double, which must not be narrowed to int64_t inside a braced initialiser.
+  const int64_t Ng = static_cast<int64_t>(No*nvfac*2.5);
+  return Ccsd_dimensions({No, No*nvfac, No, Ng});
+}
+
+/**
+ * \brief Runs the contraction R["abij"] = T["acik"] * V["cbkj"].
+ *
+ * T, V and R are order-4 tensors on dw with edge lengths {Nv, Nv, No, No}
+ * and NS in every symmetry slot. V and T are filled through
+ * fill_random(0, 1) before the contraction; R is not inspected afterwards.
+ *
+ * \param[in] No edge length of the last two modes
+ * \param[in] Nv edge length of the first two modes
+ * \param[in] dw world the tensors are defined on
+ */
+void ph1_contraction(int64_t No, int64_t Nv, World &dw) {
+  int const order = 4;
+  int64_t lens[] = {Nv, Nv, No, No};
+  int sym[] = {NS, NS, NS, NS};
+  CTF::Tensor<double> T(order, lens, sym, dw, "T");
+  CTF::Tensor<double> V(order, lens, sym, dw, "V");
+  CTF::Tensor<double> R(order, lens, sym, dw, "R");
+  V.fill_random(0, 1);
+  T.fill_random(0, 1);
+  R["abij"] = T["acik"] * V["cbkj"];
+}
+
+/**
+ * \brief Runs the contraction R["abij"] = T["acik"] * V["kbcj"].
+ *
+ * T and R have edge lengths {Nv, Nv, No, No}, V has {No, Nv, Nv, No}; all
+ * three are order-4 tensors on dw with NS in every symmetry slot. V and T are
+ * filled through fill_random(0, 1); R is not inspected afterwards.
+ *
+ * \param[in] No edge length of the "o" modes
+ * \param[in] Nv edge length of the "v" modes
+ * \param[in] dw world the tensors are defined on
+ */
+void ph2_contraction(int64_t No, int64_t Nv, World &dw) {
+  int const order = 4;
+  int64_t lens_vvoo[] = {Nv, Nv, No, No};
+  int64_t lens_ovvo[] = {No, Nv, Nv, No};
+  int sym[] = {NS, NS, NS, NS};
+  CTF::Tensor<double> T(order, lens_vvoo, sym, dw, "T");
+  CTF::Tensor<double> V(order, lens_ovvo, sym, dw, "V");
+  CTF::Tensor<double> R(order, lens_vvoo, sym, dw, "R");
+  V.fill_random(0, 1);
+  T.fill_random(0, 1);
+  R["abij"] = T["acik"] * V["kbcj"];
+}
+
+/**
+ * \brief Runs the contraction V["cdxy"] = G["Gxc"] * G["Gyd"].
+ *
+ * G is an order-3 tensor with edge lengths {Ng, Nx, Nv} (registered under the
+ * name "B"), V an order-4 tensor with edge lengths {Nv, Nv, Nx, Nx}
+ * (registered under the name "C"). G is filled through fill_random(0, 1) and
+ * used as both operands. The same four-entry symmetry array is handed to both
+ * constructors.
+ *
+ * \param[in] Nv edge length of the "v" modes
+ * \param[in] Nx edge length of the "x" modes
+ * \param[in] Ng edge length of the contracted mode
+ * \param[in] dw world the tensors are defined on
+ */
+void ggv_contraction(int64_t Nv, int64_t Nx, int64_t Ng, World &dw) {
+  int64_t lens_G[] = {Ng, Nx, Nv};
+  int64_t lens_V[] = {Nv, Nv, Nx, Nx};
+  int sym[] = {NS, NS, NS, NS};
+  CTF::Tensor<double> G(3, lens_G, sym, dw, "B");
+  CTF::Tensor<double> V(4, lens_V, sym, dw, "C");
+  G.fill_random(0, 1);
+  V["cdxy"] = G["Gxc"] * G["Gyd"];
+}
+
+/**
+ * \brief Runs the contraction R["abij"] = V["xyab"] * T["xyij"].
+ *
+ * T has edge lengths {Nv, Nv, No, No}, V has {Nv, Nv, Nx, Nx} and R has
+ * {Nx, Nx, No, No}; all are order-4 tensors on dw with NS in every symmetry
+ * slot. Note that the index letters x and y label the two Nv-sized modes
+ * here, while a and b label the Nx-sized ones. V and T are filled through
+ * fill_random(0, 1); R is not inspected afterwards.
+ *
+ * \param[in] No edge length of the "o" modes
+ * \param[in] Nv edge length of the contracted modes
+ * \param[in] Nx edge length of the first two modes of R
+ * \param[in] dw world the tensors are defined on
+ */
+void rvt_contraction(int64_t No, int64_t Nv, int64_t Nx, World &dw) {
+  int const order = 4;
+  int64_t lens_T[] = {Nv, Nv, No, No};
+  int64_t lens_R[] = {Nx, Nx, No, No};
+  int64_t lens_V[] = {Nv, Nv, Nx, Nx};
+  int sym[] = {NS, NS, NS, NS};
+  CTF::Tensor<double> T(order, lens_T, sym, dw, "T");
+  CTF::Tensor<double> V(order, lens_V, sym, dw, "V");
+  CTF::Tensor<double> R(order, lens_R, sym, dw, "R");
+  V.fill_random(0, 1);
+  T.fill_random(0, 1);
+  R["abij"] = V["xyab"] * T["xyij"];
+}
+
+/**
+ * \brief Runs the benchmark contractions selected by a bit mask.
+ *
+ * The tensor dimensions are obtained from get_ccsd_dimensions(). The
+ * contractions run in the fixed order ph1, ph2, ggv, rvt.
+ *
+ * \param[in] dw world the tensors are defined on
+ * \param[in] mem_per_core forwarded to get_ccsd_dimensions()
+ * \param[in] nvfac forwarded to get_ccsd_dimensions()
+ * \param[in] c_id bit mask: 1 = ph1, 2 = ph2, 4 = ggv, 8 = rvt
+ */
+void train_ccsd(World & dw, double mem_per_core, int64_t nvfac, int c_id){
+  Ccsd_dimensions const d = get_ccsd_dimensions(mem_per_core, nvfac, dw);
+  bool const want_ph1 = (c_id & 1) != 0;
+  bool const want_ph2 = (c_id & 2) != 0;
+  bool const want_ggv = (c_id & 4) != 0;
+  bool const want_rvt = (c_id & 8) != 0;
+  if (want_ph1) ph1_contraction(d.No, d.Nv, dw);
+  if (want_ph2) ph2_contraction(d.No, d.Nv, dw);
+  if (want_ggv) ggv_contraction(d.Nv, d.Nx, d.Ng, dw);
+  if (want_rvt) rvt_contraction(d.No, d.Nv, d.Nx, dw);
+}
+
+
+
+/**
+ * \brief Runs the benchmark contractions repeatedly and updates the CTF models.
+ *
+ * Each of the num_iterations iterations performs `rounds` rounds. A round
+ * calls train_ccsd() for five fixed (mem_per_core, nvfac) settings with all
+ * four contractions enabled and then calls CTF_int::update_all_models() on
+ * the world's communicator. Progress messages are printed by rank 0 of
+ * MPI_COMM_WORLD only.
+ *
+ * \param[in] dump_path if non-empty, handed to CTF_int::dump_touched_models()
+ * \param[in] num_iterations number of outer iterations
+ * \param[in] rounds number of training rounds per iteration
+ * \param[in] ppn third argument of the World constructor
+ */
+void train_all(std::string dump_path, int num_iterations, int rounds, int ppn){
+  World dw("hallo", 0, ppn);
+  int rank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  bool const is_root = (rank == 0);
+
+  // Settings used in every round, in this order.
+  struct Config { double mem_per_core; int64_t nvfac; };
+  Config const configs[] = { {10., 8}, {10., 12}, {25., 8}, {25., 12}, {25., 16} };
+  int const all_contractions = 15;  // 1|2|4|8: every bit train_ccsd() tests
+
+  for (int iter = 0; iter < num_iterations; iter++){
+    if (is_root){
+      printf("Starting iteration %d/%d\n", iter+1,num_iterations);
+    }
+    for (int round = 0; round < rounds; round++) {
+      for (Config const & cfg : configs){
+        train_ccsd(dw, cfg.mem_per_core, cfg.nvfac, all_contractions);
+      }
+      CTF_int::update_all_models(dw.comm);
+      if (is_root) printf("Completed training round %d/%d\n", round+1, rounds);
+    }
+  }
+
+  if (is_root) CTF_int::print_all_models();
+
+  if (!dump_path.empty()) CTF_int::dump_touched_models(dump_path);
+}
+
+/**
+ * \brief Returns the argument that follows the first occurrence of an option.
+ *
+ * \param[in] begin first element of the argument range
+ * \param[in] end one past the last element of the argument range
+ * \param[in] option option string to look for, e.g. "-ppn"
+ * \return the element right after the first match, or a null pointer if the
+ *         option does not occur or its first occurrence is the last element
+ */
+char* getCmdOption(char ** begin,
+                   char ** end,
+                   const   std::string & option){
+  for (char ** arg = begin; arg != end; ++arg){
+    if (option != *arg) continue;
+    // Only the first match counts, even when nothing follows it.
+    char ** value = arg + 1;
+    return (value != end) ? *value : nullptr;
+  }
+  return nullptr;
+}
+
+
+/**
+ * \brief Entry point: parses the options, prepares the dump directory and trains.
+ *
+ * Recognised options:
+ *   -write <path>  directory handed to train_all() as dump path (default "./data")
+ *   -ppn <n>       value handed to train_all() as ppn (default 0), parsed with atoi
+ *
+ * Rank 0 warns when the dump path already exists and otherwise tries to
+ * create it with mode 0777. Training always runs 3 iterations of 3 rounds.
+ */
+int main(int argc, char ** argv){
+  // Keep the argument vector exactly as passed in, before MPI_Init sees it.
+  int const in_num = argc;
+  char ** input_str = argv;
+
+  MPI_Init(&argc, &argv);
+  int rank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  std::string dump_path("./data");
+  int iterations(3), rounds(3), ppn(0);
+  // Look every option up once and reuse the result.
+  if (char const * write_arg = getCmdOption(input_str, input_str+in_num, "-write")){
+    dump_path = write_arg;
+  }
+  if (char const * ppn_arg = getCmdOption(input_str, input_str+in_num, "-ppn")){
+    ppn = atoi(ppn_arg);
+  }
+
+  if (rank == 0) {
+    struct stat info;
+    // stat() returns 0 when the path exists. The former `!stat(...) != 0`
+    // meant the same but only because `!` binds tighter than `!=`.
+    if (stat(dump_path.c_str(), &info) == 0) {
+      printf( "Warning: dumping data into existing directory %s.\n", dump_path.c_str() );
+    } else if (mkdir(dump_path.c_str(), 0777) != 0) {
+      // A failing mkdir used to go unnoticed.
+      printf( "Warning: could not create directory %s.\n", dump_path.c_str() );
+    }
+    printf("we train\n");
+  }
+  train_all(dump_path, iterations, rounds, ppn);
+
+  MPI_Finalize();
+  return 0;
+}
+
+/**
+ * @}
+ * @}
+ */
diff --git a/configure b/configure
index 5e5627b1..145548e6 100755
--- a/configure
+++ b/configure
@@ -14,6 +14,7 @@ function usage
   echo -e '\t--with-lapack     Tells CTF build to enable LAPACK functionality regardless of whether LAPACK libs have been given.'
   echo
   echo -e '\t--with-scalapack  Tells CTF build to enable ScaLAPACK functionality regardless of whether ScaLAPACK libs have been given.'
+  echo -e '\t--without-scalapack  Tells CTF build to disable ScaLAPACK functionality regardless of whether ScaLAPACK libs have been given.'
   echo
   echo -e '\t--build-scalapack Tells CTF to download and build ScaLAPACK library.'
   echo
@@ -498,6 +499,7 @@ depstype=normal
 WITH_CUDA=0
 WITH_LAPACK=0
 WITH_SCALAPACK=0
+WITHOUT_SCALAPACK=0
 WITH_STATIC=1
 WITH_DYNAMIC=1
 BUILD_SCALAPACK=0
@@ -518,6 +520,9 @@ while [ "x$1" != "x" ]; do
     --with-scalapack)
       WITH_SCALAPACK=1
       ;;
+    --without-scalapack)
+      WITHOUT_SCALAPACK=1
+      ;;
     --build-scalapack)
       BUILD_SCALAPACK=1
       ;;
@@ -1035,7 +1040,7 @@ if [ $BUILD_SCALAPACK = 1 ]; then
 fi
 
 USING_SCALA=0
-if [ $WITH_STATIC = 1 ]; then
+if [[ $WITH_STATIC = 1 && $WITHOUT_SCALAPACK = 0 ]]; then
   echo -n 'Checking for static ScaLAPACK... '
   if testlink "$LIB_PATH $LIBS" $PDGEMM $VERBOSE; then
     echo 'static SCALAPACK found.'
@@ -1069,7 +1074,7 @@ if [ $WITH_STATIC = 1 ]; then
   fi
 fi
 
-if [ $WITH_DYNAMIC = 1 ]; then
+if [[ $WITH_DYNAMIC = 1 && $WITHOUT_SCALAPACK = 0 ]]; then
   echo -n 'Checking for dynamic ScaLAPACK... '
   if testldlink "$LD_LIB_PATH" "$LD_LIBS" $PDGEMM $VERBOSE; then
     echo 'dynamic SCALAPACK found.'
diff --git a/src/contraction/contraction.cxx b/src/contraction/contraction.cxx
index f5dd14dc..87cd61d7 100644
--- a/src/contraction/contraction.cxx
+++ b/src/contraction/contraction.cxx
@@ -3,6 +3,7 @@
 #include "../scaling/strp_tsr.h"
 #include "../mapping/mapping.h"
 #include "../mapping/distribution.h"
+#include "../mapping/node_aware_dist.h"
 #include "../tensor/untyped_tensor.h"
 #include "../shared/util.h"
 #include "../shared/memcontrol.h"
@@ -111,7 +112,6 @@ namespace CTF_int {
     //if (A->wrld->cdt.cm == MPI_COMM_WORLD){
 //      update_all_models(A->wrld->cdt.cm);
     //}
-   
     int stat = home_contract();
     if (stat != SUCCESS){
       printf("CTF ERROR: Failed to perform contraction\n");
@@ -2582,11 +2582,12 @@ namespace CTF_int {
     assert(nnz_frac_C>=0.);
   }
 
-  void contraction::detail_estimate_mem_and_time(distribution const * dA, distribution const * dB, distribution const * dC, topology * old_topo_A, topology * old_topo_B, topology * old_topo_C, mapping const * old_map_A, mapping const * old_map_B, mapping const * old_map_C, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C, int64_t & memuse, double & est_time){
+  void contraction::detail_estimate_mem_and_time(distribution const * dA, distribution const * dB, distribution const * dC, topology * old_topo_A, topology * old_topo_B, topology * old_topo_C, mapping const * old_map_A, mapping const * old_map_B, mapping const * old_map_C, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C, int64_t & memuse, double & est_time, double &redist_time, double &contr_time, double &fold_time){
     TAU_FSTART(detail_estimate_mem_and_time);
     ctr * sctr;
     est_time = 0.;
     memuse = 0;
+    fold_time = 0.0;
     topology * topo_i = A->topo;
     bool csr_or_coo = B->is_sparse || C->is_sparse || is_custom || !A->sr->has_coo_ker;
     bool use_ccsr =  csr_or_coo && A->is_sparse && C->is_sparse && !B->is_sparse;
@@ -2598,13 +2599,14 @@ namespace CTF_int {
 #if FOLD_TSR
     if (can_fold()){
       est_time = est_time_fold();
+      fold_time = est_time;
       iparam prm = map_fold(false);
     
       sctr = construct_ctr(1, &prm);
       if (this->is_sparse())
-        est_time = ((spctr*)sctr)->est_time_rec(sctr->num_lyr, A->calc_nvirt(), B->calc_nvirt(), C->calc_nvirt(), nnz_frac_A, nnz_frac_B, nnz_frac_C);
+        est_time += ((spctr*)sctr)->est_time_rec(sctr->num_lyr, A->calc_nvirt(), B->calc_nvirt(), C->calc_nvirt(), nnz_frac_A, nnz_frac_B, nnz_frac_C);
       else
-        est_time = sctr->est_time_rec(sctr->num_lyr);
+        est_time += sctr->est_time_rec(sctr->num_lyr);
       A->remove_fold();
       B->remove_fold();
       C->remove_fold();
@@ -2672,6 +2674,7 @@ namespace CTF_int {
       }
 
     }
+    contr_time = est_time - fold_time;
 #if DEBUG >= 4
     printf("mapping passed contr est_time = %E sec %d %ld %ld %ld %E %E %E\n", est_time, sctr->num_lyr, A->calc_nvirt(), B->calc_nvirt(), C->calc_nvirt(), nnz_frac_A, nnz_frac_B, nnz_frac_C);
 #endif
@@ -2724,6 +2727,7 @@ namespace CTF_int {
       mem_redist_tmp += C->get_redist_mem(*dC, nnz_frac_C);
       //mem_redist += (int64_t)(nnz_frac_C*C->size*C->sr->pair_size()) +C->get_redist_mem(*dC, nnz_frac_C);
     }
+    redist_time = est_time - contr_time - fold_time;
     assert(mem_fold_tmp >= 0);
     assert(mem_fold >= 0);
     assert(mem_redist >= 0);
@@ -2755,7 +2759,9 @@ namespace CTF_int {
   #if DEBUG > 4
       for (int t=1; t<(int)wrld->topovec.size()+8; t++){
   #else
-      for (int64_t t=global_comm.rank+1; t<(int)wrld->topovec.size()+8; t+=global_comm.np){
+      int64_t incr(global_comm.np);
+      if (A->wrld->dryRanks) incr = 1;
+      for (int64_t t=global_comm.rank+1; t<(int)wrld->topovec.size()+8; t+=incr){
   #endif
         A->clear_mapping();
         B->clear_mapping();
@@ -2794,7 +2800,7 @@ namespace CTF_int {
         ret = map_to_topology(topo_i, j);
 
         if (ret == NEGATIVE){
-          //printf("map_to_topology returned negative\n");
+//          printf("map_to_topology returned negative %d %d\n", t, j);
           continue;
         }
    
@@ -2806,6 +2812,7 @@ namespace CTF_int {
         C->topo = topo_i;
        
         if (check_mapping() == 0){
+//          printf("check mapping is zero %d %d\n", t, j);
           continue;
         }
         A->set_padding();
@@ -2825,11 +2832,18 @@ namespace CTF_int {
           continue;
         }
         int64_t memuse;//, bmemuse;
-        double est_time;
-        detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time);
+        double est_time, redist_time, contr_time, fold_time;
+        detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time, redist_time, contr_time, fold_time);
 #ifdef MIN_MEMORY
         est_time = memuse;
 #endif
+
+        if (A->wrld->dryRanks && A->wrld->verbose == 2)
+          printf( "t %ld j %d will use %f GB per rank and take %f s, %f %f %f"
+                , t, j, memuse/1024.0/1024./1024
+                , est_time, redist_time, contr_time, fold_time);
+        if (A->wrld->dryRanks && A->wrld->verbose == 2) C->print_map();
+
         ASSERT(est_time >= 0.0);
         if ((int64_t)memuse >= max_memuse){
           if (global_comm.rank == 0)
@@ -2889,7 +2903,7 @@ namespace CTF_int {
       int64_t old_off = choice_offset;
       choice_offset += tnum_choices;
       for (int j=0; j<tnum_choices; j++){
-        if ((old_off + j)%global_comm.np != global_comm.rank)
+        if (!A->wrld->dryRanks && (old_off + j)%global_comm.np != global_comm.rank)
           continue;
         A->clear_mapping();
         B->clear_mapping();
@@ -2931,12 +2945,15 @@ namespace CTF_int {
           continue;
         }
         int64_t memuse;//, bmemuse;
-        double est_time;
-        detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time);
+        double est_time, redist_time, contr_time, fold_time;
+        detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time, redist_time, contr_time, fold_time);
 #ifdef MIN_MEMORY
         est_time = memuse;
 #endif
         ASSERT(est_time >= 0.0);
+        if (A->wrld->dryRanks) printf( "topo %d order %d will use %f GB per rank and take %f s (%f %f %f, redist/contraction/folding)\n"
+                                     , i, j, memuse/1024.0/1024./1024, est_time, redist_time, contr_time, fold_time);
+
 
         if ((int64_t)memuse >= max_memuse){
           DPRINTF(3,"[EXH] Not enough memory available for topo %d with order %d memory %ld/%ld\n", i,j,memuse,max_memuse);
@@ -3076,7 +3093,7 @@ namespace CTF_int {
       A->set_padding();
       B->set_padding();
       C->set_padding();
-      if (gbest_time_sel < 100.){
+      if (gbest_time_sel > 1e100){
         gbest_time_exh = gbest_time_sel+1.;
         ttopo_exh = ttopo_sel;
       } else {
@@ -3107,6 +3124,7 @@ namespace CTF_int {
       ctr_sig_map.insert(std::pair<contraction_signature,topo_info>(sig,ti));
       TAU_FSTOP(ctr_sig_map_insert);
     }
+
     if (!do_remap || ttopo == INT64_MAX || ttopo == -1){
       CTF_int::cdealloc(old_phase_A);
       CTF_int::cdealloc(old_phase_B);
@@ -3191,9 +3209,9 @@ namespace CTF_int {
 #if (VERBOSE >= 1 || DEBUG >= 1 || PROFILE_MEMORY >= 1)
 
     int64_t memuse;
-    double est_time;
+    double est_time, redist_time, contr_time, fold_time;
 
-    detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time);
+    detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time, redist_time, contr_time, fold_time);
     if (global_comm.rank == 0){
       printf("Contraction will use %E bytes per processor out of %E available memory (already used %E) and take an estimated of %E sec\n",
               (double)memuse,(double)proc_bytes_available(),(double)proc_bytes_used(),est_time);
@@ -3205,6 +3223,15 @@ namespace CTF_int {
 //    assert(est_time == std::min(gbest_time_sel,gbest_time_exh));
 //#endif
 #endif
+    if (A->wrld->dryRanks){
+      int64_t memuse;
+      double est_time, redist_time, contr_time, fold_time;
+      detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time, redist_time, contr_time, fold_time);
+      printf( "Contraction will use %f GB per rank and take %f s (%f %f %f, redist/contraction/folding)\n"
+            , memuse/1024.0/1024./1024, est_time, redist_time, contr_time, fold_time);
+    }
+
+
 
     if (can_fold()){
       iparam prm = map_fold(false);
@@ -3250,7 +3277,7 @@ namespace CTF_int {
       }
     } else
       need_remap = 1;
-    if (need_remap)
+    if (need_remap && !wrld->dryRanks)
       A->redistribute(*dA);
     need_remap = 0;
     if (B->topo == old_topo_B){
@@ -3260,7 +3287,7 @@ namespace CTF_int {
       }
     } else
       need_remap = 1;
-    if (need_remap)
+    if (need_remap && !wrld->dryRanks)
       B->redistribute(*dB);
     need_remap = 0;
     if (C->topo == old_topo_C){
@@ -3270,9 +3297,9 @@ namespace CTF_int {
       }
     } else
       need_remap = 1;
-    if (need_remap)
+    if (need_remap && !wrld->dryRanks)
       C->redistribute(*dC);
-                  
+
     TAU_FSTOP(redistribute_for_contraction);
    
     CTF_int::cdealloc( old_phase_A );
@@ -4169,6 +4196,8 @@ namespace CTF_int {
     ctr * ctrf;
     CommData global_comm = C->wrld->cdt;
 
+
+
     if (A->has_zero_edge_len || B->has_zero_edge_len
         || C->has_zero_edge_len){
       if (!C->sr->isequal(beta,C->sr->mulid()) && !C->has_zero_edge_len){
@@ -4368,7 +4397,6 @@ namespace CTF_int {
       C->print_map();
     }
 #endif
-
   #ifdef PROFILE
     TAU_FSTART(pre_fold_barrier);
     MPI_Barrier(global_comm.cm);
@@ -4383,12 +4411,50 @@ namespace CTF_int {
     if (is_inner){
       iparam prm;
       TAU_FSTART(map_fold);
-      prm = map_fold();
+      prm = map_fold(!A->wrld->dryRanks);
       TAU_FSTOP(map_fold);
       delete ctrf;
       ctrf = construct_ctr(1, &prm);
     }
   #endif
+
+
+  if (A->wrld->dryRanks){
+// iran: this is the silent version
+    A->print_map();
+    B->print_map();
+    C->print_map();
+    //ctrf->print();
+#define NODE_AWARE 1
+#ifdef NODE_AWARE
+    if (C->wrld->ppn){
+      topology orig_topo = *(C->topo);
+      std::vector<int> pe_grid(orig_topo.lens, orig_topo.lens + orig_topo.order);
+      std::vector<std::vector<int> > inter_node_grids =
+        CTF_int::get_inter_node_grids(pe_grid, C->wrld->dryRanks/C->wrld->ppn);
+      int * intra_node_lens = (int*)CTF_int::alloc(orig_topo.order*sizeof(int));
+      double comm_vol_ref = ctrf->est_internode_comm_vol_rec(ctrf->num_lyr);
+      printf("Ref: %f\n", comm_vol_ref/1024.0/1024.0/1024.0);
+      for (size_t i=0; i<inter_node_grids.size(); i++){
+        for (int j=0; j<orig_topo.order; j++)
+          intra_node_lens[j] = orig_topo.lens[j] / inter_node_grids[i][j];
+        topology na_topo_i(orig_topo.order, orig_topo.lens, orig_topo.glb_comm, 0, intra_node_lens);
+        C->topo->morph_to(na_topo_i);
+        double comm_vol_i = ctrf->est_internode_comm_vol_rec(ctrf->num_lyr);
+        for (int j=0; j < orig_topo.order; j++) printf("%d ", inter_node_grids[i][j]);
+        printf("-> %f\n", comm_vol_i/1024.0/1024.0/1024.0);
+
+        C->topo->morph_to(orig_topo);
+      }
+      cdealloc(intra_node_lens);
+    }
+#endif
+    delete ctrf;
+    TAU_FSTOP(contract);
+    return SUCCESS;
+  }
+
+
   #if (VERBOSE >= 1 || DEBUG >= 1)
   if (global_comm.rank == 0){
     ctrf->print();
@@ -4422,8 +4488,72 @@ namespace CTF_int {
     MPI_Barrier(global_comm.cm);
     TAU_FSTOP(pre_ctr_func_barrier);
   #endif
+
+
+#ifdef NODE_AWARE
+    TAU_FSTART(node_aware_remapping);
+    /* reorder processor grid to account for node-awareness */
+    topology orig_topo = *(C->topo);
+    int64_t node_aware_send_to_rank(0);
+    int64_t node_aware_recv_from_rank(0);
+    // FIXME: support sparsity
+    if (C->wrld->ppn && !is_sparse()){
+      std::vector<int> pe_grid(orig_topo.lens, orig_topo.lens + orig_topo.order);
+      std::vector<std::vector<int> > inter_node_grids = CTF_int::get_inter_node_grids(pe_grid, C->wrld->np/C->wrld->ppn);
+      //std::vector< std::vector<int> > intra_node_grids = CTF_int::get_all_shapes(C->wrld->ppn()){
+      int * intra_node_lens = (int*)CTF_int::alloc(orig_topo.order*sizeof(int));
+      int64_t best_topo_index(0);
+      double best_comm_vol = DBL_MAX;
+      for (size_t i=0; i<inter_node_grids.size(); i++){
+        for (int j=0; j<orig_topo.order; j++){
+          intra_node_lens[j] = orig_topo.lens[j] / inter_node_grids[i][j];
+        }
+        topology na_topo_i(orig_topo.order, orig_topo.lens, orig_topo.glb_comm, 0, intra_node_lens);
+        // overwrite topology object in a way that also changes information in CommData objects pointed to ctrf
+        C->topo->morph_to(na_topo_i);
+
+        double comm_vol_i = ctrf->est_internode_comm_vol_rec(ctrf->num_lyr);
+        if (comm_vol_i < best_comm_vol){
+          best_topo_index = i;
+          best_comm_vol = comm_vol_i;
+        }
+        C->topo->morph_to(orig_topo);
+      }
+      for (int j=0; j<orig_topo.order; j++){
+        intra_node_lens[j] = orig_topo.lens[j] / inter_node_grids[best_topo_index][j];
+      }
+      topology node_aware_topo(orig_topo.order, orig_topo.lens, orig_topo.glb_comm, 0, intra_node_lens);
+      // overwrite topology object in a way that also changes information in CommData objects pointed to ctrf
+      C->topo->morph_to(node_aware_topo);
+      node_aware_send_to_rank = get_inv_topo_reorder_rank(node_aware_topo.order, node_aware_topo.lens, intra_node_lens, orig_topo.glb_comm.rank);
+      node_aware_recv_from_rank = get_topo_reorder_rank(node_aware_topo.order, node_aware_topo.lens, node_aware_topo.lda, intra_node_lens, orig_topo.glb_comm.rank);
+      if (orig_topo.glb_comm.rank != node_aware_send_to_rank){
+        IASSERT(orig_topo.glb_comm.rank != node_aware_recv_from_rank);
+        TAU_FSTART(redistribute_for_node_aware);
+        // FIXME: to support sparsity need to also communicate nnz information here
+        MPI_Status stat;
+        MPI_Sendrecv_replace(A->data, A->size, A->sr->mdtype(), node_aware_send_to_rank, 1322, node_aware_recv_from_rank, 1322, orig_topo.glb_comm.cm, &stat);
+        MPI_Sendrecv_replace(B->data, B->size, B->sr->mdtype(), node_aware_send_to_rank, 1323, node_aware_recv_from_rank, 1323, orig_topo.glb_comm.cm, &stat);
+        MPI_Sendrecv_replace(C->data, C->size, C->sr->mdtype(), node_aware_send_to_rank, 1324, node_aware_recv_from_rank, 1324, orig_topo.glb_comm.cm, &stat);
+        TAU_FSTOP(redistribute_for_node_aware);
+      }
+      cdealloc(intra_node_lens);
+    }
+    TAU_FSTOP(node_aware_remapping);
+#endif
+
+
+
+
+
     TAU_FSTART(ctr_func);
     /* Invoke the contraction algorithm */
+    TAU_FSTART(blockComm);
+    std::vector<int> swap;
+    ctrf->blockComm( A->topo->lens, A->data, B->data, C->data
+		   , A->size, B->size, C->size, global_comm, swap);
+    MPI_Barrier(global_comm.cm);
+    TAU_FSTOP(blockComm);
     A->topo->activate();
 
   #ifdef PROFILE_MEMORY
@@ -4506,9 +4636,37 @@ namespace CTF_int {
       printf("Finished contraction  computation\n");
     }
   #endif
+#ifdef NODE_AWARE
+    TAU_FSTART(node_aware_backmapping);
+    /* reorder processor grid to account for node-awareness */
+    // FIXME: support sparsity
+    if (C->wrld->ppn && !is_sparse() && orig_topo.glb_comm.rank != node_aware_send_to_rank){
+      TAU_FSTART(redistribute_for_node_aware);
+      // FIXME: to support sparsity need to also communicate nnz information here
+      MPI_Status stat;
+      if (A->is_home) {
+        MPI_Sendrecv_replace(A->data, A->size, A->sr->mdtype(), node_aware_recv_from_rank, 1325, node_aware_send_to_rank, 1325, orig_topo.glb_comm.cm, &stat);
+      }
+      if (B->is_home) {
+        MPI_Sendrecv_replace(B->data, B->size, B->sr->mdtype(), node_aware_recv_from_rank, 1326, node_aware_send_to_rank, 1326, orig_topo.glb_comm.cm, &stat);
+      }
+      MPI_Sendrecv_replace(C->data, C->size, C->sr->mdtype(), node_aware_recv_from_rank, 1327, node_aware_send_to_rank, 1327, orig_topo.glb_comm.cm, &stat);
+      TAU_FSTOP(redistribute_for_node_aware);
+    }
+    if (C->wrld->ppn  && !is_sparse()) {
+      C->topo->morph_to(orig_topo);
+    }
+    TAU_FSTOP(node_aware_backmapping);
+#endif
+
 
 
-    A->topo->deactivate();
+//    A->topo->deactivate();
+    TAU_FSTART(blockComm);
+    MPI_Barrier(global_comm.cm);
+    ctrf->blockComm( A->topo->lens, A->data, B->data, C->data
+                   , A->size, B->size, C->size, global_comm, swap);
+    TAU_FSTOP(blockComm);
 
   #ifdef PROFILE
     TAU_FSTART(post_ctr_func_barrier);
@@ -5176,7 +5334,6 @@ namespace CTF_int {
         return SUCCESS;
       }
     }
-
     contraction new_ctr = contraction(*this);
 
     was_home_A = A->is_home;
@@ -5245,7 +5402,9 @@ namespace CTF_int {
     }
 
     ret = new_ctr.sym_contract();//&ntype, ftsr, felm, alpha, beta);
+
     if (ret!= SUCCESS) return ret;
+    if (C->wrld->dryRanks) return SUCCESS;
     if (was_home_C) new_ctr.C->unfold();
 
     if (was_home_C && !new_ctr.C->is_home){
diff --git a/src/contraction/contraction.h b/src/contraction/contraction.h
index 90ca2211..8395983c 100644
--- a/src/contraction/contraction.h
+++ b/src/contraction/contraction.h
@@ -292,7 +292,7 @@ namespace CTF_int {
 
       void calc_nnz_frac(double & nnz_frac_A, double & nnz_frac_B, double & nnz_frac_C);
 
-      void detail_estimate_mem_and_time(distribution const * dA, distribution const * dB, distribution const * dC, topology * old_topo_A, topology * old_topo_B, topology * old_topo_C, mapping const * old_map_A, mapping const * old_map_B, mapping const * old_map_C, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C, int64_t & memuse, double & est_time);
+      void detail_estimate_mem_and_time(distribution const * dA, distribution const * dB, distribution const * dC, topology * old_topo_A, topology * old_topo_B, topology * old_topo_C, mapping const * old_map_A, mapping const * old_map_B, mapping const * old_map_C, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C, int64_t & memuse, double & est_time, double &redist_time, double &contr_time, double &fold_time);
 
       void get_best_sel_map(distribution const * dA, distribution const * dB, distribution const * dC, topology * old_topo_A, topology * old_topo_B, topology * old_topo_C, mapping const * old_map_A, mapping const * old_map_B, mapping const * old_map_C, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C, int64_t & idx, double & time);
 
diff --git a/src/contraction/ctr_2d_general.cxx b/src/contraction/ctr_2d_general.cxx
index 117704be..e5faad92 100755
--- a/src/contraction/ctr_2d_general.cxx
+++ b/src/contraction/ctr_2d_general.cxx
@@ -199,15 +199,18 @@ namespace CTF_int {
 
   void ctr_2d_general::print() {
     printf("ctr_2d_general: edge_len = %ld\n", edge_len);
-    printf("move_A = %d, ctr_lda_A = %ld, ctr_sub_lda_A = %ld\n",
+    printf("move_A = %d, ctr_lda_A = %ld, ctr_sub_lda_A = %ld",
             move_A, ctr_lda_A, ctr_sub_lda_A);
-    if (move_A) printf("cdt_A length = %d\n",cdt_A->np);
-    printf("move_B = %d, ctr_lda_B = %ld, ctr_sub_lda_B = %ld\n",
+    if (move_A) printf(", cdt_A length = %d",cdt_A->np);
+    printf("\n");
+    printf("move_B = %d, ctr_lda_B = %ld, ctr_sub_lda_B = %ld",
             move_B, ctr_lda_B, ctr_sub_lda_B);
-    if (move_B) printf("cdt_B length = %d\n",cdt_B->np);
-    printf("move_C = %d, ctr_lda_C = %ld, ctr_sub_lda_C = %ld\n",
+    if (move_B) printf(", cdt_B length = %d",cdt_B->np);
+    printf("\n");
+    printf("move_C = %d, ctr_lda_C = %ld, ctr_sub_lda_C = %ld",
             move_C, ctr_lda_C, ctr_sub_lda_C);
-    if (move_C) printf("cdt_C length = %d\n",cdt_C->np);
+    if (move_C) printf(", cdt_C length = %d",cdt_C->np);
+    printf("\n");
 #ifdef OFFLOAD
     if (alloc_host_buf)
       printf("alloc_host_buf is true\n");
@@ -262,6 +265,26 @@ namespace CTF_int {
     return rec_ctr->est_time_rec(1)*(double)edge_len/MIN(nlyr,edge_len) + est_time_fp(nlyr);
   }
 
+
+  /**
+   * \brief Communication volume estimate for the collectives of this level.
+   *
+   * For every operand X in {A, B, C} whose move_X flag is set, the term
+   * el_size * s_X * (cdt_X->num_nodes - 1) is added, with s_X taken from
+   * find_bsizes(). The sum is scaled by edge_len / MIN(nlyr, edge_len).
+   * NOTE(review): presumably the result is in bytes and counts only traffic
+   * between nodes -- confirm against the definition of num_nodes.
+   *
+   * \param[in] nlyr number of layers; enters only through MIN(nlyr, edge_len)
+   * \return estimated communication volume of this level
+   */
+  double ctr_2d_general::est_internode_collective_comm_vol(int nlyr) {
+    int64_t b_A, b_B, b_C, s_A, s_B, s_C, aux_size;
+    find_bsizes(b_A, b_B, b_C, s_A, s_B, s_C, aux_size);
+    // cdt_X is dereferenced only when the matching move_X flag is set
+    double const vol_A = move_A ? (sr_A->el_size*s_A) * (cdt_A->num_nodes - 1) : 0.0;
+    double const vol_B = move_B ? (sr_B->el_size*s_B) * (cdt_B->num_nodes - 1) : 0.0;
+    double const vol_C = move_C ? (sr_C->el_size*s_C) * (cdt_C->num_nodes - 1) : 0.0;
+    double const total = vol_A + vol_B + vol_C;
+    return (total*(double)edge_len)/MIN(nlyr,edge_len);
+  }
+
+  /**
+   * \brief Communication volume estimate for this level plus the nested contraction.
+   *
+   * Adds the estimate of rec_ctr (queried with one layer and scaled by
+   * edge_len / MIN(nlyr, edge_len)) to est_internode_collective_comm_vol(nlyr).
+   *
+   * \param[in] nlyr number of layers
+   * \return estimated communication volume including the recursive part
+   */
+  double ctr_2d_general::est_internode_comm_vol_rec(int nlyr) {
+    double const nested = rec_ctr->est_internode_comm_vol_rec(1);
+    double const own = est_internode_collective_comm_vol(nlyr);
+    return nested*(double)edge_len/MIN(nlyr,edge_len) + own;
+  }
+
+
+
   int64_t ctr_2d_general::mem_fp() {
     int64_t b_A, b_B, b_C, s_A, s_B, s_C, aux_size;
     find_bsizes(b_A, b_B, b_C, s_A, s_B, s_C, aux_size);
@@ -431,7 +454,7 @@ namespace CTF_int {
         if (cdt_C->rank == owner_C)
           cdt_C->red(MPI_IN_PLACE, op_C, s_C, sr_C->mdtype(), sr_C->addmop(), owner_C);
         else
-          cdt_C->red(op_C, NULL, s_C, sr_C->mdtype(), sr_C->addmop(), owner_C);
+          cdt_C->red(op_C, op_C, s_C, sr_C->mdtype(), sr_C->addmop(), owner_C);
         if (rank_C == owner_C){
           sr_C->copy(ctr_sub_lda_C, ctr_lda_C,
                      op_C, ctr_sub_lda_C, sr_C->mulid(),
@@ -469,5 +492,113 @@ namespace CTF_int {
     }
     TAU_FSTOP(ctr_2d_general);
   }
+
+  // Exchanges A, B, C (and the color/rank of cdt_A and cdt_B) between partner ranks of glb_comm; swap holds the permutation.
+  void ctr_2d_general::blockComm( int const * rgrid, char *A, char *B, char *C
+                                , size_t sizeA, size_t sizeB, size_t sizeC
+                                , CommData glb_comm, std::vector<int> &swap){
+    int rank = glb_comm.rank;
+    int np = glb_comm.np;
+    int src, dst;
+    // we have to determine the partners
+    if (! swap.size() ) {
+      // empty swap: build the permutation here and leave it in swap for the caller
+      // rGrid is the rankGrid of the given tensor topology
+      CommGrid grid({rgrid[0], rgrid[1]}, getNumNodes(glb_comm.cm).first);
+      ipair nGrid = grid.nGrid;
+      ipair iGrid = grid.iGrid;
+      // rr is the key/color pair for the original rank distribution of dim_comm[0]
+      std::vector<ipair> rr(np);
+      std::vector< std::pair<ipair, int> > perm(np);
+      for (int r(0); r < np; r++) rr[r] = { r % rgrid[0], r / rgrid[0] };
+      // the desired distribution are nGrid[0] x nGrid[1] blocks with the same color
+      for (int r(0); r < np; r++){
+        // the color is the jth column and kth row in the nodeGrid
+        int clr = (rr[r].second/iGrid.second)*nGrid.first + rr[r].first/iGrid.first;
+        int key = (rr[r].second%iGrid.second)*iGrid.first + rr[r].first%iGrid.first;
+        // color comes first in the pair so that std::sort orders by color, then key
+        perm[r] = { { clr, key }, r};
+      }
+      std::sort(perm.begin(), perm.end());
+      for (auto p: perm) swap.push_back(p.second);
+      // this direction: receive from swap[rank], send to the position that holds our rank
+      src = swap[rank];
+      auto it( std::find(swap.begin(), swap.end(), rank) );
+      dst = std::distance(swap.begin(), it);
+    }
+    else { // swap was filled by an earlier call: same permutation, opposite direction
+      dst = swap[rank];
+      auto it( std::find(swap.begin(), swap.end(), rank) );
+      src = std::distance(swap.begin(), it);
+    }
+    // the color and rank stored in cdt_A / cdt_B travel with the data
+    MPI_Barrier(glb_comm.cm);
+    MPI_Status s;
+    MPI_Sendrecv_replace(&cdt_A->color, 1, MPI_INT, dst, 0, src, 0, glb_comm.cm, &s);
+    MPI_Sendrecv_replace(&cdt_B->color, 1, MPI_INT, dst, 0, src, 0, glb_comm.cm, &s);
+    MPI_Sendrecv_replace(&cdt_A->rank,  1, MPI_INT, dst, 0, src, 0, glb_comm.cm, &s);
+    MPI_Sendrecv_replace(&cdt_B->rank,  1, MPI_INT, dst, 0, src, 0, glb_comm.cm, &s);
+
+    MPI_Barrier(glb_comm.cm);
+
+    // staging buffer sized in bytes for the largest operand; each operand uses its own algebraic structure
+    size_t nbytes(std::max(sizeA*sr_A->el_size, std::max(sizeB*sr_B->el_size, sizeC*sr_C->el_size)));
+    std::vector<char> buf(nbytes); // released automatically, the former new[] buffer was never freed
+    // Do the A job
+    MPI_Request sreq, rreq;
+    MPI_Irecv(buf.data(), sizeA, sr_A->mdtype(), src, 0, glb_comm.cm, &rreq);
+    MPI_Isend(A,          sizeA, sr_A->mdtype(), dst, 0, glb_comm.cm, &sreq);
+    MPI_Wait(&rreq, MPI_STATUS_IGNORE);
+    MPI_Wait(&sreq, MPI_STATUS_IGNORE);
+    memcpy(A, buf.data(), sizeA*sr_A->el_size);
+
+    // Do the B job
+    MPI_Irecv(buf.data(), sizeB, sr_B->mdtype(), src, 0, glb_comm.cm, &rreq);
+    MPI_Isend(B,          sizeB, sr_B->mdtype(), dst, 0, glb_comm.cm, &sreq);
+    MPI_Wait(&rreq, MPI_STATUS_IGNORE);
+    MPI_Wait(&sreq, MPI_STATUS_IGNORE);
+    memcpy(B, buf.data(), sizeB*sr_B->el_size);
+
+    // Do the C job
+    MPI_Irecv(buf.data(), sizeC, sr_C->mdtype(), src, 0, glb_comm.cm, &rreq);
+    MPI_Isend(C,          sizeC, sr_C->mdtype(), dst, 0, glb_comm.cm, &sreq);
+    MPI_Wait(&rreq, MPI_STATUS_IGNORE);
+    MPI_Wait(&sreq, MPI_STATUS_IGNORE);
+    memcpy(C, buf.data(), sizeC*sr_C->el_size);
+    MPI_Barrier(glb_comm.cm);
+  }
+
+  // Returns { number of distinct processor names among the ranks of comm,
+  //           np divided by that number }; collective over comm.
+  ipair ctr_2d_general::getNumNodes(MPI_Comm comm){
+    int rank, np;
+    MPI_Comm_rank(comm, &rank);
+    MPI_Comm_size(comm, &np);
+    // gather the processor name of every rank, then count the distinct names
+    std::vector<std::string> nodeList(np);
+    char nodeName[MPI_MAX_PROCESSOR_NAME];
+    // heap storage: a variable-length array is not standard C++ and may overflow the stack
+    std::vector<char> nodeNames((size_t)np*MPI_MAX_PROCESSOR_NAME);
+    std::vector<int> nameLengths(np), off(np); // value-initialized, so off[0] == 0
+    int nameLength;
+    MPI_Get_processor_name(nodeName, &nameLength);
+    MPI_Allgather(&nameLength, 1, MPI_INT, nameLengths.data(), 1, MPI_INT, comm);
+    for (int i(1); i < np; i++) off[i] = off[i-1] + nameLengths[i-1];
+    MPI_Allgatherv(
+      nodeName, nameLengths[rank], MPI_BYTE, nodeNames.data(),
+      nameLengths.data(), off.data(), MPI_BYTE, comm
+    );
+    for (int i(0); i < np; i++) {
+      std::string s(&nodeNames[off[i]], nameLengths[i]);
+      nodeList[i] = s;
+    }
+    std::sort(nodeList.begin(), nodeList.end());
+    std::vector<std::string>::iterator it(
+      std::unique(nodeList.begin(), nodeList.end())
+    );
+    size_t nNodes(std::distance(nodeList.begin(), it));
+    return {nNodes, np/nNodes};
+  }
+
 }
 
diff --git a/src/contraction/ctr_2d_general.h b/src/contraction/ctr_2d_general.h
index 3df75d3d..9dbed5d7 100644
--- a/src/contraction/ctr_2d_general.h
+++ b/src/contraction/ctr_2d_general.h
@@ -68,6 +68,8 @@ namespace CTF_int{
       CommData * cdt_A;
       CommData * cdt_B;
       CommData * cdt_C;
+
+
       /* Class to be called on sub-blocks */
       ctr * rec_ctr;
       
@@ -81,6 +83,19 @@ namespace CTF_int{
        *  where b is the smallest blocking factor among A and B or A and C or B and C. 
        */
       void run(char * A, char * B, char * C);
+      /**
+       * \brief interchanges processors in the communicator -> permuting
+       *  the data such that each communicator has adjacent global ranks
+       */
+      void blockComm( int const *rgrid, char *A, char *B, char *C
+                    , size_t sizeA, size_t sizeB, size_t sizeC
+                    , CommData globalComm, std::vector<int> &swap);
+
+      /**
+       * \brief returns the number of nodes & number of ranks per node
+       *        note: only trustworthy if ranks per node is the same for all nodes!!
+       */
+      ipair getNumNodes(MPI_Comm comm);
       /**
        * \brief returns the number of bytes of buffer space
        *  we need 
@@ -102,6 +117,19 @@ namespace CTF_int{
        * \return bytes needed for recursive contraction
        */
       double est_time_rec(int nlyr);
+
+      /**
+       * \brief estimate the inter-node communication volume of this kernel
+       * \return volume in bytes, represented as floating point
+       */
+      double est_internode_collective_comm_vol(int nlyr);
+
+      /**
+       * \brief estimate the inter-node communication volume of the algorithm recursively
+       * \return volume in bytes, represented as floating point
+       */
+      double est_internode_comm_vol_rec(int nlyr);
+
       ctr * clone();
 
       /**
diff --git a/src/contraction/ctr_comm.cxx b/src/contraction/ctr_comm.cxx
index de7a3e20..63f659f4 100755
--- a/src/contraction/ctr_comm.cxx
+++ b/src/contraction/ctr_comm.cxx
@@ -193,6 +193,18 @@ namespace CTF_int {
     return rec_ctr->est_time_rec(nlyr) + est_time_fp(nlyr);
   }
 
+  double ctr_replicate::est_internode_comm_vol_rec(int nlyr) {
+    // per communicator of each operand: operand bytes times (nodes in that communicator - 1)
+    double vol = 0.;
+    for (int a = 0; a < ncdt_A; a++)
+      vol += (size_A*sr_A->el_size) * (cdt_A[a]->num_nodes - 1);
+    for (int b = 0; b < ncdt_B; b++)
+      vol += (size_B*sr_B->el_size) * (cdt_B[b]->num_nodes - 1);
+    for (int c = 0; c < ncdt_C; c++)
+      vol += (size_C*sr_C->el_size) * (cdt_C[c]->num_nodes - 1);
+    return rec_ctr->est_internode_comm_vol_rec(nlyr) + vol;
+  }
+
   int64_t ctr_replicate::mem_fp(){
     return 0;
   }
diff --git a/src/contraction/ctr_comm.h b/src/contraction/ctr_comm.h
index 0f4670df..a748888a 100644
--- a/src/contraction/ctr_comm.h
+++ b/src/contraction/ctr_comm.h
@@ -199,8 +199,11 @@ namespace CTF_int{
       virtual int64_t mem_rec() { return mem_fp(); };
       virtual double est_time_fp(int nlyr) { return 0; };
       virtual double est_time_rec(int nlyr) { return est_time_fp(nlyr); };
+      virtual double est_internode_comm_vol_rec(int nlyr) { return 0; };
       virtual ctr * clone() { return NULL; };
-      
+      virtual void blockComm( int const *rgrid, char *A, char *B, char *C
+                            , size_t sizeA, size_t sizeB, size_t sizeC
+                            , CommData globalComm, std::vector<int> &swap) {};
       /**
        * \brief deallocates generic ctr object
        */
@@ -254,6 +257,11 @@ namespace CTF_int{
        * \return time in sec
        */
       double est_time_rec(int nlyr);
+      /**
+       * \brief estimate the inter-node communication volume of the algorithm
+       * \return volume in bytes, represented as floating point
+       */
+      double est_internode_comm_vol_rec(int nlyr);
       void print();
       ctr * clone();
 
diff --git a/src/contraction/ctr_tsr.cxx b/src/contraction/ctr_tsr.cxx
index 45cc994a..bd398ec1 100755
--- a/src/contraction/ctr_tsr.cxx
+++ b/src/contraction/ctr_tsr.cxx
@@ -338,17 +338,20 @@ namespace CTF_int {
     int i;
     printf("seq_tsr_ctr:\n");
     for (i=0; i<order_A; i++){
-      printf("edge_len_A[%d]=%ld\n",i,edge_len_A[i]);
+      printf("edge_len_A[%d]=%ld ",i,edge_len_A[i]);
     }
+    printf("\n");
     for (i=0; i<order_B; i++){
-      printf("edge_len_B[%d]=%ld\n",i,edge_len_B[i]);
+      printf("edge_len_B[%d]=%ld ",i,edge_len_B[i]);
     }
+    printf("\n");
     for (i=0; i<order_C; i++){
-      printf("edge_len_C[%d]=%ld\n",i,edge_len_C[i]);
+      printf("edge_len_C[%d]=%ld ",i,edge_len_C[i]);
     }
+    printf("\n");
     printf("is inner = %d\n", is_inner);
-    if (is_inner) printf("inner n = %ld m= %ld k = %ld l = %ld\n",
-                          inner_params.n, inner_params.m, inner_params.k, inner_params.l);
+    if (is_inner) printf("inner n = %ld m= %ld k = %ld l = %ld, ta = %c, tb =%c, tc = %c\n",
+                          inner_params.n, inner_params.m, inner_params.k, inner_params.l, inner_params.tA, inner_params.tB, inner_params.tC);
   }
 
   seq_tsr_ctr::seq_tsr_ctr(ctr * other) : ctr(other) {
@@ -436,7 +439,13 @@ namespace CTF_int {
   double seq_tsr_ctr::est_time_fp(int nlyr){
     //return COST_MEMBW*(size_A+size_B+size_C)+COST_FLOP*flops;
     double ps[] = {1.0, (double)est_membw(), est_fp()};
-//    printf("time estimate is %lf\n", seq_tsr_ctr_mdl.est_time(ps));
+    // incorperating the fact that dgemm with small k-edges is less effective
+    double k(inner_params.k);
+    double fac;
+    fac = std::max(1.0, 70/(k*0.3+5.0));
+
+//    printf("%d %d %d:time estimate is %lf\n",
+//           inner_params.m, inner_params.n, inner_params.k,  seq_tsr_ctr_mdl_inr.est_time(ps)*fac);
     if (is_custom && !is_inner){
       return seq_tsr_ctr_mdl_cst.est_time(ps);
     } else if (is_inner){
@@ -492,6 +501,7 @@ namespace CTF_int {
 
     if (!sr) return;
 #endif
+
     if (is_custom && !is_inner){
       double st_time = MPI_Wtime();
       ASSERT(is_inner == 0);
diff --git a/src/interface/common.cxx b/src/interface/common.cxx
index ed7ffc07..228255f2 100644
--- a/src/interface/common.cxx
+++ b/src/interface/common.cxx
@@ -265,21 +265,29 @@ namespace CTF_int {
   }
 
   CommData::CommData(CommData const & other){
-    cm      = other.cm;
-    alive   = other.alive;
-    rank    = other.rank;
-    np      = other.np;
-    color   = other.color;
-    created = 0;
+    cm            = other.cm;
+    alive         = other.alive;
+    rank          = other.rank;
+    np            = other.np;
+    color         = other.color;
+    global_rank   = other.global_rank;
+    node_id       = other.node_id;
+    num_nodes     = other.num_nodes;
+    intra_node_np = other.intra_node_np;
+    created       = 0;
   }
 
   CommData& CommData::operator=(CommData const & other){
-    cm      = other.cm;
-    alive   = other.alive;
-    rank    = other.rank;
-    np      = other.np;
-    color   = other.color;
-    created = 0;
+    cm            = other.cm;
+    alive         = other.alive;
+    rank          = other.rank;
+    np            = other.np;
+    color         = other.color;
+    global_rank   = other.global_rank;
+    node_id       = other.node_id;
+    num_nodes     = other.num_nodes;
+    intra_node_np = other.intra_node_np;
+    created       = 0;
     return *this;
   }
 
@@ -288,16 +296,23 @@ namespace CTF_int {
     cm = cm_;
     MPI_Comm_rank(cm, &rank);
     MPI_Comm_size(cm, &np);
+    MPI_Comm_rank(MPI_COMM_WORLD,&global_rank);
+    intra_node_np = 0;
     alive = 1;
     created = 0;
   }
 
-  CommData::CommData(int rank_, int color_, int np_){
-    rank    = rank_;
-    color   = color_;
-    np      = np_;
-    alive   = 0;
-    created = 0;
+  CommData::CommData(
+    int rank_, int color_, int np_, int num_nodes_, int global_rank_, int intra_node_np_
+  ){
+    rank          = rank_;
+    color         = color_;
+    np            = np_;
+    num_nodes     = num_nodes_;
+    global_rank   = global_rank_;
+    intra_node_np = intra_node_np_;
+    alive         = 0;
+    created       = 0;
   }
 
   CommData::CommData(int rank_, int color_, CommData parent){
@@ -306,6 +321,8 @@ namespace CTF_int {
     ASSERT(parent.alive);
     MPI_Comm_split(parent.cm, color, rank_, &cm);
     MPI_Comm_size(cm, &np);
+    global_rank = parent.global_rank;
+    intra_node_np = 0;
     alive   = 1;
     created = 1;
   }
@@ -387,7 +404,10 @@ namespace CTF_int {
 #ifdef TUNE
     double st_time = MPI_Wtime();
 #endif
+    TAU_FSTART(bcast);
     MPI_Bcast(buf, count, mdtype, root, cm);
+    MPI_Barrier(cm);
+    TAU_FSTOP(bcast);
 #ifdef TUNE
     MPI_Barrier(cm);
     double exe_time = MPI_Wtime()-st_time;
@@ -571,6 +591,110 @@ namespace CTF_int {
     alltoallv_mdl.observe(tps);
   }
 
+
+  CommGrid::CommGrid(ipair _rGrid, int _nNodes){
+    // nRanks must be set before getNodeGrid runs, which reads it
+    rGrid  = _rGrid;
+    nRanks = rGrid.first*rGrid.second;
+    colorKey.resize(nRanks);
+    nGrid  = getNodeGrid(_nNodes, rGrid);
+    iGrid  = { rGrid.first / nGrid.first, rGrid.second / nGrid.second };
+    assert(colorKey.size() == iGrid.first*iGrid.second*_nNodes);
+  }
+
+  ipair CommGrid::getNodeGrid(int nNodes, ipair rGrid){ // splits nNodes into a 2D node grid
+    ipair nGrid({1, 1});
+    std::vector<int> facNodes(CommGrid::factorize(nNodes));
+    std::vector<int> facrgf(CommGrid::factorize(rGrid.first));
+    std::vector<int> facrgs(CommGrid::factorize(rGrid.second));
+    std::vector<int> diff;
+    // Step 1:
+    // We select all prime factors of #nodes
+    // which do not occur in the prime factors of a grid edge,
+    // remove these factors and assign them to the opposite grid edge.
+    // (std::set_difference works on the sorted lists as multisets)
+    std::set_difference( facNodes.begin(), facNodes.end()
+                       , facrgf.begin(), facrgf.end()
+                       , std::back_inserter(diff)
+                       );
+    // node factors missing from rGrid.first leave the pool and go to nGrid.second
+   for (auto d: diff)
+      facNodes.erase(std::find(facNodes.begin(), facNodes.end(), d));
+
+    nGrid.second =
+      std::accumulate(diff.begin(), diff.end(), 1, std::multiplies<int>());
+    diff.resize(0);
+    // same for the remaining node factors that are missing from rGrid.second
+    std::set_difference( facNodes.begin(), facNodes.end()
+                       , facrgs.begin(), facrgs.end()
+                       , std::back_inserter(diff)
+                       );
+    for (auto d: diff)
+      facNodes.erase(std::find(facNodes.begin(), facNodes.end(), d));
+
+    nGrid.first =
+      std::accumulate(diff.begin(), diff.end(), 1, std::multiplies<int>());
+
+    // if there is no element left, all prime factors are distributed
+    if (!facNodes.size()) return nGrid;
+    // Step 2: try every split of the remaining prime factors over the two
+    // edges (one bit of i per factor, see getSquare) and keep the best one
+    double minVal(DBL_MAX);
+    ipair bestPair;
+    for (int i(0); i < pow(2, facNodes.size()); i++){
+      ipair edges(CommGrid::getSquare(i, facNodes));
+      // first/second are the candidate node-grid edges for this split.
+      // Only candidates whose edges both divide nRanks are considered;
+      // among them the one with the smallest 1/first + 1/second wins.
+      // NOTE(review): the test is against nRanks, not against the rGrid edges - confirm
+      int first(edges.first*nGrid.first);
+      int second(edges.second*nGrid.second);
+      if ( (nRanks/first)*first != nRanks) continue;
+      if ( (nRanks/second)*second != nRanks) continue;
+
+      double val(1.0/(double)first  + 1.0/(double)second);
+      if ( minVal > val ){
+        minVal = val;
+        bestPair = {edges.first, edges.second};
+      }
+    }
+    nGrid.first  *= bestPair.first; // NOTE(review): bestPair is still {0,0} if no candidate passed
+    nGrid.second *= bestPair.second;
+    return nGrid;
+  }
+
+  std::vector<int> CommGrid::factorize(int number){
+    // prime factors in ascending order, with multiplicity; a number below 4
+    // (this includes 0, 1 and negatives) is returned as the only entry
+    if (number < 4) return std::vector<int>(1, number);
+    std::vector<int> primes;
+    int rest(number);
+    // plain trial division: smaller primes are already divided out
+    for (int div(2); rest > 1; div++){
+      while (rest % div == 0){
+        primes.push_back(div);
+        rest /= div;
+      }
+    }
+    return primes;
+  }
+
+  ipair CommGrid::getSquare(int id, std::vector<int> factors) {
+    // id is read as a bit mask: factors[k] goes to the first edge when bit k
+    // of id is set; the second edge is the product of all factors divided
+    // by the first edge
+    const int total = std::accumulate(
+      factors.begin(), factors.end(), 1, std::multiplies<double>()
+    );
+    int first = 1;
+    for (int pos = 0; (id >> pos) > 0; pos++) {
+      if ((id >> pos) & 1) first *= factors[pos];
+    }
+    return ipair(first, total / first);
+  }
+
+
+
   char * get_default_inds(int order, int start_index){
     char * inds = (char*)CTF_int::alloc(order*sizeof(char));
     for (int i=0; i<order; i++){
@@ -611,6 +735,7 @@ namespace CTF_int {
       lda *= lens[i];
     }
   }
+
 /*
 #define USE_CUST_DBL_CMPLX 0
 
@@ -658,4 +783,5 @@ namespace CTF_int {
     return is_new;
   }
 
+
 }
diff --git a/src/interface/common.h b/src/interface/common.h
index e46d5f70..a163b014 100644
--- a/src/interface/common.h
+++ b/src/interface/common.h
@@ -14,6 +14,7 @@
 #include <iostream>
 #include <limits.h>
 #include <random>
+#include <cfloat>
 
 #include "../shared/model.h"
 
@@ -134,21 +135,38 @@ namespace CTF_int {
   // accumulates computed flops (targeted for internal use)
   void add_computed_flops(int64_t n);
 
+  void set_save_glb_comm(MPI_Comm gcm);
+
   // get computed flops
   int64_t get_computed_flops();
 
   // accumulates computed flops (targeted for internal use)
   void add_estimated_flops(int64_t n);
 
+  // wrapper of MPI communicator
   class CommData {
     public:
+      // MPI communicator
       MPI_Comm cm;
+      // number of processors
       int np;
+      // rank of processor
       int rank;
+      // color of subcommunicator cm relative to some parent commmunicator, if provided
       int color;
+      // 1 if this communicator is active (MPI_Comm is created and not finalized)
       int alive;
+      // 1 if this object created a communicator that needs to be finalized (as opposed to being an alias to a different communicator object)
       int created;
-  
+      // intra_node_np, number of processes per node (intra-node grid dimension) corresponding to this communicator, if provided, 1 otherwise
+      int intra_node_np;
+      // global rank
+      int global_rank;
+      // node id
+      int node_id;
+      // number of distinct nodes in the communicator
+      int num_nodes;
+
       CommData();
       ~CommData();
 
@@ -167,8 +185,9 @@ namespace CTF_int {
        * \param[in] rank rank within this comm
        * \param[in] color identifier of comm within parent
        * \param[in] np number of processors within this comm
+       * \param[in] intra_node_np number of processors per physical node
        */
-      CommData(int rank, int color, int np);
+      CommData(int rank, int color, int np, int num_nodes, int glbRank,  int intra_node_np=0);
 
       /**
        * \brief create active subcomm from parent comm which must be active
@@ -241,6 +260,24 @@ namespace CTF_int {
 
   };
 
+  using ipair = std::pair<int,int>;
+  struct CommGrid { // splits a 2D rank grid into a grid of nodes and a grid inside each node
+    CommGrid(){}; // NOTE(review): leaves all members uninitialized
+    ~CommGrid(){};
+    CommGrid(ipair _rGrid, int _nNodes); // computes nRanks, nGrid and iGrid from the rank grid and node count
+    // nRanks is rGrid.first*rGrid.second; colorKey is resized to nRanks entries by that constructor
+    int nRanks;
+    std::vector<ipair> colorKey;
+    ipair rGrid; // RankGrid: given by the user
+    ipair nGrid; // NodeGrid: output, grid of nodes
+    ipair iGrid; // intraNodeGrid: rGrid divided edge-wise by nGrid, the grid of the ranks of one node
+
+    ipair getNodeGrid(int nNodes, ipair rGrid); // node grid for nNodes nodes; reads nRanks
+    std::vector<int> factorize(int number); // prime factors of number, ascending
+    ipair getSquare(int id, std::vector<int> factors); // split of factors into two edge lengths, selected by the bits of id
+  };
+
+
   int  alloc_ptr(int64_t len, void ** const ptr);
   int  mst_alloc_ptr(int64_t len, void ** const ptr);
   void * alloc(int64_t len);
diff --git a/src/interface/tensor.cxx b/src/interface/tensor.cxx
index 3ae791f5..aba43748 100644
--- a/src/interface/tensor.cxx
+++ b/src/interface/tensor.cxx
@@ -1563,6 +1563,7 @@ NORM_INFTY_INST(double)
       IASSERT(0);
       return;
     }
+    if (T.wrld->dryRanks) return;
     for (int64_t i=0; i<T.size; i++){
       ((dtype*)T.data)[i] = ((dtype)((rtype)CTF_int::get_rand48()*(rmax-rmin)))+rmin;
     }
diff --git a/src/interface/world.cxx b/src/interface/world.cxx
index b81b8736..bc052c63 100644
--- a/src/interface/world.cxx
+++ b/src/interface/world.cxx
@@ -71,9 +71,11 @@ namespace CTF {
 
 
   World::World(MPI_Comm       comm_,
+               int            ppn_,
                int            argc,
                char * const * argv){
     comm = comm_;
+    ppn  = ppn_;
 #ifdef BGQ
     this->init(comm, TOPOLOGY_BGQ, argc, argv);
 #else
@@ -85,6 +87,15 @@ namespace CTF {
 #endif
   }
 
+  World::World(std::string print, int dryRanks_, int ppn_){
+    // dry world: spans MPI_COMM_WORLD and is initialized with the generic topology
+    ppn      = ppn_;
+    dryRanks = dryRanks_;
+    comm     = MPI_COMM_WORLD;
+    if (print.compare("high") == 0) verbose = 2; // any other value keeps the current verbosity
+    this->init(comm, TOPOLOGY_GENERIC);
+  }
+
 
   World::World(int             order, 
                int const *     lens, 
@@ -97,6 +108,7 @@ namespace CTF {
 
   World::World(World const & other){
     comm        = other.comm;
+    ppn         = other.ppn;
 #if DEBUG >= 1
     if (other.rank == 0){
       printf("CTF WARNING: Creating copy of World, which is not free or useful, pass original World by reference instead if possible.\n");
@@ -164,6 +176,8 @@ namespace CTF {
                   int             argc,
                   const char * const *  argv){
     cdt = CommData(comm);
+    if (dryRanks) cdt.np = dryRanks;
+
     if (mach == TOPOLOGY_GENERIC)
       phys_topology = NULL;
     else
@@ -187,7 +201,7 @@ namespace CTF {
 
   int World::initialize(int                   argc,
                         const char * const *  argv){
-    char * mem_size, * ppn;
+    char * mem_size, * cppn;
     if (comm == MPI_COMM_WORLD && universe_exists){
       delete phys_topology;
       *this = universe;
@@ -262,16 +276,16 @@ namespace CTF {
                     imem_size);
         CTF_int::set_mem_size(imem_size);
       }
-      ppn = getenv("CTF_PPN");
-      if (ppn != NULL){
+      cppn = getenv("CTF_PPN");
+      if (cppn != NULL){
         if (rank == 0)
           printf("Assuming %d processes per node due to CTF_PPN environment variable\n",
-                    atoi(ppn));
-        ASSERT(atoi(ppn)>=1);
+                    atoi(cppn));
+        ASSERT(atoi(cppn)>=1);
   #ifdef BGQ
         CTF_int::set_memcap(.75);
   #else
-        CTF_int::set_memcap(.75/atof(ppn));
+        CTF_int::set_memcap(.75/atof(cppn));
   #endif
       }
       if (rank == 0)
diff --git a/src/interface/world.h b/src/interface/world.h
index 622b27d4..39fef3db 100644
--- a/src/interface/world.h
+++ b/src/interface/world.h
@@ -24,6 +24,12 @@ namespace CTF {
       int rank;
       /** \brief number of processors */
       int np;
+      /** \brief number of processors per node (optional / can be 1)*/
+      int ppn = 0;
+      /** \brief set dryRun */
+      int dryRanks = 0;
+      /** \brief verbosity of dryRun */
+      int verbose = 1;
       /** \brief derived topologies */
       std::vector< CTF_int::topology* > topovec;
       /** \brief whether the world has been initialized */
@@ -63,6 +69,7 @@ namespace CTF {
        * \param[in] argv main arguments 
        */
       World(MPI_Comm       comm = MPI_COMM_WORLD,
+            int            ppn  = 1,
             int            argc = 0,
             char * const * argv = NULL);
 
@@ -86,6 +93,13 @@ namespace CTF {
        */
       World(char const * emptystring);
 
+      /**
+       * \brief constructor for a dry world
+       * \param[in] print determines how to handle output
+       * \param[in] dryRanks number of dry ranks
+      */
+
+      World(std::string print, int dryRanks, int ppn = 1);
 
       /**
        * \brief frees CTF library
diff --git a/src/mapping/Makefile b/src/mapping/Makefile
index d5c66a28..a609849c 100644
--- a/src/mapping/Makefile
+++ b/src/mapping/Makefile
@@ -1,10 +1,10 @@
-LOBJS = mapping.o distribution.o topology.o
+LOBJS = mapping.o distribution.o topology.o node_aware_dist.o
 OBJS = $(addprefix $(ODIR)/, $(LOBJS))
 
 ctf: $(OBJS) 
 
 #%d | r ! grep -ho "\.\..*\.h" *.cxx *.h | sort | uniq
-HDRS = ../../Makefile $(BDIR)/config.mk  ../interface/common.h ../mapping/mapping.h ../shared/util.h ../summation/sum_tsr.h ../tensor/untyped_tensor.h 
+HDRS = ../../Makefile $(BDIR)/config.mk  ../interface/common.h ../mapping/mapping.h ../mapping/node_aware_dist.h ../shared/util.h ../summation/sum_tsr.h ../tensor/untyped_tensor.h
 
 $(OBJS): $(ODIR)/%.o: %.cxx *.h  $(HDRS)
 	$(FCXX) -c $< -o $@
diff --git a/src/mapping/node_aware_dist.cxx b/src/mapping/node_aware_dist.cxx
new file mode 100644
index 00000000..f67e22b3
--- /dev/null
+++ b/src/mapping/node_aware_dist.cxx
@@ -0,0 +1,210 @@
+/* The code in this file has been written by Andreas Irmler. */
+
+#include "../tensor/untyped_tensor.h"
+#include "../shared/util.h"
+#include "node_aware_dist.h"
+using ivec  = std::vector<int>;
+using vivec = std::vector<ivec>;
+
+
+namespace CTF_int {
+
+
+  // One node of the search tree built by get_inter_node_grids: a partial
+  // distribution of node prime factors over the edges of the rank grid.
+  // All nodes created from the same number of placed factors share one order.
+  struct Tree {
+
+    // No user-defined copy constructor: the implicit copy/move operations
+    // handle exactly these three members (Rule of Zero), and std::vector
+    // can move nodes instead of copying them when it reallocates.
+
+    // Constructor 1: node with an explicit layer index and factor tables
+    Tree(int _order, vivec _sgf, vivec _ogf)
+      : order(_order), sgf(_sgf), ogf(_ogf) {}
+
+    // Constructor 2: child of t, one layer deeper, in which factor el has
+    // been moved from the open factors of edge pos to its settled factors
+    Tree(Tree const &t, int pos, int el)
+      : order(t.order + 1), sgf(t.sgf), ogf(t.ogf) {
+      assert(pos >= 0); // keeps the size comparisons below sign-safe
+      assert(sgf.size() > (size_t)pos);
+      assert(ogf.size() > (size_t)pos);
+      sgf[pos].push_back(el);
+      // kept sorted so that equal assignments compare equal with ==
+      std::sort(sgf[pos].begin(), sgf[pos].end());
+      auto it = std::find(ogf[pos].begin(), ogf[pos].end(), el);
+      assert(it != ogf[pos].end()); // el must still be open on this edge
+      ogf[pos].erase(it);
+    }
+
+    // true if factor el is still open (unassigned) on edge pos;
+    // an out-of-range pos is reported and trips an assert
+    bool find(int pos, int el) const {
+      if (pos < 0 || ogf.size() <= (size_t)pos) {
+        printf("Find problem! order %d, size: %ld, pos: %d, el: %d\n"
+              , order, ogf.size(), pos, el);
+        assert(0);
+      }
+      auto it = std::find(ogf[pos].begin(), ogf[pos].end(), el);
+      if (it == ogf[pos].end()) return false;
+      return true;
+    }
+
+    int order; // layer of the tree: number of factors placed through constructor 2
+    vivec sgf; // settled grid factors, i.e. factors which are already assigned
+    vivec ogf; // open grid factors, i.e. factors which can still be assigned
+  };
+
+
+  // return the prime factors of number, ascending and with multiplicity;
+  // a number below 4 (incl. 0, 1, negatives) is returned as the only entry
+  ivec iv_factorize(int number){
+    if (number < 4) return ivec(1, number);
+    ivec primes;
+    int rest(number);
+    // plain trial division: smaller primes are already divided out
+    for (int div(2); rest > 1; div++){
+      while (rest % div == 0){
+        primes.push_back(div);
+        rest /= div;
+      }
+    }
+    return primes;
+  }
+
+
+  // parse a comma-separated list of integers, e.g. "2,3,4", into a vector
+  ivec lineToVint(std::string line) {
+    ivec values;
+    size_t start = 0;
+    for (size_t comma = line.find(","); comma != std::string::npos;
+         comma = line.find(",", start)) {
+      values.push_back(std::stoi(line.substr(start, comma - start)));
+      start = comma + 1;
+    }
+    values.push_back(std::stoi(line.substr(start)));
+    return values;
+  }
+
+
+  std::vector< std::vector<int> > get_inter_node_grids(std::vector<int> rGrid, int nodes){
+    int ranks(std::accumulate(rGrid.begin(), rGrid.end(), 1, std::multiplies<int>()));
+    int ranksPerNode(ranks/nodes);
+    IASSERT (ranksPerNode*nodes == ranks ); // nodes must divide the total number of ranks
+
+	  vivec nodeGrid; // per edge: node factors fixed on that edge ({1} if none)
+    const ivec nodeFactors(iv_factorize(nodes));
+    const ivec rankFactors(iv_factorize(ranks));
+    vivec gridFactors; // the tensor grid expressed in prime factors, one list per edge
+    ivec assignedFactors; // node factors which are already assigned to an edge
+    ivec openFactors; // node factors which are not assigned yet
+    for (auto r: rGrid) {
+      gridFactors.push_back(iv_factorize(r));
+    }
+    vivec openGridFactors; // per edge: grid factors not used up by an assigned node factor
+    // Phase 1: node factors that can only live on one particular edge
+    for (auto gf: gridFactors){
+
+      ivec others, diff;
+      // all prime factors of the rank count which are not at the given edge
+      std::set_difference( rankFactors.begin()
+                        , rankFactors.end()
+                        , gf.begin()
+                        , gf.end()
+                        , std::back_inserter(others)
+                        );
+      // The set_difference calls treat the sorted factor lists as multisets:
+      // an element that occurs m times in the first range and k times in the
+      // second one is written max(m-k, 0) times to the output.
+      // iv_factorize emits its factors in ascending order, which is the
+      // ordering these calls rely on.
+      //
+      // is there a node factor which lives only on a given edge?
+      // if so assign this factor to this edge
+      std::set_difference( nodeFactors.begin()
+                        , nodeFactors.end()
+                        , others.begin()
+                        , others.end()
+                        , std::back_inserter(diff)
+                        );
+      assignedFactors.insert(assignedFactors.end(), diff.begin(), diff.end());
+      // what is left of this edge after removing the factors just assigned
+      openGridFactors.resize(openGridFactors.size()+1);
+      std::set_difference( gf.begin()
+                        , gf.end()
+                        , diff.begin()
+                        , diff.end()
+                        , std::back_inserter(openGridFactors.back())
+                        );
+      if (!diff.size()) diff.push_back(1);
+      nodeGrid.push_back(diff);
+
+    }
+    // Phase 2: the node factors that are still free to go to several edges
+    std::sort(assignedFactors.begin(), assignedFactors.end());
+    std::set_difference( nodeFactors.begin()
+                      , nodeFactors.end()
+                      , assignedFactors.begin()
+                      , assignedFactors.end()
+                      , std::back_inserter(openFactors)
+                      );
+    // The algorithm works as follows:
+    // 1.) we pick the last element of the list, remove it from the list,
+    //     then open N branches where N is the number of possible positions
+    //     for that element in the rank grid
+    // 2.) we remove identical branches
+    // 3.) we go to step 1
+
+    size_t b(0); // NOTE(review): unused, shadowed by the 'auto b' inside the loop below
+    size_t n(rGrid.size());
+    std::vector<Tree> treeVec;
+    treeVec.emplace_back(0, nodeGrid, openGridFactors);
+    // we loop over all open prime factors of the number of nodes
+    while (openFactors.size()){
+      // take the last element of the list and remove it from the list
+      auto f(openFactors.back());
+      openFactors.pop_back();
+
+      // we work only in the last layer of the tree
+      // we have to find the begin/end in the whole vector
+      auto o(treeVec.back().order);
+      auto b(std::distance( treeVec.begin()
+                          , std::find_if( treeVec.begin()
+                                        , treeVec.end()
+                                        , [o] (const Tree &a)
+                                          { return a.order == o;}
+                                        )
+                          ));
+
+      auto e(treeVec.size());
+      // loop over the last layer of the tree and distribute the
+      // element to all possible positions
+      // however: if a potential element is already in the list,
+      //          do not add it
+      for (size_t t(b); t < e; t++){
+        for (auto i(0); i < n; i++)
+        if ( treeVec[t].find(i, f) ){
+          bool distinct(true);
+          auto cand = Tree(treeVec[t], i, f);
+          for (size_t n(e); n < treeVec.size(); n++){ // NOTE(review): this n shadows the outer n
+            if (cand.sgf == treeVec[n].sgf) distinct = false;
+          }
+          if (distinct) treeVec.push_back(cand);
+        }
+      }
+    }
+    // result: for every tree of the last layer, the product of the settled factors of each edge
+    std::vector< std::vector<int> > inter_node_grids;
+    for (auto tv: treeVec) {
+      if (treeVec.back().order == tv.order) {
+        std::vector<int> sgf;
+        for (auto s: tv.sgf) {
+          sgf.push_back(std::accumulate(s.begin(), s.end(), 1, std::multiplies<int>()));
+        }
+        inter_node_grids.push_back(sgf);
+      }
+    }
+    return inter_node_grids;
+  }
+}
diff --git a/src/mapping/node_aware_dist.h b/src/mapping/node_aware_dist.h
new file mode 100644
index 00000000..41f09006
--- /dev/null
+++ b/src/mapping/node_aware_dist.h
@@ -0,0 +1,16 @@
+/*Copyright (c) 2022, Edgar Solomonik, all rights reserved.*/
+#ifndef __INT_NODE_DISTRIBUTION_H__
+#define __INT_NODE_DISTRIBUTION_H__
+
+#include <vector> // the declaration below names std::vector, keep this header self-contained
+
+namespace CTF_int {
+  /**
+   * \brief returns all valid choices of inter-node grids, given an overall processor grid and a number of nodes
+   * \param[in] rGrid overall processor grid
+   * \param[in] nodes number of nodes
+   * \return vector of inter-node processor grids, each of total size equal to the number of nodes and of the same dimension as rGrid, where each dimension divides the respective dimension of rGrid
+   */
+  std::vector<std::vector<int> > get_inter_node_grids(std::vector<int> rGrid, int nodes);
+}
+#endif
diff --git a/src/mapping/topology.cxx b/src/mapping/topology.cxx
index 02b6eae1..9a87418e 100644
--- a/src/mapping/topology.cxx
+++ b/src/mapping/topology.cxx
@@ -3,11 +3,14 @@
 #include "topology.h"
 #include "../shared/util.h"
 #include "../mapping/mapping.h"
+#include <vector>
 
 #ifdef BGQ
 #include "mpix.h"
 #endif
 
+using ipair = std::pair<int, int>;
+
 namespace CTF_int {
 /*
   topology::topology(){
@@ -17,7 +20,41 @@ namespace CTF_int {
     is_activated = false;
     dim_comm     = NULL;
   }*/
-  
+
+  /** \brief inverse of get_topo_reorder_rank() (for lda = prefix product of lens): maps a rank in the reordered torus back to the node-by-node ordered rank */
+  int get_inv_topo_reorder_rank(int order, int const * lens, int const * intra_node_lens, int new_rank){
+    int within_node = 0, which_node = 0;    // rank inside the node / index of the node, accumulated digit by digit
+    int within_stride = 1, node_stride = 1; // place value of the next intra-node / inter-node digit
+    int remaining = new_rank;
+    for (int d=0; d<order; d++){
+      int const coord = remaining % lens[d]; // torus coordinate along dimension d
+      within_node += (remaining % intra_node_lens[d]) * within_stride;
+      which_node  += (coord / intra_node_lens[d]) * node_stride;
+      remaining    = remaining / lens[d];
+      node_stride   *= lens[d] / intra_node_lens[d];
+      within_stride *= intra_node_lens[d];
+    }
+    return within_node + within_stride*which_node;
+  }
+
+  /** \brief maps a rank in node-by-node order to its rank in the torus whose dimension i is split as (lens[i]/intra_node_lens[i]) nodes x intra_node_lens[i] processes */
+  int get_topo_reorder_rank(int order, int const * lens, int const * lda, int const * intra_node_lens, int rank){
+    int procs_per_node = 1;
+    for (int d=0; d<order; d++) procs_per_node *= intra_node_lens[d];
+    // split the incoming rank into (position within its node, index of its node)
+    int within_node = rank % procs_per_node, which_node = rank / procs_per_node;
+    int reordered = 0;
+    for (int d=0; d<order; d++){
+      int const nodes_along_d = lens[d] / intra_node_lens[d];
+      int const node_coord  = which_node % nodes_along_d;
+      int const local_coord = within_node % intra_node_lens[d];
+      which_node  /= nodes_along_d;
+      within_node /= intra_node_lens[d];
+      reordered += (node_coord*intra_node_lens[d] + local_coord)*lda[d];
+    }
+    return reordered;
+  }
+
   topology::~topology(){
     deactivate();
     CTF_int::cdealloc(lens);
@@ -25,7 +62,7 @@ namespace CTF_int {
     CTF_int::cdealloc(dim_comm);
   }
 
-  topology::topology(topology const & other) : glb_comm(other.glb_comm) {
+  topology::topology(topology const & other) : glb_comm(other.glb_comm), unord_glb_comm(other.unord_glb_comm) {
     order        = other.order;
 
     lens         = (int*)CTF_int::alloc(order*sizeof(int));
@@ -40,32 +77,105 @@ namespace CTF_int {
     }
 
     is_activated = other.is_activated;
+    is_reordered = other.is_reordered;
+  }
+
+  void topology::morph_to(topology const & other){
+    // the lens/lda/dim_comm arrays of this topology are reused in place, so both topologies need the same order
+    ASSERT(order == other.order);
+    ASSERT(!is_reordered || !other.is_reordered);
+
+    // scalar state and the two global communicators
+    is_activated   = other.is_activated;
+    is_reordered   = other.is_reordered;
+    glb_comm       = other.glb_comm;
+    unord_glb_comm = other.unord_glb_comm;
+
+    // grid shape, then the per-dimension communicators (swapping out CommData objects pointed to elsewhere)
+    memcpy(lens, other.lens, order*sizeof(int));
+    memcpy(lda, other.lda, order*sizeof(int));
+    for (int d=0; d<order; d++) dim_comm[d] = CommData(other.dim_comm[d]);
+  }
 
   topology::topology(int         order_,
                      int const * lens_,
                      CommData    cdt,
-                     bool        activate) : glb_comm(cdt) {
+                     bool        activate,
+                     int const * intra_node_lens) : unord_glb_comm(cdt), glb_comm(cdt) {
     order        = order_;
     lens         = (int*)CTF_int::alloc(order_*sizeof(int));
     lda          = (int*)CTF_int::alloc(order_*sizeof(int));
     dim_comm     = (CommData*)CTF_int::alloc(order_*sizeof(CommData));
     is_activated = false;
-   
+
     memcpy(lens, lens_, order_*sizeof(int));
     //reverse FIXME: this is assumed somewhere...
 //    for (int i=0; i<order; i++){
 //      lens[i] = lens_[order-i-1];
 //    }
- 
+
+    lda[0] = 1;
+    for (int i = 1; i < order; i++) {
+      lda[i] = lda[i-1] * lens[i-1];
+    }
+
+    if (intra_node_lens == NULL){
+      is_reordered = false;
+      //glb_comm = cdt;
+    } else {
+      int new_rank = get_topo_reorder_rank(order, lens, lda, intra_node_lens, cdt.rank);
+      is_reordered = true;
+      glb_comm = CommData(new_rank, 0, cdt.np, cdt.num_nodes, cdt.global_rank);
+    }
     int stride = 1, cut = 0;
     int rank = glb_comm.rank;
+    std::vector<int> num_nodes(order);
+    std::vector< std::vector<ipair> > como(order, std::vector<ipair> (cdt.np));
+    for (int r(0); r < cdt.np; r++){
+      int stride =1, cut = 0;
+      for (size_t i=0; i<order; i++){
+        como[i][r] = {(((r/(stride*lens[i]))*stride)+cut), r/72};
+        stride*=lens[i];
+        cut = (r - (r/stride)*stride);
+      }
+    }
+    // sort for the same color
+    for (auto &c: como) std::sort(c.begin(), c.end());
+    for (int i=0; i< order; i++){
+      std::vector<ipair> sameColor;
+      std::copy_if( como[i].begin()
+                  , como[i].end()
+                  , std::back_inserter(sameColor)
+                  , [](ipair &a){ return a.first == 0;}
+                  );
+      std::sort( sameColor.begin()
+               , sameColor.end()
+               , [](ipair &a, ipair &b){return a.second < b.second;}
+               );
+      num_nodes[i] = std::distance( sameColor.begin()
+                                  , std::unique( sameColor.begin()
+                                               , sameColor.end()
+                                               , [](ipair &a, ipair &b)
+                                                 { return a.second == b.second;}
+                                               )
+                                  );
+    }
+
     for (int i=0; i<order; i++){
       lda[i] = stride;
-      dim_comm[i] = CommData(((rank/stride)%lens[i]),
-                             (((rank/(stride*lens[i]))*stride)+cut),
-                             lens[i]);
-//      SETUP_SUB_COMM_SHELL(cdt, dim_comm[i],
+      if (intra_node_lens == NULL)
+        dim_comm[i] = CommData(((rank/stride)%lens[i]),
+                               (((rank/(stride*lens[i]))*stride)+cut),
+                               lens[i],
+                               num_nodes[i],
+                               rank);
+      else
+        dim_comm[i] = CommData(((rank/stride)%lens[i]),
+                               (((rank/(stride*lens[i]))*stride)+cut),
+                               lens[i],
+                               lens[i] / intra_node_lens[i],
+                               rank,
+                               intra_node_lens[i]);
       stride*=lens[i];
       cut = (rank - (rank/stride)*stride);
     }
@@ -75,10 +185,11 @@ namespace CTF_int {
 
   void topology::activate(){
     if (!is_activated){
+      if (is_reordered) glb_comm.activate(unord_glb_comm.cm);
       for (int i=0; i<order; i++){
         dim_comm[i].activate(glb_comm.cm);
       }
-    } 
+    }
     is_activated = true;
   }
 
@@ -87,7 +198,8 @@ namespace CTF_int {
       for (int i=0; i<order; i++){
         dim_comm[i].deactivate();
       }
-    } 
+      if (is_reordered) glb_comm.deactivate();
+    }
     is_activated = false;
   }
 
@@ -141,7 +253,7 @@ namespace CTF_int {
         topo = new topology(dim, topo_dims, glb_comm, 1);
         CTF_int::cdealloc(topo_dims);
         return topo;
-      } else 
+      } else
       #endif
       {
         int order;
@@ -397,34 +509,36 @@ namespace CTF_int {
     }
   }
 
-  /** 
+  /**
    * \brief computes all unique factorizations into non-primes each yielding a topology, prepending additional factors as specified
    * \param[in] cdt global communicator
    * \param[in] n_uf number of unique prime factors
    * \param[in] uniq_fact list of prime factors
    * \param[in] n_prepend number of factors to prepend
-   * \param[in] mults ? 
+   * \param[in] mults multiplicities of each factor
    * \param[in] prelens factors to prepend
    * \return lens vector of factorizations
    */
-  std::vector< topology* > get_all_topos(CommData cdt, int n_uf, int const * uniq_fact, int const * mults, int n_prepend, int const * prelens){
-    std::vector<topology*> topos;
+  std::vector< std::vector<int>* > get_all_shapes_rec(int n_uf, int const * uniq_fact, int const * mults, int n_prepend, int const * prelens){
+    std::vector< std::vector<int>* > shapes;
 
+    // enumerate the number of different possible numbers (including 1) that divide (with remainder 0) the number of processors
     int num_divisors = 1;
     for (int i=0; i<n_uf; i++){
       num_divisors *= (1+mults[i]);
       ASSERT(num_divisors < 1E6);
     }
-    
+
     if (num_divisors == 1){
-      topos.push_back(new topology(n_prepend, prelens, cdt));
-      return topos;
+      shapes.push_back(new std::vector<int>(prelens,prelens+n_prepend));
+      return shapes;
     }
     int sub_mults[n_uf];
     int new_prelens[n_prepend+1];
     memcpy(new_prelens, prelens, n_prepend*sizeof(int));
     //FIXME: load may be highly imbalanced
     //for (int div=cdt.rank; div<num_divisors; div+=cdt.np)
+    //iterate through all possible divisors
     for (int div=1; div<num_divisors; div++){
       //memcpy(sub_mults, mults, n_uf*sizeof(int));
       int dmults[n_uf];
@@ -437,37 +551,40 @@ namespace CTF_int {
         len0 *= std::pow(uniq_fact[i], dmults[i]);
       }
       new_prelens[n_prepend] = len0;
-      std::vector< topology* > new_topos = get_all_topos(cdt, n_uf, uniq_fact, sub_mults, n_prepend+1, new_prelens);
+      std::vector< std::vector<int>* > new_shapes = get_all_shapes_rec(n_uf, uniq_fact, sub_mults, n_prepend+1, new_prelens);
       //FIXME call some append function?
-      for (unsigned i=0; i<new_topos.size(); i++){
-        topos.push_back(new_topos[i]);
+      for (unsigned i=0; i<new_shapes.size(); i++){
+        shapes.push_back(new_shapes[i]);
       }
     }
-    return topos;
+    return shapes;
   }
 
-  std::vector< topology* > get_generic_topovec(CommData cdt){
-    std::vector<topology*> topovec;
-
+  /**
+   * \brief generate all possible factorizations of size into divisors
+   * \param[in] size total that the numbers of each factorization should multiply to
+   * \return all possible collections of natural numbers that multiply to size (excluding 1s)
+   */
+  std::vector< std::vector<int>* > get_all_shapes(int size){
     int nfact, * factors;
-    factorize(cdt.np, &nfact, &factors);
+    factorize(size, &nfact, &factors);
     if (nfact <= 1){
-      topovec.push_back(new topology(nfact, factors, cdt));
-      if (cdt.np >= 7 && cdt.rank == 0) 
-        DPRINTF(1,"CTF WARNING: using a world with a prime number of processors may lead to very bad performance\n");
+      std::vector<std::vector<int>*> shapes;
+      shapes.push_back(new std::vector<int>(factors, factors+nfact));
       if (nfact > 0) cdealloc(factors);
-      return topovec;
+      return shapes;
     }
     std::sort(factors,factors+nfact);
+    //compute number of unique factors
     int n_uf = 1;
     assert(factors[0] != 1);
     for (int i=1; i<nfact; i++){
       if (factors[i] != factors[i-1]) n_uf++;
     }
-    if (n_uf >= 3){
-      if (cdt.rank == 0) 
-        DPRINTF(1,"CTF WARNING: using a world with a number of processors that contains 3 or more unique prime factors may lead to suboptimal performance, when possible use p=2^k3^l processors for some k,l\n");
-    }
+    //if (n_uf >= 3){
+    //  if (cdt.rank == 0)
+    //    DPRINTF(1,"CTF WARNING: using a world with a number of processors that contains 3 or more unique prime factors may lead to suboptimal performance, when possible use p=2^k3^l processors for some k,l\n");
+    //}
     int uniq_fact[n_uf];
     int mults[n_uf];
     int i_uf = 0;
@@ -481,7 +598,30 @@ namespace CTF_int {
       } else mults[i_uf]++;
     }
     cdealloc(factors);
-    return get_all_topos(cdt, n_uf, uniq_fact, mults, 0, NULL);
+    std::vector< std::vector<int> * > shapes = get_all_shapes_rec(n_uf, uniq_fact, mults, 0, NULL);
+    return shapes;
+  }
+
+
+  std::vector< topology* > create_topos_from_shapes(std::vector< std::vector<int>* > shapes, CommData cdt){
+    // builds one topology on cdt per shape (constructor default: not activated); the shape vectors are only read, not freed
+    std::vector< topology* > topos;
+    // data() instead of &operator[](0): a shape may be empty (get_all_shapes yields one when nfact is 0) and must not be indexed
+    for (std::vector<int> const * shape : shapes) topos.push_back(new topology((int)shape->size(), shape->data(), cdt));
+    return topos;
+  }
+
+  std::vector< topology* > get_generic_topovec(CommData cdt){
+    // enumerate every factorization of the processor count and build one topology for each
+    auto shapes = get_all_shapes(cdt.np);
+    auto topos  = create_topos_from_shapes(shapes, cdt);
+    // the shape vectors were only needed to construct the topologies
+    for (std::vector<int> * shape : shapes) delete shape;
+
+    bool const single_shape = (shapes.size() == 1); // the pointers are freed, but the count is still valid
+    if (single_shape && cdt.np >= 7 && cdt.rank == 0)
+      DPRINTF(1,"CTF WARNING: using a world with a prime number of processors may lead to very bad performance\n");
+    return topos;
   }
 
 
@@ -493,7 +633,7 @@ namespace CTF_int {
     bool changed;
     /*int i=0;
     do {
-      for (int j=0; j< perm_vec[i]->order; 
+      for (int j=0; j< perm_vec[i]->order;
     } while(i<perm_vec.size();*/
     do {
 //      printf("HERE %d %d %d %d\n",perm_vec[0]->order, perm_vec.size(), perm_vec[0]->lens[0], perm_vec[0]->lens[1]);
@@ -538,9 +678,9 @@ namespace CTF_int {
                                       CommData         glb_comm){
     std::vector< topology* > topos;
     topos.push_back(new topology(*topo));
-    
+
     if (topo->order <= 1) return topos;
-    
+
     int * new_lens = (int*)alloc(sizeof(int)*topo->order-1);
 
     for (int i=0; i<topo->order-1; i++){
@@ -567,12 +707,12 @@ namespace CTF_int {
     }
     return topos;
   }
-    
+
   int find_topology(topology const *           topo,
                     std::vector< topology* > & topovec){
     int i, j, found;
     std::vector< topology* >::iterator iter;
-    
+
     found = -1;
     for (j=0, iter=topovec.begin(); iter!=topovec.end(); iter++, j++){
       if ((*iter)->order == topo->order){
@@ -585,7 +725,7 @@ namespace CTF_int {
       }
       if (found != -1) return found;
     }
-    return -1;  
+    return -1;
   }
 
   int get_best_topo(int64_t  nvirt,
@@ -638,8 +778,8 @@ namespace CTF_int {
     CommData *   sub_phys_comm;
     int * comm_idx;
     mapping const * map;
-    memset(phys_mapped, 0, topo->order*sizeof(int));  
-    
+    memset(phys_mapped, 0, topo->order*sizeof(int));
+
     num_sub_phys_dims = 0;
 
     for (i=0; i<order_A; i++){
@@ -648,7 +788,7 @@ namespace CTF_int {
         phys_mapped[map->cdt] = 1;
         if (map->has_child) map = map->child;
         else break;
-      } 
+      }
     }
     for (i=0; i<order_B; i++){
       map = &edge_map_B[i];
@@ -656,7 +796,7 @@ namespace CTF_int {
         phys_mapped[map->cdt] = 1;
         if (map->has_child) map = map->child;
         else break;
-      } 
+      }
     }
 
     num_sub_phys_dims = 0;
@@ -680,7 +820,7 @@ namespace CTF_int {
 
   }
 
-  int can_morph(topology const * topo_keep, 
+  int can_morph(topology const * topo_keep,
                 topology const * topo_change){
     int i, j, lda;
     lda = 1;
@@ -715,7 +855,7 @@ namespace CTF_int {
           do {
             for (j=0; j<new_topo->order; j++){
               if (new_topo->lda[j] == old_lda) break;
-            } 
+            }
             ASSERT(j!=new_topo->order);
             new_rec_map->type   = PHYSICAL_MAP;
             new_rec_map->cdt    = j;
@@ -749,7 +889,7 @@ namespace CTF_int {
             break;
           }
         }
-        edge_map[i].clear();      
+        edge_map[i].clear();
         edge_map[i] = *new_map;
         CTF_int::cdealloc(new_map);
       }
diff --git a/src/mapping/topology.h b/src/mapping/topology.h
index 6194f690..d6f034ec 100644
--- a/src/mapping/topology.h
+++ b/src/mapping/topology.h
@@ -13,41 +13,78 @@ namespace CTF_int {
   /* \brief mesh/torus topology configuration */
   class topology {
     public:
+      // number of dimensions in torus
       int        order;
+      // lengths of dimensions
       int *      lens;
+      // lda[i] = lens[i-1] * ... * lens[0]
       int *      lda;
+      // global communicator is reordered if intra-node grid is provided
+      int        is_reordered;
+      // whether dim_comm communicators have been activated
       bool       is_activated;
+
+      // list of communicators along fibers of each dimension of torus
       CommData * dim_comm;
+      // global communicator, ordered as in torus given by dim_comm
       CommData   glb_comm;
+      // global communicator, ordered as given, assuming processors are ordered as [processes in node 1], [processes in node 2], etc.
+      CommData   unord_glb_comm;
 
       //topology();
       ~topology();
 
-      /** 
+      /**
        * \brief copy constructor
        * \param[in] other topology to copy
        */
       topology(topology const & other);
 
       /**
-       * \brief constructs torus topology 
+       * \brief overwrite this topology with communicators of another, without reallocating CommData objects, allowing to 'hot-swap' this topology for another, propagating the change through already created ctr objects
+       * \param[in] other topology to copy
+       */
+      void morph_to(topology const & other);
+
+
+      /**
+       * \brief constructs torus topology; if intra_node_lens is NULL, the p processors are folded into a torus, otherwise each set of prod(intra_node_lens) processors is mapped to different modes of the processor grid, e.g., if lens_ = [6,4] and intra_node_lens=[3,2] (6 processes per node), the processors are assigned as
+       * [[ 0  1  2  6  7  8 ],
+       *  [ 3  4  5  9  10 11],
+       *  [ 12 13 14 18 19 20],
+       *  [ 15 16 17 21 22 23]]
        * \param[in] order_ number of torus dimensions
        * \param[in] lens_ lengths of torus dimensions
-       * \param[in] cdt communicator for whole torus 
+       * \param[in] cdt communicator for whole torus
        * \param[in] activate whether to create MPI_Comms
+       * \param[in] intra_node_lens lengths of intra-node processor grid
        */
       topology(int         order_,
                int const * lens_,
                CommData    cdt,
-               bool        activate=false);
-     
-      /* \brief create (split off) MPI communicators, re-entrant */ 
+               bool        activate=false,
+               int const * intra_node_lens=NULL);
+
+      /* \brief create (split off) MPI communicators, re-entrant */
       void activate();
 
       /* \breif free MPI communicators, re-entrant */
       void deactivate();
   };
 
+  /**
+   * \brief determine this processors rank in the global communicator given by reordering nodes so that they adhere to the assignment described in the constructor of the topology() object, assuming initial order is node by node
+   * \param[in] order number of torus dimensions
+   * \param[in] lens lengths of torus dimensions
+   * \param[in] lda prefix product of lengths of torus dimensions
+   * \param[in] intra_node_lens lengths of intra-node processor grid
+   * \param[in] rank rank of this processor in the initial node-by-node ordering
+   */
+  int get_topo_reorder_rank(int order, int const * lens, int const * lda, int const * intra_node_lens, int rank);
+
+  int get_inv_topo_reorder_rank(int order, int const * lens, int const * intra_node_lens, int new_rank);
+
+
   /**
    * \brief get dimension and torus lengths of specified topology
    *
@@ -57,6 +94,15 @@ namespace CTF_int {
   topology * get_phys_topo(CommData glb_comm,
                            TOPOLOGY mach);
 
+
+  /**
+   * \brief generate all possible factorizations of size into divisors
+   * \param[in] size total that the numbers of each factorization should multiply to
+   * \return all possible collections of natural numbers that multiply to size (excluding 1s)
+   */
+  std::vector< std::vector<int>* > get_all_shapes(int size);
+
+
   /**
    * \brief computes all topology configurations given undelying physical topology information
    * \param[in] cdt global communicator
@@ -87,7 +133,7 @@ namespace CTF_int {
   int find_topology(topology const *           topo,
                     std::vector< topology* > & topovec);
 
- 
+
   /**
    * \brief get the best topologoes (least nvirt) over all procs
    * \param[in] nvirt best virtualization achieved by this proc
@@ -102,7 +148,7 @@ namespace CTF_int {
                      CommData global_comm,
                      int64_t  bcomm_vol=0,
                      int64_t  bmemuse=0);
-   
+
 
   /**
    * \brief extracts the set of physical dimensions still available for mapping
diff --git a/src/shared/init_models.cxx b/src/shared/init_models.cxx
index cb4cbbd7..9c1d0153 100644
--- a/src/shared/init_models.cxx
+++ b/src/shared/init_models.cxx
@@ -1,41 +1,42 @@
 namespace CTF_int{
-double csrred_mdl_init[] = {3.0689E-03, 2.2385E-03, 4.4815E-07};
-double csrred_mdl_cst_init[] = {-1.8323E-04, 1.3076E-04, 2.8732E-09};
-double alltoall_mdl_init[] = {1.0000E-06, 1.0000E-06, 5.0000E-10};
-double alltoallv_mdl_init[] = {7.3164E-23, 1.0404E-04, 2.5827E-07};
-double red_mdl_init[] = {1.7255E-12, 1.2558E-11, 3.7127E-10};
-double red_mdl_cst_init[] = {1.2881E-04, 1.4093E-16, 8.3976E-10};
-double allred_mdl_init[] = {4.7939E-14, 7.4715E-13, 2.0949E-06};
-double allred_mdl_cst_init[] = {-3.3754E-04, 2.1343E-04, 3.0801E-09};
-double bcast_mdl_init[] = {1.1722E-82, 3.0112E-05, 8.6197E-09};
-double seq_tsr_ctr_mdl_cst_init[] = {7.8076E-13, 6.9558E-08, 1.3923E-08};
-double seq_tsr_ctr_mdl_ref_init[] = {4.9138E-08, 5.8290E-10, 4.8575E-11};
-double seq_tsr_ctr_mdl_inr_init[] = {1.0689E-05, 9.4660E-10, 2.1921E-10};
-double seq_tsr_ctr_mdl_off_init[] = {6.2925E-05, 1.7449E-11, 1.7211E-12};
-double seq_tsr_ctr_mdl_cst_inr_init[] = {1.3863E-04, 2.0119E-10, 9.8820E-09};
-double seq_tsr_ctr_mdl_cst_off_init[] = {8.4844E-04, 5.9246E-11, 3.5247E-10};
-double long_contig_transp_mdl_init[] = {1.5117E-04, 1.9091E-09};
-double shrt_contig_transp_mdl_init[] = {7.7643E-05, 6.4347E-12};
-double non_contig_transp_mdl_init[] = {2.6680E-05, 4.6247E-06};
-double seq_tsr_spctr_cst_off_k0_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
-double seq_tsr_spctr_cst_off_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
-double seq_tsr_spctr_cst_off_k2_init[] = {2.1996E-04, 3.1883E-09, 3.8743E-11};
-double seq_tsr_spctr_off_k0_init[] = {8.6970E-06, 4.5598E-11, 1.1544E-09};
-double seq_tsr_spctr_off_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
-double seq_tsr_spctr_off_k2_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
-double seq_tsr_spctr_cst_k0_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
-double seq_tsr_spctr_cst_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
-double seq_tsr_spctr_cst_k2_init[] = {2.1303E-74, 5.7379E-09, 4.1887E-11};
-double seq_tsr_spctr_cst_k3_init[] = {1.4917E-05, 2.5510E-10, 5.4110E-12};
-double seq_tsr_spctr_cst_k4_init[] = {5.6408E-06, 1.8318E-09, 5.2399E-80};
-double seq_tsr_spctr_cst_k5_init[] = {2.8218E-05, 3.0049E-09, 5.2399E-11};
-double seq_tsr_spctr_k0_init[] = {3.9315E-05, 2.2285E-08, 6.1958E-08};
-double seq_tsr_spctr_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
-double seq_tsr_spctr_k2_init[] = {5.9868E-14, 1.4877E-09, 5.3514E-12};
-double seq_tsr_spctr_k3_init[] = {1.3994E-15, 2.5071E-09, 2.7323E-11};
-double seq_tsr_spctr_k4_init[] = {2.0404E-04, 8.2989E-09, 6.0431E-11};
-double seq_tsr_spctr_k5_init[] = {6.9073E-15, 4.0130E-09, 2.2669E-13};
-double pin_keys_mdl_init[] = {4.0261E-05, 7.2443E-07};
-double spredist_mdl_init[] = {6.8713E-23, 7.8867E-04, 6.9422E-11};
-double dgtog_res_mdl_init[] = {4.3225E-22, 6.3127E-03, 3.6107E-07};
-double blres_mdl_init[] = {1.1782E-05, 6.7690E-10};}
+  double csrred_mdl_init[] = {3.0689E-03, 2.2385E-03, 4.4815E-07};          // not used I guess (at least not in dense)
+  double csrred_mdl_cst_init[] = {-1.8323E-04, 1.3076E-04, 2.8732E-09};     // not used I guess (at least not in dense)
+  double alltoall_mdl_init[] = {1.0000E-06, 1.0000E-06, 5.0000E-10};        // not used I guess
+  double alltoallv_mdl_init[] = {7.3164E-23, 1.0404E-04, 2.5827E-07};       // not used I guess
+  double red_mdl_init[] = {4.5530E-11, 3.0466E-17, 2.5E-9}; // mpi_reduce, used for summa for moving C 
+  double red_mdl_cst_init[] = {1.2881E-04, 1.4093E-16, 8.3976E-10};         // not used I guess
+  double allred_mdl_init[] = {4.7939E-14, 7.4715E-13, 2.0949E-06}; // de-facto not used I guess
+  double allred_mdl_cst_init[] = {-3.3754E-04, 2.1343E-04, 3.0801E-09};     // not used I guess
+  double bcast_mdl_init[] = {1.1115E-16, 1.0754E-16, 1.32E-9}; //mpi_bcast, used for summa for bcasting A,B, 3rd parameter is around 0.7 GB/s
+  double seq_tsr_ctr_mdl_cst_init[] = {7.8076E-13, 6.9558E-08, 1.3923E-08}; // not used I guess
+  double seq_tsr_ctr_mdl_ref_init[] = {4.9138E-08, 5.8290E-10, 4.8575E-11}; // not used I guess
+  double seq_tsr_ctr_mdl_inr_init[] = {6.0166E-21, 2.3443E-13, 1.4286E-11}; // our model, 2nd parameter negligible for large matrices, 3rd paramter fixed to 70GFLOPS/s/core
+  double seq_tsr_ctr_mdl_off_init[] = {6.2925E-05, 1.7449E-11, 1.7211E-12}; // not used I guess
+  double seq_tsr_ctr_mdl_cst_inr_init[] = {0.0, 0.0, 1.6E-11};
+  double seq_tsr_ctr_mdl_cst_off_init[] = {8.4844E-04, 5.9246E-11, 3.5247E-10};
+  double long_contig_transp_mdl_init[] = {0.0, 1.25E-08};
+  double shrt_contig_transp_mdl_init[] = {0.0, 1.25E-08};
+  double non_contig_transp_mdl_init[] = {2.6680E-05, 8.6247E-08};
+  double seq_tsr_spctr_cst_off_k0_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
+  double seq_tsr_spctr_cst_off_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
+  double seq_tsr_spctr_cst_off_k2_init[] = {2.1996E-04, 3.1883E-09, 3.8743E-11};
+  double seq_tsr_spctr_off_k0_init[] = {8.6970E-06, 4.5598E-11, 1.1544E-09};
+  double seq_tsr_spctr_off_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
+  double seq_tsr_spctr_off_k2_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
+  double seq_tsr_spctr_cst_k0_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
+  double seq_tsr_spctr_cst_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
+  double seq_tsr_spctr_cst_k2_init[] = {2.1303E-74, 5.7379E-09, 4.1887E-11};
+  double seq_tsr_spctr_cst_k3_init[] = {1.4917E-05, 2.5510E-10, 5.4110E-12};
+  double seq_tsr_spctr_cst_k4_init[] = {5.6408E-06, 1.8318E-09, 5.2399E-80};
+  double seq_tsr_spctr_cst_k5_init[] = {2.8218E-05, 3.0049E-09, 5.2399E-11};
+  double seq_tsr_spctr_k0_init[] = {3.9315E-05, 2.2285E-08, 6.1958E-08};
+  double seq_tsr_spctr_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10};
+  double seq_tsr_spctr_k2_init[] = {5.9868E-14, 1.4877E-09, 5.3514E-12};
+  double seq_tsr_spctr_k3_init[] = {1.3994E-15, 2.5071E-09, 2.7323E-11};
+  double seq_tsr_spctr_k4_init[] = {2.0404E-04, 8.2989E-09, 6.0431E-11};
+  double seq_tsr_spctr_k5_init[] = {6.9073E-15, 4.0130E-09, 2.2669E-13};
+  double pin_keys_mdl_init[] = {4.0261E-05, 7.2443E-07};
+  double spredist_mdl_init[] = {6.8713E-23, 7.8867E-04, 6.9422E-11};
+  double dgtog_res_mdl_init[] = {0.0, 0.0, 7.25E-10};  // elementwise reshuffling of distribution
+  double blres_mdl_init[] = {0.0, 1E-10};  // blockwise reshuffling of distribution 
+}
diff --git a/src/shared/model.cxx b/src/shared/model.cxx
index ef17ebf4..f8a31e0d 100644
--- a/src/shared/model.cxx
+++ b/src/shared/model.cxx
@@ -53,6 +53,14 @@ namespace CTF_int {
 #endif
   }
 
+  void dump_touched_models(std::string path){
+#ifdef TUNE
+    // true = dump_only_touched: ask every registered model to write its data only if it has recorded observations
+    for (int m=0, nmodels=(int)get_all_models().size(); m<nmodels; m++)
+      get_all_models()[m]->dump_data(path, true);
+#endif
+  }
+
 #define SPLINE_CHUNK_SZ = 8
 
   double cddot(int n,       const double *dX,
@@ -245,7 +253,7 @@ namespace CTF_int {
     //if (nobs % tune_interval == 0){
 
     //define the number of cols in the matrix to be the min of the number of observations and
-    //the number we are willing to store (hist_size)
+    //the number we are willing to store (hist_size)
     int nrcol = std::min(nobs,(int64_t)hist_size);
     //max of the number of local observations and nparam (will usually be the former)
     int ncol = std::max(nrcol, nparam);
@@ -697,11 +705,12 @@ namespace CTF_int {
   }
 
   template <int nparam>
-  void LinModel<nparam>::dump_data(std::string path){
+  void LinModel<nparam>::dump_data(std::string path, bool dump_only_touched){
     int rank = 0;
     int np, my_rank;
     MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
     MPI_Comm_size(MPI_COMM_WORLD, &np);
+/*
     while(rank < np){
         if (rank == my_rank){
         // Open the file
@@ -721,6 +730,7 @@ namespace CTF_int {
         int num_records = std::min(nobs, (int64_t)hist_size);
         for(int i=0; i<num_records; i++){
             std::string instance = "";
+           ofs << i << " ";
            for(int j=0; j<mat_lda; j++){
              ofs<<time_param_mat[i*mat_lda+j]<<" ";
            }
@@ -731,6 +741,48 @@ namespace CTF_int {
       rank++;
       MPI_Barrier(MPI_COMM_WORLD);
     }
+*/
+    int num_records = std::min(nobs, (int64_t)hist_size);
+    bool dump = true;
+    if (dump_only_touched) dump = (bool) num_records;
+    std::vector<double> local_times(num_records), max_times(local_times);
+    int min_records(0), max_records(0);
+    for (int i=0; i < num_records; i++) {
+      local_times[i] = time_param_mat[i*mat_lda];
+    }
+    MPI_Allreduce(&num_records, &max_records, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
+    MPI_Allreduce(&num_records, &min_records, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+    assert(max_records == min_records);
+    if (max_records > 0 && max_records == min_records) {
+      MPI_Reduce(local_times.data(), max_times.data(), num_records, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+      if (!my_rank && dump) {
+        // Open the file
+        std::ofstream ofs;
+        std::string model_name = std::string(name);
+        ofs.open(path+"/"+model_name, std::ofstream::out | std::ofstream::app);
+
+        // Dump the model coeffs
+        ofs << "Coeff: ";
+        for(int i=0; i<nparam; i++){
+          ofs << coeff_guess[i] << " ";
+        }
+        ofs << std::endl;
+
+
+        // Dump the training data
+        int num_records = std::min(nobs, (int64_t)hist_size);
+        for(int i=0; i<num_records; i++){
+          std::string instance = "";
+          ofs << max_times[i];
+          for(int j=1; j<mat_lda; j++){
+            ofs << " " << time_param_mat[i*mat_lda+j];
+          }
+          ofs<<"\n";
+        }
+        ofs.close();
+      }
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
   }
 
 
@@ -832,7 +884,7 @@ namespace CTF_int {
   }
 
   template <int nparam>
-  void CubicModel<nparam>::dump_data(std::string path){
+  void CubicModel<nparam>::dump_data(std::string path, bool dump_only_touched){
-    lmdl.dump_data(path);
+    lmdl.dump_data(path, dump_only_touched); // forward the flag to the wrapped linear model; it was dropped, so the default (false) was always used
   }
 
diff --git a/src/shared/model.h b/src/shared/model.h
index e9b3a7d8..206f6bda 100644
--- a/src/shared/model.h
+++ b/src/shared/model.h
@@ -20,7 +20,7 @@ namespace CTF_int {
       virtual void print_uo(){};
       virtual void load_coeff(std::string file_name){};
       virtual void write_coeff(std::string file_name){};
-      virtual void dump_data(std::string path){};
+      virtual void dump_data(std::string path, bool dump_only_touched = false){};
   };
 
   void update_all_models(MPI_Comm cm);
@@ -28,6 +28,7 @@ namespace CTF_int {
   void load_all_models(std::string file_name);
   void write_all_models(std::string file_name);
   void dump_all_models(std::string path);
+  void dump_touched_models(std::string path);
 
   /**
    * \brief Linear performance models, which given measurements, provides new model guess
@@ -138,7 +139,8 @@ namespace CTF_int {
       /**
        * \brief dump model data to a file
        */
-      void dump_data(std::string path);
+      void dump_data(std::string path, bool dump_only_touched = false);
+
   };
 
   /**
@@ -216,7 +218,7 @@ namespace CTF_int {
        * \brief write model coefficients to file
        * \param[in] path the path that we wish to dump all files to
        */
-      void dump_data(std::string path);
+      void dump_data(std::string path, bool dump_only_touched = false);
 
   };
 
diff --git a/src/tensor/untyped_tensor.cxx b/src/tensor/untyped_tensor.cxx
index bced3ac2..b4cdbf4d 100644
--- a/src/tensor/untyped_tensor.cxx
+++ b/src/tensor/untyped_tensor.cxx
@@ -19,6 +19,7 @@
 
 using namespace CTF;
 
+
 namespace CTF_int {
 
   LinModel<3> spredist_mdl(spredist_mdl_init,"spredist_mdl");
@@ -41,7 +42,7 @@ namespace CTF_int {
   void tensor::free_self(){
     if (order > -1){
       if (wrld->rank == 0) DPRINTF(3,"Deleted order %d tensor %s\n",order,name);
-      if (is_folded) unfold();
+      if (is_folded && !wrld->dryRanks) unfold();
       //if (is_folded) unfold(0,1);
       cdealloc(sym);
       cdealloc(lens);
@@ -307,7 +308,7 @@ namespace CTF_int {
         }*/
         this->home_size = other->home_size;
         register_size(this->home_size*sr->el_size);
-        this->home_buffer = sr->alloc(other->home_size);
+        if (!wrld->dryRanks) this->home_buffer = sr->alloc(other->home_size);
         if (other->is_home){
           this->is_home = 1;
           this->data = this->home_buffer;
@@ -315,14 +316,16 @@ namespace CTF_int {
           /*if (this->is_home || this->home_size != other->home_size){
           }*/
           this->is_home = 0;
-          sr->copy(this->home_buffer, other->home_buffer, other->home_size);
-          //CTF_int::alloc_ptr(other->size*sr->el_size, (void**)&this->data);
-          this->data = sr->alloc(other->size);
+          if (!wrld->dryRanks){
+            sr->copy(this->home_buffer, other->home_buffer, other->home_size);
+            //CTF_int::alloc_ptr(other->size*sr->el_size, (void**)&this->data);
+            this->data = sr->alloc(other->size);
+          }
         }
         this->has_home = 1;
       } else {
         //CTF_int::alloc_ptr(other->size*sr->el_size, (void**)&this->data);
-        this->data = sr->alloc(other->size);
+        if (!wrld->dryRanks) this->data = sr->alloc(other->size);
 /*          if (this->has_home && !this->is_home){
           CTF_int::cdealloc(this->home_buffer);
         }*/
@@ -331,9 +334,9 @@ namespace CTF_int {
       }
   #else
       //CTF_int::alloc_ptr(other->size*sr->el_size, (void**)&this->data);
-      this->data = sr->alloc(other->size);
+      if (!wrld->dryRanks) this->data = sr->alloc(other->size);
   #endif
-      sr->copy(this->data, other->data, other->size);
+      if (!wrld->dryRanks) sr->copy(this->data, other->data, other->size);
     } else {
       ASSERT(this->is_sparse);
       has_home = other->has_home;
@@ -676,7 +679,6 @@ namespace CTF_int {
     int * restricted;
     int btopo;
     int64_t bmemuse;
-
     if (this->is_mapped){
       if (is_sparse){
         sr->pair_dealloc(this->data);
@@ -686,6 +688,7 @@ namespace CTF_int {
         memset(this->nnz_blk, 0, sizeof(int64_t)*calc_nvirt());
         this->set_new_nnz_glb(this->nnz_blk);
       } else {
+        if (!wrld->dryRanks)
         sr->set(this->data, sr->addid(), this->size);
       }
     } else {
@@ -726,15 +729,19 @@ namespace CTF_int {
           //this->has_home = 0;
     /*      if (wrld->rank == 0)
             DPRINTF(3,"Initial size of tensor %d is " PRId64 ",",tensor_id,this->size);*/
-          this->home_buffer = sr->alloc(this->home_size);
-          if (wrld->rank == 0) DPRINTF(2,"Creating home of %s\n",name);
-          register_size(this->size*sr->el_size);
-          this->data = this->home_buffer;
+          if (!wrld->dryRanks) {
+            this->home_buffer = sr->alloc(this->home_size);
+            if (wrld->rank == 0) DPRINTF(2,"Creating home of %s\n",name);
+            register_size(this->size*sr->el_size);
+            this->data = this->home_buffer;
+          }
         } else {
-          this->data = sr->alloc(this->size);
+          if (!wrld->dryRanks)
+            this->data = sr->alloc(this->size);
         }
         #else
-        this->data = sr->alloc(this->size);
+        if (!wrld->dryRanks)
+          this->data = sr->alloc(this->size);
         //CTF_int::alloc_ptr(this->size*sr->el_size, (void**)&this->data);
         #endif
         #if DEBUG >= 1
@@ -743,7 +750,8 @@ namespace CTF_int {
         this->print_lens();
         this->print_map(stdout);
         #endif
-        sr->init(this->size, this->data);
+        if (!wrld->dryRanks)
+          sr->init(this->size, this->data);
       }
     }
     TAU_FSTOP(set_zero_tsr);
@@ -752,17 +760,24 @@ namespace CTF_int {
 
   void tensor::print_map(FILE * stream, bool allcall) const {
     if (!allcall || wrld->rank == 0){
-      if (is_sparse)
-        printf("printing mapping of sparse tensor %s\n",name);
-      else
-        printf("printing mapping of dense tensor %s\n",name);
+//      if (is_sparse)
+//        printf("printing mapping of sparse tensor %s\n",name);
+//      else
+//        printf("printing mapping of dense tensor %s\n",name);
+//      if (topo != NULL){
+//        printf("CTF: %s mapped to order %d topology with dims:",name,topo->order);
+//        for (int dim=0; dim<topo->order; dim++){
+//          printf(" %d ",topo->lens[dim]);
+//        }
+//      }
+//      printf("\n");
       if (topo != NULL){
-        printf("CTF: %s mapped to order %d topology with dims:",name,topo->order);
+        printf("%s topo (",name);
         for (int dim=0; dim<topo->order; dim++){
-          printf(" %d ",topo->lens[dim]);
+          printf(", %d",topo->lens[dim]);
         }
+        printf("); ");
       }
-      printf("\n");
       char tname[200];
       tname[0] = '\0';
       sprintf(tname, "%s[", name);
@@ -1028,7 +1043,16 @@ namespace CTF_int {
       bool tsr_has_sym = false;
       bool tsr_has_virt = false;
 
+      int topo_dims_A = tsr_A->topo->order;
+      int topo_dims_B = tsr_B->topo->order;
+
       for (int i=0; i<this->order; i++){
+        if (tsr_A->edge_map[i].type == PHYSICAL_MAP){
+          topo_dims_A--;
+        }
+        if (tsr_B->edge_map[i].type == PHYSICAL_MAP){
+          topo_dims_B--;
+        }
         if (A->sym[i] != NS || this->sym[i] != NS)
           tsr_has_sym = true;
         if (A->edge_map[i].type == VIRTUAL_MAP || (A->edge_map[i].has_child && A->edge_map[i].child->type == VIRTUAL_MAP)){
@@ -1040,7 +1064,7 @@ namespace CTF_int {
       }
       int nvirt_A = tsr_A->calc_nvirt();
       int nvirt_B = tsr_B->calc_nvirt();
-      if (tsr_B->wrld->np == tsr_A->wrld->np && !tsr_has_sym && !this->is_sparse && !A->is_sparse && nvirt_A == 1 && nvirt_B == 1 && !tsr_has_virt){
+      if (tsr_B->wrld->np == tsr_A->wrld->np && !tsr_has_sym && !this->is_sparse && !A->is_sparse && nvirt_A == 1 && nvirt_B == 1 && !tsr_has_virt && topo_dims_A ==0 && topo_dims_B == 0){
         push_slice(this, offsets_B, ends_B, beta, A, offsets_A, ends_A, alpha);
         TAU_FSTOP(slice);
         return;
diff --git a/src/tensor/untyped_tensor.h b/src/tensor/untyped_tensor.h
index 6a24b6a5..0ebbd631 100644
--- a/src/tensor/untyped_tensor.h
+++ b/src/tensor/untyped_tensor.h
@@ -372,7 +372,7 @@ namespace CTF_int {
        * \param[out] size number of elements in data
        */
       void get_raw_data(char ** data, int64_t * size) const;
-      
+
       /**
        * \brief query mapping to processor grid and intra-processor blocking, which may be used to define a tensor with the same initial distribution
        * \param[out] idx array of this->order chars describing this processor modes mapping on processor grid dimensions tarting from 'a'
@@ -1063,6 +1063,7 @@ namespace CTF_int {
        * \return tensor with same data point as this one but no edge lengths of size 1
        */
       tensor * get_no_unit_len_alias();
+
   };
 }
 #endif// __UNTYPED_TENSOR_H__