diff --git a/Makefile b/Makefile index 85c8f2f1..875f6f1c 100644 --- a/Makefile +++ b/Makefile @@ -39,7 +39,7 @@ uninstall: EXAMPLES = algebraic_multigrid apsp bitonic_sort btwn_central ccsd checkpoint dft_3D fft force_integration force_integration_sparse jacobi matmul neural_network particle_interaction qinformatics recursive_matmul scan sparse_mp3 sparse_permuted_slice spectral_element spmv sssp strassen trace mis mis2 ao_mo_transf block_sparse checkpoint_sparse hosvd mttkrp fft_with_idx_partition TESTS = bivar_function bivar_transform ccsdt_map_test ccsdt_t3_to_t2 dft diag_ctr diag_sym endomorphism_cust endomorphism_cust_sp endomorphism gemm_4D multi_tsr_sym permute_multiworld readall_test readwrite_test repack scalar speye sptensor_sum subworld_gemm sy_times_ns test_suite univar_function weigh_4D reduce_bcast -BENCHMARKS = bench_contraction bench_nosym_transp bench_redistribution model_trainer +BENCHMARKS = model_trainer_cc4s model_trainer bench_contraction bench_nosym_transp SCALAPACK_TESTS = qr svd eigh diff --git a/bench/model_trainer_cc4s.cxx b/bench/model_trainer_cc4s.cxx new file mode 100644 index 00000000..85201a47 --- /dev/null +++ b/bench/model_trainer_cc4s.cxx @@ -0,0 +1,170 @@ +/** Copyright (c) 2011, Edgar Solomonik, all rights reserved. + * \addtogroup benchmarks + * @{ + * \addtogroup model_trainer + * @{ + * \brief Executes a set of different contractions on different processor counts to train model parameters + */ + +#include +#include +#include +#define TEST_SUITE +#include "../examples/ccsd.cxx" +#include "../examples/sparse_mp3.cxx" +#undef TEST_SUITE +using namespace CTF; + +namespace CTF_int{ + void update_all_models(MPI_Comm comm); +} + +struct Ccsd_dimensions { + int64_t No; + int64_t Nv; + int64_t Nx; + int64_t Ng; +}; + +Ccsd_dimensions get_ccsd_dimensions(double mem_per_core, int64_t nvfac, World &dw) { + int np; + MPI_Comm_size(dw.comm, &np); + int64_t No(10); + while ( No*No*No*No*nvfac*nvfac*8./np/1024/1024 < mem_per_core) No++; + return Ccsd_dimensions({No, No*nvfac, No, (int64_t) No*nvfac*2.5}); +} + +void ph1_contraction(int64_t No, int64_t Nv, World &dw) { + int64_t vvoo[] = {Nv, Nv, No, No}; + int syms[] = {NS, NS, NS, NS}; + CTF::Tensor< double > T(4, vvoo, syms, dw, "T"); + CTF::Tensor< double > V(4, vvoo, syms, dw, "V"); + CTF::Tensor< double > R(4, vvoo, syms, dw, "R"); + V.fill_random(0, 1); + T.fill_random(0, 1); + R["abij"] = T["acik"] * V["cbkj"]; +} + +void ph2_contraction(int64_t No, int64_t Nv, World &dw) { + int64_t vvoo[] = {Nv, Nv, No, No}; + int64_t ovvo[] = {No, Nv, Nv, No}; + int syms[] = {NS, NS, NS, NS}; + CTF::Tensor< double > T(4, vvoo, syms, dw, "T"); + CTF::Tensor< double > V(4, ovvo, syms, dw, "V"); + CTF::Tensor< double > R(4, vvoo, syms, dw, "R"); + V.fill_random(0, 1); + T.fill_random(0, 1); + R["abij"] = T["acik"] * V["kbcj"]; +} + +void ggv_contraction(int64_t Nv, int64_t Nx, int64_t Ng, World &dw) { + int64_t gxv[] = {Ng, Nx, Nv}; + int64_t vvxx[] = {Nv, Nv, Nx, Nx}; + int syms[] = {NS, NS, NS, NS}; + CTF::Tensor< double > G(3, gxv, syms, dw, "B"); + CTF::Tensor< double > V(4, vvxx, syms, dw, "C"); + G.fill_random(0, 1); + V["cdxy"] = G["Gxc"] * G["Gyd"]; +} + +void rvt_contraction(int64_t No, int64_t Nv, int64_t Nx, World &dw) { + int64_t vvoo[] = {Nv, Nv, No, No}; + int64_t xxoo[] = {Nx, Nx, No, No}; + int64_t vvxx[] = {Nv, Nv, Nx, Nx}; + int syms[] = {NS, NS, NS, NS}; + CTF::Tensor< double > T(4, vvoo, syms, dw, "T"); + CTF::Tensor< double > V(4, vvxx, syms, dw, "V"); + CTF::Tensor< double > R(4, xxoo, syms, dw, "R"); + V.fill_random(0, 1); + T.fill_random(0, 1); + R["abij"] = V["xyab"] * T["xyij"]; +} + +void train_ccsd(World & dw, double mem_per_core, int64_t nvfac, int c_id){ + auto dim = get_ccsd_dimensions(mem_per_core, nvfac, dw); + if (c_id & 1) ph1_contraction(dim.No, dim.Nv, dw); + if (c_id & 2) ph2_contraction(dim.No, dim.Nv, dw); + if (c_id & 4) ggv_contraction(dim.Nv, dim.Nx, dim.Ng, dw); + if (c_id & 8) rvt_contraction(dim.No, dim.Nv, dim.Nx, dw); +} + + + +void train_all(std::string dump_path, int num_iterations, int rounds, int ppn){ + World dw("hallo", 0, ppn); + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + + for (int i=0; iwrld->cdt.cm == MPI_COMM_WORLD){ // update_all_models(A->wrld->cdt.cm); //} - int stat = home_contract(); if (stat != SUCCESS){ printf("CTF ERROR: Failed to perform contraction\n"); @@ -2582,11 +2582,12 @@ namespace CTF_int { assert(nnz_frac_C>=0.); } - void contraction::detail_estimate_mem_and_time(distribution const * dA, distribution const * dB, distribution const * dC, topology * old_topo_A, topology * old_topo_B, topology * old_topo_C, mapping const * old_map_A, mapping const * old_map_B, mapping const * old_map_C, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C, int64_t & memuse, double & est_time){ + void contraction::detail_estimate_mem_and_time(distribution const * dA, distribution const * dB, distribution const * dC, topology * old_topo_A, topology * old_topo_B, topology * old_topo_C, mapping const * old_map_A, mapping const * old_map_B, mapping const * old_map_C, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C, int64_t & memuse, double & est_time, double &redist_time, double &contr_time, double &fold_time){ TAU_FSTART(detail_estimate_mem_and_time); ctr * sctr; est_time = 0.; memuse = 0; + fold_time = 0.0; topology * topo_i = A->topo; bool csr_or_coo = B->is_sparse || C->is_sparse || is_custom || !A->sr->has_coo_ker; bool use_ccsr = csr_or_coo && A->is_sparse && C->is_sparse && !B->is_sparse; @@ -2598,13 +2599,14 @@ namespace CTF_int { #if FOLD_TSR if (can_fold()){ est_time = est_time_fold(); + fold_time = est_time; iparam prm = map_fold(false); sctr = construct_ctr(1, &prm); if (this->is_sparse()) - est_time = ((spctr*)sctr)->est_time_rec(sctr->num_lyr, A->calc_nvirt(), B->calc_nvirt(), C->calc_nvirt(), nnz_frac_A, nnz_frac_B, nnz_frac_C); + est_time += ((spctr*)sctr)->est_time_rec(sctr->num_lyr, A->calc_nvirt(), B->calc_nvirt(), C->calc_nvirt(), nnz_frac_A, nnz_frac_B, nnz_frac_C); else - est_time = sctr->est_time_rec(sctr->num_lyr); + est_time += sctr->est_time_rec(sctr->num_lyr); A->remove_fold(); B->remove_fold(); C->remove_fold(); @@ -2672,6 +2674,7 @@ namespace CTF_int { } } + contr_time = est_time - fold_time; #if DEBUG >= 4 printf("mapping passed contr est_time = %E sec %d %ld %ld %ld %E %E %E\n", est_time, sctr->num_lyr, A->calc_nvirt(), B->calc_nvirt(), C->calc_nvirt(), nnz_frac_A, nnz_frac_B, nnz_frac_C); #endif @@ -2724,6 +2727,7 @@ namespace CTF_int { mem_redist_tmp += C->get_redist_mem(*dC, nnz_frac_C); //mem_redist += (int64_t)(nnz_frac_C*C->size*C->sr->pair_size()) +C->get_redist_mem(*dC, nnz_frac_C); } + redist_time = est_time - contr_time - fold_time; assert(mem_fold_tmp >= 0); assert(mem_fold >= 0); assert(mem_redist >= 0); @@ -2755,7 +2759,9 @@ namespace CTF_int { #if DEBUG > 4 for (int t=1; t<(int)wrld->topovec.size()+8; t++){ #else - for (int64_t t=global_comm.rank+1; t<(int)wrld->topovec.size()+8; t+=global_comm.np){ + int64_t incr(global_comm.np); + if (A->wrld->dryRanks) incr = 1; + for (int64_t t=global_comm.rank+1; t<(int)wrld->topovec.size()+8; t+=incr){ #endif A->clear_mapping(); B->clear_mapping(); @@ -2794,7 +2800,7 @@ namespace CTF_int { ret = map_to_topology(topo_i, j); if (ret == NEGATIVE){ - //printf("map_to_topology returned negative\n"); +// printf("map_to_topology returned negative %d %d\n", t, j); continue; } @@ -2806,6 +2812,7 @@ namespace CTF_int { C->topo = topo_i; if (check_mapping() == 0){ +// printf("check mapping is zero %d %d\n", t, j); continue; } A->set_padding(); @@ -2825,11 +2832,18 @@ namespace CTF_int { continue; } int64_t memuse;//, bmemuse; - double est_time; - detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time); + double est_time, redist_time, contr_time, fold_time; + detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time, redist_time, contr_time, fold_time); #ifdef MIN_MEMORY est_time = memuse; #endif + + if (A->wrld->dryRanks && A->wrld->verbose == 2) + printf( "t %ld j %d will use %f GB per rank and take %f s, %f %f %f" + , t, j, memuse/1024.0/1024./1024 + , est_time, redist_time, contr_time, fold_time); + if (A->wrld->dryRanks && A->wrld->verbose == 2) C->print_map(); + ASSERT(est_time >= 0.0); if ((int64_t)memuse >= max_memuse){ if (global_comm.rank == 0) @@ -2889,7 +2903,7 @@ namespace CTF_int { int64_t old_off = choice_offset; choice_offset += tnum_choices; for (int j=0; jwrld->dryRanks && (old_off + j)%global_comm.np != global_comm.rank) continue; A->clear_mapping(); B->clear_mapping(); @@ -2931,12 +2945,15 @@ namespace CTF_int { continue; } int64_t memuse;//, bmemuse; - double est_time; - detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time); + double est_time, redist_time, contr_time, fold_time; + detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time, redist_time, contr_time, fold_time); #ifdef MIN_MEMORY est_time = memuse; #endif ASSERT(est_time >= 0.0); + if (A->wrld->dryRanks) printf( "topo %d order %d will use %f GB per rank and take %f s (%f %f %f, redist/contraction/folding)\n" + , i, j, memuse/1024.0/1024./1024, est_time, redist_time, contr_time, fold_time); + if ((int64_t)memuse >= max_memuse){ DPRINTF(3,"[EXH] Not enough memory available for topo %d with order %d memory %ld/%ld\n", i,j,memuse,max_memuse); @@ -3076,7 +3093,7 @@ namespace CTF_int { A->set_padding(); B->set_padding(); C->set_padding(); - if (gbest_time_sel < 100.){ + if (gbest_time_sel > 1e100){ gbest_time_exh = gbest_time_sel+1.; ttopo_exh = ttopo_sel; } else { @@ -3107,6 +3124,7 @@ namespace CTF_int { ctr_sig_map.insert(std::pair(sig,ti)); TAU_FSTOP(ctr_sig_map_insert); } + if (!do_remap || ttopo == INT64_MAX || ttopo == -1){ CTF_int::cdealloc(old_phase_A); CTF_int::cdealloc(old_phase_B); @@ -3191,9 +3209,9 @@ namespace CTF_int { #if (VERBOSE >= 1 || DEBUG >= 1 || PROFILE_MEMORY >= 1) int64_t memuse; - double est_time; + double est_time, redist_time, contr_time, fold_time; - detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time); + detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time, redist_time, contr_time, fold_time); if (global_comm.rank == 0){ printf("Contraction will use %E bytes per processor out of %E available memory (already used %E) and take an estimated of %E sec\n", (double)memuse,(double)proc_bytes_available(),(double)proc_bytes_used(),est_time); @@ -3205,6 +3223,15 @@ namespace CTF_int { // assert(est_time == std::min(gbest_time_sel,gbest_time_exh)); //#endif #endif + if (A->wrld->dryRanks){ + int64_t memuse; + double est_time, redist_time, contr_time, fold_time; + detail_estimate_mem_and_time(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, nnz_frac_A, nnz_frac_B, nnz_frac_C, memuse, est_time, redist_time, contr_time, fold_time); + printf( "Contraction will use %f GB per rank and take %f s (%f %f %f, redist/contraction/folding)\n" + , memuse/1024.0/1024./1024, est_time, redist_time, contr_time, fold_time); + } + + if (can_fold()){ iparam prm = map_fold(false); @@ -3250,7 +3277,7 @@ namespace CTF_int { } } else need_remap = 1; - if (need_remap) + if (need_remap && !wrld->dryRanks) A->redistribute(*dA); need_remap = 0; if (B->topo == old_topo_B){ @@ -3260,7 +3287,7 @@ namespace CTF_int { } } else need_remap = 1; - if (need_remap) + if (need_remap && !wrld->dryRanks) B->redistribute(*dB); need_remap = 0; if (C->topo == old_topo_C){ @@ -3270,9 +3297,9 @@ namespace CTF_int { } } else need_remap = 1; - if (need_remap) + if (need_remap && !wrld->dryRanks) C->redistribute(*dC); - + TAU_FSTOP(redistribute_for_contraction); CTF_int::cdealloc( old_phase_A ); @@ -4169,6 +4196,8 @@ namespace CTF_int { ctr * ctrf; CommData global_comm = C->wrld->cdt; + + if (A->has_zero_edge_len || B->has_zero_edge_len || C->has_zero_edge_len){ if (!C->sr->isequal(beta,C->sr->mulid()) && !C->has_zero_edge_len){ @@ -4368,7 +4397,6 @@ namespace CTF_int { C->print_map(); } #endif - #ifdef PROFILE TAU_FSTART(pre_fold_barrier); MPI_Barrier(global_comm.cm); @@ -4383,12 +4411,50 @@ namespace CTF_int { if (is_inner){ iparam prm; TAU_FSTART(map_fold); - prm = map_fold(); + prm = map_fold(!A->wrld->dryRanks); TAU_FSTOP(map_fold); delete ctrf; ctrf = construct_ctr(1, &prm); } #endif + + + if (A->wrld->dryRanks){ +// iran: this is the silent version + A->print_map(); + B->print_map(); + C->print_map(); + //ctrf->print(); +#define NODE_AWARE 1 +#ifdef NODE_AWARE + if (C->wrld->ppn){ + topology orig_topo = *(C->topo); + std::vector pe_grid(orig_topo.lens, orig_topo.lens + orig_topo.order); + std::vector > inter_node_grids = + CTF_int::get_inter_node_grids(pe_grid, C->wrld->dryRanks/C->wrld->ppn); + int * intra_node_lens = (int*)CTF_int::alloc(orig_topo.order*sizeof(int)); + double comm_vol_ref = ctrf->est_internode_comm_vol_rec(ctrf->num_lyr); + printf("Ref: %f\n", comm_vol_ref/1024.0/1024.0/1024.0); + for (size_t i=0; itopo->morph_to(na_topo_i); + double comm_vol_i = ctrf->est_internode_comm_vol_rec(ctrf->num_lyr); + for (int j=0; j < orig_topo.order; j++) printf("%d ", inter_node_grids[i][j]); + printf("-> %f\n", comm_vol_i/1024.0/1024.0/1024.0); + + C->topo->morph_to(orig_topo); + } + cdealloc(intra_node_lens); + } +#endif + delete ctrf; + TAU_FSTOP(contract); + return SUCCESS; + } + + #if (VERBOSE >= 1 || DEBUG >= 1) if (global_comm.rank == 0){ ctrf->print(); @@ -4422,8 +4488,72 @@ namespace CTF_int { MPI_Barrier(global_comm.cm); TAU_FSTOP(pre_ctr_func_barrier); #endif + + +#ifdef NODE_AWARE + TAU_FSTART(node_aware_remapping); + /* reorder processor grid to account for node-awareness */ + topology orig_topo = *(C->topo); + int64_t node_aware_send_to_rank(0); + int64_t node_aware_recv_from_rank(0); + // FIXME: support sparsity + if (C->wrld->ppn && !is_sparse()){ + std::vector pe_grid(orig_topo.lens, orig_topo.lens + orig_topo.order); + std::vector > inter_node_grids = CTF_int::get_inter_node_grids(pe_grid, C->wrld->np/C->wrld->ppn); + //std::vector< std::vector > intra_node_grids = CTF_int::get_all_shapes(C->wrld->ppn()){ + int * intra_node_lens = (int*)CTF_int::alloc(orig_topo.order*sizeof(int)); + int64_t best_topo_index(0); + double best_comm_vol = DBL_MAX; + for (size_t i=0; itopo->morph_to(na_topo_i); + + double comm_vol_i = ctrf->est_internode_comm_vol_rec(ctrf->num_lyr); + if (comm_vol_i < best_comm_vol){ + best_topo_index = i; + best_comm_vol = comm_vol_i; + } + C->topo->morph_to(orig_topo); + } + for (int j=0; jtopo->morph_to(node_aware_topo); + node_aware_send_to_rank = get_inv_topo_reorder_rank(node_aware_topo.order, node_aware_topo.lens, intra_node_lens, orig_topo.glb_comm.rank); + node_aware_recv_from_rank = get_topo_reorder_rank(node_aware_topo.order, node_aware_topo.lens, node_aware_topo.lda, intra_node_lens, orig_topo.glb_comm.rank); + if (orig_topo.glb_comm.rank != node_aware_send_to_rank){ + IASSERT(orig_topo.glb_comm.rank != node_aware_recv_from_rank); + TAU_FSTART(redistribute_for_node_aware); + // FIXME: to support sparsity need to also communicate nnz information here + MPI_Status stat; + MPI_Sendrecv_replace(A->data, A->size, A->sr->mdtype(), node_aware_send_to_rank, 1322, node_aware_recv_from_rank, 1322, orig_topo.glb_comm.cm, &stat); + MPI_Sendrecv_replace(B->data, B->size, B->sr->mdtype(), node_aware_send_to_rank, 1323, node_aware_recv_from_rank, 1323, orig_topo.glb_comm.cm, &stat); + MPI_Sendrecv_replace(C->data, C->size, C->sr->mdtype(), node_aware_send_to_rank, 1324, node_aware_recv_from_rank, 1324, orig_topo.glb_comm.cm, &stat); + TAU_FSTOP(redistribute_for_node_aware); + } + cdealloc(intra_node_lens); + } + TAU_FSTOP(node_aware_remapping); +#endif + + + + + TAU_FSTART(ctr_func); /* Invoke the contraction algorithm */ + TAU_FSTART(blockComm); + std::vector swap; + ctrf->blockComm( A->topo->lens, A->data, B->data, C->data + , A->size, B->size, C->size, global_comm, swap); + MPI_Barrier(global_comm.cm); + TAU_FSTOP(blockComm); A->topo->activate(); #ifdef PROFILE_MEMORY @@ -4506,9 +4636,37 @@ namespace CTF_int { printf("Finished contraction computation\n"); } #endif +#ifdef NODE_AWARE + TAU_FSTART(node_aware_backmapping); + /* reorder processor grid to account for node-awareness */ + // FIXME: support sparsity + if (C->wrld->ppn && !is_sparse() && orig_topo.glb_comm.rank != node_aware_send_to_rank){ + TAU_FSTART(redistribute_for_node_aware); + // FIXME: to support sparsity need to also communicate nnz information here + MPI_Status stat; + if (A->is_home) { + MPI_Sendrecv_replace(A->data, A->size, A->sr->mdtype(), node_aware_recv_from_rank, 1325, node_aware_send_to_rank, 1325, orig_topo.glb_comm.cm, &stat); + } + if (B->is_home) { + MPI_Sendrecv_replace(B->data, B->size, B->sr->mdtype(), node_aware_recv_from_rank, 1326, node_aware_send_to_rank, 1326, orig_topo.glb_comm.cm, &stat); + } + MPI_Sendrecv_replace(C->data, C->size, C->sr->mdtype(), node_aware_recv_from_rank, 1327, node_aware_send_to_rank, 1327, orig_topo.glb_comm.cm, &stat); + TAU_FSTOP(redistribute_for_node_aware); + } + if (C->wrld->ppn && !is_sparse()) { + C->topo->morph_to(orig_topo); + } + TAU_FSTOP(node_aware_backmapping); +#endif + - A->topo->deactivate(); +// A->topo->deactivate(); + TAU_FSTART(blockComm); + MPI_Barrier(global_comm.cm); + ctrf->blockComm( A->topo->lens, A->data, B->data, C->data + , A->size, B->size, C->size, global_comm, swap); + TAU_FSTOP(blockComm); #ifdef PROFILE TAU_FSTART(post_ctr_func_barrier); @@ -5176,7 +5334,6 @@ namespace CTF_int { return SUCCESS; } } - contraction new_ctr = contraction(*this); was_home_A = A->is_home; @@ -5245,7 +5402,9 @@ namespace CTF_int { } ret = new_ctr.sym_contract();//&ntype, ftsr, felm, alpha, beta); + if (ret!= SUCCESS) return ret; + if (C->wrld->dryRanks) return SUCCESS; if (was_home_C) new_ctr.C->unfold(); if (was_home_C && !new_ctr.C->is_home){ diff --git a/src/contraction/contraction.h b/src/contraction/contraction.h index 90ca2211..8395983c 100644 --- a/src/contraction/contraction.h +++ b/src/contraction/contraction.h @@ -292,7 +292,7 @@ namespace CTF_int { void calc_nnz_frac(double & nnz_frac_A, double & nnz_frac_B, double & nnz_frac_C); - void detail_estimate_mem_and_time(distribution const * dA, distribution const * dB, distribution const * dC, topology * old_topo_A, topology * old_topo_B, topology * old_topo_C, mapping const * old_map_A, mapping const * old_map_B, mapping const * old_map_C, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C, int64_t & memuse, double & est_time); + void detail_estimate_mem_and_time(distribution const * dA, distribution const * dB, distribution const * dC, topology * old_topo_A, topology * old_topo_B, topology * old_topo_C, mapping const * old_map_A, mapping const * old_map_B, mapping const * old_map_C, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C, int64_t & memuse, double & est_time, double &redist_time, double &contr_time, double &fold_time); void get_best_sel_map(distribution const * dA, distribution const * dB, distribution const * dC, topology * old_topo_A, topology * old_topo_B, topology * old_topo_C, mapping const * old_map_A, mapping const * old_map_B, mapping const * old_map_C, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C, int64_t & idx, double & time); diff --git a/src/contraction/ctr_2d_general.cxx b/src/contraction/ctr_2d_general.cxx index 117704be..e5faad92 100755 --- a/src/contraction/ctr_2d_general.cxx +++ b/src/contraction/ctr_2d_general.cxx @@ -199,15 +199,18 @@ namespace CTF_int { void ctr_2d_general::print() { printf("ctr_2d_general: edge_len = %ld\n", edge_len); - printf("move_A = %d, ctr_lda_A = %ld, ctr_sub_lda_A = %ld\n", + printf("move_A = %d, ctr_lda_A = %ld, ctr_sub_lda_A = %ld", move_A, ctr_lda_A, ctr_sub_lda_A); - if (move_A) printf("cdt_A length = %d\n",cdt_A->np); - printf("move_B = %d, ctr_lda_B = %ld, ctr_sub_lda_B = %ld\n", + if (move_A) printf(", cdt_A length = %d",cdt_A->np); + printf("\n"); + printf("move_B = %d, ctr_lda_B = %ld, ctr_sub_lda_B = %ld", move_B, ctr_lda_B, ctr_sub_lda_B); - if (move_B) printf("cdt_B length = %d\n",cdt_B->np); - printf("move_C = %d, ctr_lda_C = %ld, ctr_sub_lda_C = %ld\n", + if (move_B) printf(", cdt_B length = %d",cdt_B->np); + printf("\n"); + printf("move_C = %d, ctr_lda_C = %ld, ctr_sub_lda_C = %ld", move_C, ctr_lda_C, ctr_sub_lda_C); - if (move_C) printf("cdt_C length = %d\n",cdt_C->np); + if (move_C) printf(", cdt_C length = %d",cdt_C->np); + printf("\n"); #ifdef OFFLOAD if (alloc_host_buf) printf("alloc_host_buf is true\n"); @@ -262,6 +265,26 @@ namespace CTF_int { return rec_ctr->est_time_rec(1)*(double)edge_len/MIN(nlyr,edge_len) + est_time_fp(nlyr); } + + double ctr_2d_general::est_internode_collective_comm_vol(int nlyr) { + int64_t b_A, b_B, b_C, s_A, s_B, s_C, aux_size; + find_bsizes(b_A, b_B, b_C, s_A, s_B, s_C, aux_size); + double sz = 0.0; + if (move_A) + sz += (sr_A->el_size*s_A) * (cdt_A->num_nodes - 1);// ((cdt_A->np / cdt_A->intra_node_np) - 1); + if (move_B) + sz += (sr_B->el_size*s_B) * (cdt_B->num_nodes - 1);// ((cdt_B->np / cdt_B->intra_node_np) - 1); + if (move_C) + sz += (sr_C->el_size*s_C) * (cdt_C->num_nodes - 1); //((cdt_C->np / cdt_C->intra_node_np) - 1); + return (sz*(double)edge_len)/MIN(nlyr,edge_len); + } + + double ctr_2d_general::est_internode_comm_vol_rec(int nlyr) { + return rec_ctr->est_internode_comm_vol_rec(1)*(double)edge_len/MIN(nlyr,edge_len) + est_internode_collective_comm_vol(nlyr); + } + + + int64_t ctr_2d_general::mem_fp() { int64_t b_A, b_B, b_C, s_A, s_B, s_C, aux_size; find_bsizes(b_A, b_B, b_C, s_A, s_B, s_C, aux_size); @@ -431,7 +454,7 @@ namespace CTF_int { if (cdt_C->rank == owner_C) cdt_C->red(MPI_IN_PLACE, op_C, s_C, sr_C->mdtype(), sr_C->addmop(), owner_C); else - cdt_C->red(op_C, NULL, s_C, sr_C->mdtype(), sr_C->addmop(), owner_C); + cdt_C->red(op_C, op_C, s_C, sr_C->mdtype(), sr_C->addmop(), owner_C); if (rank_C == owner_C){ sr_C->copy(ctr_sub_lda_C, ctr_lda_C, op_C, ctr_sub_lda_C, sr_C->mulid(), @@ -469,5 +492,113 @@ namespace CTF_int { } TAU_FSTOP(ctr_2d_general); } + + void ctr_2d_general::blockComm( int const * rgrid, char *A, char *B, char *C + , size_t sizeA, size_t sizeB, size_t sizeC + , CommData glb_comm, std::vector &swap + ){ + int rank = glb_comm.rank; + int np = glb_comm.np; + int src, dst; + // we have to determine the partners + if (! swap.size() ) { + ipair nr(getNumNodes(glb_comm.cm)); + // rGrid is the rankGrid of the given tensor topology + CommGrid grid({rgrid[0], rgrid[1]}, nr.first); + ipair nGrid = grid.nGrid; + ipair iGrid = grid.iGrid; + // rr is the key/color pair for the original rank distribution of dim_comm[0] + std::vector rr(np); + std::vector< std::pair > perm(np); + for (int r(0); r < np; r++) rr[r] = { r % rgrid[0], r / rgrid[0] }; + // the desired distribution are nGrid[0] x nGrid[1] blocks with the some color + for (int r(0); r < np; r++){ + // the color is the jth column and kth row in the nodeGrid + int clr = (rr[r].second/iGrid.second)*nGrid.first + rr[r].first/iGrid.first; + int key = (rr[r].second%iGrid.second)*iGrid.first + rr[r].first%iGrid.first; + // we have to swap color and key that we can use std::sort + perm[r] = { { clr, key }, r}; + } + std::sort(perm.begin(), perm.end()); + for (auto p: perm) swap.push_back(p.second); + + src = swap[rank]; + auto it( std::find(swap.begin(), swap.end(), rank) ); + dst = std::distance(swap.begin(), it); + } + else { + dst = swap[rank]; + auto it( std::find(swap.begin(), swap.end(), rank) ); + src = std::distance(swap.begin(), it); + } + + MPI_Barrier(glb_comm.cm); + MPI_Status s; + MPI_Sendrecv_replace(&cdt_A->color, 1, MPI_INT, dst, 0, src, 0, glb_comm.cm, &s); + MPI_Sendrecv_replace(&cdt_B->color, 1, MPI_INT, dst, 0, src, 0, glb_comm.cm, &s); + MPI_Sendrecv_replace(&cdt_A->rank, 1, MPI_INT, dst, 0, src, 0, glb_comm.cm, &s); + MPI_Sendrecv_replace(&cdt_B->rank, 1, MPI_INT, dst, 0, src, 0, glb_comm.cm, &s); + + MPI_Barrier(glb_comm.cm); + + size_t el(std::max(sizeA, sizeB)); + el = std::max(el, sizeC); + char *buf = new char[el*sr_A->el_size]; + // Do the A job + MPI_Request sreq, rreq; + MPI_Irecv(buf, sizeA, sr_A->mdtype(), src, 0, glb_comm.cm, &rreq); + MPI_Isend(A, sizeA, sr_A->mdtype(), dst, 0, glb_comm.cm, &sreq); + MPI_Wait(&rreq, MPI_STATUS_IGNORE); + MPI_Wait(&sreq, MPI_STATUS_IGNORE); + memcpy(A, buf, sizeA*sr_A->el_size); + + // Do the B job + MPI_Irecv(buf, sizeB, sr_A->mdtype(), src, 0, glb_comm.cm, &rreq); + MPI_Isend(B, sizeB, sr_A->mdtype(), dst, 0, glb_comm.cm, &sreq); + MPI_Wait(&rreq, MPI_STATUS_IGNORE); + MPI_Wait(&sreq, MPI_STATUS_IGNORE); + memcpy(B, buf, sizeB*sr_A->el_size); + + // Do the B job + MPI_Irecv(buf, sizeC, sr_A->mdtype(), src, 0, glb_comm.cm, &rreq); + MPI_Isend(C, sizeC, sr_A->mdtype(), dst, 0, glb_comm.cm, &sreq); + MPI_Wait(&rreq, MPI_STATUS_IGNORE); + MPI_Wait(&sreq, MPI_STATUS_IGNORE); + memcpy(C, buf, sizeC*sr_A->el_size); + MPI_Barrier(glb_comm.cm); + } + + ipair ctr_2d_general::getNumNodes(MPI_Comm comm){ + int rank, np; + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &np); + + std::vector nodeList(np); + char nodeName[MPI_MAX_PROCESSOR_NAME]; + char nodeNames[np*MPI_MAX_PROCESSOR_NAME]; + std::vector nameLengths(np); + std::vector off(np); + int nameLength; + MPI_Get_processor_name(nodeName, &nameLength); + MPI_Allgather( + &nameLength, 1, MPI_INT, nameLengths.data(), 1, MPI_INT, comm + ); + for (int i(1); i < np; i++) off[i] = off[i-1] + nameLengths[i-1]; + MPI_Allgatherv( + nodeName, nameLengths[rank], MPI_BYTE, nodeNames, + nameLengths.data(), off.data(), MPI_BYTE, comm + ); + for (int i(0); i < np; i++) { + std::string s(&nodeNames[off[i]], nameLengths[i]); + nodeList[i] = s; + } + std::sort(nodeList.begin(), nodeList.end()); + std::vector::iterator it( + std::unique(nodeList.begin(), nodeList.end()) + ); + size_t nNodes(std::distance(nodeList.begin(), it)); + return {nNodes, np/nNodes}; + } + } diff --git a/src/contraction/ctr_2d_general.h b/src/contraction/ctr_2d_general.h index 3df75d3d..9dbed5d7 100644 --- a/src/contraction/ctr_2d_general.h +++ b/src/contraction/ctr_2d_general.h @@ -68,6 +68,8 @@ namespace CTF_int{ CommData * cdt_A; CommData * cdt_B; CommData * cdt_C; + + /* Class to be called on sub-blocks */ ctr * rec_ctr; @@ -81,6 +83,19 @@ namespace CTF_int{ * where b is the smallest blocking factor among A and B or A and C or B and C. */ void run(char * A, char * B, char * C); + /** + * \brief interchanges processors in the communicator -> permuting + * the data such that each communicator has adjacent global ranks + */ + void blockComm( int const *rgrid, char *A, char *B, char *C + , size_t sizeA, size_t sizeB, size_t sizeC + , CommData globalComm, std::vector &swap); + + /** + * \brief returns the number of nodes & number of ranks per node + * note: only trustworthy if ranks per node is the same for all nodes!! + */ + ipair getNumNodes(MPI_Comm comm); /** * \brief returns the number of bytes of buffer space * we need @@ -102,6 +117,19 @@ namespace CTF_int{ * \return bytes needed for recursive contraction */ double est_time_rec(int nlyr); + + /** + * \brief estimate the inter-node communication volume of this kernel + * \return volume in bytes, represented as floating point + */ + double est_internode_collective_comm_vol(int nlyr); + + /** + * \brief estimate the inter-node communication volume of the algorithm recursively + * \return volume in bytes, represented as floating point + */ + double est_internode_comm_vol_rec(int nlyr); + ctr * clone(); /** diff --git a/src/contraction/ctr_comm.cxx b/src/contraction/ctr_comm.cxx index de7a3e20..63f659f4 100755 --- a/src/contraction/ctr_comm.cxx +++ b/src/contraction/ctr_comm.cxx @@ -193,6 +193,18 @@ namespace CTF_int { return rec_ctr->est_time_rec(nlyr) + est_time_fp(nlyr); } + double ctr_replicate::est_internode_comm_vol_rec(int nlyr) { + int i; + double sz = 0.; + for (i = 0; i < ncdt_A; i++) + sz += (size_A*sr_A->el_size) * (cdt_A[i]->num_nodes - 1); + for (i = 0; i < ncdt_B; i++) + sz += (size_B*sr_B->el_size) * (cdt_B[i]->num_nodes - 1); + for (i = 0; i < ncdt_C; i++) + sz += (size_C*sr_C->el_size) * (cdt_C[i]->num_nodes - 1); + return rec_ctr->est_internode_comm_vol_rec(nlyr) + sz; + } + int64_t ctr_replicate::mem_fp(){ return 0; } diff --git a/src/contraction/ctr_comm.h b/src/contraction/ctr_comm.h index 0f4670df..a748888a 100644 --- a/src/contraction/ctr_comm.h +++ b/src/contraction/ctr_comm.h @@ -199,8 +199,11 @@ namespace CTF_int{ virtual int64_t mem_rec() { return mem_fp(); }; virtual double est_time_fp(int nlyr) { return 0; }; virtual double est_time_rec(int nlyr) { return est_time_fp(nlyr); }; + virtual double est_internode_comm_vol_rec(int nlyr) { return 0; }; virtual ctr * clone() { return NULL; }; - + virtual void blockComm( int const *rgrid, char *A, char *B, char *C + , size_t sizeA, size_t sizeB, size_t sizeC + , CommData globalComm, std::vector &swap) {}; /** * \brief deallocates generic ctr object */ @@ -254,6 +257,11 @@ namespace CTF_int{ * \return time in sec */ double est_time_rec(int nlyr); + /** + * \brief estimate the inter-node communication volume of the algorithm + * \return volume in bytes, represented as floating point + */ + double est_internode_comm_vol_rec(int nlyr); void print(); ctr * clone(); diff --git a/src/contraction/ctr_tsr.cxx b/src/contraction/ctr_tsr.cxx index 45cc994a..bd398ec1 100755 --- a/src/contraction/ctr_tsr.cxx +++ b/src/contraction/ctr_tsr.cxx @@ -338,17 +338,20 @@ namespace CTF_int { int i; printf("seq_tsr_ctr:\n"); for (i=0; i facNodes(CommGrid::factorize(nNodes)); + std::vector facrgf(CommGrid::factorize(rGrid.first)); + std::vector facrgs(CommGrid::factorize(rGrid.second)); + std::vector diff; + + // We are selecting all prim factors of #nodes + // which do not occur in the prim factors of a grid edge + // we remove these factors and assign them to the opponent grid edge + + std::set_difference( facNodes.begin(), facNodes.end() + , facrgf.begin(), facrgf.end() + , std::back_inserter(diff) + ); + + for (auto d: diff) + facNodes.erase(std::find(facNodes.begin(), facNodes.end(), d)); + + nGrid.second = + std::accumulate(diff.begin(), diff.end(), 1, std::multiplies()); + diff.resize(0); + + std::set_difference( facNodes.begin(), facNodes.end() + , facrgs.begin(), facrgs.end() + , std::back_inserter(diff) + ); + for (auto d: diff) + facNodes.erase(std::find(facNodes.begin(), facNodes.end(), d)); + + nGrid.first = + std::accumulate(diff.begin(), diff.end(), 1, std::multiplies()); + + // if there is no element left, all prim factors are distributed + if (!facNodes.size()) return nGrid; + //assign the remaining prim factors as such that the grid on every + //node is closest possible to a square + double minVal(DBL_MAX); + ipair bestPair; + for (int i(0); i < pow(2, facNodes.size()); i++){ + ipair edges(CommGrid::getSquare(i, facNodes)); + // build igrid.first / igrid.second and take the one with + // a ratio closest to one + //its not true that the node grid candidates are divisor of the rGrid: + //we allow only these edges + int first(edges.first*nGrid.first); + int second(edges.second*nGrid.second); + if ( (nRanks/first)*first != nRanks) continue; + if ( (nRanks/second)*second != nRanks) continue; + + double val(1.0/(double)first + 1.0/(double)second); + if ( minVal > val ){ + minVal = val; + bestPair = {edges.first, edges.second}; + } + } + nGrid.first *= bestPair.first; + nGrid.second *= bestPair.second; + return nGrid; + } + + std::vector CommGrid::factorize(int number){ + std::vector factors; + int n(number); + if (n < 4) factors.push_back(n); + int d(2); + while (d*d <= n) + while (n>1){ + while (!(n%d)){ + factors.push_back(d); + n /= d; + } + d++; + } + return factors; + } + + ipair CommGrid::getSquare(int id, std::vector factors) { + ipair result({1,1}); + result.second = std::accumulate( + factors.begin(), factors.end(), 1, std::multiplies() + ); + for (int pos(0); ; pos++) { + int bit(pow(2,pos)); + if (bit > id) break; + if(id & bit) result.first *= factors[pos]; + } + result.second /= result.first; + return result; + } + + + char * get_default_inds(int order, int start_index){ char * inds = (char*)CTF_int::alloc(order*sizeof(char)); for (int i=0; i #include #include +#include #include "../shared/model.h" @@ -134,21 +135,38 @@ namespace CTF_int { // accumulates computed flops (targeted for internal use) void add_computed_flops(int64_t n); + void set_save_glb_comm(MPI_Comm gcm); + // get computed flops int64_t get_computed_flops(); // accumulates computed flops (targeted for internal use) void add_estimated_flops(int64_t n); + // wrapper of MPI communicator class CommData { public: + // MPI communicator MPI_Comm cm; + // number of processors int np; + // rank of processor int rank; + // color of subcommunicator cm relative to some parent commmunicator, if provided int color; + // 1 if this communicator is active (MPI_Comm is created and not finalized) int alive; + // 1 if this object created a communicator that needs to be finalized (as opposed to being an alias to a different communicator object) int created; - + // intra_node_np, number of processes per node (intra-node grid dimension) corresponding to this communicator, if provided, 1 otherwise + int intra_node_np; + // global rank + int global_rank; + // node id + int node_id; + // number of distinct nodes in the communicator + int num_nodes; + CommData(); ~CommData(); @@ -167,8 +185,9 @@ namespace CTF_int { * \param[in] rank rank within this comm * \param[in] color identifier of comm within parent * \param[in] np number of processors within this comm + * \param[in] intra_node_np number of processors per physical node */ - CommData(int rank, int color, int np); + CommData(int rank, int color, int np, int num_nodes, int glbRank, int intra_node_np=0); /** * \brief create active subcomm from parent comm which must be active @@ -241,6 +260,24 @@ namespace CTF_int { }; + using ipair = std::pair; + struct CommGrid { + CommGrid(){}; + ~CommGrid(){}; + CommGrid(ipair _rGrid, int _nNodes); + + int nRanks; + std::vector colorKey; + ipair rGrid; // RankGrid: given by the user + ipair nGrid; // NodeGrid: output, grid of nodes + ipair iGrid; // intraNodeGrid: the ranks of one node possess this grid + + ipair getNodeGrid(int nNodes, ipair rGrid); + std::vector factorize(int number); + ipair getSquare(int id, std::vector factors); + }; + + int alloc_ptr(int64_t len, void ** const ptr); int mst_alloc_ptr(int64_t len, void ** const ptr); void * alloc(int64_t len); diff --git a/src/interface/tensor.cxx b/src/interface/tensor.cxx index 3ae791f5..aba43748 100644 --- a/src/interface/tensor.cxx +++ b/src/interface/tensor.cxx @@ -1563,6 +1563,7 @@ NORM_INFTY_INST(double) IASSERT(0); return; } + if (T.wrld->dryRanks) return; for (int64_t i=0; iinit(comm, TOPOLOGY_BGQ, argc, argv); #else @@ -85,6 +87,15 @@ namespace CTF { #endif } + World::World(std::string print, int dryRanks_, int ppn_){ + comm = MPI_COMM_WORLD; + dryRanks = dryRanks_; + ppn = ppn_; + if (print == "high") verbose = 2; + + this->init(comm, TOPOLOGY_GENERIC); + } + World::World(int order, int const * lens, @@ -97,6 +108,7 @@ namespace CTF { World::World(World const & other){ comm = other.comm; + ppn = other.ppn; #if DEBUG >= 1 if (other.rank == 0){ printf("CTF WARNING: Creating copy of World, which is not free or useful, pass original World by reference instead if possible.\n"); @@ -164,6 +176,8 @@ namespace CTF { int argc, const char * const * argv){ cdt = CommData(comm); + if (dryRanks) cdt.np = dryRanks; + if (mach == TOPOLOGY_GENERIC) phys_topology = NULL; else @@ -187,7 +201,7 @@ namespace CTF { int World::initialize(int argc, const char * const * argv){ - char * mem_size, * ppn; + char * mem_size, * cppn; if (comm == MPI_COMM_WORLD && universe_exists){ delete phys_topology; *this = universe; @@ -262,16 +276,16 @@ namespace CTF { imem_size); CTF_int::set_mem_size(imem_size); } - ppn = getenv("CTF_PPN"); - if (ppn != NULL){ + cppn = getenv("CTF_PPN"); + if (cppn != NULL){ if (rank == 0) printf("Assuming %d processes per node due to CTF_PPN environment variable\n", - atoi(ppn)); - ASSERT(atoi(ppn)>=1); + atoi(cppn)); + ASSERT(atoi(cppn)>=1); #ifdef BGQ CTF_int::set_memcap(.75); #else - CTF_int::set_memcap(.75/atof(ppn)); + CTF_int::set_memcap(.75/atof(cppn)); #endif } if (rank == 0) diff --git a/src/interface/world.h b/src/interface/world.h index 622b27d4..39fef3db 100644 --- a/src/interface/world.h +++ b/src/interface/world.h @@ -24,6 +24,12 @@ namespace CTF { int rank; /** \brief number of processors */ int np; + /** \brief number of processors per node (optional / can be 1)*/ + int ppn = 0; + /** \brief set dryRun */ + int dryRanks = 0; + /** \brief verbosity of dryRun */ + int verbose = 1; /** \brief derived topologies */ std::vector< CTF_int::topology* > topovec; /** \brief whether the world has been initialized */ @@ -63,6 +69,7 @@ namespace CTF { * \param[in] argv main arguments */ World(MPI_Comm comm = MPI_COMM_WORLD, + int ppn = 1, int argc = 0, char * const * argv = NULL); @@ -86,6 +93,13 @@ namespace CTF { */ World(char const * emptystring); + /** + * \brief constructor for a dry world + * \param[in] print determines how to handle output + * \param[in] dryRanks number of dry ranks + */ + + World(std::string print, int dryRanks, int ppn = 1); /** * \brief frees CTF library diff --git a/src/mapping/Makefile b/src/mapping/Makefile index d5c66a28..a609849c 100644 --- a/src/mapping/Makefile +++ b/src/mapping/Makefile @@ -1,10 +1,10 @@ -LOBJS = mapping.o distribution.o topology.o +LOBJS = mapping.o distribution.o topology.o node_aware_dist.o OBJS = $(addprefix $(ODIR)/, $(LOBJS)) ctf: $(OBJS) #%d | r ! grep -ho "\.\..*\.h" *.cxx *.h | sort | uniq -HDRS = ../../Makefile $(BDIR)/config.mk ../interface/common.h ../mapping/mapping.h ../shared/util.h ../summation/sum_tsr.h ../tensor/untyped_tensor.h +HDRS = ../../Makefile $(BDIR)/config.mk ../interface/common.h ../mapping/mapping.h ../mapping/node_aware_dist.h ../shared/util.h ../summation/sum_tsr.h ../tensor/untyped_tensor.h $(OBJS): $(ODIR)/%.o: %.cxx *.h $(HDRS) $(FCXX) -c $< -o $@ diff --git a/src/mapping/node_aware_dist.cxx b/src/mapping/node_aware_dist.cxx new file mode 100644 index 00000000..f67e22b3 --- /dev/null +++ b/src/mapping/node_aware_dist.cxx @@ -0,0 +1,210 @@ +/* The code in this file has been written by Andreas Irmler. */ + +#include "../tensor/untyped_tensor.h" +#include "../shared/util.h" +#include "node_aware_dist.h" +using ivec = std::vector; +using vivec = std::vector; + + +namespace CTF_int { + + + struct Tree { + + //Copy + Tree(Tree const &other) { + order = other.order; + sgf = other.sgf; + ogf = other.ogf; + } + + //Constructor 1 + Tree(int _order, vivec _sgf, vivec _ogf){ + order = _order; + sgf = _sgf; + ogf = _ogf; + } + + // Constructor 2 + Tree(Tree t, int pos, int el){ + order = t.order + 1; + sgf = t.sgf; + ogf = t.ogf; + assert(sgf.size() > pos); + assert(ogf.size() > pos); + sgf[pos].push_back(el); + std::sort(sgf[pos].begin(), sgf[pos].end()); + auto it = std::find(ogf[pos].begin(), ogf[pos].end(), el); + assert(it != ogf[pos].end()); + ogf[pos].erase(it); + } + + bool find(int pos, int el) { + if (ogf.size() <= pos) { + printf("Find problem! order %d, size: %ld, pos: %d, el: %d\n" + , order, ogf.size(), pos, el); + assert(0); + } + auto it = std::find(ogf[pos].begin(), ogf[pos].end(), el); + if (it == ogf[pos].end()) return false; + return true; + } + + int order; + vivec sgf; // settled grid factors. ie factors which are already assigned + vivec ogf; // open grid factors. factors which can + }; + + + // return a vector of prim factors + ivec iv_factorize(int number){ + ivec factors; + int n(number); + if (n < 4) factors.push_back(n); + int d(2); + while (d*d <= n) + while (n>1){ + while (!(n%d)){ + factors.push_back(d); + n /= d; + } + d++; + } + return factors; + } + + // return vector with input arguments + ivec lineToVint(std::string line) { + ivec out; + size_t pos; + while ((pos = line.find(",")) != std::string::npos) { + out.push_back(std::stoi(line.substr(0, pos))); + line.erase(0, pos + 1); + } + out.push_back(std::stoi(line)); + + return out; + } + + + std::vector< std::vector > get_inter_node_grids(std::vector rGrid, int nodes){ + int ranks(std::accumulate(rGrid.begin(), rGrid.end(), 1, std::multiplies())); + int ranksPerNode(ranks/nodes); + IASSERT (ranksPerNode*nodes == ranks ); + + vivec nodeGrid; // final node Grid + const ivec nodeFactors(iv_factorize(nodes)); + const ivec rankFactors(iv_factorize(ranks)); + vivec gridFactors; // the tensor grid expressed in prim factors + ivec assignedFactors; // rank factors which are already assigned + ivec openFactors; // unassigned rank factors + for (auto r: rGrid) { + gridFactors.push_back(iv_factorize(r)); + } + vivec openGridFactors; // grid factors which cannot assigned to a edge + + for (auto gf: gridFactors){ + + ivec others, diff; + // all prim factors which are not at the given edge + std::set_difference( rankFactors.begin() + , rankFactors.end() + , gf.begin() + , gf.end() + , std::back_inserter(others) + ); + /* + for (auto x: others) { + std::cout << "others: " << x << " "; + } + std::cout << std::endl; + */ + // is there a node factor which lives only on a given edge? + // if so assign this factor to this edge + std::set_difference( nodeFactors.begin() + , nodeFactors.end() + , others.begin() + , others.end() + , std::back_inserter(diff) + ); + assignedFactors.insert(assignedFactors.end(), diff.begin(), diff.end()); + + openGridFactors.resize(openGridFactors.size()+1); + std::set_difference( gf.begin() + , gf.end() + , diff.begin() + , diff.end() + , std::back_inserter(openGridFactors.back()) + ); + if (!diff.size()) diff.push_back(1); + nodeGrid.push_back(diff); + + } + + std::sort(assignedFactors.begin(), assignedFactors.end()); + std::set_difference( nodeFactors.begin() + , nodeFactors.end() + , assignedFactors.begin() + , assignedFactors.end() + , std::back_inserter(openFactors) + ); + // The algorithm goes like that: + // 1.) we pick the last element of the list, remove it from the list, + // then open N branches where N is the number of possible possitions + // for that element in the rank Grid + // 2.) we remove identical branches + // 3.) we go to step 1 + + size_t b(0); + size_t n(rGrid.size()); + std::vector treeVec; + treeVec.emplace_back(0, nodeGrid, openGridFactors); + // we loop over all prim Factors of the number of nodes + while (openFactors.size()){ + // take the last element of the list and remove it from the list + auto f(openFactors.back()); + openFactors.pop_back(); + + // we work only in the last layer of the tree + // we have to find the begin/end in the whole vector + auto o(treeVec.back().order); + auto b(std::distance( treeVec.begin() + , std::find_if( treeVec.begin() + , treeVec.end() + , [o] (const Tree &a) + { return a.order == o;} + ) + )); + + auto e(treeVec.size()); + // loop over the last layer of the tree and distribute the + // element to all possible positions + // however: if a potential element is already in the list, + // do not add it + for (size_t t(b); t < e; t++){ + for (auto i(0); i < n; i++) + if ( treeVec[t].find(i, f) ){ + bool distinct(true); + auto cand = Tree(treeVec[t], i, f); + for (size_t n(e); n < treeVec.size(); n++){ + if (cand.sgf == treeVec[n].sgf) distinct = false; + } + if (distinct) treeVec.push_back(cand); + } + } + } + + std::vector< std::vector > inter_node_grids; + for (auto tv: treeVec) { + if (treeVec.back().order == tv.order) { + std::vector sgf; + for (auto s: tv.sgf) { + sgf.push_back(std::accumulate(s.begin(), s.end(), 1, std::multiplies())); + } + inter_node_grids.push_back(sgf); + } + } + return inter_node_grids; + } +} diff --git a/src/mapping/node_aware_dist.h b/src/mapping/node_aware_dist.h new file mode 100644 index 00000000..41f09006 --- /dev/null +++ b/src/mapping/node_aware_dist.h @@ -0,0 +1,16 @@ +/*Copyright (c) 2022, Edgar Solomonik, all rights reserved.*/ + +#ifndef __INT_NODE_DISTRIBUTION_H__ +#define __INT_NODE_DISTRIBUTION_H__ + +namespace CTF_int { + /** + * \brief returns all possible valid choices inter-node grids, given an overall processor grid and a number of nodes + * \param[in] rGrid overall processor grid + * \param[in] nodes number of nodes + * \return vector of inter node processor grids of total size equal to the number of nodes and of same dimension as rGrid, where each dimension divides into the respective dimension of rGrid + */ + std::vector > get_inter_node_grids(std::vector rGrid, int nodes); +} + +#endif diff --git a/src/mapping/topology.cxx b/src/mapping/topology.cxx index 02b6eae1..9a87418e 100644 --- a/src/mapping/topology.cxx +++ b/src/mapping/topology.cxx @@ -3,11 +3,14 @@ #include "topology.h" #include "../shared/util.h" #include "../mapping/mapping.h" +#include #ifdef BGQ #include "mpix.h" #endif +using ipair = std::pair; + namespace CTF_int { /* topology::topology(){ @@ -17,7 +20,41 @@ namespace CTF_int { is_activated = false; dim_comm = NULL; }*/ - + + int get_inv_topo_reorder_rank(int order, int const * lens, int const * intra_node_lens, int new_rank){ + int irank = new_rank; + int intra_node_rank = 0; + int node_rank = 0; + int lda_node_rank = 1; + int lda_intra_node_rank = 1; + for (int i=0; i num_nodes(order); + std::vector< std::vector > como(order, std::vector (cdt.np)); + for (int r(0); r < cdt.np; r++){ + int stride =1, cut = 0; + for (size_t i=0; i sameColor; + std::copy_if( como[i].begin() + , como[i].end() + , std::back_inserter(sameColor) + , [](ipair &a){ return a.first == 0;} + ); + std::sort( sameColor.begin() + , sameColor.end() + , [](ipair &a, ipair &b){return a.second < b.second;} + ); + num_nodes[i] = std::distance( sameColor.begin() + , std::unique( sameColor.begin() + , sameColor.end() + , [](ipair &a, ipair &b) + { return a.second == b.second;} + ) + ); + } + for (int i=0; i get_all_topos(CommData cdt, int n_uf, int const * uniq_fact, int const * mults, int n_prepend, int const * prelens){ - std::vector topos; + std::vector< std::vector* > get_all_shapes_rec(int n_uf, int const * uniq_fact, int const * mults, int n_prepend, int const * prelens){ + std::vector< std::vector* > shapes; + // enumerate the number of different possible numbers (including 1) that divide (with remainder 0) the number of processors int num_divisors = 1; for (int i=0; i(prelens,prelens+n_prepend)); + return shapes; } int sub_mults[n_uf]; int new_prelens[n_prepend+1]; memcpy(new_prelens, prelens, n_prepend*sizeof(int)); //FIXME: load may be highly imbalanced //for (int div=cdt.rank; div new_topos = get_all_topos(cdt, n_uf, uniq_fact, sub_mults, n_prepend+1, new_prelens); + std::vector< std::vector* > new_shapes = get_all_shapes_rec(n_uf, uniq_fact, sub_mults, n_prepend+1, new_prelens); //FIXME call some append function? - for (unsigned i=0; i get_generic_topovec(CommData cdt){ - std::vector topovec; - + /** + * \brief generate all possible factorizations of size into divisors + * \param[in] total size that numbers should multiply to + * \return all possible collections of natural numbers that multiply to size (excluding 1s) + */ + std::vector< std::vector* > get_all_shapes(int size){ int nfact, * factors; - factorize(cdt.np, &nfact, &factors); + factorize(size, &nfact, &factors); if (nfact <= 1){ - topovec.push_back(new topology(nfact, factors, cdt)); - if (cdt.np >= 7 && cdt.rank == 0) - DPRINTF(1,"CTF WARNING: using a world with a prime number of processors may lead to very bad performance\n"); + std::vector*> shapes; + shapes.push_back(new std::vector(factors, factors+nfact)); if (nfact > 0) cdealloc(factors); - return topovec; + return shapes; } std::sort(factors,factors+nfact); + //compute number of unique factors int n_uf = 1; assert(factors[0] != 1); for (int i=1; i= 3){ - if (cdt.rank == 0) - DPRINTF(1,"CTF WARNING: using a world with a number of processors that contains 3 or more unique prime factors may lead to suboptimal performance, when possible use p=2^k3^l processors for some k,l\n"); - } + //if (n_uf >= 3){ + // if (cdt.rank == 0) + // DPRINTF(1,"CTF WARNING: using a world with a number of processors that contains 3 or more unique prime factors may lead to suboptimal performance, when possible use p=2^k3^l processors for some k,l\n"); + //} int uniq_fact[n_uf]; int mults[n_uf]; int i_uf = 0; @@ -481,7 +598,30 @@ namespace CTF_int { } else mults[i_uf]++; } cdealloc(factors); - return get_all_topos(cdt, n_uf, uniq_fact, mults, 0, NULL); + std::vector< std::vector * > shapes = get_all_shapes_rec(n_uf, uniq_fact, mults, 0, NULL); + return shapes; + } + + + std::vector< topology* > create_topos_from_shapes(std::vector< std::vector* > shapes, CommData cdt){ + std::vector< topology* > topos; + for (int i=0; i<(int)shapes.size(); i++){ + topos.push_back(new topology(shapes[i]->size(), &shapes[i]->operator[](0), cdt)); + } + return topos; + } + + std::vector< topology* > get_generic_topovec(CommData cdt){ + std::vector< std::vector * > shapes = get_all_shapes(cdt.np); + std::vector< topology* > topos = create_topos_from_shapes(shapes, cdt); + for (int i=0; i<(int)shapes.size(); i++){ + delete shapes[i]; + } + + if (shapes.size() == 1 && cdt.np >= 7 && cdt.rank == 0) + DPRINTF(1,"CTF WARNING: using a world with a prime number of processors may lead to very bad performance\n"); + return topos; + } @@ -493,7 +633,7 @@ namespace CTF_int { bool changed; /*int i=0; do { - for (int j=0; j< perm_vec[i]->order; + for (int j=0; j< perm_vec[i]->order; } while(iorder, perm_vec.size(), perm_vec[0]->lens[0], perm_vec[0]->lens[1]); @@ -538,9 +678,9 @@ namespace CTF_int { CommData glb_comm){ std::vector< topology* > topos; topos.push_back(new topology(*topo)); - + if (topo->order <= 1) return topos; - + int * new_lens = (int*)alloc(sizeof(int)*topo->order-1); for (int i=0; iorder-1; i++){ @@ -567,12 +707,12 @@ namespace CTF_int { } return topos; } - + int find_topology(topology const * topo, std::vector< topology* > & topovec){ int i, j, found; std::vector< topology* >::iterator iter; - + found = -1; for (j=0, iter=topovec.begin(); iter!=topovec.end(); iter++, j++){ if ((*iter)->order == topo->order){ @@ -585,7 +725,7 @@ namespace CTF_int { } if (found != -1) return found; } - return -1; + return -1; } int get_best_topo(int64_t nvirt, @@ -638,8 +778,8 @@ namespace CTF_int { CommData * sub_phys_comm; int * comm_idx; mapping const * map; - memset(phys_mapped, 0, topo->order*sizeof(int)); - + memset(phys_mapped, 0, topo->order*sizeof(int)); + num_sub_phys_dims = 0; for (i=0; icdt] = 1; if (map->has_child) map = map->child; else break; - } + } } for (i=0; icdt] = 1; if (map->has_child) map = map->child; else break; - } + } } num_sub_phys_dims = 0; @@ -680,7 +820,7 @@ namespace CTF_int { } - int can_morph(topology const * topo_keep, + int can_morph(topology const * topo_keep, topology const * topo_change){ int i, j, lda; lda = 1; @@ -715,7 +855,7 @@ namespace CTF_int { do { for (j=0; jorder; j++){ if (new_topo->lda[j] == old_lda) break; - } + } ASSERT(j!=new_topo->order); new_rec_map->type = PHYSICAL_MAP; new_rec_map->cdt = j; @@ -749,7 +889,7 @@ namespace CTF_int { break; } } - edge_map[i].clear(); + edge_map[i].clear(); edge_map[i] = *new_map; CTF_int::cdealloc(new_map); } diff --git a/src/mapping/topology.h b/src/mapping/topology.h index 6194f690..d6f034ec 100644 --- a/src/mapping/topology.h +++ b/src/mapping/topology.h @@ -13,41 +13,78 @@ namespace CTF_int { /* \brief mesh/torus topology configuration */ class topology { public: + // number of dimensions in torus int order; + // lengths of dimensions int * lens; + // lda[i] = lens[i-1] * ... * lens[0] int * lda; + // global communicator is reordered if intra-node grid is provided + int is_reordered; + // whether dim_comm communicators have been activated bool is_activated; + + // list of communicators along fibers of each dimension of torus CommData * dim_comm; + // global communicator, ordered as in torus given by dim_comm CommData glb_comm; + // global communicator, ordered as given, assuming processors are ordered as [processes in node 1], [processes in node 2], etc. + CommData unord_glb_comm; //topology(); ~topology(); - /** + /** * \brief copy constructor * \param[in] other topology to copy */ topology(topology const & other); /** - * \brief constructs torus topology + * \brief overwrite this topology with communicators of another, without reallocating CommData objects, allowing to 'hot-swap' this topology for another, propagating change through creatred ctr objects + * \param[in] other topology to copy + */ + void morph_to(topology const & other); + + + /** + * \brief constructs torus topology, if intra_node_lens is NULL, the p processors are folded into a torus, otherwise, the each set of prod(intra_node_lens) processors is mapped to different modes of the processor grid, e.g., if lens_ = [6,4] and intra_node_lens=[3,2] (6 processes per node), the processors are assiged as + * [[ 0 1 2 6 7 8 ], + * [ 3 4 5 9 10 11], + * [ 12 13 14 18 19 20], + * [ 15 16 17 21 22 23]] * \param[in] order_ number of torus dimensions * \param[in] lens_ lengths of torus dimensions - * \param[in] cdt communicator for whole torus + * \param[in] cdt communicator for whole torus * \param[in] activate whether to create MPI_Comms + * \param[in] intra_node_lens lengths of intra-node processor grid */ topology(int order_, int const * lens_, CommData cdt, - bool activate=false); - - /* \brief create (split off) MPI communicators, re-entrant */ + bool activate=false, + int const * intra_node_lens=NULL); + + /* \brief create (split off) MPI communicators, re-entrant */ void activate(); /* \breif free MPI communicators, re-entrant */ void deactivate(); }; + /** + * \brief determine this processors rank in the global communicator given by reordering nodes so that they adhere to the assignment described in the constructor of the topology() object, assuming initial order is node by node + * + * \param[in] order_ number of torus dimensions + * \param[in] lens_ lengths of torus dimensions + * \param[in] lda_ prefix product of lengths of torus dimensions + * \param[in] intra_node_lens lengths of intra-node processor grid + */ + int get_topo_reorder_rank(int order, int const * lens, int const * lda, int const * intra_node_lens, int rank); + + int get_inv_topo_reorder_rank(int order, int const * lens, int const * intra_node_lens, int new_rank); + + /** * \brief get dimension and torus lengths of specified topology * @@ -57,6 +94,15 @@ namespace CTF_int { topology * get_phys_topo(CommData glb_comm, TOPOLOGY mach); + + /** + * \brief generate all possible factorizations of size into divisors + * \param[in] total size that numbers should multiply to + * \return all possible collections of natural numbers that multiply to size (excluding 1s) + */ + std::vector< std::vector* > get_all_shapes(int size); + + /** * \brief computes all topology configurations given undelying physical topology information * \param[in] cdt global communicator @@ -87,7 +133,7 @@ namespace CTF_int { int find_topology(topology const * topo, std::vector< topology* > & topovec); - + /** * \brief get the best topologoes (least nvirt) over all procs * \param[in] nvirt best virtualization achieved by this proc @@ -102,7 +148,7 @@ namespace CTF_int { CommData global_comm, int64_t bcomm_vol=0, int64_t bmemuse=0); - + /** * \brief extracts the set of physical dimensions still available for mapping diff --git a/src/shared/init_models.cxx b/src/shared/init_models.cxx index cb4cbbd7..9c1d0153 100644 --- a/src/shared/init_models.cxx +++ b/src/shared/init_models.cxx @@ -1,41 +1,42 @@ namespace CTF_int{ -double csrred_mdl_init[] = {3.0689E-03, 2.2385E-03, 4.4815E-07}; -double csrred_mdl_cst_init[] = {-1.8323E-04, 1.3076E-04, 2.8732E-09}; -double alltoall_mdl_init[] = {1.0000E-06, 1.0000E-06, 5.0000E-10}; -double alltoallv_mdl_init[] = {7.3164E-23, 1.0404E-04, 2.5827E-07}; -double red_mdl_init[] = {1.7255E-12, 1.2558E-11, 3.7127E-10}; -double red_mdl_cst_init[] = {1.2881E-04, 1.4093E-16, 8.3976E-10}; -double allred_mdl_init[] = {4.7939E-14, 7.4715E-13, 2.0949E-06}; -double allred_mdl_cst_init[] = {-3.3754E-04, 2.1343E-04, 3.0801E-09}; -double bcast_mdl_init[] = {1.1722E-82, 3.0112E-05, 8.6197E-09}; -double seq_tsr_ctr_mdl_cst_init[] = {7.8076E-13, 6.9558E-08, 1.3923E-08}; -double seq_tsr_ctr_mdl_ref_init[] = {4.9138E-08, 5.8290E-10, 4.8575E-11}; -double seq_tsr_ctr_mdl_inr_init[] = {1.0689E-05, 9.4660E-10, 2.1921E-10}; -double seq_tsr_ctr_mdl_off_init[] = {6.2925E-05, 1.7449E-11, 1.7211E-12}; -double seq_tsr_ctr_mdl_cst_inr_init[] = {1.3863E-04, 2.0119E-10, 9.8820E-09}; -double seq_tsr_ctr_mdl_cst_off_init[] = {8.4844E-04, 5.9246E-11, 3.5247E-10}; -double long_contig_transp_mdl_init[] = {1.5117E-04, 1.9091E-09}; -double shrt_contig_transp_mdl_init[] = {7.7643E-05, 6.4347E-12}; -double non_contig_transp_mdl_init[] = {2.6680E-05, 4.6247E-06}; -double seq_tsr_spctr_cst_off_k0_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10}; -double seq_tsr_spctr_cst_off_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10}; -double seq_tsr_spctr_cst_off_k2_init[] = {2.1996E-04, 3.1883E-09, 3.8743E-11}; -double seq_tsr_spctr_off_k0_init[] = {8.6970E-06, 4.5598E-11, 1.1544E-09}; -double seq_tsr_spctr_off_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10}; -double seq_tsr_spctr_off_k2_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10}; -double seq_tsr_spctr_cst_k0_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10}; -double seq_tsr_spctr_cst_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10}; -double seq_tsr_spctr_cst_k2_init[] = {2.1303E-74, 5.7379E-09, 4.1887E-11}; -double seq_tsr_spctr_cst_k3_init[] = {1.4917E-05, 2.5510E-10, 5.4110E-12}; -double seq_tsr_spctr_cst_k4_init[] = {5.6408E-06, 1.8318E-09, 5.2399E-80}; -double seq_tsr_spctr_cst_k5_init[] = {2.8218E-05, 3.0049E-09, 5.2399E-11}; -double seq_tsr_spctr_k0_init[] = {3.9315E-05, 2.2285E-08, 6.1958E-08}; -double seq_tsr_spctr_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10}; -double seq_tsr_spctr_k2_init[] = {5.9868E-14, 1.4877E-09, 5.3514E-12}; -double seq_tsr_spctr_k3_init[] = {1.3994E-15, 2.5071E-09, 2.7323E-11}; -double seq_tsr_spctr_k4_init[] = {2.0404E-04, 8.2989E-09, 6.0431E-11}; -double seq_tsr_spctr_k5_init[] = {6.9073E-15, 4.0130E-09, 2.2669E-13}; -double pin_keys_mdl_init[] = {4.0261E-05, 7.2443E-07}; -double spredist_mdl_init[] = {6.8713E-23, 7.8867E-04, 6.9422E-11}; -double dgtog_res_mdl_init[] = {4.3225E-22, 6.3127E-03, 3.6107E-07}; -double blres_mdl_init[] = {1.1782E-05, 6.7690E-10};} + double csrred_mdl_init[] = {3.0689E-03, 2.2385E-03, 4.4815E-07}; // not used I guess (at least not in dense) + double csrred_mdl_cst_init[] = {-1.8323E-04, 1.3076E-04, 2.8732E-09}; // not used I guess (at least not in dense) + double alltoall_mdl_init[] = {1.0000E-06, 1.0000E-06, 5.0000E-10}; // not used I guess + double alltoallv_mdl_init[] = {7.3164E-23, 1.0404E-04, 2.5827E-07}; // not used I guess + double red_mdl_init[] = {4.5530E-11, 3.0466E-17, 2.5E-9}; // mpi_reduce, used for summa for moving C + double red_mdl_cst_init[] = {1.2881E-04, 1.4093E-16, 8.3976E-10}; // not used I guess + double allred_mdl_init[] = {4.7939E-14, 7.4715E-13, 2.0949E-06}; // de-facto not used I guess + double allred_mdl_cst_init[] = {-3.3754E-04, 2.1343E-04, 3.0801E-09}; // not used I guess + double bcast_mdl_init[] = {1.1115E-16, 1.0754E-16, 1.32E-9}; //mpi_bcast, used for summa for bcasting A,B, 3rd parameter is around 0.7 GB/s + double seq_tsr_ctr_mdl_cst_init[] = {7.8076E-13, 6.9558E-08, 1.3923E-08}; // not used I guess + double seq_tsr_ctr_mdl_ref_init[] = {4.9138E-08, 5.8290E-10, 4.8575E-11}; // not used I guess + double seq_tsr_ctr_mdl_inr_init[] = {6.0166E-21, 2.3443E-13, 1.4286E-11}; // our model, 2nd parameter negligible for large matrices, 3rd paramter fixed to 70GFLOPS/s/core + double seq_tsr_ctr_mdl_off_init[] = {6.2925E-05, 1.7449E-11, 1.7211E-12}; // not used I guess + double seq_tsr_ctr_mdl_cst_inr_init[] = {0.0, 0.0, 1.6E-11}; + double seq_tsr_ctr_mdl_cst_off_init[] = {8.4844E-04, 5.9246E-11, 3.5247E-10}; + double long_contig_transp_mdl_init[] = {0.0, 1.25E-08}; + double shrt_contig_transp_mdl_init[] = {0.0, 1.25E-08}; + double non_contig_transp_mdl_init[] = {2.6680E-05, 8.6247E-08}; + double seq_tsr_spctr_cst_off_k0_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10}; + double seq_tsr_spctr_cst_off_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10}; + double seq_tsr_spctr_cst_off_k2_init[] = {2.1996E-04, 3.1883E-09, 3.8743E-11}; + double seq_tsr_spctr_off_k0_init[] = {8.6970E-06, 4.5598E-11, 1.1544E-09}; + double seq_tsr_spctr_off_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10}; + double seq_tsr_spctr_off_k2_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10}; + double seq_tsr_spctr_cst_k0_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10}; + double seq_tsr_spctr_cst_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10}; + double seq_tsr_spctr_cst_k2_init[] = {2.1303E-74, 5.7379E-09, 4.1887E-11}; + double seq_tsr_spctr_cst_k3_init[] = {1.4917E-05, 2.5510E-10, 5.4110E-12}; + double seq_tsr_spctr_cst_k4_init[] = {5.6408E-06, 1.8318E-09, 5.2399E-80}; + double seq_tsr_spctr_cst_k5_init[] = {2.8218E-05, 3.0049E-09, 5.2399E-11}; + double seq_tsr_spctr_k0_init[] = {3.9315E-05, 2.2285E-08, 6.1958E-08}; + double seq_tsr_spctr_k1_init[] = {5.3745E-06, 3.6464E-08, 2.2334E-10}; + double seq_tsr_spctr_k2_init[] = {5.9868E-14, 1.4877E-09, 5.3514E-12}; + double seq_tsr_spctr_k3_init[] = {1.3994E-15, 2.5071E-09, 2.7323E-11}; + double seq_tsr_spctr_k4_init[] = {2.0404E-04, 8.2989E-09, 6.0431E-11}; + double seq_tsr_spctr_k5_init[] = {6.9073E-15, 4.0130E-09, 2.2669E-13}; + double pin_keys_mdl_init[] = {4.0261E-05, 7.2443E-07}; + double spredist_mdl_init[] = {6.8713E-23, 7.8867E-04, 6.9422E-11}; + double dgtog_res_mdl_init[] = {0.0, 0.0, 7.25E-10}; // elementwise reshuffling of distribution + double blres_mdl_init[] = {0.0, 1E-10}; // blockwise reshuffling of distribution +} diff --git a/src/shared/model.cxx b/src/shared/model.cxx index ef17ebf4..f8a31e0d 100644 --- a/src/shared/model.cxx +++ b/src/shared/model.cxx @@ -53,6 +53,14 @@ namespace CTF_int { #endif } + void dump_touched_models(std::string path){ +#ifdef TUNE + for (int i=0; i<(int)get_all_models().size(); i++){ + get_all_models()[i]->dump_data(path, true); + } +#endif + } + #define SPLINE_CHUNK_SZ = 8 double cddot(int n, const double *dX, @@ -245,7 +253,7 @@ namespace CTF_int { //if (nobs % tune_interval == 0){ //define the number of cols in the matrix to be the min of the number of observations and - //the number we are willing to store (hist_size) + //the number we are willing to store ( {}hist_size) int nrcol = std::min(nobs,(int64_t)hist_size); //max of the number of local observations and nparam (will usually be the former) int ncol = std::max(nrcol, nparam); @@ -697,11 +705,12 @@ namespace CTF_int { } template - void LinModel::dump_data(std::string path){ + void LinModel::dump_data(std::string path, bool dump_only_touched){ int rank = 0; int np, my_rank; MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); MPI_Comm_size(MPI_COMM_WORLD, &np); +/* while(rank < np){ if (rank == my_rank){ // Open the file @@ -721,6 +730,7 @@ namespace CTF_int { int num_records = std::min(nobs, (int64_t)hist_size); for(int i=0; i local_times(num_records), max_times(local_times); + int min_records(0), max_records(0); + for (int i=0; i < num_records; i++) { + local_times[i] = time_param_mat[i*mat_lda]; + } + MPI_Allreduce(&num_records, &max_records, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); + MPI_Allreduce(&num_records, &min_records, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); + assert(max_records == min_records); + if (max_records > 0 && max_records == min_records) { + MPI_Reduce(local_times.data(), max_times.data(), num_records, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + if (!my_rank && dump) { + // Open the file + std::ofstream ofs; + std::string model_name = std::string(name); + ofs.open(path+"/"+model_name, std::ofstream::out | std::ofstream::app); + + // Dump the model coeffs + ofs << "Coeff: "; + for(int i=0; i - void CubicModel::dump_data(std::string path){ + void CubicModel::dump_data(std::string path, bool dump_only_touched){ lmdl.dump_data(path); } diff --git a/src/shared/model.h b/src/shared/model.h index e9b3a7d8..206f6bda 100644 --- a/src/shared/model.h +++ b/src/shared/model.h @@ -20,7 +20,7 @@ namespace CTF_int { virtual void print_uo(){}; virtual void load_coeff(std::string file_name){}; virtual void write_coeff(std::string file_name){}; - virtual void dump_data(std::string path){}; + virtual void dump_data(std::string path, bool dump_only_touched = false){}; }; void update_all_models(MPI_Comm cm); @@ -28,6 +28,7 @@ namespace CTF_int { void load_all_models(std::string file_name); void write_all_models(std::string file_name); void dump_all_models(std::string path); + void dump_touched_models(std::string path); /** * \brief Linear performance models, which given measurements, provides new model guess @@ -138,7 +139,8 @@ namespace CTF_int { /** * \brief dump model data to a file */ - void dump_data(std::string path); + void dump_data(std::string path, bool dump_only_touched = false); + }; /** @@ -216,7 +218,7 @@ namespace CTF_int { * \brief write model coefficients to file * \param[in] path the path that we wish to dump all files to */ - void dump_data(std::string path); + void dump_data(std::string path, bool dump_only_touched = false); }; diff --git a/src/tensor/untyped_tensor.cxx b/src/tensor/untyped_tensor.cxx index bced3ac2..b4cdbf4d 100644 --- a/src/tensor/untyped_tensor.cxx +++ b/src/tensor/untyped_tensor.cxx @@ -19,6 +19,7 @@ using namespace CTF; + namespace CTF_int { LinModel<3> spredist_mdl(spredist_mdl_init,"spredist_mdl"); @@ -41,7 +42,7 @@ namespace CTF_int { void tensor::free_self(){ if (order > -1){ if (wrld->rank == 0) DPRINTF(3,"Deleted order %d tensor %s\n",order,name); - if (is_folded) unfold(); + if (is_folded && !wrld->dryRanks) unfold(); //if (is_folded) unfold(0,1); cdealloc(sym); cdealloc(lens); @@ -307,7 +308,7 @@ namespace CTF_int { }*/ this->home_size = other->home_size; register_size(this->home_size*sr->el_size); - this->home_buffer = sr->alloc(other->home_size); + if (!wrld->dryRanks) this->home_buffer = sr->alloc(other->home_size); if (other->is_home){ this->is_home = 1; this->data = this->home_buffer; @@ -315,14 +316,16 @@ namespace CTF_int { /*if (this->is_home || this->home_size != other->home_size){ }*/ this->is_home = 0; - sr->copy(this->home_buffer, other->home_buffer, other->home_size); - //CTF_int::alloc_ptr(other->size*sr->el_size, (void**)&this->data); - this->data = sr->alloc(other->size); + if (!wrld->dryRanks){ + sr->copy(this->home_buffer, other->home_buffer, other->home_size); + //CTF_int::alloc_ptr(other->size*sr->el_size, (void**)&this->data); + this->data = sr->alloc(other->size); + } } this->has_home = 1; } else { //CTF_int::alloc_ptr(other->size*sr->el_size, (void**)&this->data); - this->data = sr->alloc(other->size); + if (!wrld->dryRanks) this->data = sr->alloc(other->size); /* if (this->has_home && !this->is_home){ CTF_int::cdealloc(this->home_buffer); }*/ @@ -331,9 +334,9 @@ namespace CTF_int { } #else //CTF_int::alloc_ptr(other->size*sr->el_size, (void**)&this->data); - this->data = sr->alloc(other->size); + if (!wrld->dryRanks) this->data = sr->alloc(other->size); #endif - sr->copy(this->data, other->data, other->size); + if (!wrld->dryRanks) sr->copy(this->data, other->data, other->size); } else { ASSERT(this->is_sparse); has_home = other->has_home; @@ -676,7 +679,6 @@ namespace CTF_int { int * restricted; int btopo; int64_t bmemuse; - if (this->is_mapped){ if (is_sparse){ sr->pair_dealloc(this->data); @@ -686,6 +688,7 @@ namespace CTF_int { memset(this->nnz_blk, 0, sizeof(int64_t)*calc_nvirt()); this->set_new_nnz_glb(this->nnz_blk); } else { + if (!wrld->dryRanks) sr->set(this->data, sr->addid(), this->size); } } else { @@ -726,15 +729,19 @@ namespace CTF_int { //this->has_home = 0; /* if (wrld->rank == 0) DPRINTF(3,"Initial size of tensor %d is " PRId64 ",",tensor_id,this->size);*/ - this->home_buffer = sr->alloc(this->home_size); - if (wrld->rank == 0) DPRINTF(2,"Creating home of %s\n",name); - register_size(this->size*sr->el_size); - this->data = this->home_buffer; + if (!wrld->dryRanks) { + this->home_buffer = sr->alloc(this->home_size); + if (wrld->rank == 0) DPRINTF(2,"Creating home of %s\n",name); + register_size(this->size*sr->el_size); + this->data = this->home_buffer; + } } else { - this->data = sr->alloc(this->size); + if (!wrld->dryRanks) + this->data = sr->alloc(this->size); } #else - this->data = sr->alloc(this->size); + if (!wrld->dryRanks) + this->data = sr->alloc(this->size); //CTF_int::alloc_ptr(this->size*sr->el_size, (void**)&this->data); #endif #if DEBUG >= 1 @@ -743,7 +750,8 @@ namespace CTF_int { this->print_lens(); this->print_map(stdout); #endif - sr->init(this->size, this->data); + if (!wrld->dryRanks) + sr->init(this->size, this->data); } } TAU_FSTOP(set_zero_tsr); @@ -752,17 +760,24 @@ namespace CTF_int { void tensor::print_map(FILE * stream, bool allcall) const { if (!allcall || wrld->rank == 0){ - if (is_sparse) - printf("printing mapping of sparse tensor %s\n",name); - else - printf("printing mapping of dense tensor %s\n",name); +// if (is_sparse) +// printf("printing mapping of sparse tensor %s\n",name); +// else +// printf("printing mapping of dense tensor %s\n",name); +// if (topo != NULL){ +// printf("CTF: %s mapped to order %d topology with dims:",name,topo->order); +// for (int dim=0; dimorder; dim++){ +// printf(" %d ",topo->lens[dim]); +// } +// } +// printf("\n"); if (topo != NULL){ - printf("CTF: %s mapped to order %d topology with dims:",name,topo->order); + printf("%s topo (",name); for (int dim=0; dimorder; dim++){ - printf(" %d ",topo->lens[dim]); + printf(", %d",topo->lens[dim]); } + printf("); "); } - printf("\n"); char tname[200]; tname[0] = '\0'; sprintf(tname, "%s[", name); @@ -1028,7 +1043,16 @@ namespace CTF_int { bool tsr_has_sym = false; bool tsr_has_virt = false; + int topo_dims_A = tsr_A->topo->order; + int topo_dims_B = tsr_B->topo->order; + for (int i=0; iorder; i++){ + if (tsr_A->edge_map[i].type == PHYSICAL_MAP){ + topo_dims_A--; + } + if (tsr_B->edge_map[i].type == PHYSICAL_MAP){ + topo_dims_B--; + } if (A->sym[i] != NS || this->sym[i] != NS) tsr_has_sym = true; if (A->edge_map[i].type == VIRTUAL_MAP || (A->edge_map[i].has_child && A->edge_map[i].child->type == VIRTUAL_MAP)){ @@ -1040,7 +1064,7 @@ namespace CTF_int { } int nvirt_A = tsr_A->calc_nvirt(); int nvirt_B = tsr_B->calc_nvirt(); - if (tsr_B->wrld->np == tsr_A->wrld->np && !tsr_has_sym && !this->is_sparse && !A->is_sparse && nvirt_A == 1 && nvirt_B == 1 && !tsr_has_virt){ + if (tsr_B->wrld->np == tsr_A->wrld->np && !tsr_has_sym && !this->is_sparse && !A->is_sparse && nvirt_A == 1 && nvirt_B == 1 && !tsr_has_virt && topo_dims_A ==0 && topo_dims_B == 0){ push_slice(this, offsets_B, ends_B, beta, A, offsets_A, ends_A, alpha); TAU_FSTOP(slice); return; diff --git a/src/tensor/untyped_tensor.h b/src/tensor/untyped_tensor.h index 6a24b6a5..0ebbd631 100644 --- a/src/tensor/untyped_tensor.h +++ b/src/tensor/untyped_tensor.h @@ -372,7 +372,7 @@ namespace CTF_int { * \param[out] size number of elements in data */ void get_raw_data(char ** data, int64_t * size) const; - + /** * \brief query mapping to processor grid and intra-processor blocking, which may be used to define a tensor with the same initial distribution * \param[out] idx array of this->order chars describing this processor modes mapping on processor grid dimensions tarting from 'a' @@ -1063,6 +1063,7 @@ namespace CTF_int { * \return tensor with same data point as this one but no edge lengths of size 1 */ tensor * get_no_unit_len_alias(); + }; } #endif// __UNTYPED_TENSOR_H__