From 2a78cf774b44432cbb858f50a41eb641fe2942eb Mon Sep 17 00:00:00 2001 From: sbacchio Date: Thu, 7 Jun 2018 12:16:25 +0300 Subject: [PATCH 1/3] Defining aligned variables --- lib/qhg_alloc.c | 3 +-- lib/qhg_defs.h | 6 ++++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/qhg_alloc.c b/lib/qhg_alloc.c index db4939e..8d1bab0 100644 --- a/lib/qhg_alloc.c +++ b/lib/qhg_alloc.c @@ -1,7 +1,6 @@ #include #include - -#define QHG_MEMALIGN 64 +#include /* * Malloc, with minimal error handling diff --git a/lib/qhg_defs.h b/lib/qhg_defs.h index ddf16bc..1e3541f 100644 --- a/lib/qhg_defs.h +++ b/lib/qhg_defs.h @@ -8,5 +8,11 @@ #define NC 3 #define NS 4 #define ND 4 +#define QHG_MEMALIGN 64 +#define QHG_FAST_TYPE float +typedef QHG_FAST_TYPE afloat __attribute__ ((aligned (QHG_MEMALIGN))); + +#define NEPS 6 +extern int QHG_EPS[NEPS][NC+1]; #endif /* _QHG_DEFS_H */ From 9b1023d4a25f3fa0d962e4c2fa9eb9f28cfce3c4 Mon Sep 17 00:00:00 2001 From: sbacchio Date: Thu, 7 Jun 2018 12:17:16 +0300 Subject: [PATCH 2/3] Defining types for fast contractions --- lib/qhg_types.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/lib/qhg_types.h b/lib/qhg_types.h index 717b67a..d720076 100644 --- a/lib/qhg_types.h +++ b/lib/qhg_types.h @@ -54,6 +54,27 @@ typedef struct { qhg_lattice *lat; } qhg_spinor_field; +typedef struct { + afloat *((*field)[NS][NS][NC][NC][2]); + afloat *alloc; + enum qhg_fermion_bc_time bc; + qhg_lattice *lat; +} qhg_fast_spinor_field; + +typedef struct { + double (*C)[NS][NS][NS][NS][2]; + qhg_lattice *lat; + int tsrc; + int write; +} qhg_mesons_open_correlator; + +typedef struct { + double (*C)[NS][NS][NS][NS][NS][NS][2]; + qhg_lattice *lat; + int tsrc; + int write; +} qhg_baryons_open_correlator; + typedef struct { int (*mom_vecs)[3]; int n_mom_vecs; @@ -69,6 +90,7 @@ typedef struct { qhg_mom_list *mom_list; qhg_lattice *lat; } qhg_correlator; + enum projector { P0, From adec0dbd12923f3833a16de14d3c042826176770 Mon Sep 17 00:00:00 2001 From: sbacchio Date: Thu, 7 Jun 2018 12:22:49 +0300 Subject: [PATCH 3/3] Adding auto-vectorized contractions with open indeces for mesons and nucleons --- lib/.gitignore | 1 + lib/Makefile | 8 + lib/qhg.h | 3 + lib/qhg_fast_baryons.c | 521 ++++++++++++++++++++++++++++++++++++ lib/qhg_fast_baryons.h | 12 + lib/qhg_fast_mesons.c | 149 +++++++++++ lib/qhg_fast_mesons.h | 12 + lib/qhg_fast_spinor_field.c | 94 +++++++ lib/qhg_fast_spinor_field.h | 12 + lib/qhg_idx.h | 2 + 10 files changed, 814 insertions(+) create mode 100644 lib/.gitignore create mode 100644 lib/qhg_fast_baryons.c create mode 100644 lib/qhg_fast_baryons.h create mode 100644 lib/qhg_fast_mesons.c create mode 100644 lib/qhg_fast_mesons.h create mode 100644 lib/qhg_fast_spinor_field.c create mode 100644 lib/qhg_fast_spinor_field.h diff --git a/lib/.gitignore b/lib/.gitignore new file mode 100644 index 0000000..d9ac3ed --- /dev/null +++ b/lib/.gitignore @@ -0,0 +1 @@ +*.optrpt \ No newline at end of file diff --git a/lib/Makefile b/lib/Makefile index 9868e52..edb7d59 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -14,6 +14,7 @@ endif all: libqhg.a +#CFLAGS += -qopt-report=1 -I./ CFLAGS += -I./ SOURCES=\ qhg_lattice \ @@ -22,6 +23,9 @@ SOURCES=\ qhg_stop_watch \ qhg_gauge_field \ qhg_spinor_field \ + qhg_fast_spinor_field \ + qhg_fast_mesons \ + qhg_fast_baryons \ qhg_correlator \ qhg_xchange_gauge \ qhg_xchange_spinor \ @@ -63,6 +67,10 @@ libqhg.a: ${addsuffix .o, $(SOURCES)} $(C)$(CC) $(CFLAGS) -c $< -o $@ $(C)$(CC) -M $(CFLAGS) -c $< > $*.d +%.optrpt: %.c + $(E) CC-report $< + $(C)$(CC) $(CFLAGS) -qopt-report=3 -c $< + clean: $(E) CLEAN in $(PWD) $(C)$(RM) *.o *.d diff --git a/lib/qhg.h b/lib/qhg.h index a3df1a9..2a1c7e2 100644 --- a/lib/qhg.h +++ b/lib/qhg.h @@ -26,6 +26,9 @@ #include #include #include +#include +#include +#include #include #include #include diff --git a/lib/qhg_fast_baryons.c b/lib/qhg_fast_baryons.c new file mode 100644 index 0000000..5c53e43 --- /dev/null +++ b/lib/qhg_fast_baryons.c @@ -0,0 +1,521 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +int QHG_EPS[NEPS][NC+1] = {{0,1,2,+1}, + {0,2,1,-1}, + {1,0,2,-1}, + {1,2,0,+1}, + {2,0,1,+1}, + {2,1,0,-1}}; + +inline void +qhg_fast_contract_f111(qhg_baryons_open_correlator corr, qhg_fast_spinor_field sp_1) +{ + + qhg_lattice *lat = sp_1.lat; + int lt = lat->ldims[0]; + unsigned long int lv3 = lat->lv3; + +#ifdef QHG_OMP +#pragma omp parallel for +#endif + for(int t=0; tldims[0]; + unsigned long int lv3 = lat->lv3; + +#ifdef QHG_OMP +#pragma omp parallel for +#endif + for(int t=0; tldims[0]; + unsigned long int lv3 = lat->lv3; + +#ifdef QHG_OMP +#pragma omp parallel for +#endif + for(int t=0; tldims[0]; + unsigned long int lv3 = lat->lv3; + +#ifdef QHG_OMP +#pragma omp parallel for +#endif + for(int t=0; tldims[0]; + unsigned long int lv3 = lat->lv3; + +#ifdef QHG_OMP +#pragma omp parallel for +#endif + for(int t=0; tldims[0]; + qhg_baryons_open_correlator corr; + corr.C = qhg_alloc(lt*NS*NS*NS*NS*NS*NS*2*sizeof(double)); + memset(corr.C, '\0', lt*NS*NS*NS*NS*NS*NS*2*sizeof(double)); + corr.lat = lat; + + // better to check "flavour" but not given at the moment + // f111 + if(sp_1.field == sp_2.field && sp_1.field == sp_3.field) + qhg_fast_contract_f111(corr, sp_1); + // f113 + else if(sp_1.field == sp_2.field && sp_1.field != sp_3.field) + qhg_fast_contract_f113(corr,sp_1,sp_3); + // f121 + else if(sp_1.field != sp_2.field && sp_1.field == sp_3.field) + qhg_fast_contract_f121(corr,sp_1,sp_2); + // f122 + else if(sp_1.field != sp_2.field && sp_2.field == sp_3.field) + qhg_fast_contract_f122(corr,sp_1,sp_2); + // f123 + else if(sp_1.field != sp_2.field && sp_1.field != sp_3.field) + qhg_fast_contract_f123(corr,sp_1,sp_2,sp_3); + + + + /* Split the communicator so that all processes of the same + time-slices have a separate communicator. This way we can do a + reduction over the new communicator. */ + MPI_Comm tcomm; + int *pcoords = lat->comms->proc_coords; + int proc_id = lat->comms->proc_id; + MPI_Comm_split(lat->comms->comm, pcoords[0], proc_id, &tcomm); + + /* The rank within the tcomm communicator */ + int s_proc_id; + MPI_Comm_rank(tcomm, &s_proc_id); + + if(s_proc_id == 0) { + MPI_Reduce( MPI_IN_PLACE, corr.C, lt*NS*NS*NS*NS*NS*NS*2, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + corr.write = 1; + } else { + MPI_Reduce( corr.C, NULL, lt*NS*NS*NS*NS*NS*NS*2, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + corr.write = 0; + } + + //MPI_Comm_free(&tcomm); + return corr; +} + +void +qhg_baryons_open_correlator_finalize(qhg_baryons_open_correlator corr) +{ + free(corr.C); + corr.lat = NULL; + corr.write = 0; +} + +void +qhg_write_baryons_open_correlator(char fname[], qhg_baryons_open_correlator corr, char group[]) +{ + qhg_lattice *lat = corr.lat; + int proc_id = lat->comms->proc_id; + unsigned long int lvol = lat->lvol; + int *pc = lat->comms->proc_coords; + int *ld = lat->ldims; + int *d = lat->dims; + + /* Split the communicator so that all processes that need to write are grouped together */ + MPI_Comm wcomm; + MPI_Comm_split(lat->comms->comm, corr.write, proc_id, &wcomm); + + if ( corr.write == 0 ) + return; + + int ndims = 8; + hsize_t starts[8] = {pc[0]*ld[0], 0, 0, 0, 0, 0, 0, 0}; + hsize_t dims[8] = {d[0], 4, 4, 4, 4, 4, 4, 2}; + hsize_t ldims[8] = {ld[0], 4, 4, 4, 4, 4, 4, 2}; + + hid_t fapl_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(fapl_id, wcomm, MPI_INFO_NULL); + hid_t file_id; + if( access( fname, F_OK ) != -1 ) + file_id = H5Fopen(fname, H5F_ACC_RDWR, fapl_id); + else + file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + H5Pclose(fapl_id); + + hid_t lcpl_id = H5Pcreate(H5P_LINK_CREATE); + H5Pset_create_intermediate_group(lcpl_id, 1); + hid_t top_id = H5Gcreate(file_id, group, lcpl_id, H5P_DEFAULT, H5P_DEFAULT); + + /* + Attributes (metadata) are: + 1) the origin (source position) and + 2) the index order in the file + */ + hsize_t n = 1; + hid_t attrdat_id = H5Screate_simple(1, &n, NULL); + hid_t attr_id = H5Acreate2(top_id, "Tsrc", H5T_NATIVE_INT, attrdat_id, H5P_DEFAULT, H5P_DEFAULT); + H5Awrite(attr_id, H5T_NATIVE_INT, &(corr.tsrc)); + H5Aclose(attr_id); + H5Sclose(attrdat_id); + + char order[] = "C-order: [t,s0,s1,s3,s4,s5,s6,re/im]\0"; + attrdat_id = H5Screate(H5S_SCALAR); + hid_t type_id = H5Tcopy(H5T_C_S1); + H5Tset_size(type_id, strlen(order)); + attr_id = H5Acreate1(top_id, "IndexOrder", type_id, attrdat_id, H5P_DEFAULT); + H5Awrite(attr_id, type_id, &order); + + H5Aclose(attr_id); + H5Tclose(type_id); + H5Sclose(attrdat_id); + /* */ + + hid_t filespace = H5Screate_simple(ndims, dims, NULL); + hid_t dataset_id = H5Dcreate(top_id, "corr", H5T_NATIVE_DOUBLE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + hid_t subspace = H5Screate_simple(ndims, ldims, NULL); + filespace = H5Dget_space(dataset_id); + H5Sselect_hyperslab(filespace, H5S_SELECT_SET, starts, NULL, ldims, NULL); + hid_t plist_id = H5Pcreate(H5P_DATASET_XFER); + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + herr_t status = H5Dwrite(dataset_id, H5T_NATIVE_DOUBLE, subspace, filespace, plist_id, corr.C); + H5Dclose(dataset_id); + H5Sclose(filespace); + H5Sclose(subspace); + H5Pclose(plist_id); + H5Pclose(lcpl_id); + H5Gclose(top_id); + H5Fclose(file_id); + MPI_Comm_free(&wcomm); + return; +} diff --git a/lib/qhg_fast_baryons.h b/lib/qhg_fast_baryons.h new file mode 100644 index 0000000..f3242b7 --- /dev/null +++ b/lib/qhg_fast_baryons.h @@ -0,0 +1,12 @@ +#ifndef _QHG_FAST_BARYONS_H +#define _QHG_FAST_BARYONS_H 1 + +#include + +qhg_baryons_open_correlator qhg_fast_baryons(qhg_fast_spinor_field sp_1, qhg_fast_spinor_field sp_2, qhg_fast_spinor_field sp_3); +void qhg_write_baryons_open_correlator(char fname[], qhg_baryons_open_correlator corr, char group[]); +void qhg_baryons_open_correlator_finalize(qhg_baryons_open_correlator corr); + + + +#endif /* _QHG_BARYONS_H */ diff --git a/lib/qhg_fast_mesons.c b/lib/qhg_fast_mesons.c new file mode 100644 index 0000000..ecbf614 --- /dev/null +++ b/lib/qhg_fast_mesons.c @@ -0,0 +1,149 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +qhg_mesons_open_correlator +qhg_fast_mesons(qhg_fast_spinor_field sp_1, qhg_fast_spinor_field sp_2) +{ + qhg_lattice *lat = sp_1.lat; + int lt = lat->ldims[0]; + qhg_mesons_open_correlator corr; + corr.C = qhg_alloc(lt*NS*NS*NS*NS*2*sizeof(double)); + memset(corr.C, '\0', lt*NS*NS*NS*NS*2*sizeof(double)); + corr.lat = lat; + unsigned long int lv3 = lat->lv3; + +#ifdef QHG_OMP +#pragma omp parallel for +#endif + for(int t=0; tcomms->proc_coords; + int proc_id = lat->comms->proc_id; + MPI_Comm_split(lat->comms->comm, pcoords[0], proc_id, &tcomm); + + /* The rank within the tcomm communicator */ + int s_proc_id; + MPI_Comm_rank(tcomm, &s_proc_id); + + if(s_proc_id == 0) { + MPI_Reduce( MPI_IN_PLACE, corr.C, lt*NS*NS*NS*NS*2, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + corr.write = 1; + } else { + MPI_Reduce( corr.C, NULL, lt*NS*NS*NS*NS*2, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + corr.write = 0; + } + + //MPI_Comm_free(&tcomm); + return corr; +} + +void +qhg_mesons_open_correlator_finalize(qhg_mesons_open_correlator corr) +{ + free(corr.C); + corr.lat = NULL; + corr.write = 0; +} + +void +qhg_write_mesons_open_correlator(char fname[], qhg_mesons_open_correlator corr, char group[]) +{ + qhg_lattice *lat = corr.lat; + int proc_id = lat->comms->proc_id; + unsigned long int lvol = lat->lvol; + int *pc = lat->comms->proc_coords; + int *ld = lat->ldims; + int *d = lat->dims; + + /* Split the communicator so that all processes that need to write are grouped together */ + MPI_Comm wcomm; + MPI_Comm_split(lat->comms->comm, corr.write, proc_id, &wcomm); + + if ( corr.write == 0 ) + return; + + int ndims = 6; + hsize_t starts[6] = {pc[0]*ld[0], 0, 0, 0, 0, 0}; + hsize_t dims[6] = {d[0], 4, 4, 4, 4, 2}; + hsize_t ldims[6] = {ld[0], 4, 4, 4, 4, 2}; + + hid_t fapl_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(fapl_id, wcomm, MPI_INFO_NULL); + + hid_t file_id; + if( access( fname, F_OK ) != -1 ) + file_id = H5Fopen(fname, H5F_ACC_RDWR, fapl_id); + else + file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + H5Pclose(fapl_id); + + hid_t lcpl_id = H5Pcreate(H5P_LINK_CREATE); + H5Pset_create_intermediate_group(lcpl_id, 1); + hid_t top_id = H5Gcreate(file_id, group, lcpl_id, H5P_DEFAULT, H5P_DEFAULT); + + /* + Attributes (metadata) are: + 1) the origin (source position) and + 2) the index order in the file + */ + hsize_t n = 1; + hid_t attrdat_id = H5Screate_simple(1, &n, NULL); + hid_t attr_id = H5Acreate2(top_id, "Tsrc", H5T_NATIVE_INT, attrdat_id, H5P_DEFAULT, H5P_DEFAULT); + H5Awrite(attr_id, H5T_NATIVE_INT, &(corr.tsrc)); + H5Aclose(attr_id); + H5Sclose(attrdat_id); + + char order[] = "C-order: [t,s0,s1,s3,s4,re-re/re-im/im-re/im-im]\0"; + attrdat_id = H5Screate(H5S_SCALAR); + hid_t type_id = H5Tcopy(H5T_C_S1); + H5Tset_size(type_id, strlen(order)); + attr_id = H5Acreate1(top_id, "IndexOrder", type_id, attrdat_id, H5P_DEFAULT); + H5Awrite(attr_id, type_id, &order); + + H5Aclose(attr_id); + H5Tclose(type_id); + H5Sclose(attrdat_id); + /* */ + + hid_t filespace = H5Screate_simple(ndims, dims, NULL); + hid_t dataset_id = H5Dcreate(top_id, "corr", H5T_NATIVE_DOUBLE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + hid_t subspace = H5Screate_simple(ndims, ldims, NULL); + filespace = H5Dget_space(dataset_id); + H5Sselect_hyperslab(filespace, H5S_SELECT_SET, starts, NULL, ldims, NULL); + hid_t plist_id = H5Pcreate(H5P_DATASET_XFER); + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + herr_t status = H5Dwrite(dataset_id, H5T_NATIVE_DOUBLE, subspace, filespace, plist_id, corr.C); + H5Dclose(dataset_id); + H5Sclose(filespace); + H5Sclose(subspace); + H5Pclose(plist_id); + H5Pclose(lcpl_id); + H5Gclose(top_id); + H5Fclose(file_id); + MPI_Comm_free(&wcomm); + return; +} diff --git a/lib/qhg_fast_mesons.h b/lib/qhg_fast_mesons.h new file mode 100644 index 0000000..80139d6 --- /dev/null +++ b/lib/qhg_fast_mesons.h @@ -0,0 +1,12 @@ +#ifndef _QHG_FAST_MESONS_H +#define _QHG_FAST_MESONS_H 1 + +#include + +qhg_mesons_open_correlator qhg_fast_mesons(qhg_fast_spinor_field sp_1, qhg_fast_spinor_field sp_2); +void qhg_write_mesons_open_correlator(char fname[], qhg_mesons_open_correlator corr, char group[]); +void qhg_mesons_open_correlator_finalize(qhg_mesons_open_correlator corr); + + + +#endif /* _QHG_MESONS_H */ diff --git a/lib/qhg_fast_spinor_field.c b/lib/qhg_fast_spinor_field.c new file mode 100644 index 0000000..8098f8f --- /dev/null +++ b/lib/qhg_fast_spinor_field.c @@ -0,0 +1,94 @@ +#include +#include +#include +#include +#include +#include + +qhg_fast_spinor_field +qhg_fast_spinor_field_init(qhg_lattice *lat, enum qhg_fermion_bc_time bc) +{ + unsigned long int lvol = lat->lvol; + unsigned long int lv3 = lat->lv3; + int lt = lat->ldims[0]; + + qhg_fast_spinor_field sp; + sp.alloc = qhg_alloc(lvol*NS*NS*NC*NC*2*sizeof(afloat)); + sp.field = qhg_alloc(lt*NS*NS*NC*NC*2*sizeof(afloat *)); + sp.lat = lat; + sp.bc = bc; + + afloat *tmp = sp.alloc; + + for(int t=0; tlvol*NS*NS*NC*NC*2*sizeof(afloat)); + return; +} + +void +qhg_fast_spinor_field_import(qhg_fast_spinor_field y, qhg_spinor_field x[NS*NC]) +{ + y.lat = x[0].lat; + y.bc = x[0].bc; + unsigned long int lvol = y.lat->lvol; + unsigned long int lv3 = y.lat->lv3; + + for(int cs0=0; cs0ldims[0]; + unsigned long int lv3 = y.lat->lv3; + + double norm = 0; + + for(int t=0; tcomms->comm); + return norm; +} + diff --git a/lib/qhg_fast_spinor_field.h b/lib/qhg_fast_spinor_field.h new file mode 100644 index 0000000..07335f1 --- /dev/null +++ b/lib/qhg_fast_spinor_field.h @@ -0,0 +1,12 @@ +#ifndef _QHG_FAST_SPINOR_FIELD_H +#define _QHG_FAST_SPINOR_FIELD_H 1 + +#include + +qhg_fast_spinor_field qhg_fast_spinor_field_init(qhg_lattice *, enum qhg_fermion_bc_time); +void qhg_fast_spinor_field_finalize(qhg_fast_spinor_field ); +void qhg_fast_spinor_field_copy(qhg_fast_spinor_field, qhg_fast_spinor_field); +void qhg_fast_spinor_field_import(qhg_fast_spinor_field, qhg_spinor_field[NS*NC]); +double qhg_fast_spinor_field_normsq(qhg_fast_spinor_field); + +#endif /* _QHG_FAST_SPINOR_FIELD_H */ diff --git a/lib/qhg_idx.h b/lib/qhg_idx.h index 1a75445..18f8ebf 100644 --- a/lib/qhg_idx.h +++ b/lib/qhg_idx.h @@ -9,6 +9,8 @@ * Single color-spin index from [spin,color] index */ #define CS(s,c) ((c)+(s)*NC) +#define CS2S(cs) ((int) cs/NC) +#define CS2C(cs) ((int) cs%NC) /* * Single index from [color0,color1] index