Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions Grid/allocator/MemoryManagerCache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -572,6 +572,32 @@ void MemoryManager::Audit(std::string s)

}

// void MemoryManager::PrintStateNew(void* _CpuPtr)
// {
// uint64_t CpuPtr = (uint64_t)_CpuPtr;

// if ( EntryPresent(CpuPtr) ){
// auto AccCacheIterator = EntryLookup(CpuPtr);
// auto & AccCache = AccCacheIterator->second;
// std::string str;
// if ( AccCache.state==Empty ) str = std::string("Empty");
// if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
// if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
// if ( AccCache.state==Consistent)str = std::string("Consistent");
// if ( AccCache.state==EvictNext) str = std::string("EvictNext");

// std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
// std::cout << GridLogMessage << "\tx"<<std::hex<<AccCache.CpuPtr<<std::dec
// << "\tx"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
// << "\t" << AccCache.cpuLock
// << "\t" << AccCache.accLock
// << "\t" << AccCache.LRU_valid<<std::endl;

// } else {
// std::cout << GridLogMessage << "No Entry in AccCache table." << std::endl;
// }
// }

void MemoryManager::PrintState(void* _CpuPtr)
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
Expand Down
1 change: 1 addition & 0 deletions Grid/cshift/Cshift.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ NAMESPACE_BEGIN(Grid);
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
{
GRID_TRACE("Cshift");
return Cshift(closure(expr),dim,shift);
}
NAMESPACE_END(Grid);
Expand Down
230 changes: 116 additions & 114 deletions Grid/cshift/Cshift_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,21 +31,73 @@ NAMESPACE_BEGIN(Grid);

extern std::vector<std::pair<int,int> > Cshift_table;
extern deviceVector<std::pair<int,int> > Cshift_table_device;
extern std::vector<int> Cshift_vector;
extern deviceVector<int> Cshift_vector_device;

inline std::pair<int,int> *MapCshiftTable(void)
// Copy Cshift map object (table or vector) to device
template<class vobj>
inline void MapCshiftCopy(std::vector<vobj> &Cshift_obj, deviceVector<vobj> &Cshift_obj_device)
{
// GPU version
uint64_t sz=Cshift_table.size();
if (Cshift_table_device.size()!=sz ) {
Cshift_table_device.resize(sz);
// GPU version only
uint64_t sz=Cshift_obj.size();
if (Cshift_obj_device.size()!=sz ) {
Cshift_obj_device.resize(sz);
}
acceleratorCopyToDevice((void *)&Cshift_table[0],
(void *)&Cshift_table_device[0],
sizeof(Cshift_table[0])*sz);
acceleratorCopyToDevice((void *)&Cshift_obj[0],
(void *)&Cshift_obj_device[0],
sizeof(Cshift_obj[0])*sz);

return &Cshift_table_device[0];
// CPU version use identify map
}

// Copy Cshift map object (table or vector) to device and return pointer to device copy
template<class vobj>
inline vobj *MapCshift(std::vector<vobj> &Cshift_obj, deviceVector<vobj> &Cshift_obj_device)
{
MapCshiftCopy<vobj>(Cshift_obj, Cshift_obj_device);

return &Cshift_obj_device[0];
}

// Calculate Cshift_vector
template<class vobj>
void CalculateCshiftVector(Lattice<vobj> &ret, const Lattice<vobj> &rhs, int dimension, int cbmask)
{
GridBase *grid = rhs.Grid();

if ( !grid->CheckerBoarded(dimension) ) {
cbmask=0x3;
}

int e1=grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=grid->_slice_block[dimension];
int stride = grid->_slice_stride[dimension];

if (Cshift_vector.size() < e1*e2) Cshift_vector.resize(e1*e2); // Let it grow to biggest

int ent = 0;
if(cbmask == 0x3 ){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
Cshift_vector[ent++] = o;
}
}
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
int ocb=1<<ret.Grid()->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) {
Cshift_vector[ent++] = o;
}
}
}
}

if (ent < Cshift_vector.size()) Cshift_vector.resize(ent); // trim vector to actual size (relevant for checkerboarded dimensions)
}


///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
///////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -89,7 +141,7 @@ Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dim
}
{
auto buffer_p = & buffer[0];
auto table = MapCshiftTable();
auto table = MapCshift<std::pair<int,int> >(Cshift_table, Cshift_table_device);
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
Expand Down Expand Up @@ -201,7 +253,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<

{
auto buffer_p = & buffer[0];
auto table = MapCshiftTable();
auto table = MapCshift<std::pair<int,int> >(Cshift_table, Cshift_table_device);
autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
Expand Down Expand Up @@ -263,94 +315,36 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA

template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
{
int rd = rhs.Grid()->_rdimensions[dimension];

if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask=0x3;
}
GRID_TRACE("Copy_plane");

int ro = rplane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs.Grid()->_ostride[dimension]; // base offset for start of plane

int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs.Grid()->_slice_block[dimension];
int stride = rhs.Grid()->_slice_stride[dimension];

if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest

int ent=0;

if(cbmask == 0x3 ){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
}
}
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) {
Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
}
}
}
}

{
auto table = MapCshiftTable();
autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
}
auto table = &Cshift_vector_device[0];
tracePush("copy_plane-av");
autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite);
tracePush("copy_plane-acc_for");
accelerator_for(i,Cshift_vector.size(),vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i]+lo],coalescedRead(rhs_v[table[i]+ro]));
});
tracePop("copy_plane-acc_for");
tracePop("copy_plane-av");
}

template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
{
int rd = rhs.Grid()->_rdimensions[dimension];

if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask=0x3;
}

int ro = rplane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs.Grid()->_ostride[dimension]; // base offset for start of plane

int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs.Grid()->_slice_block [dimension];
int stride = rhs.Grid()->_slice_stride[dimension];

if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest

int ent=0;

if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}}
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}}
}

{
auto table = MapCshiftTable();
autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite);
accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
auto table = &Cshift_vector_device[0];
autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite);
accelerator_for(i,Cshift_vector.size(),1,{
permute(lhs_v[table[i]+lo],rhs_v[table[i]+ro],permute_type);
});
}
}

//////////////////////////////////////////////////////
Expand All @@ -373,6 +367,7 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r

template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GRID_TRACE("Cshift_local");
GridBase *grid = rhs.Grid();
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
Expand All @@ -383,51 +378,58 @@ template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &r
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;

int cb= (cbmask==0x2)? Odd : Even;
int sshift = grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);

// the permute type
ret.Checkerboard() = grid->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
int permute_dim =grid->PermuteDim(dimension);
int permute_type=grid->PermuteType(dimension);
int permute_type_dist;

for(int x=0;x<rd;x++){
// wrap is whether sshift > rd.
// num is sshift mod rd.
//
// shift 7
//
// XoXo YcYc
// oXoX cYcY
// XoXo YcYc
// oXoX cYcY
//
// sshift --
//
// XX YY ; 3
// XX YY ; 0
// XX YY ; 3
// XX YY ; 0
//
int wrap = sshift/rd; wrap=wrap % ly;
int num = sshift%rd;

// Calculate Cshift_vector - it's the same for all slices
tracePush("CalcCshiftTable");
CalculateCshiftVector<vobj>(ret, rhs, dimension, cbmask);
tracePop("CalcCshiftTable");
// Copy it to the device
tracePush("MapCshiftTable");
MapCshiftCopy<int>(Cshift_vector, Cshift_vector_device);
tracePop("MapCshiftTable");

// int o = 0;
int bo = x * grid->_ostride[dimension];
int cb= (cbmask==0x2)? Odd : Even;
for(int x=0;x<rd;x++){

int sshift = grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
int sx = (x+sshift)%rd;

// wrap is whether sshift > rd.
// num is sshift mod rd.
//
// shift 7
//
// XoXo YcYc
// oXoX cYcY
// XoXo YcYc
// oXoX cYcY
//
// sshift --
//
// XX YY ; 3
// XX YY ; 0
// XX YY ; 3
// XX YY ; 0
//
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd; wrap=wrap % ly;
int num = sshift%rd;

if ( x< rd-num ) permute_slice=wrap;
else permute_slice = (wrap+1)%ly;

if ( (ly>2) && (permute_slice) ) {
assert(permute_type & RotateBit);
permute_type_dist = permute_type|permute_slice;
assert(permute_type & RotateBit);
permute_type_dist = permute_type|permute_slice;
} else {
permute_type_dist = permute_type;
permute_type_dist = permute_type;
}
}

Expand Down
15 changes: 15 additions & 0 deletions Grid/cshift/Cshift_mpi.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ NAMESPACE_BEGIN(Grid);
const int Cshift_verbose=0;
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{
GRID_TRACE("Cshift_MPI");
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;

Expand All @@ -56,7 +57,9 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
t0=usecond();
if ( !comm_dim ) {
// std::cout << "CSHIFT: Cshift_local" <<std::endl;
tracePush("Cshift_local_MPI");
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
tracePop("Cshift_local_MPI");
} else if ( splice_dim ) {
// std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift);
Expand Down Expand Up @@ -106,6 +109,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
}
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GRID_TRACE("Cshift_comms_MPI");
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;

Expand All @@ -132,6 +136,16 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r

int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);

// Calculate Cshift_vector - it's the same for all slices
tracePush("CalcCshiftTableMPI");
CalculateCshiftVector<vobj>(ret, rhs, dimension, cbmask);
tracePop("CalcCshiftTableMPI");
// Copy it to the device
tracePush("MapCshiftTableMPI");
MapCshiftCopy<int>(Cshift_vector, Cshift_vector_device);
tracePop("MapCshiftTableMPI");

RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
Expand Down Expand Up @@ -203,6 +217,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r

template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GRID_TRACE("Cshift_comms_simd_MPI");
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
Expand Down
Loading