-
Notifications
You must be signed in to change notification settings - Fork 55
Description
Hi,
I made a comparison for the contraction A[a,b] B[b,c,d,e] = C[a,c,d,e], where B[:,:,d,e] = B[:,:,e,d]. I used Fortran by loop over external indices d,e to utilize the symmetry then dgemm. In CTF, I tried to set ctf.tensor([N,N,N,N],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.SY,ctf.SYM.NS]). Both Fortran (gfortran) and CTF used openblas library. The loop over indices strategy appears (does not mean first) in tensor contraction engine J. Phys. Chem. A 2003, 107, 9887-9897 and in nwchem http://gitlab.hpcrl.cse.ohio-state.edu/jinsung/nwchem-cogent-master/-/blob/eac3c06962a89c597fab04aa19bcf9c3989b3ae4/nwchem/src/tce/ccsd/cc2_t2.F
Here is the Fortran code
Program test_contraction
!A[a,b] * B[b,c,d,e] = C[a,c,d,e]
integer, parameter :: dp = selected_real_kind(15, 307)
Real( dp ), Dimension( :, : ), Allocatable :: a
Real( dp ), Dimension( :, :, :, : ), Allocatable :: b0, b
Real( dp ), Dimension( :, :, :, : ), Allocatable :: c0, c1, c2
Integer :: na, nb, nc, nd, ne, m, n, m_iter
Integer :: ia, ib, ic, id, ie
Integer :: start, finish, rate
real(dp) :: sum_time
Write( *, * ) 'na, nb, nc, nd, ne ?'
Read( *, * ) na, nb, nc, nd, ne
Allocate( a ( 1:na, 1:nb ) )
Allocate( b0 ( 1:nb, 1:nc, 1:nd, 1:ne ) )
Allocate( b ( 1:nb, 1:nc, 1:nd, 1:ne ) )
Allocate( c0( 1:na, 1:nc, 1:nd, 1:ne ) )
Allocate( c1( 1:na, 1:nc, 1:nd, 1:ne ) )
Allocate( c2( 1:na, 1:nc, 1:nd, 1:ne ) )
Call Random_number( a )
Call Random_number( b0 )
c1 = 0.0_dp
c2 = 0.0_dp
m_iter = 20
write (*,*) 'm_iter average', m_iter
!symmetrize
do id = 1, nd
do ie = 1, ne
b(:, :, id, ie) = ( b0(:, :, id, ie) + b0(:, :, ie, id) )*0.5
end do
end do
! refrence value by nested loops
sum_time = 0.0
do m = 1, m_iter
c0 = 0.0_dp
Call System_clock( start, rate )
do ie = 1, ne
do id = 1, nd
do ic = 1, nc
do ib = 1, nb
do ia = 1, na
c0(ia, ic, id, ie) = c0(ia, ic, id, ie) + a(ia, ib) * b(ib, ic, id, ie)
end do
end do
end do
end do
end do
Call System_clock( finish, rate )
sum_time = sum_time + Real( finish - start, dp ) / rate
end do
Write( *, * ) 'Time for naive loop edcba', sum_time / m_iter
sum_time = 0.0
do m = 1, m_iter
do ie = 1, ne
do id = 1, nd
Call System_clock( start, rate )
Call dgemm( 'N', 'N', na, nb, nb, 1.0_dp, a , Size( a , Dim = 1 ), &
b(:,:,id, ie) , Size( b , Dim = 1 ), &
0.0_dp, c1(:,:,id, ie), Size( c1, Dim = 1 ) )
!c5(:,id)
Call System_clock( finish, rate )
sum_time = sum_time + Real( finish - start, dp ) / rate
end do
end do
end do
Write( *, * ) 'Time for loop dgemm-2', sum_time / m_iter
sum_time = 0.0
do m = 1, m_iter
do ie = 1, ne
do id = 1, ie
Call System_clock( start, rate )
Call dgemm( 'N', 'N', na, nb, nb, 1.0_dp, a , Size( a , Dim = 1 ), &
b(:,:,id, ie) , Size( b , Dim = 1 ), &
0.0_dp, c2(:,:,id, ie), Size( c2, Dim = 1 ) )
!c5(:,id)
Call System_clock( finish, rate )
sum_time = sum_time + Real( finish - start, dp ) / rate
c2(:,:,ie, id) = c2(:,:,id, ie)
end do
end do
end do
Write( *, * ) 'Time for loop dgemm-symmetry', sum_time / m_iter
do ia = 1, na
do ic = 1, nc
do id = 1, nd
do ie = 1, ne
if ( dabs(c1(ia,ic,id,ie) - c0(ia,ic,id,ie)) > 1.e-6 ) then
write (*,*) '!!! c1', ia,ic,id,ie, c0(ia,ic,id,ie), c1(ia,ic,id,ie)
endif
if ( dabs(c2(ia,ic,id,ie) - c0(ia,ic,id,ie)) > 1.e-6 ) then
write (*,*) '!!! c2', ia,ic,id,ie, c2(ia,ic,id,ie), c1(ia,ic,id,ie)
endif
enddo
enddo
enddo
enddo
end
Here is the CTF called from python
import numpy as np
import time
import scipy.stats
import warnings
#import timeit
import ctf
warnings.filterwarnings("ignore", category=DeprecationWarning)
# from https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data
def mean_confidence_interval(data, var_name, unit, confidence=0.95):
a = 1.0 * np.array(data)
n = len(a)
m, se = np.mean(a), scipy.stats.sem(a)
h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
print(var_name, round(m, 5), "\u00B1", round(h, 5), unit )
print('ab,bcde->acd')
# A[a,b] * B[b,c,d,e] = C[a,c,d,e]
na = nb = nc = nd = ne = dim_comm = 32
#nt = 10
n_times = 64
print('number of common dimension', dim_comm)
print('number of averaged time', n_times)
A = np.random.random((na,nb))
B = np.random.random((nb,nc,nd,ne))
B_symm= np.zeros((na,nc,nd,ne))
# symmetrize B
for id in range(nd):
for ie in range(ne):
B_symm[:,:,id,ie] = (B[:,:,id,ie] + B[:,:,ie,id]) * 0.5
N = dim_comm
C_ref = np.einsum('ab,bcde->acde', A, B_symm)
tA = ctf.tensor([N,N],sym=[ctf.SYM.NS,ctf.SYM.NS])
tB2 = ctf.tensor([N,N,N,N],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.NS,ctf.SYM.NS])
tB2_symm = ctf.tensor([N,N,N,N],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.SY,ctf.SYM.NS])
tC = ctf.tensor([N,N,N,N],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.NS,ctf.SYM.NS])
tC_symm = ctf.tensor([N,N,N,N],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.SY,ctf.SYM.NS])
tA = ctf.from_nparray(A)
tB2 = ctf.from_nparray(B_symm)
tB2_symm = ctf.from_nparray(B_symm)
list_time = []
for i in range(n_times):
start_time = time.time()
tC = ctf.einsum('ab,bcde->acde', tA, tB2)
finish_time = time.time()
list_time.append(finish_time - start_time)
mean_confidence_interval(list_time, 'ctf', 's' )
list_time = []
for i in range(n_times):
start_time = time.time()
tC_symm = ctf.einsum('ab,bcde->acde', tA, tB2_symm)
finish_time = time.time()
list_time.append(finish_time - start_time)
mean_confidence_interval(list_time, 'ctf-symm', 's' )
tC_temp = ctf.to_nparray(tC)
print('check ctf',np.allclose(C_ref,tC_temp))
tC_symm_temp = ctf.to_nparray(tC_symm)
print('check ctf symm',np.allclose(C_ref,tC_symm_temp))
The results are, Fortran
na, nb, nc, nd, ne ?
32 32 32 32 32
m_iter average 20
Time for naive loop edcba 0.31620000000000004
Time for loop dgemm-2 6.9000000000000051E-003
Time for loop dgemm-symmetry 3.5500000000000024E-003
Python
ab,bcde->acd
number of common dimension 32
number of averaged time 64
ctf 0.00809 ± 0.00031 s
ctf-symm 0.00863 ± 0.00054 s
check ctf True
check ctf symm True
It seems I can get ~ 90% speed up by symmetry in Fortran, but ~ 0 in CTF. If I use python to do similar loops over external indices, with einsum as the Fortran code did, the results will be much slower (>a factor of 10). Is my setting on CTF correct?
From intel people, the efficiency of calling MKL from Fotran and C++ is similar https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/performance-of-MKL-by-c-or-fortran/td-p/936921
I suppose it holds for openblas. So I think if I compare with C++ with dgemm and python loaded CTF, the results will be similar.