Comparing Fortran and CTF performance on symmetries in tensor contractions

Hi,

  I made a comparison for the contraction `A[a,b] B[b,c,d,e] = C[a,c,d,e]`, where `B[:,:,d,e] = B[:,:,e,d]`. I used Fortran by loop over external indices `d,e` to utilize the symmetry then `dgemm`. In `CTF`, I tried to set ` ctf.tensor([N,N,N,N],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.SY,ctf.SYM.NS])`. Both Fortran (gfortran) and `CTF` used `openblas` library. The loop over indices strategy appears (does not mean first) in tensor contraction engine J. Phys. Chem. A 2003, 107, 9887-9897 and in `nwchem` http://gitlab.hpcrl.cse.ohio-state.edu/jinsung/nwchem-cogent-master/-/blob/eac3c06962a89c597fab04aa19bcf9c3989b3ae4/nwchem/src/tce/ccsd/cc2_t2.F

Here is the Fortran code
```
Program test_contraction
  !A[a,b] * B[b,c,d,e] = C[a,c,d,e]
  
    integer, parameter :: dp = selected_real_kind(15, 307)
  
    Real( dp ), Dimension( :, :    ), Allocatable :: a
    Real( dp ), Dimension( :, :, :, : ), Allocatable :: b0, b
    Real( dp ), Dimension( :, :, :, : ), Allocatable :: c0, c1, c2
  
    Integer :: na, nb, nc, nd, ne, m, n, m_iter
    Integer :: ia, ib, ic, id, ie  
    Integer :: start, finish, rate
    real(dp) :: sum_time


    Write( *, * ) 'na, nb, nc, nd, ne ?'
    Read( *, * ) na, nb, nc, nd, ne


    Allocate( a ( 1:na, 1:nb ) ) 
    Allocate( b0 ( 1:nb, 1:nc, 1:nd, 1:ne ) )
    Allocate( b ( 1:nb, 1:nc, 1:nd, 1:ne ) )
  
    Allocate( c0( 1:na, 1:nc, 1:nd, 1:ne ) )   
    Allocate( c1( 1:na, 1:nc, 1:nd, 1:ne ) )   
    Allocate( c2( 1:na, 1:nc, 1:nd, 1:ne ) )  
  

  
    Call Random_number( a )
    Call Random_number( b0 )
    c1 = 0.0_dp
    c2 = 0.0_dp

    m_iter = 20

    write (*,*) 'm_iter average', m_iter 
  
    !symmetrize
    do id = 1, nd
      do ie = 1, ne
        b(:, :, id, ie) = ( b0(:, :, id, ie) + b0(:, :, ie, id) )*0.5
      end do  
    end do  


! refrence value by nested loops
  sum_time = 0.0
  do m = 1, m_iter
    c0 = 0.0_dp
    Call System_clock( start, rate )
    do ie = 1, ne 
      do id = 1, nd
        do ic = 1, nc
          do ib = 1, nb
            do ia = 1, na
              c0(ia, ic, id, ie) = c0(ia, ic, id, ie)  + a(ia, ib) * b(ib, ic, id, ie)
            end do  
          end do  
        end do
      end do
    end do  
    Call System_clock( finish, rate )
    sum_time = sum_time + Real( finish - start, dp ) / rate  
  end do
  Write( *, * ) 'Time for naive loop edcba', sum_time / m_iter 
  

  sum_time = 0.0  
  do m = 1, m_iter     
    do ie = 1, ne 
      do id = 1, nd
        Call System_clock( start, rate )
        Call dgemm( 'N', 'N', na, nb, nb, 1.0_dp, a , Size( a , Dim = 1 ), &
                                            b(:,:,id, ie) , Size( b , Dim = 1 ), &
                                            0.0_dp, c1(:,:,id, ie), Size( c1, Dim = 1 ) )
      !c5(:,id)                                      
        Call System_clock( finish, rate )
        sum_time = sum_time + Real( finish - start, dp ) / rate
      end do    
    end do
  end do  
  Write( *, * ) 'Time for loop dgemm-2', sum_time / m_iter
    


   


  sum_time = 0.0  
  do m = 1, m_iter     
    do ie = 1, ne 
      do id = 1, ie
        Call System_clock( start, rate )
        Call dgemm( 'N', 'N', na, nb, nb, 1.0_dp, a , Size( a , Dim = 1 ), &
                                            b(:,:,id, ie) , Size( b , Dim = 1 ), &
                                            0.0_dp, c2(:,:,id, ie), Size( c2, Dim = 1 ) )
      !c5(:,id)                                      
        Call System_clock( finish, rate )
        sum_time = sum_time + Real( finish - start, dp ) / rate

        c2(:,:,ie, id) = c2(:,:,id, ie) 

      end do    
    end do
  end do  
  Write( *, * ) 'Time for loop dgemm-symmetry', sum_time / m_iter
    

  do ia = 1, na
    do ic = 1, nc
      do id = 1, nd
        do ie = 1, ne
 
          if ( dabs(c1(ia,ic,id,ie) - c0(ia,ic,id,ie))  > 1.e-6 ) then 
            write (*,*) '!!! c1', ia,ic,id,ie, c0(ia,ic,id,ie), c1(ia,ic,id,ie)
          endif   

          if ( dabs(c2(ia,ic,id,ie) - c0(ia,ic,id,ie))  > 1.e-6 ) then 
            write (*,*) '!!! c2', ia,ic,id,ie, c2(ia,ic,id,ie), c1(ia,ic,id,ie)
          endif      
              
        enddo 
      enddo
    enddo
  enddo  
  


end     

```

Here is the `CTF` called from python
```
import numpy as np
import time
import scipy.stats
import warnings
#import timeit
import ctf


warnings.filterwarnings("ignore", category=DeprecationWarning)
# from https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data
def mean_confidence_interval(data, var_name, unit, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
    
    print(var_name, round(m, 5), "\u00B1",  round(h, 5), unit )

print('ab,bcde->acd')
# A[a,b] * B[b,c,d,e] = C[a,c,d,e]
na = nb = nc = nd = ne = dim_comm = 32
#nt = 10
n_times = 64
    
print('number of common dimension', dim_comm)
print('number of averaged time', n_times)

A = np.random.random((na,nb))
B = np.random.random((nb,nc,nd,ne))
B_symm= np.zeros((na,nc,nd,ne))
# symmetrize B
for id in range(nd):
    for ie in range(ne):            
        B_symm[:,:,id,ie] = (B[:,:,id,ie] + B[:,:,ie,id]) * 0.5

N = dim_comm

C_ref = np.einsum('ab,bcde->acde', A, B_symm)

tA = ctf.tensor([N,N],sym=[ctf.SYM.NS,ctf.SYM.NS])
tB2 = ctf.tensor([N,N,N,N],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.NS,ctf.SYM.NS])
tB2_symm = ctf.tensor([N,N,N,N],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.SY,ctf.SYM.NS])

tC = ctf.tensor([N,N,N,N],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.NS,ctf.SYM.NS])
tC_symm = ctf.tensor([N,N,N,N],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.SY,ctf.SYM.NS])

tA = ctf.from_nparray(A)
tB2 = ctf.from_nparray(B_symm)
tB2_symm = ctf.from_nparray(B_symm)



list_time = []

for i in range(n_times):
    start_time = time.time()
    tC = ctf.einsum('ab,bcde->acde', tA, tB2) 
    finish_time = time.time()

    list_time.append(finish_time - start_time)    
  
mean_confidence_interval(list_time, 'ctf', 's' ) 



list_time = []

for i in range(n_times):
    start_time = time.time()
    tC_symm = ctf.einsum('ab,bcde->acde', tA, tB2_symm) 
    finish_time = time.time()

    list_time.append(finish_time - start_time)    
  
mean_confidence_interval(list_time, 'ctf-symm', 's' ) 

tC_temp = ctf.to_nparray(tC)
print('check ctf',np.allclose(C_ref,tC_temp))
tC_symm_temp = ctf.to_nparray(tC_symm)
print('check ctf symm',np.allclose(C_ref,tC_symm_temp))

```

The results are, Fortran
```
 na, nb, nc, nd, ne ?
32 32 32 32 32
 m_iter average          20
 Time for naive loop edcba  0.31620000000000004
 Time for loop dgemm-2   6.9000000000000051E-003
 Time for loop dgemm-symmetry   3.5500000000000024E-003
```

Python
```
ab,bcde->acd
number of common dimension 32
number of averaged time 64
ctf 0.00809 ± 0.00031 s
ctf-symm 0.00863 ± 0.00054 s
check ctf True
check ctf symm True
```

It seems I can get ~ 90% speed up by symmetry in Fortran, but ~ 0 in `CTF`. If I use python to do similar loops over external indices, with `einsum` as the Fortran code did, the results will be much slower (>a factor of 10). Is my setting on `CTF` correct? 

From intel people, the efficiency of calling `MKL` from Fotran and C++ is similar https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/performance-of-MKL-by-c-or-fortran/td-p/936921
I suppose it holds for `openblas`. So I think if I compare with C++ with dgemm and python loaded `CTF`, the results will be similar. 

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Comparing Fortran and CTF performance on symmetries in tensor contractions #136

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Comparing Fortran and CTF performance on symmetries in tensor contractions #136

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions