From 6b555754421bc1db64562f065d39b308b142967d Mon Sep 17 00:00:00 2001
From: Marcus Petschlies <marcus.petschlies@web.de>
Date: Tue, 6 Dec 2016 17:17:03 +0100
Subject: [PATCH 1/8] my changes

---
 wrapper/lib_wrapper.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/wrapper/lib_wrapper.c b/wrapper/lib_wrapper.c
index 5353ebda4..eb6847261 100644
--- a/wrapper/lib_wrapper.c
+++ b/wrapper/lib_wrapper.c
@@ -22,6 +22,7 @@
  * Author: Carsten Urbach
  *         curbach@gmx.de
  *
+ *
  *******************************************************************************/
 
 #include <stdlib.h>

From 68a08eecb08bc78095ba27753da6beb5fb9b516c Mon Sep 17 00:00:00 2001
From: Marcus Petschlies <marcus.petschlies@web.de>
Date: Tue, 20 Dec 2016 13:31:07 +0100
Subject: [PATCH 2/8] added communicator to tmLQCD_mpi_params struct; added
 tmLQCD_deflator_params struct and function prototypes for exact deflation

---
 include/tmLQCD.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/include/tmLQCD.h b/include/tmLQCD.h
index b0c7ea58e..7c4dcd464 100755
--- a/include/tmLQCD.h
+++ b/include/tmLQCD.h
@@ -41,6 +41,9 @@ extern "C"
   typedef struct {
     unsigned int nproc, nproc_t, nproc_x, nproc_y, nproc_z, cart_id, proc_id, time_rank, omp_num_threads;
     unsigned int proc_coords[4];
+#ifdef TM_USE_MPI
+    MPI_Comm cart_grid;
+#endif
   } tmLQCD_mpi_params;
 
   int tmLQCD_invert_init(int argc, char *argv[], const int verbose, const int external_id);
@@ -58,6 +61,12 @@ extern "C"
                     const int op_id, const int gauge_persist);
 #endif
 
+  int tmLQCD_invert_eo(double * const propagator, double * const source, const int op_id);
+    
+  int tmLQCD_get_deflator_params(tmLQCD_deflator_params*params, const int op_id);
+  int tmLQCD_init_deflator(const int op_id);
+  int tmLQCD_set_deflator_fields(const int op_id1, const int op_id2);
+
 #ifdef __cplusplus
 }
 #endif

From 88e9d8defa75db1ff2b7c9403c98455823623656 Mon Sep 17 00:00:00 2001
From: Marcus Petschlies <marcus.petschlies@web.de>
Date: Thu, 22 Dec 2016 09:25:52 +0100
Subject: [PATCH 3/8] added interface functions for exact deflation in
 wrapper/lib_wrapper.c

---
 wrapper/lib_wrapper.c | 172 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)

diff --git a/wrapper/lib_wrapper.c b/wrapper/lib_wrapper.c
index eb6847261..73dd86138 100644
--- a/wrapper/lib_wrapper.c
+++ b/wrapper/lib_wrapper.c
@@ -369,3 +369,175 @@ int tmLQCD_get_gauge_field_pointer(double ** gf) {
 
   return(0);
 }
+
+
+/***************************************************************************************
+ * invert even-odd preconditioned operator directly
+ *   propagator and source must be in even-odd ordering already
+ ***************************************************************************************/
+int tmLQCD_invert_eo(double * const propagator, double * const source, const int op_id) {
+  int iter;
+  spinor * const p_ptr = (spinor * const)propagator;
+  spinor * const s_ptr = (spinor * const)source;
+
+
+  if(!tmLQCD_invert_initialised) {
+    fprintf(stderr, "[tmLQCD_invert_eo] tmLQCD_inver_init must be called first. Aborting...\n");
+    fflush(stderr);
+    return(-1);
+  }
+
+  if(op_id < 0 || op_id >= no_operators) {
+    fprintf(stderr, "[tmLQCD_invert_eo] op_id=%d not in valid range. Aborting...\n", op_id);
+    fflush(stderr);
+    return(-1);
+  }
+
+  if (operator_list[op_id].solver != EXACTDEFLATEDCG ) {
+    fprintf(stderr, "[tmLQCD_invert_eo] Error, wrong solver type in operator\n");
+    fflush(stderr);
+    return(-2);
+  }
+
+  if(g_proc_id == 0) {
+    printf("# [tmLQCD_invert_eo] Using eo-prec., symmetrized, deflated cg!\n");
+    fflush(stdout);
+  }
+
+  /****************************************
+   * - initialize parameters; this is otherwise
+   *   done in operator.c or invert.c
+   * -
+   ****************************************/
+  boundary(operator_list[op_id].kappa);
+  g_mu    = operator_list[op_id].mu;
+  g_c_sw  = operator_list[op_id].c_sw;
+  g_kappa = operator_list[op_id].kappa;
+
+  if(operator_list[op_id].type == CLOVER) {
+    if (g_cart_id == 0 && g_debug_level > 1) {
+      printf("#\n# csw = %e, computing clover leafs\n", g_c_sw);
+    }
+    init_sw_fields(VOLUME);
+
+    sw_term( (const su3**) g_gauge_field, operator_list[op_id].kappa, operator_list[op_id].c_sw);
+    /* this must be EE here!   */
+    /* to match clover_inv in Qsw_psi */
+    sw_invert(EE, operator_list[op_id].mu);
+    /* now copy double sw and sw_inv fields to 32bit versions */
+    copy_32_sw_fields();
+  }
+
+  /* call invert */
+  iter = exactdeflated_cg( operator_list[op_id].solver_params, operator_list[op_id].deflator_params, p_ptr, s_ptr);
+
+  if(iter<0) {
+    fprintf(stderr, "[tmLQCD_invert_eo] Error from deflated_cg, status was %d\n", iter);
+    fflush(stderr);
+    return(1);
+  }
+
+  return(0);
+}  /* end of tmLQCD_invert_eo */
+
+/***************************************************************************************
+ * copy several deflator parameters
+ ***************************************************************************************/
+int tmLQCD_get_deflator_params(tmLQCD_deflator_params*params, const int op_id) {
+  if(!tmLQCD_invert_initialised) {
+    fprintf(stderr, "[tmLQCD_get_deflator_params] tmLQCD_invert_init must be called first. Aborting...\n");
+    return(-1);
+  }
+
+  if(operator_list[op_id].deflator_params.nconv == -1) {
+    fprintf(stderr, "[tmLQCD_get_deflator_params] deflator no initialized. Aborting...\n");
+    return(-2);
+  }
+
+  strcpy(params->type_name,  operator_list[op_id].deflator_params.type_name);
+  params->eoprec = operator_list[op_id].deflator_params.eoprec;
+  params->evecs  = operator_list[op_id].deflator_params.evecs;
+  params->evals  = operator_list[op_id].deflator_params.evals;
+  params->prec   = operator_list[op_id].deflator_params.prec;
+  params->nev    = operator_list[op_id].deflator_params.nconv;
+
+  return(0);
+}  /* end of tmLQCD_get_deflator_params */
+
+/***************************************************************************************
+ * call the deflator function
+ ***************************************************************************************/
+int tmLQCD_init_deflator(const int op_id) {
+  int status;
+
+  if(op_id < 0 || op_id >= no_operators) {
+    fprintf(stderr, "[tmLQCD_init_deflator] op_id=%d not in valid range. Aborting...\n", op_id);
+    return(1);
+  }
+
+
+  if(operator_list[op_id].deflator_params.nconv > -1) {
+    fprintf(stderr, "[tmLQCD_init_deflator] deflator already initialized. Aborting...\n");
+    return(2);
+  }
+
+  /****************************************
+   * initialize parameters; this is otherwise
+   * done in operator.c or invert.c
+   ****************************************/
+  boundary(operator_list[op_id].kappa);
+  g_mu    = operator_list[op_id].mu;
+  g_c_sw  = operator_list[op_id].c_sw;
+  g_kappa = operator_list[op_id].kappa;
+             
+  if (g_cart_id == 0) {
+    fprintf(stdout, "# [tmLQCD_init_deflator] 2 kappa mu = %e, kappa = %e, c_sw = %e\n", g_mu, g_kappa, g_c_sw);
+  }
+
+  if(operator_list[op_id].type == CLOVER) {
+    if (g_cart_id == 0) {
+      printf("# [tmLQCD_init_deflator]\n# [tmLQCD_init_deflator] csw = %e, computing clover leafs\n", g_c_sw);
+    }
+    init_sw_fields(VOLUME);
+
+    sw_term( (const su3**) g_gauge_field, operator_list[op_id].kappa, operator_list[op_id].c_sw);
+    /* this must be EE here!   */
+    /* to match clover_inv in Qsw_psi */
+    sw_invert(EE, operator_list[op_id].mu);
+    /* now copy double sw and sw_inv fields to 32bit versions */
+    copy_32_sw_fields();
+  }
+
+
+  status = operator_list[op_id].deflator_params.init( &(operator_list[op_id].deflator_params) );
+  if(status != 0) {
+    fprintf(stderr, "[tmLQCD_init_deflator] Error from deflator_init, status was %d\n", status);
+    return(3);
+  }
+
+  return(0);
+}  /* end of tmLQCD_init_deflator */
+
+int tmLQCD_set_deflator_fields(const int op_id1, const int op_id2) {
+  int status;
+
+  if( op_id1 < 0 || op_id1 >= no_operators || op_id2 < 0 || op_id2 >= no_operators) {
+    fprintf(stderr, "[tmLQCD_init_deflator] op_id out of range; aborting\n");
+    return(1);
+  }
+
+  if( operator_list[op_id2].deflator_params.nconv <= 0 || operator_list[op_id2].deflator_params.evecs == NULL || operator_list[op_id2].deflator_params.evals == NULL ) {
+    fprintf(stderr, "[tmLQCD_copy_deflator_fields] source deflator not initialized; aborting\n");
+    return(2);
+  }
+
+  /****************************************
+   * set deflator fields
+   ****************************************/
+  operator_list[op_id1].deflator_params.nconv = operator_list[op_id2].deflator_params.nconv;
+  operator_list[op_id1].deflator_params.evecs = operator_list[op_id2].deflator_params.evecs;
+  operator_list[op_id1].deflator_params.evals = operator_list[op_id2].deflator_params.evals;
+
+  return(0);
+}  /* end of tmLQCD_set_deflator_fields */
+

From ff9c75aca12223bb5400e1d04443f78f98a96893 Mon Sep 17 00:00:00 2001
From: Marcus Petschlies <marcus.petschlies@web.de>
Date: Thu, 22 Dec 2016 09:28:36 +0100
Subject: [PATCH 4/8] solver-functions for exact deflation

---
 solver/deflated_cg.c | 436 ++++++++++++++++++++++++++++++++
 solver/deflator.c    | 582 +++++++++++++++++++++++++++++++++++++++++++
 solver/deflator.h    | 103 ++++++++
 3 files changed, 1121 insertions(+)
 create mode 100644 solver/deflated_cg.c
 create mode 100644 solver/deflator.c
 create mode 100644 solver/deflator.h

diff --git a/solver/deflated_cg.c b/solver/deflated_cg.c
new file mode 100644
index 000000000..6c1e1b5cc
--- /dev/null
+++ b/solver/deflated_cg.c
@@ -0,0 +1,436 @@
+/*****************************************************************************
+ * Copyright (C) 2014 Abdou M. Abdel-Rehim
+ *
+ * Deflating CG using eigenvectors computed using ARPACK
+ * eigenvectors used correspond to those with smallest magnitude
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ****************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<config.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+
+#ifdef TM_USE_MPI
+# include <mpi.h>
+#endif
+
+#include "global.h"
+#include "gettime.h"
+#include "linalg_eo.h"
+#include "start.h"
+#include "linalg/blas.h"
+#include "linalg/lapack.h"
+#include <io/eospinor.h>
+#include <io/params.h>
+#include <io/spinor.h>
+#include <io/utils.h>
+#include "solver_field.h"
+#include "solver/deflated_cg.h"
+
+int exactdeflated_cg(
+  solver_params_t solver_params, /* (IN) parameters for solver */
+  deflator_params_t deflator_params, /* (IN) parameters for deflator */
+  spinor * const x,              /* (IN/OUT) initial guess on input, solution on output for this RHS*/
+  spinor * const b               /* (IN) right-hand side*/
+) {
+
+  /* Static variables and arrays. */
+  static int ncurRHS=0;                  /* current number of the system being solved */                   
+  static void *_ax, *_r, *_tmps1, *_tmps2;
+  static spinor *ax, *r, *tmps1, *tmps2;
+  static _Complex double *H,*Hinv,*initwork,*tmpv1;
+  static double *hevals;
+  static int *IPIV; 
+  int i,j,tmpsize;
+  int ONE=1;
+  _Complex double c1, c2, c3;
+  double d1,d2,d3;
+  double et1,et2;  /* timing variables */
+
+  int parallel;        /* for parallel processing of the scalar products */
+#ifdef TM_USE_MPI
+    parallel=1;
+#else
+    parallel=0;
+#endif
+
+  const int N = deflator_params.eoprec == 0 ? VOLUME : VOLUME/2;  /* (IN) Number of lattice sites for this process*/
+
+  /* leading dimension for spinor vectors */
+  int LDN;
+  if(N==VOLUME)
+     LDN = VOLUMEPLUSRAND;
+  else
+     LDN = VOLUMEPLUSRAND/2; 
+
+  /* (IN) Number of right-hand sides to be solved*/ 
+  const int nrhs = solver_params.nrhs; 
+
+  /* (IN) First number of right-hand sides to be solved using tolerance eps_sq1*/ 
+  const int nrhs1 = solver_params.nrhs1;
+
+  /* (IN) squared tolerance of convergence of the linear system for systems 1 till nrhs1*/
+  const double eps_sq1 = solver_params.eps_sq1;
+
+  /* (IN) suqared tolerance for restarting cg */
+  const double res_eps_sq =   solver_params.res_eps_sq;
+
+  /* (IN) how to project with approximate eigenvectors */
+  int projection_type = deflator_params.projection_type;
+
+  /*(IN) field of eigenvectors */
+  _Complex double *evecs = (_Complex double *) deflator_params.evecs;
+
+  /*(IN) list of eigenvalues */
+  _Complex double *evals = (_Complex double *) deflator_params.evals;
+
+  /* (IN) operator in double precision */
+  matrix_mult f = deflator_params.f;
+
+  /* (IN) operator in single precision */
+  matrix_mult32 f32 = deflator_params.f32;
+
+  /* (IN) final operator application during projection of type 1 */
+  matrix_mult f_final = deflator_params.f_final;
+
+  /* (IN) initial operator application during projection of type 1 */
+  matrix_mult f_initial = deflator_params.f_initial;
+
+  /* (IN) number of converged eigenvectors as returned by deflator */
+  int nconv = deflator_params.nconv;
+
+  const double eps_sq   = solver_params.eps_sq;   /* (IN) squared tolerance of convergence of the linear system for systems nrhs1+1 till nrhs*/
+  const int    rel_prec = solver_params.rel_prec; /* (IN) 0 for using absoute error for convergence
+                                                          1 for using relative error for convergence*/
+  const int    maxit    = solver_params.maxiter;  /* (IN) Maximum allowed number of iterations to solution for the linear system*/
+
+  /**************************************************************
+   * if this is the first right hand side, allocate memory
+   **************************************************************/
+  if(ncurRHS==0){ 
+#if (defined SSE || defined SSE2 || defined SSE3)
+    _ax = malloc((LDN+ALIGN_BASE)*sizeof(spinor));
+    if(_ax==NULL)
+    {
+       if(g_proc_id == g_stdio_proc)
+          fprintf(stderr,"[exactdefled_cg] insufficient memory for _ax inside deflated_cg.\n");
+       exit(1);
+    }
+    else
+       {ax  = (spinor *) ( ((unsigned long int)(_ax)+ALIGN_BASE)&~ALIGN_BASE);}
+
+    _r = malloc((LDN+ALIGN_BASE)*sizeof(spinor));
+    if(_r==NULL)
+    {
+       if(g_proc_id == g_stdio_proc)
+          fprintf(stderr,"[exactdefled_cg] insufficient memory for _r inside deflated_cg.\n");
+
+       exit(1);
+    }
+    else
+       {r  = (spinor *) ( ((unsigned long int)(_r)+ALIGN_BASE)&~ALIGN_BASE);}
+
+    _tmps1 = malloc((LDN+ALIGN_BASE)*sizeof(spinor));
+    if(_tmps1==NULL)
+    {
+       if(g_proc_id == g_stdio_proc)
+          fprintf(stderr,"[exactdefled_cg] insufficient memory for _tmps1 inside deflated_cg.\n");
+       exit(1);
+    }
+    else
+       {tmps1  = (spinor *) ( ((unsigned long int)(_tmps1)+ALIGN_BASE)&~ALIGN_BASE);}
+
+    _tmps2 = malloc((LDN+ALIGN_BASE)*sizeof(spinor));
+    if(_tmps2==NULL) {
+       if(g_proc_id == g_stdio_proc)
+          fprintf(stderr,"[exactdefled_cg] insufficient memory for _tmps2 inside deflated_cg.\n");
+       exit(1);
+    } else {
+      tmps2  = (spinor *) ( ((unsigned long int)(_tmps2)+ALIGN_BASE)&~ALIGN_BASE);
+    }
+
+#else
+    ax = (spinor *) malloc(LDN*sizeof(spinor));
+    r  = (spinor *) malloc(LDN*sizeof(spinor));
+    tmps1 = (spinor *) malloc(LDN*sizeof(spinor));
+    tmps2 = (spinor *) malloc(LDN*sizeof(spinor));
+    
+    if( (ax == NULL)  || (r==NULL) || (tmps1==NULL) || (tmps2==NULL) ) {
+       if(g_proc_id == g_stdio_proc) fprintf(stderr,"[exactdefled_cg] insufficient memory for ax,r,tmps1,tmps2 inside exactdeflated_cg.\n");
+       exit(1);
+    }
+#endif
+
+    tmpv1 = (_Complex double *) malloc(12*N*sizeof(_Complex double));
+
+    if((evecs == NULL)  || (evals==NULL) || (tmpv1==NULL))
+    {
+       if(g_proc_id == g_stdio_proc)
+          fprintf(stderr,"[exactdefled_cg] insufficient memory for evecs and evals inside exactdeflated_cg.\n");
+       exit(1);
+    }
+
+    H        = (_Complex double *) malloc(nconv*nconv*sizeof(_Complex double)); 
+    Hinv     = (_Complex double *) malloc(nconv*nconv*sizeof(_Complex double)); 
+    initwork = (_Complex double *) malloc(nconv*sizeof(_Complex double)); 
+    IPIV     = (int *) malloc(nconv*sizeof(int));
+    hevals      = (double *) malloc(nconv*sizeof(double));
+
+    if((H==NULL) || (Hinv==NULL) || (initwork==NULL) || (IPIV==NULL) || (hevals==NULL)) {
+       if(g_proc_id == g_stdio_proc)
+          fprintf(stderr,"[exactdefled_cg] insufficient memory for H, Hinv, initwork, IPIV, hevals inside exactdeflated_cg.\n");
+       exit(1);
+    }
+
+    et1=gettime();
+    /* compute the elements of the hermitian matrix H 
+       leading dimension is nconv and active dimension is nconv 
+       H = V^+ Op V; V = (v_1 ... v_nconv)
+     */
+    
+    if( projection_type == 0) {
+    
+      for(i=0; i<nconv; i++)
+      {
+        assign_complex_to_spinor(r,&evecs[i*12*N],12*N);
+        f(ax,r);
+        c1 = scalar_prod(r,ax,N,parallel);
+        H[i+nconv*i] = creal(c1);  /* diagonal should be real */
+        for(j=i+1; j<nconv; j++) {
+          assign_complex_to_spinor(r,&evecs[j*12*N],12*N);
+          c1 = scalar_prod(r,ax,N,parallel);
+          H[j+nconv*i] = c1;
+          H[i+nconv*j] = conj(c1); /* enforce hermiticity */
+        }
+      }
+
+    } else if ( projection_type == 1 )  {
+
+      for(i=0; i<nconv; i++)
+      {
+        assign_complex_to_spinor(tmps1, &evecs[i*12*N], 12*N);
+        f_final(r, tmps1);
+        f(ax,r);
+        c1 = scalar_prod(r,ax,N,parallel);
+        c2 = scalar_prod(r,r,N,parallel);
+        H[i+nconv*i] = creal(c1) / creal(c2);   /* diagonal should be real */
+        for(j=i+1; j<nconv; j++)
+        {
+          assign_complex_to_spinor(tmps1, &evecs[j*12*N], 12*N);
+          f_final(r, tmps1);
+          c1 = scalar_prod(r,ax,N,parallel);
+          c3 = scalar_prod(r, r, N, parallel);
+
+          H[j+nconv*i] = c1 / sqrt( creal(c2) * creal(c3) );
+          H[i+nconv*j] = conj(c1) / sqrt( creal(c2) * creal(c3) ); /* enforce hermiticity */
+        }
+      }
+    }
+
+
+    et2=gettime();
+    if(g_proc_id == g_stdio_proc) {
+      fprintf(stdout,"[exactdefled_cg] time to compute H: %+e\n",et2-et1);
+    }
+
+  }  /* if(ncurRHS==0) */
+    
+  double eps_sq_used,restart_eps_sq_used;  /* tolerance squared for the linear system */
+
+  /*increment the RHS counter*/
+  ncurRHS = ncurRHS +1; 
+
+  /* set the tolerance to be used for this right-hand side  */
+  if(ncurRHS > nrhs1) {
+    eps_sq_used = eps_sq;
+  } else {
+    eps_sq_used = eps_sq1;
+  }
+  
+  if(g_proc_id == g_stdio_proc && g_debug_level > 0) {
+    fprintf(stdout, "# [exactdefled_cg] System %d, eps_sq %e, projection type %d\n",ncurRHS,eps_sq_used, projection_type); 
+    fflush(stdout);
+  } 
+  
+  /*---------------------------------------------------------------*/
+  /* Call init-CG until this right-hand side converges             */
+  /*---------------------------------------------------------------*/
+  double wt1,wt2,wE,wI;
+  double normsq,tol_sq;
+  int flag,maxit_remain,numIts,its;
+  int info_lapack;
+
+  wE = 0.0; wI = 0.0;     /* Start accumulator timers */
+  flag = -1;    	  /* System has not converged yet */
+  maxit_remain = maxit;   /* Initialize Max and current # of iters   */
+  numIts = 0;  
+  restart_eps_sq_used=res_eps_sq;
+
+  while( flag == -1 ) {
+    
+    if(nconv > 0) {
+
+      /* ---------------------------------------------------------
+       * Perform init-CG with eigenvectors
+       * 
+       * xinit = xinit + evecs*Hinv*evec'*(b-Ax0)
+       * --------------------------------------------------------- */
+      wt1 = gettime();
+
+      /*r0=b-Ax0*/
+      f(ax,x); /*ax = A*x */
+      diff(r,b,ax,N);  /* r=b-A*x */
+
+      if( projection_type == 0) {
+
+        /* x = x + evecs*inv(H)*evecs'*r */
+        for(int i=0; i < nconv; i++) {
+           assign_complex_to_spinor(tmps1,&evecs[i*12*N],12*N);
+           initwork[i]= scalar_prod(tmps1,r,N,parallel);
+        }
+
+        /* solve the linear system H y = c */
+        tmpsize=nconv*nconv;
+        _FT(zcopy) (&tmpsize,H,&ONE,Hinv,&ONE); /* copy H into Hinv */
+        /* SUBROUTINE ZGESV( N, NRHS, A, LDA, IPIV, B, LDB, INFO ) */
+        _FT(zgesv) (&nconv,&ONE,Hinv,&nconv,IPIV,initwork,&nconv,&info_lapack);
+
+        if(info_lapack != 0) {
+           if(g_proc_id == g_stdio_proc) {
+              fprintf(stderr, "[exactdefled_cg] Error in ZGESV:, info =  %d\n",info_lapack); 
+              fflush(stderr);
+           }
+           exit(1);
+        }
+
+        /* x = x + evecs*inv(H)*evecs'*r */
+        for(i=0; i<nconv; i++) {
+          assign_complex_to_spinor(tmps1,&evecs[i*12*N],12*N);
+          assign_add_mul(x,tmps1,initwork[i],N);
+        }
+
+      } else if ( projection_type == 1 ) {
+        /* x = x + evecs*inv(H)*evecs'*r */
+
+        /* tmps2 = Q^+ r */
+        f_initial(tmps2, r);
+
+        for(int i=0; i < nconv; i++) {
+          /* tmps1 = v_i */
+          assign_complex_to_spinor(tmps1,&evecs[i*12*N],12*N);
+
+          /* initwork_i = v_i^+ Q^+ r / lambda_i^2 */
+          initwork[i]= scalar_prod(tmps1, tmps2, N, parallel) / ( H[i*nconv+i] * H[i*nconv+i] );
+        }
+
+        memset(tmps2, 0, N*sizeof(spinor) );
+        for(i=0; i<nconv; i++) {
+          assign_complex_to_spinor(tmps1, &evecs[i*12*N], 12*N);
+          assign_add_mul(tmps2, tmps1, initwork[i], N);
+        }
+
+        /* apply final operator */
+        f_final(tmps1, tmps2);
+        assign_add_mul(x, tmps1, 1., N);
+
+      }  /* end of if projection type */
+
+      /* compute elapsed time and add to accumulator */
+
+      wt2 = gettime();
+      wI = wI + wt2-wt1;
+      
+    }  /* of if nconv > 0 */
+
+
+    /* which tolerance to use */
+    if(eps_sq_used > restart_eps_sq_used) {
+       tol_sq = eps_sq_used;
+       flag   = 1; /* shouldn't restart again */
+    } else {
+       tol_sq = restart_eps_sq_used;
+    }
+
+    wt1 = gettime();
+    its = cg_her(x,b,maxit_remain,tol_sq,rel_prec,N,f); 
+          
+    wt2 = gettime();
+
+    wE = wE + wt2-wt1;
+
+    /* check convergence */
+    if(its == -1)
+    {
+       /* cg didn't converge */
+       if(g_proc_id == g_stdio_proc) {
+         fprintf(stderr, "[exactdefled_cg] CG didn't converge within the maximum number of iterations in deflated_cg. Exiting...\n"); 
+         fflush(stderr);
+         exit(1);
+         
+       }
+    } 
+    else
+    {
+       numIts += its;   
+       maxit_remain = maxit - numIts; /* remaining number of iterations */
+       restart_eps_sq_used = restart_eps_sq_used*res_eps_sq; /* prepare for the next restart */
+    }
+    
+  }
+  /* end while (flag ==-1)               */
+  
+  /* ---------- */
+  /* Reporting  */
+  /* ---------- */
+  /* compute the exact residual */
+  f(ax,x); /* ax= A*x */
+  diff(r,b,ax,N);  /* r=b-A*x */	
+  normsq=square_norm(r,N,parallel);
+  if(g_debug_level > 0 && g_proc_id == g_stdio_proc)
+  {
+    fprintf(stdout, "# [exactdefled_cg] For this rhs:\n");
+    fprintf(stdout, "# [exactdefled_cg] Total initCG Wallclock : %+e\n", wI);
+    fprintf(stdout, "# [exactdefled_cg] Total cg Wallclock : %+e\n", wE);
+    fprintf(stdout, "# [exactdefled_cg] Iterations: %-d\n", numIts); 
+    fprintf(stdout, "# [exactdefled_cg] Actual Resid of LinSys  : %+e\n",normsq);
+  }
+
+  /* apply the adjoint operator again */
+  f_initial(ax,x);
+  /* copy back to x */
+  memcpy(x,ax,N*sizeof(spinor));
+
+  /* free memory if this was your last system to solve */
+  if(ncurRHS == nrhs) {
+#if ( (defined SSE) || (defined SSE2) || (defined SSE3)) 
+    free(_ax);  free(_r);  free(_tmps1); free(_tmps2);
+#else
+    free(ax); free(r); free(tmps1); free(tmps2);
+#endif
+    free(H); free(Hinv);
+    free(initwork); free(tmpv1);
+    free(hevals); free(IPIV);
+  }  /* end of if ncurRHS == nrhs */
+
+  return(numIts);
+}  /* end of exactdeflated_cg */
diff --git a/solver/deflator.c b/solver/deflator.c
new file mode 100644
index 000000000..47b9a2c49
--- /dev/null
+++ b/solver/deflator.c
@@ -0,0 +1,582 @@
+/*****************************************************************************
+ * Copyright (C) 2014 Abdou M. Abdel-Rehim
+ *
+ * Deflating CG using eigenvectors computed using ARPACK
+ * eigenvectors used correspond to those with smallest magnitude
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ****************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<config.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+
+#ifdef TM_USE_MPI
+# include <mpi.h>
+#endif
+
+#include "global.h"
+#include "gettime.h"
+#include "linalg_eo.h"
+#include "start.h"
+#include "linalg/blas.h"
+#include "linalg/lapack.h"
+#include <io/eospinor.h>
+#include <io/params.h>
+#include <io/spinor.h>
+#include <io/utils.h>
+#include "solver_field.h"
+#include "solver/deflator.h"
+#include "operator/tm_operators_32.h"
+#include "operator/tm_operators.h"
+
+
+
+int make_exactdeflator( deflator_params_t *deflator_params ) {
+
+  /* Static variables and arrays. */
+  void *_ax,*_r,*_tmps1,*_tmps2;
+  spinor *ax,*r,*tmps1,*tmps2;
+  _Complex double *evecs,*evals,*H,*HU,*Hinv,*initwork,*tmpv1;
+  _Complex double *zheev_work;
+  double *hevals,*zheev_rwork;
+  int *IPIV; 
+  int info_arpack=0;
+  int nconv=0; /* number of converged eigenvectors as returned by arpack */
+  int exit_status = 0;
+  int i,j,tmpsize;
+  char cV='V',cN='N', cU='U';   
+  int ONE=1;
+  int zheev_lwork,zheev_info;
+  _Complex double c1, tpone=1.0,tzero=0.0;
+  double d1,d2,d3;
+  double et1,et2;  /* timing variables */
+  char evecs_filename[500];
+  FILE *evecs_fs=NULL;
+  size_t evecs_count;
+  WRITER *evecs_writer=NULL;
+  spinor *evecs_ptr0 = NULL, *evecs_ptr1 = NULL;
+  paramsPropagatorFormat *evecs_propagatorFormat = NULL;
+  void *evecs_io_buffer = NULL;
+
+  const int N = deflator_params->eoprec==0 ? VOLUME : VOLUME/2;
+
+  int parallel;        /* for parallel processing of the scalar products */
+#ifdef TM_USE_MPI
+    parallel=1;
+#else
+    parallel=0;
+#endif
+
+  /* leading dimension for spinor vectors */
+  int LDN;
+  if(N==VOLUME)
+     LDN = VOLUMEPLUSRAND;
+  else
+     LDN = VOLUMEPLUSRAND/2; 
+
+  /* (IN) number of eigenvectors to be computed by arpack*/
+  const int nev = deflator_params->nev;
+
+  /* (IN) size of the subspace used by arpack with the condition (nev+1) =< ncv*/
+  const int ncv = deflator_params->ncv;
+
+  /* (IN) tolerance for computing eigenvalues with arpack */
+  double deflator_eig_tol =   deflator_params->eig_tol;
+
+  /* (IN) maximum number of iterations to be used by arpack*/
+  int deflator_eig_maxiter =   deflator_params->eig_maxiter;
+
+  /* (IN) 0 for eigenvalues with smallest real part "SR"
+         1 for eigenvalues with largest real part "LR"
+         2 for eigenvalues with smallest absolute value "SM"
+         3 for eigenvalues with largest absolute value "LM"
+         4 for eigenvalues with smallest imaginary part "SI"
+         5 for eigenvalues with largest imaginary part  "LI"*/
+  int kind =   deflator_params->evals_kind;
+
+  /* (IN) 0 don't compute the eiegnvalues and their residuals of the original system 
+         1 compute the eigenvalues and the residuals for the original system (the orthonormal basis
+           still be used in deflation and they are not overwritten).*/
+  int comp_evecs =   deflator_params->comp_evecs;
+
+  /* (IN) 0 no polynomial acceleration; 1 use polynomial acceleration*/
+  int acc =   deflator_params->use_acc;
+
+  /* (IN) degree of the Chebyshev polynomial (irrelevant if acc=0)*/
+  int cheb_k = deflator_params->cheb_k;
+
+  /* (IN) lower end of the interval where the acceleration will be used (irrelevant if acc=0)*/
+  double emin = deflator_params->op_evmin;
+
+  /* (IN) upper end of the interval where the acceleration will be used (irrelevant if acc=0)*/
+  double emax = deflator_params->op_evmax;
+
+  /* (IN) file name to be used for printing out debugging information from arpack*/
+  char *deflator_logfile = deflator_params->logfile;
+
+  /* (IN) read eigenvectors in Schur basis from file */
+  int  deflator_read_ev = deflator_params->read_ev;
+
+  /* (IN) write eigenvectors in Schur basis to file */
+  int  deflator_write_ev = deflator_params->write_ev;
+
+  /* (IN) file name to be used for reading and writing evecs from and to disc */
+  char *deflator_evecs_filename = deflator_params->evecs_filename;
+
+   /* (IN) precision used for writing eigenvectors */
+  int deflator_evecs_writeprec = deflator_params->evecs_writeprec;
+
+  /* (IN) file format for evecs used by arpack */
+  char *deflator_evecs_fileformat = deflator_params->evecs_fileformat; 
+
+  /* (IN) f(s,r) computes s=A*r, i.e. matrix-vector multiply in double precision */
+  matrix_mult f = deflator_params->f;
+
+  /* (IN) f32(s,r) computes s=A*r, i.e. matrix-vector multiply in single precision */
+  matrix_mult32 f32 = deflator_params->f32;
+
+  /* set nconv to 0 to signify, that init has been called */
+  deflator_params->nconv = 0;
+
+  if(g_cart_id == 1) {
+    fprintf(stdout, "# [make_exactdeflator] eo prec = %d\n", deflator_params->eoprec);
+    fprintf(stdout, "# [make_exactdeflator] N =  %d\n", N);
+    fprintf(stdout, "# [make_exactdeflator] nev = %d\n", nev);
+    fprintf(stdout, "# [make_exactdeflator] ncv = %d\n", ncv);
+    fprintf(stdout, "# [make_exactdeflator] evals kind = %d\n", kind);
+    fprintf(stdout, "# [make_exactdeflator] use acc  = %d\n", acc);
+    fprintf(stdout, "# [make_exactdeflator] Chebyshev k  = %d\n", cheb_k);
+    fprintf(stdout, "# [make_exactdeflator] emin / emax = %e / %e\n", emin, emax);
+    fprintf(stdout, "# [make_exactdeflator] comp evecs = %d\n", comp_evecs);
+    fprintf(stdout, "# [make_exactdeflator] deflator_eig_tol = %e\n", deflator_eig_tol);
+    fprintf(stdout, "# [make_exactdeflator] deflator_eig_maxiter = %d\n", deflator_eig_maxiter);
+    fprintf(stdout, "# [make_exactdeflator] deflator_write_ev = %d\n", deflator_write_ev);
+    fprintf(stdout, "# [make_exactdeflator] deflator_read_ev = %d\n", deflator_read_ev);
+    fprintf(stdout, "# [make_exactdeflator] f == Qtm_pm_psi = %d\n", (f==&Qtm_pm_psi));
+    fprintf(stdout, "# [make_exactdeflator] f32 == Qtm_pm_psi = %d\n", (f32==&Qtm_pm_psi_32));
+  }
+
+  /*-------------------------------------------------------------
+    if this is the first right hand side, allocate memory, 
+    call arpack, and compute resiudals of eigenvectors if needed
+    -------------------------------------------------------------*/ 
+#if (defined SSE || defined SSE2 || defined SSE3)
+  _ax = malloc((LDN+ALIGN_BASE)*sizeof(spinor));
+  if(_ax==NULL) {
+    if(g_proc_id == g_stdio_proc) fprintf(stderr,"[make_exactdeflator] insufficient memory for _ax inside deflator.\n");
+    exit(1);
+  } else {
+    ax  = (spinor *) ( ((unsigned long int)(_ax)+ALIGN_BASE)&~ALIGN_BASE);
+  }
+
+  _r = malloc((LDN+ALIGN_BASE)*sizeof(spinor));
+  if(_r==NULL) {
+    if(g_proc_id == g_stdio_proc) fprintf(stderr,"[make_exactdeflator] insufficient memory for _r inside deflator.\n");
+    exit(1);
+  } else {
+    r  = (spinor *) ( ((unsigned long int)(_r)+ALIGN_BASE)&~ALIGN_BASE);
+  }
+
+  _tmps1 = malloc((LDN+ALIGN_BASE)*sizeof(spinor));
+  if(_tmps1==NULL) {
+    if(g_proc_id == g_stdio_proc) fprintf(stderr,"[make_exactdeflator] insufficient memory for _tmps1 inside deflator.\n");
+    exit(1);
+  } else {
+    tmps1  = (spinor *) ( ((unsigned long int)(_tmps1)+ALIGN_BASE)&~ALIGN_BASE);
+  }
+
+  _tmps2 = malloc((LDN+ALIGN_BASE)*sizeof(spinor));
+  if(_tmps2==NULL) {
+    if(g_proc_id == g_stdio_proc) fprintf(stderr,"[make_exactdeflator] insufficient memory for _tmps2 inside deflator.\n");
+    exit(1);
+  } else {
+    tmps2  = (spinor *) ( ((unsigned long int)(_tmps2)+ALIGN_BASE)&~ALIGN_BASE);
+  }
+
+#else
+  ax = (spinor *) malloc(LDN*sizeof(spinor));
+  r  = (spinor *) malloc(LDN*sizeof(spinor));
+  tmps1 = (spinor *) malloc(LDN*sizeof(spinor));
+  tmps2 = (spinor *) malloc(LDN*sizeof(spinor));
+    
+  if( (ax == NULL)  || (r==NULL) || (tmps1==NULL) || (tmps2==NULL) ) {
+    if(g_proc_id == g_stdio_proc) fprintf(stderr,"[make_exactdeflator] insufficient memory for ax,r,tmps1,tmps2 inside deflator_cg.\n");
+    exit(1);
+  }
+#endif
+
+  deflator_params->evecs = malloc(ncv*12*N*sizeof(_Complex double)); /* note: allocation without RAND */
+  deflator_params->evals = malloc(ncv*sizeof(_Complex double)); 
+  deflator_params->prec  = 64;
+  tmpv1 = (_Complex double *) malloc(12*N*sizeof(_Complex double));
+
+  evecs = (_Complex double *)deflator_params->evecs;
+  evals = (_Complex double *)deflator_params->evals;
+
+  if((evecs == NULL)  || (evals==NULL) || (tmpv1==NULL)) {
+    if(g_proc_id == g_stdio_proc) fprintf(stderr,"[make_exactdeflator] insufficient memory for evecs and evals inside deflator_cg.\n");
+    exit(1);
+  }
+
+  if ( deflator_read_ev == 1) {
+
+    if (strcmp(deflator_evecs_fileformat, "partfile") == 0) {
+      /* set evec filenmae */
+      sprintf(evecs_filename, "%s.%.5d.pt%.2dpx%.2dpy%.2dpz%.2d", deflator_evecs_filename, nev, g_proc_coords[0], g_proc_coords[1], g_proc_coords[2], g_proc_coords[3]);
+      evecs_fs = fopen(evecs_filename, "r");
+      if (evecs_fs == NULL) {
+        fprintf(stderr, "[make_exactdeflator] (%.4d) Error, could not open file %s for reading\n", g_cart_id, evecs_filename);
+        return(-2);
+      } 
+      if(g_proc_id == g_stdio_proc) fprintf(stdout, "# [make_exactdeflator] reading eigenvectors from file %s\n", evecs_filename);
+
+      if(deflator_evecs_writeprec == 64) {
+
+        evecs_io_buffer = (void*)evecs;
+  
+        et1=gettime();
+        evecs_count = fread( evecs_io_buffer, sizeof(_Complex double), (size_t)nev*12*N, evecs_fs);
+        et2=gettime();
+       
+      } else {
+        evecs_io_buffer = malloc(sizeof(_Complex double) * (size_t)nev*12*N );
+        if( evecs_io_buffer == NULL) {
+          fprintf(stderr, "[make_exactdeflator] (%.4d) Error, could not allocate memory for evecs_io_buffer\n", g_cart_id);
+          return(-42);
+        }
+  
+        et1=gettime();
+        evecs_count = fread( evecs_io_buffer, sizeof(_Complex double)/2, (size_t)nev*12*N, evecs_fs);
+        et2=gettime();
+
+        single2double(evecs, evecs_io_buffer, nev*24*N);
+
+        free( evecs_io_buffer );
+        evecs_io_buffer = NULL;
+      }
+       
+      if( evecs_count != ((size_t)nev*12*N) ) {
+        fprintf(stderr, "[make_exactdeflator] (%.4d) Error, could not proper amount of data from file %s\n", g_cart_id, evecs_filename);
+        return(-3);
+      }
+      fclose(evecs_fs);
+      evecs_fs = NULL;
+      if(g_proc_id == g_stdio_proc) {
+        fprintf(stdout,"# [make_exactdeflator] ARPACK time for reading %d eigenvectors: %+e seconds\n", nev, et2-et1);
+      }
+    } else if(strcmp(deflator_evecs_fileformat, "single") == 0) {
+
+      if(N==VOLUME) {
+        for(i=0; i<nev; i++) {
+          sprintf(evecs_filename, "%s.ev%.5d", deflator_evecs_filename, i);
+          if(g_proc_id == g_stdio_proc) fprintf(stdout, "# [make_exactdeflator] reading eigenvector from file %s\n", evecs_filename);
+          evecs_ptr0 = (spinor*)&(evecs[i*12*N]);
+          evecs_ptr1 = NULL;
+          read_spinor(evecs_ptr0,  evecs_ptr1, evecs_filename, 0);
+        } /* end of loop on eigenvectors */
+      } else if(N==VOLUME/2) {
+        for(i=0; i<nev/2; i++) {
+          sprintf(evecs_filename, "%s.ev%.5d", deflator_evecs_filename, 2*i);
+          if(g_proc_id == g_stdio_proc) fprintf(stdout, "# [make_exactdeflator] reading eigenvector pair from file %s\n", evecs_filename);
+          evecs_ptr0 = (spinor*)&(evecs[(2*i  )*12*N]);
+          evecs_ptr1 = (spinor*)&(evecs[(2*i+1)*12*N]);
+          read_spinor(evecs_ptr0,  evecs_ptr1, evecs_filename, 0);
+        } /* end of loop on eigenvectors */
+      }
+    }   /* of if deflator_evecs_fileformat */
+
+    /* set info_arpack pro forma to SUCCESS */
+    nconv = nev;
+    deflator_params->nconv = nev;
+    info_arpack = 0;
+  } else {
+    et1=gettime();
+    evals_arpack(N,nev,ncv,kind,acc,cheb_k,emin,emax,evals,evecs,deflator_eig_tol,deflator_eig_maxiter,f,&info_arpack,&nconv,deflator_logfile);
+    et2=gettime();
+
+    if(info_arpack != 0) { /* arpack didn't converge */
+      if(g_proc_id == g_stdio_proc) fprintf(stderr,"[make_exactdeflator] Error: ARPACK didn't converge; exiting\n");
+      return(-1);
+    }
+   
+    if(g_proc_id == g_stdio_proc) {
+       fprintf(stdout,"# [make_exactdeflator] ARPACK has computed %d eigenvectors\n",nconv);
+       fprintf(stdout,"# [make_exactdeflator] ARPACK time: %+e\n",et2-et1);
+    }
+
+    if ( deflator_write_ev == 1) {
+
+      if(strcmp(deflator_evecs_fileformat, "partfile") == 0 ) {
+
+        /* set evec filenmae */
+        sprintf(evecs_filename, "%s.%.5d.pt%.2dpx%.2dpy%.2dpz%.2d", deflator_evecs_filename, nconv, g_proc_coords[0], g_proc_coords[1], g_proc_coords[2], g_proc_coords[3]);
+
+        evecs_fs = fopen(evecs_filename, "w");
+        if (evecs_fs == NULL) {
+          fprintf(stderr, "[make_exactdeflator] (%.4d) Error, could not open file %s for writing\n", g_cart_id, evecs_filename);
+          return(-4);
+        }
+      
+        if(deflator_evecs_writeprec == 64) {
+
+          evecs_io_buffer = (void*)evecs;
+ 
+          et1=gettime();
+          evecs_count = fwrite( evecs_io_buffer, sizeof(_Complex double), (size_t)nconv*12*N, evecs_fs);
+          et2=gettime();
+
+        } else {
+          evecs_io_buffer = malloc(sizeof(_Complex double) * (size_t)nconv*12*N );
+          if( evecs_io_buffer == NULL) {
+            fprintf(stderr, "[make_exactdeflator] (%.4d) Error, could not allocate memory for evecs_io_buffer\n", g_cart_id);
+            return(-41);
+          }
+          double2single(evecs_io_buffer, evecs, nconv*24*N);
+
+          et1=gettime();
+          evecs_count = fwrite( evecs_io_buffer, sizeof(_Complex double)/2, (size_t)nconv*12*N, evecs_fs);
+          et2=gettime();
+          free(evecs_io_buffer);
+          evecs_io_buffer = NULL;
+        }
+
+        if( evecs_count != ((size_t)nconv*12*N) ) {
+          fprintf(stderr, "[make_exactdeflator] (%.4d) Error, could not write proper amount of data to file %s\n", g_cart_id, evecs_filename);
+          return(-5);
+        }
+        fclose(evecs_fs);
+        evecs_fs = NULL;
+
+        if(g_proc_id == g_stdio_proc) {
+          fprintf(stdout,"[make_exactdeflator] (%.4d) ARPACK time for writing %d eigenvectors: %+e seconds\n", g_cart_id, nconv, et2-et1);
+        }
+
+      } else if (strcmp(deflator_evecs_fileformat, "single") == 0) {
+
+        if(N==VOLUME) {
+          for(i=0; i<nconv; i++) {
+            sprintf(evecs_filename, "%s.ev%.5d", deflator_evecs_filename, i);
+            construct_writer(&evecs_writer, evecs_filename, 0);
+            evecs_propagatorFormat = construct_paramsPropagatorFormat(deflator_evecs_writeprec, 1);
+            write_propagator_format(evecs_writer, evecs_propagatorFormat);
+            free(evecs_propagatorFormat);
+            evecs_ptr0 = (spinor*)&(evecs[i*12*N]);
+            evecs_ptr1 = NULL;
+            write_spinor(evecs_writer, &evecs_ptr0, &evecs_ptr1, 1, deflator_evecs_writeprec);
+            destruct_writer(evecs_writer);
+            evecs_writer=NULL;
+          } /* end of loop on converged eigenvectors */
+        } else if(N==VOLUME/2) {
+          for(i=0; i<nconv/2; i++) {
+            sprintf(evecs_filename, "%s.ev%.5d", deflator_evecs_filename, 2*i);
+            construct_writer(&evecs_writer, evecs_filename, 0);
+            evecs_propagatorFormat = construct_paramsPropagatorFormat(deflator_evecs_writeprec, 1);
+            write_propagator_format(evecs_writer, evecs_propagatorFormat);
+            free(evecs_propagatorFormat);
+            evecs_ptr0 = (spinor*)&(evecs[(2*i  )*12*N]);
+            evecs_ptr1 = (spinor*)&(evecs[(2*i+1)*12*N]);
+            write_spinor(evecs_writer, &evecs_ptr0, &evecs_ptr1,1, deflator_evecs_writeprec);
+            destruct_writer(evecs_writer);
+            evecs_writer=NULL;
+          }  /* end of loop on converged eigenvectors */
+        }    /* end of if N == VOLUME */
+
+      }      /* of if deflator_evecs_fileformat */
+
+    }        /* end of if deflator_write_ev == 1 */
+
+  }          /* end of if deflator_read_ev == 1 */
+
+  deflator_params->nconv = nconv;
+
+  /* compute Ritz values and Ritz vectors if needed */
+  if( (nconv>0) && (comp_evecs !=0)) {
+
+    H           = (_Complex double *)malloc(nconv*nconv*sizeof(_Complex double)); 
+    HU          = (_Complex double *) malloc(nconv*nconv*sizeof(_Complex double)); 
+    Hinv        = (_Complex double *) malloc(nconv*nconv*sizeof(_Complex double)); 
+    initwork    = (_Complex double *) malloc(nconv*sizeof(_Complex double)); 
+    IPIV        = (int *) malloc(nconv*sizeof(int));
+    zheev_lwork = 3*nconv;
+    zheev_work  = (_Complex double *) malloc(zheev_lwork*sizeof(_Complex double));
+    zheev_rwork = (double *) malloc(3*nconv*sizeof(double));
+    hevals      = (double *) malloc(nconv*sizeof(double));
+
+    if((H==NULL) || (HU==NULL) || (Hinv==NULL) || (initwork==NULL) || (IPIV==NULL) || (zheev_lwork==NULL) || (zheev_rwork==NULL) || (hevals==NULL)) {
+       if(g_proc_id == g_stdio_proc) {
+          fprintf(stderr,"[make_exactdeflator] insufficient memory for H, HU, Hinv, initwork, IPIV, zheev_lwork, zheev_rwork, hevals inside deflator_cg.\n");
+       }
+       exit(1);
+    }
+    et1=gettime();
+    /* compute the elements of the hermitian matrix H 
+       leading dimension is nconv and active dimension is nconv */
+  
+    for(i=0; i<nconv; i++) {
+      assign_complex_to_spinor(r,&evecs[i*12*N],12*N);
+      f(ax,r);
+      c1 = scalar_prod(r,ax,N,parallel);
+      H[i+nconv*i] = creal(c1);  /* diagonal should be real */
+      evals[i] = creal(c1)+ 0.*I;
+      for(j=i+1; j<nconv; j++) {
+        assign_complex_to_spinor(r,&evecs[j*12*N],12*N);
+        c1 = scalar_prod(r,ax,N,parallel);
+        H[j+nconv*i] = c1;
+        H[i+nconv*j] = conj(c1); /* enforce hermiticity */
+      }
+    }  /* end of loop on nconv */
+
+    et2=gettime();
+    if(g_proc_id == g_stdio_proc) {
+      fprintf(stdout,"# [make_exactdeflator] time to compute H: %+e\n",et2-et1);
+    }
+
+    et1=gettime();
+    /* copy H into HU */
+    tmpsize=nconv*nconv;
+    _FT(zcopy)(&tmpsize,H,&ONE,HU,&ONE);
+
+    /* compute eigenvalues and eigenvectors of HU*/
+    /* SUBROUTINE ZHEEV( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, RWORK,INFO ) */
+    _FT(zheev)(&cV,&cU,&nconv,HU,&nconv,hevals,zheev_work,&zheev_lwork,zheev_rwork,&zheev_info,1,1);
+
+    if(zheev_info != 0) {
+      if(g_proc_id == g_stdio_proc) {
+        fprintf(stderr,"[make_exactdeflator] Error in ZHEEV:, info =  %d\n",zheev_info); 
+        fflush(stderr);
+      }
+      exit(1);
+    }
+
+    /* If you want to replace the Schur (orthonormal) basis by eigen basis
+       use something like this. It is better to use the schur basis because
+       they are better conditioned. Use this part only to get the eigenvalues
+       and their resduals for the operator (D^\daggerD)
+       esize=(ncv-nconv)*12*N;
+       Zrestart_X(evecs,12*N,HU,12*N,nconv,nconv,&evecs[nconv*N],esize); */
+
+    /* compute residuals and print out results */
+
+    if(g_proc_id == g_stdio_proc) {
+      fprintf(stdout,"# [make_exactdeflator] Ritz values of A and their residulas (||A*x-lambda*x||/||x||\n"); 
+      fprintf(stdout,"# [make_exactdeflator] =============================================================\n");
+      fflush(stdout);
+    }
+
+    for(i=0; i<nconv; i++) {
+      tmpsize=12*N;
+      _FT(zgemv)(&cN,&tmpsize,&nconv,&tpone,evecs,&tmpsize, &HU[i*nconv],&ONE,&tzero,tmpv1,&ONE,1);
+      assign_complex_to_spinor(r,tmpv1,12*N);
+      d1=square_norm(r,N,parallel);
+      f(ax,r);
+      mul_r(tmps1,hevals[i],r,N);
+      diff(tmps2,ax,tmps1,N);
+      d2= square_norm(tmps2,N,parallel);
+      d3= sqrt(d2/d1);
+ 
+      if(g_proc_id == g_stdio_proc) {
+        fprintf(stdout,"# [make_exactdeflator] Eval %6d %25.16e %25.16e %25.16e\n", i, hevals[i], d3, creal(evals[i]) );
+        fflush(stdout);
+      }
+    } 
+
+    free(H);
+    free(HU);
+    free(Hinv);
+    free(initwork);
+    free(IPIV);
+    free(zheev_work);
+    free(hevals);
+    free(zheev_rwork);
+  } else {
+    memset(evals, 0, nconv*sizeof(double));
+  }  /* if( (nconv_arpack>0) && (comp_evecs !=0)) */
+
+  et2=gettime();
+  if(g_proc_id == g_stdio_proc) {
+    fprintf(stdout,"# [make_exactdeflator] time to compute eigenvectors: %+e\n",et2-et1);
+  }
+
+#if ( (defined SSE) || (defined SSE2) || (defined SSE3)) 
+  free(_ax);
+  free(_r);
+  free(_tmps1);
+  free(_tmps2);
+#else
+  free(ax);
+  free(r);
+  free(tmps1);
+  free(tmps2);
+#endif
+  free(tmpv1);
+
+  return(0);
+}  /* end of make_exactdeflator */
+
+int fini_exactdeflator( deflator_params_t *deflator_params ) {
+
+  strcpy(deflator_params->type_name, "NA");
+  deflator_params->type = 0;
+
+  deflator_params->eoprec = -1;
+
+  deflator_params->f = (matrix_mult)NULL;
+  deflator_params->f32 = (matrix_mult32)NULL;
+
+  deflator_params->f_final = (matrix_mult)NULL;
+  deflator_params->f_initial = (matrix_mult)NULL;
+
+  deflator_params->projection_type = -1;
+
+  if( deflator_params->evecs != NULL ) free( deflator_params->evecs);
+  deflator_params->evecs = NULL;
+ 
+  if( deflator_params->evals != NULL ) free( deflator_params->evals );
+  deflator_params->evals = NULL;
+  deflator_params->prec = 0;
+
+  deflator_params->nconv = -1;
+  deflator_params->nev   =  0;
+  deflator_params->ncv   =  0;
+  deflator_params->evals_kind = 0;
+  deflator_params->comp_evecs = 0;
+  deflator_params->eig_tol = 0.;
+  deflator_params->eig_maxiter = 0;
+  strcpy(deflator_params->logfile, "NA");
+
+  deflator_params->use_acc = 0;
+  deflator_params->cheb_k = 0;
+  deflator_params->op_evmin = 0.;
+  deflator_params->op_evmax = 0.;
+   
+  deflator_params->write_ev = 0;
+  deflator_params->read_ev  = 0;
+  strcpy(deflator_params->evecs_filename, "NA");
+  strcpy(deflator_params->evecs_fileformat,"NA");
+
+  deflator_params->evecs_writeprec = 0;
+
+  deflator_params->init = NULL;
+  deflator_params->fini = NULL;
+  return(0);
+}  /* end of fini_exactdeflator */
diff --git a/solver/deflator.h b/solver/deflator.h
new file mode 100644
index 000000000..52c07cf6d
--- /dev/null
+++ b/solver/deflator.h
@@ -0,0 +1,103 @@
+/*****************************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * Deflating CG using eigenvectors computed using ARPACK
+ *
+ * Author: A.M. Abdel-Rehim (amabdelrehim@gmail.com)
+ *         November 2014
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ****************************************************************************/
+/* A sample input is given in the sample-input folder */
+
+#ifndef _DEFLATOR_H
+#define _DEFLATOR_H
+
+#include "su3.h"
+#include "solver/matrix_mult_typedef.h"
+#include "solver/eigenvalues_arpack.h"
+
+
+typedef struct deflator_params_t {
+
+  char type_name[100];
+  int type;
+
+  int eoprec;
+
+  matrix_mult f;    /* f(s,r)   computes s=A*r, i.e. matrix-vector multiply in double precision */
+  matrix_mult32 f32;  /* f32(s,r) computes s=A*r, i.e. matrix-vector multiply in single precision */
+
+  matrix_mult f_final;    /* (IN) final operator application during projection of type 1 */
+  matrix_mult f_initial;  /* (IN) initial operator application during projection of type 1 */
+
+  int projection_type;
+
+  void *evecs;
+  void *evals;
+  int prec;
+
+  /**********************************
+   * arpack parameters
+   **********************************/
+
+   int    nconv;         /* number of converged eigenvectors */
+   int    nev;           /* number of eigenvectors to be computed by arpack*/
+   int    ncv;           /* Size of the subspace used by arpack with the condition (nev+1) =< ncv */
+   int    evals_kind;    /* type of eigenvalues to be computed 
+                                     0 eigenvalues of smallest real part
+                                     1 eigenvalues of largest real part 
+                                     2 eigenvalues of smallest absolute value
+                                     3 eigenvalues of largest absolute value*/
+   int    comp_evecs;    /* 0 don't compute the resiudals of the eigenvalues
+                                     1 compute the residulas of the eigenvalues*/
+   double eig_tol;       /* tolerance for computing eigenvalues with arpack */
+   int    eig_maxiter;   /* maximum number of iterations to be used by arpack*/
+   char   logfile[500];  /* file name for the logfile used by arpack*/
+
+  /*************************************
+   * Chebychev polynomial acceleraton
+   * paprameters
+   *************************************/
+   int use_acc;      /* type of acceleration to be used:  */
+                     /*   0 no acceleration, 1 compute eiegnvectors of the acceleration polynomial,   */
+   int cheb_k;       /* order of the polynomial used is k+1 and the lowest value is k=-1 which correspond to T_0 */
+   double op_evmin;  /* lowest boundary of the interval for the polynomial acceleration */
+   double op_evmax;  /* highest boundary for the interval for the polynomial acceleration */
+   
+  /*************************************
+   * parameters to control of I/O for
+   * eigenvectors
+   *************************************/
+   int write_ev;
+   int read_ev;
+   char evecs_filename[500];    /* file name for the evecs used by arpack*/
+   char evecs_fileformat[200];  /* file format for evecs used by arpack */
+
+   int evecs_writeprec;
+
+  /*************************************
+   * init function
+   *************************************/
+  int (*init)(struct deflator_params_t*);
+  int (*fini)(struct deflator_params_t*);
+
+} deflator_params_t;
+
+int make_exactdeflator( deflator_params_t *deflator_params);
+
+#endif

From c6c0ea96ba5d4fa8556cbcccd7bc19f599132d3e Mon Sep 17 00:00:00 2001
From: Marcus Petschlies <marcus.petschlies@web.de>
Date: Mon, 26 Dec 2016 11:04:32 +0100
Subject: [PATCH 5/8] added deflator_params to operator type; added parsing of
 exdefl parameters to read_input.l; additional solver type EXACTDEFLATION in
 solver/solver_types.h

---
 operator.h            |   3 ++
 read_input.l          | 118 +++++++++++++++++++++++++++++++++++++++++-
 solver/solver_types.h |   3 +-
 3 files changed, 122 insertions(+), 2 deletions(-)

diff --git a/operator.h b/operator.h
index 411476528..f5a3c3eb3 100644
--- a/operator.h
+++ b/operator.h
@@ -89,6 +89,9 @@ typedef struct {
   /*solver parameters struct*/
   solver_params_t solver_params;
 
+  /* deflator parameters struct */
+  deflator_params_t deflator_params;
+
   /* multiple masses for CGMMS */
   double extra_masses[MAX_EXTRA_MASSES];
   int no_extra_masses;
diff --git a/read_input.l b/read_input.l
index e312c6e68..febe30445 100644
--- a/read_input.l
+++ b/read_input.l
@@ -285,6 +285,9 @@ static inline void rmQuotes(char *str){
 %x MULTIGRID
 %x INITMULTIGRID
 
+%x EXACTDEFLATION
+%x INITEXACTDEFLATION
+
 %x INITOPERATOR
 %x TMOP
 %x DBTMOP
@@ -630,6 +633,104 @@ static inline void rmQuotes(char *str){
   }
 }
 
+
+<EXACTDEFLATION>{
+  {SPC}*Nev{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    (optr->deflator_params).nev = a;
+    if(myverbose) printf("  nev set to %d line %d operator %d\n", a, line_of_file, current_operator);
+  }
+  {SPC}*Ncv{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    (optr->deflator_params).ncv = a;
+    if(myverbose) printf("  ncv set to %d line %d operator %d\n", a, line_of_file, current_operator);
+  }
+  {SPC}*EvalsKind{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    (optr->deflator_params).evals_kind  = a;
+    if(myverbose) printf("  evals_kind set to %d line %d operator %d\n", a, line_of_file, current_operator);
+  }
+  {SPC}*EvecsMaxiter{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    (optr->deflator_params).eig_maxiter = a;
+    if(myverbose) printf("  eig_maxiter set to %d line %d operator %d\n", a, line_of_file, current_operator);
+  }
+  {SPC}*EvecsTolerance{EQL}{FLT} {
+    sscanf(yytext, " %[a-zA-Z1] = %lf", name, &c);
+    (optr->deflator_params).eig_tol = c;
+    if(myverbose) printf("  eig_tol set to %lf line %d operator %d\n", c, line_of_file, current_operator);
+  }
+  {SPC}*ChebPolyDegree{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    (optr->deflator_params).cheb_k = a;
+    if(myverbose) printf("  cheb_k set to %d line %d operator %d\n", a, line_of_file, current_operator);
+  }
+  {SPC}*AccMode{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    (optr->deflator_params).use_acc = a;
+    if(myverbose) printf("   polynomial acceleration mode set to  %d line %d operator %d\n", a, line_of_file, current_operator);
+  }
+  {SPC}*CompEvecs{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    (optr->deflator_params).comp_evecs = a;
+    if(myverbose) printf("   compute evecs option set to   %d line %d operator %d\n", a, line_of_file, current_operator);
+  }
+  {SPC}*LogFile{EQL}{FILENAME}+ {
+    sscanf(yytext, " %[a-zA-Z] = %s", name, fname);
+    strncpy( (optr->deflator_params).logfile, fname, 200);
+    if(myverbose) printf("   logfile name is  set to %s line %d operator %d\n", fname, line_of_file, current_operator);
+  }
+  {SPC}*EVminHPD{EQL}{FLT} {
+    sscanf(yytext, " %[a-zA-Z1] = %lf", name, &c);
+    (optr->deflator_params).op_evmin = c;
+    if(myverbose) printf("  op_evmin set to %lf line %d operator %d\n", c, line_of_file, current_operator);
+  }
+  {SPC}*EVmaxHPD{EQL}{FLT} {
+    sscanf(yytext, " %[a-zA-Z1] = %lf", name, &c);
+    (optr->deflator_params).op_evmax = c;
+    if(myverbose) printf("  op_evmax set to %lf line %d operator %d\n", c, line_of_file, current_operator);
+  }
+  {SPC}*EvecsRead{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    (optr->deflator_params).read_ev        = a;
+    if(myverbose) printf("  read_ev set to %d line %d operator %d\n", a, line_of_file, current_operator);
+  }
+  {SPC}*EvecsWrite{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    (optr->deflator_params).write_ev = a;
+    if(myverbose) printf("  write_ev set to %d line %d operator %d\n", a, line_of_file, current_operator);
+  }
+  {SPC}*ProjectionType{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    (optr->deflator_params).projection_type = a;
+    if(myverbose) printf("  projection_type set to %d line %d operator %d\n", a, line_of_file, current_operator);
+  }
+  {SPC}*EvecsFile{EQL}{FILENAME}+ {
+    sscanf(yytext, " %[a-zA-Z] = %s", name, fname);
+    strncpy( (optr->deflator_params).evecs_filename, fname, 200);
+    if(myverbose) printf("   evecs file name is set to %s line %d operator %d\n", fname, line_of_file, current_operator);
+  }
+  {SPC}*EvecsWriteprec{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    (optr->deflator_params).evecs_writeprec = a;
+    if(myverbose) printf("   evecs precision for writing set to %d line %d operator %d\n", a, line_of_file, current_operator);
+  }
+  {SPC}*EvecsFileformat{EQL}{FILENAME}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, fname);
+    strncpy( (optr->deflator_params).evecs_fileformat, fname, 100);
+    if(myverbose) printf("   evecs precision for writing set to %d line %d operator %d\n", a, line_of_file, current_operator);
+  }
+  {SPC}*EvecsPrecision{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    (optr->deflator_params).prec = 64;
+    if(myverbose) printf("  precision for eigenvectors set to %d line %d operator %d\n", a, line_of_file, current_operator);
+  }
+  {SPC}*EndExactDeflation{SPC}* {
+  if(myverbose) printf("EXACTDEFLATION parsed in line %d\n\n", line_of_file);
+  BEGIN(name_caller);
+  }
+}
+
 <INITMULTIGRID>AMG{SPC}* {
  if(myverbose) printf("Initialising DDalphaAMG line %d\n", line_of_file); 
  BEGIN(MULTIGRID);
@@ -805,6 +906,10 @@ static inline void rmQuotes(char *str){
     optr->rel_prec = 0;
     if(myverbose) printf("  SolverRelativePrecision set to NO line %d operator %d\n",  line_of_file, current_operator);
   }
+  {SPC}*BeginExactDeflation {
+    printf("[read_input] starting exact deflator init\n");
+    BEGIN(EXACTDEFLATION);
+  }
   {SPC}*mcgdelta{EQL}{FLT} {
     sscanf(yytext, " %[a-zA-Z1] = %lf", name, &c);
     (optr->solver_params).mcg_delta = c;
@@ -1175,6 +1280,12 @@ static inline void rmQuotes(char *str){
     if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
   }
+  exactdeflatedcg {
+    optr->solver = EXACTDEFLATEDCG;
+    if(myverbose) printf("  Solver set to EXACTDEFLATEDCG line %d operator %d\n", line_of_file, current_operator);
+    BEGIN(name_caller);
+  }
+
 }
 
 
@@ -1206,6 +1317,11 @@ static inline void rmQuotes(char *str){
     if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
   }
+  exactdeflatedcg {
+    optr->solver = EXACTDEFLATEDCG;
+    if(myverbose) printf("  Solver set to EXACTDEFLATEDCG line %d operator %d\n", line_of_file, current_operator);
+    BEGIN(name_caller);
+  }
 }
 
 <OVSOLVER>{
@@ -2474,7 +2590,7 @@ static inline void rmQuotes(char *str){
   BEGIN(comment_caller);
 }
 
-<INITMONOMIAL,DETMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,NDPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,CLPOLYMONOMIAL,GAUGEMONOMIAL,INTEGRATOR,INITINTEGRATOR,INITMEASUREMENT,PIONNORMMEAS,ONLINEMEAS,ORIENTEDPLAQUETTESMEAS,GRADIENTFLOWMEAS,INITOPERATOR,TMOP,DBTMOP,OVERLAPOP,WILSONOP,CLOVEROP,DBCLOVEROP,POLYMONOMIAL,PLOOP,INITGPU,GPU,RATMONOMIAL,RATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL,INITDEFLATION,DEFLATION,INITMULTIGRID,MULTIGRID>{SPC}*\n   {
+<INITMONOMIAL,DETMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,NDPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,CLPOLYMONOMIAL,GAUGEMONOMIAL,INTEGRATOR,INITINTEGRATOR,INITMEASUREMENT,PIONNORMMEAS,ONLINEMEAS,ORIENTEDPLAQUETTESMEAS,GRADIENTFLOWMEAS,INITOPERATOR,TMOP,DBTMOP,OVERLAPOP,WILSONOP,CLOVEROP,DBCLOVEROP,POLYMONOMIAL,PLOOP,INITGPU,GPU,RATMONOMIAL,RATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL,INITDEFLATION,DEFLATION,INITMULTIGRID,MULTIGRID,INITEXACTDEFLATION,EXACTDEFLATION>{SPC}*\n   {
   line_of_file++;
 }
 <*>{SPC}*\n                       {
diff --git a/solver/solver_types.h b/solver/solver_types.h
index d16491580..2205d71d5 100644
--- a/solver/solver_types.h
+++ b/solver/solver_types.h
@@ -24,7 +24,8 @@ typedef enum SOLVER_TYPE {
  MCR,
  CR,
  BICG,
- MG
+ MG,
+ EXACTDEFLATEDCG
 } SOLVER_TYPE;
 
 #endif

From b50813c8859e56b35dd3fae9b94169743e38b254 Mon Sep 17 00:00:00 2001
From: Marcus Petschlies <marcus.petschlies@web.de>
Date: Mon, 26 Dec 2016 11:40:14 +0100
Subject: [PATCH 6/8] added deflator_params initialization in operator.c; TODO:
 add deflator_params and solver_params to argument list of invert_eo and
 invert_clover_eo

---
 operator.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)

diff --git a/operator.c b/operator.c
index d9a4e14fe..d3f7683c6 100644
--- a/operator.c
+++ b/operator.c
@@ -130,6 +130,37 @@ int add_operator(const int type) {
   optr->inverter = &op_invert;
   optr->write_prop = &op_write_prop;
 
+  /* init deflator parameters */
+  (optr->deflator_params).type            = -1;
+  (optr->deflator_params).eoprec          = _default_even_odd_flag;
+  (optr->deflator_params).f               = NULL;
+  (optr->deflator_params).f32             = NULL;
+  (optr->deflator_params).f_final         = NULL;
+  (optr->deflator_params).f_initial       = NULL;
+  (optr->deflator_params).evecs           = NULL;
+  (optr->deflator_params).evals           = NULL;
+  (optr->deflator_params).prec            = 64;
+  (optr->deflator_params).nconv           = -1;
+  (optr->deflator_params).nev             = 0;
+  (optr->deflator_params).ncv             = 0;
+  (optr->deflator_params).evals_kind      = 0;
+  (optr->deflator_params).comp_evecs      = 0;
+  (optr->deflator_params).eig_tol         = 0.;
+  (optr->deflator_params).eig_maxiter     = 0;
+  (optr->deflator_params).use_acc         = 0;
+  (optr->deflator_params).cheb_k          = 0;
+  (optr->deflator_params).op_evmin        = 0.0;
+  (optr->deflator_params).op_evmax        = 0.0;
+  (optr->deflator_params).write_ev        = 0;
+  (optr->deflator_params).read_ev         = 0;
+  (optr->deflator_params).evecs_writeprec = 64;
+  (optr->deflator_params).init            = NULL;
+  strcpy( (optr->deflator_params).type_name,       "NA");
+  strcpy((optr->deflator_params).logfile,          "arpack_log" );
+  strcpy((optr->deflator_params).evecs_filename,   "eigenvectors");
+  strcpy((optr->deflator_params).evecs_fileformat, "single");
+
+
   /* Overlap needs special treatment */
   if(optr->type == OVERLAP) {
     optr->even_odd_flag = 0;
@@ -165,12 +196,28 @@ int init_operators() {
       optr = operator_list + i;
       /* This is a hack, it should be set on an operator basis. */
       optr->rel_prec = g_relative_precision_flag;
+
+      if(optr->solver == EXACTDEFLATEDCG) {
+        (optr->deflator_params).eoprec = optr->even_odd_flag;
+        (optr->deflator_params).type   = optr->type;
+        (optr->deflator_params).init   = make_exactdeflator;
+      }
+
       if(optr->type == TMWILSON || optr->type == WILSON) {
         if(optr->c_sw > 0) {
           init_sw_fields();
         }
         optr->applyM = &M_full;
         optr->applyQ = &Q_full;
+
+        if(optr->solver == EXACTDEFLATEDCG) {
+          if(optr->type == TMWILSON ) {
+            strcpy( (optr->deflator_params).type_name, "TMWILSON" );
+          } else if( optr->type == WILSON) {
+            strcpy( (optr->deflator_params).type_name, "WILSON" );
+          }
+        }
+
         if(optr->even_odd_flag) {
           optr->applyMee    = &Mee_psi;
           optr->applyMeeInv = &Mee_inv_psi;
@@ -179,6 +226,15 @@ int init_operators() {
           optr->applyQsq = &Qtm_pm_psi;
           optr->applyMp = &Mtm_plus_psi;
           optr->applyMm = &Mtm_minus_psi;
+
+          if(optr->solver == EXACTDEFLATEDCG) {
+            /* the the operators in the deflator */
+            (optr->deflator_params).f         = &Qtm_pm_psi;
+            (optr->deflator_params).f32       = &Qtm_pm_psi_32;
+            (optr->deflator_params).f_final   = &Qtm_plus_psi;
+            (optr->deflator_params).f_initial = &Qtm_minus_psi;
+          }
+
         }
         else {
           optr->applyQp = &Q_plus_psi;
@@ -186,6 +242,14 @@ int init_operators() {
           optr->applyQsq = &Q_pm_psi;
           optr->applyMp = &D_psi;
           optr->applyMm = &M_minus_psi;
+
+          if(optr->solver == EXACTDEFLATEDCG) {
+            /* the the operators in the deflator */
+            (optr->deflator_params).f         = &Q_pm_psi;
+            (optr->deflator_params).f32       = &Q_pm_psi_32;
+            (optr->deflator_params).f_final   = &Q_plus_psi;
+            (optr->deflator_params).f_initial = &Q_minus_psi;
+          }
         }
         if(optr->solver == CGMMS) {
           if (g_cart_id == 0 && optr->even_odd_flag == 1)
@@ -211,6 +275,11 @@ int init_operators() {
         }
         optr->applyM = &Msw_full;
         optr->applyQ = &Qsw_full;
+
+        if(optr->solver == EXACTDEFLATEDCG) {
+          strcpy( (optr->deflator_params).type_name, "CLOVER" );
+        }
+
         if(optr->even_odd_flag) {
           optr->applyMee    = &Mee_sw_psi;
           optr->applyMeeInv = &Mee_sw_inv_psi;
@@ -219,6 +288,14 @@ int init_operators() {
           optr->applyQsq = &Qsw_pm_psi;
           optr->applyMp = &Msw_plus_psi;
           optr->applyMm = &Msw_minus_psi;
+
+          if(optr->solver == EXACTDEFLATEDCG) {
+            /* the the operators in the deflator */
+            (optr->deflator_params).f         = &Qsw_pm_psi;
+            (optr->deflator_params).f32       = &Qsw_pm_psi_32;
+            (optr->deflator_params).f_final   = &Qsw_plus_psi;
+            (optr->deflator_params).f_initial = &Qsw_minus_psi;
+          }
         }
         else {
           optr->applyQp = &Qsw_full_plus_psi;
@@ -226,6 +303,14 @@ int init_operators() {
           optr->applyQsq = &Qsw_full_pm_psi;
           optr->applyMp = &D_psi;
           optr->applyMm = &Msw_full_minus_psi;
+
+          if(optr->solver == EXACTDEFLATEDCG) {
+            /* the the operators in the deflator */
+            (optr->deflator_params).f         = Qsw_full_pm_psi;
+            (optr->deflator_params).f32       = NULL;
+            (optr->deflator_params).f_final   = &Qsw_full_plus_psi;
+            (optr->deflator_params).f_initial = &Qsw_full_minus_psi;
+          }
         }
         if(optr->solver == CGMMS) {
           if (g_cart_id == 0 && optr->even_odd_flag == 1)

From 443ee35c72aa306ff745244168530bd04c984e48 Mon Sep 17 00:00:00 2001
From: hch02q <hch02q@jrl02.jureca>
Date: Thu, 29 Dec 2016 18:17:29 +0100
Subject: [PATCH 7/8] added support for arpackcg, ARPACK-based exact deflation;
 calls to ARPACKCG and EXACTDEFLATEDCG must be added to invert_eo and
 invert_clover_eo

---
 include/tmLQCD.h            |   9 +
 invert_eo.h                 |   1 +
 linalg/arpack.h             |  74 ++++
 linalg/assign.c             | 162 ++++++++
 linalg/assign.h             |   6 +
 linalg/fortran.h            |   6 +
 operator.c                  |   1 +
 operator.h                  |   1 +
 read_input.l                |   2 +-
 solver/Makefile.in          |   4 +-
 solver/arpack_cg.c          | 733 ++++++++++++++++++++++++++++++++++++
 solver/arpack_cg.h          |  52 +++
 solver/deflated_cg.h        |  43 +++
 solver/eigenvalues_arpack.c | 515 +++++++++++++++++++++++++
 solver/eigenvalues_arpack.h |  74 ++++
 solver/precon.c             | 125 ++++++
 solver/precon.h             |  39 ++
 solver/solver_params.h      |  51 +++
 wrapper/lib_wrapper.c       |   5 +
 19 files changed, 1901 insertions(+), 2 deletions(-)
 create mode 100644 linalg/arpack.h
 create mode 100644 solver/arpack_cg.c
 create mode 100644 solver/arpack_cg.h
 create mode 100644 solver/deflated_cg.h
 create mode 100644 solver/eigenvalues_arpack.c
 create mode 100644 solver/eigenvalues_arpack.h
 create mode 100644 solver/precon.c
 create mode 100644 solver/precon.h

diff --git a/include/tmLQCD.h b/include/tmLQCD.h
index 7c4dcd464..038fd9299 100755
--- a/include/tmLQCD.h
+++ b/include/tmLQCD.h
@@ -46,6 +46,15 @@ extern "C"
 #endif
   } tmLQCD_mpi_params;
 
+  typedef struct {
+    char type_name[100];
+    int  eoprec;
+    void *evecs;
+    void *evals;
+    int  prec;
+    int  nev;
+  } tmLQCD_deflator_params;
+
   int tmLQCD_invert_init(int argc, char *argv[], const int verbose, const int external_id);
   int tmLQCD_read_gauge(const int nconfig);
   int tmLQCD_invert(double * const propagator, double * const source,
diff --git a/invert_eo.h b/invert_eo.h
index d64bb3c0e..bb3b87c34 100644
--- a/invert_eo.h
+++ b/invert_eo.h
@@ -28,6 +28,7 @@
 #define _INVERT_EO_H
 #include "global.h"
 #include "solver/solver_params.h"
+#include "solver/deflator.h"
 
 int invert_eo(spinor * const Even_new, spinor * const Odd_new, 
               spinor * const Even, spinor * const Odd,
diff --git a/linalg/arpack.h b/linalg/arpack.h
new file mode 100644
index 000000000..a4d5c7197
--- /dev/null
+++ b/linalg/arpack.h
@@ -0,0 +1,74 @@
+/*************************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Header file to interface with ARPACK package for solving large 
+ * eigenvalue problems
+ *
+ * Author: Abdou M. Abdel-Rehim (amabdelrehim@gmail.com) 2014
+ *************************************************************************/
+
+
+#ifndef _ARPACK_H
+#define _ARPACK_H
+
+#include <complex.h>
+#include "linalg/fortran.h"
+#ifdef TM_USE_MPI
+# include <mpi.h>
+#endif
+
+
+//ARPACK initlog and finilog routines for printing the ARPACK log (same for serial and parallel version)
+extern int _AFT(initlog) (int*, char*, int);
+extern int _AFT(finilog) (int*);
+
+
+//ARPACK driver routines for computing eigenvectors (serial version) 
+extern int _AFT(znaupd) (int *ido, char *bmat, int *n, char *which, int *nev, double *tol,
+                         _Complex double *resid, int *ncv, _Complex double *v, int *ldv, 
+                         int *iparam, int *ipntr, _Complex double *workd, _Complex double *workl, 
+                         int *lworkl, double *rwork, int *info, int bmat_size, int which_size );
+			
+extern int _AFT(zneupd) (int *comp_evecs, char *howmany, int *select, _Complex double *evals, 
+			 _Complex double *v, int *ldv, _Complex double *sigma, _Complex double *workev, 
+			 char *bmat, int *n, char *which, int *nev, double *tol, _Complex double *resid, 
+                         int *ncv, _Complex double *v1, int *ldv1, int *iparam, int *ipntr, 
+                         _Complex double *workd, _Complex double *workl, int *lworkl, double *rwork, int *info,
+                         int howmany_size, int bmat_size, int which_size);
+
+extern int _AFT(mcinitdebug)(int*,int*,int*,int*,int*,int*,int*,int*);
+
+//PARPACK routines (parallel version)
+#ifdef TM_USE_MPI
+extern int _AFT(pznaupd) (int *comm, int *ido, char *bmat, int *n, char *which, int *nev, double *tol,
+                         _Complex double *resid, int *ncv, _Complex double *v, int *ldv, 
+                         int *iparam, int *ipntr, _Complex double *workd, _Complex double *workl, 
+                         int *lworkl, double *rwork, int *info, int bmat_size, int which_size );
+
+extern int _AFT(pzneupd) (int *comm, int *comp_evecs, char *howmany, int *select, _Complex double *evals, 
+                         _Complex double *v, int *ldv, _Complex double *sigma, _Complex double *workev, 
+                         char *bmat, int *n, char *which, int *nev, double *tol, _Complex double *resid, 
+                         int *ncv, _Complex double *v1, int *ldv1, int *iparam, int *ipntr, 
+                         _Complex double *workd, _Complex double *workl, int *lworkl, double *rwork, int *info,
+                         int howmany_size, int bmat_size, int which_size);
+
+extern int _AFT(pmcinitdebug)(int*,int*,int*,int*,int*,int*,int*,int*);
+#endif
+
+#endif
diff --git a/linalg/assign.c b/linalg/assign.c
index 7f5a4cdef..01a799238 100644
--- a/linalg/assign.c
+++ b/linalg/assign.c
@@ -34,6 +34,7 @@
 #include <math.h>
 #include <string.h>
 #include "su3.h"
+#include "global.h"
 #include "assign.h"
 
 
@@ -67,3 +68,164 @@ void assign_su3vect(su3_vector * const R, su3_vector * const S, const int N)
   }
 }
 #endif
+
+
+/* copy a complex double S of size N into a spinor R of size N/12  */
+/* S input, R output        */
+/* S and R must not overlap */
+void assign_complex_to_spinor(spinor * const R, _Complex double * const S, const int N)
+{
+  //check the dimension
+  if( (N%12) != 0){
+    if(g_proc_id==g_stdio_proc)
+      fprintf(stderr,"Error in assign_double_to_spinor. N is not a multiple of 12");
+  }
+
+  int k; //spinor index
+  spinor *r;
+  _Complex double *s;
+
+  k=0;
+  for(int ix=0; ix<N ; ix +=12)
+  {
+     s=S+ix;
+     r=R+k;
+
+     (r->s0).c0 = *s;
+     (r->s0).c1 = *(s+1);
+     (r->s0).c2 = *(s+2);
+             
+     (r->s1).c0 = *(s+3);
+     (r->s1).c1 = *(s+4);
+     (r->s1).c2 = *(s+5);
+
+     (r->s2).c0 = *(s+6);
+     (r->s2).c1 = *(s+7);
+     (r->s2).c2 = *(s+8);
+
+
+     (r->s3).c0 = *(s+9);
+     (r->s3).c1 = *(s+10);
+     (r->s3).c2 = *(s+11);
+
+     k++;
+  } 
+
+  return;
+}
+
+
+void assign_complex_to_spinor_32(spinor32 * const R, _Complex float* const S, const int N)
+{
+  //check the dimension
+  if( (N%12) != 0){
+    if(g_proc_id==g_stdio_proc)
+      fprintf(stderr,"Error in assign_double_to_spinor. N is not a multiple of 12");
+  }
+
+  int k; //spinor index
+  spinor32 *r;
+  _Complex float *s;
+
+  k=0;
+  for(int ix=0; ix<N ; ix +=12)
+  {
+     s=S+ix;
+     r=R+k;
+
+     (r->s0).c0 = *s;
+     (r->s0).c1 = *(s+1);
+     (r->s0).c2 = *(s+2);
+             
+     (r->s1).c0 = *(s+3);
+     (r->s1).c1 = *(s+4);
+     (r->s1).c2 = *(s+5);
+
+     (r->s2).c0 = *(s+6);
+     (r->s2).c1 = *(s+7);
+     (r->s2).c2 = *(s+8);
+
+
+     (r->s3).c0 = *(s+9);
+     (r->s3).c1 = *(s+10);
+     (r->s3).c2 = *(s+11);
+
+     k++;
+  } 
+
+  return;
+}
+
+/* copy a spinor S of size N into a complex double R of size 12*N  */
+/* S input, R output        */
+/* S and R must not overlap */
+void assign_spinor_to_complex(_Complex double * const R, spinor * const S, const int N)
+{
+
+  int k; //complex double index
+  _Complex double *r;
+  spinor *s;
+
+  k=0;
+  for(int ix=0; ix<N ; ix++)
+  {
+     s=S+ix;
+     r=R+k;
+
+     *r     =  (s->s0).c0;
+     *(r+1) =  (s->s0).c1;
+     *(r+2) =  (s->s0).c2;
+
+     *(r+3) =  (s->s1).c0;
+     *(r+4) =  (s->s1).c1;
+     *(r+5) =  (s->s1).c2;
+
+     *(r+6) =  (s->s2).c0;
+     *(r+7) =  (s->s2).c1;
+     *(r+8) =  (s->s2).c2;
+
+     *(r+9)  =  (s->s3).c0;
+     *(r+10) =  (s->s3).c1;
+     *(r+11) =  (s->s3).c2;
+
+     k +=12;
+  } 
+
+  return;
+}
+
+
+void assign_spinor_to_complex_32(_Complex float * const R, spinor32 * const S, const int N)
+{
+
+  int k; //complex double index
+  _Complex float *r;
+  spinor32 *s;
+
+  k=0;
+  for(int ix=0; ix<N ; ix++)
+  {
+     s=S+ix;
+     r=R+k;
+
+     *r     =  (s->s0).c0;
+     *(r+1) =  (s->s0).c1;
+     *(r+2) =  (s->s0).c2;
+
+     *(r+3) =  (s->s1).c0;
+     *(r+4) =  (s->s1).c1;
+     *(r+5) =  (s->s1).c2;
+
+     *(r+6) =  (s->s2).c0;
+     *(r+7) =  (s->s2).c1;
+     *(r+8) =  (s->s2).c2;
+
+     *(r+9)  =  (s->s3).c0;
+     *(r+10) =  (s->s3).c1;
+     *(r+11) =  (s->s3).c2;
+
+     k +=12;
+  } 
+
+  return;
+}
diff --git a/linalg/assign.h b/linalg/assign.h
index 302829aa6..00a3fceaf 100644
--- a/linalg/assign.h
+++ b/linalg/assign.h
@@ -27,4 +27,10 @@ void assign(spinor * const R, spinor * const S, const int N);
 void assign_32(spinor32 * const R, spinor32 * const S, const int N);
 void assign_su3vect(su3_vector * const R, su3_vector * const S, const int N);
 
+void assign_complex_to_spinor(spinor * const R, _Complex double * const S, const int N); /* N is the size of S */
+void assign_spinor_to_complex(_Complex double * const R, spinor * const S, const int N); /* N is the size of S */
+
+void assign_complex_to_spinor_32(spinor32 * const R, _Complex float* const S, const int N);
+void assign_spinor_to_complex_32(_Complex float* const R, spinor32 * const S, const int N); /* N is the size of S */
+
 #endif
diff --git a/linalg/fortran.h b/linalg/fortran.h
index 95b8ccaf8..1ed1379cd 100644
--- a/linalg/fortran.h
+++ b/linalg/fortran.h
@@ -25,4 +25,10 @@
 #define _FT(s) s ## _
 #endif
 
+#if (defined NOARPACKUNDERSCORE)
+#define _AFT(s) s
+#else
+#define _AFT(s) s ## _
+#endif
+
 #endif
diff --git a/operator.c b/operator.c
index d3f7683c6..f21d8fff3 100644
--- a/operator.c
+++ b/operator.c
@@ -48,6 +48,7 @@
 #include "start.h"
 #include "solver/eigenvalues.h"
 #include "solver/solver.h"
+#include "solver/deflator.h"
 #include <io/params.h>
 #include <io/gauge.h>
 #include <io/spinor.h>
diff --git a/operator.h b/operator.h
index f5a3c3eb3..0074e95f6 100644
--- a/operator.h
+++ b/operator.h
@@ -25,6 +25,7 @@
 #include "solver/dirac_operator_eigenvectors.h"
 #include "su3.h"
 #include "solver/solver_params.h"
+#include "solver/deflator.h"
                                     
 
 #define TMWILSON 0
diff --git a/read_input.l b/read_input.l
index febe30445..1f9d402a2 100644
--- a/read_input.l
+++ b/read_input.l
@@ -102,7 +102,7 @@ static inline void rmQuotes(char *str){
   double c;
   float cs;
   int reread = 0;
-  char name[100];
+  char name[100], fname[200];
   char * type;
 
   int verbose = 0;
diff --git a/solver/Makefile.in b/solver/Makefile.in
index a2fd11132..debe8b4b5 100644
--- a/solver/Makefile.in
+++ b/solver/Makefile.in
@@ -36,8 +36,10 @@ libsolver_TARGETS = bicgstab_complex gmres incr_eigcg eigcg restart_X ortho \
 	            cgne4complex mr4complex fgmres4complex \
 	            quicksort gmres_dr lu_solve jdher Msap \
                     jdher_bi gram-schmidt eigenvalues_bi \
+		    eigenvalues_arpack arpack_cg \
+		    deflator deflated_cg \
                     bicgstab_complex_bi cg_her_bi pcg_her \
-                    sub_low_ev cg_her_nd poly_precon \
+                    sub_low_ev cg_her_nd poly_precon precon \
                     generate_dfl_subspace dfl_projector \
                     cg_mms_tm cg_mms_tm_nd mixed_cg_mms_tm_nd \
                     solver_field sumr mixed_cg_her index_jd \
diff --git a/solver/arpack_cg.c b/solver/arpack_cg.c
new file mode 100644
index 000000000..10cca6e13
--- /dev/null
+++ b/solver/arpack_cg.c
@@ -0,0 +1,733 @@
+/*****************************************************************************
+ * Copyright (C) 2014 Abdou M. Abdel-Rehim
+ *
+ * Deflating CG using eigenvectors computed using ARPACK
+ * eigenvectors used correspond to those with smallest magnitude
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ****************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<config.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+
+#ifdef TM_USE_MPI
+# include <mpi.h>
+#endif
+
+#include "global.h"
+#include "gettime.h"
+#include "linalg_eo.h"
+#include "start.h"
+#include "linalg/blas.h"
+#include "linalg/lapack.h"
+#include <io/eospinor.h>
+#include <io/params.h>
+#include <io/spinor.h>
+#include <io/utils.h>
+#include "solver_field.h"
+#include "solver/arpack_cg.h"
+
+int arpack_cg(
+  /* solver params */
+  const int N,                   /* (IN) Number of lattice sites for this process*/
+  solver_params_t solver_params, /* (IN) parameters for solver */
+  spinor * const x,              /* (IN/OUT) initial guess on input, solution on output for this RHS*/
+  spinor * const b,              /* (IN) right-hand side*/
+  matrix_mult f,                 /* (IN) f(s,r) computes s=A*r, i.e. matrix-vector multiply in double precision */
+  matrix_mult f32,               /* (IN) f(s,r) computes s=A*r, i.e. matrix-vector multiply in single precision */
+  const double eps_sq,           /* (IN) squared tolerance of convergence of the linear system for systems nrhs1+1 till nrhs*/
+  const int rel_prec,            /* (IN) 0 for using absoute error for convergence
+                                         1 for using relative error for convergence*/
+  const int maxit,               /* (IN) Maximum allowed number of iterations to solution for the linear system*/
+  matrix_mult f_final,           /* (IN) final operator application during projection of type 1 */
+  matrix_mult f_initial          /* (IN) initial operator application during projection of type 1 */
+) {
+
+  /* Static variables and arrays. */
+  static int ncurRHS=0;                  /* current number of the system being solved */                   
+  static void *_ax,*_r,*_tmps1,*_tmps2;                  
+  static spinor *ax,*r,*tmps1,*tmps2;                  
+  static _Complex double *evecs,*evals,*H,*HU,*Hinv,*initwork,*tmpv1;
+  static _Complex double *zheev_work;
+  static double *hevals,*zheev_rwork;
+  static int *IPIV; 
+  static int info_arpack=0;
+  static int nconv=0; /* number of converged eigenvectors as returned by arpack */
+  int i,j,tmpsize;
+  char cV='V',cN='N', cU='U';   
+  int ONE=1;
+  int zheev_lwork,zheev_info;
+  _Complex double c1, c2, c3, tpone=1.0,tzero=0.0;
+  double d1,d2,d3;
+  double et1,et2;  /* timing variables */
+  char evecs_filename[500];
+  FILE *evecs_fs=NULL;
+  size_t evecs_count;
+  WRITER *evecs_writer=NULL;
+  spinor *evecs_ptr0 = NULL, *evecs_ptr1 = NULL;
+  paramsPropagatorFormat *evecs_propagatorFormat = NULL;
+  void *evecs_io_buffer = NULL;
+
+  int parallel;        /* for parallel processing of the scalar products */
+#ifdef TM_USE_MPI
+    parallel=1;
+#else
+    parallel=0;
+#endif
+
+  /* leading dimension for spinor vectors */
+  int LDN;
+  if(N==VOLUME)
+     LDN = VOLUMEPLUSRAND;
+  else
+     LDN = VOLUMEPLUSRAND/2; 
+
+  /*(IN) Number of right-hand sides to be solved*/ 
+  const int nrhs =   solver_params.arpackcg_nrhs; 
+  /*(IN) First number of right-hand sides to be solved using tolerance eps_sq1*/ 
+  const int nrhs1 =   solver_params.arpackcg_nrhs1;
+  /*(IN) squared tolerance of convergence of the linear system for systems 1 till nrhs1*/
+  const double eps_sq1 = solver_params.arpackcg_eps_sq1;
+  /*(IN) suqared tolerance for restarting cg */
+  const double res_eps_sq =   solver_params.arpackcg_res_eps_sq;
+
+  /* parameters for arpack */
+
+  /*(IN) number of eigenvectors to be computed by arpack*/
+  const int nev = solver_params.arpackcg_nev;
+   /*(IN) size of the subspace used by arpack with the condition (nev+1) =< ncv*/
+  const int ncv = solver_params.arpackcg_ncv;
+  /*(IN) tolerance for computing eigenvalues with arpack */
+  double arpack_eig_tol =   solver_params.arpackcg_eig_tol;
+  /*(IN) maximum number of iterations to be used by arpack*/
+  int arpack_eig_maxiter =   solver_params.arpackcg_eig_maxiter;
+  /*(IN) 0 for eigenvalues with smallest real part "SR"
+         1 for eigenvalues with largest real part "LR"
+         2 for eigenvalues with smallest absolute value "SM"
+         3 for eigenvalues with largest absolute value "LM"
+         4 for eigenvalues with smallest imaginary part "SI"
+         5 for eigenvalues with largest imaginary part  "LI"*/
+  int kind =   solver_params.arpackcg_evals_kind;
+  /*(IN) 0 don't compute the eiegnvalues and their residuals of the original system 
+         1 compute the eigenvalues and the residuals for the original system (the orthonormal basis
+           still be used in deflation and they are not overwritten).*/
+  int comp_evecs =   solver_params.arpackcg_comp_evecs;
+  /*(IN) 0 no polynomial acceleration; 1 use polynomial acceleration*/
+  int acc =   solver_params.use_acc;
+  /*(IN) degree of the Chebyshev polynomial (irrelevant if acc=0)*/
+  int cheb_k = solver_params.cheb_k;
+  /*(IN) lower end of the interval where the acceleration will be used (irrelevant if acc=0)*/
+  double emin = solver_params.op_evmin;
+  /*(IN) upper end of the interval where the acceleration will be used (irrelevant if acc=0)*/
+  double emax = solver_params.op_evmax;
+  /*(IN) file name to be used for printing out debugging information from arpack*/
+  char *arpack_logfile = solver_params.arpack_logfile;
+  /*(IN) read eigenvectors in Schur basis from file */
+  int  arpack_read_ev = solver_params.arpackcg_read_ev;
+  /*(IN) write eigenvectors in Schur basis to file */
+  int  arpack_write_ev = solver_params.arpackcg_write_ev;
+  /*(IN) file name to be used for reading and writing evecs from and to disc */
+  char *arpack_evecs_filename = solver_params.arpack_evecs_filename;
+   /*(IN) precision used for writing eigenvectors */
+  int arpack_evecs_writeprec = solver_params.arpack_evecs_writeprec;
+  /* how to project with approximate eigenvectors */
+  int projection_type = solver_params.projection_type;
+  /* file format for evecs used by arpack */
+  char *arpack_evecs_fileformat = solver_params.arpack_evecs_fileformat; 
+
+  /*-------------------------------------------------------------
+    if this is the first right hand side, allocate memory, 
+    call arpack, and compute resiudals of eigenvectors if needed
+    -------------------------------------------------------------*/ 
+  if(ncurRHS==0){ 
+#if (defined SSE || defined SSE2 || defined SSE3)
+    _ax = malloc((LDN+ALIGN_BASE)*sizeof(spinor));
+    if(_ax==NULL)
+    {
+       if(g_proc_id == g_stdio_proc)
+          fprintf(stderr,"[arpack_cg] insufficient memory for _ax inside arpack_cg.\n");
+       exit(1);
+    }
+    else
+       {ax  = (spinor *) ( ((unsigned long int)(_ax)+ALIGN_BASE)&~ALIGN_BASE);}
+
+    _r = malloc((LDN+ALIGN_BASE)*sizeof(spinor));
+    if(_r==NULL)
+    {
+       if(g_proc_id == g_stdio_proc)
+          fprintf(stderr,"[arpack_cg] insufficient memory for _r inside arpack_cg.\n");
+       exit(1);
+    }
+    else
+       {r  = (spinor *) ( ((unsigned long int)(_r)+ALIGN_BASE)&~ALIGN_BASE);}
+
+    _tmps1 = malloc((LDN+ALIGN_BASE)*sizeof(spinor));
+    if(_tmps1==NULL)
+    {
+       if(g_proc_id == g_stdio_proc)
+          fprintf(stderr,"[arpack_cg] insufficient memory for _tmps1 inside arpack_cg.\n");
+       exit(1);
+    }
+    else
+       {tmps1  = (spinor *) ( ((unsigned long int)(_tmps1)+ALIGN_BASE)&~ALIGN_BASE);}
+
+    _tmps2 = malloc((LDN+ALIGN_BASE)*sizeof(spinor));
+    if(_tmps2==NULL)
+    {
+       if(g_proc_id == g_stdio_proc)
+          fprintf(stderr,"[arpack_cg] insufficient memory for _tmps2 inside arpack_cg.\n");
+       exit(1);
+    }
+    else
+       {tmps2  = (spinor *) ( ((unsigned long int)(_tmps2)+ALIGN_BASE)&~ALIGN_BASE);}
+
+#else
+    ax = (spinor *) malloc(LDN*sizeof(spinor));
+    r  = (spinor *) malloc(LDN*sizeof(spinor));
+    tmps1 = (spinor *) malloc(LDN*sizeof(spinor));
+    tmps2 = (spinor *) malloc(LDN*sizeof(spinor));
+    
+    if( (ax == NULL)  || (r==NULL) || (tmps1==NULL) || (tmps2==NULL) )
+    {
+       if(g_proc_id == g_stdio_proc)
+          fprintf(stderr,"[arpack_cg] insufficient memory for ax,r,tmps1,tmps2 inside arpack_cg.\n");
+       exit(1);
+    }
+#endif
+
+
+    evecs = (_Complex double *) malloc(ncv*12*N*sizeof(_Complex double)); /* note: no extra buffer  */
+    evals = (_Complex double *) malloc(ncv*sizeof(_Complex double)); 
+    tmpv1 = (_Complex double *) malloc(12*N*sizeof(_Complex double));
+
+    if((evecs == NULL)  || (evals==NULL) || (tmpv1==NULL))
+    {
+       if(g_proc_id == g_stdio_proc)
+          fprintf(stderr,"[arpack_cg] insufficient memory for evecs and evals inside arpack_cg.\n");
+       exit(1);
+    }
+
+    if ( arpack_read_ev == 1) {
+
+      if (strcmp(arpack_evecs_fileformat, "partfile") == 0) {
+        /* set evec filenmae */
+        sprintf(evecs_filename, "%s.%.5d.pt%.2dpx%.2dpy%.2dpz%.2d", arpack_evecs_filename, nev, g_proc_coords[0], g_proc_coords[1], g_proc_coords[2], g_proc_coords[3]);
+        evecs_fs = fopen(evecs_filename, "r");
+        if (evecs_fs == NULL) {
+          fprintf(stderr, "[arpack_cg] (%.4d) Error, could not open file %s for reading\n", g_cart_id, evecs_filename);
+          return(-2);
+        }
+        fprintf(stdout, "# [arpack_cg] reading eigenvectors from file %s\n", evecs_filename);
+
+        if(arpack_evecs_writeprec == 64) {
+ 
+          evecs_io_buffer = (void*)evecs;
+   
+          et1=gettime();
+          evecs_count = fread( evecs_io_buffer, sizeof(_Complex double), (size_t)nev*12*N, evecs_fs);
+          et2=gettime();
+        
+        } else {
+          evecs_io_buffer = malloc(sizeof(_Complex double) * (size_t)nev*12*N );
+          if( evecs_io_buffer == NULL) {
+            fprintf(stderr, "[arpack_cg] (%.4d) Error, could not allocate memory for evecs_io_buffer\n", g_cart_id);
+            return(-42);
+          }
+  
+          et1=gettime();
+          evecs_count = fread( evecs_io_buffer, sizeof(_Complex double)/2, (size_t)nev*12*N, evecs_fs);
+          et2=gettime();
+
+          single2double(evecs, evecs_io_buffer, nev*24*N);
+
+          free( evecs_io_buffer );
+          evecs_io_buffer = NULL;
+        }
+       
+        if( evecs_count != ((size_t)nev*12*N) ) {
+          fprintf(stderr, "[arpack_cg] (%.4d) Error, could not proper amount of data from file %s\n", g_cart_id, evecs_filename);
+          return(-3);
+        }
+        fclose(evecs_fs);
+        evecs_fs = NULL;
+        if(g_proc_id == g_stdio_proc) {
+          fprintf(stdout,"# [arpack_cg] ARPACK time for reading %d eigenvectors: %+e seconds\n", nev, et2-et1);
+        }
+      } else if(strcmp(arpack_evecs_fileformat, "single") == 0) {
+
+        if(N==VOLUME) {
+          for(i=0; i<nev; i++) {
+            sprintf(evecs_filename, "%s.ev%.5d", arpack_evecs_filename, i);
+            evecs_ptr0 = (spinor*)&(evecs[i*12*N]);
+            evecs_ptr1 = NULL;
+            read_spinor(evecs_ptr0,  evecs_ptr1, evecs_filename, 0);
+          } /* end of loop on eigenvectors */
+        } else if(N==VOLUME/2) {
+          for(i=0; i<nev/2; i++) {
+            sprintf(evecs_filename, "%s.ev%.5d", arpack_evecs_filename, 2*i);
+            evecs_ptr0 = (spinor*)&(evecs[(2*i  )*12*N]);
+            evecs_ptr1 = (spinor*)&(evecs[(2*i+1)*12*N]);
+            read_spinor(evecs_ptr0,  evecs_ptr1, evecs_filename, 0);
+          } /* end of loop on eigenvectors */
+        }
+      }   /* of if arpack_evecs_fileformat */
+
+      /* set info_arpack pro forma to SUCCESS */
+      nconv = nev;
+      info_arpack = 0;
+    } else {
+      et1=gettime();
+      evals_arpack(N,nev,ncv,kind,acc,cheb_k,emin,emax,evals,evecs,arpack_eig_tol,arpack_eig_maxiter,f,&info_arpack,&nconv,arpack_logfile);
+      et2=gettime();
+
+      if(info_arpack != 0){ /* arpack didn't converge */
+      if(g_proc_id == g_stdio_proc)
+        fprintf(stderr,"[arpack_cg] WARNING: ARPACK didn't converge. exiting..\n");
+        return -1;
+      }
+    
+      if(g_proc_id == g_stdio_proc)
+      {
+         fprintf(stdout,"# [arpack_cg] ARPACK has computed %d eigenvectors\n",nconv);
+         fprintf(stdout,"# [arpack_cg] ARPACK time: %+e\n",et2-et1);
+      }
+
+      if ( arpack_write_ev == 1) {
+
+        if(strcmp(arpack_evecs_fileformat, "partfile") == 0 ) {
+
+          /* set evec filenmae */
+          sprintf(evecs_filename, "%s.%.5d.pt%.2dpx%.2dpy%.2dpz%.2d", arpack_evecs_filename, nconv, g_proc_coords[0], g_proc_coords[1], g_proc_coords[2], g_proc_coords[3]);
+
+          evecs_fs = fopen(evecs_filename, "w");
+          if (evecs_fs == NULL) {
+            fprintf(stderr, "[arpack_cg] (%.4d) Error, could not open file %s for writing\n", g_cart_id, evecs_filename);
+            return(-4);
+          }
+        
+          if(arpack_evecs_writeprec == 64) {
+
+            evecs_io_buffer = (void*)evecs;
+ 
+            et1=gettime();
+            evecs_count = fwrite( evecs_io_buffer, sizeof(_Complex double), (size_t)nconv*12*N, evecs_fs);
+            et2=gettime();
+
+          } else {
+            evecs_io_buffer = malloc(sizeof(_Complex double) * (size_t)nconv*12*N );
+            if( evecs_io_buffer == NULL) {
+              fprintf(stderr, "[arpack_cg] (%.4d) Error, could not allocate memory for evecs_io_buffer\n", g_cart_id);
+              return(-41);
+            }
+            double2single(evecs_io_buffer, evecs, nconv*24*N);
+ 
+            et1=gettime();
+            evecs_count = fwrite( evecs_io_buffer, sizeof(_Complex double)/2, (size_t)nconv*12*N, evecs_fs);
+            et2=gettime();
+            free(evecs_io_buffer);
+            evecs_io_buffer = NULL;
+          }
+ 
+          if( evecs_count != ((size_t)nconv*12*N) ) {
+            fprintf(stderr, "[arpack_cg] (%.4d) Error, could not write proper amount of data to file %s\n", g_cart_id, evecs_filename);
+            return(-5);
+          }
+          fclose(evecs_fs);
+          evecs_fs = NULL;
+
+          if(g_proc_id == g_stdio_proc) {
+            fprintf(stdout,"[arpack_cg] (%.4d) ARPACK time for writing %d eigenvectors: %+e seconds\n", g_cart_id, nconv, et2-et1);
+          }
+
+        } else if (strcmp(arpack_evecs_fileformat, "single") == 0) {
+
+          if(N==VOLUME) {
+            for(i=0; i<nconv; i++) {
+              sprintf(evecs_filename, "%s.ev%.5d", arpack_evecs_filename, i);
+              construct_writer(&evecs_writer, evecs_filename, 0);
+              evecs_propagatorFormat = construct_paramsPropagatorFormat(arpack_evecs_writeprec, 1);
+              write_propagator_format(evecs_writer, evecs_propagatorFormat);
+              free(evecs_propagatorFormat);
+              evecs_ptr0 = (spinor*)&(evecs[i*12*N]);
+              evecs_ptr1 = NULL;
+              write_spinor(evecs_writer, &evecs_ptr0, &evecs_ptr1, 1, arpack_evecs_writeprec);
+              destruct_writer(evecs_writer);
+              evecs_writer=NULL;
+            } /* end of loop on converged eigenvectors */
+          } else if(N==VOLUME/2) {
+            for(i=0; i<nconv/2; i++) {
+              sprintf(evecs_filename, "%s.ev%.5d", arpack_evecs_filename, 2*i);
+              construct_writer(&evecs_writer, evecs_filename, 0);
+              evecs_propagatorFormat = construct_paramsPropagatorFormat(arpack_evecs_writeprec, 1);
+              write_propagator_format(evecs_writer, evecs_propagatorFormat);
+              free(evecs_propagatorFormat);
+              evecs_ptr0 = (spinor*)&(evecs[(2*i  )*12*N]);
+              evecs_ptr1 = (spinor*)&(evecs[(2*i+1)*12*N]);
+              write_spinor(evecs_writer, &evecs_ptr0, &evecs_ptr1,1, arpack_evecs_writeprec);
+              destruct_writer(evecs_writer);
+              evecs_writer=NULL;
+            }  /* end of loop on converged eigenvectors */
+          }    /* end of if N == VOLUME */
+
+        }      /* of if arpack_evecs_fileformat */
+
+      }        /* end of if arpack_write_ev == 1 */
+
+    }          /* end of if arpack_read_ev == 1 */
+
+    H        = (_Complex double *) malloc(nconv*nconv*sizeof(_Complex double)); 
+    HU       = (_Complex double *) malloc(nconv*nconv*sizeof(_Complex double)); 
+    Hinv     = (_Complex double *) malloc(nconv*nconv*sizeof(_Complex double)); 
+    initwork = (_Complex double *) malloc(nconv*sizeof(_Complex double)); 
+    IPIV     = (int *) malloc(nconv*sizeof(int));
+    zheev_lwork = 3*nconv;
+    zheev_work  = (_Complex double *) malloc(zheev_lwork*sizeof(_Complex double));
+    zheev_rwork = (double *) malloc(3*nconv*sizeof(double));
+    hevals      = (double *) malloc(nconv*sizeof(double));
+
+    if((H==NULL) || (HU==NULL) || (Hinv==NULL) || (initwork==NULL) || (IPIV==NULL) || (zheev_lwork==NULL) || (zheev_rwork==NULL) || (hevals==NULL))
+    {
+       if(g_proc_id == g_stdio_proc)
+          fprintf(stderr,"[arpack_cg] insufficient memory for H, HU, Hinv, initwork, IPIV, zheev_lwork, zheev_rwork, hevals inside arpack_cg.\n");
+       exit(1);
+    }
+
+    et1=gettime();
+    /* compute the elements of the hermitian matrix H 
+       leading dimension is nconv and active dimension is nconv */
+    
+    if( projection_type == 0) {
+    
+      for(i=0; i<nconv; i++)
+      {
+        assign_complex_to_spinor(r,&evecs[i*12*N],12*N);
+        f(ax,r);
+        c1 = scalar_prod(r,ax,N,parallel);
+        H[i+nconv*i] = creal(c1);  /* diagonal should be real */
+        for(j=i+1; j<nconv; j++)
+        {
+          assign_complex_to_spinor(r,&evecs[j*12*N],12*N);
+          c1 = scalar_prod(r,ax,N,parallel);
+          H[j+nconv*i] = c1;
+          H[i+nconv*j] = conj(c1); /* enforce hermiticity */
+        }
+      }
+
+    } else if ( projection_type == 1 )  {
+
+      for(i=0; i<nconv; i++)
+      {
+        assign_complex_to_spinor(tmps1, &evecs[i*12*N], 12*N);
+        f_final(r, tmps1);
+        f(ax,r);
+        c1 = scalar_prod(r,ax,N,parallel);
+        c2 = scalar_prod(r,r,N,parallel);
+        H[i+nconv*i] = creal(c1) / creal(c2);   /* diagonal should be real */
+        for(j=i+1; j<nconv; j++)
+        {
+          assign_complex_to_spinor(tmps1, &evecs[j*12*N], 12*N);
+          f_final(r, tmps1);
+          c1 = scalar_prod(r,ax,N,parallel);
+          c3 = scalar_prod(r, r, N, parallel);
+
+          H[j+nconv*i] = c1 / sqrt( creal(c2) * creal(c3) );
+          H[i+nconv*j] = conj(c1) / sqrt( creal(c2) * creal(c3) ); /* enforce hermiticity */
+        }
+      }
+    }
+
+
+    et2=gettime();
+    if(g_proc_id == g_stdio_proc) {
+      fprintf(stdout,"[arpack_cg] time to compute H: %+e\n",et2-et1);
+    }
+
+/*
+    if(g_cart_id == 0) {
+      for(i=0; i<nconv; i++) {
+      for(j=0; j<nconv; j++) {
+        fprintf(stdout, "H[%d, %d] = %25.16e %25.16e\n", i, j, creal(H[i*nconv+j]), cimag(H[i*nconv+j]));
+      }}
+    }
+*/
+
+
+
+     et1=gettime();
+     /* compute Ritz values and Ritz vectors if needed */
+     if( (nconv>0) && (comp_evecs !=0))
+     {
+         /* copy H into HU */
+         tmpsize=nconv*nconv;
+         _FT(zcopy)(&tmpsize,H,&ONE,HU,&ONE);
+
+         /* compute eigenvalues and eigenvectors of HU*/
+         /* SUBROUTINE ZHEEV( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, RWORK,INFO ) */
+         _FT(zheev)(&cV,&cU,&nconv,HU,&nconv,hevals,zheev_work,&zheev_lwork,zheev_rwork,&zheev_info,1,1);
+
+         if(zheev_info != 0)
+         {
+	    if(g_proc_id == g_stdio_proc) 
+	    {
+	        fprintf(stderr,"[arpack_cg] Error in ZHEEV:, info =  %d\n",zheev_info); 
+                fflush(stderr);
+	    }
+	    exit(1);
+         }
+
+         /* If you want to replace the schur (orthonormal) basis by eigen basis
+            use something like this. It is better to use the schur basis because
+            they are better conditioned. Use this part only to get the eigenvalues
+            and their resduals for the operator (D^\daggerD)
+            esize=(ncv-nconv)*12*N;
+            Zrestart_X(evecs,12*N,HU,12*N,nconv,nconv,&evecs[nconv*N],esize); */
+
+         /* compute residuals and print out results */
+
+	 if(g_proc_id == g_stdio_proc)
+	 {fprintf(stdout,"# [arpack_cg] Ritz values of A and their residulas (||A*x-lambda*x||/||x||\n"); 
+          fprintf(stdout,"# [arpack_cg] =============================================================\n");
+          fflush(stdout);}
+
+         for(i=0; i<nconv; i++)
+         {
+	    tmpsize=12*N;
+            _FT(zgemv)(&cN,&tmpsize,&nconv,&tpone,evecs,&tmpsize,
+		       &HU[i*nconv],&ONE,&tzero,tmpv1,&ONE,1);
+
+            assign_complex_to_spinor(r,tmpv1,12*N);
+
+            d1=square_norm(r,N,parallel);
+            
+            f(ax,r);
+
+            mul_r(tmps1,hevals[i],r,N);
+
+            diff(tmps2,ax,tmps1,N);
+	    
+	    d2= square_norm(tmps2,N,parallel);
+
+            d3= sqrt(d2/d1);
+	    
+	    if(g_proc_id == g_stdio_proc)
+	    {fprintf(stdout,"Eval[%06d]: %22.15E rnorm: %22.15E\n", i, hevals[i], d3); fflush(stdout);}
+        } 
+     }  /* if( (nconv_arpack>0) && (comp_evecs !=0)) */
+     et2=gettime();
+     if(g_proc_id == g_stdio_proc) {
+       fprintf(stdout,"[arpack_cg] time to compute eigenvectors: %+e\n",et2-et1);
+     }
+
+  }  /* if(ncurRHS==0) */
+    
+  double eps_sq_used,restart_eps_sq_used;  /* tolerance squared for the linear system */
+
+  double cur_res; /* current residual squared */
+
+  /*increment the RHS counter*/
+  ncurRHS = ncurRHS +1; 
+
+  /* set the tolerance to be used for this right-hand side  */
+  if(ncurRHS > nrhs1){
+    eps_sq_used = eps_sq;
+  }
+  else{
+    eps_sq_used = eps_sq1;
+  }
+  
+  if(g_proc_id == g_stdio_proc && g_debug_level > 0) {
+    fprintf(stdout, "# [arpack_cg] System %d, eps_sq %e, projection type %d\n",ncurRHS,eps_sq_used, projection_type); 
+    fflush(stdout);
+  } 
+  
+  /*---------------------------------------------------------------*/
+  /* Call init-CG until this right-hand side converges             */
+  /*---------------------------------------------------------------*/
+  double wt1,wt2,wE,wI;
+  double normsq,tol_sq;
+  int flag,maxit_remain,numIts,its;
+  int info_lapack;
+
+  wE = 0.0; wI = 0.0;     /* Start accumulator timers */
+  flag = -1;    	  /* System has not converged yet */
+  maxit_remain = maxit;   /* Initialize Max and current # of iters   */
+  numIts = 0;  
+  restart_eps_sq_used=res_eps_sq;
+
+  while( flag == -1 )
+  {
+    
+    if(nconv > 0)
+    {
+
+
+      /* --------------------------------------------------------- */
+      /* Perform init-CG with evecs vectors                        */
+      /* xinit = xinit + evecs*Hinv*evec'*(b-Ax0) 		   */
+      /* --------------------------------------------------------- */
+      wt1 = gettime();
+
+      /*r0=b-Ax0*/
+      f(ax,x); /*ax = A*x */
+      diff(r,b,ax,N);  /* r=b-A*x */
+
+      if( projection_type == 0) {
+
+        /* x = x + evecs*inv(H)*evecs'*r */
+        for(int i=0; i < nconv; i++)
+        {
+           assign_complex_to_spinor(tmps1,&evecs[i*12*N],12*N);
+           initwork[i]= scalar_prod(tmps1,r,N,parallel);
+        }
+
+        /* solve the linear system H y = c */
+        tmpsize=nconv*nconv;
+        _FT(zcopy) (&tmpsize,H,&ONE,Hinv,&ONE); /* copy H into Hinv */
+        /* SUBROUTINE ZGESV( N, NRHS, A, LDA, IPIV, B, LDB, INFO ) */
+        _FT(zgesv) (&nconv,&ONE,Hinv,&nconv,IPIV,initwork,&nconv,&info_lapack);
+
+        if(info_lapack != 0)
+        {
+           if(g_proc_id == g_stdio_proc) {
+              fprintf(stderr, "[arpack_cg] Error in ZGESV:, info =  %d\n",info_lapack); 
+              fflush(stderr);
+           }
+           exit(1);
+        }
+
+        /* x = x + evecs*inv(H)*evecs'*r */
+        for(i=0; i<nconv; i++)
+        {
+          assign_complex_to_spinor(tmps1,&evecs[i*12*N],12*N);
+          assign_add_mul(x,tmps1,initwork[i],N);
+        }
+
+      } else if ( projection_type == 1 ) {
+        /* x = x + evecs*inv(H)*evecs'*r */
+
+        /* tmps2 = Q^+ r */
+        f_initial(tmps2, r);
+
+        for(int i=0; i < nconv; i++) {
+          /* tmps1 = v_i */
+          assign_complex_to_spinor(tmps1,&evecs[i*12*N],12*N);
+
+          /* initwork_i = v_i^+ Q^+ r / lambda_i^2 */
+          initwork[i]= scalar_prod(tmps1, tmps2, N, parallel) / ( H[i*nconv+i] * H[i*nconv+i] );
+        }
+
+        memset(tmps2, 0, N*sizeof(spinor) );
+        for(i=0; i<nconv; i++) {
+          assign_complex_to_spinor(tmps1, &evecs[i*12*N], 12*N);
+          assign_add_mul(tmps2, tmps1, initwork[i], N);
+        }
+
+        /* apply final operator */
+        f_final(tmps1, tmps2);
+        assign_add_mul(x, tmps1, 1., N);
+
+      }  /* end of if projection type */
+
+      /* compute elapsed time and add to accumulator */
+
+      wt2 = gettime();
+      wI = wI + wt2-wt1;
+      
+    }/* if(nconv > 0) */
+
+
+    /* which tolerance to use */
+    if(eps_sq_used > restart_eps_sq_used)
+    {
+       tol_sq = eps_sq_used;
+       flag   = 1; /* shouldn't restart again */
+    }
+    else
+    {
+       tol_sq = restart_eps_sq_used;
+    }
+
+    wt1 = gettime();
+    its = cg_her(x,b,maxit_remain,tol_sq,rel_prec,N,f); 
+          
+    wt2 = gettime();
+
+    wE = wE + wt2-wt1;
+
+    /* check convergence */
+    if(its == -1)
+    {
+       /* cg didn't converge */
+       if(g_proc_id == g_stdio_proc) {
+         fprintf(stderr, "[arpack_cg] CG didn't converge within the maximum number of iterations in arpack_cg. Exiting...\n"); 
+         fflush(stderr);
+         exit(1);
+         
+       }
+    } 
+    else
+    {
+       numIts += its;   
+       maxit_remain = maxit - numIts; /* remaining number of iterations */
+       restart_eps_sq_used = restart_eps_sq_used*res_eps_sq; /* prepare for the next restart */
+    }
+    
+  }
+  /* end while (flag ==-1)               */
+  
+  /* ---------- */
+  /* Reporting  */
+  /* ---------- */
+  /* compute the exact residual */
+  f(ax,x); /* ax= A*x */
+  diff(r,b,ax,N);  /* r=b-A*x */	
+  normsq=square_norm(r,N,parallel);
+  if(g_debug_level > 0 && g_proc_id == g_stdio_proc)
+  {
+    fprintf(stdout, "# [arpack_cg] For this rhs:\n");
+    fprintf(stdout, "# [arpack_cg] Total initCG Wallclock : %+e\n", wI);
+    fprintf(stdout, "# [arpack_cg] Total cg Wallclock : %+e\n", wE);
+    fprintf(stdout, "# [arpack_cg] Iterations: %-d\n", numIts); 
+    fprintf(stdout, "# [arpack_cg] Actual Resid of LinSys  : %+e\n",normsq);
+  }
+
+
+  /* free memory if this was your last system to solve */
+  if(ncurRHS == nrhs){
+#if ( (defined SSE) || (defined SSE2) || (defined SSE3)) 
+    free(_ax);  free(_r);  free(_tmps1); free(_tmps2);
+#else
+    free(ax); free(r); free(tmps1); free(tmps2);
+#endif
+    free(evecs); free(evals); free(H); free(HU); free(Hinv);
+    free(initwork); free(tmpv1); free(zheev_work);
+    free(hevals); free(zheev_rwork); free(IPIV);
+  }
+
+
+  return numIts;
+}
+ 
+
+
+      
diff --git a/solver/arpack_cg.h b/solver/arpack_cg.h
new file mode 100644
index 000000000..b683b268a
--- /dev/null
+++ b/solver/arpack_cg.h
@@ -0,0 +1,52 @@
+/*****************************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * Deflating CG using eigenvectors computed using ARPACK
+ *
+ * Author: A.M. Abdel-Rehim (amabdelrehim@gmail.com)
+ *         Novemebr 2014
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ****************************************************************************/
+/* A sample input is given in the sample-input folder */
+
+#ifndef _ARPACK_CG_H
+#define _ARPACK_CG_H
+
+#include "su3.h"
+#include "solver/matrix_mult_typedef.h"
+#include "solver/solver_params.h"
+#include "solver/eigenvalues_arpack.h"
+
+
+int arpack_cg(
+  /* solver params */
+  const int N,                   /* (IN) Number of lattice sites for this process*/
+  solver_params_t solver_params, /* (IN) parameters for solver */
+  spinor * const x,              /* (IN/OUT) initial guess on input, solution on output for this RHS*/
+  spinor * const b,              /* (IN) right-hand side*/
+  matrix_mult f,                 /* (IN) f(s,r) computes s=A*r, i.e. matrix-vector multiply in double precision */
+  matrix_mult f32,               /* (IN) f(s,r) computes s=A*r, i.e. matrix-vector multiply in single precision */
+  const double eps_sq,           /* (IN) squared tolerance of convergence of the linear system for systems nrhs1+1 till nrhs*/
+  const int rel_prec,            /* (IN) 0 for using absoute error for convergence
+                                         1 for using relative error for convergence*/
+  const int maxit,               /* (IN) Maximum allowed number of iterations to solution for the linear system*/
+  matrix_mult f_final,           /* (IN) final operator application during projection of type 1 */
+  matrix_mult f_initial          /* (IN) initial operator application during projection of type 1 */
+);
+
+#endif 
diff --git a/solver/deflated_cg.h b/solver/deflated_cg.h
new file mode 100644
index 000000000..41d240137
--- /dev/null
+++ b/solver/deflated_cg.h
@@ -0,0 +1,43 @@
+/*****************************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * Deflating CG using eigenvectors computed using ARPACK
+ *
+ * Author: A.M. Abdel-Rehim (amabdelrehim@gmail.com)
+ *         Novemebr 2014
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ****************************************************************************/
+/* A sample input is given in the sample-input folder */
+
+#ifndef _DEFLATED_CG_H
+#define _DEFLATED_CG_H
+
+#include "su3.h"
+/* #include "solver/matrix_mult_typedef.h" */
+#include "solver/solver_params.h"
+#include "solver/deflator.h"
+
+
+int exactdeflated_cg(
+  solver_params_t solver_params, /* (IN) parameters for solver */
+  deflator_params_t deflator_params, /* (IN) parameters for deflator */
+  spinor * const x,              /* (IN/OUT) initial guess on input, solution on output for this RHS*/
+  spinor * const b               /* (IN) right-hand side*/
+);
+
+#endif 
diff --git a/solver/eigenvalues_arpack.c b/solver/eigenvalues_arpack.c
new file mode 100644
index 000000000..0ff771573
--- /dev/null
+++ b/solver/eigenvalues_arpack.c
@@ -0,0 +1,515 @@
+/*********************************************************************** 
+ * Interface for solving the eigenvalue problem A*x=lambda*x 
+ * for a complex matrix A using ARPACK and PARPACK. The matrix
+ * is accessed through a matrix-vector multiplication function.
+ *
+ * Author: A.M. Abdel-Rehim, 2014
+ *
+ * For reference see the driver programs zndrv1 and in the EXAMPLES 
+ * subdriectories of ARPACK and PARPACK.
+ *
+ * This file is part of tmLQCD software suite
+ ***********************************************************************/
+#ifdef HAVE_CONFIG_H
+# include<config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include <string.h>
+#ifdef TM_USE_MPI
+# include <mpi.h>
+#endif
+#include "global.h"
+#include "linalg_eo.h"
+#include "start.h"
+#include "quicksort.h"
+#include "linalg/blas.h"
+#include "linalg/lapack.h"
+#include "solver/eigenvalues_arpack.h"
+void evals_arpack(
+  int n, 
+  int nev, 
+  int ncv, 
+  int which,
+  int use_acc,
+  int cheb_k,
+  double amin,
+  double amax,
+  _Complex double *evals, 
+  _Complex double *v,
+  double tol, 
+  int maxiter, 
+  matrix_mult av, 
+  int *info, 
+  int *nconv,
+  char *arpack_logfile)
+/*
+  compute nev eigenvectors using ARPACK and PARPACK
+  n     : (IN) size of the local lattice
+  nev   : (IN) number of eigenvectors requested.
+  ncv   : (IN) size of the subspace used to compute eigenvectors (nev+1) =< ncv < 12*n
+          where 12n is the size of the matrix under consideration
+  which : (IN) which eigenvectors to compute. Choices are:
+          0: smallest real part "SR"
+          1: largest real part "LR"
+          2: smallest absolute value "SM"
+          3: largest absolute value "LM"
+          4: smallest imaginary part "SI"
+          5: largest imaginary part "LI"
+  use_acc: (IN) specify the polynomial acceleration mode
+                0 no acceleration
+                1 use acceleration by computing the eigenvectors of a shifted-normalized chebyshev polynomial
+  cheb_k   : (IN) degree of the chebyshev polynomial to be used for acceleration (irrelevant when use_acc=0 and init_resid_arpack=0)
+  amin,amax: (IN) bounds of the interval [amin,amax] for the acceleration polynomial (irrelevant when use_acc=0 and init_resid_arpack=0)
+  evals : (OUT) array of size nev+1 which has the computed nev Ritz values
+  v     : computed eigenvectors. Size is n*ncv spinors.
+  tol    : Requested tolerance for the accuracy of the computed eigenvectors.
+           A value of 0 means machine precision.
+  maxiter: maximum number of restarts (iterations) allowed to be used by ARPACK
+  av     : operator for computing the action of the matrix on the vector
+           av(vout,vin) where vout is output spinor and vin is input spinors.
+  info   : output from arpack. 0 means that it converged to the desired tolerance. 
+           otherwise, an error message is printed to stderr 
+  nconv  : actual number of converged eigenvectors.
+  arpack_logfile: name of the logfile to be used by arpack
+*/ 
+{
+
+  //print the input:
+  //================
+   if(g_proc_id == g_stdio_proc)
+   {       
+      fprintf(stdout,"# [eigenvalues_arpack] Input to eigenvalues_arpack\n");
+      fprintf(stdout,"# [eigenvalues_arpack] ===========================\n");
+      fprintf(stdout,"# [eigenvalues_arpack] n= %d\n", n);
+      fprintf(stdout,"# [eigenvalues_arpack] The number of Ritz values requested is %d\n", nev);
+      fprintf(stdout,"# [eigenvalues_arpack] The number of Arnoldi vectors generated is %d\n", ncv);
+      fprintf(stdout,"# [eigenvalues_arpack] What portion of the spectrum which: %d\n", which);
+      fprintf(stdout,"# [eigenvalues_arpack] polynomial acceleartion option %d\n", use_acc );
+      fprintf(stdout,"# [eigenvalues_arpack] chebyshev polynomial paramaters: degree %d amin %+e amx %+e\n",cheb_k,amin,amax); 
+      fprintf(stdout,"# [eigenvalues_arpack] The convergence criterion is %+e\n", tol);
+      fprintf(stdout,"# [eigenvalues_arpack] maximum number of iterations for arpack %d\n",maxiter);
+   }
+
+   //create the MPI communicator
+#ifdef TM_USE_MPI
+   MPI_Fint mpi_comm_f = MPI_Comm_c2f(g_cart_grid);
+   //MPI_Comm comm; //communicator used when we call PARPACK
+   //int comm_err = MPI_Comm_dup(MPI_COMM_WORLD,&comm); //duplicate the MPI_COMM_WORLD to create a communicator to be used with arpack
+   //if(comm_err != MPI_SUCCESS) { //error when trying to duplicate the communicator
+   //  if(g_proc_id == g_stdio_proc){
+   //    fprintf(stderr,"[eigenvalues_arpack] MPI_Comm_dup return with an error. Exciting...\n");
+   //    exit(-1);
+   //  }
+   //}
+#endif
+
+   int parallel;
+#ifdef TM_USE_MPI
+     parallel=1;
+#else
+     parallel=0;
+#endif
+
+   int ido=0;           //control of the action taken by reverse communications
+                        //set initially to zero
+
+   char *bmat=strdup("I");     /* Specifies that the right hand side matrix
+                                  should be the identity matrix; this makes
+                                  the problem a standard eigenvalue problem.
+                               */
+
+   //matrix dimensions 
+   int ldv,N,LDV;
+  
+   if(n==VOLUME) //full 
+     ldv= VOLUMEPLUSRAND;
+   else          //even-odd
+     ldv= VOLUMEPLUSRAND/2;
+
+   //dimesnions as complex variables
+   N  =12*n;       //dimension
+   LDV=12*ldv;   //leading dimension (including communication buffers)
+   
+
+   char *which_evals;
+
+   if(which==0)
+     which_evals=strdup("SR");
+   if(which==1)
+     which_evals=strdup("LR");
+   if(which==2)
+     which_evals=strdup("SM");
+   if(which==3)
+     which_evals=strdup("LM");
+   if(which==4)
+     which_evals=strdup("SI");
+   if(which==5)
+     which_evals=strdup("LI");
+
+
+    //check
+    if (which_evals == NULL ||
+            (  strcmp("SR", which_evals)
+            && strcmp("LR", which_evals)
+            && strcmp("SI", which_evals)
+            && strcmp("LI", which_evals)
+            && strcmp("SM", which_evals)
+            && strcmp("LM", which_evals)))
+    {
+        if(g_proc_id == g_stdio_proc)
+          {fprintf(stderr,"[eigenvalues_arpack] Error: invalid value for which_evals\n"); fflush(stderr); exit(1);}
+    }
+
+   //check input
+   if(nev>=N){
+       if(g_proc_id == g_stdio_proc)
+          {fprintf(stderr,"[eigenvalues_arpack] number of eigenvalues requested should be less than the size of the matrix.\n"); fflush(stderr); exit(1);}
+   }
+
+   if(ncv < (nev+1)){
+       if(g_proc_id == g_stdio_proc)
+          {fprintf(stderr,"[eigenvalues_arpack] search subspace must be larger than the number of requested eiegnvalues.\n"); fflush(stderr); exit(1);}
+   }
+
+   _Complex double *resid  = (_Complex double *) malloc(N*sizeof(_Complex double));
+   if(resid == NULL){
+    if(g_proc_id == g_stdio_proc)
+       { fprintf(stderr,"[eigenvalues_arpack] Error: not enough memory for resid in eigenvalues_arpack.\n"); fflush(stderr); exit(1);}
+   }
+
+
+   int *iparam = (int *) malloc(11*sizeof(int));
+   if(iparam == NULL){
+       if(g_proc_id == g_stdio_proc)
+       { fprintf(stderr,"[eigenvalues_arpack] Error: not enough memory for iparam in eigenvalues_arpack.\n"); fflush(stderr); exit(1);}
+   }
+
+
+   iparam[0]=1;  //use exact shifts
+
+   iparam[2]=maxiter;
+
+   iparam[3]=1;
+
+   iparam[6]=1;
+
+
+   int *ipntr  = (int *) malloc(14*sizeof(int));
+
+   _Complex double *workd  = (_Complex double *) malloc(3*N*sizeof(_Complex double)); 
+
+   int lworkl=(3*ncv*ncv+5*ncv)*2; //just allocate more space
+
+   _Complex double *workl=(_Complex double *) malloc(lworkl*sizeof(_Complex double));
+
+   double *rwork  = (double *) malloc(ncv*sizeof(double));
+
+   int rvec=1;       //always call the subroutine that computes orthonormal bais for the eigenvectors
+
+   char *howmany=strdup("P");   //always compute orthonormal basis
+
+   int *select = (int *) malloc(ncv*sizeof(int)); //since all Ritz vectors or Schur vectors are computed no need to initialize this array
+
+   _Complex double sigma;
+    
+   _Complex double *workev = (_Complex double *) malloc(2*ncv*sizeof(_Complex double));
+
+   double *sorted_evals = (double *) malloc(ncv*sizeof(double)); //will be used to sort the eigenvalues
+   int *sorted_evals_index = (int *) malloc(ncv*sizeof(int)); 
+
+   if((ipntr == NULL) || (workd==NULL) || (workl==NULL) || (rwork==NULL) || (select==NULL) || (workev==NULL) || (sorted_evals==NULL) || (sorted_evals_index==NULL)){
+       if(g_proc_id == g_stdio_proc)
+       { fprintf(stderr,"[eigenvalues_arpack] Error: not enough memory for ipntr,workd,workl,rwork,select,workev,sorted_evals,sorted_evals_index in eigenvalues_arpack.\n"); fflush(stderr); exit(1);}
+   }
+
+   double d1,d2,d3;
+
+   
+   (*info) = 0;                 //means use a random starting vector with Arnoldi
+
+   void *_x,*_ax,*_r,*_tmps1,*_tmps2;   //spinors that might be needed
+   spinor *x,*ax,*r,*tmps1,*tmps2;      //spinors that might be needed
+
+   #if (defined SSE || defined SSE2 || defined SSE3)
+   _x = malloc((ldv+ALIGN_BASE)*sizeof(spinor));
+   if(_x==NULL){
+       if(g_proc_id == g_stdio_proc)
+       { fprintf(stderr,"[eigenvalues_arpack] Error: not enough memory for _x in eigenvalues_arpack.\n"); fflush(stderr); exit(1);}
+   }
+   else
+      x  = (spinor *) ( ((unsigned long int)(_x)+ALIGN_BASE)&~ALIGN_BASE);
+
+
+   _ax = malloc((ldv+ALIGN_BASE)*sizeof(spinor));
+   if(_ax==NULL){
+       if(g_proc_id == g_stdio_proc)
+       { fprintf(stderr,"[eigenvalues_arpack] Error: not enough memory for _ax in eigenvalues_arpack.\n"); fflush(stderr); exit(1);}
+   }
+   else
+     ax  = (spinor *) ( ((unsigned long int)(_ax)+ALIGN_BASE)&~ALIGN_BASE);
+
+
+   _tmps1 = malloc((ldv+ALIGN_BASE)*sizeof(spinor));
+   if(_tmps1==NULL){
+       if(g_proc_id == g_stdio_proc)
+       { fprintf(stderr,"[eigenvalues_arpack] Error: not enough memory for _tmps1 in eigenvalues_arpack.\n"); fflush(stderr); exit(1);}
+   }
+   else
+     tmps1  = (spinor *) ( ((unsigned long int)(_tmps1)+ALIGN_BASE)&~ALIGN_BASE);
+
+
+   _tmps2 = malloc((ldv+ALIGN_BASE)*sizeof(spinor));
+   if(_tmps2==NULL){
+       if(g_proc_id == g_stdio_proc)
+       { fprintf(stderr,"[eigenvalues_arpack] Error: not enough memory for _tmps2 in eigenvalues_arpack.\n"); fflush(stderr); exit(1);}
+   }
+   else
+     tmps2  = (spinor *) ( ((unsigned long int)(_tmps2)+ALIGN_BASE)&~ALIGN_BASE);
+
+
+   #else
+
+   x  = (spinor *) calloc(ldv,sizeof(spinor));
+   ax = (spinor *) calloc(ldv,sizeof(spinor));
+   tmps1  = (spinor *) calloc(ldv,sizeof(spinor));
+   tmps2 = (spinor *) calloc(ldv,sizeof(spinor));
+   if((x==NULL) || (ax==NULL) || (tmps1==NULL) || (tmps2==NULL)){
+       if(g_proc_id == g_stdio_proc)
+       { fprintf(stderr,"[eigenvalues_arpack] Error: not enough memory in x, ax, tmps1, tmps2 in eigenvalues_arpack.\n"); fflush(stderr); exit(1);}
+   }
+
+   #endif
+
+   int i,j;
+
+   /* Code added to print the log of ARPACK */
+   //const char *arpack_logfile;
+   int arpack_log_u = 9999;
+
+#ifndef TM_USE_MPI
+   //sprintf(arpack_logfile,"ARPACK_output.log");
+   if ( NULL != arpack_logfile ) {
+     /* correctness of this code depends on alignment in Fortran and C 
+	being the same ; if you observe crashes, disable this part */
+     _AFT(initlog)(&arpack_log_u, arpack_logfile, strlen(arpack_logfile));
+     int msglvl0 = 0,
+       msglvl1 = 1,
+       msglvl2 = 2,
+       msglvl3 = 3;
+     _AFT(mcinitdebug)(
+		  &arpack_log_u,      /*logfil*/
+		  &msglvl3,           /*mcaupd*/
+		  &msglvl3,           /*mcaup2*/
+		  &msglvl0,           /*mcaitr*/
+		  &msglvl3,           /*mceigh*/
+		  &msglvl0,           /*mcapps*/
+		  &msglvl0,           /*mcgets*/
+		  &msglvl3            /*mceupd*/);
+     
+     fprintf(stdout,"# [arpack] *** ARPACK verbosity set to mcaup2=3 mcaupd=3 mceupd=3; \n"
+	     "# [arpack] *** output is directed to '%s';\n"
+	     "# [arpack] *** if you don't see output, your memory may be corrupted\n",
+	     arpack_logfile);
+  }
+#else
+   //if( g_proc_id == g_stdio_proc ){
+   //  sprintf(arpack_logfile,"ARPACK_output.log");
+   //}
+   if ( NULL != arpack_logfile 
+	&& (g_proc_id == g_stdio_proc) ) {
+     /* correctness of this code depends on alignment in Fortran and C 
+	being the same ; if you observe crashes, disable this part */
+     _AFT(initlog)(&arpack_log_u, arpack_logfile, strlen(arpack_logfile));
+     int msglvl0 = 0,
+       msglvl1 = 1,
+       msglvl2 = 2,
+       msglvl3 = 3;
+     _AFT(pmcinitdebug)(
+		   &arpack_log_u,      /*logfil*/
+		   &msglvl3,           /*mcaupd*/
+		   &msglvl3,           /*mcaup2*/
+		   &msglvl0,           /*mcaitr*/
+		   &msglvl3,           /*mceigh*/
+		   &msglvl0,           /*mcapps*/
+		   &msglvl0,           /*mcgets*/
+		   &msglvl3            /*mceupd*/);
+    
+     fprintf(stdout,"# [arpack] *** ARPACK verbosity set to mcaup2=3 mcaupd=3 mceupd=3; \n"
+	    "# [arpack] *** output is directed to '%s';\n"
+	    "# [arpack] *** if you don't see output, your memory may be corrupted\n",
+	    arpack_logfile);
+   }
+#endif   
+
+
+
+
+   /*
+     M A I N   L O O P (Reverse communication)  
+   */
+
+   do
+   {
+
+#ifndef TM_USE_MPI 
+      _AFT(znaupd)(&ido,"I", &N, which_evals, &nev, &tol,resid, &ncv,
+                   v, &N, iparam, ipntr, workd, 
+                  workl, &lworkl,rwork,info,1,2);
+#else
+      _AFT(pznaupd)(&mpi_comm_f, &ido,"I", &N, which_evals, &nev, &tol, resid, &ncv,
+                   v, &N, iparam, ipntr, workd, 
+                   workl, &lworkl,rwork,info,1,2);
+#endif
+
+      if (ido == 99 || (*info) == 1)
+            break;
+
+      if ((ido==-1)||(ido==1)){
+
+         assign_complex_to_spinor(x,workd+ipntr[0]-1,N);
+         if((use_acc==0))
+           av(ax,x);
+         else 
+           cheb_poly_op(ax,x,av,n,amin,amax,cheb_k,tmps1,tmps2);
+
+         assign_spinor_to_complex(workd+ipntr[1]-1, ax, n);
+      }
+   } while (ido != 99);
+   
+/*
+ Check for convergence 
+*/
+     if ( (*info) < 0 ) 
+     {
+         if(g_proc_id == g_stdio_proc){
+            fprintf(stderr,"[eigenvalues_arpack] Error with _naupd, info = %d\n", *info);
+            fprintf(stderr,"[eigenvalues_arpack] Check the documentation of _naupd\n");}
+     }
+     else 
+     {
+        (*nconv) = iparam[4];
+        if(g_proc_id == g_stdio_proc){
+          fprintf(stderr,"[eigenvalues_arpack] number of converged eigenvectors = %d\n", *nconv);}
+
+        //compute eigenvectors 
+#ifndef TM_USE_MPI
+        _AFT(zneupd) (&rvec,"P", select,evals,v,&N,&sigma, 
+                     workev,"I",&N,which_evals,&nev,&tol,resid,&ncv, 
+                     v,&N,iparam,ipntr,workd,workl,&lworkl, 
+                     rwork,info,1,1,2);
+#else
+        _AFT(pzneupd) (&mpi_comm_f,&rvec,"P", select,evals, v,&N,&sigma, 
+                       workev,"I",&N,which_evals,&nev,&tol, resid,&ncv, 
+                       v,&N,iparam,ipntr,workd,workl,&lworkl, 
+                       rwork,info,1,1,2);
+#endif
+
+
+        if( (*info)!=0) 
+        {
+           if(g_proc_id == g_stdio_proc){
+             fprintf(stderr,"[eigenvalues_arpack] Error with _neupd, info = %d \n",(*info));
+             fprintf(stderr,"[eigenvalues_arpack] Check the documentation of _neupd. \n");}
+        }
+        else //report eiegnvalues and their residuals
+        {
+             if(g_proc_id == g_stdio_proc){
+               fprintf(stdout,"[eigenvalues_arpack] Ritz Values and their errors\n");
+               fprintf(stdout,"[eigenvalues_arpack] ============================\n");
+             }
+
+             (*nconv) = iparam[4];
+             for(j=0; j< (*nconv); j++)
+             {
+               /* print out the computed ritz values and their error estimates */
+               if(g_proc_id == g_stdio_proc)
+                  fprintf(stdout,"# [eigenvalues_arpack] RitzValue[%06d]  %+e  %+e  error= %+e \n",j,creal(evals[j]),cimag(evals[j]),cabs(*(workl+ipntr[10]-1+j)));
+               sorted_evals_index[j] = j;
+               sorted_evals[j] = cabs(evals[j]);
+             }
+
+             //SORT THE EIGENVALUES in ascending order based on their absolute value
+             quicksort((*nconv),sorted_evals,sorted_evals_index);
+             //Print sorted evals
+             if(g_proc_id == g_stdio_proc)
+                fprintf(stdout,"# [eigenvalues_arpack] Sorted eigenvalues based on their absolute values\n");
+
+             for(j=0; j< (*nconv); j++)
+             {
+               /* print out the computed ritz values and their error estimates */
+               if(g_proc_id == g_stdio_proc)
+                  fprintf(stdout,"# [eigenvalues_arpack] RitzValue[%06d]  %+e  %+e  error= %+e \n",j,creal(evals[sorted_evals_index[j]]),cimag(evals[sorted_evals_index[j]]),cabs(*(workl+ipntr[10]-1+sorted_evals_index[j])));
+
+             }
+
+        }
+
+        /*Print additional convergence information.*/
+        if( (*info)==1)
+        {
+           if(g_proc_id == g_stdio_proc)
+             fprintf(stderr,"[eigenvalues_arpack] Maximum number of iterations reached.\n");
+        }
+        else
+        { 
+          
+           if(g_proc_id == g_stdio_proc)
+           {
+              if((*info)==3)
+              {  
+                 fprintf(stderr,"[eigenvalues_arpack] No shifts could be applied during implicit\n");
+                 fprintf(stderr,"[eigenvalues_arpack] Arnoldi update, try increasing NCV.\n");
+              }
+         
+              fprintf(stdout,"# [eigenvalues_arpack] _NDRV1\n");
+              fprintf(stdout,"# [eigenvalues_arpack] =======\n");
+              fprintf(stdout,"# [eigenvalues_arpack] Size of the matrix is %d\n", N);
+              fprintf(stdout,"# [eigenvalues_arpack] The number of Ritz values requested is %d\n", nev);
+              fprintf(stdout,"# [eigenvalues_arpack] The number of Arnoldi vectors generated is %d\n", ncv);
+              fprintf(stdout,"# [eigenvalues_arpack] What portion of the spectrum: %s\n", which_evals);
+              fprintf(stdout,"# [eigenvalues_arpack] The number of converged Ritz values is %d\n", (*nconv) ); 
+              fprintf(stdout,"# [eigenvalues_arpack] The number of Implicit Arnoldi update iterations taken is %d\n", iparam[2]);
+              fprintf(stdout,"# [eigenvalues_arpack] The number of OP*x is %d\n", iparam[8]);
+              fprintf(stdout,"# [eigenvalues_arpack] The convergence criterion is %+e\n", tol);
+           }
+          
+        }
+     }  //if(info < 0) else part
+
+
+#ifndef TM_USE_MPI
+     if (NULL != arpack_logfile)
+       _AFT(finilog)(&arpack_log_u);
+#else
+     if(g_proc_id == g_stdio_proc){
+       if (NULL != arpack_logfile){
+	 _AFT(finilog)(&arpack_log_u);
+       }
+     }
+#endif     
+
+
+     //free memory
+     free(resid);
+     free(iparam);
+     free(ipntr);
+     free(workd);
+     //free(zv);
+     free(select);
+     free(workev);
+     #if ( (defined SSE) || (defined SSE2) || (defined SSE3) )
+     free(_x); free(_ax); free(_tmps1); free(_tmps2);
+     #else
+     free(x); free(ax); free(tmps1); free(tmps2);
+     #endif
+
+     return;
+}
+
+
+
+
+
diff --git a/solver/eigenvalues_arpack.h b/solver/eigenvalues_arpack.h
new file mode 100644
index 000000000..10c82b9dc
--- /dev/null
+++ b/solver/eigenvalues_arpack.h
@@ -0,0 +1,74 @@
+/************************************************************************* 
+ * Interface for solving the eigenvalue problem A*x=lambda*x for a complex 
+ * matrix A using ARPACK. The matrix is accessed through a matrix-vector 
+ * multiplication function.
+ *
+ * Author: A.M. Abdel-Rehim, 2014
+ *
+ * For reference see the driver programs zndrv1 in the EXAMPLES 
+ * subdriectories of ARPACK and PARPACK and the documentation.
+ *
+ * This file is part of tmLQCD software suite
+ ***********************************************************************/
+
+#ifndef _EIGENVALUES_ARPACK
+#define _EIGENVALUES_ARPACK
+
+#include "su3.h"
+#include "solver/matrix_mult_typedef.h"
+#include "linalg/arpack.h"
+/* #include "memalloc.h" */
+#include "solver/precon.h"
+
+//evals_arpack(N,nev,ncv,kind,acc,cheb_k,emin,emax,evals,evecs,arpack_eig_tol,arpack_eig_maxiter,f,&info_arpack,&nconv,arpack_logfile);
+
+/*compute nev eigenvectors using ARPACK and PARPACK*/
+void evals_arpack(
+  int n, 
+  int nev, 
+  int ncv, 
+  int which,
+  int use_acc,
+  int cheb_k,
+  double amin,
+  double amax,
+  _Complex double *evals, 
+  _Complex double *v,
+  double tol, 
+  int maxiter, 
+  matrix_mult av, 
+  int *info, 
+  int *nconv,
+  char *arpack_logfile);
+/*
+  compute nev eigenvectors using ARPACK and PARPACK
+  n     : (IN) size of the local lattice
+  nev   : (IN) number of eigenvectors requested.
+  ncv   : (IN) size of the subspace used to compute eigenvectors (nev+1) =< ncv < 12*n
+          where 12n is the size of the matrix under consideration
+  which : (IN) which eigenvectors to compute. Choices are:
+          0: smallest real part "SR"
+          1: largest real part "LR"
+          2: smallest absolute value "SM"
+          3: largest absolute value "LM"
+          4: smallest imaginary part "SI"
+          5: largest imaginary part "LI"
+  use_acc: (IN) specify the polynomial acceleration mode
+                0 no acceleration
+                1 use acceleration by computing the eigenvectors of a shifted-normalized chebyshev polynomial
+  cheb_k   : (IN) degree of the chebyshev polynomial to be used for acceleration (irrelevant when use_acc=0 and init_resid_arpack=0)
+  amin,amax: (IN) bounds of the interval [amin,amax] for the acceleration polynomial (irrelevant when use_acc=0 and init_resid_arpack=0)
+  evals : (OUT) array of size nev+1 which has the computed nev Ritz values
+  v     : computed eigenvectors. Size is n*ncv spinors.
+  tol    : Requested tolerance for the accuracy of the computed eigenvectors.
+           A value of 0 means machine precision.
+  maxiter: maximum number of restarts (iterations) allowed to be used by ARPACK
+  av     : operator for computing the action of the matrix on the vector
+           av(vout,vin) where vout is output spinor and vin is input spinors.
+  info   : output from arpack. 0 means that it converged to the desired tolerance. 
+           otherwise, an error message is printed to stderr 
+  nconv  : actual number of converged eigenvectors.
+  arpack_logfile: name for the logfile to be used by arpack
+*/ 
+
+#endif
diff --git a/solver/precon.c b/solver/precon.c
new file mode 100644
index 000000000..2acc2e996
--- /dev/null
+++ b/solver/precon.c
@@ -0,0 +1,125 @@
+/*********************************************************************** 
+ * Chebyshev polynomial preconditioning related functions.
+ * Note: there also other related functions in poly_precon.h files
+ *       that was developed earlier by Carsten
+ *
+ * Author: A.M. Abdel-Rehim, 2014
+ *
+ * This file is part of tmLQCD software suite
+ ***********************************************************************/
+#include <stdio.h>
+#include "global.h"
+#include "solver/precon.h"
+
+
+void cheb_poly_op(spinor * const R, spinor * const S, matrix_mult f, const int N, const double a, const double b, const int k, spinor *v1, spinor *v2)
+{
+
+   double delta,theta;
+   double sigma,sigma1,sigma_old;
+   double d1,d2,d3;
+
+
+   if(k < 0){ //check the order of the requested polynomial
+      if(g_proc_id == g_stdio_proc)
+        fprintf(stderr,"Error: lowest allowed order of the Chebyshev polynomial is 0.\n");
+        exit(1);
+   }
+
+   delta = (b-a)/2.0;
+   theta = (b+a)/2.0;
+
+   sigma1 = -delta/theta;
+
+   //T_0(Q)=1 
+   assign(R,S,N);
+   if(k== 0){
+      return;
+   }
+
+
+   //T_1(Q) = [2/(b-a)*Q - (b+a)/(b-a)]*sigma_1 
+   d1 =  sigma1/delta;
+   d2 =  1.0;
+   f(R,S); //R=Q(S)
+   assign_mul_add_mul_r(R,S,d1,d2,N);
+   if(k==1){
+     return;
+   }
+   
+   //degree >=2
+   //==========
+
+   //T_0 = S
+   //T_1 = R
+
+
+   int LDN;
+   if(N==VOLUME)
+      LDN = VOLUMEPLUSRAND;
+   else
+      LDN = VOLUMEPLUSRAND/2;
+
+   assign(v1,S,N);
+   assign(v2,R,N);
+
+   sigma_old = sigma1;
+
+   for(int i=2; i <= k; i++)
+   {
+      sigma = 1.0/(2.0/sigma1-sigma_old);
+      
+      d1 = 2.0*sigma/delta;
+      d2 = -d1*theta;
+      d3 = -sigma*sigma_old;
+
+      f(R,v2);
+      assign_mul_add_mul_add_mul_r(R,v2,v1,d1,d2,d3,N);
+
+      assign(v1,v2,N);
+      assign(v2,R,N);
+
+      sigma_old  = sigma;
+   }
+
+   return;
+
+}
+
+void cheb_poly_roots(_Complex double *roots, const int k, const double a, const double b)
+/*
+roots of the shifted Chebyshev polynomial in the interval [a,b]
+The roots of C_k(d(x)) and are given by
+x_l = (b-a)/2*[cos(pi/2 (2*l-1)/k)+(b+a)/(b-a)]
+*/
+{
+   double PI=3.141592653589793;
+
+   double d1,d2,d3;
+
+   d1=0.5*(b+a);
+   d2=0.5*(b-a);
+   d3=PI/(double)k;
+
+   if(k < 1){ //check the order of the requested polynomial
+      if(g_proc_id == g_stdio_proc)
+        fprintf(stderr,"Error: lowest allowed order for roots of Chebyshev polynomial is 1.\n");
+        exit(1);
+   }
+
+
+
+
+   int i;
+
+   for(i=1; i<=k; i++)
+      roots[i-1] = d1+d2*cos(d3*(i-0.5));
+
+   
+   return ;
+}
+
+
+
+
+
diff --git a/solver/precon.h b/solver/precon.h
new file mode 100644
index 000000000..5167d23a6
--- /dev/null
+++ b/solver/precon.h
@@ -0,0 +1,39 @@
+/*********************************************************************** 
+ * preconditioning related functions.
+ * Note: there exist already polynomial preconditioning operators
+ *       related to the chebychev polynomials in poly_precon.h files
+ *       that was developed earlier by Carsten
+ *
+ * Author: A.M. Abdel-Rehim, 2014
+ *
+ * This file is part of tmLQCD software suite
+ ***********************************************************************/
+
+#ifndef _PRECON_H
+#define _PRECON_H
+
+#include "su3.h"
+#include "solver/matrix_mult_typedef.h"
+#include "linalg_eo.h"
+/* #include "memalloc.h" */
+#include "start.h"
+
+void cheb_poly_op(spinor * const R, spinor * const S, matrix_mult f, const int N, const double a, const double b, const int k, spinor *v1, spinor *v2);
+/*
+ Chebyshev polynomial of the oprator f in the interval [a,b] normalized to 1 at 0.
+ R: output spinor
+ S: input spinor
+ f: matrix-vector multiplication for the operator
+ a,b: limits of the interval with b>a
+ k: degree of the polynomial
+ v1 and v2 are spinors needed to be used in the recurrence relation
+*/
+
+
+void cheb_poly_roots(_Complex double *roots, const int k, const double a, const double b);
+/*
+roots of the shifted Chebyshev polynomial of degree k in the interval [a,b]
+*/
+
+
+#endif
diff --git a/solver/solver_params.h b/solver/solver_params.h
index c51207445..e94a7bac9 100644
--- a/solver/solver_params.h
+++ b/solver/solver_params.h
@@ -32,6 +32,14 @@
 
 typedef struct {
 
+  int maxiter;
+  int rel_prec;
+  int nrhs;
+  int nrhs1;
+  double eps_sq;
+  double eps_sq1;
+  double res_eps_sq;
+
   /********************************
    * Incremental EigCG parameters
    ********************************/
@@ -56,6 +64,49 @@ typedef struct {
      where the maximum is over the iterated residuals since the last update */  
   float mcg_delta; 
 
+  /**********************************
+   * arpack_cg parameters
+   **********************************/
+
+  int    arpackcg_nrhs;          /*Number of right-hand sides to be solved*/
+  int    arpackcg_nrhs1;         /*First number of right-hand sides to be solved using tolerance eps_sq1*/
+  double arpackcg_eps_sq1;       /*Squared tolerance of convergence of the linear system for systems 1 till nrhs1*/
+  double arpackcg_eps_sq;        /*Squared tolerance of convergence of the linear system for systems nrhs1+1 till nrhs*/
+  double arpackcg_res_eps_sq;    /*Suqared tolerance for restarting cg */
+  int    arpackcg_nev;           /*Number of eigenvectors to be computed by arpack*/
+  int    arpackcg_ncv;           /*Size of the subspace used by arpack with the condition (nev+1) =< ncv */
+  int    arpackcg_evals_kind;    /* type of eigenvalues to be computed 
+                                    0 eigenvalues of smallest real part
+                                    1 eigenvalues of largest real part 
+                                    2 eigenvalues of smallest absolute value
+                                    3 eigenvalues of largest absolute value*/
+  int    arpackcg_comp_evecs;    /* 0 don't compute the resiudals of the eigenvalues
+                                    1 compute the residulas of the eigenvalues*/
+  double arpackcg_eig_tol;       /*Tolerance for computing eigenvalues with arpack */
+  int    arpackcg_eig_maxiter;   /*Maximum number of iterations to be used by arpack*/
+  char   arpack_logfile[500];    /*file name for the logfile used by arpack*/
+
+  /*************************************
+   *chebychev polynomila preconditioner
+   *for CG related paprameters
+   *************************************/
+  int use_acc; /* type of acceleration to be used:  */
+               /* 0 no acceleration, 1 compute eiegnvectors of the acceleration polynomial,   */
+  int cheb_k; /* order of the polynomial used is k+1 and the lowest value is k=-1 which correspond to T_0 */
+  double op_evmin;  /* lowest boundary of the interval for the polynomial acceleration */
+  double op_evmax;  /* highest boundary for the interval for the polynomial acceleration */
+
+  /*************************************
+   * parameters to control of I/O for
+   * eigenvectors
+   *************************************/
+  int arpackcg_write_ev;
+  int arpackcg_read_ev;
+  char arpack_evecs_filename[500];    /* file name for the evecs used by arpack*/
+  char arpack_evecs_fileformat[200];  /* file format for evecs used by arpack */
+  int projection_type;
+
+  int arpack_evecs_writeprec;
   
 } solver_params_t;
 
diff --git a/wrapper/lib_wrapper.c b/wrapper/lib_wrapper.c
index 73dd86138..caa6713ee 100644
--- a/wrapper/lib_wrapper.c
+++ b/wrapper/lib_wrapper.c
@@ -55,8 +55,13 @@
 #include "invert_eo.h"
 #include "start.h"
 #include "operator.h"
+#include "operator/clovertm_operators.h"
+#include "operator/clovertm_operators_32.h"
+#include "operator/clover_leaf.h"
+#include "solver/solver_types.h"
 #include "measure_gauge_action.h"
 #include "linalg/convert_eo_to_lexic.h"
+#include "operator/Hopping_Matrix.h"
 #include "include/tmLQCD.h"
 #include "fatal_error.h"
 

From 6886593688d3907337d15a9037800b3b13192da8 Mon Sep 17 00:00:00 2001
From: Marcus Petschlies <marcus.petschlies@web.de>
Date: Thu, 12 Jan 2017 14:15:49 +0100
Subject: [PATCH 8/8] added duplication of g_cart_grid to
 tmLQCD_mpi_params.cart_grid

---
 wrapper/lib_wrapper.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/wrapper/lib_wrapper.c b/wrapper/lib_wrapper.c
index caa6713ee..28caaba75 100644
--- a/wrapper/lib_wrapper.c
+++ b/wrapper/lib_wrapper.c
@@ -354,6 +354,9 @@ int tmLQCD_get_mpi_params(tmLQCD_mpi_params * params) {
   params->proc_coords[1] = g_proc_coords[1];
   params->proc_coords[2] = g_proc_coords[2];
   params->proc_coords[3] = g_proc_coords[3];
+#ifdef TM_USE_MPI
+  MPI_Comm_dup(g_cart_grid, &(params->cart_grid) );
+#endif
 
   return(0);
 }