diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1 @@
+
diff --git a/Makefile b/Makefile
index 3ac4930..37fc0eb 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,9 @@
 
 lib.name = earplug~
 
-class.sources = earplug~.c
+cflags = -I./conv
+common.sources = conv/convolution.c conv/fft-dif.c conv/ift-dif.c
+class.sources = earplug~.c 
 
 datafiles = earplug~-help.pd earplug~-meta.pd earplug_data_src.txt \
             parse-to-h.pl README.txt LICENSE.txt
diff --git a/conv/Makefile b/conv/Makefile
new file mode 100644
index 0000000..fda2928
--- /dev/null
+++ b/conv/Makefile
@@ -0,0 +1,14 @@
+PROGRAM = convolution_test
+OBJS = convolution.o test.o fft-dif.o ift-dif.o
+
+.SUFFIXES: .c .o
+
+$(PROGRAM): $(OBJS)
+	$(CC) -o $(PROGRAM) $^
+
+.c.o:
+	$(CC) $(CFLAGS) -c $<
+
+.PHONY: clean
+clean:
+	$(RM) $(PROGRAM) $(OBJS)
diff --git a/conv/convolution.c b/conv/convolution.c
new file mode 100644
index 0000000..e1e5247
--- /dev/null
+++ b/conv/convolution.c
@@ -0,0 +1,45 @@
+#include "convolution.h"
+#include "fft.h"
+#include "ift.h"
+
+void copyToComplex(float* buffer, int size, float complex* complexBuffer)
+{
+	for(int i = 0; i < size; ++i)
+		complexBuffer[i] = buffer[i];
+}
+
+void copyFromComplex(float complex* complexBuffer, int size, float* buffer)
+{
+	for(int i = 0; i < size; ++i)
+		buffer[i] = creal(complexBuffer[i]);
+}
+
+void convolve(float* sample, int size, float* ir, float* result)
+{
+	// fft ir filter
+	float complex complexIr[128] = {0.f};
+	copyToComplex(ir, 128, complexIr);
+	fft(complexIr, 128);
+
+	float complex complexResult[128] = {0.f};
+	int remainingSamples = size;
+
+	do
+	{
+		float complex complexSample[128] = {0.f};
+		int samplesToBeCopied = remainingSamples > 128 ? 128 : remainingSamples; 
+		copyToComplex(sample, samplesToBeCopied, complexSample);
+		fft(complexSample, 128);
+
+		for(int i = 0; i < 128; ++i)
+    	    complexResult[i] = complexSample[i] * complexIr[i];
+
+	    ift(complexResult, 128);
+    	copyFromComplex(complexResult, samplesToBeCopied, result);
+
+		remainingSamples -= 128;
+		sample += 128;
+		result += 128;
+	}while(remainingSamples > 0);
+
+}
diff --git a/conv/convolution.h b/conv/convolution.h
new file mode 100644
index 0000000..22c8c54
--- /dev/null
+++ b/conv/convolution.h
@@ -0,0 +1,3 @@
+#include <complex.h>
+
+void convolve(float* sample, int size, float* ir, float* result);
diff --git a/conv/convolution_test b/conv/convolution_test
new file mode 100755
index 0000000..3c7547f
Binary files /dev/null and b/conv/convolution_test differ
diff --git a/conv/fft-dif.c b/conv/fft-dif.c
new file mode 100644
index 0000000..fc94866
--- /dev/null
+++ b/conv/fft-dif.c
@@ -0,0 +1,97 @@
+#include "fft.h"
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+static int ctz(size_t N)
+{
+	int ctz1 = 0;
+
+	while( N ) {
+		ctz1++;
+		N >>= 1;
+	}
+
+	return ctz1-1;
+}
+
+static void nop_split(const float complex *x, float complex *X, size_t N)
+{
+	for(size_t n = 0; n < N/2; n++) {
+		X[2*n+0] = x[0/2+n];
+		X[2*n+1] = x[N/2+n];
+	}
+}
+
+static void fft_split(const float complex *x, float complex *X, size_t N, float complex phi)
+{
+	for(size_t n = 0; n < N/2; n++) {
+		X[2*n+0] = (x[0/2+n] + x[N/2+n]);
+		X[2*n+1] = (x[0/2+n] - x[N/2+n]) * cexpf(-2*(float)M_PI*I*phi);
+	}
+}
+
+static size_t revbits(size_t v, int J)
+{
+	size_t r = 0;
+
+	for(int j = 0; j < J; j++) {
+		r |= ( (v>>j)&1 ) << (J-1-j);
+	}
+
+	return r;
+}
+
+static int nop_reverse(int b, float complex *buffers[2], size_t N)
+{
+	int J = ctz(N);
+
+	for(int j = J-2; j >= 0; j--, b++) {
+		size_t delta = N>>j;
+
+		for(size_t n = 0; n < N; n += delta) {
+			nop_split(buffers[b&1]+n, buffers[~b&1]+n, delta);
+		}
+	}
+
+	return b;
+}
+
+static int fft_reverse(int b, float complex *buffers[2], size_t N)
+{
+	int J = ctz(N);
+
+	for(int j = J-1; j >= 0; j--, b++) {
+		size_t delta = N>>j;
+
+		for(size_t n = 0; n < N; n += delta) {
+			float complex phi = (float)revbits(n/delta, j) / (float)(2<<j);
+			fft_split(buffers[b&1]+n, buffers[~b&1]+n, delta, phi);
+		}
+	}
+
+	return b;
+}
+
+int fft(float complex *vector, size_t N)
+{
+	if( !N ) return 0;
+
+	if( N & (N-1) ) return 1;
+
+	float complex *buffers[2] = { vector, malloc(N*sizeof(float complex)) };
+
+	if( !buffers[1] ) return -1;
+
+	int b = 0;
+
+	b = nop_reverse(b, buffers, N);
+	b = fft_reverse(b, buffers, N);
+	b = nop_reverse(b, buffers, N);
+
+	memmove(vector, buffers[b&1], N*sizeof(float complex));
+
+	free( buffers[1] );
+
+	return 0;
+}
diff --git a/conv/fft.h b/conv/fft.h
new file mode 100644
index 0000000..02d92bd
--- /dev/null
+++ b/conv/fft.h
@@ -0,0 +1,20 @@
+#ifndef FFT_H
+#define FFT_H
+
+#include <complex.h>
+#include <stddef.h>
+
+/**
+ * @brief FFT algorithm (forward transform)
+ *
+ * This function computes forward radix-2 fast Fourier transform (FFT).
+ * The output is written in-place over the input.
+ *
+ * @param vector An array of @p N complex values in single-precision floating-point format.
+ * @param N The size of the transform must be a power of two.
+ *
+ * @return Zero for success.
+ */
+int fft(float complex *vector, size_t N);
+
+#endif
diff --git a/conv/ift-dif.c b/conv/ift-dif.c
new file mode 100644
index 0000000..39d06db
--- /dev/null
+++ b/conv/ift-dif.c
@@ -0,0 +1,97 @@
+#include "ift.h"
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+static int ctz(size_t N)
+{
+	int ctz1 = 0;
+
+	while( N ) {
+		ctz1++;
+		N >>= 1;
+	}
+
+	return ctz1-1;
+}
+
+static void nop_split(const float complex *x, float complex *X, size_t N)
+{
+	for(size_t n = 0; n < N/2; n++) {
+		X[2*n+0] = x[0/2+n];
+		X[2*n+1] = x[N/2+n];
+	}
+}
+
+static void fft_split(const float complex *x, float complex *X, size_t N, float complex phi)
+{
+	for(size_t n = 0; n < N/2; n++) {
+		X[2*n+0] = (x[0/2+n] + x[N/2+n])/2;
+		X[2*n+1] = (x[0/2+n] - x[N/2+n])/2 * cexpf(+2*(float)M_PI*I*phi);
+	}
+}
+
+static size_t revbits(size_t v, int J)
+{
+	size_t r = 0;
+
+	for(int j = 0; j < J; j++) {
+		r |= ( (v>>j)&1 ) << (J-1-j);
+	}
+
+	return r;
+}
+
+static int nop_reverse(int b, float complex *buffers[2], size_t N)
+{
+	int J = ctz(N);
+
+	for(int j = J-2; j >= 0; j--, b++) {
+		size_t delta = N>>j;
+
+		for(size_t n = 0; n < N; n += delta) {
+			nop_split(buffers[b&1]+n, buffers[~b&1]+n, delta);
+		}
+	}
+
+	return b;
+}
+
+static int fft_reverse(int b, float complex *buffers[2], size_t N)
+{
+	int J = ctz(N);
+
+	for(int j = J-1; j >= 0; j--, b++) {
+		size_t delta = N>>j;
+
+		for(size_t n = 0; n < N; n += delta) {
+			float complex phi = (float)revbits(n/delta, j) / (float)(2<<j);
+			fft_split(buffers[b&1]+n, buffers[~b&1]+n, delta, phi);
+		}
+	}
+
+	return b;
+}
+
+int ift(float complex *vector, size_t N)
+{
+	if( !N ) return 0;
+
+	if( N & (N-1) ) return 1;
+
+	float complex *buffers[2] = { vector, malloc(N*sizeof(float complex)) };
+
+	if( !buffers[1] ) return -1;
+
+	int b = 0;
+
+	b = nop_reverse(b, buffers, N);
+	b = fft_reverse(b, buffers, N);
+	b = nop_reverse(b, buffers, N);
+
+	memmove(vector, buffers[b&1], N*sizeof(float complex));
+
+	free( buffers[1] );
+
+	return 0;
+}
diff --git a/conv/ift.h b/conv/ift.h
new file mode 100644
index 0000000..d00415d
--- /dev/null
+++ b/conv/ift.h
@@ -0,0 +1,20 @@
+#ifndef IFT_H
+#define IFT_H
+
+#include <complex.h>
+#include <stddef.h>
+
+/**
+ * @brief FFT algorithm (inverse transform)
+ *
+ * This function computes inverse radix-2 fast Fourier transform (FFT).
+ * The output is written in-place over the input.
+ *
+ * @param vector An array of @p N complex values in single-precision floating-point format.
+ * @param N The size of the transform must be a power of two.
+ *
+ * @return Zero for success.
+ */
+int ift(float complex *vector, size_t N);
+
+#endif
diff --git a/conv/test.c b/conv/test.c
new file mode 100644
index 0000000..6465b4e
--- /dev/null
+++ b/conv/test.c
@@ -0,0 +1,61 @@
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include "convolution.h"
+
+void assert_eq(float expected, float actual)
+{
+	float dif = fabsf(expected -actual);
+	assert(dif < 0.001f);
+}
+
+int main()
+{
+	float sample64[64] = {0.0f};
+	float sample128[128] = {0.0f};
+	float sample512[512] = {0.0f};
+	float sample2048[2048] = {0.0f};
+
+	float result64[64] = {0.0f};
+	float result128[128] = {0.0f};
+	float result512[512] = {0.0f};
+	float result2048[2048] = {0.0f};
+
+	float ir[128] = {0.0f};
+
+	// the second last sample have dirac
+	sample64[62] = 1.f;
+	sample128[126] = 1.f;
+	sample512[510] = 1.f;
+	sample2048[2046] = 1.f;
+
+	ir[1] = 0.3f; // generate one sample delay + attenuation
+
+	//expect convolution result in buffers with one inpulse (amp = 0.3) at the end of the buffer
+
+	convolve(sample64, 64, ir, result64);
+	for(int i = 0; i < 63; ++i)
+		assert_eq(result64[i], 0.0f);
+	assert_eq(result64[63], 0.3f);
+	puts("block size 64: OK");
+
+	convolve(sample128, 128, ir, result128);
+	for(int i = 0; i < 127; ++i)
+		assert_eq(result128[i], 0.0f);
+	assert_eq(result128[127], 0.3f);
+	puts("block size 128: OK");
+
+	convolve(sample512, 512, ir, result512);
+	for(int i = 0; i < 511; ++i)
+		assert_eq(result512[i], 0.0f);
+	assert_eq(result512[511], 0.3f);
+	puts("block size 512: OK");
+
+	convolve(sample2048, 2048, ir, result2048);
+	for(int i = 0; i < 2047; ++i)
+		assert_eq(result2048[i], 0.0f);
+	assert_eq(result2048[2047], 0.3f);
+	puts("block size 2048: OK");
+
+	return 0;
+}
\ No newline at end of file
diff --git a/earplug~.c b/earplug~.c
index 0714d67..9ebdd15 100644
--- a/earplug~.c
+++ b/earplug~.c
@@ -5,10 +5,13 @@
 /* Revised in spring 2009 by Hans-Christoph Steiner to compile in the data file */
 /* Updated in fall 2020 by Dan Wilcox */
 
-#include "earplug~.h"
 #include <math.h>
 #include <string.h>
 #include <errno.h>
+#include <complex.h>
+
+#include "earplug~.h"
+#include "convolution.h"
 
 #define VERSION "0.2.2"
 
@@ -36,6 +39,9 @@ typedef struct _earplug
     t_float azi;
     t_float ele;
      
+    t_float inputBuffer[8192];
+    t_float outputBuffer[2][8192];
+
     t_float crossCoef[8192];
     t_float azimScale[13];
     unsigned int azimOffset[13];
@@ -48,6 +54,12 @@ typedef struct _earplug
     int bufferPin;
 } t_earplug;
 
+void copy(float* source, float* dist, int size)
+{
+    for(int i = 0; i < size; ++i)
+        *dist++ = *source++;
+}
+
 static t_int *earplug_perform(t_int *w)
 {
     t_earplug *x = (t_earplug *)(w[1]);
@@ -143,39 +155,38 @@ static t_int *earplug_perform(t_int *w)
         }
     }
 
-    float inSample;
-    float convSum[2]; /* to accumulate the sum during convolution */
-    int blockScale = 8192 / blocksize;
+    convolve(in, blocksize, x->currentImpulse[0], left_out);
+    convolve(in, blocksize, x->currentImpulse[1], right_out);
 
     /* convolve the interpolated HRIRs (left and right) with the input signal */
-    while (blocksize--)
-    {
-        convSum[0] = 0; 
-        convSum[1] = 0; 
-
-        inSample = *(in++);
-
-        x->convBuffer[x->bufferPin] = inSample;
-        unsigned scaledBlocksize = blocksize * blockScale;
-        unsigned blocksizeDelta = 8191 - scaledBlocksize;
-        for (i = 0; i < 128; i++)
-        { 
-            convSum[0] += (x->previousImpulse[0][i] * x->crossCoef[blocksizeDelta] + 
-                            x->currentImpulse[0][i] * x->crossCoef[scaledBlocksize]) *
-                            x->convBuffer[(x->bufferPin - i) &127];
-                            
-            convSum[1] += (x->previousImpulse[1][i] * x->crossCoef[blocksizeDelta] +
-                            x->currentImpulse[1][i] * x->crossCoef[scaledBlocksize]) *
-                            x->convBuffer[(x->bufferPin - i) &127];
-
-            x->previousImpulse[0][i] = x->currentImpulse[0][i];
-            x->previousImpulse[1][i] = x->currentImpulse[1][i];
-        }   
-        x->bufferPin = (x->bufferPin + 1) & 127;
-
-        *left_out++ = convSum[0];
-        *right_out++ = convSum[1];
-    }
+//    while (blocksize--)
+//    {
+//        convSum[0] = 0;
+//        convSum[1] = 0;
+//
+//        inSample = *(in++);
+//
+//        x->convBuffer[x->bufferPin] = inSample;
+//        unsigned scaledBlocksize = blocksize * blockScale;
+//        unsigned blocksizeDelta = 8191 - scaledBlocksize;
+//        for (i = 0; i < 128; i++)
+//        {
+//            convSum[0] += (x->previousImpulse[0][i] * x->crossCoef[blocksizeDelta] +
+//                            x->currentImpulse[0][i] * x->crossCoef[scaledBlocksize]) *
+//                            x->convBuffer[(x->bufferPin - i) &127];
+//
+//            convSum[1] += (x->previousImpulse[1][i] * x->crossCoef[blocksizeDelta] +
+//                            x->currentImpulse[1][i] * x->crossCoef[scaledBlocksize]) *
+//                            x->convBuffer[(x->bufferPin - i) &127];
+//
+//            x->previousImpulse[0][i] = x->currentImpulse[0][i];
+//            x->previousImpulse[1][i] = x->currentImpulse[1][i];
+//        }
+//        x->bufferPin = (x->bufferPin + 1) & 127;
+//
+//        *left_out++ = convSum[0];
+//        *right_out++ = convSum[1];
+//    }
     return w + 6;
 }