diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h
index 6b99a77e73b..7ee981f6544 100644
--- a/src/cudamatrix/cu-kernels-ansi.h
+++ b/src/cudamatrix/cu-kernels-ansi.h
@@ -110,6 +110,54 @@ void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double *src,
                             MatrixDim src_dim, double *dst, MatrixDim dst_dim);
 void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float *src,
                             MatrixDim src_dim, float *dst, MatrixDim dst_dim);
+void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl,
+                          const double *src, double *dst, double *index_max_,
+                          const int32_cuda stride,
+                          const int32_cuda input_t_dim_,
+                          const int32_cuda pool_t_size_,
+                          const int32_cuda pool_t_step_,
+                          const int32_cuda input_h_dim_,
+                          const int32_cuda pool_h_size_,
+                          const int32_cuda pool_h_step_,
+                          const int32_cuda input_f_dim_,
+                          const int32_cuda pool_f_size_,
+                          const int32_cuda pool_f_step_,
+                          int A_trans);
+void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl,
+                          const float *src, float *dst, float *index_max_,
+                          const int32_cuda stride,
+                          const int32_cuda input_t_dim_,
+                          const int32_cuda pool_t_size_,
+                          const int32_cuda pool_t_step_,
+                          const int32_cuda input_h_dim_,
+                          const int32_cuda pool_h_size_,
+                          const int32_cuda pool_h_step_,
+                          const int32_cuda input_f_dim_,
+                          const int32_cuda pool_f_size_,
+                          const int32_cuda pool_f_step_,
+                          int A_trans);
+void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl,
+                               const double *src, double *dst,
+                               double *index_max_,
+                               const int32_cuda input_t_dim_,
+                               const int32_cuda pool_t_size_,
+                               const int32_cuda pool_t_step_,
+                               const int32_cuda input_h_dim_,
+                               const int32_cuda pool_h_size_,
+                               const int32_cuda pool_h_step_,
+                               const int32_cuda input_f_dim_,
+                               const int32_cuda pool_f_size_,
+                               const int32_cuda pool_f_step_);
+void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl,
+                               const float *src, float *dst,
+                               float *index_max_,
+                               const int32_cuda input_t_dim_,
+                               const int32_cuda pool_t_size_,
+                               const int32_cuda pool_t_step_,
+                               const int32_cuda input_h_dim_,
+                               const int32_cuda pool_h_size_,
+                               const int32_cuda pool_h_step_,
+                               const int32_cuda input_f_dim_,
+                               const int32_cuda pool_f_size_,
+                               const int32_cuda pool_f_step_);
 void cudaD_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat,
                             MatrixDim mat_dim, const double *mat2,
                             int mat2_row_stride, int mat2_col_stride,
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 934a860a055..34f5c7652ba 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -773,6 +773,149 @@ static void _add_mat_blocks_trans(Real alpha, const Real* src,
   }
 }
 
+template<typename Real>
+__global__
+static void _max_mat_blocks(const Real *src, Real *dst, Real *index_max_,
+                            const int32_cuda stride,
+                            const int32_cuda input_t_dim_,
+                            const int32_cuda pool_t_size_,
+                            const int32_cuda pool_t_step_,
+                            const int32_cuda input_h_dim_,
+                            const int32_cuda pool_h_size_,
+                            const int32_cuda pool_h_step_,
+                            const int32_cuda input_f_dim_,
+                            const int32_cuda pool_f_size_,
+                            const int32_cuda pool_f_step_) {
+  int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
+  int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
+  int32_cuda k = blockIdx.z * blockDim.z + threadIdx.z;
+  int32_cuda num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
+  int32_cuda num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
+
+  // initialize the running maximum and its index to the first element of
+  // this thread's pool.  Note: the maximum is a matrix element, so it has
+  // type Real, not int32_cuda.
+  int32_cuda max_row = i * pool_t_step_;
+  int32_cuda max_col = j * pool_h_step_ * input_f_dim_ + k * pool_f_step_;
+  Real max_value = src[max_row * input_h_dim_ * input_f_dim_ + max_col];
+
+  // loop over all the elements in the pool to find the maximum one,
+  // and record its index.
+  for (int32_cuda t = 0; t < pool_t_size_; t += stride) {
+    // the row index in *src
+    int32_cuda idx_row = i * pool_t_step_ + t;
+
+    for (int32_cuda h = 0; h < pool_h_size_; h++) {
+      for (int32_cuda f = 0; f < pool_f_size_; f++) {
+        // the column index in *src
+        int32_cuda idx_col = (j * pool_h_step_ + h) * input_f_dim_
+                             + k * pool_f_step_ + f;
+
+        if (src[idx_row * input_h_dim_ * input_f_dim_ + idx_col] > max_value) {
+          max_row = idx_row;
+          max_col = idx_col;
+          max_value = src[max_row * input_h_dim_ * input_f_dim_ + max_col];
+        }
+      }
+    }
+  }
+
+  dst[i * num_pools_h * num_pools_f + j * num_pools_f + k] = max_value;
+
+  // each pool stores a (row, col) pair in 'index_max_', so the base
+  // offset of this pool's pair is twice the pool index.
+  int32_cuda idx_in_idxmax = 2 * ((i * num_pools_h + j) * num_pools_f + k);
+  index_max_[idx_in_idxmax] = max_row;
+  index_max_[idx_in_idxmax + 1] = max_col;
+}
+
+// This function is basically the same as _max_mat_blocks, except that it
+// operates on the transpose of *src, so the row and column indices are
+// exchanged.
+template<typename Real>
+__global__
+static void _max_mat_blocks_trans(const Real *src, Real *dst,
+                                  Real *index_max_,
+                                  const int32_cuda stride,
+                                  const int32_cuda input_t_dim_,
+                                  const int32_cuda pool_t_size_,
+                                  const int32_cuda pool_t_step_,
+                                  const int32_cuda input_h_dim_,
+                                  const int32_cuda pool_h_size_,
+                                  const int32_cuda pool_h_step_,
+                                  const int32_cuda input_f_dim_,
+                                  const int32_cuda pool_f_size_,
+                                  const int32_cuda pool_f_step_) {
+  int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
+  int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
+  int32_cuda k = blockIdx.z * blockDim.z + threadIdx.z;
+  int32_cuda num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_;
+  int32_cuda num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
+  int32_cuda num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
+
+  int32_cuda max_row = i * pool_t_step_;
+  int32_cuda max_col = j * pool_h_step_ * input_f_dim_ + k * pool_f_step_;
+  Real max_value = src[max_col * input_t_dim_ + max_row];
+
+  for (int32_cuda t = 0; t < pool_t_size_; t += stride) {
+    int32_cuda idx_row = i * pool_t_step_ + t;
+
+    for (int32_cuda h = 0; h < pool_h_size_; h++) {
+      for (int32_cuda f = 0; f < pool_f_size_; f++) {
+        int32_cuda idx_col = (j * pool_h_step_ + h) * input_f_dim_
+                             + k * pool_f_step_ + f;
+
+        if (src[idx_col * input_t_dim_ + idx_row] > max_value) {
+          max_row = idx_row;
+          max_col = idx_col;
+          max_value = src[max_col * input_t_dim_ + max_row];
+        }
+      }
+    }
+  }
+
+  dst[(j * num_pools_f + k) * num_pools_t + i] = max_value;
+
+  int32_cuda idx_in_idxmax = 2 * ((i * num_pools_h + j) * num_pools_f + k);
+  index_max_[idx_in_idxmax] = max_row;
+  index_max_[idx_in_idxmax + 1] = max_col;
+}
+
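+// Backward version of _max_mat_blocks: it routes each pool's derivative in
+// *src back to the element of *dst whose (row, col) index was recorded in
+// 'index_max_' during the forward pass, and zeroes the other elements.
+//
+// A small worked example of the 'index_max_' layout these kernels assume
+// (the numbers are illustrative only): with num_pools_t = 2,
+// num_pools_h = 1, num_pools_f = 1 there are two pools, and 'index_max_'
+// holds 2 * 2 = 4 values:
+//   index_max_[0], index_max_[1]:  (row, col) of the max of pool (0, 0, 0)
+//   index_max_[2], index_max_[3]:  (row, col) of the max of pool (1, 0, 0)
+//
+// Caveat: one thread handles one pool, so if pools overlap (pool step
+// smaller than pool size), several threads may touch the same element of
+// *dst concurrently and the result depends on thread scheduling; the
+// kernel is only well-defined for non-overlapping pools.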
+template<typename Real>
+__global__
+static void _max_mat_blocks_back(const Real *src, Real *dst,
+                                 Real *index_max_,
+                                 const int32_cuda input_t_dim_,
+                                 const int32_cuda pool_t_size_,
+                                 const int32_cuda pool_t_step_,
+                                 const int32_cuda input_h_dim_,
+                                 const int32_cuda pool_h_size_,
+                                 const int32_cuda pool_h_step_,
+                                 const int32_cuda input_f_dim_,
+                                 const int32_cuda pool_f_size_,
+                                 const int32_cuda pool_f_step_) {
+  int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
+  int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
+  int32_cuda k = blockIdx.z * blockDim.z + threadIdx.z;
+  int32_cuda num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
+  int32_cuda num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
+
+  // base offset of this pool's (row, col) pair in 'index_max_'.
+  int32_cuda idx_in_idxmax = 2 * ((i * num_pools_h + j) * num_pools_f + k);
+
+  for (int32_cuda t = 0; t < pool_t_size_; t++) {
+    int32_cuda idx_row = i * pool_t_step_ + t;
+
+    for (int32_cuda h = 0; h < pool_h_size_; h++) {
+      for (int32_cuda f = 0; f < pool_f_size_; f++) {
+        int32_cuda idx_col = (j * pool_h_step_ + h) * input_f_dim_
+                             + k * pool_f_step_ + f;
+
+        // copy the pool's derivative if (idx_row, idx_col) is the recorded
+        // argmax of this pool, or if an overlapping pool already wrote a
+        // nonzero value here; otherwise zero the element.
+        if ((idx_row == index_max_[idx_in_idxmax] &&
+             idx_col == index_max_[idx_in_idxmax + 1]) ||
+            dst[idx_row * input_h_dim_ * input_f_dim_ + idx_col] != 0) {
+          dst[idx_row * input_h_dim_ * input_f_dim_ + idx_col] =
+              src[i * num_pools_h * num_pools_f + j * num_pools_f + k];
+        } else {
+          dst[idx_row * input_h_dim_ * input_f_dim_ + idx_col] = 0;
+        }
+      }
+    }
+  }
+}
+
 template<typename Real>
 __global__
 static void _set_mat_mat_div_mat(const Real* A, const Real* B, const Real* C,
@@ -3957,6 +4100,48 @@ void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src,
                             MatrixDim src_dim, float* dst, MatrixDim dst_dim) {
   _add_mat_repeated<<<Gr, Bl>>>(alpha, src, src_dim, dst, dst_dim);
 }
 
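+// Wrappers for the pooling kernels above.  'A_trans' selects the
+// transposed kernel, and 'stride' subsamples the rows considered inside
+// each pool (e.g. stride = 2 makes every other row of a pool a max
+// candidate; stride = 1 considers all rows).  The callers in cu-matrix.cc
+// launch these with one thread per pool.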
+void cudaF_max_mat_blocks(dim3 Gr, dim3 Bl,
+                          const float *src, float *dst, float *index_max_,
+                          const int32_cuda stride,
+                          const int32_cuda input_t_dim_,
+                          const int32_cuda pool_t_size_,
+                          const int32_cuda pool_t_step_,
+                          const int32_cuda input_h_dim_,
+                          const int32_cuda pool_h_size_,
+                          const int32_cuda pool_h_step_,
+                          const int32_cuda input_f_dim_,
+                          const int32_cuda pool_f_size_,
+                          const int32_cuda pool_f_step_,
+                          int A_trans) {
+  if (A_trans) {
+    _max_mat_blocks_trans<<<Gr, Bl>>>(src, dst, index_max_, stride,
+                                      input_t_dim_, pool_t_size_, pool_t_step_,
+                                      input_h_dim_, pool_h_size_, pool_h_step_,
+                                      input_f_dim_, pool_f_size_, pool_f_step_);
+  } else {
+    _max_mat_blocks<<<Gr, Bl>>>(src, dst, index_max_, stride,
+                                input_t_dim_, pool_t_size_, pool_t_step_,
+                                input_h_dim_, pool_h_size_, pool_h_step_,
+                                input_f_dim_, pool_f_size_, pool_f_step_);
+  }
+}
+
+void cudaF_max_mat_blocks_back(dim3 Gr, dim3 Bl,
+                               const float *src, float *dst,
+                               float *index_max_,
+                               const int32_cuda input_t_dim_,
+                               const int32_cuda pool_t_size_,
+                               const int32_cuda pool_t_step_,
+                               const int32_cuda input_h_dim_,
+                               const int32_cuda pool_h_size_,
+                               const int32_cuda pool_h_step_,
+                               const int32_cuda input_f_dim_,
+                               const int32_cuda pool_f_size_,
+                               const int32_cuda pool_f_step_) {
+  _max_mat_blocks_back<<<Gr, Bl>>>(src, dst, index_max_,
+                                   input_t_dim_, pool_t_size_, pool_t_step_,
+                                   input_h_dim_, pool_h_size_, pool_h_step_,
+                                   input_f_dim_, pool_f_size_, pool_f_step_);
+}
+
 void cudaF_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A,
                                const float *B, const float *C, float *dst,
                                MatrixDim d,
@@ -4661,6 +4846,49 @@ void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src,
                             MatrixDim src_dim, double* dst, MatrixDim dst_dim) {
   _add_mat_repeated<<<Gr, Bl>>>(alpha, src, src_dim, dst, dst_dim);
 }
 
+void cudaD_max_mat_blocks(dim3 Gr, dim3 Bl,
+                          const double *src, double *dst, double *index_max_,
+                          const int32_cuda stride,
+                          const int32_cuda input_t_dim_,
+                          const int32_cuda pool_t_size_,
+                          const int32_cuda pool_t_step_,
+                          const int32_cuda input_h_dim_,
+                          const int32_cuda pool_h_size_,
+                          const int32_cuda pool_h_step_,
+                          const int32_cuda input_f_dim_,
+                          const int32_cuda pool_f_size_,
+                          const int32_cuda pool_f_step_,
+                          int A_trans) {
+  if (A_trans) {
+    _max_mat_blocks_trans<<<Gr, Bl>>>(src, dst, index_max_, stride,
+                                      input_t_dim_, pool_t_size_, pool_t_step_,
+                                      input_h_dim_, pool_h_size_, pool_h_step_,
+                                      input_f_dim_, pool_f_size_, pool_f_step_);
+  } else {
+    _max_mat_blocks<<<Gr, Bl>>>(src, dst, index_max_, stride,
+                                input_t_dim_, pool_t_size_, pool_t_step_,
+                                input_h_dim_, pool_h_size_, pool_h_step_,
+                                input_f_dim_, pool_f_size_, pool_f_step_);
+  }
+}
+
+void cudaD_max_mat_blocks_back(dim3 Gr, dim3 Bl,
+                               const double *src, double *dst,
+                               double *index_max_,
+                               const int32_cuda input_t_dim_,
+                               const int32_cuda pool_t_size_,
+                               const int32_cuda pool_t_step_,
+                               const int32_cuda input_h_dim_,
+                               const int32_cuda pool_h_size_,
+                               const int32_cuda pool_h_step_,
+                               const int32_cuda input_f_dim_,
+                               const int32_cuda pool_f_size_,
+                               const int32_cuda pool_f_step_) {
+  _max_mat_blocks_back<<<Gr, Bl>>>(src, dst, index_max_,
+                                   input_t_dim_, pool_t_size_, pool_t_step_,
+                                   input_h_dim_, pool_h_size_, pool_h_step_,
+                                   input_f_dim_, pool_f_size_, pool_f_step_);
+}
+
 void cudaD_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A,
                                const double *B, const double *C, double *dst,
                                MatrixDim d, int stride_a, int stride_b,
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index 8f719a8c4a1..0712179bca8 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -168,6 +168,74 @@ inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha,
                                   const float *src, MatrixDim src_dim,
                                   float *dst, MatrixDim dst_dim) {
   cudaF_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim);
 }
+inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl,
+                                const double *src, double *dst,
+                                double *index_max_,
+                                const int32_cuda stride,
+                                const int32_cuda input_t_dim_,
+                                const int32_cuda pool_t_size_,
+                                const int32_cuda pool_t_step_,
+                                const int32_cuda input_h_dim_,
+                                const int32_cuda pool_h_size_,
+                                const int32_cuda pool_h_step_,
+                                const int32_cuda input_f_dim_,
+                                const int32_cuda pool_f_size_,
+                                const int32_cuda pool_f_step_,
+                                int A_trans) {
+  cudaD_max_mat_blocks(Gr, Bl, src, dst, index_max_, stride,
+                       input_t_dim_, pool_t_size_, pool_t_step_,
+                       input_h_dim_, pool_h_size_, pool_h_step_,
+                       input_f_dim_, pool_f_size_, pool_f_step_, A_trans);
+}
+inline void cuda_max_mat_blocks(dim3 Gr, dim3 Bl,
+                                const float *src, float *dst,
+                                float *index_max_,
+                                const int32_cuda stride,
+                                const int32_cuda input_t_dim_,
+                                const int32_cuda pool_t_size_,
+                                const int32_cuda pool_t_step_,
+                                const int32_cuda input_h_dim_,
+                                const int32_cuda pool_h_size_,
+                                const int32_cuda pool_h_step_,
+                                const int32_cuda input_f_dim_,
+                                const int32_cuda pool_f_size_,
+                                const int32_cuda pool_f_step_,
+                                int A_trans) {
+  cudaF_max_mat_blocks(Gr, Bl, src, dst, index_max_, stride,
+                       input_t_dim_, pool_t_size_, pool_t_step_,
+                       input_h_dim_, pool_h_size_, pool_h_step_,
+                       input_f_dim_, pool_f_size_, pool_f_step_, A_trans);
+}
+inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl,
+                                     const double *src, double *dst,
+                                     double *index_max_,
+                                     const int32_cuda input_t_dim_,
+                                     const int32_cuda pool_t_size_,
+                                     const int32_cuda pool_t_step_,
+                                     const int32_cuda input_h_dim_,
+                                     const int32_cuda pool_h_size_,
+                                     const int32_cuda pool_h_step_,
+                                     const int32_cuda input_f_dim_,
+                                     const int32_cuda pool_f_size_,
+                                     const int32_cuda pool_f_step_) {
+  cudaD_max_mat_blocks_back(Gr, Bl, src, dst, index_max_,
+                            input_t_dim_, pool_t_size_, pool_t_step_,
+                            input_h_dim_, pool_h_size_, pool_h_step_,
+                            input_f_dim_, pool_f_size_, pool_f_step_);
+}
+inline void cuda_max_mat_blocks_back(dim3 Gr, dim3 Bl,
+                                     const float *src, float *dst,
+                                     float *index_max_,
+                                     const int32_cuda input_t_dim_,
+                                     const int32_cuda pool_t_size_,
+                                     const int32_cuda pool_t_step_,
+                                     const int32_cuda input_h_dim_,
+                                     const int32_cuda pool_h_size_,
+                                     const int32_cuda pool_h_step_,
+                                     const int32_cuda input_f_dim_,
+                                     const int32_cuda pool_f_size_,
+                                     const int32_cuda pool_f_step_) {
+  cudaF_max_mat_blocks_back(Gr, Bl, src, dst, index_max_,
+                            input_t_dim_, pool_t_size_, pool_t_step_,
+                            input_h_dim_, pool_h_size_, pool_h_step_,
+                            input_f_dim_, pool_f_size_, pool_f_step_);
+}
 inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat,
                                   MatrixDim mat_dim, const double *mat2,
                                   int mat2_row_stride, int mat2_col_stride,
diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc
index 01030bb8353..60d87ca1ca7 100644
--- a/src/cudamatrix/cu-matrix-test.cc
+++ b/src/cudamatrix/cu-matrix-test.cc
@@ -2917,6 +2917,97 @@ static void UnitTestCuMatrixEqualElementMask() {
 }
 
 
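+// Tests CuMatrixBase::MaxMatBlocks against a simple CPU reference, in both
+// the forward (pooling) and backward (derivative-routing) directions.
+// The sizes are constructed so that (a) the pools tile the input exactly,
+// i.e. input_dim = pool_size + n * pool_step, which MaxMatBlocks asserts,
+// and (b) pool_step >= pool_size, since with overlapping pools the
+// backward direction is not uniquely defined by 'index_max_'.  For example
+// pool_t_size_ = 3, pool_t_step_ = 4, n = 2 gives input_t_dim_ = 11 and
+// num_pools_t = 3.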
+template<typename Real>
+static void UnitTestCuMatrixMaxMatBlocks() {
+  for (int32 l = 0; l < 5; l++) {
+    int32 stride = RandInt(1, 5);
+    int32 pool_t_size_ = RandInt(1, 10);
+    int32 pool_t_step_ = pool_t_size_ + RandInt(0, 2);
+    int32 input_t_dim_ = pool_t_size_ + RandInt(0, 9) * pool_t_step_;
+    int32 pool_h_size_ = RandInt(1, 10);
+    int32 pool_h_step_ = pool_h_size_ + RandInt(0, 2);
+    int32 input_h_dim_ = pool_h_size_ + RandInt(0, 9) * pool_h_step_;
+    int32 pool_f_size_ = RandInt(1, 10);
+    int32 pool_f_step_ = pool_f_size_ + RandInt(0, 2);
+    int32 input_f_dim_ = pool_f_size_ + RandInt(0, 9) * pool_f_step_;
+
+    // this part tests the forward propagation.
+    CuMatrix<Real> in_value(input_t_dim_, input_h_dim_ * input_f_dim_);
+    in_value.SetRandn();
+
+    int32 num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_;
+    int32 num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
+    int32 num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
+
+    CuMatrix<Real> out_value(num_pools_t, num_pools_h * num_pools_f);
+    out_value.SetRandn();
+
+    CuVector<Real> index_max_(2 * num_pools_t * num_pools_h * num_pools_f);
+    index_max_.SetRandn();
+
+    CuMatrix<Real> out_value_copy(out_value);
+
+    out_value.MaxMatBlocks(in_value, index_max_, stride,
+                           input_t_dim_, pool_t_size_, pool_t_step_,
+                           input_h_dim_, pool_h_size_, pool_h_step_,
+                           input_f_dim_, pool_f_size_, pool_f_step_,
+                           kNoTrans);
+    int32 tmp = 0;
+    for (int32 t = 0; t < num_pools_t; t++) {
+      for (int32 h = 0; h < num_pools_h; h++) {
+        for (int32 f = 0; f < num_pools_f; f++) {
+          // initialize the maximum value, and its index, as the first
+          // element in the pool.
+          int32 max_x = t * pool_t_step_;
+          int32 max_y = h * pool_h_step_ * input_f_dim_ + f * pool_f_step_;
+          Real max_value = in_value(max_x, max_y);
+
+          // find the maximum value in the pool.
+          for (int32 x = 0; x < pool_t_size_; x += stride) {
+            int32 cur_x = t * pool_t_step_ + x;
+
+            for (int32 y = 0; y < pool_h_size_; y++) {
+              for (int32 z = 0; z < pool_f_size_; z++) {
+                int32 cur_y = (h * pool_h_step_ + y) * input_f_dim_
+                              + f * pool_f_step_ + z;
+                if (in_value(cur_x, cur_y) > max_value) {
+                  max_x = cur_x;
+                  max_y = cur_y;
+                  max_value = in_value(cur_x, cur_y);
+                }
+              }
+            }
+          }
+          out_value_copy(t, h * num_pools_f + f) = max_value;
+          index_max_(tmp) = max_x;
+          index_max_(tmp + 1) = max_y;
+          tmp += 2;
+        }
+      }
+    }
+
+    AssertEqual(out_value, out_value_copy);
+
+    // this part tests the backward propagation.
+    CuMatrix<Real> in_deriv(input_t_dim_, input_h_dim_ * input_f_dim_);
+    in_deriv.SetZero();
+    CuMatrix<Real> out_deriv(num_pools_t, num_pools_h * num_pools_f);
+    out_deriv.SetRandn();
+    CuMatrix<Real> in_deriv_copy(in_deriv);
+
+    in_deriv.MaxMatBlocks(out_deriv, index_max_, stride,
+                          input_t_dim_, pool_t_size_, pool_t_step_,
+                          input_h_dim_, pool_h_size_, pool_h_step_,
+                          input_f_dim_, pool_f_size_, pool_f_step_,
+                          kNoTrans);
+
+    for (int32 x = 0; x < 2 * num_pools_t * num_pools_h * num_pools_f;
+         x += 2) {
+      int32 row_tmp = (x / 2) / (num_pools_h * num_pools_f);
+      int32 col_tmp = (x / 2) % (num_pools_h * num_pools_f);
+      in_deriv_copy(static_cast<int32>(index_max_(x)),
+                    static_cast<int32>(index_max_(x + 1))) =
+          out_deriv(row_tmp, col_tmp);
+    }
+    AssertEqual(in_deriv, in_deriv_copy);
+  }
+}
+
 template<typename Real> void CudaMatrixUnitTest() {
   UnitTestCuMatrixApplyExpSpecial<Real>();
   UnitTestCuMatrixApplyExpLimited<Real>();
@@ -2987,6 +3078,7 @@ template<typename Real> void CudaMatrixUnitTest() {
   UnitTestCuMatrixAddToElements<Real>();
   UnitTestCuMatrixLookup<Real>();
   UnitTestCuMatrixEqualElementMask<Real>();
+  UnitTestCuMatrixMaxMatBlocks<Real>();
   // test CuVector<Real> methods
   UnitTestCuVectorAddVec<Real>();
   UnitTestCuVectorAddRowSumMat<Real>();
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index beccd9dc4a5..f0c10086433 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -1187,6 +1187,140 @@ void CuMatrixBase<Real>::AddMatBlocks(Real alpha, const CuMatrixBase<Real> &A,
   }
 }
 
+
+template<typename Real>
+void CuMatrixBase<Real>::MaxMatBlocks(const CuMatrixBase<Real> &A,
+                                      CuVectorBase<Real> &index_max_,
+                                      const int32 stride,
+                                      const int32 input_t_dim_,
+                                      const int32 pool_t_size_,
+                                      const int32 pool_t_step_,
+                                      const int32 input_h_dim_,
+                                      const int32 pool_h_size_,
+                                      const int32 pool_h_step_,
+                                      const int32 input_f_dim_,
+                                      const int32 pool_f_size_,
+                                      const int32 pool_f_step_,
+                                      MatrixTransposeType transA) {
+  if (num_rows_ == 0 || num_cols_ == 0) return;
+
+  int32 num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_;
+  int32 num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
+  int32 num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
+
+  KALDI_ASSERT((input_t_dim_ - pool_t_size_) % pool_t_step_ == 0 &&
+               (input_h_dim_ - pool_h_size_) % pool_h_step_ == 0 &&
+               (input_f_dim_ - pool_f_size_) % pool_f_step_ == 0);
+  // 'index_max_' stores one (row, col) pair per pool.
+  KALDI_ASSERT(index_max_.Dim() ==
+               2 * num_pools_t * num_pools_h * num_pools_f);
+
+  if (A.NumRows() >= (transA == kNoTrans ? num_rows_ : num_cols_) &&
+      A.NumCols() >= (transA == kNoTrans ? num_cols_ : num_rows_)) {
+    // This is the "forward-propagation" version of MaxMatBlocks.
+    // It supports both regular and transposed operation.
+    if (transA == kNoTrans) {
+      KALDI_ASSERT(A.NumRows() == input_t_dim_ &&
+                   A.NumCols() == input_h_dim_ * input_f_dim_ &&
+                   num_rows_ == num_pools_t &&
+                   num_cols_ == num_pools_h * num_pools_f);
+    } else {
+      KALDI_ASSERT(A.NumCols() == input_t_dim_ &&
+                   A.NumRows() == input_h_dim_ * input_f_dim_ &&
+                   num_cols_ == num_pools_t &&
+                   num_rows_ == num_pools_h * num_pools_f);
+    }
+#if HAVE_CUDA == 1
+    if (CuDevice::Instantiate().Enabled()) {
+      CuTimer tim;
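+      // NOTE: this launches a single thread block with one thread per
+      // pool, so num_pools_t * num_pools_h * num_pools_f must not exceed
+      // the CUDA per-block limit of 1024 threads (e.g. 8 x 4 x 16 = 512
+      // pools is fine); larger layouts would need a real grid
+      // decomposition.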
+      dim3 dimBlock(num_pools_t, num_pools_h, num_pools_f);
+      dim3 dimGrid(1);
+
+      cuda_max_mat_blocks(dimGrid, dimBlock, A.data_, data_,
+                          index_max_.data_, stride,
+                          input_t_dim_, pool_t_size_, pool_t_step_,
+                          input_h_dim_, pool_h_size_, pool_h_step_,
+                          input_f_dim_, pool_f_size_, pool_f_step_,
+                          (transA == kTrans ? 1 : 0));
+      CU_SAFE_CALL(cudaGetLastError());
+
+      CuDevice::Instantiate().AccuProfile(__func__, tim);
+    } else
+#endif
+    {
+      // max-pooling without CUDA
+      int32 tmp = 0;
+      for (int32 t = 0; t < num_pools_t; t++) {
+        for (int32 h = 0; h < num_pools_h; h++) {
+          for (int32 f = 0; f < num_pools_f; f++) {
+            // initialize the maximum value, and its index, as the first
+            // element in the pool.
+            int32 max_x = t * pool_t_step_;
+            int32 max_y = h * pool_h_step_ * input_f_dim_ + f * pool_f_step_;
+            Real max_value = A(max_x, max_y);
+
+            // find the maximum value in the pool.
+            for (int32 x = 0; x < pool_t_size_; x += stride) {
+              int32 cur_x = t * pool_t_step_ + x;
+
+              for (int32 y = 0; y < pool_h_size_; y++) {
+                for (int32 z = 0; z < pool_f_size_; z++) {
+                  int32 cur_y = (h * pool_h_step_ + y) * input_f_dim_
+                                + f * pool_f_step_ + z;
+                  if (A(cur_x, cur_y) > max_value) {
+                    max_x = cur_x;
+                    max_y = cur_y;
+                    max_value = A(cur_x, cur_y);
+                  }
+                }
+              }
+            }
+            (*this)(t, h * num_pools_f + f) = max_value;
+            index_max_(tmp) = max_x;
+            index_max_(tmp + 1) = max_y;
+            tmp += 2;
+          }
+        }
+      }
+    }
+  } else {
+    // This is the "backward-propagation" version of MaxMatBlocks, where
+    // *this is larger than A.
+    if (transA == kNoTrans) {
+      KALDI_ASSERT(A.NumRows() == num_pools_t &&
+                   A.NumCols() == num_pools_h * num_pools_f &&
+                   num_rows_ == input_t_dim_ &&
+                   num_cols_ == input_h_dim_ * input_f_dim_);
+    } else {
+      KALDI_ASSERT(A.NumCols() == num_pools_t &&
+                   A.NumRows() == num_pools_h * num_pools_f &&
+                   num_cols_ == input_t_dim_ &&
+                   num_rows_ == input_h_dim_ * input_f_dim_);
+    }
+
+#if HAVE_CUDA == 1
+    if (CuDevice::Instantiate().Enabled()) {
+      CuTimer tim;
+      dim3 dimBlock(num_pools_t, num_pools_h, num_pools_f);
+      dim3 dimGrid(1);
+
+      cuda_max_mat_blocks_back(dimGrid, dimBlock, A.data_, data_,
+                               index_max_.data_,
+                               input_t_dim_, pool_t_size_, pool_t_step_,
+                               input_h_dim_, pool_h_size_, pool_h_step_,
+                               input_f_dim_, pool_f_size_, pool_f_step_);
+
+      CU_SAFE_CALL(cudaGetLastError());
+      CuDevice::Instantiate().AccuProfile(__func__, tim);
+    } else
+#endif
+    {
+      // max-pooling backward propagation without CUDA
+      this->SetZero();
+      for (int32 x = 0; x < 2 * num_pools_t * num_pools_h * num_pools_f;
+           x += 2) {
+        int32 row_tmp = (x / 2) / (num_pools_h * num_pools_f);
+        int32 col_tmp = (x / 2) % (num_pools_h * num_pools_f);
+        (*this)(static_cast<int32>(index_max_(x)),
+                static_cast<int32>(index_max_(x + 1))) = A(row_tmp, col_tmp);
+      }
+    }
+  }
+}
+
 /// dst = a * b / c (by element; when c = 0, dst = a)
 /// dst can be an alias of a, b or c safely and get expected result.
 template<typename Real>
diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h
index 03e69b639d3..345d3fd980b 100644
--- a/src/cudamatrix/cu-matrix.h
+++ b/src/cudamatrix/cu-matrix.h
@@ -509,6 +509,84 @@ class CuMatrixBase {
   void AddMatBlocks(Real alpha, const CuMatrixBase<Real> &A,
                     MatrixTransposeType trans = kNoTrans);
 
+
+  /// This function does max-pooling over blocks of its input.  A detailed
+  /// description is given with the MaxPoolingOverBlock component in
+  /// nnet-convolutional-component.h.
+  ///
+  /// Note that this function has two versions, 'forward-propagate' and
+  /// 'backward-propagate':
+  ///
+  /// (1) When the input matrix A is larger than *this, it is the
+  ///     'forward-propagate' version: the function does the max-pooling
+  ///     determined by the parameters, and stores the index of the
+  ///     maximum value of each pool in the vector 'index_max_' for use in
+  ///     back-propagation.
+  ///
+  /// (2) When the input matrix A is smaller than *this, it is the
+  ///     'backward-propagate' version: guided by the vector 'index_max_',
+  ///     the function sets all values of *this (the input derivative)
+  ///     whose indices do not correspond to the maximum of some pool to
+  ///     zero, and copies the output derivative from A to the positions
+  ///     that do correspond to pool maxima.
+  ///
+  /// Parameters:
+  ///
+  /// size of the input matrix:
+  ///   input_t_dim_   size of the input along the t-axis
+  ///                  (e.g. number of time steps)
+  ///   input_h_dim_   size of the input along the h-axis
+  ///                  (e.g. number of mel-frequency bins)
+  ///   input_f_dim_   size of the input along the f-axis
+  ///                  (e.g. number of filters in the ConvolutionComponent)
+  ///
+  /// block size:
+  ///   pool_t_size_   size of the pooling window along the t-axis
+  ///   pool_h_size_   size of the pooling window along the h-axis
+  ///   pool_f_size_   size of the pooling window along the f-axis
+  ///   (so the dimension of a block is
+  ///    pool_t_size_ by pool_h_size_ * pool_f_size_)
+  ///
+  /// stride size:
+  ///   pool_t_step_   the number of steps taken along the t-axis of the
+  ///                  input before computing the next pool (i.e. the
+  ///                  stride along the t-axis)
+  ///   pool_h_step_   the number of steps taken along the h-axis of the
+  ///                  input before computing the next pool (i.e. the
+  ///                  stride along the h-axis)
+  ///   pool_f_step_   the number of steps taken along the f-axis of the
+  ///                  input before computing the next pool (i.e. the
+  ///                  stride along the f-axis)
+  ///   stride         the time stride within blocks: we take one row of
+  ///                  max-pooling candidates every 'stride' rows of the
+  ///                  input matrix.
+  ///
+  ///   index_max_     a vector that stores the index of each maximum
+  ///                  value as a (row, column) pair, used in
+  ///                  back-propagation.  The size of this vector is
+  ///                  2 * num_pools_t * num_pools_h * num_pools_f.
+  ///
+  /// So there are num_pools_t * num_pools_h * num_pools_f blocks in
+  /// total, where:
+  ///   num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_;
+  ///   // the number of blocks in the t dimension
+  ///   num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
+  ///   // the number of blocks in the h dimension
+  ///   num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
+  ///   // the number of blocks in the f dimension
+  /// Given indices idx_t, idx_h, idx_f along the three axes, the block is
+  /// located at:
+  ///   row indices:
+  ///     [start_row, start_row + pool_t_size_ - 1];
+  ///   column indices: the union of the sets
+  ///     [start_col(0), start_col(0) + pool_f_size_ - 1],
+  ///     [start_col(1), start_col(1) + pool_f_size_ - 1],
+  ///     ...,
+  ///     [start_col(pool_h_size_ - 1),
+  ///      start_col(pool_h_size_ - 1) + pool_f_size_ - 1]
+  ///   where:
+  ///     start_row = idx_t * pool_t_step_
+  ///     start_col(i) = (idx_h * pool_h_step_ + i) * input_f_dim_
+  ///                    + idx_f * pool_f_step_
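+  ///
+  /// Worked example (the numbers are illustrative only): with
+  /// input_t_dim_ = 11, pool_t_size_ = 3, pool_t_step_ = 2 we get
+  /// num_pools_t = 1 + (11 - 3) / 2 = 5, and similarly for the other two
+  /// axes.  A typical forward call, where 'out' has size
+  /// num_pools_t x (num_pools_h * num_pools_f), 'in' has size
+  /// input_t_dim_ x (input_h_dim_ * input_f_dim_), and 'index_max' has
+  /// dimension 2 * num_pools_t * num_pools_h * num_pools_f, looks like:
+  ///   out.MaxMatBlocks(in, index_max, 1 /* stride */,
+  ///                    11, 3, 2,  // t-axis: dim, pool size, pool step
+  ///                    input_h_dim, pool_h_size, pool_h_step,
+  ///                    input_f_dim, pool_f_size, pool_f_step, kNoTrans);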
+  void MaxMatBlocks(const CuMatrixBase<Real> &A,
+                    CuVectorBase<Real> &index_max_, const int32 stride,
+                    const int32 input_t_dim_, const int32 pool_t_size_,
+                    const int32 pool_t_step_,
+                    const int32 input_h_dim_, const int32 pool_h_size_,
+                    const int32 pool_h_step_,
+                    const int32 input_f_dim_, const int32 pool_f_size_,
+                    const int32 pool_f_step_,
+                    MatrixTransposeType trans = kNoTrans);
   /// (for each column c of *this), c = alpha * col + beta * c
   void AddVecToCols(Real alpha, const CuVectorBase<Real> &col,
                     Real beta = 1.0);
   /// (for each row r of *this), r = alpha * row + beta * r
diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc
index bea3b9d31d5..7bcca4bd3dc 100644
--- a/src/nnet3/nnet-convolutional-component.cc
+++ b/src/nnet3/nnet-convolutional-component.cc
@@ -666,6 +666,171 @@ void TimeHeightConvolutionComponent::PrecomputedIndexes::Read(
   ExpectToken(is, binary, "</TimeHeightConvolutionComponentPrecomputedIndexes>");
 }
 
+MaxPoolingOverBlock::MaxPoolingOverBlock(
+    const MaxPoolingOverBlock &other):
+    input_t_dim_(other.input_t_dim_),
+    input_h_dim_(other.input_h_dim_),
+    input_f_dim_(other.input_f_dim_),
+    pool_t_size_(other.pool_t_size_),
+    pool_h_size_(other.pool_h_size_),
+    pool_f_size_(other.pool_f_size_),
+    pool_t_step_(other.pool_t_step_),
+    pool_h_step_(other.pool_h_step_),
+    pool_f_step_(other.pool_f_step_) { }
+
+// acquire the input dim
+int32 MaxPoolingOverBlock::InputDim() const {
+  return input_t_dim_ * input_h_dim_ * input_f_dim_;
+}
+
+// acquire the output dim
+int32 MaxPoolingOverBlock::OutputDim() const {
+  int32 num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_;
+  int32 num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
+  int32 num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
+  return num_pools_t * num_pools_h * num_pools_f;
+}
+
+// check the component parameters
+void MaxPoolingOverBlock::Check() const {
+  // sanity check of the max-pooling parameters
+  KALDI_ASSERT(input_t_dim_ > 0);
+  KALDI_ASSERT(input_h_dim_ > 0);
+  KALDI_ASSERT(input_f_dim_ > 0);
+  KALDI_ASSERT(pool_t_size_ > 0);
+  KALDI_ASSERT(pool_h_size_ > 0);
+  KALDI_ASSERT(pool_f_size_ > 0);
+  KALDI_ASSERT(pool_t_step_ > 0);
+  KALDI_ASSERT(pool_h_step_ > 0);
+  KALDI_ASSERT(pool_f_step_ > 0);
+  KALDI_ASSERT(input_t_dim_ >= pool_t_size_);
+  KALDI_ASSERT(input_h_dim_ >= pool_h_size_);
+  KALDI_ASSERT(input_f_dim_ >= pool_f_size_);
+  KALDI_ASSERT(pool_t_size_ >= pool_t_step_);
+  KALDI_ASSERT(pool_h_size_ >= pool_h_step_);
+  KALDI_ASSERT(pool_f_size_ >= pool_f_step_);
+  KALDI_ASSERT((input_t_dim_ - pool_t_size_) % pool_t_step_ == 0);
+  KALDI_ASSERT((input_h_dim_ - pool_h_size_) % pool_h_step_ == 0);
+  KALDI_ASSERT((input_f_dim_ - pool_f_size_) % pool_f_step_ == 0);
+}
+
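+// initialize the component from a config line; for example (the values
+// here are illustrative only):
+//   input-t-dim=11 input-h-dim=40 input-f-dim=32 \
+//   pool-t-size=3 pool-h-size=2 pool-f-size=2 \
+//   pool-t-step=2 pool-h-step=2 pool-f-step=2
+// (each input dim minus the corresponding pool size must be divisible by
+// the corresponding pool step; see Check()).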
+void MaxPoolingOverBlock::InitFromConfig(ConfigLine *cfl) {
+  bool ok = true;
+
+  ok = ok && cfl->GetValue("input-t-dim", &input_t_dim_);
+  ok = ok && cfl->GetValue("input-h-dim", &input_h_dim_);
+  ok = ok && cfl->GetValue("input-f-dim", &input_f_dim_);
+  ok = ok && cfl->GetValue("pool-t-size", &pool_t_size_);
+  ok = ok && cfl->GetValue("pool-h-size", &pool_h_size_);
+  ok = ok && cfl->GetValue("pool-f-size", &pool_f_size_);
+  ok = ok && cfl->GetValue("pool-t-step", &pool_t_step_);
+  ok = ok && cfl->GetValue("pool-h-step", &pool_h_step_);
+  ok = ok && cfl->GetValue("pool-f-step", &pool_f_step_);
+
+  if (cfl->HasUnusedValues())
+    KALDI_ERR << "Could not process these elements in initializer: "
+              << cfl->UnusedValues();
+  if (!ok)
+    KALDI_ERR << "Bad initializer " << cfl->WholeLine();
+
+  Check();
+}
+
+void MaxPoolingOverBlock::Read(std::istream &is, bool binary) {
+  ExpectOneOrTwoTokens(is, binary, "<MaxPoolingOverBlock>", "<InputTDim>");
+  ReadBasicType(is, binary, &input_t_dim_);
+  ExpectToken(is, binary, "<InputHDim>");
+  ReadBasicType(is, binary, &input_h_dim_);
+  ExpectToken(is, binary, "<InputFDim>");
+  ReadBasicType(is, binary, &input_f_dim_);
+  ExpectToken(is, binary, "<PoolTSize>");
+  ReadBasicType(is, binary, &pool_t_size_);
+  ExpectToken(is, binary, "<PoolHSize>");
+  ReadBasicType(is, binary, &pool_h_size_);
+  ExpectToken(is, binary, "<PoolFSize>");
+  ReadBasicType(is, binary, &pool_f_size_);
+  ExpectToken(is, binary, "<PoolTStep>");
+  ReadBasicType(is, binary, &pool_t_step_);
+  ExpectToken(is, binary, "<PoolHStep>");
+  ReadBasicType(is, binary, &pool_h_step_);
+  ExpectToken(is, binary, "<PoolFStep>");
+  ReadBasicType(is, binary, &pool_f_step_);
+  ExpectToken(is, binary, "</MaxPoolingOverBlock>");
+  Check();
+}
+
+void MaxPoolingOverBlock::Write(std::ostream &os, bool binary) const {
+  WriteToken(os, binary, "<MaxPoolingOverBlock>");
+  WriteToken(os, binary, "<InputTDim>");
+  WriteBasicType(os, binary, input_t_dim_);
+  WriteToken(os, binary, "<InputHDim>");
+  WriteBasicType(os, binary, input_h_dim_);
+  WriteToken(os, binary, "<InputFDim>");
+  WriteBasicType(os, binary, input_f_dim_);
+  WriteToken(os, binary, "<PoolTSize>");
+  WriteBasicType(os, binary, pool_t_size_);
+  WriteToken(os, binary, "<PoolHSize>");
+  WriteBasicType(os, binary, pool_h_size_);
+  WriteToken(os, binary, "<PoolFSize>");
+  WriteBasicType(os, binary, pool_f_size_);
+  WriteToken(os, binary, "<PoolTStep>");
+  WriteBasicType(os, binary, pool_t_step_);
+  WriteToken(os, binary, "<PoolHStep>");
+  WriteBasicType(os, binary, pool_h_step_);
+  WriteToken(os, binary, "<PoolFStep>");
+  WriteBasicType(os, binary, pool_f_step_);
+  WriteToken(os, binary, "</MaxPoolingOverBlock>");
+}
+
+// display information about the component
+std::string MaxPoolingOverBlock::Info() const {
+  std::ostringstream stream;
+  stream << Type()
+         << ", input-t-dim=" << input_t_dim_
+         << ", input-h-dim=" << input_h_dim_
+         << ", input-f-dim=" << input_f_dim_
+         << ", pool-t-size=" << pool_t_size_
+         << ", pool-h-size=" << pool_h_size_
+         << ", pool-f-size=" << pool_f_size_
+         << ", pool-t-step=" << pool_t_step_
+         << ", pool-h-step=" << pool_h_step_
+         << ", pool-f-step=" << pool_f_step_;
+  return stream.str();
+}
+
+void* MaxPoolingOverBlock::Propagate(const ComponentPrecomputedIndexes *indexes,
+                                     const CuMatrixBase<BaseFloat> &in_value,
+                                     CuMatrixBase<BaseFloat> *out_value) const {
+  // make sure 'index_max_' can hold one (row, col) pair per pool; it is
+  // declared mutable, since Propagate() is const.
+  if (index_max_.Dim() != 2 * OutputDim())
+    index_max_.Resize(2 * OutputDim());
+  // stride = 1: every row within a pool is a max candidate.
+  out_value->MaxMatBlocks(in_value, index_max_, 1,
+                          input_t_dim_, pool_t_size_, pool_t_step_,
+                          input_h_dim_, pool_h_size_, pool_h_step_,
+                          input_f_dim_, pool_f_size_, pool_f_step_,
+                          kNoTrans);
+  return NULL;
+}
+
+void MaxPoolingOverBlock::Backprop(
+    const std::string &debug_info,
+    const ComponentPrecomputedIndexes *indexes,
+    const CuMatrixBase<BaseFloat> &, // in_value,
+    const CuMatrixBase<BaseFloat> &, // out_value,
+    const CuMatrixBase<BaseFloat> &out_deriv,
+    void *memo,
+    Component *to_update,
+    CuMatrixBase<BaseFloat> *in_deriv) const {
+  if (in_deriv) {
+    in_deriv->MaxMatBlocks(out_deriv, index_max_, 1,
+                           input_t_dim_, pool_t_size_, pool_t_step_,
+                           input_h_dim_, pool_h_size_, pool_h_step_,
+                           input_f_dim_, pool_f_size_, pool_f_step_,
+                           kNoTrans);
+  }
+}
+
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h
index 35cf0de11c9..6cd421eaf5a 100644
--- a/src/nnet3/nnet-convolutional-component.h
+++ b/src/nnet3/nnet-convolutional-component.h
@@ -370,8 +370,190 @@ class TimeHeightConvolutionComponent: public UpdatableComponent {
   OnlineNaturalGradient preconditioner_out_;
 };
 
-
-
+/** MaxPoolingOverBlock takes the maximum value over blocks of its input;
+    this component is designed to be compatible with
+    TimeHeightConvolutionComponent.
+
+    MaxPoolingOverBlock:
+    Max-pooling of this kind was first used in convolutional networks
+    (ConvNets), and is closely related to the maxout nonlinearity.  Each
+    output element of this component is the maximum of a block of input
+    elements, where the block has dimension
+    (pool_t_size_, pool_h_size_ * pool_f_size_).
+    Blocks may overlap if the step on any axis is smaller than the
+    corresponding pool size (e.g. pool_t_step_ < pool_t_size_).
+    If the steps are equal to their pool sizes, there is no overlap;
+    if they all equal 1, the blocks overlap to the greatest possible
+    extent.
+
+    This component is designed to be used after a ConvolutionComponent,
+    so that the input matrix is propagated from a 2-d convolutional
+    layer.  It performs max-pooling along all three axes.
+
+    Input: a matrix with dimensions:
+      t: (e.g. time)
+      h: (e.g. height, mel-frequency)
+      f: (e.g. channels, like the number of filters in the
+         ConvolutionComponent)
+
+    Parameters:
+
+      input_t_dim_   size of the input along the t-axis
+                     (e.g. number of time steps)
+      input_h_dim_   size of the input along the h-axis
+                     (e.g. number of mel-frequency bins)
+      input_f_dim_   size of the input along the f-axis
+                     (e.g. number of filters in the ConvolutionComponent)
+
+      pool_t_size_   size of the pooling window along the t-axis
+      pool_h_size_   size of the pooling window along the h-axis
+      pool_f_size_   size of the pooling window along the f-axis
+
+      pool_t_step_   the number of steps taken along the t-axis of the
+                     input before computing the next pool (i.e. the
+                     stride along the t-axis)
+      pool_h_step_   the number of steps taken along the h-axis of the
+                     input before computing the next pool (i.e. the
+                     stride along the h-axis)
+      pool_f_step_   the number of steps taken along the f-axis of the
+                     input before computing the next pool (i.e. the
+                     stride along the f-axis)
+
+      index_max_     a vector that stores the index of each maximum value
+                     as a (row, column) pair, used in back-propagation.
+                     The size of this vector is
+                     2 * num_pools_t * num_pools_h * num_pools_f.
+
+    So there are num_pools_t * num_pools_h * num_pools_f blocks in total,
+    where:
+      num_pools_t = 1 + (input_t_dim_ - pool_t_size_) / pool_t_step_;
+      // the number of blocks in the t dimension
+      num_pools_h = 1 + (input_h_dim_ - pool_h_size_) / pool_h_step_;
+      // the number of blocks in the h dimension
+      num_pools_f = 1 + (input_f_dim_ - pool_f_size_) / pool_f_step_;
+      // the number of blocks in the f dimension
+
+    Given indices idx_t, idx_h, idx_f along the three axes, the block is
+    located at:
+      row indices:
+        [start_row, start_row + pool_t_size_ - 1];
+      column indices: the union of the sets
+        [start_col(0), start_col(0) + pool_f_size_ - 1],
+        [start_col(1), start_col(1) + pool_f_size_ - 1],
+        ...,
+        [start_col(pool_h_size_ - 1),
+         start_col(pool_h_size_ - 1) + pool_f_size_ - 1]
+      where:
+        start_row = idx_t * pool_t_step_
+        start_col(i) = (idx_h * pool_h_step_ + i) * input_f_dim_
+                       + idx_f * pool_f_step_
+
+    Example:
+    We store the 3-d input in a 2-d matrix by concatenating, within each
+    row, the f values for each h, like this:
+
+              h = 0                     h = 1
+      |------------------------|------------------------|----...
+       f=0  f=1  f=2  ...  f=n  f=0  f=1  f=2  ...  f=n
+      |----|----|----|----|----|----|----|----|----|----|----...
+  t=0  **** **** **** **** ****|**** **** **** **** ****|****...   -
+  t=1  **** **** **** **** ****|**** **** **** **** ****|****...   | m
+  t=2  **** **** **** **** ****|**** **** **** **** ****|****...   | a
+  t=3  **** **** **** **** ****|**** **** **** **** ****|****...   | t
+  t=4  **** **** **** **** ****|**** **** **** **** ****|****...   | r
+  t=5  **** **** **** **** ****|**** **** **** **** ****|****...   | i
+  t=6  **** **** **** **** ****|**** **** **** **** ****|****...   | x
+  t=7  **** **** **** **** ****|**** **** **** **** ****|****...   -
+
+    In this case, if we set pool_t_size = 2, pool_t_step = 1,
+                            pool_h_size = 2, pool_h_step = 1,
+                            pool_f_size = 2, pool_f_step = 1,
+    then the pooling blocks look like this:
+
+         h = 0     h = 1          h = 0     h = 1                h = 1     h = 2
+      |---------|---------|    |---------|---------|         |---------|---------|
+       f=0  f=1  f=0  f=1       f=1  f=2  f=1  f=2             f=0  f=1  f=0  f=1
+      |----|----|----|----|    |----|----|----|----|   ...   |----|----|----|----|  ......
+  t=0  **** **** **** ****  t=0  **** **** **** ****     t=0  **** **** **** ****
+  t=1  **** **** **** ****  t=1  **** **** **** ****     t=1  **** **** **** ****
+
+         h = 0     h = 1          h = 0     h = 1                h = 1     h = 2
+      |---------|---------|    |---------|---------|         |---------|---------|
+       f=0  f=1  f=0  f=1       f=1  f=2  f=1  f=2             f=0  f=1  f=0  f=1
+      |----|----|----|----|    |----|----|----|----|   ...   |----|----|----|----|  ......
+  t=1  **** **** **** ****  t=1  **** **** **** ****     t=1  **** **** **** ****
+  t=2  **** **** **** ****  t=2  **** **** **** ****     t=2  **** **** **** ****
+
+      (and so on down the t-axis)
+
+    Since the filter stride (pool_f_step) is usually smaller than the
+    height stride (pool_h_step), we arrange each row of the output as:
+      (all filters for height 0)(all filters for height 1)...
+
+    Output: the output is also a 2-d matrix, with dimension
+    (num_pools_t by num_pools_h * num_pools_f), each element of which
+    corresponds to one block.
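+
+    Example config line for this component (the values are illustrative
+    only, and must satisfy the divisibility checks in Check()):
+      component name=pool1 type=MaxPoolingOverBlock input-t-dim=11 \
+        input-h-dim=40 input-f-dim=32 pool-t-size=3 pool-h-size=2 \
+        pool-f-size=2 pool-t-step=2 pool-h-step=2 pool-f-step=2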
+ */
+class MaxPoolingOverBlock: public Component {
+ public:
+  explicit MaxPoolingOverBlock(const MaxPoolingOverBlock &other);
+  MaxPoolingOverBlock(): input_t_dim_(0), input_h_dim_(0), input_f_dim_(0),
+                         pool_t_size_(0), pool_h_size_(0), pool_f_size_(0),
+                         pool_t_step_(0), pool_h_step_(0), pool_f_step_(0) { }
+  virtual std::string Type() const { return "MaxPoolingOverBlock"; }
+  virtual int32 Properties() const {
+    // note: Backprop() sets in_deriv rather than adding to it, so we do
+    // not return kBackpropAdds here.
+    return kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput;
+  }
+  virtual void InitFromConfig(ConfigLine *cfl);
+  virtual int32 InputDim() const;
+  virtual int32 OutputDim() const;
+  virtual void Read(std::istream &is, bool binary);
+  virtual void Write(std::ostream &os, bool binary) const;
+  virtual std::string Info() const;
+  virtual Component* Copy() const { return new MaxPoolingOverBlock(*this); }
+  virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
+                          const CuMatrixBase<BaseFloat> &in_value,
+                          CuMatrixBase<BaseFloat> *out_value) const;
+  virtual void Backprop(const std::string &debug_info,
+                        const ComponentPrecomputedIndexes *indexes,
+                        const CuMatrixBase<BaseFloat> &, // in_value,
+                        const CuMatrixBase<BaseFloat> &, // out_value,
+                        const CuMatrixBase<BaseFloat> &out_deriv,
+                        void *memo,
+                        Component *to_update,
+                        CuMatrixBase<BaseFloat> *in_deriv) const;
+  virtual void Check() const;
+
+ protected:
+  int32 input_t_dim_;  // size of the input along the t-axis
+                       // (e.g. number of time steps)
+  int32 input_h_dim_;  // size of the input along the h-axis
+                       // (e.g. number of mel-frequency bins)
+  int32 input_f_dim_;  // size of the input along the f-axis
+                       // (e.g. number of filters in the
+                       // ConvolutionComponent)
+
+  int32 pool_t_size_;  // size of the pooling window along the t-axis
+  int32 pool_h_size_;  // size of the pooling window along the h-axis
+  int32 pool_f_size_;  // size of the pooling window along the f-axis
+
+  int32 pool_t_step_;  // the number of steps taken along the t-axis of
+                       // the input before computing the next pool
+  int32 pool_h_step_;  // the number of steps taken along the h-axis of
+                       // the input before computing the next pool
+  int32 pool_f_step_;  // the number of steps taken along the f-axis of
+                       // the input before computing the next pool
+
+  // the (row, col) indices of the maximum value of each pool; filled in
+  // by Propagate() and used by Backprop(), hence mutable.
+  mutable CuVector<BaseFloat> index_max_;
+
+  MaxPoolingOverBlock &operator = (const MaxPoolingOverBlock &other);  // Disallow.
+};
 
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
index 4eb078c0fcb..081e61bfa28 100644
--- a/src/nnet3/nnet-simple-component.cc
+++ b/src/nnet3/nnet-simple-component.cc
@@ -5860,6 +5860,5 @@ void SumBlockComponent::Backprop(
   }
 }
 
-
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h
index 3929c253aab..55e7ff703cf 100644
--- a/src/nnet3/nnet-simple-component.h
+++ b/src/nnet3/nnet-simple-component.h
@@ -1220,7 +1220,6 @@ class SumBlockComponent: public Component {
   SumBlockComponent &operator = (const SumBlockComponent &other);  // Disallow.
 };
 
-
 /* ClipGradientComponent just duplicates its input, but clips gradients
    during backpropagation if they cross a predetermined threshold.