diff --git a/fec.cpp b/fec.cpp
index 47c3577..f5bc960 100644
--- a/fec.cpp
+++ b/fec.cpp
@@ -2,12 +2,9 @@
 // Created by 理 傅 on 2017/1/2.
 //
 
-#include <err.h>
-#include <sys/time.h>
 #include <iostream>
 #include <stdexcept>
 #include "fec.h"
-#include "sess.h"
 #include "encoding.h"
 
 FEC::FEC(ReedSolomon enc) :enc(enc) {}
@@ -33,13 +30,11 @@ FEC::New(int rxlimit, int dataShards, int parityShards)  {
 }
 
 fecPacket
-FEC::Decode(byte *data, size_t sz) {
+FEC::Decode(byte *data, size_t sz, uint32_t ts) {
     fecPacket pkt;
     data = decode32u(data, &pkt.seqid);
     data = decode16u(data, &pkt.flag);
-    struct timeval time;
-    gettimeofday(&time, NULL);
-    pkt.ts = uint32_t(time.tv_sec * 1000 + time.tv_usec/1000);
+    pkt.ts = ts;
     pkt.data = std::make_shared<std::vector<byte>>(data, data+sz - fecHeaderSize);
     return pkt;
 }
@@ -62,11 +57,8 @@ FEC::MarkFEC(byte *data) {
     }
 }
 
-std::vector<row_type>
-FEC::Input(fecPacket &pkt) {
-    std::vector<row_type> recovered;
-
-    uint32_t now = currentMs();
+void
+FEC::Input(fecPacket &pkt, uint32_t now, std::vector<row_type>& recovered) {
     if (now-lastCheck >= fecExpire) {
         for (auto it = rx.begin();it !=rx.end();) {
             if (now - it->ts > fecExpire) {
@@ -78,13 +70,12 @@ FEC::Input(fecPacket &pkt) {
         lastCheck = now;
     }
 
-
     // insertion
     auto n = this->rx.size() -1;
     int insertIdx = 0;
     for (int i=n;i>=0;i--) {
         if (pkt.seqid == rx[i].seqid) {
-            return recovered;
+            return;
         } else if (pkt.seqid > rx[i].seqid) {
             insertIdx = i + 1;
             break;
@@ -113,10 +104,10 @@ FEC::Input(fecPacket &pkt) {
         int numDataShard = 0;
         int first = 0;
         size_t maxlen = 0;
-
-        std::vector<row_type> shardVec(totalShards);
-        std::vector<bool> shardflag(totalShards, false);
-
+        static thread_local std::vector<row_type> shardVec(totalShards);
+        static thread_local std::vector<bool> shardflag(totalShards, false);
+        std::fill(shardVec.begin(), shardVec.end(), nullptr);
+        std::fill(shardflag.begin(), shardflag.end(), false);
         for (auto i = searchBegin; i <= searchEnd; i++) {
             auto seqid = rx[i].seqid;
             if (seqid > shardEnd) {
@@ -163,7 +154,7 @@ FEC::Input(fecPacket &pkt) {
         rx.erase(rx.begin());
     }
 
-    return recovered;
+    return;
 }
 
 
diff --git a/fec.h b/fec.h
index 361d4fe..cd838d4 100644
--- a/fec.h
+++ b/fec.h
@@ -35,13 +35,13 @@ class FEC {
     inline bool isEnabled() { return dataShards > 0 && parityShards > 0 ; }
 
     // Input a FEC packet, and return recovered data if possible.
-    std::vector<row_type> Input(fecPacket &pkt);
+    void Input(fecPacket &pkt, uint32_t now, std::vector<row_type>& recovered);
 
     // Calc Parity Shards
     void Encode(std::vector<row_type> &shards);
 
     // Decode a raw array into fecPacket
-    static fecPacket Decode(byte *data, size_t sz);
+    static fecPacket Decode(byte *data, size_t sz,  uint32_t ts);
 
     // Mark raw array as typeData, and write correct size.
     void MarkData(byte *data, uint16_t sz);
diff --git a/fec_test.cpp b/fec_test.cpp
index 2638625..83be9a7 100644
--- a/fec_test.cpp
+++ b/fec_test.cpp
@@ -61,7 +61,8 @@ int main() {
         } else {
             pkt.flag = typeFEC;
         }
-        auto recovered = fec.Input(pkt);
+        std::vector<row_type> recovered;
+        fec.Input(pkt, 0, recovered);
 
         if (recovered.size() > 0) {
             std::cout << "recovered:" << std::endl;
diff --git a/galois_noasm.cpp b/galois_noasm.cpp
index d82e267..9089f7b 100644
--- a/galois_noasm.cpp
+++ b/galois_noasm.cpp
@@ -3,18 +3,17 @@
 //
 
 #include "galois_noasm.h"
-#include "matrix.h"
 
 extern const byte mulTable[256][256];
 
-void galMulSlice(byte c, row_type in, row_type out) {
-    for (int n=0;n<in->size();n++) {
-        (*out)[n] = mulTable[c][(*in)[n]];
-    }
+void galMulSlice(byte c, byte* in, byte* out, int size) {
+  for (int n = 0; n < size; n++) {
+    out[n] = mulTable[c][in[n]];
+  }
 }
 
-void galMulSliceXor(byte c, row_type in, row_type out) {
-    for (int n=0;n<in->size();n++) {
-        (*out)[n] ^= mulTable[c][(*in)[n]];
-    }
-}
+void galMulSliceXor(byte c, byte* in, byte* out, int size) {
+  for (int n = 0; n < size; n++) {
+    out[n] ^= mulTable[c][in[n]];
+  }
+}
\ No newline at end of file
diff --git a/galois_noasm.h b/galois_noasm.h
index 42d5ebe..8348f6d 100644
--- a/galois_noasm.h
+++ b/galois_noasm.h
@@ -6,13 +6,12 @@
 #define KCP_GALOIS_NOASM_H
 
 #include "galois.h"
-#include "matrix.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
-void galMulSlice(byte c,  row_type in, row_type out);
-void galMulSliceXor(byte c, row_type in, row_type out);
+void galMulSlice(byte c,  byte* in, byte* out, int size);
+void galMulSliceXor(byte c, byte* in, byte* out, int size);
 
 #ifdef __cplusplus
 }
diff --git a/inversion_tree.cpp b/inversion_tree.cpp
index 71d8214..69a57ad 100644
--- a/inversion_tree.cpp
+++ b/inversion_tree.cpp
@@ -8,65 +8,65 @@
 inversionTree inversionTree::newInversionTree(int dataShards, int parityShards) {
     inversionTree tree;
     tree.m_root.m_children.resize(dataShards + parityShards, nullptr);
-    tree.m_root.m_matrix = matrix::identityMatrix(dataShards);
+    tree.m_root.m_matrix = std::make_shared<IdentityMatrix>(dataShards);
     return tree;
 }
 
 
-matrix
+MatrixPtr
 inversionTree::GetInvertedMatrix(std::vector<int> &invalidIndices) {
     if (invalidIndices.size() == 0) {
         return m_root.m_matrix;
     }
 
-    return m_root.getInvertedMatrix(invalidIndices, 0);
+    return m_root.getInvertedMatrix(invalidIndices, 0, 0);
 }
 
 int
-inversionTree::InsertInvertedMatrix(std::vector<int> &invalidIndices, matrix &matrix, int shards) {
+inversionTree::InsertInvertedMatrix(std::vector<int> &invalidIndices, MatrixPtr &matrix, int shards) {
     // If no invalid indices were given then we are done because the
     // m_root node is already set with the identity matrix.
     if (invalidIndices.size() == 0) {
         return -1;
     }
 
-    if (!matrix.IsSquare()) {
+    if (!matrix->IsSquare()) {
         return -2;
     }
 
     // Recursively create nodes for the inverted matrix in the tree until
     // we reach the node to insert the matrix to.  We start by passing in
     // 0 as the parent index as we start at the m_root of the tree.
-    m_root.insertInvertedMatrix(invalidIndices, matrix, shards, 0);
+    m_root.insertInvertedMatrix(invalidIndices, 0, matrix, shards, 0);
 
     return 0;
 }
 
-matrix
-inversionNode::getInvertedMatrix(std::vector<int> &invalidIndices, int parent) {
+MatrixPtr
+inversionNode::getInvertedMatrix(const std::vector<int> &invalidIndices, int index, int parent) {
     // Get the child node to search next from the list of m_children.  The
     // list of m_children starts relative to the parent index passed in
     // because the indices of invalid rows is sorted (by default).  As we
     // search recursively, the first invalid index gets popped off the list,
     // so when searching through the list of m_children, use that first invalid
     // index to find the child node.
-    int firstIndex = invalidIndices[0];
+    int firstIndex = invalidIndices[index];
     auto node = m_children[firstIndex - parent];
 
     // If the child node doesn't exist in the list yet, fail fast by
     // returning, so we can construct and insert the proper inverted matrix.
     if (node == nullptr) {
-        return matrix{};
+        return nullptr;
     }
 
     // If there's more than one invalid index left in the list we should
     // keep searching recursively.
-    if (invalidIndices.size() > 1) {
+    if (invalidIndices.size() - index> 1) {
         // Search recursively on the child node by passing in the invalid indices
         // with the first index popped off the front.  Also the parent index to
         // pass down is the first index plus one.
-        std::vector<int> v(invalidIndices.begin() + 1, invalidIndices.end());
-        return node->getInvertedMatrix(v, firstIndex + 1);
+        // no copy std::vector<int> v(invalidIndices.begin() + 1, invalidIndices.end());
+        return node->getInvertedMatrix(invalidIndices, index + 1, firstIndex + 1);
     }
 
     // If there aren't any more invalid indices to search, we've found our
@@ -79,8 +79,9 @@ inversionNode::getInvertedMatrix(std::vector<int> &invalidIndices, int parent) {
 
 void
 inversionNode::insertInvertedMatrix(
-        std::vector<int> &invalidIndices,
-        struct matrix &matrix,
+        const std::vector<int> &invalidIndices,
+        int index,
+        MatrixPtr &matrix,
         int shards,
         int parent) {
     // As above, get the child node to search next from the list of m_children.
@@ -89,7 +90,7 @@ inversionNode::insertInvertedMatrix(
     // search recursively, the first invalid index gets popped off the list,
     // so when searching through the list of m_children, use that first invalid
     // index to find the child node.
-    int firstIndex = invalidIndices[0];
+    int firstIndex = invalidIndices[index];
     auto node = m_children[firstIndex - parent];
 
     // If the child node doesn't exist in the list yet, create a new
@@ -109,13 +110,13 @@ inversionNode::insertInvertedMatrix(
     // If there's more than one invalid index left in the list we should
     // keep searching recursively in order to find the node to add our
     // matrix.
-    if (invalidIndices.size() > 1) {
+    if (invalidIndices.size() - index > 1) {
         // As above, search recursively on the child node by passing in
         // the invalid indices with the first index popped off the front.
         // Also the total number of shards and parent index are passed down
         // which is equal to the first index plus one.
-        std::vector<int> v(invalidIndices.begin() + 1, invalidIndices.end());
-        node->insertInvertedMatrix(v, matrix, shards, firstIndex + 1);
+        // no copy std::vector<int> v(invalidIndices.begin() + 1, invalidIndices.end());
+        node->insertInvertedMatrix(invalidIndices, index + 1, matrix, shards, firstIndex + 1);
     } else {
         node->m_matrix = matrix;
     }
diff --git a/inversion_tree.h b/inversion_tree.h
index 47cc9e3..fb8258b 100644
--- a/inversion_tree.h
+++ b/inversion_tree.h
@@ -9,11 +9,11 @@
 #include "matrix.h"
 
 struct inversionNode {
-    struct matrix m_matrix;
+    MatrixPtr m_matrix;
     std::vector<std::shared_ptr<inversionNode>> m_children;
-    struct matrix getInvertedMatrix(std::vector<int> & invalidIndices, int parent);
+    MatrixPtr getInvertedMatrix(const std::vector<int> & invalidIndices, int index, int parent);
 
-    void insertInvertedMatrix(std::vector<int> &invalidIndices, struct matrix &matrix, int shards, int parent);
+    void insertInvertedMatrix(const std::vector<int> &invalidIndices, int index, MatrixPtr &matrix, int shards, int parent);
 };
 
 class inversionTree {
@@ -25,13 +25,13 @@ class inversionTree {
 
     // GetInvertedMatrix returns the cached inverted matrix or nil if it
     // is not found in the tree keyed on the indices of invalid rows.
-    matrix GetInvertedMatrix(std::vector<int> & invalidIndices);
+    MatrixPtr GetInvertedMatrix(std::vector<int> & invalidIndices);
 
     // InsertInvertedMatrix inserts a new inverted matrix into the tree
     // keyed by the indices of invalid rows.  The total number of shards
     // is required for creating the proper length lists of child nodes for
     // each node.
-    int InsertInvertedMatrix(std::vector<int> & invalidIndices, struct matrix &matrix, int shards);
+    int InsertInvertedMatrix(std::vector<int> & invalidIndices, MatrixPtr &matrix, int shards);
 
 private:
     inversionNode m_root;
diff --git a/matrix.cpp b/matrix.cpp
index f94864c..81d15f9 100644
--- a/matrix.cpp
+++ b/matrix.cpp
@@ -2,180 +2,131 @@
 // Created by 理 傅 on 2016/12/30.
 //
 
+#include <stdexcept>
 #include "galois.h"
 #include "matrix.h"
-#include <stdexcept>
 
-matrix
-matrix::newMatrix(int rows, int cols) {
-    if (rows <= 0 || cols <= 0) {
-        throw std::invalid_argument("invalid arguments");
+int Matrix::GaussianElimination() {
+  auto rows = this->rows;
+  auto columns = this->cols;
+  // Clear out the part below the main diagonal and scale the main
+  // diagonal to be 1.
+  for (int r = 0; r < rows; r++) {
+    // If the element on the diagonal is 0, find a row below
+    // that has a non-zero and swap them.
+    if (at(r, r) == 0) {
+      for (int rowBelow = r + 1; rowBelow < rows; rowBelow++) {
+        if (at(rowBelow, r) != 0) {
+          this->SwapRows(r, rowBelow);
+          break;
+        }
+      }
     }
 
-    matrix m;
-    m.rows = rows;
-    m.cols = cols;
-    m.data.resize(rows, nullptr);
-    for (auto i = 0; i < rows; i++) {
-        m.data[i] = std::make_shared<std::vector<byte>>(cols);
+    // If we couldn't find one, the matrix is singular.
+    if (at(r, r) == 0) {
+      return -1;
     }
-    return m;
-}
 
-matrix
-matrix::identityMatrix(int size) {
-    matrix m = matrix::newMatrix(size, size);
-    for (int i = 0; i < size; i++) {
-        m.at(i, i) = 1;
+    // Scale to 1.
+    if (at(r, r) != 1) {
+      byte scale = galDivide(1, at(r, r));
+      for (int c = 0; c < columns; c++) {
+        at(r, c) = galMultiply(at(r, c), scale);
+      }
     }
-    return m;
-}
-
-
-matrix
-matrix::Multiply(matrix &right) {
-    if (cols != right.rows) {
-        return matrix{};
-    }
-
-    matrix result = matrix::newMatrix(rows, right.cols);
 
-    for (int r = 0; r < result.rows; r++) {
-        for (int c = 0; c < result.cols; c++) {
-            byte value = 0;
-            for (int i = 0; i < this->cols; i++) {
-                value ^= galMultiply(at(r, i), right.at(i, c));
-            }
-            result.at(r, c) = value;
+    // Make everything below the 1 be a 0 by subtracting
+    // a multiple of it.  (Subtraction and addition are
+    // both exclusive or in the Galois field.)
+    for (int rowBelow = r + 1; rowBelow < rows; rowBelow++) {
+      if (at(rowBelow, r) != 0) {
+        byte scale = at(rowBelow, r);
+        for (int c = 0; c < columns; c++) {
+          at(rowBelow, c) ^= galMultiply(scale, at(r, c));
         }
+      }
     }
-    return result;
-}
-
-matrix
-matrix::Augment(matrix &right) {
-    matrix result = matrix::newMatrix(this->rows, this->cols + right.cols);
-
-    for (int r = 0; r < this->rows; r++) {
-        for (int c = 0; c < this->cols; c++) {
-            result.at(r, c) = at(r, c);
-        }
-        auto cols = this->cols;
-        for (int c = 0; c < right.cols; c++) {
-            result.at(r, cols + c) = right.at(r, c);
+  }
+
+  // Now clear the part above the main diagonal.
+  for (int d = 0; d < rows; d++) {
+    for (int rowAbove = 0; rowAbove < d; rowAbove++) {
+      if (at(rowAbove, d) != 0) {
+        byte scale = at(rowAbove, d);
+        for (int c = 0; c < columns; c++) {
+          at(rowAbove, c) ^= galMultiply(scale, at(d, c));
         }
+      }
     }
-    return result;
+  }
+  return 0;
 }
 
-matrix
-matrix::SubMatrix(int rmin, int cmin, int rmax, int cmax) {
-    matrix result = matrix::newMatrix(rmax - rmin, cmax - cmin);
-    for (int r = rmin; r < rmax; r++) {
-        for (int c = cmin; c < cmax; c++) {
-            result.at(r - rmin, c - cmin) = at(r, c);
-        }
-    }
-    return result;
-}
+int Matrix::SwapRows(int r1, int r2) {
+  if (r1 < 0 || rows <= r1 || r2 < 0 || rows <= r2) {
+    return -1;
+  }
 
-// SwapRows Exchanges two rows in the matrix.
-int
-matrix::SwapRows(int r1, int r2) {
-    if (r1 < 0 || rows <= r1 || r2 < 0 || rows <= r2) {
-        return -1;
-    }
-
-    std::swap(data[r1], data[r2]);
-    return 0;
+  std::swap(data[r1], data[r2]);
+  return 0;
 }
 
-bool
-matrix::IsSquare() {
-    return this->rows == this->cols;
-}
+MatrixPtr Multiply(const MatrixPtr& left, const MatrixPtr& right) {
+  if (left->cols != right->rows) {
+    return nullptr;
+  }
 
-matrix
-matrix::Invert() {
-    if (!IsSquare()) {
-        return matrix{};
-    }
-    auto work = matrix::identityMatrix(rows);
-    work = matrix::Augment(work);
+  auto result = std::make_shared<Matrix>(left->rows, right->cols);
 
-    auto ret = work.gaussianElimination();
-    if (ret != 0) {
-        return matrix{};
+  for (int r = 0; r < result->rows; r++) {
+    for (int c = 0; c < result->cols; c++) {
+      byte value = 0;
+      for (int i = 0; i < left->cols; i++) {
+        value ^= galMultiply(left->at(r, i), right->at(i, c));
+      }
+      result->at(r, c) = value;
     }
-
-    return work.SubMatrix(0, rows, rows, rows * 2);
+  }
+  return result;
 }
 
-int
-matrix::gaussianElimination() {
-    auto rows = this->rows;
-    auto columns = this->cols;
-    // Clear out the part below the main diagonal and scale the main
-    // diagonal to be 1.
-    for (int r = 0; r < rows; r++) {
-        // If the element on the diagonal is 0, find a row below
-        // that has a non-zero and swap them.
-        if (at(r, r) == 0) {
-            for (int rowBelow = r + 1; rowBelow < rows; rowBelow++) {
-                if (at(rowBelow, r) != 0) {
-                    this->SwapRows(r, rowBelow);
-                    break;
-                }
-            }
-        }
-
-        // If we couldn't find one, the matrix is singular.
-        if (at(r, r) == 0) {
-            return -1;
-        }
-
-        // Scale to 1.
-        if (at(r, r) != 1) {
-            byte scale = galDivide(1, at(r, r));
-            for (int c = 0; c < columns; c++) {
-                at(r, c) = galMultiply(at(r, c), scale);
-            }
-        }
+MatrixPtr Augment(const MatrixPtr& left, const MatrixPtr& right) {
+  auto result = std::make_shared<Matrix>(left->rows, left->cols + right->cols);
 
-        // Make everything below the 1 be a 0 by subtracting
-        // a multiple of it.  (Subtraction and addition are
-        // both exclusive or in the Galois field.)
-        for (int rowBelow = r + 1; rowBelow < rows; rowBelow++) {
-            if (at(rowBelow, r) != 0) {
-                byte scale = at(rowBelow, r);
-                for (int c = 0; c < columns; c++) {
-                    at(rowBelow, c) ^= galMultiply(scale, at(r, c));
-                }
-            }
-        }
+  for (int r = 0; r < left->rows; r++) {
+    for (int c = 0; c < left->cols; c++) {
+      result->at(r, c) = left->at(r, c);
     }
-
-    // Now clear the part above the main diagonal.
-    for (int d = 0; d < rows; d++) {
-        for (int rowAbove = 0; rowAbove < d; rowAbove++) {
-            if (at(rowAbove, d) != 0) {
-                byte scale = at(rowAbove, d);
-                for (int c = 0; c < columns; c++) {
-                    at(rowAbove, c) ^= galMultiply(scale, at(d, c));
-                }
-            }
-        }
+    auto cols = left->cols;
+    for (int c = 0; c < right->cols; c++) {
+      result->at(r, cols + c) = right->at(r, c);
     }
-    return 0;
+  }
+  return result;
 }
 
-matrix
-matrix::vandermonde(int rows, int cols) {
-    matrix result = matrix::newMatrix(rows, cols);
-    for (int r = 0; r < rows; r++) {
-        for (int c = 0; c < cols; c++) {
-            result.at(r, c) = galExp(byte(r), byte(c));
-        }
+MatrixPtr SubMatrix(const MatrixPtr& mp, int rmin, int cmin, int rmax, int cmax) {
+  auto result = std::make_shared<Matrix>(rmax - rmin, cmax - cmin);
+  for (int r = rmin; r < rmax; r++) {
+    for (int c = cmin; c < cmax; c++) {
+      result->at(r - rmin, c - cmin) = mp->at(r, c);
     }
-    return result;
+  }
+  return result;
 }
+
+MatrixPtr Invert(const MatrixPtr& mp) {
+  if (!mp->IsSquare()) {
+    return nullptr;
+  }
+  std::shared_ptr<Matrix> work = std::make_shared<IdentityMatrix>(mp->rows);
+  work = Augment(mp, work);
+
+  auto ret = work->GaussianElimination();
+  if (ret != 0) {
+    return nullptr;
+  }
+
+  return SubMatrix(work, 0, mp->rows, mp->rows, mp->rows * 2);
+}
\ No newline at end of file
diff --git a/matrix.h b/matrix.h
index deffd52..9380185 100644
--- a/matrix.h
+++ b/matrix.h
@@ -9,51 +9,74 @@
 #include <memory>
 #include "galois.h"
 
-using row_type = std::shared_ptr<std::vector<byte>>;
+// newMatrix returns a matrix of zeros.
+class Matrix {
+ public:
+  Matrix(int r, int c) : rows(r), cols(c) {
+    data = new byte*[r];
+    for (int i = 0; i < r; i++) data[i] = new byte[c]{0};
+  }
 
-struct matrix {
-    // newMatrix returns a matrix of zeros.
-    static matrix newMatrix(int rows, int cols);
+  ~Matrix() {
+    for (int i = 0; i < rows; i++) delete[] data[i];
+    delete[] data;
+    data = nullptr;
+  }
 
-    // IdentityMatrix returns an identity matrix of the given empty.
-    static matrix identityMatrix(int size);
+  inline byte& at(int row, int col) { return data[row][col]; }
 
-    // Create a Vandermonde matrix, which is guaranteed to have the
-    // property that any subset of rows that forms a square matrix
-    // is invertible.
-    static matrix vandermonde(int rows, int cols);
+  inline bool IsSquare() { return this->rows == this->cols; }
 
-    // Multiply multiplies this matrix (the one on the left) by another
-    // matrix (the one on the right) and returns a new matrix with the result.
-    matrix Multiply(matrix &right);
+  // SwapRows Exchanges two rows in the matrix.
+  int SwapRows(int r1, int r2);
 
-    // Augment returns the concatenation of this matrix and the matrix on the right.
-    matrix Augment(matrix &right);
+  //  Gaussian elimination (also known as row reduction)
+  int GaussianElimination();
 
-    // Returns a part of this matrix. Data is copied.
-    matrix SubMatrix(int rmin, int cmin, int rmax, int cmax);
+  int rows{0}, cols{0};
 
-    // IsSquare will return true if the matrix is square
-    bool IsSquare();
+  byte** data;
+};
 
-    // SwapRows Exchanges two rows in the matrix.
-    int SwapRows(int r1, int r2);
+// IdentityMatrix returns an identity matrix of the given empty.
+class IdentityMatrix : public Matrix {
+ public:
+  IdentityMatrix(int size) : Matrix(size, size) {
+    for (int i = 0; i < size; i++) {
+      at(i, i) = 1;
+    }
+  }
+};
 
-    // Invert returns the inverse of this matrix.
-    // Returns ErrSingular when the matrix is singular and doesn't have an inverse.
-    // The matrix must be square, otherwise ErrNotSquare is returned.
-    matrix Invert();
+// Create a Vandermonde matrix, which is guaranteed to have the
+// property that any subset of rows that forms a square matrix
+// is invertible.
+class VandermondeMatrix : public Matrix {
+ public:
+  VandermondeMatrix(int rows, int cols) : Matrix(rows, cols) {
+    for (int r = 0; r < rows; r++) {
+      for (int c = 0; c < cols; c++) {
+        at(r, c) = galExp(byte(r), byte(c));
+      }
+    }
+  }
+};
 
-    //  Gaussian elimination (also known as row reduction)
-    int gaussianElimination();
+using MatrixPtr = std::shared_ptr<Matrix>;
 
-    std::vector<row_type> data;
-    int rows{0}, cols{0};
+// Multiply multiplies this matrix (the one on the left) by another
+// matrix (the one on the right) and returns a new matrix with the result.
+MatrixPtr Multiply(const MatrixPtr& left, const MatrixPtr& right);
 
-    inline byte &at(int row, int col) { return (*(data[row]))[col]; }
+// Augment returns the concatenation of this matrix and the matrix on the right.
+MatrixPtr Augment(const MatrixPtr& left, const MatrixPtr& right);
 
-    inline bool empty() { return (rows == 0 || cols == 0); }
-};
+// Returns a part of this matrix. Data is copied.
+MatrixPtr SubMatrix(const MatrixPtr& mp, int rmin, int cmin, int rmax, int cmax);
 
+// Invert returns the inverse of this matrix.
+// Returns ErrSingular when the matrix is singular and doesn't have an inverse.
+// The matrix must be square, otherwise ErrNotSquare is returned.
+MatrixPtr Invert(const MatrixPtr& mp);
 
 #endif //KCP_MATRIX_H
diff --git a/reedsolomon.cpp b/reedsolomon.cpp
index ab32cb2..b50f18c 100644
--- a/reedsolomon.cpp
+++ b/reedsolomon.cpp
@@ -30,15 +30,15 @@ ReedSolomon::New(int dataShards, int parityShards) {
     // Start with a Vandermonde matrix.  This matrix would work,
     // in theory, but doesn't have the property that the data
     // shards are unchanged after encoding.
-    matrix vm = matrix::vandermonde(r.m_totalShards, r.m_dataShards);
+    MatrixPtr vm = std::make_shared<VandermondeMatrix>(r.m_totalShards, r.m_dataShards);
 
     // Multiply by the inverse of the top square of the matrix.
     // This will make the top square be the identity matrix, but
     // preserve the property that any square subset of rows  is
     // invertible.
-    auto top = vm.SubMatrix(0, 0, dataShards, dataShards);
-    top = top.Invert();
-    r.m = vm.Multiply(top);
+    auto top = SubMatrix(vm, 0, 0, dataShards, dataShards);
+    top = Invert(top);
+    r.m = Multiply(vm, top);
 
     // Inverted matrices are cached in a tree keyed by the indices
     // of the invalid rows of the data to reconstruct.
@@ -47,9 +47,10 @@ ReedSolomon::New(int dataShards, int parityShards) {
     // with the original data.
     r.tree = inversionTree::newInversionTree(dataShards, parityShards);
 
-    r.parity = std::vector<row_type>(parityShards);
+    r.parity = std::vector<byte *>(parityShards);
     for (int i = 0; i < parityShards; i++) {
-        r.parity[i] = r.m.data[dataShards + i];
+      r.parity[i] = r.m->data[dataShards + i];
+          // std::make_shared<std::vector<byte>>(r.m->data[dataShards + i], r.m->data[dataShards + i] + r.m->cols);
     }
     return r;
 }
@@ -63,24 +64,31 @@ ReedSolomon::Encode(std::vector<row_type> &shards) {
     checkShards(shards, false);
 
     // Get the slice of output buffers.
-    std::vector<row_type> output(shards.begin() + m_dataShards, shards.end());
+    static thread_local std::vector<byte*> output(shards.size() - m_dataShards);
+    std::fill(output.begin(), output.end(), nullptr);
+    for (int i = m_dataShards; i < shards.size(); i++) output[i-m_dataShards] = shards[i]->data();
+
+    // Get the slice of input buffers.
+    static thread_local std::vector<byte*> input(m_dataShards);
+    std::fill(input.begin(), input.end(), nullptr);
+    for (int i = 0; i < m_dataShards; i++) input[i] = shards[i]->data();
 
     // Do the coding.
-    std::vector<row_type> input(shards.begin(), shards.begin() + m_dataShards);
-    codeSomeShards(parity, input, output, m_parityShards);
+    auto indata_size = shards[0]->size();
+    codeSomeShards(parity, input, indata_size, output, m_parityShards);
 };
 
 
 void
-ReedSolomon::codeSomeShards(std::vector<row_type> &matrixRows, std::vector<row_type> &inputs,
-                            std::vector<row_type> &outputs, int outputCount) {
+ReedSolomon::codeSomeShards(std::vector<byte*> &matrixRows, std::vector<byte*> &inputs, int data_size,
+                            std::vector<byte*> &outputs, int outputCount) {
     for (int c = 0; c < m_dataShards; c++) {
         auto in = inputs[c];
-        for (int iRow = 0; iRow < outputCount; iRow++) {
+        for (int r = 0; r < outputCount; r++) {
             if (c == 0) {
-                galMulSlice((*matrixRows[iRow])[c], in, outputs[iRow]);
+                galMulSlice(matrixRows[r][c], in, outputs[r], data_size);
             } else {
-                galMulSliceXor((*matrixRows[iRow])[c], in, outputs[iRow]);
+                galMulSliceXor(matrixRows[r][c], in, outputs[r], data_size);
             }
         }
     }
@@ -124,14 +132,18 @@ ReedSolomon::Reconstruct(std::vector<row_type> &shards) {
     //
     // Also, create an array of indices of the valid rows we do have
     // and the invalid rows we don't have up until we have enough valid rows.
-    std::vector<row_type> subShards(m_dataShards);
-    std::vector<int> validIndices(m_dataShards, 0);
-    std::vector<int> invalidIndices;
+    static thread_local std::vector<byte*> subShards(m_dataShards);
+    static thread_local std::vector<int> validIndices(m_dataShards, 0);
+    static thread_local std::vector<int> invalidIndices;
+
+    // clean
+    std::fill(subShards.begin(), subShards.end(), nullptr);
+    std::fill(validIndices.begin(), validIndices.end(), 0);
+    invalidIndices.clear();
     int subMatrixRow = 0;
-
     for (int matrixRow = 0; matrixRow < m_totalShards && subMatrixRow < m_dataShards; matrixRow++) {
         if (shards[matrixRow] != nullptr) {
-            subShards[subMatrixRow] = shards[matrixRow];
+            subShards[subMatrixRow] = shards[matrixRow]->data();
             validIndices[subMatrixRow] = matrixRow;
             subMatrixRow++;
         } else {
@@ -146,15 +158,15 @@ ReedSolomon::Reconstruct(std::vector<row_type> &shards) {
     // If the inverted matrix isn't cached in the tree yet we must
     // construct it ourselves and insert it into the tree for the
     // future.  In this way the inversion tree is lazily loaded.
-    if (dataDecodeMatrix.empty()) {
+    if (dataDecodeMatrix == nullptr) {
         // Pull out the rows of the matrix that correspond to the
         // shards that we have and build a square matrix.  This
         // matrix could be used to generate the shards that we have
         // from the original data.
-        auto subMatrix = matrix::newMatrix(m_dataShards, m_dataShards);
+        auto subMatrix = std::make_shared<Matrix>(m_dataShards, m_dataShards);
         for (subMatrixRow = 0; subMatrixRow < validIndices.size(); subMatrixRow++) {
             for (int c = 0; c < m_dataShards; c++) {
-                subMatrix.at(subMatrixRow, c) = m.at(validIndices[subMatrixRow], c);
+                subMatrix->at(subMatrixRow, c) = m->at(validIndices[subMatrixRow], c);
             };
         }
 
@@ -163,8 +175,8 @@ ReedSolomon::Reconstruct(std::vector<row_type> &shards) {
         // generates the shard that we want to Decode.  Note that
         // since this matrix maps back to the original data, it can
         // be used to create a data shard, but not a parity shard.
-        dataDecodeMatrix = subMatrix.Invert();
-        if (dataDecodeMatrix.empty()) {
+        dataDecodeMatrix = Invert(subMatrix);
+        if (dataDecodeMatrix == nullptr) {
             throw std::runtime_error("cannot get matrix invert");
         }
 
@@ -181,36 +193,43 @@ ReedSolomon::Reconstruct(std::vector<row_type> &shards) {
     // The Input to the coding is all of the shards we actually
     // have, and the output is the missing data shards.  The computation
     // is done using the special Decode matrix we just built.
-    std::vector<row_type> outputs(m_parityShards);
-    std::vector<row_type> matrixRows(m_parityShards);
+    static thread_local std::vector<byte*> outputs(m_parityShards);
+    static thread_local std::vector<byte*> matrixRows(m_parityShards);
+    // clean
+    std::fill(outputs.begin(), outputs.end(), nullptr);
+    std::fill(matrixRows.begin(), matrixRows.end(), nullptr);
     int outputCount = 0;
 
     for (int iShard = 0; iShard < m_dataShards; iShard++) {
         if (shards[iShard] == nullptr) {
             shards[iShard] = std::make_shared<std::vector<byte>>(shardSize);
-            outputs[outputCount] = shards[iShard];
-            matrixRows[outputCount] = dataDecodeMatrix.data[iShard];
+            outputs[outputCount] = shards[iShard]->data();
+            matrixRows[outputCount] = dataDecodeMatrix->data[iShard];
             outputCount++;
         }
     }
-    codeSomeShards(matrixRows, subShards, outputs, outputCount);
+
+    auto indata_size = shards[0]->size();
+    codeSomeShards(matrixRows, subShards, indata_size, outputs, outputCount);
 
     // Now that we have all of the data shards intact, we can
     // compute any of the parity that is missing.
     //
     // The Input to the coding is ALL of the data shards, including
     // any that we just calculated.  The output is whichever of the
-    // data shards were missing.
+    // data shards were missing. 
+
     outputCount = 0;
+    for (int iShard = 0; iShard < m_dataShards; iShard++) subShards[iShard] = shards[iShard]->data();
     for (int iShard = m_dataShards; iShard < m_totalShards; iShard++) {
         if (shards[iShard] == nullptr) {
             shards[iShard] = std::make_shared<std::vector<byte>>(shardSize);
-            outputs[outputCount] = shards[iShard];
+            outputs[outputCount] = shards[iShard]->data();
             matrixRows[outputCount] = parity[iShard - m_dataShards];
             outputCount++;
         }
     }
-    codeSomeShards(matrixRows, shards, outputs, outputCount);
+    codeSomeShards(matrixRows, subShards, indata_size, outputs, outputCount);
 }
 
 void
diff --git a/reedsolomon.h b/reedsolomon.h
index 65fe72e..f2d9157 100644
--- a/reedsolomon.h
+++ b/reedsolomon.h
@@ -9,6 +9,8 @@
 #include "inversion_tree.h"
 #include "galois.h"
 
+using row_type = std::shared_ptr<std::vector<byte>>;
+
 class ReedSolomon {
 public:
     ReedSolomon() = default;
@@ -49,9 +51,9 @@ class ReedSolomon {
     int m_parityShards; // Number of parity shards, should not be modified.
     int m_totalShards; // Total number of shards. Calculated, and should not be modified.
 
-    matrix m;
+    MatrixPtr m;
     inversionTree tree;
-    std::vector<row_type> parity;
+    std::vector<byte*> parity;
 
     int shardSize(std::vector<row_type> &shards);
 
@@ -66,7 +68,7 @@ class ReedSolomon {
     // number of matrix rows used, is determined by
     // outputCount, which is the number of outputs to compute.
     void
-    codeSomeShards(std::vector<row_type> &matrixRows, std::vector<row_type> &inputs, std::vector<row_type> &outputs,
+    codeSomeShards(std::vector<byte*> &matrixRows, std::vector<byte*> &inputs, int data_size, std::vector<byte*> &outputs,
                    int outputCount);
 
     // checkShards will check if shards are the same size
diff --git a/sess.cpp b/sess.cpp
index f5230f2..e45de21 100644
--- a/sess.cpp
+++ b/sess.cpp
@@ -99,7 +99,7 @@ UDPSession::Update(uint32_t current) noexcept {
         if (n > 0) {
             if (fec.isEnabled()) {
                 // decode FEC packet
-                auto pkt = fec.Decode(m_buf, static_cast<size_t>(n));
+                auto pkt = fec.Decode(m_buf, static_cast<size_t>(n), current);
                 if (pkt.flag == typeData) {
                     auto ptr = pkt.data->data();
                     // we have 2B size, ignore for typeData
@@ -109,7 +109,9 @@ UDPSession::Update(uint32_t current) noexcept {
                 // allow FEC packet processing with correct flags.
                 if (pkt.flag == typeData || pkt.flag == typeFEC) {
                     // input to FEC, and see if we can recover data.
-                    auto recovered = fec.Input(pkt);
+                    static thread_local std::vector<row_type> recovered;
+                    recovered.clear();
+                    fec.Input(pkt, current, recovered);
 
                     // we have some data recovered.
                     for (auto &r : recovered) {
diff --git a/sess.h b/sess.h
index f856fce..3db5f0d 100644
--- a/sess.h
+++ b/sess.h
@@ -4,7 +4,6 @@
 #include "ikcp.h"
 #include "fec.h"
 #include <sys/types.h>
-#include <sys/time.h>
 
 class UDPSession  {
 private:
@@ -76,11 +75,4 @@ class UDPSession  {
 
 };
 
-inline uint32_t currentMs() {
-    struct timeval time;
-    gettimeofday(&time, NULL);
-    return uint32_t((time.tv_sec * 1000) + (time.tv_usec / 1000));
-}
-
-
 #endif //KCP_SESS_H