diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml
index fcb5beb..781c7f5 100644
--- a/.github/workflows/ccpp.yml
+++ b/.github/workflows/ccpp.yml
@@ -265,6 +265,8 @@ jobs:
           # Bparser dependency
           sudo apt-get install -y libboost-all-dev
 
+          # Eigen dependency (-y: the CI runner cannot answer apt's confirmation prompt)
+          sudo apt-get install -y libeigen3-dev
           
           
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 627b7e3..ae5cbef 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,12 @@ option(SANITIZER_ON "Whether to use AddressSanitizer (asan) in the DEBUG configu
 
 message(STATUS "CMakeLists.txt - BParser")
 
+include(FetchContent)
+
+if(POLICY CMP0135)
+    cmake_policy(SET CMP0135 NEW)
+endif()
+
 # CLANG
 
 #set(CMAKE_CXX_FLAGS     "-std=c++14 -finline-hint-functions -pedantic-errors -Werror=pedantic -Wall -Wextra -Werror -Wno-long-long -Wno-strict-aliasing -DBOOST_PHOENIX_NO_VARIADIC_EXPRESSION")
@@ -160,7 +166,7 @@ if (NOT Boost_FOUND)
   if (NOT EXTERNAL_PROJECT_DIR)
    unset(BOOST_ROOT)
   endif()
-  find_package( Boost 1.58.0 REQUIRED)
+  find_package( Boost 1.70.0 REQUIRED) #since Boost 1.70.0 it should be find_package( Boost CONFIG REQUIRED)
 endif()
 
 message(STATUS "-------------------------------------------------------")
@@ -169,11 +175,31 @@ message(STATUS "BOOST_ROOT = ${BOOST_ROOT}")
 message(STATUS "Boost_LIBRARIES = ${Boost_LIBRARIES}")
 message(STATUS "Boost_LIBRARY_DIRS = ${Boost_LIBRARY_DIRS}")
 message(STATUS "Boost_INCLUDE_DIR = ${Boost_INCLUDE_DIR}")
+message(STATUS "=======================================================\n")
+
+#Eigen
+message(STATUS "=======================================================")
+message(STATUS "====== EIGEN ==========================================")
+message(STATUS "=======================================================")
+
+#find_package(Eigen3 CONFIG REQUIRED) #Remember to install Eigen for this to work. See eigen-3.x.x/INSTALL
+FetchContent_Declare(
+    Eigen3
+    URL https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.gz
+    EXCLUDE_FROM_ALL
+    FIND_PACKAGE_ARGS CONFIG #same as find_package(Eigen3 CONFIG)
+)
+FetchContent_MakeAvailable(Eigen3)
+
+message(STATUS "-------------------------------------------------------")
+message(STATUS "EIGEN_ROOT = ${EIGEN_ROOT}")
+message(STATUS "Eigen3_DIR = ${Eigen3_DIR}")
+message(STATUS "EIGEN3_INCLUDE_DIR = ${EIGEN3_INCLUDE_DIR}")
 message(STATUS "=======================================================\n\n")
 
 message(STATUS "VCL2_INCLUDE_DIR = ${CMAKE_CURRENT_SOURCE_DIR}/third_party/VCL_v2")
 
-set(BPARSER_INCLUDES   ${CMAKE_CURRENT_SOURCE_DIR}/include     ${Boost_INCLUDE_DIR}     ${CMAKE_CURRENT_SOURCE_DIR}/third_party/VCL_v2)
+set(BPARSER_INCLUDES   ${CMAKE_CURRENT_SOURCE_DIR}/include     ${Boost_INCLUDE_DIR}     ${CMAKE_CURRENT_SOURCE_DIR}/third_party/VCL_v2      ${EIGEN3_INCLUDE_DIR})
 if(NOT PROJECT_IS_TOP_LEVEL)
     set(BPARSER_INCLUDES ${BPARSER_INCLUDES} PARENT_SCOPE)
 endif()
@@ -198,6 +224,8 @@ add_library(bparser SHARED
     ${CMAKE_CURRENT_SOURCE_DIR}/include/processor_AVX512.cc
     ${CMAKE_CURRENT_SOURCE_DIR}/include/processor_double.cc
 )
+target_link_libraries(bparser Eigen3::Eigen) # header-only interface target: only propagates Eigen's include directories
+target_compile_definitions(bparser PRIVATE BPARSER_DLL) # defined only while building the library itself, so EXPORT becomes dllexport there and dllimport for consumers
 
 
 
@@ -259,6 +287,6 @@ endmacro()
 define_test(test_parser bparser)
 define_test(test_array)
 define_test(test_grammar bparser)
-define_test(test_processor)
+define_test(test_processor bparser) #is it broken? -LV
 define_test(test_speed bparser)  
 define_test(test_simd)
diff --git a/include/arena_alloc.hh b/include/arena_alloc.hh
index 31196ab..0bde0b6 100644
--- a/include/arena_alloc.hh
+++ b/include/arena_alloc.hh
@@ -12,6 +12,7 @@
 #include <utility>
 #include <malloc.h>
 #include "aligned_alloc.hh"
+#include "arena_resource.hh"
 
 namespace bparser {
 
@@ -21,55 +22,86 @@ inline size_t align_size(size_t al, size_t size) {
 }
 
 struct ArenaAlloc {
+	
+
+	// Non-owning wrapper around an existing PatchArena, kept for backwards compatibility with BParser.
+	// buffer stays nullptr, so destroy() leaves existing_arena untouched.
+	ArenaAlloc(PatchArena& existing_arena)
+	: arena(&existing_arena), buffer(nullptr) {}
+	//Creates a wrapper that owns a new PatchArena (and its buffer) with the specified memory alignment and size
+	//However AssemblyArena might be the correct class to create 
 	ArenaAlloc(std::size_t alignment, std::size_t size)
-	: alignment_(alignment),
-	  size_(0)
+	: arena(nullptr),
+	  buffer(nullptr)
 	{
-		size_ = align_size(alignment_, size);
+		const size_t aligned_size = align_size(alignment, size); // PatchArena asserts size % alignment == 0
+		buffer = align_alloc(alignment, aligned_size);
+		BP_ASSERT(buffer != nullptr); // same check the legacy code below performed on base_
+		arena = new PatchArena(buffer, aligned_size, alignment); /* legacy setup, kept for reference:
 		base_ = (char*)align_alloc(alignment_, size_);
 		BP_ASSERT(base_ != nullptr);
 		ptr_ = base_;
 		//std::cout << "arena begin: " << (void *)base_ << " end: " << end() << std::endl;
+		*/
 	}
 	
 	~ArenaAlloc() {
         destroy();
     }
 
-	void destroy() {
-		align_free(base_);
+	// Releases the owned arena and buffer; a no-op for the non-owning wrapper and on repeated calls.
+	inline void destroy() {
+		if (buffer != nullptr) {
+			delete arena; arena = nullptr;        // destroy the resource before the memory it manages
+			align_free(buffer); buffer = nullptr; // cleared so the destructor's own destroy() call does nothing
+		}
 	}
 
-	void *end() {
+	/*void* end() {
 		return base_ + size_;
-	}
+	}*/
 
-	void * allocate(std::size_t size) {
+	inline void* allocate(std::size_t size) {
+		/* legacy bump allocator, kept for reference:
 		size = align_size(alignment_, size);
 		void * ptr = ptr_;
 		ptr_ += size;
 		BP_ASSERT(ptr_ <= end());
 		//std::cout << "allocated: " << ptr << " end: " << (void *)ptr_ << " aend: " << end() << "\n";
 		return ptr;
+		*/
+		// Ask for the arena's SIMD alignment explicitly: memory_resource::allocate(size) alone only guarantees alignof(std::max_align_t).
+		return arena->allocate_simd<char>(size);
 	}
 
 	template <class T, typename... Args>
-	T * create(Args&&... args) {
+	T* create(Args&&... args) {
+		
 		void * ptr = allocate(sizeof(T));
 		return new (ptr) T(std::forward<Args>(args)...);
+		
 	}
 
 	template <class T>
-	T * create_array(uint n_items) {
+	T* create_array(uint n_items) {
+		/*
 		void * ptr = allocate(sizeof(T) * n_items);
 		return new (ptr) T[n_items];
+		*/
+		return arena->allocate_simd<T>(n_items);
 	}
 
-
-	std::size_t alignment_;
-	std::size_t size_;
-	char * base_;
-	char * ptr_;
+	inline std::size_t get_size() const {
+		return arena->get_size();
+	}
+	
+	//std::size_t alignment_;
+	//std::size_t size_;
+	//char * base_;
+	//char * ptr_;
+protected:
+	PatchArena* arena;
+	void* buffer;
 };
 
 } // namespace bparser
diff --git a/include/arena_resource.hh b/include/arena_resource.hh
new file mode 100644
index 0000000..a01c847
--- /dev/null
+++ b/include/arena_resource.hh
@@ -0,0 +1,184 @@
+/*!
+ *
+ * Copyright (C) 2015 Technical University of Liberec.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 3 as published by the
+ * Free Software Foundation. (http://www.gnu.org/licenses/gpl-3.0.en.html)
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *
+ * @file    arena_resource.hh
+ */
+
+#ifndef ARENA_RESOURCE_HH_
+#define ARENA_RESOURCE_HH_
+
+#include <memory_resource>
+#include <vector>
+#include <iostream>
+#include <new>
+#include <stdexcept>   // !! Use Flow exception mechanism
+
+//#include "system/asserts.hh"
+#include "assert.hh"
+
+
+// Final proposal of Arena
+// TODO shared_ptr out of class, pass pointer to data, describe how to use
+template <class Resource>
+class PatchArenaResource : public std::pmr::memory_resource {
+protected:
+    /// Default upstream: the null resource in DEBUG builds (overflowing the buffer fails), the process default resource otherwise.
+	static inline std::pmr::memory_resource* upstream_resource() {
+#ifdef DEBUG
+        return std::pmr::null_memory_resource();
+#else
+        return std::pmr::get_default_resource();
+#endif
+    }
+
+public:
+    //DECLARE_EXCEPTION( ExcArenaAllocation,
+    //        << "Allocation of ArenaResource failed. Please check if correct type of upstream is used.");
+#define EXC_ARENA_ALLOCATION "Allocation of ArenaResource failed. Please check if correct type of upstream is used."
+
+    /// Builds the arena over a caller-supplied buffer. The buffer is not owned: this class never frees it, so it must outlive the arena.
+	PatchArenaResource(void *buffer, size_t buffer_size, size_t simd_alignment, std::pmr::memory_resource* upstream = PatchArenaResource<Resource>::upstream_resource())
+    : upstream_( upstream ),
+      buffer_(buffer),
+      buffer_size_(buffer_size),
+      resource_(buffer_, buffer_size, upstream_),
+      simd_alignment_(simd_alignment),
+      full_data_(false)
+    {
+        //ASSERT_PERMANENT_EQ( (buffer_size%simd_alignment), 0 );
+        BP_ASSERT( (buffer_size % simd_alignment) == 0 );
+    }
+
+
+    ~PatchArenaResource() = default; // implicitly virtual via memory_resource; buffer_ is deliberately not freed here
+
+    /// Development helper: prints used and free bytes of the buffer. It probes the current position by allocating 1 byte, so every call consumes arena space.
+    inline void print_space() {
+        void *p = this->raw_allocate(1, simd_alignment_);
+        size_t used_size = (char *)p - (char *)buffer_;
+        size_t free_space = buffer_size_ - used_size;
+        std::cout << "Allocated space of arena is " << used_size << " B, free space is " << free_space << " B." << std::endl;
+    }
+
+
+    /// Getter for resource
+    Resource &resource() {
+    	return resource_;
+    }
+
+    /// Allocate and return data pointer of n_item array of type T (alignment to length 8 bytes)
+    template <class T>
+    T* allocate_8(size_t n_items) {
+        size_t bytes = sizeof(T) * n_items;
+        return (T*)this->raw_allocate(bytes, 8);
+    }
+
+    /// Allocate and return data pointer of n_item array of type T (alignment to length given by simd_alignment constructor argument)
+    template <class T>
+    T* allocate_simd(size_t n_items) {
+        size_t bytes = sizeof(T) * n_items;
+        return (T*)this->raw_allocate(bytes, simd_alignment_);
+    }
+
+    /// Release everything allocated so far and clear the full_data_ flag; DEBUG builds also zero the buffer.
+    void reset() {
+        resource_.release();
+        full_data_ = false;
+#ifdef DEBUG
+    	char *c_buffer = (char *)buffer_;
+    	for (size_t i=0; i<buffer_size_; ++i)
+    	    c_buffer[i] = 0;
+#endif
+    }
+    /// Size in bytes of the buffer passed to the constructor.
+    inline size_t get_size() const{
+        return buffer_size_;
+    }
+protected:
+    void* raw_allocate(size_t bytes, size_t alignment) { // common allocation path; turns std::bad_alloc into a bparser exception
+        //ASSERT(!full_data_).error("Allocation of new data is not possible because child arena was created.");
+        BP_ASSERT( !full_data_ );
+        //ASSERT_EQ(buffer_size_%alignment, 0);
+        BP_ASSERT( (buffer_size_ % alignment) == 0 );
+
+    	try {
+            void* p = resource_.allocate(bytes, alignment);
+            return p;
+    	} catch ( std::bad_alloc& ) {
+            //THROW( ExcArenaAllocation() );
+            using bparser::Exception;
+            Throw() << EXC_ARENA_ALLOCATION;
+            
+    	}
+        return nullptr;
+    }
+
+    /// Override do_allocate to handle allocation logic
+    void* do_allocate(size_t bytes, size_t alignment) override {
+        return raw_allocate(bytes, alignment);
+    }
+
+    /// Override do_deallocate: hand the block back to the wrapped resource, never to upstream_ (a no-op for monotonic_buffer_resource).
+    void do_deallocate(void* p, size_t bytes, size_t alignment) override {
+        resource_.deallocate(p, bytes, alignment);
+    }
+
+    /// Override do_is_equal for memory resource comparison
+    bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override {
+        return this == &other;
+    }
+
+    std::pmr::memory_resource* upstream_;   ///< Pointer to upstream
+    void* buffer_;                          ///< Pointer to buffer
+    size_t buffer_size_;                    ///< Size of buffer
+    Resource resource_;                     ///< Resource of arena
+    size_t simd_alignment_;                 ///< Size of SIMD alignment
+    bool full_data_;                        ///< Flag signs full data (child arena is created)
+};
+
+
+template <class Resource>
+class AssemblyArenaResource : public PatchArenaResource<Resource> {
+public:
+    /// Constructor. Creates assembly arena; the buffer comes from new_delete_resource() and the destructor returns it to that same resource.
+	AssemblyArenaResource(size_t buffer_size, size_t simd_alignment, std::pmr::memory_resource* upstream = PatchArenaResource<Resource>::upstream_resource())
+    : PatchArenaResource<Resource>( std::pmr::new_delete_resource()->allocate(buffer_size, simd_alignment), buffer_size, simd_alignment, upstream ) {}
+
+	virtual ~AssemblyArenaResource() {
+	    std::pmr::new_delete_resource()->deallocate(this->buffer_, this->buffer_size_, this->simd_alignment_); // independent of upstream_, which may be the null resource
+	}
+
+    /**
+     * Create and return child arena.
+     * The child is allocated with new and is owned by the caller, who must delete it.
+     * Child arena is created in free space of actual arena.
+     * Actual arena is marked as full (flag full_data_) and cannot allocate new data.
+     */
+	PatchArenaResource<Resource> *get_child_arena() {
+        void *p = this->raw_allocate(1, this->simd_alignment_);
+        size_t used_size = (char *)p - (char *)this->buffer_;
+        size_t free_space = this->buffer_size_ - used_size;
+        this->full_data_ = true;
+        return new PatchArenaResource<Resource>(p, free_space, this->simd_alignment_);
+    }
+
+
+};
+
+
+
+using AssemblyArena = AssemblyArenaResource<std::pmr::monotonic_buffer_resource>;
+using PatchArena = PatchArenaResource<std::pmr::monotonic_buffer_resource>;
+
+
+#endif /* ARENA_RESOURCE_HH_ */
diff --git a/include/array.hh b/include/array.hh
index 490118f..cbd88fd 100644
--- a/include/array.hh
+++ b/include/array.hh
@@ -12,10 +12,10 @@
 #include <vector>
 #include <algorithm>
 #include <cmath>
-#include <memory>
 #include <boost/math/constants/constants.hpp>
 #include "config.hh"
 #include "scalar_node.hh"
+#include "scalar_wrapper.hh"
 //#include "test_tools.hh"
 
 namespace bparser {
@@ -860,6 +860,81 @@ public:
 	}
 
 
+
+	//Wraps the ScalarNodes of an Array into an Eigen Matrix of ScalarWrappers.
+	//Vectors will be column vectors. Eigen does not support vectors without orientation.
+	//Cannot wrap scalars. To wrap scalars, use the bparser::details::ScalarWrapper constructor
+	static Eigen::MatrixX<details::ScalarWrapper> wrap_array(const bparser::Array& a) {
+		MultiIdx idx(a.range());
+		return wrap_array(a, idx);
+	}
+
+	//Wraps the ScalarNodes of an Array accessed via MultiIdx.idx_trg() created from supplied MultiIdxRange into an Eigen Matrix of ScalarWrapper
+	//Vectors will be column vectors. Eigen does not support vectors without orientation.
+	//Cannot wrap scalars. To wrap scalars, use the bparser::details::ScalarWrapper constructor
+	static Eigen::MatrixX<details::ScalarWrapper> wrap_array(const bparser::Array& a, MultiIdxRange& range) {
+		MultiIdx idx (range);
+		return wrap_array(a, idx);
+	}
+
+	//Wraps ScalarNodes of an Array, read through the supplied MultiIdx (a[index]), into an Eigen Matrix of ScalarWrapper.
+	//Vectors will be column vectors. Eigen does not support vectors without orientation.
+	//Cannot wrap scalars (throws). To wrap scalars, use the bparser::details::ScalarWrapper constructor
+	static Eigen::MatrixX<details::ScalarWrapper> wrap_array(const bparser::Array& a, MultiIdx& index) {
+		// `index` is taken by reference and advanced with inc_trg() past every element read; mat_mult relies on this to step through its operands.
+		using namespace details;
+		Shape trg_shape = index.range_.target_shape();
+		//std::cout << "Wrapping: " << print_shape(trg_shape) << std::endl;
+		if (trg_shape.size() == 0) {
+			Throw() << "Attempted to wrap scalar into Eigen Matrix";
+		}
+		if (trg_shape.size() == 1) {
+			uint len = trg_shape[0];
+			Eigen::VectorX<ScalarWrapper> v(len);
+			for (uint i = 0; i < len && index.valid(); i++, index.inc_trg()) {
+				v(i) = ScalarWrapper(a[index]);
+			}
+			return v;
+		}
+		else {// two or more target axes: only the trailing two (rows, cols) form the matrix
+			uint rows = *(trg_shape.end() - 2);
+			uint cols = *(trg_shape.end() - 1);
+			// filled row by row; filling stops early if `index` becomes invalid
+			Eigen::MatrixX<ScalarWrapper> m(rows, cols);
+			for (uint row = 0; row < rows; row++) {
+				for (uint col = 0; col < cols && index.valid(); col++, index.inc_trg()) {
+					m(row, col) = ScalarWrapper(a[index]);
+				}
+			}
+			return m;
+		}
+	}
+
+	//Creates an Array of ScalarNodes from an Eigen Matrix of ScalarWrappers
+	// make_vector - Will reduce the Array shape if Matrix is actually a Vector. Shape:(x,1) -> (x); (1,y) -> (y)
+	static bparser::Array unwrap_array(const Eigen::MatrixX<details::ScalarWrapper>& m, const bool make_vector = false) {
+		using namespace details;
+
+		if (make_vector && (m.rows() == 1 || m.cols() == 1)) {
+			Array a({ (uint)std::max(m.rows(),m.cols()) });
+			MultiIdx index(a.range());
+			for (uint i = 0; i < a.shape()[0]; i++, index.inc_src()) {
+				a.elements_[index.idx_src()] = m(i).get();
+			}
+			return a;
+		}
+		else {
+			Array a({ (uint)m.rows(), (uint)m.cols() });
+			MultiIdx index(a.range());
+			for (uint row = 0; row < a.shape()[0]; row++) {
+				for (uint col = 0; col < a.shape()[1]; col++, index.inc_src()) {
+					a.elements_[index.idx_src()] = m(row, col).get();
+				}
+			}
+			return a;
+		}
+	}
+
 	/**
 	 * Numpy.matmul:
 	 *
@@ -867,7 +942,7 @@ public:
 	 * b 		has shape (..., i,j, l,m)
 	 * result 	has shape (..., i,j, k,m)
 	 */
-	static Array mat_mult(const Array &a,  const Array &b) {
+	static Array mat_mult_old(const Array& a, const Array& b) {
 		//std::cout << "mat mult: " << print_vector(a.shape()) << " @ " << print_vector(b.shape()) << "\n";
 
 		if (a.shape().size() == 0)
@@ -919,44 +994,45 @@ public:
 		MultiIdx a_idx(a_range);
 		MultiIdx b_idx(b_range); // allocated
 		MultiIdx result_idx(result_range);
-/*
-		std::cout << "a_idx, shp: " << print_vector(a_idx.range_.full_shape_) << "\n";
-		std::cout << "b_idx, shp: " << print_vector(b_idx.range_.full_shape_) << "\n";
-		std::cout << "r_idx, shp: " << print_vector(result_idx.range_.full_shape_) << "\n";
-*/
+		/*
+				std::cout << "a_idx, shp: " << print_vector(a_idx.range_.full_shape_) << "\n";
+				std::cout << "b_idx, shp: " << print_vector(b_idx.range_.full_shape_) << "\n";
+				std::cout << "r_idx, shp: " << print_vector(result_idx.range_.full_shape_) << "\n";
+		*/
 
 		ScalarNodePtr sum;
 		Array result(result_shape);
-		for(;result_idx.valid();) {
+		for (; result_idx.valid();) {
 			sum = nullptr;
 			a_idx.reset_indices(result_idx);
 			b_idx.reset_indices(result_idx);
-			for(;a_idx.valid();) {
-/*
-				std::cout << "a_idx: " << print_vector(a_idx.indices()) << " didx: "
-						<< a_idx.src_idx() << "\n";
-				std::cout << "b_idx: " << print_vector(b_idx.indices()) << " didx: "
-						<< b_idx.src_idx() << "\n";
-*/
+			for (; a_idx.valid();) {
+				/*
+								std::cout << "a_idx: " << print_vector(a_idx.indices()) << " didx: "
+										<< a_idx.src_idx() << "\n";
+								std::cout << "b_idx: " << print_vector(b_idx.indices()) << " didx: "
+										<< b_idx.src_idx() << "\n";
+				*/
 				ScalarNodePtr mult = details::ScalarNode::create<details::_mul_>(
-						a.elements_[a_idx.idx_src()],
-						b.elements_[b_idx.idx_src()]);
+					a.elements_[a_idx.idx_src()],
+					b.elements_[b_idx.idx_src()]);
 				if (sum == nullptr) {
 					sum = mult;
-				} else {
+				}
+				else {
 					// TODO: how to use inplace operations correctly ??
 					sum = details::ScalarNode::create<details::_add_>(sum, mult);
 				}
 				//std::cout << "aidx ";
-				a_idx.inc_trg(-1,1, false);
+				a_idx.inc_trg(-1, 1, false);
 				//std::cout << "bidx ";
-				b_idx.inc_trg(-1,1, false);
+				b_idx.inc_trg(-1, 1, false);
 				BP_ASSERT(a_idx.valid() == b_idx.valid());
 			}
-/*
-			std::cout << "r_idx: " << print_vector(result_idx.indices()) << " didx: "
-									<< result_idx.src_idx() << "\n";
-*/
+			/*
+						std::cout << "r_idx: " << print_vector(result_idx.indices()) << " didx: "
+												<< result_idx.src_idx() << "\n";
+			*/
 
 			result.elements_[result_idx.idx_src()] = sum;
 
@@ -967,6 +1043,134 @@ public:
 
 		auto final_range = MultiIdxRange(result.shape()).full();
 
+		//std::cout << "  raw res: "<< print_vector(result_shape);
+		if (b.shape().size() == 1 && *(result_shape.end() - 1) == 1) {
+			// cut -1 axis
+			//std::cout << "  b cut: "<< result_shape.size()-1 << "\n";
+			final_range.remove_target_axis(result_shape.size() - 1);
+		}
+		BP_ASSERT(*(result_shape.end() - 2) == 1);
+		//std::cout << "  r cut: "<< result_shape.size()-2 << "\n";
+		final_range.remove_target_axis(result_shape.size() - 2);
+		// cut -2 axis always
+		if (a.shape().size() == 1 && *(result_shape.end() - 3) == 1) {
+			// cut -3 axis
+			// std::cout << "  a cut: "<< result_shape.size()-3 << "\n";
+			final_range.remove_target_axis(result_shape.size() - 3);
+		}
+		// std::cout << "  final res: " << print_vector(final_range.sub_shape()) << "\n";
+		return Array(result, final_range);
+
+	}
+
+	/**
+	 * Numpy.matmul:
+	 *
+	 * a 		has shape (..., i,j, k,l)
+	 * b 		has shape (..., i,j, l,m)
+	 * result 	has shape (..., i,j, k,m)
+	 */
+	static Array mat_mult(const Array &a,  const Array &b) {
+		//std::cout << "mat mult: " << print_vector(a.shape()) << " @ " << print_vector(b.shape()) << "\n";
+
+		//std::cout << "Shape: ---------" << std::endl;
+		//std::cout << print_shape(a.shape()) << std::endl;
+		//std::cout << print_shape(b.shape()) << std::endl;
+
+		if (a.shape().size() == 0)
+			Throw() << "Matmult can not multiply by scalar a." << "\n";
+		if (b.shape().size() == 0)
+			Throw() << "Matmult can not multiply by scalar b." << "\n";
+
+		Shape a_shape = a.shape();
+		if (a_shape.size() == 1) {
+			a_shape.insert(a_shape.begin(), 1);
+			// shape (l) -> (1,l)
+		}
+
+
+		Shape b_shape = b.shape();
+		if (b_shape.size() == 1) {
+			b_shape.push_back(1);
+			// shape (l) -> (l,1)
+		}
+
+
+		uint a_cols = *(a_shape.end() - 1), b_rows = *(b_shape.end() - 2);
+
+		if (a_cols != b_rows) { // l != l
+			Throw() << "Matmult summing dimension mismatch: " << a_cols << " != " << b_rows << "\n";
+		}
+
+		//Add for common shape
+		a_shape.insert(a_shape.end(), 1);
+		// a_shape : (...,i,j,k,l,1)
+		b_shape.insert(b_shape.end() - 2, 1);
+		// b_shape : (...,i,j,1,l,m)
+
+		
+		Shape result_shape(MultiIdxRange::broadcast_common_shape(a_shape, b_shape)); 
+		// r_shape (..., i,j,k,l,m)
+		MultiIdxRange a_range(MultiIdxRange(a_shape).full().broadcast(result_shape));
+		// a_shape (..., 1,1,k,l,1) -> (...,i,j,k,l,1)
+		MultiIdxRange b_range(MultiIdxRange(b_shape).full().broadcast(result_shape));
+		// b_shape (..., 1,1,1,l,m) -> (...,i,j,1,l,m)
+
+		//Remove for computation
+		a_range.target_transpose_.erase(a_range.target_transpose_.end() - 1); 
+		// a_shape (..., i,j,k,l, )
+		b_range.target_transpose_.erase(b_range.target_transpose_.end() - 3); 
+		// b_shape (..., i,j, ,l,m)
+		result_shape.erase(result_shape.end() - 2); 
+		// r_shape (..., i,j,k, ,m)
+
+		//std::cout << print_shape(result_shape) << std::endl;
+
+		Array result(result_shape);
+		bool should_transpose = a.shape().size() == 1;
+
+		for (MultiIdx	
+			result_idx(result.range()),
+			a_idx(a_range),
+			b_idx(b_range);	result_idx.valid(); ) {
+
+			Eigen::MatrixX<details::ScalarWrapper> m_a = wrap_array(a, a_idx);
+			Eigen::MatrixX<details::ScalarWrapper> m_b = wrap_array(b, b_idx);
+
+			Array matmult = unwrap_array(m_a * m_b);
+
+			for (MultiIdx mult_idx(matmult.range()); mult_idx.valid(); mult_idx.inc_src(), result_idx.inc_src()) {
+				result.elements_[result_idx.idx_src()] = matmult[mult_idx]; 
+			}
+		}
+
+		MultiIdxRange final_range(result.range());
+		if (b.shape().size() == 1) {
+			final_range.remove_target_axis(absolute_idx(-1, result_shape.size()));
+			// shape (..., i,j,k,1) -> ...,j,k)
+		}
+		if (a.shape().size() == 1) {
+			final_range.remove_target_axis(absolute_idx(-2, result_shape.size()));
+			// shape (..., i,j,1,m) -> ...,j,m)
+		}
+		
+		return Array(result,final_range);
+		/*
+		auto m_a = wrap_array(a);
+		auto m_b = wrap_array(b);
+
+		if (a.shape().size() == 1) { //is vector
+			m_a = m_a.transpose(); //colvec -> rowvec
+		}
+
+		if (m_a.cols() != m_b.rows())
+			Throw() << "Matmult summing dimension mismatch: " << m_a.cols() << " != " << m_b.rows() << "\n";
+
+		return unwrap_array(m_a * m_b, (a.shape().size() == 1 || b.shape().size() == 1));*/
+		//Shape result_shape = result.shape();
+
+		/*auto final_range = MultiIdxRange(result.shape()).full();
+
 		//std::cout << "  raw res: "<< print_vector(result_shape);
 		if (b.shape().size() == 1 && *(result_shape.end() - 1) == 1 ) {
 		    // cut -1 axis
@@ -984,7 +1188,125 @@ public:
 		}
 		// std::cout << "  final res: " << print_vector(final_range.sub_shape()) << "\n";
 		return Array(result, final_range);
+		*/
+
+	}
 
+	static Array diag(const Array& a) {
+		if (a.shape().size() == 0) {
+			return a;
+		}
+
+		if (a.shape().size() == 1) { // diag -> matrix
+			return unwrap_array(wrap_array(a).asDiagonal());
+		}
+		// matrix -> diag
+		return unwrap_array(wrap_array(a).diagonal(),true);
+
+	}
+
+	static Array trace(const Array& a) {
+		if (a.shape().size() != 2) {
+			Throw() << "Function trace can only be used for matrices" << "\n";
+		}
+		Shape s; //empty Shape for scalar
+		Array r(s);
+		r.elements_[0U] = *wrap_array(a).trace();
+		return r;
+		//return full_({}, *wrap_array(a).trace());
+	}
+
+	static Array norm1(const Array& a) { // 1-norm; returns a scalar Array, throws for scalars and for tensors of rank > 2
+		switch (a.shape().size()) {
+			case 0: //scalar
+				Throw() << "Norms are not for scalar values" << "\n";
+				break;
+			case 1: //vector
+			{
+				// sum of absolute values of the entries
+				Shape s; //empty Shape for scalar
+				Array r(s);
+				r.elements_[0U] = *wrap_array(a).lpNorm<1>();
+				return r;
+			}
+			case 2: //matrix: maximum over columns of the column's 1-norm
+			{
+				Shape s; //empty Shape for scalar
+				Array r(s);
+				r.elements_[0U] = *wrap_array(a).colwise().lpNorm<1>().maxCoeff();
+				return r;
+			}
+			default:
+				Throw() << "Norms are not available for ND tensors" << "\n";
+		}
+	}
+
+	static Array norm2(const Array& a) { // 2-norm; only vectors are supported, every other rank throws
+		switch (a.shape().size()) {
+		case 0: //scalar
+			Throw() << "Norms are not for scalar values" << "\n";
+			break;
+		case 1: //vector
+		{
+			//Euclidean norm
+			Shape s; //empty Shape for scalar
+			Array r(s);
+			r.elements_[0U] = *wrap_array(a).norm();
+			return r;
+		}
+		case 2: //matrix
+		{
+			//Spectral norm (not implemented, see the note in the disabled code below)
+			Throw() << "norm2(matrix) is not yet possible" << "\n";
+			/*Shape s; //empty Shape for scalar
+			Array r(s);
+
+			Eigen::MatrixX<details::ScalarWrapper> m( wrap_array(a) );
+
+			r.elements_[0U] = *details::sqrt((m.adjoint()*m).eigenvalues().real().maxCoeff());
+			//computing eigenvalues would require static cast to double and comparison operators (<,<=,>,>=,!=,==)
+			//something which we cannot support
+			return r;*/
+			break;
+		}
+		default:
+			Throw() << "Norms are not available for ND tensors" << "\n";
+		}
+	}
+
+	static Array normfro(const Array& a) {
+		if (a.shape().size() != 2) {
+			Throw() << "Frobenius norm is only defined for matrices" << "\n";
+		}
+
+		Shape s;
+		Array r(s);
+		r.elements_[0U] = *wrap_array(a).norm();
+		return r;
+	}
+
+	static Array norminf(const Array& a) { // infinity norm; returns a scalar Array, throws for scalars and for tensors of rank > 2
+		switch (a.shape().size()) {
+			case 0: //scalar
+				Throw() << "Norms are not for scalar values" << "\n";
+				break;
+			case 1: //vector: largest absolute value of an entry
+			{
+				Shape s; //empty Shape for scalar
+				Array r(s);
+				r.elements_[0U] = *wrap_array(a).lpNorm<Eigen::Infinity>();
+				return r;
+			}
+			case 2: //matrix: maximum over rows of the row's 1-norm
+			{
+				Shape s; //empty Shape for scalar
+				Array r(s);
+				r.elements_[0U] = *wrap_array(a).rowwise().lpNorm<1>().maxCoeff();
+				return r;
+			}
+			default:
+				Throw() << "Norms are not available for ND tensors" << "\n";
+		}
 	}
 
 	static Array flatten(const Array &tensor) {
diff --git a/include/config.hh b/include/config.hh
index d1db252..91cf3fe 100644
--- a/include/config.hh
+++ b/include/config.hh
@@ -33,9 +33,13 @@ typedef unsigned int uint;
 #endif
 
 #if defined(_WIN32)
-# define EXPORT __declspec(dllexport)
+# if defined(BPARSER_DLL)
+#  define EXPORT __declspec(dllexport)
+# else
+#  define EXPORT __declspec(dllimport)
+# endif
 #else
-#define EXPORT
+# define EXPORT
 #endif
 
 #if defined(_WIN32)
diff --git a/include/create_processor.hh b/include/create_processor.hh
index a4f10e4..fecb051 100644
--- a/include/create_processor.hh
+++ b/include/create_processor.hh
@@ -25,7 +25,7 @@ namespace bparser{
         }
     }
 
-    ProcessorBase * ProcessorBase::create_processor(ExpressionDAG &se, uint vector_size, uint simd_size, ArenaAllocPtr arena) {
+    ProcessorBase * ProcessorBase::create_processor(ExpressionDAG &se, uint vector_size, uint simd_size, PatchArenaPtr arena) {
         if (simd_size == 0) {
             simd_size = get_simd_size();
         }
diff --git a/include/expression_dag.hh b/include/expression_dag.hh
index da08bb5..ab32ccb 100644
--- a/include/expression_dag.hh
+++ b/include/expression_dag.hh
@@ -15,6 +15,7 @@
 #include "config.hh"
 #include "scalar_node.hh"
 #include "assert.hh"
+#include "array.hh"
 
 
 namespace bparser {
@@ -40,6 +41,8 @@ private:
 	/// Result nodes, given as input.
 	NodeVec results;
 
+	typedef std::pair<std::string, bool> InvDotNameAndScalar;
+	typedef std::map<ScalarNodePtr, InvDotNameAndScalar> InvDotMap;
 
 	/**
 	 * Used in the setup_result_storage to note number of unclosed nodes
@@ -102,6 +105,7 @@ public:
 
 	/**
 	 * Print ScalarExpression graph in the dot format.
+	 * Useful for debugging
 	 */
 	void print_in_dot() {
 		std::map<ScalarNodePtr , uint> i_node;
@@ -131,8 +135,155 @@ public:
 		std::cout << "Node: " << node->op_name_ <<  "_" << node->result_idx_ << " " << node->result_storage << std::endl;
 	}
 
+	/**
+	 * Print ScalarExpression graph in the common dot format.
+	 * Useful for understanding the DAG.
+	 */
+	void print_in_dot2() {
+		print_in_dot2(InvDotMap());
+	}
+
+	/**
+	 * Print ScalarExpression graph in the common dot format.
+	 * Useful for understanding the DAG. Using the parser's map of var. Name -> Array find the inverse ScalarNodePtr -> var. Name
+	 */
+	void print_in_dot2(const std::map<std::string, bparser::Array>& symbols) {
+		print_in_dot2(create_inverse_map(symbols));
+	}
+
+	/**
+	 * Print ScalarExpression graph in the common dot format.
+	 * Useful for understanding the DAG. Using the map of ScalarNodePtr -> variableName
+	 */
+	void print_in_dot2(const InvDotMap& names) {
+
+		sort_nodes();
+		
+		std::cout << "\n" << "----- begin cut here -----" << "\n";
+		std::cout << "digraph Expr {" << "\n";
+
+		std::cout << "/* definitions */" << "\n";
+
+		std::cout << "edge [dir=back]" << "\n";
+		for (uint i = 0; i < sorted.size(); ++i) {
+			_print_dot_node_definition(sorted[i],names);
+		}
+		std::cout << "/* end of definitions */" << "\n";
+
+		for (uint i = 0; i < sorted.size(); ++i) {
+			for (uint in = 0; in < sorted[i]->n_inputs_; ++in) {
+				std::cout << "    ";
+				_print_dot_node_id(sorted[i]);
+				std::cout << "\n -> ";
+				_print_dot_node_id(sorted[i]->inputs_[in]);
+				std::cout << "\n\n";
+			}
+		}
+		std::cout << "}" << "\n";
+		std::cout << "-----  end cut here  -----" << "\n";
+		std::cout.flush();
+	}
+	
+	//Create a map of ScalarNodePtr -> (variable name, is_scalar)
+	InvDotMap create_inverse_map(const std::map<std::string, bparser::Array>& symbols) const {
+		InvDotMap inv_map;
+		if (symbols.empty()) return inv_map;
+		for (const auto& s : symbols)
+		{
+			for (const auto& n : s.second.elements()) {
+				inv_map[n] = std::pair<std::string,bool>(s.first, s.second.shape().empty());
+			}
+		}
+		return inv_map;
+	}
+
 
 private:
+	//Print the vertex identifier for dot
+	void _print_dot_node_id(const ScalarNodePtr& node) const {
+		std::cout << node->op_name_ << "_" << (uintptr_t)node.get() << "__" << node->result_storage;// << std::endl;
+	}
+
+	//Print how the vertex should look in dot
+	void _print_dot_node_definition(const ScalarNodePtr& node, const InvDotMap& invmap) const {
+		_print_dot_node_id(node);
+		std::cout << ' ';
+
+		if (node->result_storage == ResultStorage::constant) {				// Constant
+			std::cout << "[shape=circle,";
+
+			try { //If the constant has a name
+				std::string name(invmap.at(node).first);
+				std::cout << "label=\"" << name << ": " << *node->values_ << "\",group=\"" << name << '"';
+			}
+			catch (const std::out_of_range&) { //No name
+				std::cout << "label=\"" << "const " << *node->values_ << '"';
+			}
+			std::cout  << "]" << std::endl;
+		}
+
+		else if (node->result_storage == ResultStorage::constant_bool) {	//Constant bool
+			std::cout << "[shape=circle,";
+
+			try { //If the constant has a name
+				std::string name(invmap.at(node).first);
+				std::cout << "label=\"" << name << ": " << *node->values_ << "\",group=\"" << name << '"';
+			}
+			catch (const std::out_of_range&) { //No name
+				std::cout << "label=\"" << "const " << *node->values_ << '"';
+			}
+			std::cout << "]" << std::endl;
+		}
+
+		else if (node->result_storage == ResultStorage::expr_result) {		//Result
+			std::cout << "[shape=box,label=\"" << node->op_name_ << " [" << node->result_idx_ << "]" << "\"]" << std::endl;
+		}
+
+		else if (node->result_storage == ResultStorage::value) {			// Value
+
+			std::cout << "[shape=circle,";
+			try {
+				std::string name(invmap.at(node).first);
+				bool scalar(invmap.at(node).second);
+				if (scalar) {
+					std::cout << "label=\"" << name << '"';
+				}
+				else {
+					std::cout << "label=<" << name << "<SUB>i</SUB>" << '>';
+				}
+				std::cout << ",group=\"" << name << '"';
+			}
+			catch (const std::out_of_range&) {
+				std::cout << "label=<<I>var</I>>";
+			}
+			
+			std::cout << "]" << std::endl;
+		}
+
+		else if (node->result_storage == ResultStorage::value_copy) {		//Value copy
+			std::cout << "[shape=circle,";
+			try {
+				std::string name(invmap.at(node).first);
+				bool scalar(invmap.at(node).second);
+				if (scalar) {
+					std::cout << "label=\"" << name << '"';
+				}
+				else {
+					std::cout << "label=<" << name << "<SUB>i</SUB>" << '>';
+				}
+				std::cout << ",group=\"" << name << '"';
+			}
+			catch (const std::out_of_range&) {
+				std::cout << "label=<<I>var_cp</I>>";
+			}
+			std::cout << "]" << std::endl;
+		}
+
+		else {//Temporary & other											//Temporary & other
+			std::cout << "[label=\"" << node->op_name_ << "\"]" << std::endl;
+		}
+	}
+
 	void _print_i_node(uint i) {
 		std::cout << sorted[i]->op_name_ << "_" << i << "_"<< sorted[i]->result_idx_;
 	}
diff --git a/include/grammar.impl.hh b/include/grammar.impl.hh
index 037c8ff..082592f 100644
--- a/include/grammar.impl.hh
+++ b/include/grammar.impl.hh
@@ -17,7 +17,8 @@
 #include <string>
 
 #include <boost/math/constants/constants.hpp>
-#include <boost/spirit/include/phoenix.hpp>
+//#include <boost/spirit/include/phoenix.hpp>
+#include <boost/phoenix.hpp>
 
 
 //#define BOOST_SPIRIT_NO_PREDEFINED_TERMINALS
@@ -178,6 +179,12 @@ struct grammar : qi::grammar<Iterator, ast::operand(), ascii::space_type> {
             FN("power"  , binary_array<_pow_>())
 			FN("minimum", binary_array<_min_>())
 			FN("maximum", binary_array<_max_>())
+            FN("diag"   , &Array::diag)
+            FN("tr"     , &Array::trace)
+            FN("norm1"  , &Array::norm1)
+            FN("norm2"  , &Array::norm2)
+            FN("normfro", &Array::normfro)
+            FN("norminf", &Array::norminf)
             ;
 
         unary_op.add
diff --git a/include/instrset_detect.cc b/include/instrset_detect.cc
index a023773..49618c3 100644
--- a/include/instrset_detect.cc
+++ b/include/instrset_detect.cc
@@ -1,6 +1,9 @@
 #include "instrset_detect.hh"
 
+namespace bparser {
+
+	int b_instrset_detect(void) { //exported wrapper (declared with EXPORT in instrset_detect.hh)
+		return instrset_detect(); //forwards the third-party library's result unchanged
+	}
 
-int b_instrset_detect(void) {
-	return instrset_detect();
 }
\ No newline at end of file
diff --git a/include/instrset_detect.hh b/include/instrset_detect.hh
index 10430a6..5cd0b54 100644
--- a/include/instrset_detect.hh
+++ b/include/instrset_detect.hh
@@ -10,12 +10,14 @@
  * Wraps the third party library function for DLL export reasons.
  */
 
-#ifndef INCLUDE_INSTRSET_DETECT_HH
-#define INCLUDE_INSTRSET_DETECT_HH
+#ifndef INCLUDE_INSTRSET_DETECT_HH_
+#define INCLUDE_INSTRSET_DETECT_HH_
 
 #include "config.hh"
 #include "instrset.h"
+namespace bparser{
 
-EXPORT int b_instrset_detect(void);
+	EXPORT int b_instrset_detect(void);
 
-#endif
\ No newline at end of file
+}
+#endif //!INCLUDE_INSTRSET_DETECT_HH_
\ No newline at end of file
diff --git a/include/parser.hh b/include/parser.hh
index e233927..781dbc8 100644
--- a/include/parser.hh
+++ b/include/parser.hh
@@ -169,7 +169,7 @@ public:
     ///
     /// All variable names have to be set before this call.
     /// TODO: set result variable
-    void compile(std::shared_ptr<ArenaAlloc> arena = nullptr) {
+    void compile(std::shared_ptr<PatchArena> arena = nullptr) {
     	destroy_processor();
 
         ParserResult res_array = boost::apply_visitor(ast::make_array(symbols_), ast);
@@ -190,6 +190,8 @@ public:
 		details::ExpressionDAG se(result_array_.elements());
 
 		//se.print_in_dot();
+        //se.print_in_dot2();
+        //se.print_in_dot2(symbols_);
 		processor = ProcessorBase::create_processor(se, max_vec_size, simd_size, arena);
     }
 
diff --git a/include/processor.hh b/include/processor.hh
index 6f9452e..b16e018 100644
--- a/include/processor.hh
+++ b/include/processor.hh
@@ -127,6 +127,8 @@ using namespace details;
 
 
 typedef std::shared_ptr<ArenaAlloc> ArenaAllocPtr;
+typedef std::shared_ptr<PatchArena> PatchArenaPtr;
+
 
 
 #define CODE(OP_NAME) \
@@ -158,7 +160,7 @@ struct ProcessorBase {
 		return arena_;
 	}
 	
-	inline static ProcessorBase *create_processor(ExpressionDAG &se, uint vec_n_blocks, uint simd_size = 0, ArenaAllocPtr arena = nullptr);
+	inline static ProcessorBase *create_processor(ExpressionDAG &se, uint vec_n_blocks, uint simd_size = 0, PatchArenaPtr arena = nullptr);
 
 	ArenaAllocPtr arena_;
 };
@@ -477,7 +479,13 @@ struct Processor : public ProcessorBase {
 	Operation * program_;
 	std::vector< std::shared_ptr<ValueCopyNode> > val_copy_nodes_;
 };
-
+// PatchArena adapter: a null `arena` is forwarded as a null ArenaAllocPtr (the ArenaAllocPtr overload then
+// creates its own ArenaAlloc); otherwise a new ArenaAlloc is constructed from `*arena` and forwarded.
+template <class VCLVec>
+ProcessorBase* create_processor_(ExpressionDAG& se, uint vector_size, uint simd_size, PatchArenaPtr arena) {
+	return create_processor_<VCLVec>(se, vector_size, simd_size,
+		(arena == nullptr) ? ArenaAllocPtr() : std::make_shared<ArenaAlloc>(*arena));
+}
 
 template <class VCLVec> 
 ProcessorBase * create_processor_(ExpressionDAG &se, uint vector_size,  uint simd_size, ArenaAllocPtr arena)
@@ -503,7 +511,7 @@ ProcessorBase * create_processor_(ExpressionDAG &se, uint vector_size,  uint sim
     if (arena == nullptr)
         arena = std::make_shared<ArenaAlloc>(simd_bytes, est);
     else
-        BP_ASSERT(arena->size_ >= est);
+        BP_ASSERT(arena->get_size() >= est);
     return arena->create<Processor<Vec<VCLVec>>>(arena, se, vec_n_blocks);
 }
 
diff --git a/include/scalar_wrapper.hh b/include/scalar_wrapper.hh
new file mode 100644
index 0000000..8a07d46
--- /dev/null
+++ b/include/scalar_wrapper.hh
@@ -0,0 +1,218 @@
+/*
+ * scalar_wrapper.hh
+ *
+ *  Created on: Apr 6, 2025
+ *      Author: LV
+ */
+
+//https://eigen.tuxfamily.org/dox/TopicCustomizing_CustomScalar.html
+
+#ifndef INCLUDE_SCALAR_WRAPPER_HH_
+#define INCLUDE_SCALAR_WRAPPER_HH_
+
+#include "scalar_node.hh"
+#include <Eigen/Core>
+//#include <Eigen/Eigenvalues> //not usable: Eigen's eigenvalue algorithms need comparison operators that ScalarWrapper does not provide
+
+namespace bparser {
+	namespace details {
+		// Eigen-compatible scalar type that wraps a ScalarNodePtr; arithmetic on wrappers creates new ScalarNode objects.
+		struct ScalarWrapper {
+			// Constructors: a zero node, a constant node from an int or a double, or an already existing node pointer.
+			ScalarWrapper() : node(ScalarNode::create_zero()) { ; }
+			ScalarWrapper(int i) : node(ScalarNode::create_const(i)) { ; }
+			ScalarWrapper(double d) : node(ScalarNode::create_const(d)) { ; }
+			ScalarWrapper(ScalarNodePtr existing_ptr) : node(existing_ptr) { ; }
+			// Arithmetic operators. Except for unary plus (a plain copy), each one creates a new node through un_op/bin_op.
+			inline ScalarWrapper operator+() const {
+				return ScalarWrapper(*this);
+			}
+
+			inline ScalarWrapper operator-() const {
+				return un_op<_minus_>(*this);
+			}
+
+			inline ScalarWrapper& operator+=(const ScalarWrapper& b) {
+				node = bin_op<_add_>(*this, b).get(); // rebind this wrapper to the newly created node
+				return *this;
+			}
+
+			inline ScalarWrapper operator+(const ScalarWrapper& b) const {
+				return bin_op<_add_>(*this, b);
+			}
+
+			inline ScalarWrapper& operator-=(const ScalarWrapper& b) {
+				node = bin_op<_sub_>(*this, b).get(); // rebind this wrapper to the newly created node
+				return *this;
+			}
+
+			inline ScalarWrapper operator-(const ScalarWrapper& b) const {
+				return bin_op<_sub_>(*this, b);
+			}
+
+			inline ScalarWrapper& operator*=(const ScalarWrapper& b) {
+				node = bin_op<_mul_>(*this, b).get(); // rebind this wrapper to the newly created node
+				return *this;
+			}
+
+			inline ScalarWrapper operator*(const ScalarWrapper& b) const {
+				return bin_op<_mul_>(*this, b);
+			}
+
+			inline ScalarWrapper& operator/=(const ScalarWrapper& b) {
+				node = bin_op<_div_>(*this, b).get(); // rebind this wrapper to the newly created node
+				return *this;
+			}
+
+			inline ScalarWrapper operator/(const ScalarWrapper& b) const {
+				return bin_op<_div_>(*this, b);
+			}
+			// Equality: true only if this node is constant, b has the same result_storage and the values behind values_ are equal.
+			inline bool operator==(const ScalarWrapper& b) const {
+				if ((*this).is_constant() && (*this).have_same_result_storage(b))
+					return *(***this).values_ == *(**b).values_; // ***this and **b reach the ScalarNode through operator*()
+				return false; // every other combination compares unequal
+			}
+			/* These do not make any sense with what we are trying to achieve
+			inline bool operator!=(const ScalarWrapper& b) const {
+				return !((*this) == b);
+			}
+
+			inline bool operator<(const ScalarWrapper& b) const {
+				if ((*this).is_constant() && (*this).have_same_result_storage(b))
+					return *(***this).values_ < *(**b).values_;
+				return false;
+			}
+
+			inline bool operator<=(const ScalarWrapper& b) const {
+				if ((*this).is_constant() && (*this).have_same_result_storage(b))
+					return *(***this).values_ <= *(**b).values_;
+				return false;
+			}
+
+			inline bool operator>=(const ScalarWrapper& b) const {
+				if ((*this).is_constant() && (*this).have_same_result_storage(b))
+					return *(***this).values_ >= *(**b).values_;
+				return false;
+			}
+
+			inline bool operator>(const ScalarWrapper& b) const {
+				if ((*this).is_constant() && (*this).have_same_result_storage(b))
+					return *(***this).values_ > *(**b).values_;
+				return false;
+			}*/
+
+			// Access to the wrapped node pointer; operator*() simply returns get().
+			inline ScalarNodePtr operator*() const { //dereference
+				return get();
+			}
+
+			inline ScalarNodePtr get() const {
+				return node;
+			}
+			// Wraps the node that ScalarNode::create<T> returns for the nodes of a and b.
+			template<class T>
+			static ScalarWrapper bin_op(const ScalarWrapper& a, const ScalarWrapper& b) {
+				return ScalarWrapper(ScalarNode::create<T>(a.get(), b.get()));
+			}
+			// Wraps the node that ScalarNode::create<T> returns for the node of a.
+			template<class T>
+			static ScalarWrapper un_op(const ScalarWrapper& a) {
+				return ScalarWrapper(ScalarNode::create<T>(a.get()));
+			}
+
+
+		protected:
+			ScalarNodePtr node; // the wrapped node
+			// True when the node's result_storage is constant or constant_bool.
+			inline bool is_constant() const {
+				return (***this).result_storage == constant ||
+					   (***this).result_storage == constant_bool;
+			}
+			// True when this node and b's node report the same result_storage.
+			inline bool have_same_result_storage(const ScalarWrapper& b)const {
+				return (***this).result_storage == (**b).result_storage;
+			}
+
+		}; //ScalarWrapper
+
+		//inline std::ostream& operator<<(std::ostream& out, const ScalarWrapper& s) {
+		//
+		//}
+		//UN_OP(OP): defines OP(const ScalarWrapper&) forwarding to ScalarWrapper::un_op<_OP_>, and adds `using std::OP;`.
+#define UN_OP(OP)											\
+		inline ScalarWrapper OP(const ScalarWrapper& s) {	\
+			return ScalarWrapper::un_op<_##OP##_>(s);		\
+		}													\
+		using std::OP;
+		//BIN_OP(OP): two-argument counterpart, forwarding to ScalarWrapper::bin_op<_OP_>, and adds `using std::OP;`.
+#define BIN_OP(OP)																	\
+		inline ScalarWrapper OP(const ScalarWrapper& a,const ScalarWrapper& b) {	\
+			return ScalarWrapper::bin_op<_##OP##_>(a,b);							\
+		}																			\
+		using std::OP;
+
+		
+		UN_OP(abs)
+
+		//Square of s, built as a multiplication node; cf. Eigen abs2: https://eigen.tuxfamily.org/dox/namespaceEigen.html#a54cc34b64b4935307efc06d56cd531df
+		inline ScalarWrapper abs2(const ScalarWrapper& s) {
+			return ScalarWrapper::bin_op<_mul_>(s, s);
+		}
+	
+
+		UN_OP(sqrt)
+		//UN_OP(exp)
+		//UN_OP(log)
+		//UN_OP(log2)
+		//UN_OP(log10)
+		//UN_OP(sin)
+		//UN_OP(sinh)
+		//UN_OP(asin)
+		//UN_OP(cos)
+		//UN_OP(cosh)
+		//UN_OP(acos)
+		//UN_OP(tan)
+		//UN_OP(tanh)
+		//UN_OP(atan)
+		//UN_OP(ceil)
+		//UN_OP(floor)
+
+		BIN_OP(max) //defines max(a, b) for ScalarWrapper and adds `using std::max;`
+		inline ScalarWrapper maxi(const ScalarWrapper& a, const ScalarWrapper& b) { //same node as max(a, b); NOTE(review): presumably mirrors Eigen's maxi helper - confirm
+			return ScalarWrapper::bin_op<_max_>(a, b);
+		}
+
+		BIN_OP(min) //defines min(a, b) for ScalarWrapper and adds `using std::min;`
+		inline ScalarWrapper mini(const ScalarWrapper& a, const ScalarWrapper& b) { //same node as min(a, b); NOTE(review): presumably mirrors Eigen's mini helper - confirm
+			return ScalarWrapper::bin_op<_min_>(a, b);
+		}
+
+		//BIN_OP(atan2)
+		//BIN_OP(pow)
+
+	} //details
+} //bparser
+
+//Eigen scalar traits for ScalarWrapper, see https://eigen.tuxfamily.org/dox/structEigen_1_1NumTraits.html
+namespace Eigen {
+	template<> struct NumTraits<bparser::details::ScalarWrapper>
+	: NumTraits<double> //start from the traits of double and override the members below
+	{
+		typedef bparser::details::ScalarWrapper Real; //all related scalar types map back to ScalarWrapper
+		typedef bparser::details::ScalarWrapper NonInteger;
+		typedef bparser::details::ScalarWrapper Nested;
+
+		enum {
+			IsComplex = 0,
+			IsInteger = 0,
+			IsSigned = 1,
+			RequireInitialization = 1, //ScalarWrapper has user-defined constructors
+			ReadCost = HugeCost, //NOTE(review): all costs use Eigen's HugeCost, presumably to mark node operations as expensive - confirm
+			AddCost = HugeCost,
+			MulCost = HugeCost
+		};
+	};
+}
+
+#endif //!INCLUDE_SCALAR_WRAPPER_HH_
\ No newline at end of file
diff --git a/test/test_parser.cc b/test/test_parser.cc
index 8f905df..e226692 100644
--- a/test/test_parser.cc
+++ b/test/test_parser.cc
@@ -226,7 +226,8 @@ void test_expression() {
 
 	BP_ASSERT(test_expr("25 % cs3", {1}));
 	BP_ASSERT(test_expr("25 % cv4", {1, 0, 1}));
-
+	
+	BP_ASSERT(test_expr("[[1,2],[3,4]] @ [5,6]", { 17,39 }, { 2 }));
 	BP_ASSERT(test_expr("[3, 4] @ [[1], [2]]", {11}, {1}));
 	BP_ASSERT(test_expr("[3, 4, 1] @ [[1], [2], [3]]", {14}, {1}));
 	ASSERT_THROW(test_expr("[[1], [2], [3]] @ [3, 4, 1]", {14}, {1}), "Matmult summing dimension mismatch");
@@ -236,6 +237,37 @@ void test_expression() {
 	BP_ASSERT(test_expr("[[1],[2],[3]] @ [[1,2,3]]", {1, 2, 3, 2, 4, 6, 3, 6, 9}, {3,3}));
 	BP_ASSERT(test_expr("a=[1,2,3]; a[:, None] @ a[None,:]", {1, 2, 3, 2, 4, 6, 3, 6, 9}, {3,3}));
 
+	// 2×2 @ 2×2 → 2×2
+	BP_ASSERT(test_expr(
+		"[[1, 2], [3, 4]] @ [[5, 6], [7, 8]]",
+		{19, 22, 43, 50},    // 1*5+2*7, 1*6+2*8, 3*5+4*7, 3*6+4*8
+		{2, 2}
+	));
+	
+	// 3×1×2 @ 2×3 → 3×1×3 (batched matmul)
+	BP_ASSERT(test_expr(
+		"[[[1,2]], [[3,4]], [[5,6]]] @ [[7,8,9], [10,11,12]]",
+		{
+		27, 30, 33,   // batch 0: [1,2]×[[7,8,9],[10,11,12]]
+		61, 68, 75,   // batch 1: [3,4]×...
+		95,106,117    // batch 2: [5,6]×...
+		},
+		{3, 1, 3}
+	));
+
+	BP_ASSERT(test_expr("diag([1,2,3])", { 1, 0, 0,  0, 2, 0,  0, 0, 3 }, { 3,3 }));
+	BP_ASSERT(test_expr("diag([[1,5],[9,2]])", { 1, 2 }, { 2 }));
+	BP_ASSERT(test_expr("diag(diag([1,2,3]))", { 1, 2, 3 }, { 3 }));
+
+	BP_ASSERT(test_expr("tr([[1,9,9],[9,1,9],[9,9,1]])", { 3 }, {}));
+
+	BP_ASSERT(test_expr("norm1([-4,-3,-2,-1,0,1,2,3,4])", {20}, {}));
+	BP_ASSERT(test_expr("norm1([[-4,-3,-2],[-1,0,1],[2,3,4]])", { 7 }, {}));
+	BP_ASSERT(test_expr("norm2([-4,-3,-2,-1,0,1,2,3,4])", { 7.745966692414834 }, {}));
+	//BP_ASSERT(test_expr("norm2([[-4,-3,-2],[-1,0,1],[2,3,4]])", { 7.3484692283495345 }, {})); //Disabled: the spectral norm needs eigenvalues/singular values, and Eigen's algorithms for those use comparison operators, which bparser's ScalarWrapper does not support
+	BP_ASSERT(test_expr("normfro([[-4,-3,-2],[-1,0,1],[2,3,4]])", { 7.745966692414834 }, {}));
+	BP_ASSERT(test_expr("norminf([-4,-3,-2,-1,0,1,2,3,4])", { 4 }, {}));
+	BP_ASSERT(test_expr("norminf([[-4,-3,-2],[-1,0,1],[2,3,4]])", { 9 }, {}));
 
 	BP_ASSERT(test_expr("abs(-1)+abs(0)+abs(1)", {2}));
 	BP_ASSERT(test_expr("floor(-3.5)", {-4}, {}));
diff --git a/test/test_speed.cc b/test/test_speed.cc
index 2bdb7ce..6ac4fb4 100644
--- a/test/test_speed.cc
+++ b/test/test_speed.cc
@@ -23,6 +23,7 @@
 #include "test_tools.hh"
 
 #include "arena_alloc.hh"
+#include "arena_resource.hh"
 
 // Optimized structure, holds data in common arena
 struct ExprData {
@@ -31,7 +32,8 @@ struct ExprData {
 	{
 		uint simd_bytes = sizeof(double) * simd_size;
 
-		arena = std::make_shared<bparser::ArenaAlloc>(simd_bytes, 512 * 1012);
+		patch_arena = std::make_shared<AssemblyArena>(512 * 1012, simd_bytes);
+		arena = std::make_shared<bparser::ArenaAlloc>(*patch_arena);//(simd_bytes, 512 * 1012);
 		v1 = arena->create_array<double>(vec_size * 3);
 		fill_seq(v1, 100, 100 + 3 * vec_size);
 		v2 = arena->create_array<double>(vec_size * 3);
@@ -54,6 +56,7 @@ struct ExprData {
 	~ExprData()
 	{}
 
+	std::shared_ptr<PatchArena> patch_arena;
 	std::shared_ptr<bparser::ArenaAlloc> arena;
 	uint vec_size;
 	uint simd_size;
@@ -266,7 +269,7 @@ void test_expr(std::string expr, uint block_size, void (* func)(ExprData&)) {
 		//std::cout << "vres: " << vres << ", " << vres + block_size << ", " << vres + 2*vec_size << "\n";
 		//std::cout << "Symbols: " << print_vector(p.symbols()) << "\n";
 		//std::cout.flush();
-		p.compile(data1.arena);
+		p.compile(data1.patch_arena);
 
 		std::vector<uint> ss = std::vector<uint>(data1.subset, data1.subset+vec_size/simd_size);
 		p.set_subset(ss);