diff --git a/.classpath b/.classpath
new file mode 100755
index 00000000..1a79d80b
--- /dev/null
+++ b/.classpath
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" path="src/main/scala"/>
+	<classpathentry kind="src" path="src/test/scala"/>
+	<classpathentry kind="src" path="src/main/java"/>
+	<classpathentry kind="con" path="org.scala-ide.sdt.launching.SCALA_CONTAINER"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+	<classpathentry kind="lib" path="C:/Users/jfc.EECS/.ivy2/cache/org.scala-tools.testing/scalacheck_2.9.1/jars/scalacheck_2.9.1-1.9.jar"/> <!-- NOTE(review): the four Ivy-cache entries are still machine-specific absolute paths -->
+	<classpathentry kind="lib" path="C:/Users/jfc.EECS/.ivy2/cache/org.scalatest/scalatest/jars/scalatest-1.4.RC2.jar"/>
+	<classpathentry kind="lib" path="C:/Users/jfc.EECS/.ivy2/cache/org.scala-lang/jline/jars/jline-2.9.1.jar"/>
+	<classpathentry kind="lib" path="C:/Users/jfc.EECS/.ivy2/cache/junit/junit/jars/junit-4.5.jar"/>
+	<classpathentry kind="lib" path="lib/jhdf5.jar"/> <!-- bundled jars: project-relative so the project imports from any checkout location -->
+	<classpathentry kind="lib" path="lib/ptplot.jar"/>
+	<classpathentry kind="lib" path="lib/jcublas-0.5.0RC.jar"/>
+	<classpathentry kind="lib" path="lib/jcuda-0.5.0RC.jar"/>
+	<classpathentry kind="lib" path="lib/jcufft-0.5.0RC.jar"/>
+	<classpathentry kind="lib" path="lib/jcurand-0.5.0RC.jar"/>
+	<classpathentry kind="lib" path="lib/jcusparse-0.5.0RC.jar"/>
+	<classpathentry kind="output" path="bin"/>
+</classpath>
diff --git a/.project b/.project
new file mode 100644
index 00000000..20732747
--- /dev/null
+++ b/.project
@@ -0,0 +1,18 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>BIDMat</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.scala-ide.sdt.core.scalabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.scala-ide.sdt.core.scalanature</nature>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+	</natures>
+</projectDescription>
diff --git a/BIDMat.jar b/BIDMat.jar
new file mode 100644
index 00000000..84bac8ed
Binary files /dev/null and b/BIDMat.jar differ
diff --git a/Copyright.txt b/Copyright.txt
new file mode 100755
index 00000000..21326596
--- /dev/null
+++ b/Copyright.txt
@@ -0,0 +1,25 @@
+Copyright (c) 2012, Regents of the University of California
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the University of California nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE REGENTS OF THE UNIVERSITY OF CALIFORNIA BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/INSTALLING.txt b/INSTALLING.txt
new file mode 100755
index 00000000..8a01920d
--- /dev/null
+++ b/INSTALLING.txt
@@ -0,0 +1,28 @@
+BIDMat is a set of Scala, Java and native code libraries. To run it, you need:
+
+* A Java runtime, version 1.5 or later.
+
+* An installation of the Scala language, which you can get from here: http://www.scala-lang.org/
+  You need Scala 2.9.1 or later. We have tested extensively on 2.9.1 and 2.9.2.
+
+* A 64-bit Intel machine (Linux, Windows, and *soon* Mac), if you plan to use any native
+  code acceleration. We have tested on Windows 7 and RedHat Enterprise Linux 6. 
+  You should set Mat.noMKL=true in Scala if you don't have one. A lot of
+  the code will still work, but that's really not the point of the library.
+
+* To use these libraries with a CUDA GPU, you need a CUDA-enabled GPU device. The code has
+  been developed for GTX 600-series, and this is the ideal platform. The code should work
+  with Tesla K10 devices, which share the same GPU chip as the 680/690 which is our
+  main platform. Most operations will work also with 500-series and we are working to add
+  the others. 
+
+* You need the NVIDIA CUDA driver and the CUDA Toolkit from the NVIDIA website
+  (http://developer.nvidia.com/cuda/cuda-downloads). This version is based on CUDA 4.2.
+  CUDA 5.0 support should appear soon.
+
+On Linux, you should just be able to run the "bidmat" script from the top-level
+directory. If that doesn't work, it's probably because $JAVA_HOME is not set in your
+environment. You can set it inside the script instead.
+
+On Windows you will need to set the system PATH variable to include the lib subdirectory,
+which is at <root_dir>/BIDMat/lib/win64, and which contains the native code DLLs.
diff --git a/README.md b/README.md
index 84cebc04..d166638e 100755
--- a/README.md
+++ b/README.md
@@ -1,4 +1,7 @@
 
+NOTE: Documentation for BIDMat is available here: 
+http://bid.berkeley.edu/BIDMat/index.php/Main_Page
+
 BIDMat is a matrix library intended to support large-scale exploratory
 data analysis and to accelerate production deployment on single
 machines or clusters.  While there are many excellent tools exist to
@@ -12,17 +15,19 @@ of a high-end programming language including good general-purpose data
 sructures. And also of Scala's compiler-based REPL (Read-Eval-Print Loop). 
 
 2. To leverage native machine performance through native libraries
-(Intel MKL, HDF5 and string/XML processing). Java/Scala are excellent
-high-level languages, but are one or two orders of magnitude away from
-native performance in some key areas: especially matrix algebra and
-string processing, and below the bar to a lesser degree in File-IO.
+(Intel MKL, HDF5, CUDA and string/XML processing). Java/Scala are
+excellent high-level languages, but are one or two orders of magnitude
+away from native performance in some key areas: especially matrix
+algebra and string processing, and below the bar to a lesser degree in
+File-IO.
 
 3. To leverage GPU hardware and GPU-based data as a first-class
 object.  GPUs now offer large improvements (again one or more orders
 of magnitude) over CPU performance in many areas that are relevant to
 data mining: matrix algebra, transcendental functions, random number
-generation, network and graph algorithms and even natural language
-parsing. Our own work suggests that the list is going to continue to
-grow, and that GPU acceleration will fairly soon be a requirement for
-competitive performance in most algorithms.
+generation. These advantages in low-level operations carry over to
+network and graph algorithms and even natural language parsing. Our
+own work suggests that the list is going to continue to grow, and that
+GPU acceleration will fairly soon be a requirement for competitive
+performance in most algorithms.
  
diff --git a/bidmat b/bidmat
new file mode 100755
index 00000000..f3f48319
--- /dev/null
+++ b/bidmat
@@ -0,0 +1,22 @@
+#!/bin/bash
+export BIDMAT_ROOT="$( cd -P "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" # directory containing this script
+export BIDMAT_ROOT="$( echo "${BIDMAT_ROOT}" | sed 's+^/cygdrive/\([a-zA-Z]\)+\1:+' )" # Cygwin: /cygdrive/<drive>/... -> <drive>:/... for any drive letter
+# This is only needed/works on Linux
+export LD_LIBRARY_PATH="${BIDMAT_ROOT}/lib/linux64:${BIDMAT_ROOT}/lib/linux64/JCUDA5.0:/usr/local/cuda-5.0/lib64:${LD_LIBRARY_PATH}" 
+# export JAVA_HOME="" # Set here if not set in environment
+export JAVA_OPTS="-Xmx12G -Xms128M" # Set as much memory as possible
+# Fix these if needed
+export JCUDA_VERSION="0.5.0RC"
+export JCUDA_LIBDIR="${BIDMAT_ROOT}/lib"
+export LIBDIR="${BIDMAT_ROOT}/lib"
+# The classpath is assembled with ';' separators and rewritten to ':' below on non-Windows hosts.
+export BIDMAT_LIBS="${BIDMAT_ROOT}/BIDMat.jar;${LIBDIR}/ptplot.jar;${LIBDIR}/ptplotapplication.jar;${LIBDIR}/jhdf5.jar"
+export JCUDA_LIBS="${JCUDA_LIBDIR}/jcuda-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcublas-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcufft-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcurand-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcusparse-${JCUDA_VERSION}.jar"
+
+export ALL_LIBS="${BIDMAT_LIBS};${JCUDA_LIBS};${JAVA_HOME}/lib/tools.jar"
+
+if [ ! "$OS" = "Windows_NT" ]; then
+    export ALL_LIBS=`echo "${ALL_LIBS}" | sed 's/;/:/g'`
+fi
+# Quote the init-script path so an install directory containing spaces still works.
+scala -nobootcp -cp "${ALL_LIBS}" -Yrepl-sync -i "${LIBDIR}/bidmat_init.scala"
\ No newline at end of file
diff --git a/bidmat.cmd b/bidmat.cmd
new file mode 100755
index 00000000..8db514e2
--- /dev/null
+++ b/bidmat.cmd
@@ -0,0 +1,20 @@
+@ECHO OFF
+:: Set JAVA_HOME here if not set in environment
+:: SET JAVA_HOME= 
+:: Set as much memory as possible
+(SET JAVA_OPTS=-Xmx12G -Xms128M)
+:: Fix these if needed
+:: Paths are resolved against the directory containing this script, so it can be launched from any current directory
+SET JCUDA_VERSION=0.5.0RC
+SET JCUDA_LIBDIR=%~dp0lib
+SET LIBDIR=%~dp0lib
+SET PATH=%LIBDIR%\win64;%LIBDIR%\win64\JCUDA5.0;%PATH%
+
+SET BIDMAT_LIBS=%~dp0BIDMat.jar;%LIBDIR%\ptplot.jar;%LIBDIR%\ptplotapplication.jar;%LIBDIR%\jhdf5.jar
+
+SET JCUDA_LIBS=%JCUDA_LIBDIR%\jcuda-%JCUDA_VERSION%.jar;%JCUDA_LIBDIR%\jcublas-%JCUDA_VERSION%.jar;%JCUDA_LIBDIR%\jcufft-%JCUDA_VERSION%.jar;%JCUDA_LIBDIR%\jcurand-%JCUDA_VERSION%.jar;%JCUDA_LIBDIR%\jcusparse-%JCUDA_VERSION%.jar
+
+SET ALL_LIBS=%BIDMAT_LIBS%;%JCUDA_LIBS%;%JAVA_HOME%\lib\tools.jar
+echo %ALL_LIBS%
+
+scala -nobootcp -cp "%ALL_LIBS%" -Yrepl-sync -i "%LIBDIR%\bidmat_init.scala"
\ No newline at end of file
diff --git a/bidmat4.2 b/bidmat4.2
new file mode 100755
index 00000000..4a417778
--- /dev/null
+++ b/bidmat4.2
@@ -0,0 +1,21 @@
+#!/bin/bash
+export BIDMAT_ROOT="$( cd -P "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" # directory containing this script, not the caller's cwd
+# This is only needed/works on Linux
+export LD_LIBRARY_PATH="${BIDMAT_ROOT}/lib/linux64:${BIDMAT_ROOT}/lib/linux64/JCUDA4.2:/usr/local/cuda-4.2/lib64:${LD_LIBRARY_PATH}" 
+# export JAVA_HOME="" # Set here if not set in environment
+export JAVA_OPTS="-Xmx12G -Xms128M" # Set as much memory as possible
+# Fix these if needed
+export JCUDA_VERSION="0.4.2"
+export JCUDA_LIBDIR="${BIDMAT_ROOT}/lib"
+export LIBDIR="${BIDMAT_ROOT}/lib"
+# The classpath is assembled with ';' separators and rewritten to ':' below on non-Windows hosts.
+export BIDMAT_LIBS="${BIDMAT_ROOT}/BIDMat.jar;${LIBDIR}/ptplot.jar;${LIBDIR}/ptplotapplication.jar;${LIBDIR}/jhdf5.jar"
+export JCUDA_LIBS="${JCUDA_LIBDIR}/jcuda-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcublas-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcufft-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcurand-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcusparse-${JCUDA_VERSION}.jar"
+
+export ALL_LIBS="${BIDMAT_LIBS};${JCUDA_LIBS};${JAVA_HOME}/lib/tools.jar"
+
+if [ ! "$OS" = "Windows_NT" ]; then
+    export ALL_LIBS=`echo "${ALL_LIBS}" | sed 's/;/:/g'`
+fi
+# Quote the init-script path so an install directory containing spaces still works.
+scala -nobootcp -cp "${ALL_LIBS}" -Yrepl-sync -i "${LIBDIR}/bidmat_init.scala"
\ No newline at end of file
diff --git a/build.sbt b/build.sbt
new file mode 100755
index 00000000..e88a5e45
--- /dev/null
+++ b/build.sbt
@@ -0,0 +1,48 @@
+
+name := "BIDMat"
+
+version := "0.1.0"
+
+organization := "edu.berkeley.bid"
+
+scalaVersion := "2.9.1"
+
+resolvers ++= Seq(
+  "Scala Tools Snapshots" at "http://scala-tools.org/repo-snapshots/"
+)
+
+libraryDependencies <<= (scalaVersion, libraryDependencies) { (sv, deps) =>
+  deps :+ ("org.scala-lang" % "scala-compiler" % sv)
+}
+
+libraryDependencies += "org.scala-lang" % "jline" % "2.9.1"
+
+libraryDependencies += "org.scalatest" %% "scalatest" % "1.8" % "test"
+
+libraryDependencies += "org.scala-tools.testing" %% "scalacheck" % "1.9" % "test"
+
+libraryDependencies += "junit" % "junit" % "4.5" % "test"
+
+credentials += Credentials(Path.userHome / ".ivy2" / ".credentials")
+
+javacOptions ++= Seq("-source", "1.5", "-target", "1.5")
+
+scalacOptions ++= Seq("-deprecation","-target:jvm-1.5")
+
+initialCommands := IO.read(file("lib/bidmat_init.scala")) // IO.read closes the file; the previous Source.fromFile handle was never closed
+
+javaOptions += "-Xmx12g"
+
+seq(ProguardPlugin.proguardSettings :_*)
+
+proguardOptions ++= Seq (
+  "-keep class scala.** { *; }",
+  "-keep class org.jfree.** { *; }",
+  keepMain("scala.tools.nsc.MainGenericRunner"),
+  keepLimitedSerializability,
+  keepAllScala,
+  "-keep class ch.epfl.** { *; }",
+  "-keep interface scala.ScalaObject"
+)
+
+
diff --git a/jni/src/BIDMat_CBLAS.c b/jni/src/BIDMat_CBLAS.c
new file mode 100755
index 00000000..29c5ce5e
--- /dev/null
+++ b/jni/src/BIDMat_CBLAS.c
@@ -0,0 +1,358 @@
+#include <jni.h>
+#include <mkl.h>
+#include <mkl_trans.h>
+
+
+JNIEXPORT jdouble JNICALL Java_edu_berkeley_bid_CBLAS_ddot 
+(JNIEnv * env, jobject calling_obj, jint N, jdoubleArray jX, jint incX, jdoubleArray jY, jint incY){
+	jdouble * X = (*env)->GetPrimitiveArrayCritical(env, jX, JNI_FALSE);
+	jdouble * Y = (*env)->GetPrimitiveArrayCritical(env, jY, JNI_FALSE);
+	jdouble returnValue;
+	/* JNI wrapper: returns cblas_ddot over N elements of jX and jY with strides incX and incY. */
+	returnValue = cblas_ddot(N, X, incX, Y, incY);
+	/* Leave the critical regions in reverse order of acquisition before returning to Java. */
+	(*env)->ReleasePrimitiveArrayCritical(env, jY, Y, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jX, X, 0);
+	return returnValue;
+}
+
+
+JNIEXPORT jdouble JNICALL Java_edu_berkeley_bid_CBLAS_ddotxx 
+(JNIEnv * env, jobject calling_obj, jint N, jdoubleArray jX, jint startX, jdoubleArray jY, jint startY){
+	jdouble * X = (*env)->GetPrimitiveArrayCritical(env, jX, JNI_FALSE);
+	jdouble * Y = (*env)->GetPrimitiveArrayCritical(env, jY, JNI_FALSE);
+	jdouble returnValue;
+	/* Offset variant of ddot: unit-stride cblas_ddot over N elements starting at X[startX] and Y[startY]. */
+	returnValue = cblas_ddot(N, X+startX, 1, Y+startY, 1);
+	/* Leave the critical regions in reverse order of acquisition before returning to Java. */
+	(*env)->ReleasePrimitiveArrayCritical(env, jY, Y, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jX, X, 0);
+	return returnValue;
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_daxpy
+(JNIEnv * env, jobject calling_obj, jint N, jdouble a, jdoubleArray jX, jint incX, jdoubleArray jY, jint incY){
+	jdouble * X = (*env)->GetPrimitiveArrayCritical(env, jX, JNI_FALSE);
+	jdouble * Y = (*env)->GetPrimitiveArrayCritical(env, jY, JNI_FALSE);
+	/* JNI wrapper: forwards to cblas_daxpy (BLAS axpy, Y := a*X + Y) with strides incX and incY. */
+        cblas_daxpy(N, a, X, incX, Y, incY);
+	/* Release mode 0 commits any changes back to the Java arrays; jY carries the result. */
+	(*env)->ReleasePrimitiveArrayCritical(env, jY, Y, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jX, X, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_daxpyxx
+(JNIEnv * env, jobject calling_obj, jint N, jdouble a, jdoubleArray jX, jint startX, jdoubleArray jY, jint startY){
+	jdouble * X = (*env)->GetPrimitiveArrayCritical(env, jX, JNI_FALSE);
+	jdouble * Y = (*env)->GetPrimitiveArrayCritical(env, jY, JNI_FALSE);
+	/* Offset variant of daxpy: unit-stride cblas_daxpy over N elements starting at X[startX] and Y[startY]. */
+        cblas_daxpy(N, a, X+startX, 1, Y+startY, 1);
+	/* Release mode 0 commits any changes back to the Java arrays; jY carries the result. */
+	(*env)->ReleasePrimitiveArrayCritical(env, jY, Y, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jX, X, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_dgemv 
+(JNIEnv * env, jobject calling_obj, jint order, jint transA, jint M, jint N, jdouble alpha, 
+ jdoubleArray jA, jint lda, jdoubleArray jX, jint incX, jdouble beta, jdoubleArray jY, jint incY){
+	jdouble * A = (*env)->GetPrimitiveArrayCritical(env, jA, JNI_FALSE);
+	jdouble * X = (*env)->GetPrimitiveArrayCritical(env, jX, JNI_FALSE);
+	jdouble * Y = (*env)->GetPrimitiveArrayCritical(env, jY, JNI_FALSE);
+	/* JNI wrapper around cblas_dgemv; order and transA arrive as jint and are cast to the CBLAS enums. */
+	cblas_dgemv((CBLAS_ORDER)order, (CBLAS_TRANSPOSE)transA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
+	/* Leave the critical regions in reverse order of acquisition. */
+	(*env)->ReleasePrimitiveArrayCritical(env, jY, Y, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jX, X, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jA, A, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_dgemm 
+(JNIEnv * env, jobject calling_obj, jint order, jint transA, jint transB, jint M, jint N, jint K, 
+ jdouble alpha, jdoubleArray jA, jint lda, jdoubleArray jB, jint ldb, jdouble beta, jdoubleArray jC, jint ldc){
+	jdouble * A = (*env)->GetPrimitiveArrayCritical(env, jA, JNI_FALSE);
+	jdouble * B = (*env)->GetPrimitiveArrayCritical(env, jB, JNI_FALSE);
+	jdouble * C = (*env)->GetPrimitiveArrayCritical(env, jC, JNI_FALSE);
+	/* JNI wrapper around cblas_dgemm; order, transA and transB arrive as jint and are cast to the CBLAS enums. */
+	cblas_dgemm((CBLAS_ORDER)order, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB, M, N, K, 
+                    alpha, A, lda, B, ldb, beta, C, ldc);
+	/* Leave the critical regions in reverse order of acquisition. */
+	(*env)->ReleasePrimitiveArrayCritical(env, jC, C, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jB, B, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jA, A, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_domatcopy
+(JNIEnv * env, jobject calling_obj, jstring j_order, jstring j_transA, jint M, jint N,
+ jdouble alpha, jdoubleArray j_A, jint lda, jdoubleArray j_B, jint ldb) {
+	char * order = (char *)(*env)->GetStringUTFChars(env, j_order, 0);
+	char * transA = (char *)(*env)->GetStringUTFChars(env, j_transA, 0);
+	jdouble * A = (*env)->GetPrimitiveArrayCritical(env, j_A, JNI_FALSE);
+	jdouble * B = (*env)->GetPrimitiveArrayCritical(env, j_B, JNI_FALSE);
+	/* JNI wrapper around mkl_domatcopy; only the first character of the order and transA strings is passed on. */
+	mkl_domatcopy(order[0], transA[0], M, N, alpha, A, lda, B, ldb);
+	/* Arrays are unpinned first; the strings were fetched before pinning and are released after it. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_B, B, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_A, A, 0);
+	(*env)->ReleaseStringUTFChars(env, j_transA, transA);
+	(*env)->ReleaseStringUTFChars(env, j_order, order);
+}
+
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_dmcscm 
+(JNIEnv * env, jobject calling_obj, jint M, jint N, jdoubleArray j_A, jint lda, 
+ jdoubleArray j_B, jintArray j_ir, jintArray j_jc, jdoubleArray j_C, jint ldc){
+	jdouble * A = (*env)->GetPrimitiveArrayCritical(env, j_A, JNI_FALSE);
+	jdouble * B = (*env)->GetPrimitiveArrayCritical(env, j_B, JNI_FALSE);
+	jint * ir = (*env)->GetPrimitiveArrayCritical(env, j_ir, JNI_FALSE);
+	jint * jc = (*env)->GetPrimitiveArrayCritical(env, j_jc, JNI_FALSE);
+	jdouble * C = (*env)->GetPrimitiveArrayCritical(env, j_C, JNI_FALSE);
+	/* For each i in [0,N) and each j in jc[i]..jc[i+1]: adds B[j] times the M-vector at A+ir0*lda into the M-vector at C+i*ldc. */
+        int ioff = jc[0]; /* subtracted from every jc and ir entry; presumably the index base of a compressed-column layout -- confirm */
+        int i, j, ir0;
+        for (i = 0; i < N; i++) {
+          for (j = jc[i]-ioff; j < jc[i+1]-ioff; j++) { /* reads jc[i+1], so jc needs at least N+1 entries */
+            ir0 = ir[j]-ioff;
+            cblas_daxpy(M, B[j], A+(ir0*lda), 1, C+(i*ldc), 1);
+          }
+        }
+	/* Leave the critical regions in reverse order of acquisition; jC carries the accumulated result. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_C, C, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_jc, jc, 0);	
+        (*env)->ReleasePrimitiveArrayCritical(env, j_ir, ir, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_B, B, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_A, A, 0);
+}
+
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_dmcsrm 
+(JNIEnv * env, jobject calling_obj, jint M, jint N, jdoubleArray j_A, jint lda, 
+ jdoubleArray j_B, jintArray j_ir, jintArray j_jc, jdoubleArray j_C, jint ldc){
+	jdouble * A = (*env)->GetPrimitiveArrayCritical(env, j_A, JNI_FALSE);
+	jdouble * B = (*env)->GetPrimitiveArrayCritical(env, j_B, JNI_FALSE);
+	jint * ir = (*env)->GetPrimitiveArrayCritical(env, j_ir, JNI_FALSE);
+	jint * jc = (*env)->GetPrimitiveArrayCritical(env, j_jc, JNI_FALSE);
+	jdouble * C = (*env)->GetPrimitiveArrayCritical(env, j_C, JNI_FALSE);
+	/* For each i in [0,N) and each j in jc[i]..jc[i+1]: adds B[j] times the M-vector at A+i*lda into the M-vector at C+k*ldc. */
+        int ioff = jc[0]; /* subtracted from every jc and ir entry; presumably the index base of the compressed sparse layout -- confirm */
+        int i, j, k;
+        for (i = 0; i < N; i++) {
+          for (j = jc[i]-ioff; j < jc[i+1]-ioff; j++) { /* reads jc[i+1], so jc needs at least N+1 entries */
+            k = ir[j]-ioff;
+            cblas_daxpy(M, B[j], A+(i*lda), 1, C+(k*ldc), 1);
+          }
+        }
+	/* Leave the critical regions in reverse order of acquisition; jC carries the accumulated result. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_C, C, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_jc, jc, 0);	
+    (*env)->ReleasePrimitiveArrayCritical(env, j_ir, ir, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_B, B, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_A, A, 0);
+}
+
+JNIEXPORT jfloat JNICALL Java_edu_berkeley_bid_CBLAS_sdot 
+(JNIEnv * env, jobject calling_obj, jint N, jfloatArray jX, jint incX, jfloatArray jY, jint incY){
+	jfloat * X = (*env)->GetPrimitiveArrayCritical(env, jX, JNI_FALSE);
+	jfloat * Y = (*env)->GetPrimitiveArrayCritical(env, jY, JNI_FALSE);
+	jfloat returnValue;
+	/* JNI wrapper: returns cblas_sdot over N elements of jX and jY with strides incX and incY. */
+	returnValue = cblas_sdot(N, X, incX, Y, incY);
+	/* Leave the critical regions in reverse order of acquisition before returning to Java. */
+	(*env)->ReleasePrimitiveArrayCritical(env, jY, Y, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jX, X, 0);
+	return returnValue;
+}
+
+
+JNIEXPORT jfloat JNICALL Java_edu_berkeley_bid_CBLAS_sdotxx 
+(JNIEnv * env, jobject calling_obj, jint N, jfloatArray jX, jint startX, jfloatArray jY, jint startY){
+	jfloat * X = (*env)->GetPrimitiveArrayCritical(env, jX, JNI_FALSE);
+	jfloat * Y = (*env)->GetPrimitiveArrayCritical(env, jY, JNI_FALSE);
+	jfloat returnValue;
+	/* Offset variant of sdot: unit-stride cblas_sdot over N elements starting at X[startX] and Y[startY]. */
+	returnValue = cblas_sdot(N, X+startX, 1, Y+startY, 1);
+	/* Leave the critical regions in reverse order of acquisition before returning to Java. */
+	(*env)->ReleasePrimitiveArrayCritical(env, jY, Y, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jX, X, 0);
+	return returnValue;
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_sgemv 
+(JNIEnv * env, jobject calling_obj, jint order, jint transA, jint M, jint N, jfloat alpha, 
+jfloatArray jA, jint lda, jfloatArray jX, jint incX, jfloat beta, jfloatArray jY, jint incY){
+	jfloat * A = (*env)->GetPrimitiveArrayCritical(env, jA, JNI_FALSE);
+	jfloat * X = (*env)->GetPrimitiveArrayCritical(env, jX, JNI_FALSE);
+	jfloat * Y = (*env)->GetPrimitiveArrayCritical(env, jY, JNI_FALSE);
+	/* JNI wrapper around cblas_sgemv; order and transA arrive as jint and are cast to the CBLAS enums. */
+	cblas_sgemv((CBLAS_ORDER)order, (CBLAS_TRANSPOSE)transA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
+	/* Leave the critical regions in reverse order of acquisition. */
+	(*env)->ReleasePrimitiveArrayCritical(env, jY, Y, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jX, X, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jA, A, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_sgemm 
+(JNIEnv * env, jobject calling_obj, jint order, jint transA, jint transB, jint M, jint N, jint K, 
+jfloat alpha, jfloatArray jA, jint lda, jfloatArray jB, jint ldb, jfloat beta, jfloatArray jC, jint ldc){
+	jfloat * A = (*env)->GetPrimitiveArrayCritical(env, jA, JNI_FALSE);
+	jfloat * B = (*env)->GetPrimitiveArrayCritical(env, jB, JNI_FALSE);
+	jfloat * C = (*env)->GetPrimitiveArrayCritical(env, jC, JNI_FALSE);
+	/* JNI wrapper around cblas_sgemm; order, transA and transB arrive as jint and are cast to the CBLAS enums. */
+	cblas_sgemm((CBLAS_ORDER)order, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB, M, N, K, 
+                    alpha, A, lda, B, ldb, beta, C, ldc);
+	/* Leave the critical regions in reverse order of acquisition. */
+	(*env)->ReleasePrimitiveArrayCritical(env, jC, C, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jB, B, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jA, A, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_somatcopy
+(JNIEnv * env, jobject calling_obj, jstring j_order, jstring j_transA, jint M, jint N,
+ jfloat alpha, jfloatArray j_A, jint lda, jfloatArray j_B, jint ldb) {
+	char * order = (char *)(*env)->GetStringUTFChars(env, j_order, 0);
+	char * transA = (char *)(*env)->GetStringUTFChars(env, j_transA, 0);
+	jfloat * A = (*env)->GetPrimitiveArrayCritical(env, j_A, JNI_FALSE);
+	jfloat * B = (*env)->GetPrimitiveArrayCritical(env, j_B, JNI_FALSE);
+	/* JNI wrapper around mkl_somatcopy; only the first character of the order and transA strings is passed on. */
+	mkl_somatcopy(order[0], transA[0], M, N, alpha, A, lda, B, ldb);
+	/* Arrays are unpinned first; the strings were fetched before pinning and are released after it. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_B, B, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_A, A, 0);
+	(*env)->ReleaseStringUTFChars(env, j_transA, transA);
+	(*env)->ReleaseStringUTFChars(env, j_order, order);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_saxpy
+(JNIEnv * env, jobject calling_obj, jint N, jfloat a, jfloatArray jX, jint incX, jfloatArray jY, jint incY){
+	jfloat * X = (*env)->GetPrimitiveArrayCritical(env, jX, JNI_FALSE);
+	jfloat * Y = (*env)->GetPrimitiveArrayCritical(env, jY, JNI_FALSE);
+	/* JNI wrapper: forwards to cblas_saxpy (BLAS axpy, Y := a*X + Y) with strides incX and incY. */
+        cblas_saxpy(N, a, X, incX, Y, incY);
+	/* Release mode 0 commits any changes back to the Java arrays; jY carries the result. */
+	(*env)->ReleasePrimitiveArrayCritical(env, jY, Y, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jX, X, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_saxpyxx
+(JNIEnv * env, jobject calling_obj, jint N, jfloat a, jfloatArray jX, jint startX, jfloatArray jY, jint startY){
+	jfloat * X = (*env)->GetPrimitiveArrayCritical(env, jX, JNI_FALSE);
+	jfloat * Y = (*env)->GetPrimitiveArrayCritical(env, jY, JNI_FALSE);
+	/* Offset variant of saxpy: unit-stride cblas_saxpy over N elements starting at X[startX] and Y[startY]. */
+        cblas_saxpy(N, a, X+startX, 1, Y+startY, 1);
+	/* Release mode 0 commits any changes back to the Java arrays; jY carries the result. */
+	(*env)->ReleasePrimitiveArrayCritical(env, jY, Y, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jX, X, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_smcscm 
+(JNIEnv * env, jobject calling_obj, jint M, jint N, jfloatArray j_A, jint lda, 
+ jfloatArray j_B, jintArray j_ir, jintArray j_jc, jfloatArray j_C, jint ldc){
+	jfloat * A = (*env)->GetPrimitiveArrayCritical(env, j_A, JNI_FALSE);
+	jfloat * B = (*env)->GetPrimitiveArrayCritical(env, j_B, JNI_FALSE);
+	jint * ir = (*env)->GetPrimitiveArrayCritical(env, j_ir, JNI_FALSE);
+	jint * jc = (*env)->GetPrimitiveArrayCritical(env, j_jc, JNI_FALSE);
+	jfloat * C = (*env)->GetPrimitiveArrayCritical(env, j_C, JNI_FALSE);
+	/* For each i in [0,N) and each j in jc[i]..jc[i+1]: adds B[j] times the M-vector at A+ir0*lda into the M-vector at C+i*ldc. */
+        int ioff = jc[0]; /* subtracted from every jc and ir entry; presumably the index base of a compressed-column layout -- confirm */
+        int i, j, ir0;
+        for (i = 0; i < N; i++) {
+          for (j = jc[i]-ioff; j < jc[i+1]-ioff; j++) { /* reads jc[i+1], so jc needs at least N+1 entries */
+            ir0 = ir[j]-ioff;
+            cblas_saxpy(M, B[j], A+(ir0*lda), 1, C+(i*ldc), 1);
+          }
+        }
+	/* Leave the critical regions in reverse order of acquisition; jC carries the accumulated result. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_C, C, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_jc, jc, 0);	
+        (*env)->ReleasePrimitiveArrayCritical(env, j_ir, ir, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_B, B, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_A, A, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_smcsrm 
+(JNIEnv * env, jobject calling_obj, jint M, jint N, jfloatArray j_A, jint lda, 
+ jfloatArray j_B, jintArray j_ir, jintArray j_jc, jfloatArray j_C, jint ldc){
+	jfloat * A = (*env)->GetPrimitiveArrayCritical(env, j_A, JNI_FALSE);
+	jfloat * B = (*env)->GetPrimitiveArrayCritical(env, j_B, JNI_FALSE);
+	jint * ir = (*env)->GetPrimitiveArrayCritical(env, j_ir, JNI_FALSE);
+	jint * jc = (*env)->GetPrimitiveArrayCritical(env, j_jc, JNI_FALSE);
+	jfloat * C = (*env)->GetPrimitiveArrayCritical(env, j_C, JNI_FALSE);
+	/* For each i in [0,N) and each j in jc[i]..jc[i+1]: adds B[j] times the M-vector at A+i*lda into the M-vector at C+k*ldc. */
+        int ioff = jc[0]; /* subtracted from every jc and ir entry; presumably the index base of the compressed sparse layout -- confirm */
+        int i, j, k;
+        for (i = 0; i < N; i++) {
+          for (j = jc[i]-ioff; j < jc[i+1]-ioff; j++) { /* reads jc[i+1], so jc needs at least N+1 entries */
+            k = ir[j]-ioff;
+            cblas_saxpy(M, B[j], A+(i*lda), 1, C+(k*ldc), 1);
+          }
+        }
+	/* Leave the critical regions in reverse order of acquisition; jC carries the accumulated result. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_C, C, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_jc, jc, 0);	
+        (*env)->ReleasePrimitiveArrayCritical(env, j_ir, ir, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_B, B, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_A, A, 0);
+}
+
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_cgemv 
+(JNIEnv * env, jobject calling_obj, jint order, jint transA, jint M, jint N, jfloatArray jAlpha, 
+jfloatArray jA, jint lda, jfloatArray jX, jint incX, jfloatArray jBeta, jfloatArray jY, jint incY){
+	jfloat * A = (*env)->GetPrimitiveArrayCritical(env, jA, JNI_FALSE);
+	jfloat * X = (*env)->GetPrimitiveArrayCritical(env, jX, JNI_FALSE);
+	jfloat * Y = (*env)->GetPrimitiveArrayCritical(env, jY, JNI_FALSE);
+	jfloat * alpha = (*env)->GetPrimitiveArrayCritical(env, jAlpha, JNI_FALSE);
+	jfloat * beta = (*env)->GetPrimitiveArrayCritical(env, jBeta, JNI_FALSE);
+	/* JNI wrapper around cblas_cgemv; alpha and beta are passed as float arrays (presumably one re/im pair each -- confirm). */
+	cblas_cgemv((CBLAS_ORDER)order, (CBLAS_TRANSPOSE)transA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
+	/* All five pinned arrays, including the scalars, are released in reverse order of acquisition. */
+	(*env)->ReleasePrimitiveArrayCritical(env, jBeta, beta, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jAlpha, alpha, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jY, Y, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jX, X, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jA, A, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_cgemm 
+(JNIEnv * env, jobject calling_obj, jint order, jint transA, jint transB, jint M, jint N, jint K, 
+jfloatArray jAlpha, jfloatArray jA, jint lda, jfloatArray jB, jint ldb, jfloatArray jBeta, jfloatArray jC, jint ldc){
+	jfloat * A = (*env)->GetPrimitiveArrayCritical(env, jA, JNI_FALSE);
+	jfloat * B = (*env)->GetPrimitiveArrayCritical(env, jB, JNI_FALSE);
+	jfloat * C = (*env)->GetPrimitiveArrayCritical(env, jC, JNI_FALSE);
+	jfloat * alpha = (*env)->GetPrimitiveArrayCritical(env, jAlpha, JNI_FALSE);
+	jfloat * beta = (*env)->GetPrimitiveArrayCritical(env, jBeta, JNI_FALSE);
+	/* JNI wrapper around cblas_cgemm; order/transA/transB are cast to the CBLAS enums, alpha and beta are passed as float arrays. */
+	cblas_cgemm((CBLAS_ORDER)order, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
+	(*env)->ReleasePrimitiveArrayCritical(env, jBeta, beta, 0);   /* previously never released: left the critical region open */
+	(*env)->ReleasePrimitiveArrayCritical(env, jAlpha, alpha, 0); /* previously never released; same order as cgemv */
+	(*env)->ReleasePrimitiveArrayCritical(env, jC, C, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jB, B, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jA, A, 0);
+}
+
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_caxpy
+(JNIEnv * env, jobject calling_obj, jint N, jfloatArray jA, jfloatArray jX, jint incX, 
+ jfloatArray jY, jint incY){
+	jfloat * X = (*env)->GetPrimitiveArrayCritical(env, jX, JNI_FALSE);
+	jfloat * Y = (*env)->GetPrimitiveArrayCritical(env, jY, JNI_FALSE);
+	jfloat * a = (*env)->GetPrimitiveArrayCritical(env, jA, JNI_FALSE);
+	/* JNI wrapper around cblas_caxpy; the scalar a is passed as a float array (presumably one re/im pair -- confirm). */
+     cblas_caxpy(N, a, X, incX, Y, incY);
+	/* All three pinned arrays are released in reverse order of acquisition. */
+	(*env)->ReleasePrimitiveArrayCritical(env, jA, a, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jY, Y, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jX, X, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_caxpyxx
+(JNIEnv * env, jobject calling_obj, jint N, jfloatArray jA, jfloatArray jX, jint startX, jfloatArray jY, jint startY){
+	jfloat * X = (*env)->GetPrimitiveArrayCritical(env, jX, JNI_FALSE);
+	jfloat * Y = (*env)->GetPrimitiveArrayCritical(env, jY, JNI_FALSE);
+	jfloat * a = (*env)->GetPrimitiveArrayCritical(env, jA, JNI_FALSE);
+	/* Offset variant of caxpy: unit-stride cblas_caxpy from X+startX and Y+startY; offsets are applied in jfloat units. */
+        cblas_caxpy(N, a, X+startX, 1, Y+startY, 1);
+	/* All three pinned arrays are released in reverse order of acquisition. */
+	(*env)->ReleasePrimitiveArrayCritical(env, jA, a, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jY, Y, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, jX, X, 0);
+}
diff --git a/jni/src/BIDMat_CUMAT.cpp b/jni/src/BIDMat_CUMAT.cpp
new file mode 100755
index 00000000..03054a1d
--- /dev/null
+++ b/jni/src/BIDMat_CUMAT.cpp
@@ -0,0 +1,138 @@
+
+#include <jni.h>
+#include <cuda_runtime.h>
+#include "Logger.hpp"
+#include "JNIUtils.hpp"
+#include "PointerUtils.hpp"
+#include "MatKernel.hpp"
+
+extern "C" {
+
+  // Library load hook: sets up the JNI/pointer helper modules and reports the JNI version in use.
+  JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM *jvm, void *reserved)
+  {
+    // GetEnv yields JNI_OK on success; any other result aborts the load.
+    JNIEnv *env = NULL;
+    if (jvm->GetEnv((void **)&env, JNI_VERSION_1_4) != JNI_OK)
+      {
+        return JNI_ERR;
+      }
+
+    Logger::log(LOG_TRACE, "Initializing JCublas\n");
+
+    // Initialize the JNIUtils and PointerUtils; a failure in either one fails the load.
+    if (initJNIUtils(env) == JNI_ERR) return JNI_ERR;
+    if (initPointerUtils(env) == JNI_ERR) return JNI_ERR;
+
+    // Returned on success: the same JNI version that was requested from GetEnv above.
+    return JNI_VERSION_1_4;
+  }
+
+  // JNI binding for CUMAT.applyop: unwraps the three pointer objects as float* and calls apply_binop.
+  JNIEXPORT jint JNICALL Java_edu_berkeley_bid_CUMAT_applyop 
+  (JNIEnv *env, jobject obj, jobject jA, jint Anrows, jint Ancols, 
+   jobject jB, jint Bnrows, jint Bncols, jobject jC, jint opn) 
+  {
+    float *lhs = (float*)getPointer(env, jA);
+    float *rhs = (float*)getPointer(env, jB);
+    float *out = (float*)getPointer(env, jC);
+    return apply_binop(lhs, Anrows, Ancols, rhs, Bnrows, Bncols, out, opn);
+  }
+
+  // JNI binding for CUMAT.applyiop: unwraps the three pointer objects as int* and calls apply_biniop.
+  JNIEXPORT jint JNICALL Java_edu_berkeley_bid_CUMAT_applyiop 
+  (JNIEnv *env, jobject obj, jobject jA, jint Anrows, jint Ancols, 
+   jobject jB, jint Bnrows, jint Bncols, jobject jC, jint opn) 
+  {
+    int *lhs = (int*)getPointer(env, jA);
+    int *rhs = (int*)getPointer(env, jB);
+    int *out = (int*)getPointer(env, jC);
+    return apply_biniop(lhs, Anrows, Ancols, rhs, Bnrows, Bncols, out, opn);
+  }
+
+  // JNI binding for CUMAT.applygfun: forwards two float pointers, N and opn to apply_gfun.
+  JNIEXPORT jint JNICALL Java_edu_berkeley_bid_CUMAT_applygfun
+  (JNIEnv *env, jobject obj, jobject jA, jobject jB, jint N, jint opn) 
+  {
+    float *first = (float*)getPointer(env, jA);
+    float *second = (float*)getPointer(env, jB);
+    return apply_gfun(first, second, N, opn);
+  }
+
+  // JNI binding for CUMAT.applygfun2: forwards three float pointers, N and opn to apply_gfun2.
+  JNIEXPORT jint JNICALL Java_edu_berkeley_bid_CUMAT_applygfun2
+  (JNIEnv *env, jobject obj, jobject jA, jobject jB, jobject jC, jint N, jint opn) 
+  {
+    float *first = (float*)getPointer(env, jA);
+    float *second = (float*)getPointer(env, jB);
+    float *third = (float*)getPointer(env, jC);
+    return apply_gfun2(first, second, third, N, opn);
+  }
+
+  // JNI binding for CUMAT.dsmult: unwraps the float data and int index pointers, then calls dsmult.
+  JNIEXPORT jint JNICALL Java_edu_berkeley_bid_CUMAT_dsmult
+  (JNIEnv *env, jobject obj, jint nrows, jint ncols, jint nnz, 
+   jobject jA, jobject jBdata, jobject jBir, jobject jBic, jobject jC)
+  {
+    float *dense = (float*)getPointer(env, jA);
+    float *vals = (float*)getPointer(env, jBdata);
+    float *result = (float*)getPointer(env, jC);
+    int *rowIdx = (int*)getPointer(env, jBir);
+    int *colIdx = (int*)getPointer(env, jBic);
+    return dsmult(nrows, ncols, nnz, dense, vals, rowIdx, colIdx, result);
+  }
+
+  // JNI binding for CUMAT.dsmultT: unwraps the float data and int index pointers, then calls dsmultT.
+  JNIEXPORT jint JNICALL Java_edu_berkeley_bid_CUMAT_dsmultT
+  (JNIEnv *env, jobject obj, jint nrows, jint ncols, jint nnz, 
+   jobject jA, jobject jBdata, jobject jBir, jobject jBic, jobject jC)
+  {
+    float *dense = (float*)getPointer(env, jA);
+    float *vals = (float*)getPointer(env, jBdata);
+    float *result = (float*)getPointer(env, jC);
+    int *rowIdx = (int*)getPointer(env, jBir);
+    int *colIdx = (int*)getPointer(env, jBic);
+    return dsmultT(nrows, ncols, nnz, dense, vals, rowIdx, colIdx, result);
+  }
+
+  // JNI binding for CUMAT.dds: unwraps three float pointers and two int index pointers, then calls dds.
+  JNIEXPORT jint JNICALL Java_edu_berkeley_bid_CUMAT_dds
+  (JNIEnv *env, jobject obj, jint nrows, jint nnz, 
+   jobject jA, jobject jB, jobject jCir, jobject jCic, jobject jP)
+  {
+    float *left = (float*)getPointer(env, jA);
+    float *right = (float*)getPointer(env, jB);
+    float *prod = (float*)getPointer(env, jP);
+    int *rowIdx = (int*)getPointer(env, jCir);
+    int *colIdx = (int*)getPointer(env, jCic);
+    return dds(nrows, nnz, left, right, rowIdx, colIdx, prod);
+  }
+
+  // JNI binding for CUMAT.reduce1op: forwards the dimensions, two float pointers and opn to reduce1op.
+  JNIEXPORT jint JNICALL Java_edu_berkeley_bid_CUMAT_reduce1op
+  (JNIEnv *env, jobject obj, jint nrows, jint ncols, jobject jA, jobject jB, jint opn)
+  {
+    float *src = (float*)getPointer(env, jA);
+    float *dst = (float*)getPointer(env, jB);
+    return reduce1op(nrows, ncols, src, dst, opn);
+  }
+
+  // JNI binding for CUMAT.reduce2op: forwards the dimensions, two float pointers and opn to reduce2op.
+  JNIEXPORT jint JNICALL Java_edu_berkeley_bid_CUMAT_reduce2op
+  (JNIEnv *env, jobject obj, jint nrows, jint ncols, jobject jA, jobject jB, jint opn)
+  {
+    float *src = (float*)getPointer(env, jA);
+    float *dst = (float*)getPointer(env, jB);
+    return reduce2op(nrows, ncols, src, dst, opn);
+  }
+
+  // JNI binding for CUMAT.transpose: forwards both float pointers, their strides and the shape to transpose.
+  JNIEXPORT jint JNICALL Java_edu_berkeley_bid_CUMAT_transpose
+  (JNIEnv *env, jobject obj, jobject jA, jint instride, jobject jB, jint outstride, jint nrows, jint ncols)
+  {
+    float *src = (float*)getPointer(env, jA);
+    float *dst = (float*)getPointer(env, jB);
+    return transpose(src, instride, dst, outstride, nrows, ncols);
+  }
+
+}
diff --git a/jni/src/BIDMat_LAPACK.c b/jni/src/BIDMat_LAPACK.c
new file mode 100755
index 00000000..54f753af
--- /dev/null
+++ b/jni/src/BIDMat_LAPACK.c
@@ -0,0 +1,1074 @@
+
+#include <jni.h>
+#include <mkl.h>
+#include <mkl_lapacke.h>
+
+
+/* JNI binding for LAPACK.dgetrf: LU-factorizes the double matrix in ja via LAPACKE_dgetrf. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_dgetrf
+(JNIEnv * env, jobject calling_obj, jint order, jint m, jint n, jdoubleArray ja, jint lda, jintArray jipiv){
+	/* Acquire critical access to both Java arrays (no isCopy flag requested). */
+	jdouble * matrix = (*env)->GetPrimitiveArrayCritical(env, ja, NULL);
+	jint * pivots = (*env)->GetPrimitiveArrayCritical(env, jipiv, NULL);
+	jint info = LAPACKE_dgetrf(order, m, n, matrix, lda, pivots);
+
+	(*env)->ReleasePrimitiveArrayCritical(env, jipiv, pivots, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, ja, matrix, 0);
+	return info;	/* LAPACKE result, passed through unchanged */
+}
+
+/* JNI binding for LAPACK.sgetrf: LU-factorizes the float matrix in ja via LAPACKE_sgetrf. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_sgetrf
+(JNIEnv * env, jobject calling_obj, jint order, jint m, jint n, jfloatArray ja, jint lda, jintArray jipiv){
+	/* Acquire critical access to both Java arrays (no isCopy flag requested). */
+	jfloat * matrix = (*env)->GetPrimitiveArrayCritical(env, ja, NULL);
+	jint * pivots = (*env)->GetPrimitiveArrayCritical(env, jipiv, NULL);
+	jint info = LAPACKE_sgetrf(order, m, n, matrix, lda, pivots);
+
+	(*env)->ReleasePrimitiveArrayCritical(env, jipiv, pivots, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, ja, matrix, 0);
+	return info;	/* LAPACKE result, passed through unchanged */
+}
+
+/* JNI binding for LAPACK.cgetrf: LU factorization via LAPACKE_cgetrf on ja viewed as MKL_Complex8. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_cgetrf
+(JNIEnv * env, jobject calling_obj, jint order, jint m, jint n, jfloatArray ja, jint lda, jintArray jipiv){
+	/* Acquire critical access to both Java arrays (no isCopy flag requested). */
+	jfloat * matrix = (*env)->GetPrimitiveArrayCritical(env, ja, NULL);
+	jint * pivots = (*env)->GetPrimitiveArrayCritical(env, jipiv, NULL);
+	jint info = LAPACKE_cgetrf(order, m, n, (MKL_Complex8 *)matrix, lda, pivots);
+
+	(*env)->ReleasePrimitiveArrayCritical(env, jipiv, pivots, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, ja, matrix, 0);
+	return info;	/* LAPACKE result, passed through unchanged */
+}
+
+/* JNI binding for LAPACK.zgetrf: LU factorization via LAPACKE_zgetrf on ja viewed as MKL_Complex16. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_zgetrf
+(JNIEnv * env, jobject calling_obj, jint order, jint m, jint n, jdoubleArray ja, jint lda, jintArray jipiv){
+	/* Acquire critical access to both Java arrays (no isCopy flag requested). */
+	jdouble * matrix = (*env)->GetPrimitiveArrayCritical(env, ja, NULL);
+	jint * pivots = (*env)->GetPrimitiveArrayCritical(env, jipiv, NULL);
+	jint info = LAPACKE_zgetrf(order, m, n, (MKL_Complex16 *)matrix, lda, pivots);
+
+	(*env)->ReleasePrimitiveArrayCritical(env, jipiv, pivots, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, ja, matrix, 0);
+	return info;	/* LAPACKE result, passed through unchanged */
+}
+
+/* JNI binding for LAPACK.dgetri: inverts the LU-factored double matrix in ja via LAPACKE_dgetri. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_dgetri
+(JNIEnv * env, jobject calling_obj, jint order, jint n, jdoubleArray ja, jint lda, jintArray jipiv){
+	/* Acquire critical access to the factor and pivot arrays. */
+	jdouble * factors = (*env)->GetPrimitiveArrayCritical(env, ja, NULL);
+	jint * pivots = (*env)->GetPrimitiveArrayCritical(env, jipiv, NULL);
+	jint info = LAPACKE_dgetri(order, n, factors, lda, pivots);
+
+	(*env)->ReleasePrimitiveArrayCritical(env, jipiv, pivots, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, ja, factors, 0);
+	return info;	/* LAPACKE result, passed through unchanged */
+}
+
+/* JNI binding for LAPACK.sgetri: inverts the LU-factored float matrix in ja via LAPACKE_sgetri. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_sgetri
+(JNIEnv * env, jobject calling_obj, jint order, jint n, jfloatArray ja, jint lda, jintArray jipiv){
+	/* Acquire critical access to the factor and pivot arrays. */
+	jfloat * factors = (*env)->GetPrimitiveArrayCritical(env, ja, NULL);
+	jint * pivots = (*env)->GetPrimitiveArrayCritical(env, jipiv, NULL);
+	jint info = LAPACKE_sgetri(order, n, factors, lda, pivots);
+
+	(*env)->ReleasePrimitiveArrayCritical(env, jipiv, pivots, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, ja, factors, 0);
+	return info;	/* LAPACKE result, passed through unchanged */
+}
+
+/* JNI binding for LAPACK.cgetri: inversion via LAPACKE_cgetri on ja viewed as MKL_Complex8. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_cgetri
+(JNIEnv * env, jobject calling_obj, jint order, jint n, jfloatArray ja, jint lda, jintArray jipiv){
+	/* Acquire critical access to the factor and pivot arrays. */
+	jfloat * factors = (*env)->GetPrimitiveArrayCritical(env, ja, NULL);
+	jint * pivots = (*env)->GetPrimitiveArrayCritical(env, jipiv, NULL);
+	jint info = LAPACKE_cgetri(order, n, (MKL_Complex8 *)factors, lda, pivots);
+
+	(*env)->ReleasePrimitiveArrayCritical(env, jipiv, pivots, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, ja, factors, 0);
+	return info;	/* LAPACKE result, passed through unchanged */
+}
+
+
+/* JNI binding for LAPACK.zgetri: inversion via LAPACKE_zgetri on ja viewed as MKL_Complex16. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_zgetri
+(JNIEnv * env, jobject calling_obj, jint order, jint n, jdoubleArray ja, jint lda, jintArray jipiv){
+	/* Acquire critical access to the factor and pivot arrays. */
+	jdouble * factors = (*env)->GetPrimitiveArrayCritical(env, ja, NULL);
+	jint * pivots = (*env)->GetPrimitiveArrayCritical(env, jipiv, NULL);
+	jint info = LAPACKE_zgetri(order, n, (MKL_Complex16 *)factors, lda, pivots);
+
+	(*env)->ReleasePrimitiveArrayCritical(env, jipiv, pivots, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, ja, factors, 0);
+	return info;	/* LAPACKE result, passed through unchanged */
+}
+
+/* JNI binding for LAPACK.dgetrs: solves with an LU-factored double matrix via LAPACKE_dgetrs. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_dgetrs
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_transa, jint n, jint nrhs, jdoubleArray j_a, jint lda, 
+ jintArray j_ipiv, jdoubleArray j_b, int ldb){
+	/* The string is fetched before any critical region opens; only its first char is used. */
+	const char * trans = (*env)->GetStringUTFChars(env, j_transa, NULL);
+	jdouble * factors = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+	jint * pivots = (*env)->GetPrimitiveArrayCritical(env, j_ipiv, NULL);
+	jdouble * rhs = (*env)->GetPrimitiveArrayCritical(env, j_b, NULL);
+	jint info = LAPACKE_dgetrs(order, *trans, n, nrhs, factors, lda, pivots, rhs, ldb);
+	/* Close the critical regions first, then give the string back. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_b, rhs, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_ipiv, pivots, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_a, factors, 0);
+	(*env)->ReleaseStringUTFChars(env, j_transa, trans);
+	return info;
+}
+
+
+/* JNI binding for LAPACK.sgetrs: solves with an LU-factored float matrix via LAPACKE_sgetrs. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_sgetrs
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_transa, jint n, jint nrhs, jfloatArray j_a, jint lda, 
+ jintArray j_ipiv, jfloatArray j_b, int ldb){
+	/* The string is fetched before any critical region opens; only its first char is used. */
+	const char * trans = (*env)->GetStringUTFChars(env, j_transa, NULL);
+	jfloat * factors = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+	jint * pivots = (*env)->GetPrimitiveArrayCritical(env, j_ipiv, NULL);
+	jfloat * rhs = (*env)->GetPrimitiveArrayCritical(env, j_b, NULL);
+	jint info = LAPACKE_sgetrs(order, *trans, n, nrhs, factors, lda, pivots, rhs, ldb);
+	/* Close the critical regions first, then give the string back. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_b, rhs, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_ipiv, pivots, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_a, factors, 0);
+	(*env)->ReleaseStringUTFChars(env, j_transa, trans);
+	return info;
+}
+
+/* JNI binding for LAPACK.cgetrs: LAPACKE_cgetrs solve with j_a and j_b viewed as MKL_Complex8. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_cgetrs
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_transa, jint n, jint nrhs, jfloatArray j_a, jint lda, 
+ jintArray j_ipiv, jfloatArray j_b, int ldb){
+	/* The string is fetched before any critical region opens; only its first char is used. */
+	const char * trans = (*env)->GetStringUTFChars(env, j_transa, NULL);
+	jfloat * factors = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+	jint * pivots = (*env)->GetPrimitiveArrayCritical(env, j_ipiv, NULL);
+	jfloat * rhs = (*env)->GetPrimitiveArrayCritical(env, j_b, NULL);
+	jint info = LAPACKE_cgetrs(order, *trans, n, nrhs, (MKL_Complex8 *)factors, lda, pivots, (MKL_Complex8 *)rhs, ldb);
+	/* Close the critical regions first, then give the string back. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_b, rhs, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_ipiv, pivots, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_a, factors, 0);
+	(*env)->ReleaseStringUTFChars(env, j_transa, trans);
+	return info;
+}
+
+/* JNI binding for LAPACK.zgetrs: LAPACKE_zgetrs solve with j_a and j_b viewed as MKL_Complex16. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_zgetrs
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_transa, jint n, jint nrhs, jdoubleArray j_a, jint lda, 
+ jintArray j_ipiv, jdoubleArray j_b, int ldb){
+	/* The string is fetched before any critical region opens; only its first char is used. */
+	const char * trans = (*env)->GetStringUTFChars(env, j_transa, NULL);
+	jdouble * factors = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+	jint * pivots = (*env)->GetPrimitiveArrayCritical(env, j_ipiv, NULL);
+	jdouble * rhs = (*env)->GetPrimitiveArrayCritical(env, j_b, NULL);
+	jint info = LAPACKE_zgetrs(order, *trans, n, nrhs, (MKL_Complex16 *)factors, lda, pivots, (MKL_Complex16 *)rhs, ldb);
+	/* Close the critical regions first, then give the string back. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_b, rhs, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_ipiv, pivots, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_a, factors, 0);
+	(*env)->ReleaseStringUTFChars(env, j_transa, trans);
+	return info;
+}
+
+/* JNI binding for LAPACK.dtrtrs: triangular solve on double arrays via LAPACKE_dtrtrs. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_dtrtrs
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_meta, jint n, jint nrhs, jdoubleArray j_a, jint lda, 
+ jdoubleArray j_b, int ldb){
+	/* flags[0..2] feed LAPACKE's three char options. NOTE(review): length of j_meta is not checked. */
+	const char * flags = (*env)->GetStringUTFChars(env, j_meta, NULL);
+	jdouble * tri = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+	jdouble * rhs = (*env)->GetPrimitiveArrayCritical(env, j_b, NULL);
+	jint info = LAPACKE_dtrtrs(order, flags[0], flags[1], flags[2], n, nrhs, tri, lda, rhs, ldb);
+	/* Arrays are released before the string, mirroring acquisition in reverse. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_b, rhs, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_a, tri, 0);
+	(*env)->ReleaseStringUTFChars(env, j_meta, flags);
+	return info;
+}
+
+
+/* JNI binding for LAPACK.strtrs: triangular solve on float arrays via LAPACKE_strtrs. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_strtrs
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_meta, jint n, jint nrhs, jfloatArray j_a, jint lda, 
+ jfloatArray j_b, int ldb){
+	/* flags[0..2] feed LAPACKE's three char options. NOTE(review): length of j_meta is not checked. */
+	const char * flags = (*env)->GetStringUTFChars(env, j_meta, NULL);
+	jfloat * tri = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+	jfloat * rhs = (*env)->GetPrimitiveArrayCritical(env, j_b, NULL);
+	jint info = LAPACKE_strtrs(order, flags[0], flags[1], flags[2], n, nrhs, tri, lda, rhs, ldb);
+	/* Arrays are released before the string, mirroring acquisition in reverse. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_b, rhs, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_a, tri, 0);
+	(*env)->ReleaseStringUTFChars(env, j_meta, flags);
+	return info;
+}
+
+/* JNI binding for LAPACK.ctrtrs: LAPACKE_ctrtrs triangular solve, arrays viewed as MKL_Complex8. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_ctrtrs
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_meta, jint n, jint nrhs, jfloatArray j_a, jint lda, 
+ jfloatArray j_b, int ldb){
+	/* flags[0..2] feed LAPACKE's three char options. NOTE(review): length of j_meta is not checked. */
+	const char * flags = (*env)->GetStringUTFChars(env, j_meta, NULL);
+	jfloat * tri = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+	jfloat * rhs = (*env)->GetPrimitiveArrayCritical(env, j_b, NULL);
+	jint info = LAPACKE_ctrtrs(order, flags[0], flags[1], flags[2], n, nrhs, (MKL_Complex8 *)tri, lda, (MKL_Complex8 *)rhs, ldb);
+	/* Arrays are released before the string, mirroring acquisition in reverse. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_b, rhs, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_a, tri, 0);
+	(*env)->ReleaseStringUTFChars(env, j_meta, flags);
+	return info;
+}
+
+
+/* JNI binding for LAPACK.ztrtrs: LAPACKE_ztrtrs triangular solve, arrays viewed as MKL_Complex16. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_ztrtrs
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_meta, jint n, jint nrhs, jdoubleArray j_a, jint lda, 
+ jdoubleArray j_b, int ldb){
+	/* flags[0..2] feed LAPACKE's three char options. NOTE(review): length of j_meta is not checked. */
+	const char * flags = (*env)->GetStringUTFChars(env, j_meta, NULL);
+	jdouble * tri = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+	jdouble * rhs = (*env)->GetPrimitiveArrayCritical(env, j_b, NULL);
+	jint info = LAPACKE_ztrtrs(order, flags[0], flags[1], flags[2], n, nrhs, (MKL_Complex16 *)tri, lda, (MKL_Complex16 *)rhs, ldb);
+	/* Arrays are released before the string, mirroring acquisition in reverse. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_b, rhs, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_a, tri, 0);
+	(*env)->ReleaseStringUTFChars(env, j_meta, flags);
+	return info;
+}
+
+/* JNI binding for LAPACK.dsytrd: tridiagonal reduction of a double symmetric matrix via LAPACKE_dsytrd. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_dsytrd
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_uplo, jint n, jdoubleArray j_a, jint lda, 
+ jdoubleArray j_d,  jdoubleArray j_e,  jdoubleArray j_tau) {
+	/* Only the first character of j_uplo is handed to LAPACKE. */
+	const char * part = (*env)->GetStringUTFChars(env, j_uplo, NULL);
+	jdouble * matrix = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+	jdouble * diag = (*env)->GetPrimitiveArrayCritical(env, j_d, NULL);
+	jdouble * offdiag = (*env)->GetPrimitiveArrayCritical(env, j_e, NULL);
+	jdouble * factors = (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL);
+	jint info = LAPACKE_dsytrd(order, *part, n, matrix, lda, diag, offdiag, factors);
+
+	/* Unwind in reverse acquisition order: arrays first, string last. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_tau, factors, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_e, offdiag, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_d, diag, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_a, matrix, 0);
+	(*env)->ReleaseStringUTFChars(env, j_uplo, part);
+	return info;
+}
+
+/* JNI binding for LAPACK.ssytrd: tridiagonal reduction of a float symmetric matrix via LAPACKE_ssytrd. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_ssytrd
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_uplo, jint n, jfloatArray j_a, jint lda, 
+ jfloatArray j_d,  jfloatArray j_e,  jfloatArray j_tau) {
+	/* Only the first character of j_uplo is handed to LAPACKE. */
+	const char * part = (*env)->GetStringUTFChars(env, j_uplo, NULL);
+	jfloat * matrix = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+	jfloat * diag = (*env)->GetPrimitiveArrayCritical(env, j_d, NULL);
+	jfloat * offdiag = (*env)->GetPrimitiveArrayCritical(env, j_e, NULL);
+	jfloat * factors = (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL);
+	jint info = LAPACKE_ssytrd(order, *part, n, matrix, lda, diag, offdiag, factors);
+
+	/* Unwind in reverse acquisition order: arrays first, string last. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_tau, factors, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_e, offdiag, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_d, diag, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_a, matrix, 0);
+	(*env)->ReleaseStringUTFChars(env, j_uplo, part);
+	return info;
+}
+
+/* JNI binding for LAPACK.dorgtr: forwards the double arrays j_a and j_tau to LAPACKE_dorgtr. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_dorgtr
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_uplo, jint n, 
+ jdoubleArray j_a, jint lda,  jdoubleArray j_tau) {
+	/* Only the first character of j_uplo is handed to LAPACKE. */
+	const char * part = (*env)->GetStringUTFChars(env, j_uplo, NULL);
+	jdouble * matrix = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+	jdouble * factors = (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL);
+	jint info = LAPACKE_dorgtr(order, *part, n, matrix, lda, factors);
+
+	/* Unwind in reverse acquisition order: arrays first, string last. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_tau, factors, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_a, matrix, 0);
+	(*env)->ReleaseStringUTFChars(env, j_uplo, part);
+	return info;
+}
+
+/* JNI binding for LAPACK.sorgtr: forwards the float arrays j_a and j_tau to LAPACKE_sorgtr. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_sorgtr
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_uplo, jint n, 
+ jfloatArray j_a, jint lda,  jfloatArray j_tau) {
+	/* Only the first character of j_uplo is handed to LAPACKE. */
+	const char * part = (*env)->GetStringUTFChars(env, j_uplo, NULL);
+	jfloat * matrix = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+	jfloat * factors = (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL);
+	jint info = LAPACKE_sorgtr(order, *part, n, matrix, lda, factors);
+
+	/* Unwind in reverse acquisition order: arrays first, string last. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_tau, factors, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_a, matrix, 0);
+	(*env)->ReleaseStringUTFChars(env, j_uplo, part);
+	return info;
+}
+
+/* JNI binding for LAPACK.dsteqr: forwards the double arrays j_d, j_e and j_z to LAPACKE_dsteqr. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_dsteqr
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_compz, jint n, 
+ jdoubleArray j_d, jdoubleArray j_e, jdoubleArray j_z, int ldz) {
+	/* Only the first character of j_compz is handed to LAPACKE. */
+	const char * job = (*env)->GetStringUTFChars(env, j_compz, NULL);
+	jdouble * diag = (*env)->GetPrimitiveArrayCritical(env, j_d, NULL);
+	jdouble * offdiag = (*env)->GetPrimitiveArrayCritical(env, j_e, NULL);
+	jdouble * vecs = (*env)->GetPrimitiveArrayCritical(env, j_z, NULL);
+	jint info = LAPACKE_dsteqr(order, *job, n, diag, offdiag, vecs, ldz);
+
+	/* Unwind in reverse acquisition order: arrays first, string last. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_z, vecs, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_e, offdiag, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_d, diag, 0);
+	(*env)->ReleaseStringUTFChars(env, j_compz, job);
+	return info;
+}
+
+/* JNI binding for LAPACK.ssteqr: forwards the float arrays j_d, j_e and j_z to LAPACKE_ssteqr. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_ssteqr
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_compz, jint n, 
+jfloatArray j_d, jfloatArray j_e, jfloatArray j_z, int ldz) {
+	/* Only the first character of j_compz is handed to LAPACKE. */
+	const char * job = (*env)->GetStringUTFChars(env, j_compz, NULL);
+	jfloat * diag = (*env)->GetPrimitiveArrayCritical(env, j_d, NULL);
+	jfloat * offdiag = (*env)->GetPrimitiveArrayCritical(env, j_e, NULL);
+	jfloat * vecs = (*env)->GetPrimitiveArrayCritical(env, j_z, NULL);
+	jint info = LAPACKE_ssteqr(order, *job, n, diag, offdiag, vecs, ldz);
+
+	/* Unwind in reverse acquisition order: arrays first, string last. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_z, vecs, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_e, offdiag, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_d, diag, 0);
+	(*env)->ReleaseStringUTFChars(env, j_compz, job);
+	return info;
+}
+
+/* JNI binding for LAPACK.csteqr: calls LAPACKE_csteqr with j_z viewed as MKL_Complex8 (j_d, j_e stay real). */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_csteqr
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_compz, jint n, 
+jfloatArray j_d, jfloatArray j_e, jfloatArray j_z, int ldz) {
+	/* Only the first character of j_compz is handed to LAPACKE. */
+	const char * job = (*env)->GetStringUTFChars(env, j_compz, NULL);
+	jfloat * diag = (*env)->GetPrimitiveArrayCritical(env, j_d, NULL);
+	jfloat * offdiag = (*env)->GetPrimitiveArrayCritical(env, j_e, NULL);
+	jfloat * vecs = (*env)->GetPrimitiveArrayCritical(env, j_z, NULL);
+	jint info = LAPACKE_csteqr(order, *job, n, diag, offdiag, (MKL_Complex8 *)vecs, ldz);
+
+	/* Unwind in reverse acquisition order: arrays first, string last. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_z, vecs, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_e, offdiag, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_d, diag, 0);
+	(*env)->ReleaseStringUTFChars(env, j_compz, job);
+	return info;
+}
+
+/* JNI binding for LAPACK.zsteqr: calls LAPACKE_zsteqr with j_z viewed as MKL_Complex16 (j_d, j_e stay real). */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_zsteqr
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_compz, jint n, 
+jdoubleArray j_d, jdoubleArray j_e, jdoubleArray j_z, int ldz) {
+	/* Only the first character of j_compz is handed to LAPACKE. */
+	const char * job = (*env)->GetStringUTFChars(env, j_compz, NULL);
+	jdouble * diag = (*env)->GetPrimitiveArrayCritical(env, j_d, NULL);
+	jdouble * offdiag = (*env)->GetPrimitiveArrayCritical(env, j_e, NULL);
+	jdouble * vecs = (*env)->GetPrimitiveArrayCritical(env, j_z, NULL);
+	jint info = LAPACKE_zsteqr(order, *job, n, diag, offdiag, (MKL_Complex16 *)vecs, ldz);
+
+	/* Unwind in reverse acquisition order: arrays first, string last. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_z, vecs, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_e, offdiag, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_d, diag, 0);
+	(*env)->ReleaseStringUTFChars(env, j_compz, job);
+	return info;
+}
+
+/* JNI binding for LAPACK.dstedc: forwards the double arrays j_d, j_e and j_z to LAPACKE_dstedc. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_dstedc
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_compz, jint n, 
+ jdoubleArray j_d, jdoubleArray j_e, jdoubleArray j_z, int ldz) {
+	/* Only the first character of j_compz is handed to LAPACKE. */
+	const char * job = (*env)->GetStringUTFChars(env, j_compz, NULL);
+	jdouble * diag = (*env)->GetPrimitiveArrayCritical(env, j_d, NULL);
+	jdouble * offdiag = (*env)->GetPrimitiveArrayCritical(env, j_e, NULL);
+	jdouble * vecs = (*env)->GetPrimitiveArrayCritical(env, j_z, NULL);
+	jint info = LAPACKE_dstedc(order, *job, n, diag, offdiag, vecs, ldz);
+
+	/* Unwind in reverse acquisition order: arrays first, string last. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_z, vecs, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_e, offdiag, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_d, diag, 0);
+	(*env)->ReleaseStringUTFChars(env, j_compz, job);
+	return info;
+}
+
+/* JNI binding for LAPACK.dsyevd: forwards the double arrays j_a and j_w to LAPACKE_dsyevd. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_dsyevd
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_compz, jstring j_uplo, jint n, 
+ jdoubleArray j_a, int lda, jdoubleArray j_w) {
+	/* Both option strings are fetched up front; only their first characters are used. */
+	const char * job = (*env)->GetStringUTFChars(env, j_compz, NULL);
+	const char * part = (*env)->GetStringUTFChars(env, j_uplo, NULL);
+	jdouble * matrix = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+	jdouble * eigvals = (*env)->GetPrimitiveArrayCritical(env, j_w, NULL);
+	jint info = LAPACKE_dsyevd(order, *job, *part, n, matrix, lda, eigvals);
+
+	/* Unwind in reverse acquisition order: arrays first, strings last. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_w, eigvals, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_a, matrix, 0);
+	(*env)->ReleaseStringUTFChars(env, j_uplo, part);
+	(*env)->ReleaseStringUTFChars(env, j_compz, job);
+	return info;
+}
+
+/* JNI binding for LAPACK.ssyevd: forwards the float arrays j_a and j_w to LAPACKE_ssyevd. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_ssyevd
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_compz, jstring j_uplo, jint n, 
+ jfloatArray j_a, int lda, jfloatArray j_w) {
+	/* Both option strings are fetched up front; only their first characters are used. */
+	const char * job = (*env)->GetStringUTFChars(env, j_compz, NULL);
+	const char * part = (*env)->GetStringUTFChars(env, j_uplo, NULL);
+	jfloat * matrix = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+	jfloat * eigvals = (*env)->GetPrimitiveArrayCritical(env, j_w, NULL);
+	jint info = LAPACKE_ssyevd(order, *job, *part, n, matrix, lda, eigvals);
+
+	/* Unwind in reverse acquisition order: arrays first, strings last. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_w, eigvals, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, j_a, matrix, 0);
+	(*env)->ReleaseStringUTFChars(env, j_uplo, part);
+	(*env)->ReleaseStringUTFChars(env, j_compz, job);
+	return info;
+}
+
+/* JNI binding for LAPACK.dpotrf: Cholesky-factorizes the double matrix in j_a via LAPACKE_dpotrf. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_dpotrf
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_uplo, jint n, jdoubleArray j_a, jint lda) {
+	/* Only the first character of j_uplo is handed to LAPACKE. */
+	const char * part = (*env)->GetStringUTFChars(env, j_uplo, NULL);
+	jdouble * matrix = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+	jint info = LAPACKE_dpotrf(order, *part, n, matrix, lda);
+	/* Close the critical region before releasing the string. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_a, matrix, 0);
+	(*env)->ReleaseStringUTFChars(env, j_uplo, part);
+
+	return info;
+}
+
+/* JNI binding for LAPACK.spotrf: Cholesky-factorizes the float matrix in j_a via LAPACKE_spotrf. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_spotrf
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_uplo, jint n, jfloatArray j_a, jint lda) {
+	/* Only the first character of j_uplo is handed to LAPACKE. */
+	const char * part = (*env)->GetStringUTFChars(env, j_uplo, NULL);
+	jfloat * matrix = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+	jint info = LAPACKE_spotrf(order, *part, n, matrix, lda);
+	/* Close the critical region before releasing the string. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_a, matrix, 0);
+	(*env)->ReleaseStringUTFChars(env, j_uplo, part);
+
+	return info;
+}
+
+/* JNI binding for LAPACK.cpotrf: Cholesky factorization via LAPACKE_cpotrf, j_a viewed as MKL_Complex8. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_cpotrf
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_uplo, jint n, jfloatArray j_a, jint lda) {
+	/* Only the first character of j_uplo is handed to LAPACKE. */
+	const char * part = (*env)->GetStringUTFChars(env, j_uplo, NULL);
+	jfloat * matrix = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+	jint info = LAPACKE_cpotrf(order, *part, n, (MKL_Complex8 *)matrix, lda);
+	/* Close the critical region before releasing the string. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_a, matrix, 0);
+	(*env)->ReleaseStringUTFChars(env, j_uplo, part);
+
+	return info;
+}
+
+/* JNI binding for LAPACK.zpotrf: Cholesky factorization via LAPACKE_zpotrf, j_a viewed as MKL_Complex16. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_zpotrf
+(JNIEnv * env, jobject calling_obj, jint order, jstring j_uplo, jint n, jdoubleArray j_a, jint lda) {
+	/* Only the first character of j_uplo is handed to LAPACKE. */
+	const char * part = (*env)->GetStringUTFChars(env, j_uplo, NULL);
+	jdouble * matrix = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+	jint info = LAPACKE_zpotrf(order, *part, n, (MKL_Complex16 *)matrix, lda);
+	/* Close the critical region before releasing the string. */
+	(*env)->ReleasePrimitiveArrayCritical(env, j_a, matrix, 0);
+	(*env)->ReleaseStringUTFChars(env, j_uplo, part);
+
+	return info;
+}
+
+/* JNI binding for LAPACK.sgebal: balances the float matrix in j_a via LAPACKE_sgebal and returns its result. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_sgebal
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jstring j_job, jint n, jfloatArray j_a, jint lda, jintArray j_ilo, jintArray j_ihi, jfloatArray j_scale) {
+    char * job = (char *)(*env)->GetStringUTFChars(env, j_job, JNI_FALSE);	/* only job[0] is used */
+    jfloat * a = (*env)->GetPrimitiveArrayCritical(env, j_a, JNI_FALSE);
+    jint * ilo = (*env)->GetPrimitiveArrayCritical(env, j_ilo, JNI_FALSE);
+    jint * ihi = (*env)->GetPrimitiveArrayCritical(env, j_ihi, JNI_FALSE);
+    jfloat * scale = (*env)->GetPrimitiveArrayCritical(env, j_scale, JNI_FALSE);
+    jint retval = LAPACKE_sgebal(matrix_order, *job, n, a, lda, ilo, ihi, scale);
+    /* Close every critical region first: no other JNI call is allowed while one is open. */
+    (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_ilo, ilo, 0);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_ihi, ihi, 0);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_scale, scale, 0);
+    /* Only now is it legal to call ReleaseStringUTFChars. */
+    (*env)->ReleaseStringUTFChars(env, j_job, job);
+    return retval;
+}
+
+/* JNI binding for LAPACK.dgebal: balances the double matrix in j_a via LAPACKE_dgebal and returns its result. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_dgebal
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jstring j_job, jint n, jdoubleArray j_a, jint lda, jintArray j_ilo, jintArray j_ihi, jdoubleArray j_scale) {
+    char * job = (char *)(*env)->GetStringUTFChars(env, j_job, JNI_FALSE);	/* only job[0] is used */
+    jdouble * a = (*env)->GetPrimitiveArrayCritical(env, j_a, JNI_FALSE);
+    jint * ilo = (*env)->GetPrimitiveArrayCritical(env, j_ilo, JNI_FALSE);
+    jint * ihi = (*env)->GetPrimitiveArrayCritical(env, j_ihi, JNI_FALSE);
+    jdouble * scale = (*env)->GetPrimitiveArrayCritical(env, j_scale, JNI_FALSE);
+    jint retval = LAPACKE_dgebal(matrix_order, *job, n, a, lda, ilo, ihi, scale);
+    /* Close every critical region first: no other JNI call is allowed while one is open. */
+    (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_ilo, ilo, 0);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_ihi, ihi, 0);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_scale, scale, 0);
+    /* Only now is it legal to call ReleaseStringUTFChars. */
+    (*env)->ReleaseStringUTFChars(env, j_job, job);
+    return retval;
+}
+
+/* JNI binding for LAPACK.cgebal: balancing via LAPACKE_cgebal with j_a viewed as lapack_complex_float. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_cgebal
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jstring j_job, jint n, jfloatArray j_a, jint lda, jintArray j_ilo, jintArray j_ihi, jfloatArray j_scale) {
+    char * job = (char *)(*env)->GetStringUTFChars(env, j_job, JNI_FALSE);	/* only job[0] is used */
+    jfloat * a = (*env)->GetPrimitiveArrayCritical(env, j_a, JNI_FALSE);
+    jint * ilo = (*env)->GetPrimitiveArrayCritical(env, j_ilo, JNI_FALSE);
+    jint * ihi = (*env)->GetPrimitiveArrayCritical(env, j_ihi, JNI_FALSE);
+    jfloat * scale = (*env)->GetPrimitiveArrayCritical(env, j_scale, JNI_FALSE);
+    jint retval = LAPACKE_cgebal(matrix_order, *job, n, (lapack_complex_float *)a, lda, ilo, ihi, scale);
+    /* Close every critical region first: no other JNI call is allowed while one is open. */
+    (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_ilo, ilo, 0);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_ihi, ihi, 0);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_scale, scale, 0);
+    /* Only now is it legal to call ReleaseStringUTFChars. */
+    (*env)->ReleaseStringUTFChars(env, j_job, job);
+    return retval;
+}
+
+/* JNI binding for LAPACK.zgebal: balancing via LAPACKE_zgebal with j_a viewed as lapack_complex_double. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_zgebal
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jstring j_job, jint n, jdoubleArray j_a, jint lda, jintArray j_ilo, jintArray j_ihi, jdoubleArray j_scale) {
+    char * job = (char *)(*env)->GetStringUTFChars(env, j_job, JNI_FALSE);	/* only job[0] is used */
+    jdouble * a = (*env)->GetPrimitiveArrayCritical(env, j_a, JNI_FALSE);
+    jint * ilo = (*env)->GetPrimitiveArrayCritical(env, j_ilo, JNI_FALSE);
+    jint * ihi = (*env)->GetPrimitiveArrayCritical(env, j_ihi, JNI_FALSE);
+    jdouble * scale = (*env)->GetPrimitiveArrayCritical(env, j_scale, JNI_FALSE);
+    jint retval = LAPACKE_zgebal(matrix_order, *job, n, (lapack_complex_double *)a, lda, ilo, ihi, scale);
+    /* Close every critical region first: no other JNI call is allowed while one is open. */
+    (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_ilo, ilo, 0);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_ihi, ihi, 0);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_scale, scale, 0);
+    /* Only now is it legal to call ReleaseStringUTFChars. */
+    (*env)->ReleaseStringUTFChars(env, j_job, job);
+    return retval;
+}
+
+/* JNI binding for LAPACK.cunghr: calls LAPACKE_cunghr with j_a and j_tau viewed as lapack_complex_float. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_cunghr
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jint n, jint ilo, jint ihi, jfloatArray j_a, jint lda, jfloatArray j_tau) {
+    jfloat * mat = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jfloat * factors = (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL);
+    lapack_complex_float * cmat = (lapack_complex_float *)mat;	/* same storage, complex view */
+    lapack_complex_float * cfactors = (lapack_complex_float *)factors;
+    jint status = LAPACKE_cunghr(matrix_order, n, ilo, ihi, cmat, lda, cfactors);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_a, mat, 0);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_tau, factors, 0);
+    return status;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_zunghr
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jint n, jint ilo, jint ihi, jdoubleArray j_a, jint lda, jdoubleArray j_tau) {
+    /* JNI bridge to LAPACKE_zunghr: pins j_a and j_tau, passes them as lapack_complex_double buffers, returns LAPACKE's status. */
+    jdouble * a = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jdouble * tau = (a != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when an array could not be pinned */
+    if (tau != NULL) retval = LAPACKE_zunghr(matrix_order, n, ilo, ihi, (lapack_complex_double *)a, lda, (lapack_complex_double *)tau);
+    /* Release only what was acquired, newest first. */
+    if (tau != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_tau, tau, 0);
+    if (a != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_strevc
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jstring j_side, jstring j_howmny, jintArray j_select, jint n, jfloatArray j_t, jint ldt, jfloatArray j_vl, jint ldvl, jfloatArray j_vr, jint ldvr, jint mm, jintArray j_m) {
+    /* JNI bridge to LAPACKE_strevc: only the first character of j_side / j_howmny is used; select is passed as lapack_logical. */
+    const char * side = (*env)->GetStringUTFChars(env, j_side, NULL);
+    const char * howmny = (side != NULL) ? (*env)->GetStringUTFChars(env, j_howmny, NULL) : NULL;
+    jint * select = (howmny != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_select, NULL) : NULL;
+    jfloat * t = (select != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_t, NULL) : NULL;
+    jfloat * vl = (t != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_vl, NULL) : NULL;
+    jfloat * vr = (vl != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_vr, NULL) : NULL;
+    jint * m = (vr != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_m, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when any argument could not be acquired */
+    if (m != NULL) retval = LAPACKE_strevc(matrix_order, *side, *howmny, (lapack_logical *)select, n, t, ldt, vl, ldvl, vr, ldvr, mm, m);
+    /* Leave the critical region (arrays, newest first) before releasing the strings. */
+    if (m != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_m, m, 0);
+    if (vr != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_vr, vr, 0);
+    if (vl != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_vl, vl, 0);
+    if (t != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_t, t, 0);
+    if (select != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_select, select, 0);
+    if (howmny != NULL) (*env)->ReleaseStringUTFChars(env, j_howmny, howmny);
+    if (side != NULL) (*env)->ReleaseStringUTFChars(env, j_side, side);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_dtrevc
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jstring j_side, jstring j_howmny, jintArray j_select, jint n, jdoubleArray j_t, jint ldt, jdoubleArray j_vl, jint ldvl, jdoubleArray j_vr, jint ldvr, jint mm, jintArray j_m) {
+    /* JNI bridge to LAPACKE_dtrevc: only the first character of j_side / j_howmny is used; select is passed as lapack_logical. */
+    const char * side = (*env)->GetStringUTFChars(env, j_side, NULL);
+    const char * howmny = (side != NULL) ? (*env)->GetStringUTFChars(env, j_howmny, NULL) : NULL;
+    jint * select = (howmny != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_select, NULL) : NULL;
+    jdouble * t = (select != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_t, NULL) : NULL;
+    jdouble * vl = (t != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_vl, NULL) : NULL;
+    jdouble * vr = (vl != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_vr, NULL) : NULL;
+    jint * m = (vr != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_m, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when any argument could not be acquired */
+    if (m != NULL) retval = LAPACKE_dtrevc(matrix_order, *side, *howmny, (lapack_logical *)select, n, t, ldt, vl, ldvl, vr, ldvr, mm, m);
+    /* Leave the critical region (arrays, newest first) before releasing the strings. */
+    if (m != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_m, m, 0);
+    if (vr != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_vr, vr, 0);
+    if (vl != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_vl, vl, 0);
+    if (t != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_t, t, 0);
+    if (select != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_select, select, 0);
+    if (howmny != NULL) (*env)->ReleaseStringUTFChars(env, j_howmny, howmny);
+    if (side != NULL) (*env)->ReleaseStringUTFChars(env, j_side, side);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_ctrevc
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jstring j_side, jstring j_howmny, jintArray j_select, jint n, jfloatArray j_t, jint ldt, jfloatArray j_vl, jint ldvl, jfloatArray j_vr, jint ldvr, jint mm, jintArray j_m) {
+    /* JNI bridge to LAPACKE_ctrevc: t, vl and vr are passed as lapack_complex_float; only the first character of each string is used. */
+    const char * side = (*env)->GetStringUTFChars(env, j_side, NULL);
+    const char * howmny = (side != NULL) ? (*env)->GetStringUTFChars(env, j_howmny, NULL) : NULL;
+    jint * select = (howmny != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_select, NULL) : NULL;
+    jfloat * t = (select != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_t, NULL) : NULL;
+    jfloat * vl = (t != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_vl, NULL) : NULL;
+    jfloat * vr = (vl != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_vr, NULL) : NULL;
+    jint * m = (vr != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_m, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when any argument could not be acquired */
+    if (m != NULL) retval = LAPACKE_ctrevc(matrix_order, *side, *howmny, (lapack_logical *)select, n, (lapack_complex_float *)t, ldt, (lapack_complex_float *)vl, ldvl, (lapack_complex_float *)vr, ldvr, mm, m);
+    /* Leave the critical region (arrays, newest first) before releasing the strings. */
+    if (m != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_m, m, 0);
+    if (vr != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_vr, vr, 0);
+    if (vl != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_vl, vl, 0);
+    if (t != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_t, t, 0);
+    if (select != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_select, select, 0);
+    if (howmny != NULL) (*env)->ReleaseStringUTFChars(env, j_howmny, howmny);
+    if (side != NULL) (*env)->ReleaseStringUTFChars(env, j_side, side);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_ztrevc
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jstring j_side, jstring j_howmny, jintArray j_select, jint n, jdoubleArray j_t, jint ldt, jdoubleArray j_vl, jint ldvl, jdoubleArray j_vr, jint ldvr, jint mm, jintArray j_m) {
+    /* JNI bridge to LAPACKE_ztrevc: t, vl and vr are passed as lapack_complex_double; only the first character of each string is used. */
+    const char * side = (*env)->GetStringUTFChars(env, j_side, NULL);
+    const char * howmny = (side != NULL) ? (*env)->GetStringUTFChars(env, j_howmny, NULL) : NULL;
+    jint * select = (howmny != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_select, NULL) : NULL;
+    jdouble * t = (select != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_t, NULL) : NULL;
+    jdouble * vl = (t != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_vl, NULL) : NULL;
+    jdouble * vr = (vl != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_vr, NULL) : NULL;
+    jint * m = (vr != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_m, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when any argument could not be acquired */
+    if (m != NULL) retval = LAPACKE_ztrevc(matrix_order, *side, *howmny, (lapack_logical *)select, n, (lapack_complex_double *)t, ldt, (lapack_complex_double *)vl, ldvl, (lapack_complex_double *)vr, ldvr, mm, m);
+    /* Leave the critical region (arrays, newest first) before releasing the strings. */
+    if (m != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_m, m, 0);
+    if (vr != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_vr, vr, 0);
+    if (vl != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_vl, vl, 0);
+    if (t != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_t, t, 0);
+    if (select != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_select, select, 0);
+    if (howmny != NULL) (*env)->ReleaseStringUTFChars(env, j_howmny, howmny);
+    if (side != NULL) (*env)->ReleaseStringUTFChars(env, j_side, side);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_sgehrd
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jint n, jint ilo, jint ihi, jfloatArray j_a, jint lda, jfloatArray j_tau) {
+    /* JNI bridge to LAPACKE_sgehrd: pins j_a and j_tau, forwards the arguments unchanged, returns LAPACKE's status. */
+    jfloat * a = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jfloat * tau = (a != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when an array could not be pinned */
+    if (tau != NULL) retval = LAPACKE_sgehrd(matrix_order, n, ilo, ihi, a, lda, tau);
+    /* Release only what was acquired, newest first. */
+    if (tau != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_tau, tau, 0);
+    if (a != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_dgehrd
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jint n, jint ilo, jint ihi, jdoubleArray j_a, jint lda, jdoubleArray j_tau) {
+    /* JNI bridge to LAPACKE_dgehrd: pins j_a and j_tau, forwards the arguments unchanged, returns LAPACKE's status. */
+    jdouble * a = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jdouble * tau = (a != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when an array could not be pinned */
+    if (tau != NULL) retval = LAPACKE_dgehrd(matrix_order, n, ilo, ihi, a, lda, tau);
+    /* Release only what was acquired, newest first. */
+    if (tau != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_tau, tau, 0);
+    if (a != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_cgehrd
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jint n, jint ilo, jint ihi, jfloatArray j_a, jint lda, jfloatArray j_tau) {
+    /* JNI bridge to LAPACKE_cgehrd: pins j_a and j_tau, passes them as lapack_complex_float buffers, returns LAPACKE's status. */
+    jfloat * a = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jfloat * tau = (a != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when an array could not be pinned */
+    if (tau != NULL) retval = LAPACKE_cgehrd(matrix_order, n, ilo, ihi, (lapack_complex_float *)a, lda, (lapack_complex_float *)tau);
+    /* Release only what was acquired, newest first. */
+    if (tau != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_tau, tau, 0);
+    if (a != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_zgehrd
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jint n, jint ilo, jint ihi, jdoubleArray j_a, jint lda, jdoubleArray j_tau) {
+    /* JNI bridge to LAPACKE_zgehrd: pins j_a and j_tau, passes them as lapack_complex_double buffers, returns LAPACKE's status. */
+    jdouble * a = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jdouble * tau = (a != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when an array could not be pinned */
+    if (tau != NULL) retval = LAPACKE_zgehrd(matrix_order, n, ilo, ihi, (lapack_complex_double *)a, lda, (lapack_complex_double *)tau);
+    /* Release only what was acquired, newest first. */
+    if (tau != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_tau, tau, 0);
+    if (a != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_shseqr
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jstring j_job, jstring j_compz, jint n, jint ilo, jint ihi, jfloatArray j_h, jint ldh, jfloatArray j_wr, jfloatArray j_wi, jfloatArray j_z, jint ldz) {
+    /* JNI bridge to LAPACKE_shseqr: only the first character of j_job / j_compz is used; returns LAPACKE's status. */
+    const char * job = (*env)->GetStringUTFChars(env, j_job, NULL);
+    const char * compz = (job != NULL) ? (*env)->GetStringUTFChars(env, j_compz, NULL) : NULL;
+    jfloat * h = (compz != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_h, NULL) : NULL;
+    jfloat * wr = (h != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_wr, NULL) : NULL;
+    jfloat * wi = (wr != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_wi, NULL) : NULL;
+    jfloat * z = (wi != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_z, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when any argument could not be acquired */
+    if (z != NULL) retval = LAPACKE_shseqr(matrix_order, *job, *compz, n, ilo, ihi, h, ldh, wr, wi, z, ldz);
+    /* Leave the critical region (arrays, newest first) before releasing the strings. */
+    if (z != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_z, z, 0);
+    if (wi != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_wi, wi, 0);
+    if (wr != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_wr, wr, 0);
+    if (h != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_h, h, 0);
+    if (compz != NULL) (*env)->ReleaseStringUTFChars(env, j_compz, compz);
+    if (job != NULL) (*env)->ReleaseStringUTFChars(env, j_job, job);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_dhseqr
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jstring j_job, jstring j_compz, jint n, jint ilo, jint ihi, jdoubleArray j_h, jint ldh, jdoubleArray j_wr, jdoubleArray j_wi, jdoubleArray j_z, jint ldz) {
+    /* JNI bridge to LAPACKE_dhseqr: only the first character of j_job / j_compz is used; returns LAPACKE's status. */
+    const char * job = (*env)->GetStringUTFChars(env, j_job, NULL);
+    const char * compz = (job != NULL) ? (*env)->GetStringUTFChars(env, j_compz, NULL) : NULL;
+    jdouble * h = (compz != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_h, NULL) : NULL;
+    jdouble * wr = (h != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_wr, NULL) : NULL;
+    jdouble * wi = (wr != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_wi, NULL) : NULL;
+    jdouble * z = (wi != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_z, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when any argument could not be acquired */
+    if (z != NULL) retval = LAPACKE_dhseqr(matrix_order, *job, *compz, n, ilo, ihi, h, ldh, wr, wi, z, ldz);
+    /* Leave the critical region (arrays, newest first) before releasing the strings. */
+    if (z != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_z, z, 0);
+    if (wi != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_wi, wi, 0);
+    if (wr != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_wr, wr, 0);
+    if (h != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_h, h, 0);
+    if (compz != NULL) (*env)->ReleaseStringUTFChars(env, j_compz, compz);
+    if (job != NULL) (*env)->ReleaseStringUTFChars(env, j_job, job);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_chseqr
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jstring j_job, jstring j_compz, jint n, jint ilo, jint ihi, jfloatArray j_h, jint ldh, jfloatArray j_w, jfloatArray j_z, jint ldz) {
+    /* JNI bridge to LAPACKE_chseqr: h, w and z are passed as lapack_complex_float; only the first character of each string is used. */
+    const char * job = (*env)->GetStringUTFChars(env, j_job, NULL);
+    const char * compz = (job != NULL) ? (*env)->GetStringUTFChars(env, j_compz, NULL) : NULL;
+    jfloat * h = (compz != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_h, NULL) : NULL;
+    jfloat * w = (h != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_w, NULL) : NULL;
+    jfloat * z = (w != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_z, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when any argument could not be acquired */
+    if (z != NULL) retval = LAPACKE_chseqr(matrix_order, *job, *compz, n, ilo, ihi, (lapack_complex_float *)h, ldh, (lapack_complex_float *)w, (lapack_complex_float *)z, ldz);
+    /* Leave the critical region (arrays, newest first) before releasing the strings. */
+    if (z != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_z, z, 0);
+    if (w != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_w, w, 0);
+    if (h != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_h, h, 0);
+    if (compz != NULL) (*env)->ReleaseStringUTFChars(env, j_compz, compz);
+    if (job != NULL) (*env)->ReleaseStringUTFChars(env, j_job, job);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_zhseqr
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jstring j_job, jstring j_compz, jint n, jint ilo, jint ihi, jdoubleArray j_h, jint ldh, jdoubleArray j_w, jdoubleArray j_z, jint ldz) {
+    /* JNI bridge to LAPACKE_zhseqr: h, w and z are passed as lapack_complex_double; only the first character of each string is used. */
+    const char * job = (*env)->GetStringUTFChars(env, j_job, NULL);
+    const char * compz = (job != NULL) ? (*env)->GetStringUTFChars(env, j_compz, NULL) : NULL;
+    jdouble * h = (compz != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_h, NULL) : NULL;
+    jdouble * w = (h != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_w, NULL) : NULL;
+    jdouble * z = (w != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_z, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when any argument could not be acquired */
+    if (z != NULL) retval = LAPACKE_zhseqr(matrix_order, *job, *compz, n, ilo, ihi, (lapack_complex_double *)h, ldh, (lapack_complex_double *)w, (lapack_complex_double *)z, ldz);
+    /* Leave the critical region (arrays, newest first) before releasing the strings. */
+    if (z != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_z, z, 0);
+    if (w != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_w, w, 0);
+    if (h != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_h, h, 0);
+    if (compz != NULL) (*env)->ReleaseStringUTFChars(env, j_compz, compz);
+    if (job != NULL) (*env)->ReleaseStringUTFChars(env, j_job, job);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_sgebak
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jstring j_job, jstring j_side, jint n, jint ilo, jint ihi, jfloatArray j_scale, jint m, jfloatArray j_v, jint ldv) {
+    /* JNI bridge to LAPACKE_sgebak: only the first character of j_job / j_side is used; returns LAPACKE's status. */
+    const char * job = (*env)->GetStringUTFChars(env, j_job, NULL);
+    const char * side = (job != NULL) ? (*env)->GetStringUTFChars(env, j_side, NULL) : NULL;
+    jfloat * scale = (side != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_scale, NULL) : NULL;
+    jfloat * v = (scale != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_v, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when any argument could not be acquired */
+    if (v != NULL) retval = LAPACKE_sgebak(matrix_order, *job, *side, n, ilo, ihi, scale, m, v, ldv);
+    /* Leave the critical region (arrays, newest first) before releasing the strings. */
+    if (v != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_v, v, 0);
+    if (scale != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_scale, scale, 0);
+    if (side != NULL) (*env)->ReleaseStringUTFChars(env, j_side, side);
+    if (job != NULL) (*env)->ReleaseStringUTFChars(env, j_job, job);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_dgebak
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jstring j_job, jstring j_side, jint n, jint ilo, jint ihi, jdoubleArray j_scale, jint m, jdoubleArray j_v, jint ldv) {
+    /* JNI bridge to LAPACKE_dgebak: only the first character of j_job / j_side is used; returns LAPACKE's status. */
+    const char * job = (*env)->GetStringUTFChars(env, j_job, NULL);
+    const char * side = (job != NULL) ? (*env)->GetStringUTFChars(env, j_side, NULL) : NULL;
+    jdouble * scale = (side != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_scale, NULL) : NULL;
+    jdouble * v = (scale != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_v, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when any argument could not be acquired */
+    if (v != NULL) retval = LAPACKE_dgebak(matrix_order, *job, *side, n, ilo, ihi, scale, m, v, ldv);
+    /* Leave the critical region (arrays, newest first) before releasing the strings. */
+    if (v != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_v, v, 0);
+    if (scale != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_scale, scale, 0);
+    if (side != NULL) (*env)->ReleaseStringUTFChars(env, j_side, side);
+    if (job != NULL) (*env)->ReleaseStringUTFChars(env, j_job, job);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_cgebak
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jstring j_job, jstring j_side, jint n, jint ilo, jint ihi, jfloatArray j_scale, jint m, jfloatArray j_v, jint ldv) {
+    /* JNI bridge to LAPACKE_cgebak: v is passed as lapack_complex_float, scale stays real; only the first character of each string is used. */
+    const char * job = (*env)->GetStringUTFChars(env, j_job, NULL);
+    const char * side = (job != NULL) ? (*env)->GetStringUTFChars(env, j_side, NULL) : NULL;
+    jfloat * scale = (side != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_scale, NULL) : NULL;
+    jfloat * v = (scale != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_v, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when any argument could not be acquired */
+    if (v != NULL) retval = LAPACKE_cgebak(matrix_order, *job, *side, n, ilo, ihi, scale, m, (lapack_complex_float *)v, ldv);
+    /* Leave the critical region (arrays, newest first) before releasing the strings. */
+    if (v != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_v, v, 0);
+    if (scale != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_scale, scale, 0);
+    if (side != NULL) (*env)->ReleaseStringUTFChars(env, j_side, side);
+    if (job != NULL) (*env)->ReleaseStringUTFChars(env, j_job, job);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_zgebak
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jstring j_job, jstring j_side, jint n, jint ilo, jint ihi, jdoubleArray j_scale, jint m, jdoubleArray j_v, jint ldv) {
+    /* JNI bridge to LAPACKE_zgebak: v is passed as lapack_complex_double, scale stays real; only the first character of each string is used. */
+    const char * job = (*env)->GetStringUTFChars(env, j_job, NULL);
+    const char * side = (job != NULL) ? (*env)->GetStringUTFChars(env, j_side, NULL) : NULL;
+    jdouble * scale = (side != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_scale, NULL) : NULL;
+    jdouble * v = (scale != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_v, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when any argument could not be acquired */
+    if (v != NULL) retval = LAPACKE_zgebak(matrix_order, *job, *side, n, ilo, ihi, scale, m, (lapack_complex_double *)v, ldv);
+    /* Leave the critical region (arrays, newest first) before releasing the strings. */
+    if (v != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_v, v, 0);
+    if (scale != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_scale, scale, 0);
+    if (side != NULL) (*env)->ReleaseStringUTFChars(env, j_side, side);
+    if (job != NULL) (*env)->ReleaseStringUTFChars(env, j_job, job);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_sgeqrf
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jint m, jint n, jfloatArray j_a, jint lda, jfloatArray j_tau) {
+    /* JNI bridge to LAPACKE_sgeqrf: pins j_a and j_tau, forwards the arguments unchanged, returns LAPACKE's status. */
+    jfloat * a = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jfloat * tau = (a != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when an array could not be pinned */
+    if (tau != NULL) retval = LAPACKE_sgeqrf(matrix_order, m, n, a, lda, tau);
+    /* Release only what was acquired, newest first. */
+    if (tau != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_tau, tau, 0);
+    if (a != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_dgeqrf
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jint m, jint n, jdoubleArray j_a, jint lda, jdoubleArray j_tau) {
+    /* JNI bridge to LAPACKE_dgeqrf: pins j_a and j_tau, forwards the arguments unchanged, returns LAPACKE's status. */
+    jdouble * a = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jdouble * tau = (a != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when an array could not be pinned */
+    if (tau != NULL) retval = LAPACKE_dgeqrf(matrix_order, m, n, a, lda, tau);
+    /* Release only what was acquired, newest first. */
+    if (tau != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_tau, tau, 0);
+    if (a != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_cgeqrf
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jint m, jint n, jfloatArray j_a, jint lda, jfloatArray j_tau) {
+    /* JNI bridge to LAPACKE_cgeqrf: pins j_a and j_tau, passes them as lapack_complex_float buffers, returns LAPACKE's status. */
+    jfloat * a = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jfloat * tau = (a != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when an array could not be pinned */
+    if (tau != NULL) retval = LAPACKE_cgeqrf(matrix_order, m, n, (lapack_complex_float *)a, lda, (lapack_complex_float *)tau);
+    /* Release only what was acquired, newest first. */
+    if (tau != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_tau, tau, 0);
+    if (a != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_zgeqrf
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jint m, jint n, jdoubleArray j_a, jint lda, jdoubleArray j_tau) {
+    /* JNI bridge to LAPACKE_zgeqrf: pins j_a and j_tau, passes them as lapack_complex_double buffers, returns LAPACKE's status. */
+    jdouble * a = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jdouble * tau = (a != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when an array could not be pinned */
+    if (tau != NULL) retval = LAPACKE_zgeqrf(matrix_order, m, n, (lapack_complex_double *)a, lda, (lapack_complex_double *)tau);
+    /* Release only what was acquired, newest first. */
+    if (tau != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_tau, tau, 0);
+    if (a != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    return retval;
+}
+
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_sgeqp3
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jint m, jint n, jfloatArray j_a, jint lda, jintArray j_jpvt, jfloatArray j_tau) {
+    /* JNI bridge to LAPACKE_sgeqp3: pins j_a, j_jpvt and j_tau, forwards the arguments unchanged, returns LAPACKE's status. */
+    jfloat * a = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jint * jpvt = (a != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_jpvt, NULL) : NULL;
+    jfloat * tau = (jpvt != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when an array could not be pinned */
+    if (tau != NULL) retval = LAPACKE_sgeqp3(matrix_order, m, n, a, lda, jpvt, tau);
+    /* Release only what was acquired, newest first. */
+    if (tau != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_tau, tau, 0);
+    if (jpvt != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_jpvt, jpvt, 0);
+    if (a != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_dgeqp3
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jint m, jint n, jdoubleArray j_a, jint lda, jintArray j_jpvt, jdoubleArray j_tau) {
+    /* JNI bridge to LAPACKE_dgeqp3: pins j_a, j_jpvt and j_tau, forwards the arguments unchanged, returns LAPACKE's status. */
+    jdouble * a = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jint * jpvt = (a != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_jpvt, NULL) : NULL;
+    jdouble * tau = (jpvt != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when an array could not be pinned */
+    if (tau != NULL) retval = LAPACKE_dgeqp3(matrix_order, m, n, a, lda, jpvt, tau);
+    /* Release only what was acquired, newest first. */
+    if (tau != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_tau, tau, 0);
+    if (jpvt != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_jpvt, jpvt, 0);
+    if (a != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_cgeqp3
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jint m, jint n, jfloatArray j_a, jint lda, jintArray j_jpvt, jfloatArray j_tau) {
+    /* JNI bridge to LAPACKE_cgeqp3: a and tau are passed as lapack_complex_float, jpvt as integers; returns LAPACKE's status. */
+    jfloat * a = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jint * jpvt = (a != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_jpvt, NULL) : NULL;
+    jfloat * tau = (jpvt != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when an array could not be pinned */
+    if (tau != NULL) retval = LAPACKE_cgeqp3(matrix_order, m, n, (lapack_complex_float *)a, lda, jpvt, (lapack_complex_float *)tau);
+    /* Release only what was acquired, newest first. */
+    if (tau != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_tau, tau, 0);
+    if (jpvt != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_jpvt, jpvt, 0);
+    if (a != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_zgeqp3
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jint m, jint n, jdoubleArray j_a, jint lda, jintArray j_jpvt, jdoubleArray j_tau) {
+    /* JNI bridge to LAPACKE_zgeqp3: a and tau are passed as lapack_complex_double, jpvt as integers; returns LAPACKE's status. */
+    jdouble * a = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jint * jpvt = (a != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_jpvt, NULL) : NULL;
+    jdouble * tau = (jpvt != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when an array could not be pinned */
+    if (tau != NULL) retval = LAPACKE_zgeqp3(matrix_order, m, n, (lapack_complex_double *)a, lda, jpvt, (lapack_complex_double *)tau);
+    /* Release only what was acquired, newest first. */
+    if (tau != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_tau, tau, 0);
+    if (jpvt != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_jpvt, jpvt, 0);
+    if (a != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_sorgqr
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jint m, jint n, jint k, jfloatArray j_a, jint lda, jfloatArray j_tau) {
+    /* JNI bridge to LAPACKE_sorgqr: pins j_a and j_tau, forwards the arguments unchanged, returns LAPACKE's status. */
+    jfloat * a = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jfloat * tau = (a != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when an array could not be pinned */
+    if (tau != NULL) retval = LAPACKE_sorgqr(matrix_order, m, n, k, a, lda, tau);
+    /* Release only what was acquired, newest first. */
+    if (tau != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_tau, tau, 0);
+    if (a != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_dorgqr
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jint m, jint n, jint k, jdoubleArray j_a, jint lda, jdoubleArray j_tau) {
+    /* JNI bridge to LAPACKE_dorgqr: pins j_a and j_tau, forwards the arguments unchanged, returns LAPACKE's status. */
+    jdouble * a = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jdouble * tau = (a != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when an array could not be pinned */
+    if (tau != NULL) retval = LAPACKE_dorgqr(matrix_order, m, n, k, a, lda, tau);
+    /* Release only what was acquired, newest first. */
+    if (tau != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_tau, tau, 0);
+    if (a != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    return retval;
+}
+
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_cungqr
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jint m, jint n, jint k, jfloatArray j_a, jint lda, jfloatArray j_tau) {
+    /* JNI bridge to LAPACKE_cungqr: pins j_a and j_tau, passes them as lapack_complex_float buffers, returns LAPACKE's status. */
+    jfloat * a = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jfloat * tau = (a != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when an array could not be pinned */
+    if (tau != NULL) retval = LAPACKE_cungqr(matrix_order, m, n, k, (lapack_complex_float *)a, lda, (lapack_complex_float *)tau);
+    /* Release only what was acquired, newest first. */
+    if (tau != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_tau, tau, 0);
+    if (a != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    return retval;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_LAPACK_zungqr
+(JNIEnv * env, jobject calling_obj, jint matrix_order, jint m, jint n, jint k, jdoubleArray j_a, jint lda, jdoubleArray j_tau) {
+    /* JNI bridge to LAPACKE_zungqr: pins j_a and j_tau, passes them as lapack_complex_double buffers, returns LAPACKE's status. */
+    jdouble * a = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jdouble * tau = (a != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_tau, NULL) : NULL;
+    jint retval = LAPACK_WORK_MEMORY_ERROR; /* returned when an array could not be pinned */
+    if (tau != NULL) retval = LAPACKE_zungqr(matrix_order, m, n, k, (lapack_complex_double *)a, lda, (lapack_complex_double *)tau);
+    /* Release only what was acquired, newest first. */
+    if (tau != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_tau, tau, 0);
+    if (a != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_a, a, 0);
+    return retval;
+}
diff --git a/jni/src/BIDMat_SPBLAS.c b/jni/src/BIDMat_SPBLAS.c
new file mode 100755
index 00000000..597f6665
--- /dev/null
+++ b/jni/src/BIDMat_SPBLAS.c
@@ -0,0 +1,232 @@
+
+#include <jni.h>
+#include <mkl.h>
+#include <mkl_spblas.h>
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_SPBLAS_scsrmm 
+(JNIEnv * env, jobject calling_obj, jstring j_transa, jint m, jint n, jint k, jfloat alpha, jstring j_matdescra,
+ jfloatArray j_vals, jintArray j_ir, jintArray j_jc, jfloatArray j_b, jint ldb, jfloat beta, jfloatArray j_c, jint ldc){
+	/* JNI bridge to mkl_scsrmm; jc supplies both pointer arrays (jc and jc+1). Returns 0 on success, 1 if an argument could not be acquired. */
+	char * transa = (char *)(*env)->GetStringUTFChars(env, j_transa, NULL);
+	char * matdescra = (transa != NULL) ? (char *)(*env)->GetStringUTFChars(env, j_matdescra, NULL) : NULL;
+	jfloat * vals = (matdescra != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_vals, NULL) : NULL;
+	jint * ir = (vals != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_ir, NULL) : NULL;
+	jint * jc = (ir != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_jc, NULL) : NULL;
+	jfloat * b = (jc != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_b, NULL) : NULL;
+	jfloat * c = (b != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_c, NULL) : NULL;
+	jint returnValue = 1;
+	/* c is acquired last, so a non-NULL c means every earlier acquisition succeeded. */
+	if (c != NULL) {
+	  mkl_scsrmm(transa, &m, &n, &k, &alpha, matdescra, vals, ir, jc, jc+1, b, &ldb, &beta, c, &ldc);
+	  returnValue = 0;
+	}
+	/* Release only what was acquired: arrays first (newest first), then the strings. */
+	if (c != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_c, c, 0);
+	if (b != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_b, b, 0);
+	if (jc != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_jc, jc, 0);
+	if (ir != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_ir, ir, 0);
+	if (vals != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_vals, vals, 0);
+	if (matdescra != NULL) (*env)->ReleaseStringUTFChars(env, j_matdescra, matdescra);
+	if (transa != NULL) (*env)->ReleaseStringUTFChars(env, j_transa, transa);
+	return returnValue;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_SPBLAS_scscmm
+(JNIEnv * env, jobject calling_obj, jstring j_transa, jint m, jint n, jint k, jfloat alpha, jstring j_matdescra,
+ jfloatArray j_vals, jintArray j_ir, jintArray j_jc, jfloatArray j_b, jint ldb, jfloat beta, jfloatArray j_c, jint ldc){
+	/* JNI bridge to mkl_scscmm; jc supplies both pointer arrays (jc and jc+1). Returns 0 on success, 1 if an argument could not be acquired. */
+	char * transa = (char *)(*env)->GetStringUTFChars(env, j_transa, NULL);
+	char * matdescra = (transa != NULL) ? (char *)(*env)->GetStringUTFChars(env, j_matdescra, NULL) : NULL;
+	jfloat * vals = (matdescra != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_vals, NULL) : NULL;
+	jint * ir = (vals != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_ir, NULL) : NULL;
+	jint * jc = (ir != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_jc, NULL) : NULL;
+	jfloat * b = (jc != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_b, NULL) : NULL;
+	jfloat * c = (b != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_c, NULL) : NULL;
+	jint returnValue = 1;
+	/* c is acquired last, so a non-NULL c means every earlier acquisition succeeded. */
+	if (c != NULL) {
+	  mkl_scscmm(transa, &m, &n, &k, &alpha, matdescra, vals, ir, jc, jc+1, b, &ldb, &beta, c, &ldc);
+	  returnValue = 0;
+	}
+	/* Release only what was acquired: arrays first (newest first), then the strings. */
+	if (c != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_c, c, 0);
+	if (b != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_b, b, 0);
+	if (jc != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_jc, jc, 0);
+	if (ir != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_ir, ir, 0);
+	if (vals != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_vals, vals, 0);
+	if (matdescra != NULL) (*env)->ReleaseStringUTFChars(env, j_matdescra, matdescra);
+	if (transa != NULL) (*env)->ReleaseStringUTFChars(env, j_transa, transa);
+	return returnValue;
+}
+
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_SPBLAS_scscmv 
+(JNIEnv * env, jobject calling_obj, jstring j_transa, jint m, jint k, jfloat alpha, jstring j_matdescra,
+ jfloatArray j_vals, jintArray j_ir, jintArray j_jc, jfloatArray j_x, jfloat beta, jfloatArray j_y){
+	/* JNI bridge to MKL_SCSCMV; jc supplies both pointer arrays (jc and jc+1). Returns 0 on success, 1 if an argument could not be acquired. */
+	char * transa = (char *)(*env)->GetStringUTFChars(env, j_transa, NULL);
+	char * matdescra = (transa != NULL) ? (char *)(*env)->GetStringUTFChars(env, j_matdescra, NULL) : NULL;
+	jfloat * vals = (matdescra != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_vals, NULL) : NULL;
+	jint * ir = (vals != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_ir, NULL) : NULL;
+	jint * jc = (ir != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_jc, NULL) : NULL;
+	jfloat * x = (jc != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_x, NULL) : NULL;
+	jfloat * y = (x != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_y, NULL) : NULL;
+	jint returnValue = 1; /* always initialized: set to 0 below on the success path */
+	/* y is acquired last, so a non-NULL y means every earlier acquisition succeeded. */
+	if (y != NULL) {
+	  MKL_SCSCMV(transa, &m, &k, &alpha, matdescra, vals, ir, jc, jc+1, x, &beta, y);
+	  returnValue = 0;
+	}
+	/* Release only what was acquired: arrays first (newest first), then the strings. */
+	if (y != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_y, y, 0);
+	if (x != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_x, x, 0);
+	if (jc != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_jc, jc, 0);
+	if (ir != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_ir, ir, 0);
+	if (vals != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_vals, vals, 0);
+	if (matdescra != NULL) (*env)->ReleaseStringUTFChars(env, j_matdescra, matdescra);
+	if (transa != NULL) (*env)->ReleaseStringUTFChars(env, j_transa, transa);
+	return returnValue;
+}
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_SPBLAS_scsrmv 
+(JNIEnv * env, jobject calling_obj, jstring j_transa, jint m, jint k, jfloat alpha, jstring j_matdescra,
+ jfloatArray j_vals, jintArray j_ir, jintArray j_jc, jfloatArray j_x, jfloat beta, jfloatArray j_y){
+	/* JNI bridge to MKL_SCSRMV; jc supplies both pointer arrays (jc and jc+1). Returns 0 on success, 1 if an argument could not be acquired. */
+	char * transa = (char *)(*env)->GetStringUTFChars(env, j_transa, NULL);
+	char * matdescra = (transa != NULL) ? (char *)(*env)->GetStringUTFChars(env, j_matdescra, NULL) : NULL;
+	jfloat * vals = (matdescra != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_vals, NULL) : NULL;
+	jint * ir = (vals != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_ir, NULL) : NULL;
+	jint * jc = (ir != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_jc, NULL) : NULL;
+	jfloat * x = (jc != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_x, NULL) : NULL;
+	jfloat * y = (x != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_y, NULL) : NULL;
+	jint returnValue = 1; /* always initialized: set to 0 below on the success path */
+	/* y is acquired last, so a non-NULL y means every earlier acquisition succeeded. */
+	if (y != NULL) {
+	  MKL_SCSRMV(transa, &m, &k, &alpha, matdescra, vals, ir, jc, jc+1, x, &beta, y);
+	  returnValue = 0;
+	}
+	/* Release only what was acquired: arrays first (newest first), then the strings. */
+	if (y != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_y, y, 0);
+	if (x != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_x, x, 0);
+	if (jc != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_jc, jc, 0);
+	if (ir != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_ir, ir, 0);
+	if (vals != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_vals, vals, 0);
+	if (matdescra != NULL) (*env)->ReleaseStringUTFChars(env, j_matdescra, matdescra);
+	if (transa != NULL) (*env)->ReleaseStringUTFChars(env, j_transa, transa);
+	return returnValue;
+}
+
+
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_SPBLAS_dcsrmm 
+(JNIEnv * env, jobject calling_obj, jstring j_transa, jint m, jint n, jint k, jdouble alpha, jstring j_matdescra,
+ jdoubleArray j_vals, jintArray j_ir, jintArray j_jc, jdoubleArray j_b, jint ldb, jdouble beta, jdoubleArray j_c, jint ldc){
+	/* JNI bridge to mkl_dcsrmm; jc supplies both pointer arrays (jc and jc+1). Returns 0 on success, 1 if an argument could not be acquired. */
+	char * transa = (char *)(*env)->GetStringUTFChars(env, j_transa, NULL);
+	char * matdescra = (transa != NULL) ? (char *)(*env)->GetStringUTFChars(env, j_matdescra, NULL) : NULL;
+	jdouble * vals = (matdescra != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_vals, NULL) : NULL;
+	jint * ir = (vals != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_ir, NULL) : NULL;
+	jint * jc = (ir != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_jc, NULL) : NULL;
+	jdouble * b = (jc != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_b, NULL) : NULL;
+	jdouble * c = (b != NULL) ? (*env)->GetPrimitiveArrayCritical(env, j_c, NULL) : NULL;
+	jint returnValue = 1;
+	/* c is acquired last, so a non-NULL c means every earlier acquisition succeeded. */
+	if (c != NULL) {
+	  mkl_dcsrmm(transa, &m, &n, &k, &alpha, matdescra, vals, ir, jc, jc+1, b, &ldb, &beta, c, &ldc);
+	  returnValue = 0;
+	}
+	/* Release only what was acquired: arrays first (newest first), then the strings. */
+	if (c != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_c, c, 0);
+	if (b != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_b, b, 0);
+	if (jc != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_jc, jc, 0);
+	if (ir != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_ir, ir, 0);
+	if (vals != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_vals, vals, 0);
+	if (matdescra != NULL) (*env)->ReleaseStringUTFChars(env, j_matdescra, matdescra);
+	if (transa != NULL) (*env)->ReleaseStringUTFChars(env, j_transa, transa);
+	return returnValue;
+}
+
+
+/*
+ * JNI entry point for edu.berkeley.bid.SPBLAS.dcscmm.
+ * Forwards to MKL mkl_dcscmm with j_vals as the value array, j_ir as the
+ * index array and j_jc / j_jc+1 as the pointer-begin / pointer-end arrays;
+ * j_b and j_c are the remaining array operands, with ldb and ldc passed by
+ * address. transa and matdescra are handed to MKL as C strings.
+ *
+ * Returns 0 after MKL has been called, or 1 if any string or array could
+ * not be obtained (MKL is then not called and any exception raised by the
+ * JVM is left pending for the Java caller).
+ */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_SPBLAS_dcscmm 
+(JNIEnv * env, jobject calling_obj, jstring j_transa, jint m, jint n, jint k, jdouble alpha, jstring j_matdescra,
+ jdoubleArray j_vals, jintArray j_ir, jintArray j_jc, jdoubleArray j_b, jint ldb, jdouble beta, jdoubleArray j_c, jint ldc){
+	char * transa = NULL;
+	char * matdescra = NULL;
+	jdouble * vals = NULL;
+	jint * ir = NULL;
+	jint * jc = NULL;
+	jdouble * b = NULL;
+	jdouble * c = NULL;
+	jint returnValue = 1;
+
+	/* Acquire in sequence and stop at the first failure, so that no further
+	   acquisition is attempted once one has failed. Both strings are fetched
+	   before the first critical section is opened. */
+	transa = (char *)(*env)->GetStringUTFChars(env, j_transa, NULL);
+	if (transa != NULL) matdescra = (char *)(*env)->GetStringUTFChars(env, j_matdescra, NULL);
+	if (matdescra != NULL) vals = (*env)->GetPrimitiveArrayCritical(env, j_vals, NULL);
+	if (vals != NULL) ir = (*env)->GetPrimitiveArrayCritical(env, j_ir, NULL);
+	if (ir != NULL) jc = (*env)->GetPrimitiveArrayCritical(env, j_jc, NULL);
+	if (jc != NULL) b = (*env)->GetPrimitiveArrayCritical(env, j_b, NULL);
+	if (b != NULL) c = (*env)->GetPrimitiveArrayCritical(env, j_c, NULL);
+
+	/* c is the last link of the chain: non-NULL means everything was obtained. */
+	if (c != NULL) {
+	  mkl_dcscmm(transa, &m, &n, &k, &alpha, matdescra, vals, ir, jc, jc+1, b, &ldb, &beta, c, &ldc);
+	  returnValue = 0;
+	}
+
+	/* Release in reverse order, and only what was actually acquired. */
+	if (c != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_c, c, 0);
+	if (b != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_b, b, 0);
+	if (jc != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_jc, jc, 0);
+	if (ir != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_ir, ir, 0);
+	if (vals != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_vals, vals, 0);
+	if (matdescra != NULL) (*env)->ReleaseStringUTFChars(env, j_matdescra, matdescra);
+	if (transa != NULL) (*env)->ReleaseStringUTFChars(env, j_transa, transa);
+	return returnValue;
+}
+
+
+/*
+ * JNI entry point for edu.berkeley.bid.SPBLAS.dcscmv.
+ * Forwards to MKL_DCSCMV with j_vals as the value array, j_ir as the index
+ * array and j_jc / j_jc+1 as the pointer-begin / pointer-end arrays; j_x and
+ * j_y are the two remaining array operands. transa and matdescra are handed
+ * to MKL as C strings.
+ *
+ * Returns 0 after MKL has been called, or 1 if any string or array could
+ * not be obtained (MKL is then not called and any exception raised by the
+ * JVM is left pending for the Java caller).
+ */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_SPBLAS_dcscmv 
+(JNIEnv * env, jobject calling_obj, jstring j_transa, jint m, jint k, jdouble alpha, jstring j_matdescra,
+ jdoubleArray j_vals, jintArray j_ir, jintArray j_jc, jdoubleArray j_x, jdouble beta, jdoubleArray j_y){
+	char * transa = NULL;
+	char * matdescra = NULL;
+	jdouble * vals = NULL;
+	jint * ir = NULL;
+	jint * jc = NULL;
+	jdouble * x = NULL;
+	jdouble * y = NULL;
+	jint returnValue = 1;
+
+	/* Acquire in sequence and stop at the first failure, so that no further
+	   acquisition is attempted once one has failed. Both strings are fetched
+	   before the first critical section is opened. */
+	transa = (char *)(*env)->GetStringUTFChars(env, j_transa, NULL);
+	if (transa != NULL) matdescra = (char *)(*env)->GetStringUTFChars(env, j_matdescra, NULL);
+	if (matdescra != NULL) vals = (*env)->GetPrimitiveArrayCritical(env, j_vals, NULL);
+	if (vals != NULL) ir = (*env)->GetPrimitiveArrayCritical(env, j_ir, NULL);
+	if (ir != NULL) jc = (*env)->GetPrimitiveArrayCritical(env, j_jc, NULL);
+	if (jc != NULL) x = (*env)->GetPrimitiveArrayCritical(env, j_x, NULL);
+	if (x != NULL) y = (*env)->GetPrimitiveArrayCritical(env, j_y, NULL);
+
+	/* y is the last link of the chain: non-NULL means everything was obtained. */
+	if (y != NULL) {
+	  MKL_DCSCMV(transa, &m, &k, &alpha, matdescra, vals, ir, jc, jc+1, x, &beta, y);
+	  returnValue = 0;
+	}
+
+	/* Release in reverse order, and only what was actually acquired. */
+	if (y != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_y, y, 0);
+	if (x != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_x, x, 0);
+	if (jc != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_jc, jc, 0);
+	if (ir != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_ir, ir, 0);
+	if (vals != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_vals, vals, 0);
+	if (matdescra != NULL) (*env)->ReleaseStringUTFChars(env, j_matdescra, matdescra);
+	if (transa != NULL) (*env)->ReleaseStringUTFChars(env, j_transa, transa);
+	return returnValue;
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.SPBLAS.dcsrmv.
+ * Forwards to MKL_DCSRMV with j_vals as the value array, j_ir as the index
+ * array and j_jc / j_jc+1 as the pointer-begin / pointer-end arrays; j_x and
+ * j_y are the two remaining array operands. transa and matdescra are handed
+ * to MKL as C strings.
+ *
+ * Returns 0 after MKL has been called, or 1 if any string or array could
+ * not be obtained (MKL is then not called and any exception raised by the
+ * JVM is left pending for the Java caller).
+ */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_SPBLAS_dcsrmv
+(JNIEnv * env, jobject calling_obj, jstring j_transa, jint m, jint k, jdouble alpha, jstring j_matdescra,
+ jdoubleArray j_vals, jintArray j_ir, jintArray j_jc, jdoubleArray j_x, jdouble beta, jdoubleArray j_y){
+	char * transa = NULL;
+	char * matdescra = NULL;
+	jdouble * vals = NULL;
+	jint * ir = NULL;
+	jint * jc = NULL;
+	jdouble * x = NULL;
+	jdouble * y = NULL;
+	jint returnValue = 1;
+
+	/* Acquire in sequence and stop at the first failure, so that no further
+	   acquisition is attempted once one has failed. Both strings are fetched
+	   before the first critical section is opened. */
+	transa = (char *)(*env)->GetStringUTFChars(env, j_transa, NULL);
+	if (transa != NULL) matdescra = (char *)(*env)->GetStringUTFChars(env, j_matdescra, NULL);
+	if (matdescra != NULL) vals = (*env)->GetPrimitiveArrayCritical(env, j_vals, NULL);
+	if (vals != NULL) ir = (*env)->GetPrimitiveArrayCritical(env, j_ir, NULL);
+	if (ir != NULL) jc = (*env)->GetPrimitiveArrayCritical(env, j_jc, NULL);
+	if (jc != NULL) x = (*env)->GetPrimitiveArrayCritical(env, j_x, NULL);
+	if (x != NULL) y = (*env)->GetPrimitiveArrayCritical(env, j_y, NULL);
+
+	/* y is the last link of the chain: non-NULL means everything was obtained. */
+	if (y != NULL) {
+	  MKL_DCSRMV(transa, &m, &k, &alpha, matdescra, vals, ir, jc, jc+1, x, &beta, y);
+	  returnValue = 0;
+	}
+
+	/* Release in reverse order, and only what was actually acquired. */
+	if (y != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_y, y, 0);
+	if (x != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_x, x, 0);
+	if (jc != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_jc, jc, 0);
+	if (ir != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_ir, ir, 0);
+	if (vals != NULL) (*env)->ReleasePrimitiveArrayCritical(env, j_vals, vals, 0);
+	if (matdescra != NULL) (*env)->ReleaseStringUTFChars(env, j_matdescra, matdescra);
+	if (transa != NULL) (*env)->ReleaseStringUTFChars(env, j_transa, transa);
+	return returnValue;
+}
diff --git a/jni/src/BIDMat_UTILS.c b/jni/src/BIDMat_UTILS.c
new file mode 100755
index 00000000..88ccfc73
--- /dev/null
+++ b/jni/src/BIDMat_UTILS.c
@@ -0,0 +1,73 @@
+#include <jni.h>
+#include <mkl.h>
+#include <mkl_trans.h>
+#include <string.h>
+
+/*
+ * JNI entry point for edu.berkeley.bid.UTILS.memcpybi.
+ * Copies N raw bytes from the byte array jA, starting at element startA,
+ * into the storage of the int array jB.
+ *
+ * startB is a BYTE offset into jB's storage, not an int index: B is cast to
+ * char * before the offset is added.
+ *
+ * Neither range is checked against the array lengths; the caller must keep
+ * both in bounds. The call is a no-op when N is not positive or when either
+ * array cannot be obtained (any JVM exception is then left pending).
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_UTILS_memcpybi
+(JNIEnv * env, jobject calling_obj, jint N, jbyteArray jA, jint startA, jintArray jB, jint startB){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jbyte * A = (*env)->GetPrimitiveArrayCritical(env, jA, NULL);
+	jint * B = (A != NULL) ? (*env)->GetPrimitiveArrayCritical(env, jB, NULL) : NULL;
+
+	/* A negative N would be converted to an enormous size_t by memcpy. */
+	if (B != NULL && N > 0) {
+		memcpy(((char *)B)+startB, A+startA, N);
+	}
+
+	if (B != NULL) (*env)->ReleasePrimitiveArrayCritical(env, jB, B, 0);
+	/* A is only read, so there is nothing to write back. */
+	if (A != NULL) (*env)->ReleasePrimitiveArrayCritical(env, jA, A, JNI_ABORT);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.UTILS.memcpybf.
+ * Copies N raw bytes from the byte array jA, starting at element startA,
+ * into the storage of the float array jB.
+ *
+ * startB is a BYTE offset into jB's storage, not a float index: B is cast
+ * to char * before the offset is added.
+ *
+ * Neither range is checked against the array lengths; the caller must keep
+ * both in bounds. The call is a no-op when N is not positive or when either
+ * array cannot be obtained (any JVM exception is then left pending).
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_UTILS_memcpybf
+(JNIEnv * env, jobject calling_obj, jint N, jbyteArray jA, jint startA, jfloatArray jB, jint startB){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jbyte * A = (*env)->GetPrimitiveArrayCritical(env, jA, NULL);
+	jfloat * B = (A != NULL) ? (*env)->GetPrimitiveArrayCritical(env, jB, NULL) : NULL;
+
+	/* A negative N would be converted to an enormous size_t by memcpy. */
+	if (B != NULL && N > 0) {
+		memcpy(((char *)B)+startB, A+startA, N);
+	}
+
+	if (B != NULL) (*env)->ReleasePrimitiveArrayCritical(env, jB, B, 0);
+	/* A is only read, so there is nothing to write back. */
+	if (A != NULL) (*env)->ReleasePrimitiveArrayCritical(env, jA, A, JNI_ABORT);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.UTILS.memcpybd.
+ * Copies N raw bytes from the byte array jA, starting at element startA,
+ * into the storage of the double array jB.
+ *
+ * startB is a BYTE offset into jB's storage, not a double index: B is cast
+ * to char * before the offset is added.
+ *
+ * Neither range is checked against the array lengths; the caller must keep
+ * both in bounds. The call is a no-op when N is not positive or when either
+ * array cannot be obtained (any JVM exception is then left pending).
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_UTILS_memcpybd
+(JNIEnv * env, jobject calling_obj, jint N, jbyteArray jA, jint startA, jdoubleArray jB, jint startB){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jbyte * A = (*env)->GetPrimitiveArrayCritical(env, jA, NULL);
+	jdouble * B = (A != NULL) ? (*env)->GetPrimitiveArrayCritical(env, jB, NULL) : NULL;
+
+	/* A negative N would be converted to an enormous size_t by memcpy. */
+	if (B != NULL && N > 0) {
+		memcpy(((char *)B)+startB, A+startA, N);
+	}
+
+	if (B != NULL) (*env)->ReleasePrimitiveArrayCritical(env, jB, B, 0);
+	/* A is only read, so there is nothing to write back. */
+	if (A != NULL) (*env)->ReleasePrimitiveArrayCritical(env, jA, A, JNI_ABORT);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.UTILS.memcpyib.
+ * Copies N raw bytes out of the storage of the int array jA into the byte
+ * array jB, starting at element startB.
+ *
+ * startA is a BYTE offset into jA's storage, not an int index: A is cast to
+ * char * before the offset is added.
+ *
+ * Neither range is checked against the array lengths; the caller must keep
+ * both in bounds. The call is a no-op when N is not positive or when either
+ * array cannot be obtained (any JVM exception is then left pending).
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_UTILS_memcpyib
+(JNIEnv * env, jobject calling_obj, jint N, jintArray jA, jint startA, jbyteArray jB, jint startB){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jint * A = (*env)->GetPrimitiveArrayCritical(env, jA, NULL);
+	jbyte * B = (A != NULL) ? (*env)->GetPrimitiveArrayCritical(env, jB, NULL) : NULL;
+
+	/* A negative N would be converted to an enormous size_t by memcpy. */
+	if (B != NULL && N > 0) {
+		memcpy(B+startB, ((char *)A)+startA, N);
+	}
+
+	if (B != NULL) (*env)->ReleasePrimitiveArrayCritical(env, jB, B, 0);
+	/* A is only read, so there is nothing to write back. */
+	if (A != NULL) (*env)->ReleasePrimitiveArrayCritical(env, jA, A, JNI_ABORT);
+}
+
+
+/*
+ * JNI entry point for edu.berkeley.bid.UTILS.memcpyfb.
+ * Copies N raw bytes out of the storage of the float array jA into the byte
+ * array jB, starting at element startB.
+ *
+ * startA is a BYTE offset into jA's storage, not a float index: A is cast
+ * to char * before the offset is added.
+ *
+ * Neither range is checked against the array lengths; the caller must keep
+ * both in bounds. The call is a no-op when N is not positive or when either
+ * array cannot be obtained (any JVM exception is then left pending).
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_UTILS_memcpyfb
+(JNIEnv * env, jobject calling_obj, jint N, jfloatArray jA, jint startA, jbyteArray jB, jint startB){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * A = (*env)->GetPrimitiveArrayCritical(env, jA, NULL);
+	jbyte * B = (A != NULL) ? (*env)->GetPrimitiveArrayCritical(env, jB, NULL) : NULL;
+
+	/* A negative N would be converted to an enormous size_t by memcpy. */
+	if (B != NULL && N > 0) {
+		memcpy(B+startB, ((char *)A)+startA, N);
+	}
+
+	if (B != NULL) (*env)->ReleasePrimitiveArrayCritical(env, jB, B, 0);
+	/* A is only read, so there is nothing to write back. */
+	if (A != NULL) (*env)->ReleasePrimitiveArrayCritical(env, jA, A, JNI_ABORT);
+}
+
+
+/*
+ * JNI entry point for edu.berkeley.bid.UTILS.memcpydb.
+ * Copies N raw bytes out of the storage of the double array jA into the
+ * byte array jB, starting at element startB.
+ *
+ * startA is a BYTE offset into jA's storage, not a double index: A is cast
+ * to char * before the offset is added.
+ *
+ * Neither range is checked against the array lengths; the caller must keep
+ * both in bounds. The call is a no-op when N is not positive or when either
+ * array cannot be obtained (any JVM exception is then left pending).
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_UTILS_memcpydb
+(JNIEnv * env, jobject calling_obj, jint N, jdoubleArray jA, jint startA, jbyteArray jB, jint startB){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * A = (*env)->GetPrimitiveArrayCritical(env, jA, NULL);
+	jbyte * B = (A != NULL) ? (*env)->GetPrimitiveArrayCritical(env, jB, NULL) : NULL;
+
+	/* A negative N would be converted to an enormous size_t by memcpy. */
+	if (B != NULL && N > 0) {
+		memcpy(B+startB, ((char *)A)+startA, N);
+	}
+
+	if (B != NULL) (*env)->ReleasePrimitiveArrayCritical(env, jB, B, 0);
+	/* A is only read, so there is nothing to write back. */
+	if (A != NULL) (*env)->ReleasePrimitiveArrayCritical(env, jA, A, JNI_ABORT);
+}
+
diff --git a/jni/src/BIDMat_VML.c b/jni/src/BIDMat_VML.c
new file mode 100755
index 00000000..c4b2a614
--- /dev/null
+++ b/jni/src/BIDMat_VML.c
@@ -0,0 +1,2006 @@
+
+#include <jni.h>
+#include <mkl.h>
+#include <mkl_vml.h>
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmsCdfNormInv.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vmsCdfNormInv(n, arg2, arg3, arg4). Presumably arg2 is the input, arg3
+ * receives the result and arg4 is the VML mode - confirm against the MKL
+ * VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsCdfNormInv (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vmsCdfNormInv(n, jni_arg2, jni_arg3, arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmdCdfNormInv.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vmdCdfNormInv(n, arg2, arg3, arg4). Presumably arg2 is the input,
+ * arg3 receives the result and arg4 is the VML mode - confirm against the
+ * MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdCdfNormInv (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vmdCdfNormInv(n, jni_arg2, jni_arg3, arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmsLinearFrac.
+ * Obtains critical pointers to the float arrays arg2, arg3 and arg8 and
+ * calls MKL vmsLinearFrac(n, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9).
+ * The scalars arg4..arg7 and arg9 are forwarded unchanged. Presumably arg2
+ * and arg3 are the inputs, arg8 receives the result and arg9 is the VML
+ * mode - confirm against the MKL VML reference.
+ * If any array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsLinearFrac (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jfloat arg4, jfloat arg5, jfloat arg6, jfloat arg7, jfloatArray arg8, jlong arg9){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+	jfloat * jni_arg8 = (jni_arg3 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg8, NULL) : NULL;
+
+	if (jni_arg8 != NULL) {
+		vmsLinearFrac(n, jni_arg2, jni_arg3, arg4, arg5, arg6, arg7, jni_arg8, arg9);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg8 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg8, jni_arg8, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmdLinearFrac.
+ * Obtains critical pointers to the double arrays arg2, arg3 and arg8 and
+ * calls MKL vmdLinearFrac(n, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9).
+ * The scalars arg4..arg7 and arg9 are forwarded unchanged. Presumably arg2
+ * and arg3 are the inputs, arg8 receives the result and arg9 is the VML
+ * mode - confirm against the MKL VML reference.
+ * If any array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdLinearFrac (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jdouble arg4, jdouble arg5, jdouble arg6, jdouble arg7, jdoubleArray arg8, jlong arg9){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+	jdouble * jni_arg8 = (jni_arg3 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg8, NULL) : NULL;
+
+	if (jni_arg8 != NULL) {
+		vmdLinearFrac(n, jni_arg2, jni_arg3, arg4, arg5, arg6, arg7, jni_arg8, arg9);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg8 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg8, jni_arg8, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmlSetErrStatus.
+ * Hands n to MKL vmlSetErrStatus and returns whatever that call returns,
+ * converted to jint.
+ */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VML_vmlSetErrStatus (JNIEnv * env, jobject calling_obj, jint n){
+	return vmlSetErrStatus(n);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmlGetErrStatus.
+ * Calls MKL vmlGetErrStatus with no arguments and returns its result,
+ * converted to jint.
+ */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VML_vmlGetErrStatus (JNIEnv * env, jobject calling_obj){
+	return vmlGetErrStatus();
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmlClearErrStatus.
+ * Calls MKL vmlClearErrStatus with no arguments and returns its result,
+ * converted to jint.
+ */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VML_vmlClearErrStatus (JNIEnv * env, jobject calling_obj){
+	return vmlClearErrStatus();
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vsAbs.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vsAbs(n, arg2, arg3). Presumably arg2 is the input and arg3 receives the
+ * result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsAbs (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vsAbs(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vdAbs.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vdAbs(n, arg2, arg3). Presumably arg2 is the input and arg3 receives
+ * the result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdAbs (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vdAbs(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vsAdd.
+ * Obtains critical pointers to the float arrays arg2, arg3 and arg4 and
+ * calls MKL vsAdd(n, arg2, arg3, arg4). Presumably arg2 and arg3 are the
+ * operands and arg4 receives the result - confirm against the MKL VML
+ * reference.
+ * If any array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsAdd (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jfloatArray arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+	jfloat * jni_arg4 = (jni_arg3 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg4, NULL) : NULL;
+
+	if (jni_arg4 != NULL) {
+		vsAdd(n, jni_arg2, jni_arg3, jni_arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg4 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vdAdd.
+ * Obtains critical pointers to the double arrays arg2, arg3 and arg4 and
+ * calls MKL vdAdd(n, arg2, arg3, arg4). Presumably arg2 and arg3 are the
+ * operands and arg4 receives the result - confirm against the MKL VML
+ * reference.
+ * If any array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdAdd (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jdoubleArray arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+	jdouble * jni_arg4 = (jni_arg3 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg4, NULL) : NULL;
+
+	if (jni_arg4 != NULL) {
+		vdAdd(n, jni_arg2, jni_arg3, jni_arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg4 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vsSub.
+ * Obtains critical pointers to the float arrays arg2, arg3 and arg4 and
+ * calls MKL vsSub(n, arg2, arg3, arg4). Presumably arg2 and arg3 are the
+ * operands and arg4 receives the result - confirm against the MKL VML
+ * reference.
+ * If any array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsSub (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jfloatArray arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+	jfloat * jni_arg4 = (jni_arg3 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg4, NULL) : NULL;
+
+	if (jni_arg4 != NULL) {
+		vsSub(n, jni_arg2, jni_arg3, jni_arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg4 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vdSub.
+ * Obtains critical pointers to the double arrays arg2, arg3 and arg4 and
+ * calls MKL vdSub(n, arg2, arg3, arg4). Presumably arg2 and arg3 are the
+ * operands and arg4 receives the result - confirm against the MKL VML
+ * reference.
+ * If any array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdSub (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jdoubleArray arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+	jdouble * jni_arg4 = (jni_arg3 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg4, NULL) : NULL;
+
+	if (jni_arg4 != NULL) {
+		vdSub(n, jni_arg2, jni_arg3, jni_arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg4 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vsInv.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vsInv(n, arg2, arg3). Presumably arg2 is the input and arg3 receives the
+ * result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsInv (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vsInv(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vdInv.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vdInv(n, arg2, arg3). Presumably arg2 is the input and arg3 receives
+ * the result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdInv (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vdInv(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vsSqrt.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vsSqrt(n, arg2, arg3). Presumably arg2 is the input and arg3 receives the
+ * result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsSqrt (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vsSqrt(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vdSqrt.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vdSqrt(n, arg2, arg3). Presumably arg2 is the input and arg3 receives
+ * the result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdSqrt (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vdSqrt(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmsSqrt.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vmsSqrt(n, arg2, arg3, arg4). Presumably arg2 is the input, arg3 receives
+ * the result and arg4 is the VML mode - confirm against the MKL VML
+ * reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsSqrt (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vmsSqrt(n, jni_arg2, jni_arg3, arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmdSqrt.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vmdSqrt(n, arg2, arg3, arg4). Presumably arg2 is the input, arg3
+ * receives the result and arg4 is the VML mode - confirm against the MKL
+ * VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdSqrt (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vmdSqrt(n, jni_arg2, jni_arg3, arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vsInvSqrt.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vsInvSqrt(n, arg2, arg3). Presumably arg2 is the input and arg3 receives
+ * the result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsInvSqrt (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vsInvSqrt(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vdInvSqrt.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vdInvSqrt(n, arg2, arg3). Presumably arg2 is the input and arg3
+ * receives the result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdInvSqrt (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vdInvSqrt(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmsInvSqrt.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vmsInvSqrt(n, arg2, arg3, arg4). Presumably arg2 is the input, arg3
+ * receives the result and arg4 is the VML mode - confirm against the MKL
+ * VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsInvSqrt (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vmsInvSqrt(n, jni_arg2, jni_arg3, arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmdInvSqrt.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vmdInvSqrt(n, arg2, arg3, arg4). Presumably arg2 is the input, arg3
+ * receives the result and arg4 is the VML mode - confirm against the MKL
+ * VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdInvSqrt (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vmdInvSqrt(n, jni_arg2, jni_arg3, arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vsCbrt.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vsCbrt(n, arg2, arg3). Presumably arg2 is the input and arg3 receives the
+ * result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsCbrt (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vsCbrt(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vdCbrt.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vdCbrt(n, arg2, arg3). Presumably arg2 is the input and arg3 receives
+ * the result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdCbrt (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vdCbrt(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmsCbrt.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vmsCbrt(n, arg2, arg3, arg4). Presumably arg2 is the input, arg3 receives
+ * the result and arg4 is the VML mode - confirm against the MKL VML
+ * reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsCbrt (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vmsCbrt(n, jni_arg2, jni_arg3, arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmdCbrt.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vmdCbrt(n, arg2, arg3, arg4). Presumably arg2 is the input, arg3
+ * receives the result and arg4 is the VML mode - confirm against the MKL
+ * VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdCbrt (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vmdCbrt(n, jni_arg2, jni_arg3, arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vsInvCbrt.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vsInvCbrt(n, arg2, arg3). Presumably arg2 is the input and arg3 receives
+ * the result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsInvCbrt (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vsInvCbrt(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vdInvCbrt.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vdInvCbrt(n, arg2, arg3). Presumably arg2 is the input and arg3
+ * receives the result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdInvCbrt (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vdInvCbrt(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmsInvCbrt.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vmsInvCbrt(n, arg2, arg3, arg4). Presumably arg2 is the input, arg3
+ * receives the result and arg4 is the VML mode - confirm against the MKL
+ * VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsInvCbrt (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vmsInvCbrt(n, jni_arg2, jni_arg3, arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmdInvCbrt.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vmdInvCbrt(n, arg2, arg3, arg4). Presumably arg2 is the input, arg3
+ * receives the result and arg4 is the VML mode - confirm against the MKL
+ * VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdInvCbrt (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vmdInvCbrt(n, jni_arg2, jni_arg3, arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vsSqr.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vsSqr(n, arg2, arg3). Presumably arg2 is the input and arg3 receives the
+ * result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsSqr (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vsSqr(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vdSqr.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vdSqr(n, arg2, arg3). Presumably arg2 is the input and arg3 receives
+ * the result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdSqr (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vdSqr(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vsExp.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vsExp(n, arg2, arg3). Presumably arg2 is the input and arg3 receives the
+ * result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsExp (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vsExp(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vdExp.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vdExp(n, arg2, arg3). Presumably arg2 is the input and arg3 receives
+ * the result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdExp (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vdExp(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmsExp.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vmsExp(n, arg2, arg3, arg4). Presumably arg2 is the input, arg3 receives
+ * the result and arg4 is the VML mode - confirm against the MKL VML
+ * reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsExp (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vmsExp(n, jni_arg2, jni_arg3, arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmdExp.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vmdExp(n, arg2, arg3, arg4). Presumably arg2 is the input, arg3
+ * receives the result and arg4 is the VML mode - confirm against the MKL
+ * VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdExp (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vmdExp(n, jni_arg2, jni_arg3, arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vsExpm1.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vsExpm1(n, arg2, arg3). Presumably arg2 is the input and arg3 receives
+ * the result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsExpm1 (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vsExpm1(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vdExpm1.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vdExpm1(n, arg2, arg3). Presumably arg2 is the input and arg3
+ * receives the result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdExpm1 (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vdExpm1(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmsExpm1.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vmsExpm1(n, arg2, arg3, arg4). Presumably arg2 is the input, arg3
+ * receives the result and arg4 is the VML mode - confirm against the MKL
+ * VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsExpm1 (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vmsExpm1(n, jni_arg2, jni_arg3, arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmdExpm1.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vmdExpm1(n, arg2, arg3, arg4). Presumably arg2 is the input, arg3
+ * receives the result and arg4 is the VML mode - confirm against the MKL
+ * VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdExpm1 (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vmdExpm1(n, jni_arg2, jni_arg3, arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vsLn.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vsLn(n, arg2, arg3). Presumably arg2 is the input and arg3 receives the
+ * result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsLn (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vsLn(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vdLn.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vdLn(n, arg2, arg3). Presumably arg2 is the input and arg3 receives
+ * the result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdLn (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vdLn(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmsLn.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vmsLn(n, arg2, arg3, arg4). Presumably arg2 is the input, arg3 receives
+ * the result and arg4 is the VML mode - confirm against the MKL VML
+ * reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsLn (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vmsLn(n, jni_arg2, jni_arg3, arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmdLn.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vmdLn(n, arg2, arg3, arg4). Presumably arg2 is the input, arg3
+ * receives the result and arg4 is the VML mode - confirm against the MKL
+ * VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdLn (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vmdLn(n, jni_arg2, jni_arg3, arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vsLog10.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vsLog10(n, arg2, arg3). Presumably arg2 is the input and arg3 receives
+ * the result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsLog10 (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vsLog10(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vdLog10.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vdLog10(n, arg2, arg3). Presumably arg2 is the input and arg3
+ * receives the result - confirm against the MKL VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdLog10 (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vdLog10(n, jni_arg2, jni_arg3);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmsLog10.
+ * Obtains critical pointers to the float arrays arg2 and arg3 and calls MKL
+ * vmsLog10(n, arg2, arg3, arg4). Presumably arg2 is the input, arg3
+ * receives the result and arg4 is the VML mode - confirm against the MKL
+ * VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsLog10 (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vmsLog10(n, jni_arg2, jni_arg3, arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/*
+ * JNI entry point for edu.berkeley.bid.VML.vmdLog10.
+ * Obtains critical pointers to the double arrays arg2 and arg3 and calls
+ * MKL vmdLog10(n, arg2, arg3, arg4). Presumably arg2 is the input, arg3
+ * receives the result and arg4 is the VML mode - confirm against the MKL
+ * VML reference.
+ * If either array cannot be obtained, MKL is not called and any exception
+ * raised by the JVM is left pending for the Java caller.
+ */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdLog10 (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL rather than JNI_FALSE. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	if (jni_arg3 != NULL) {
+		vmdLog10(n, jni_arg2, jni_arg3, arg4);
+	}
+
+	/* Release only what was actually obtained. */
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+}
+
+/* JNI entry point for VML.vsLog1p: forwards n and both float arrays to vsLog1p. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsLog1p (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vsLog1p(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vdLog1p: forwards n and both double arrays to vdLog1p. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdLog1p (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vdLog1p(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmsLog1p: forwards n, both float arrays and arg4 to vmsLog1p. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsLog1p (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmsLog1p(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmdLog1p: forwards n, both double arrays and arg4 to vmdLog1p. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdLog1p (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmdLog1p(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vsCos: forwards n and both float arrays to vsCos. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsCos (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vsCos(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vdCos: forwards n and both double arrays to vdCos. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdCos (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vdCos(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmsCos: forwards n, both float arrays and arg4 to vmsCos. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsCos (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmsCos(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmdCos: forwards n, both double arrays and arg4 to vmdCos. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdCos (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmdCos(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vsSin: forwards n and both float arrays to vsSin. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsSin (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vsSin(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vdSin: forwards n and both double arrays to vdSin. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdSin (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vdSin(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmsSin: forwards n, both float arrays and arg4 to vmsSin. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsSin (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmsSin(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmdSin: forwards n, both double arrays and arg4 to vmdSin. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdSin (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmdSin(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vsTan: forwards n and both float arrays to vsTan. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsTan (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vsTan(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vdTan: forwards n and both double arrays to vdTan. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdTan (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vdTan(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmsTan: forwards n, both float arrays and arg4 to vmsTan. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsTan (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmsTan(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmdTan: forwards n, both double arrays and arg4 to vmdTan. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdTan (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmdTan(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vsCosh: forwards n and both float arrays to vsCosh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsCosh (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vsCosh(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vdCosh: forwards n and both double arrays to vdCosh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdCosh (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vdCosh(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmsCosh: forwards n, both float arrays and arg4 to vmsCosh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsCosh (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmsCosh(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmdCosh: forwards n, both double arrays and arg4 to vmdCosh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdCosh (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmdCosh(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vsSinh: forwards n and both float arrays to vsSinh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsSinh (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vsSinh(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vdSinh: forwards n and both double arrays to vdSinh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdSinh (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vdSinh(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmsSinh: forwards n, both float arrays and arg4 to vmsSinh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsSinh (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmsSinh(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmdSinh: forwards n, both double arrays and arg4 to vmdSinh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdSinh (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmdSinh(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vsTanh: forwards n and both float arrays to vsTanh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsTanh (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vsTanh(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vdTanh: forwards n and both double arrays to vdTanh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdTanh (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vdTanh(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmsTanh: forwards n, both float arrays and arg4 to vmsTanh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsTanh (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmsTanh(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmdTanh: forwards n, both double arrays and arg4 to vmdTanh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdTanh (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmdTanh(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vsAcos: forwards n and both float arrays to vsAcos. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsAcos (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vsAcos(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vdAcos: forwards n and both double arrays to vdAcos. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdAcos (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vdAcos(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmsAcos: forwards n, both float arrays and arg4 to vmsAcos. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsAcos (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmsAcos(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmdAcos: forwards n, both double arrays and arg4 to vmdAcos. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdAcos (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmdAcos(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vsAsin: forwards n and both float arrays to vsAsin. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsAsin (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vsAsin(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vdAsin: forwards n and both double arrays to vdAsin. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdAsin (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vdAsin(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmsAsin: forwards n, both float arrays and arg4 to vmsAsin. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsAsin (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmsAsin(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmdAsin: forwards n, both double arrays and arg4 to vmdAsin. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdAsin (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmdAsin(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vsAtan: forwards n and both float arrays to vsAtan. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsAtan (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vsAtan(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vdAtan: forwards n and both double arrays to vdAtan. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdAtan (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vdAtan(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmsAtan: forwards n, both float arrays and arg4 to vmsAtan. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsAtan (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmsAtan(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmdAtan: forwards n, both double arrays and arg4 to vmdAtan. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdAtan (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmdAtan(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vsAcosh: forwards n and both float arrays to vsAcosh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsAcosh (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vsAcosh(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vdAcosh: forwards n and both double arrays to vdAcosh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdAcosh (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vdAcosh(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmsAcosh: forwards n, both float arrays and arg4 to vmsAcosh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsAcosh (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmsAcosh(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmdAcosh: forwards n, both double arrays and arg4 to vmdAcosh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdAcosh (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmdAcosh(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vsAsinh: forwards n and both float arrays to vsAsinh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsAsinh (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vsAsinh(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vdAsinh: forwards n and both double arrays to vdAsinh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdAsinh (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vdAsinh(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmsAsinh: forwards n, both float arrays and arg4 to vmsAsinh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsAsinh (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmsAsinh(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmdAsinh: forwards n, both double arrays and arg4 to vmdAsinh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdAsinh (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmdAsinh(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vsAtanh: forwards n and both float arrays to vsAtanh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsAtanh (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vsAtanh(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vdAtanh: forwards n and both double arrays to vdAtanh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdAtanh (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vdAtanh(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmsAtanh: forwards n, both float arrays and arg4 to vmsAtanh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsAtanh (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmsAtanh(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmdAtanh: forwards n, both double arrays and arg4 to vmdAtanh. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdAtanh (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmdAtanh(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vsErf: forwards n and both float arrays to vsErf. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsErf (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vsErf(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vdErf: forwards n and both double arrays to vdErf. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdErf (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vdErf(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmsErf: forwards n, both float arrays and arg4 to vmsErf. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsErf (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmsErf(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmdErf: forwards n, both double arrays and arg4 to vmdErf. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdErf (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmdErf(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vsErfInv: forwards n and both float arrays to vsErfInv. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsErfInv (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vsErfInv(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vdErfInv: forwards n and both double arrays to vdErfInv. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdErfInv (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vdErfInv(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmsErfInv: forwards n, both float arrays and arg4 to vmsErfInv. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsErfInv (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmsErfInv(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vmdErfInv: forwards n, both double arrays and arg4 to vmdErfInv. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdErfInv (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vmdErfInv(n, jni_arg2, jni_arg3, arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vsHypot: forwards n and the three float arrays to vsHypot. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsHypot (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jfloatArray arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; each array is acquired only if the previous one succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+	jfloat * jni_arg4 = (jni_arg3 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg4, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg4 != NULL) vsHypot(n, jni_arg2, jni_arg3, jni_arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+/* JNI entry point for VML.vdHypot: forwards n and the three double arrays to vdHypot. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdHypot (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jdoubleArray arg4){
+	/* isCopy is an optional out-pointer, so pass NULL; each array is acquired only if the previous one succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+	jdouble * jni_arg4 = (jni_arg3 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg4, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg4 != NULL) vdHypot(n, jni_arg2, jni_arg3, jni_arg4);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+/* JNI entry point for VML.vmsHypot: forwards n, the three float arrays and arg5 to vmsHypot. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsHypot (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jfloatArray arg4, jlong arg5){
+	/* isCopy is an optional out-pointer, so pass NULL; each array is acquired only if the previous one succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+	jfloat * jni_arg4 = (jni_arg3 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg4, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg4 != NULL) vmsHypot(n, jni_arg2, jni_arg3, jni_arg4, arg5);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+/* JNI entry point for VML.vmdHypot: forwards n, the three double arrays and arg5 to vmdHypot. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdHypot (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jdoubleArray arg4, jlong arg5){
+	/* isCopy is an optional out-pointer, so pass NULL; each array is acquired only if the previous one succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+	jdouble * jni_arg4 = (jni_arg3 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg4, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg4 != NULL) vmdHypot(n, jni_arg2, jni_arg3, jni_arg4, arg5);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+/* JNI entry point for VML.vsErfc: forwards n and both float arrays to vsErfc. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsErfc (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vsErfc(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vdErfc: forwards n and both double arrays to vdErfc. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdErfc (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	/* isCopy is an optional out-pointer, so pass NULL; arg3 is acquired only if arg2 succeeded. */
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * jni_arg3 = (jni_arg2 != NULL) ? (*env)->GetPrimitiveArrayCritical(env, arg3, NULL) : NULL;
+
+	/* A failed acquire yields NULL: skip the native call instead of passing it a null buffer. */
+	if (jni_arg3 != NULL) vdErfc(n, jni_arg2, jni_arg3);
+
+	if (jni_arg2 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3 != NULL) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsErfc (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vmsErfc. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vmsErfc(n, jni_arg2, jni_arg3, arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdErfc (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vmdErfc. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vmdErfc(n, jni_arg2, jni_arg3, arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsErfcInv (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vsErfcInv. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vsErfcInv(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdErfcInv (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vdErfcInv. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vdErfcInv(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsErfcInv (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vmsErfcInv. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vmsErfcInv(n, jni_arg2, jni_arg3, arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdErfcInv (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vmdErfcInv. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vmdErfcInv(n, jni_arg2, jni_arg3, arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsCdfNorm (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vsCdfNorm. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vsCdfNorm(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdCdfNorm (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vdCdfNorm. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vdCdfNorm(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsCdfNorm (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vmsCdfNorm. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vmsCdfNorm(n, jni_arg2, jni_arg3, arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdCdfNorm (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vmdCdfNorm. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vmdCdfNorm(n, jni_arg2, jni_arg3, arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsCdfNormInv (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vsCdfNormInv. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vsCdfNormInv(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdCdfNormInv (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vdCdfNormInv. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vdCdfNormInv(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsLGamma (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vsLGamma. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vsLGamma(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdLGamma (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vdLGamma. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vdLGamma(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsLGamma (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vmsLGamma. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vmsLGamma(n, jni_arg2, jni_arg3, arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdLGamma (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vmdLGamma. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vmdLGamma(n, jni_arg2, jni_arg3, arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsTGamma (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vsTGamma. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vsTGamma(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdTGamma (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vdTGamma. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vdTGamma(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsTGamma (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vmsTGamma. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vmsTGamma(n, jni_arg2, jni_arg3, arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdTGamma (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vmdTGamma. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vmdTGamma(n, jni_arg2, jni_arg3, arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsAtan2 (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jfloatArray arg4){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jfloat * jni_arg4 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vsAtan2. Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vsAtan2(n, jni_arg2, jni_arg3, jni_arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdAtan2 (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jdoubleArray arg4){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jdouble * jni_arg4 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vdAtan2. Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vdAtan2(n, jni_arg2, jni_arg3, jni_arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsAtan2 (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jfloatArray arg4, jlong arg5){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jfloat * jni_arg4 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vmsAtan2. Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vmsAtan2(n, jni_arg2, jni_arg3, jni_arg4, arg5);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdAtan2 (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jdoubleArray arg4, jlong arg5){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jdouble * jni_arg4 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vmdAtan2. Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vmdAtan2(n, jni_arg2, jni_arg3, jni_arg4, arg5);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsMul (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jfloatArray arg4){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jfloat * jni_arg4 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vsMul. Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vsMul(n, jni_arg2, jni_arg3, jni_arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdMul (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jdoubleArray arg4){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jdouble * jni_arg4 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vdMul. Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vdMul(n, jni_arg2, jni_arg3, jni_arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsDiv (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jfloatArray arg4){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jfloat * jni_arg4 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vsDiv. Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vsDiv(n, jni_arg2, jni_arg3, jni_arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdDiv (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jdoubleArray arg4){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jdouble * jni_arg4 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vdDiv. Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vdDiv(n, jni_arg2, jni_arg3, jni_arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsPow (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jfloatArray arg4){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jfloat * jni_arg4 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vsPow. Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vsPow(n, jni_arg2, jni_arg3, jni_arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdPow (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jdoubleArray arg4){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jdouble * jni_arg4 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vdPow. Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vdPow(n, jni_arg2, jni_arg3, jni_arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsPow (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jfloatArray arg4, jlong arg5){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jfloat * jni_arg4 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vmsPow. Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vmsPow(n, jni_arg2, jni_arg3, jni_arg4, arg5);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdPow (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jdoubleArray arg4, jlong arg5){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jdouble * jni_arg4 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vmdPow. Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vmdPow(n, jni_arg2, jni_arg3, jni_arg4, arg5);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsPow3o2 (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vsPow3o2. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vsPow3o2(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdPow3o2 (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vdPow3o2. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vdPow3o2(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsPow3o2 (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vmsPow3o2. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vmsPow3o2(n, jni_arg2, jni_arg3, arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdPow3o2 (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vmdPow3o2. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vmdPow3o2(n, jni_arg2, jni_arg3, arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsPow2o3 (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vsPow2o3. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vsPow2o3(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdPow2o3 (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vdPow2o3. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vdPow2o3(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsPow2o3 (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jlong arg4){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vmsPow2o3. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vmsPow2o3(n, jni_arg2, jni_arg3, arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdPow2o3 (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jlong arg4){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vmdPow2o3. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vmdPow2o3(n, jni_arg2, jni_arg3, arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsPowx (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloat arg3, jfloatArray arg4){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg4 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vsPowx (arg3 is passed through as a scalar). arg4 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vsPowx(n, jni_arg2, arg3, jni_arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdPowx (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdouble arg3, jdoubleArray arg4){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg4 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vdPowx (arg3 is passed through as a scalar). arg4 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vdPowx(n, jni_arg2, arg3, jni_arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsPowx (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloat arg3, jfloatArray arg4, jlong arg5){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg4 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vmsPowx (arg3 is passed through as a scalar). arg4 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vmsPowx(n, jni_arg2, arg3, jni_arg4, arg5);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdPowx (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdouble arg3, jdoubleArray arg4, jlong arg5){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg4 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vmdPowx (arg3 is passed through as a scalar). arg4 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vmdPowx(n, jni_arg2, arg3, jni_arg4, arg5);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsSinCos (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jfloatArray arg4){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jfloat * jni_arg4 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vsSinCos. Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vsSinCos(n, jni_arg2, jni_arg3, jni_arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdSinCos (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jdoubleArray arg4){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jdouble * jni_arg4 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vdSinCos. Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vdSinCos(n, jni_arg2, jni_arg3, jni_arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmsSinCos (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jfloatArray arg4, jlong arg5){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jfloat * jni_arg4 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vmsSinCos. Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vmsSinCos(n, jni_arg2, jni_arg3, jni_arg4, arg5);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vmdSinCos (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jdoubleArray arg4, jlong arg5){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jdouble * jni_arg4 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vmdSinCos. Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vmdSinCos(n, jni_arg2, jni_arg3, jni_arg4, arg5);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsLinearFrac (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jfloat arg4, jfloat arg5, jfloat arg6, jfloat arg7, jfloatArray arg8){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jfloat * jni_arg8 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg8, JNI_FALSE) : 0;
+	/* JNI bridge to vsLinearFrac (arg4..arg7 are passed through as scalars). Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg8) vsLinearFrac(n, jni_arg2, jni_arg3, arg4, arg5, arg6, arg7, jni_arg8);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg8) (*env)->ReleasePrimitiveArrayCritical(env, arg8, jni_arg8, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdLinearFrac (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jdouble arg4, jdouble arg5, jdouble arg6, jdouble arg7, jdoubleArray arg8){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jdouble * jni_arg8 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg8, JNI_FALSE) : 0;
+	/* JNI bridge to vdLinearFrac (arg4..arg7 are passed through as scalars). Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg8) vdLinearFrac(n, jni_arg2, jni_arg3, arg4, arg5, arg6, arg7, jni_arg8);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg8) (*env)->ReleasePrimitiveArrayCritical(env, arg8, jni_arg8, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsCeil (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vsCeil. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vsCeil(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdCeil (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vdCeil. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vdCeil(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsFloor (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vsFloor. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vsFloor(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdFloor (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vdFloor. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vdFloor(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsModf (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jfloatArray arg4){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jfloat * jni_arg4 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vsModf. Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vsModf(n, jni_arg2, jni_arg3, jni_arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdModf (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jdoubleArray arg4){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	jdouble * jni_arg4 = jni_arg3 ? (*env)->GetPrimitiveArrayCritical(env, arg4, JNI_FALSE) : 0;
+	/* JNI bridge to vdModf. Each array is pinned only if the previous pin succeeded; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg4) vdModf(n, jni_arg2, jni_arg3, jni_arg4);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+	if (jni_arg4) (*env)->ReleasePrimitiveArrayCritical(env, arg4, jni_arg4, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsNearbyInt (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vsNearbyInt. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vsNearbyInt(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdNearbyInt (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	jdouble * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jdouble * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vdNearbyInt. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vdNearbyInt(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsRint (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	jfloat * jni_arg2 = (*env)->GetPrimitiveArrayCritical(env, arg2, JNI_FALSE);
+	jfloat * jni_arg3 = jni_arg2 ? (*env)->GetPrimitiveArrayCritical(env, arg3, JNI_FALSE) : 0;
+	/* JNI bridge to vsRint. arg3 is pinned only if arg2 was; a failed (null) pin skips the native call instead of handing it a null buffer. */
+	if (jni_arg3) vsRint(n, jni_arg2, jni_arg3);
+	/* Unpin only what was acquired; mode 0 writes back any copy and frees it. */
+	if (jni_arg2) (*env)->ReleasePrimitiveArrayCritical(env, arg2, jni_arg2, 0);
+	if (jni_arg3) (*env)->ReleasePrimitiveArrayCritical(env, arg3, jni_arg3, 0);
+}
+
+/* JNI entry point for VML.vdRint: passes the elements of arg2 and arg3 to MKL vdRint(n, ...). */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdRint (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	jdouble * a = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * y = (*env)->GetPrimitiveArrayCritical(env, arg3, NULL);
+	vdRint(n, a, y);	/* sole call made while both arrays are held */
+	/* Release in acquisition order, mode 0 for each array. */
+	(*env)->ReleasePrimitiveArrayCritical(env, arg2, a, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg3, y, 0);
+}
+
+/* JNI entry point for VML.vsRound: passes the elements of arg2 and arg3 to MKL vsRound(n, ...). */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsRound (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	jfloat * a = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * y = (*env)->GetPrimitiveArrayCritical(env, arg3, NULL);
+	vsRound(n, a, y);	/* sole call made while both arrays are held */
+	/* Release in acquisition order, mode 0 for each array. */
+	(*env)->ReleasePrimitiveArrayCritical(env, arg2, a, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg3, y, 0);
+}
+
+/* JNI entry point for VML.vdRound: passes the elements of arg2 and arg3 to MKL vdRound(n, ...). */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdRound (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	jdouble * a = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * y = (*env)->GetPrimitiveArrayCritical(env, arg3, NULL);
+	vdRound(n, a, y);	/* sole call made while both arrays are held */
+	/* Release in acquisition order, mode 0 for each array. */
+	(*env)->ReleasePrimitiveArrayCritical(env, arg2, a, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg3, y, 0);
+}
+
+/* JNI entry point for VML.vsTrunc: passes the elements of arg2 and arg3 to MKL vsTrunc(n, ...). */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsTrunc (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3){
+	jfloat * a = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * y = (*env)->GetPrimitiveArrayCritical(env, arg3, NULL);
+	vsTrunc(n, a, y);	/* sole call made while both arrays are held */
+	/* Release in acquisition order, mode 0 for each array. */
+	(*env)->ReleasePrimitiveArrayCritical(env, arg2, a, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg3, y, 0);
+}
+
+/* JNI entry point for VML.vdTrunc: passes the elements of arg2 and arg3 to MKL vdTrunc(n, ...). */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdTrunc (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3){
+	jdouble * a = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * y = (*env)->GetPrimitiveArrayCritical(env, arg3, NULL);
+	vdTrunc(n, a, y);	/* sole call made while both arrays are held */
+	/* Release in acquisition order, mode 0 for each array. */
+	(*env)->ReleasePrimitiveArrayCritical(env, arg2, a, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg3, y, 0);
+}
+
+/* JNI entry point for VML.vsPackI: calls MKL vsPackI with the elements of arg2 and arg4; arg3 is passed by value. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsPackI (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jint arg3, jfloatArray arg4){
+	jfloat * a = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * y = (*env)->GetPrimitiveArrayCritical(env, arg4, NULL);
+	vsPackI(n, a, arg3, y);	/* sole call made while both arrays are held */
+	/* Release in acquisition order, mode 0 for each array. */
+	(*env)->ReleasePrimitiveArrayCritical(env, arg2, a, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg4, y, 0);
+}
+
+/* JNI entry point for VML.vdPackI: calls MKL vdPackI with the elements of arg2 and arg4; arg3 is passed by value. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdPackI (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jint arg3, jdoubleArray arg4){
+	jdouble * a = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * y = (*env)->GetPrimitiveArrayCritical(env, arg4, NULL);
+	vdPackI(n, a, arg3, y);	/* sole call made while both arrays are held */
+	/* Release in acquisition order, mode 0 for each array. */
+	(*env)->ReleasePrimitiveArrayCritical(env, arg2, a, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg4, y, 0);
+}
+
+/* JNI entry point for VML.vsPackV: passes the elements of arg2, arg3 (int) and arg4 to MKL vsPackV(n, ...). */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsPackV (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jintArray arg3, jfloatArray arg4){
+	jfloat * a = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jint * ia = (*env)->GetPrimitiveArrayCritical(env, arg3, NULL);
+	jfloat * y = (*env)->GetPrimitiveArrayCritical(env, arg4, NULL);
+	vsPackV(n, a, ia, y);	/* sole call made while the three arrays are held */
+	/* Release in acquisition order, mode 0 for each array. */
+	(*env)->ReleasePrimitiveArrayCritical(env, arg2, a, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg3, ia, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg4, y, 0);
+}
+
+/* JNI entry point for VML.vdPackV: passes the elements of arg2, arg3 (int) and arg4 to MKL vdPackV(n, ...). */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdPackV (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jintArray arg3, jdoubleArray arg4){
+	jdouble * a = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jint * ia = (*env)->GetPrimitiveArrayCritical(env, arg3, NULL);
+	jdouble * y = (*env)->GetPrimitiveArrayCritical(env, arg4, NULL);
+	vdPackV(n, a, ia, y);	/* sole call made while the three arrays are held */
+	/* Release in acquisition order, mode 0 for each array. */
+	(*env)->ReleasePrimitiveArrayCritical(env, arg2, a, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg3, ia, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg4, y, 0);
+}
+
+/* JNI entry point for VML.vsPackM: passes the elements of arg2, arg3 (int) and arg4 to MKL vsPackM(n, ...). */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsPackM (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jintArray arg3, jfloatArray arg4){
+	jfloat * a = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jint * ma = (*env)->GetPrimitiveArrayCritical(env, arg3, NULL);
+	jfloat * y = (*env)->GetPrimitiveArrayCritical(env, arg4, NULL);
+	vsPackM(n, a, ma, y);	/* sole call made while the three arrays are held */
+	/* Release in acquisition order, mode 0 for each array. */
+	(*env)->ReleasePrimitiveArrayCritical(env, arg2, a, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg3, ma, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg4, y, 0);
+}
+
+/* JNI entry point for VML.vdPackM: passes the elements of arg2, arg3 (int) and arg4 to MKL vdPackM(n, ...). */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdPackM (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jintArray arg3, jdoubleArray arg4){
+	jdouble * a = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jint * ma = (*env)->GetPrimitiveArrayCritical(env, arg3, NULL);
+	jdouble * y = (*env)->GetPrimitiveArrayCritical(env, arg4, NULL);
+	vdPackM(n, a, ma, y);	/* sole call made while the three arrays are held */
+	/* Release in acquisition order, mode 0 for each array. */
+	(*env)->ReleasePrimitiveArrayCritical(env, arg2, a, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg3, ma, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg4, y, 0);
+}
+
+/* JNI entry point for VML.vsUnpackI: calls MKL vsUnpackI with the elements of arg2 and arg3; arg4 is passed by value. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsUnpackI (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jint arg4){
+	jfloat * a = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * y = (*env)->GetPrimitiveArrayCritical(env, arg3, NULL);
+	vsUnpackI(n, a, y, arg4);	/* sole call made while both arrays are held */
+	/* Release in acquisition order, mode 0 for each array. */
+	(*env)->ReleasePrimitiveArrayCritical(env, arg2, a, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg3, y, 0);
+}
+
+/* JNI entry point for VML.vdUnpackI: calls MKL vdUnpackI with the elements of arg2 and arg3; arg4 is passed by value. */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdUnpackI (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jint arg4){
+	jdouble * a = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * y = (*env)->GetPrimitiveArrayCritical(env, arg3, NULL);
+	vdUnpackI(n, a, y, arg4);	/* sole call made while both arrays are held */
+	/* Release in acquisition order, mode 0 for each array. */
+	(*env)->ReleasePrimitiveArrayCritical(env, arg2, a, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg3, y, 0);
+}
+
+/* JNI entry point for VML.vsUnpackV: passes the elements of arg2, arg3 and arg4 (int) to MKL vsUnpackV(n, ...). */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsUnpackV (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jintArray arg4){
+	jfloat * a = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * y = (*env)->GetPrimitiveArrayCritical(env, arg3, NULL);
+	jint * iy = (*env)->GetPrimitiveArrayCritical(env, arg4, NULL);
+	vsUnpackV(n, a, y, iy);	/* sole call made while the three arrays are held */
+	/* Release in acquisition order, mode 0 for each array. */
+	(*env)->ReleasePrimitiveArrayCritical(env, arg2, a, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg3, y, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg4, iy, 0);
+}
+
+/* JNI entry point for VML.vdUnpackV: passes the elements of arg2, arg3 and arg4 (int) to MKL vdUnpackV(n, ...). */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdUnpackV (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jintArray arg4){
+	jdouble * a = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * y = (*env)->GetPrimitiveArrayCritical(env, arg3, NULL);
+	jint * iy = (*env)->GetPrimitiveArrayCritical(env, arg4, NULL);
+	vdUnpackV(n, a, y, iy);	/* sole call made while the three arrays are held */
+	/* Release in acquisition order, mode 0 for each array. */
+	(*env)->ReleasePrimitiveArrayCritical(env, arg2, a, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg3, y, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg4, iy, 0);
+}
+
+/* JNI entry point for VML.vsUnpackM: passes the elements of arg2, arg3 and arg4 (int) to MKL vsUnpackM(n, ...). */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vsUnpackM (JNIEnv * env, jobject calling_obj, jint n, jfloatArray arg2, jfloatArray arg3, jintArray arg4){
+	jfloat * a = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jfloat * y = (*env)->GetPrimitiveArrayCritical(env, arg3, NULL);
+	jint * my = (*env)->GetPrimitiveArrayCritical(env, arg4, NULL);
+	vsUnpackM(n, a, y, my);	/* sole call made while the three arrays are held */
+	/* Release in acquisition order, mode 0 for each array. */
+	(*env)->ReleasePrimitiveArrayCritical(env, arg2, a, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg3, y, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg4, my, 0);
+}
+
+/* JNI entry point for VML.vdUnpackM: passes the elements of arg2, arg3 and arg4 (int) to MKL vdUnpackM(n, ...). */
+JNIEXPORT void JNICALL Java_edu_berkeley_bid_VML_vdUnpackM (JNIEnv * env, jobject calling_obj, jint n, jdoubleArray arg2, jdoubleArray arg3, jintArray arg4){
+	jdouble * a = (*env)->GetPrimitiveArrayCritical(env, arg2, NULL);
+	jdouble * y = (*env)->GetPrimitiveArrayCritical(env, arg3, NULL);
+	jint * my = (*env)->GetPrimitiveArrayCritical(env, arg4, NULL);
+	vdUnpackM(n, a, y, my);	/* sole call made while the three arrays are held */
+	/* Release in acquisition order, mode 0 for each array. */
+	(*env)->ReleasePrimitiveArrayCritical(env, arg2, a, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg3, y, 0);
+	(*env)->ReleasePrimitiveArrayCritical(env, arg4, my, 0);
+}
+
+/*
+ * JNI entry point for VML.vmlSetMode.
+ * Forwards n to MKL vmlSetMode() and returns that call's result to Java as a jint.
+ * env and calling_obj are not used.
+ */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VML_vmlSetMode (JNIEnv * env, jobject calling_obj, jint n){
+	return vmlSetMode(n);	/* converted to jint by the return, as the former temporary did */
+}
+
+/*
+ * JNI entry point for VML.vmlGetMode.
+ * Calls MKL vmlGetMode() and returns that call's result to Java as a jint.
+ * env and calling_obj are not used.
+ */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VML_vmlGetMode (JNIEnv * env, jobject calling_obj){
+	return vmlGetMode();	/* converted to jint by the return, as the former temporary did */
+}
+
diff --git a/jni/src/BIDMat_VSL.c b/jni/src/BIDMat_VSL.c
new file mode 100755
index 00000000..ef1f022c
--- /dev/null
+++ b/jni/src/BIDMat_VSL.c
@@ -0,0 +1,469 @@
+
+#include "mkl_vsl.h"
+
+#include <jni.h>
+
+#include <stdlib.h>
+#include <assert.h>
+
+union VoidLong {   /* lets a native pointer be read as a Java long and back (see void2long / long2void) */
+    jlong l;       /* integer view: the value kept in the Java-side "handle" field */
+    void* p;       /* pointer view: the native pointer itself */
+};
+
+/* Returns ptr read back as a jlong through union VoidLong. */
+static jlong void2long(void* ptr) {
+    union VoidLong bits = { (jlong) 0 };   /* zero l first, so bytes not covered by p read back as 0 */
+    bits.p = ptr;
+    return bits.l;
+}
+
+static void* long2void(jlong l) {   /* reverse of void2long: stores l in the union and reads it back as a pointer */
+    union VoidLong bits;
+    bits.l = l;
+    return bits.p;
+}
+
+static VSLStreamStatePtr getStream(JNIEnv *env, jclass clazz, jobject jstream)
+{   /* Returns the stream pointer kept in jstream's long field "handle" (field looked up on clazz), or NULL if that lookup fails. */
+    jfieldID handle_id = (*env)->GetFieldID(env, clazz, "handle", "J");
+    /* GetFieldID yields NULL when the lookup fails; such an ID must never be handed to GetLongField. */
+    if (handle_id == NULL) return NULL;
+    return long2void((*env)->GetLongField(env, jstream, handle_id));
+}
+
+static void setStream(JNIEnv *env, jclass clazz, jobject jstream, VSLStreamStatePtr streamp)
+{   /* Stores streamp, converted by void2long, in jstream's long field "handle" (field looked up on clazz). */
+    jfieldID handle_id = (*env)->GetFieldID(env, clazz, "handle", "J");
+    /* GetFieldID yields NULL when the lookup fails; skip the store rather than hand that ID to SetLongField. */
+    if (handle_id != NULL) (*env)->SetLongField(env, jstream, handle_id, void2long(streamp));
+}
+
+
+/* JNI entry point for VSL.vslNewStream: creates an MKL stream from brng/seed and records its pointer in jstream's "handle" field. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vslNewStream
+  (JNIEnv *env, jclass clazz, jobject jstream, jint brng, jint seed)
+{
+    VSLStreamStatePtr streamp = NULL;   /* if a failed vslNewStream leaves it untouched, the handle gets 0, not an indeterminate value */
+    int status = vslNewStream(&streamp, brng, seed);
+    setStream(env, clazz, jstream, streamp);
+    return (jint)status;   /* value returned by vslNewStream, handed back unchanged */
+}
+
+/* JNI entry point for VSL.vslDeleteStream: passes the stream referenced by jstream's "handle" field to MKL vslDeleteStream. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vslDeleteStream
+  (JNIEnv *env, jclass clazz, jobject jstream)
+{
+    VSLStreamStatePtr rng = getStream(env, clazz, jstream);
+    int rc = vslDeleteStream(&rng);
+    setStream(env, clazz, jstream, rng);   /* write back whatever vslDeleteStream left in rng */
+    return (jint)rc;
+}
+
+/* JNI entry point for VSL.vdRngCauchy: runs MKL vdRngCauchy on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vdRngCauchy
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jdoubleArray j_r, jdouble a, jdouble b) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jdouble * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vdRngCauchy(method, rng, n, buf, a, b);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vdRngUniform: runs MKL vdRngUniform on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vdRngUniform
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jdoubleArray j_r, jdouble a, jdouble b) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jdouble * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vdRngUniform(method, rng, n, buf, a, b);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vdRngGaussian: runs MKL vdRngGaussian on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vdRngGaussian
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jdoubleArray j_r, jdouble a, jdouble b) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jdouble * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vdRngGaussian(method, rng, n, buf, a, b);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vdRngGaussianMV: runs MKL vdRngGaussianMV on the elements of j_r, j_a and j_b
+ * with the stream read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vdRngGaussianMV
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jdoubleArray j_r, jint d, jint m, jdoubleArray j_a, jdoubleArray j_b) {
+    /* The stream is resolved first, so no JNI field access happens while the arrays are held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jdouble * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jdouble * a_elems = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jdouble * b_elems = (*env)->GetPrimitiveArrayCritical(env, j_b, NULL);
+    jint status = vdRngGaussianMV(method, rng, n, buf, d, m, a_elems, b_elems);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_a, a_elems, 0);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_b, b_elems, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vdRngExponential: runs MKL vdRngExponential on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vdRngExponential
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jdoubleArray j_r, jdouble a, jdouble b) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jdouble * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vdRngExponential(method, rng, n, buf, a, b);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vdRngLaplace: runs MKL vdRngLaplace on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vdRngLaplace
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jdoubleArray j_r, jdouble a, jdouble b) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jdouble * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vdRngLaplace(method, rng, n, buf, a, b);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vdRngWeibull: runs MKL vdRngWeibull on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vdRngWeibull
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jdoubleArray j_r, jdouble a, jdouble b, jdouble c) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jdouble * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vdRngWeibull(method, rng, n, buf, a, b, c);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vdRngRayleigh: runs MKL vdRngRayleigh on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vdRngRayleigh
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jdoubleArray j_r, jdouble a, jdouble b) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jdouble * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vdRngRayleigh(method, rng, n, buf, a, b);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vdRngLognormal: runs MKL vdRngLognormal on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vdRngLognormal
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jdoubleArray j_r, jdouble a, jdouble b, jdouble c, jdouble d) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jdouble * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vdRngLognormal(method, rng, n, buf, a, b, c, d);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vdRngGumbel: runs MKL vdRngGumbel on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vdRngGumbel
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jdoubleArray j_r, jdouble a, jdouble b) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jdouble * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vdRngGumbel(method, rng, n, buf, a, b);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vdRngGamma: runs MKL vdRngGamma on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vdRngGamma
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jdoubleArray j_r, jdouble a, jdouble b, jdouble c) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jdouble * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vdRngGamma(method, rng, n, buf, a, b, c);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vdRngBeta: runs MKL vdRngBeta on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vdRngBeta
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jdoubleArray j_r, jdouble a, jdouble b, jdouble c, jdouble d) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jdouble * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vdRngBeta(method, rng, n, buf, a, b, c, d);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.viRngBernoulli: runs MKL viRngBernoulli on the int elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_viRngBernoulli
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jintArray j_r, jdouble a) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jint * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = viRngBernoulli(method, rng, n, buf, a);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.viRngUniform: runs MKL viRngUniform on the int elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_viRngUniform
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jintArray j_r, jdouble a, jdouble b) {
+    /* NOTE(review): a and b arrive as jdouble; if MKL declares integer bounds they are converted implicitly - confirm against the Java declaration. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jint * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = viRngUniform(method, rng, n, buf, a, b);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.viRngUniformBits: runs MKL viRngUniformBits on the elements of j_r (passed as
+ * unsigned int *) with the stream read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_viRngUniformBits
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jintArray j_r) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jint * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = viRngUniformBits(method, rng, n, (unsigned int *)buf);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.viRngGeometric: runs MKL viRngGeometric on the int elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_viRngGeometric
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jintArray j_r, jdouble p) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jint * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = viRngGeometric(method, rng, n, buf, p);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.viRngBinomial: runs MKL viRngBinomial on the int elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_viRngBinomial
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jintArray j_r, jdouble m, jdouble p) {
+    /* NOTE(review): m arrives as jdouble; if MKL declares an integer trial count it is converted implicitly - confirm against the Java declaration. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jint * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = viRngBinomial(method, rng, n, buf, m, p);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.viRngHypergeometric: runs MKL viRngHypergeometric on the int elements of j_r with
+ * the stream read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_viRngHypergeometric
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jintArray j_r, jint a, jint b, jint c) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jint * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = viRngHypergeometric(method, rng, n, buf, a, b, c);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.viRngNegbinomial: runs MKL viRngNegbinomial on the int elements of j_r with the
+ * stream read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_viRngNegbinomial
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jintArray j_r, jdouble a, jdouble b) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jint * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = viRngNegbinomial(method, rng, n, buf, a, b);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.viRngPoisson: runs MKL viRngPoisson on the int elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_viRngPoisson
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jintArray j_r, jdouble a) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jint * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = viRngPoisson(method, rng, n, buf, a);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.viRngPoissonV: runs MKL viRngPoissonV on the elements of j_r (int) and j_a (double)
+ * with the stream read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_viRngPoissonV
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jintArray j_r, jdoubleArray j_a) {
+    /* The stream is resolved first, so no JNI field access happens while the arrays are held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jint * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jdouble * a_elems = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jint status = viRngPoissonV(method, rng, n, buf, a_elems);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_a, a_elems, 0);
+    return status;
+}
+
+
+/* JNI entry point for VSL.vsRngCauchy: runs MKL vsRngCauchy on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vsRngCauchy
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jfloatArray j_r, jfloat a, jfloat b) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jfloat * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vsRngCauchy(method, rng, n, buf, a, b);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vsRngUniform: runs MKL vsRngUniform on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vsRngUniform
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jfloatArray j_r, jfloat a, jfloat b) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jfloat * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vsRngUniform(method, rng, n, buf, a, b);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vsRngGaussian: runs MKL vsRngGaussian on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vsRngGaussian
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jfloatArray j_r, jfloat a, jfloat b) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jfloat * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vsRngGaussian(method, rng, n, buf, a, b);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vsRngGaussianMV: runs MKL vsRngGaussianMV on the elements of j_r, j_a and j_b
+ * with the stream read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vsRngGaussianMV
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jfloatArray j_r, jint d, jint m, jfloatArray j_a, jfloatArray j_b) {
+    /* The stream is resolved first, so no JNI field access happens while the arrays are held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jfloat * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jfloat * a_elems = (*env)->GetPrimitiveArrayCritical(env, j_a, NULL);
+    jfloat * b_elems = (*env)->GetPrimitiveArrayCritical(env, j_b, NULL);
+    jint status = vsRngGaussianMV(method, rng, n, buf, d, m, a_elems, b_elems);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_a, a_elems, 0);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_b, b_elems, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vsRngExponential: runs MKL vsRngExponential on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vsRngExponential
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jfloatArray j_r, jfloat a, jfloat b) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jfloat * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vsRngExponential(method, rng, n, buf, a, b);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vsRngLaplace: runs MKL vsRngLaplace on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vsRngLaplace
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jfloatArray j_r, jfloat a, jfloat b) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jfloat * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vsRngLaplace(method, rng, n, buf, a, b);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vsRngWeibull: runs MKL vsRngWeibull on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vsRngWeibull
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jfloatArray j_r, jfloat a, jfloat b, jfloat c) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jfloat * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vsRngWeibull(method, rng, n, buf, a, b, c);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vsRngRayleigh: runs MKL vsRngRayleigh on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vsRngRayleigh
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jfloatArray j_r, jfloat a, jfloat b) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jfloat * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vsRngRayleigh(method, rng, n, buf, a, b);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vsRngLognormal: runs MKL vsRngLognormal on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vsRngLognormal
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jfloatArray j_r, jfloat a, jfloat b, jfloat c, jfloat d) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jfloat * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vsRngLognormal(method, rng, n, buf, a, b, c, d);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vsRngGumbel: runs MKL vsRngGumbel on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vsRngGumbel
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jfloatArray j_r, jfloat a, jfloat b) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jfloat * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vsRngGumbel(method, rng, n, buf, a, b);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vsRngGamma: runs MKL vsRngGamma on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vsRngGamma
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jfloatArray j_r, jfloat a, jfloat b, jfloat c) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jfloat * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vsRngGamma(method, rng, n, buf, a, b, c);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
+/* JNI entry point for VSL.vsRngBeta: runs MKL vsRngBeta on the elements of j_r with the stream
+ * read from j_stream's "handle" field, and returns the value MKL returns. */
+JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vsRngBeta
+(JNIEnv * env, jobject calling_obj, jint method, jobject j_stream, jint n, jfloatArray j_r, jfloat a, jfloat b, jfloat c, jfloat d) {
+    /* The stream is resolved first, so no JNI field access happens while j_r is held. */
+    VSLStreamStatePtr rng = getStream(env, calling_obj, j_stream);
+    jfloat * buf = (*env)->GetPrimitiveArrayCritical(env, j_r, NULL);
+    jint status = vsRngBeta(method, rng, n, buf, a, b, c, d);
+    (*env)->ReleasePrimitiveArrayCritical(env, j_r, buf, 0);
+    return status;
+}
+
diff --git a/jni/src/Copyright.txt b/jni/src/Copyright.txt
new file mode 100755
index 00000000..21326596
--- /dev/null
+++ b/jni/src/Copyright.txt
@@ -0,0 +1,25 @@
+Copyright (c) 2012, Regents of the University of California
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the <organization> nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/jni/src/Makefile b/jni/src/Makefile
new file mode 100755
index 00000000..677710c1
--- /dev/null
+++ b/jni/src/Makefile
@@ -0,0 +1,36 @@
+include Makefile.incl
+# NOTE(review): OBJ, LIBPREPEND, LIBAPPEND, SUBLIB and the tool/flag variables are not set in this file - presumably Makefile.incl defines them.
+MKL_OBJS=BIDMat_CBLAS.$(OBJ) BIDMat_UTILS.$(OBJ) BIDMat_SPBLAS.$(OBJ) BIDMat_LAPACK.$(OBJ) \
+         BIDMat_VML.$(OBJ) BIDMat_VSL.$(OBJ)
+
+CUDA_OBJS=BIDMat_CUMAT.$(OBJ) MatKernel.$(OBJ) 
+# all/install/clean/distclean name actions, not files; .PHONY keeps a stray file of the same name from masking them.
+.SUFFIXES: .$(OBJ) .c .cpp .cu
+.PHONY: all install clean distclean
+all: $(LIBPREPEND)bidmatmkl$(LIBAPPEND) $(LIBPREPEND)bidmatcuda$(LIBAPPEND)
+# bidmatmkl library: linked from MKL_OBJS with MKL_LIBS.
+$(LIBPREPEND)bidmatmkl$(LIBAPPEND): $(MKL_OBJS)
+	$(LD) $(LDFLAGS) $(MKL_OBJS) $(MKL_LIBS) $(OUTFLG)$@
+# bidmatcuda library: linked from CUDA_OBJS with CUDA_LIBS.
+$(LIBPREPEND)bidmatcuda$(LIBAPPEND): $(CUDA_OBJS)
+	$(GLD) $(LDFLAGS) $(CUDA_OBJS) $(CUDA_LIBS) $(OUTFLG)$@
+# Object files are built from .c, .cpp and .cu sources by the three pattern rules below.
+%.$(OBJ) : %.c
+	$(CC) $(CPPFLAGS) $(LAPACK_INCLUDES) $(CFLAGS) $*.c
+
+%.$(OBJ) : %.cpp
+	$(GCC) $(CPPFLAGS) $(LAPACK_INCLUDES) $(CFLAGS) $*.cpp
+
+%.$(OBJ) : %.cu
+	$(NVCC) $(NVCCFLAGS) $*.cu
+# install copies both libraries into ../../lib/$(SUBLIB).
+install: all
+	cp $(LIBPREPEND)bidmatmkl$(LIBAPPEND)  ../../lib/$(SUBLIB)
+	cp $(LIBPREPEND)bidmatcuda$(LIBAPPEND) ../../lib/$(SUBLIB)
+# clean removes build outputs.
+clean:
+	rm -f *.$(OBJ) *$(LIBAPPEND) *.pdb *.exp *.lib
+# distclean also removes *.jnilib and Makefile.incl.
+distclean: clean
+	rm -f *$(LIBAPPEND) *.exp *.lib *.jnilib Makefile.incl
+
diff --git a/jni/src/MatKernel.cu b/jni/src/MatKernel.cu
new file mode 100755
index 00000000..4d0f43d2
--- /dev/null
+++ b/jni/src/MatKernel.cu
@@ -0,0 +1,660 @@
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+// Element-wise binary operators on floats.
+// Comparison operators yield 1.0f when the relation holds and 0.0f otherwise.
+__device__ float op_add(float a, float b) { return a + b; }
+__device__ float op_sub(float a, float b) { return a - b; }
+__device__ float op_mul(float a, float b) { return a * b; }
+__device__ float op_div(float a, float b) { return a / b; }
+__device__ float op_gt(float a, float b) { return (a > b) ? 1.0f : 0.0f; }
+__device__ float op_lt(float a, float b) { return (a < b) ? 1.0f : 0.0f; }
+__device__ float op_eq(float a, float b) { return (a == b) ? 1.0f : 0.0f; }
+__device__ float op_ge(float a, float b) { return (a >= b) ? 1.0f : 0.0f; }
+__device__ float op_le(float a, float b) { return (a <= b) ? 1.0f : 0.0f; }
+__device__ float op_ne(float a, float b) { return (a != b) ? 1.0f : 0.0f; }
+__device__ float op_max(float a, float b) { return max(a, b); }
+__device__ float op_min(float a, float b) { return min(a, b); }
+
+// Element-wise binary operators on ints.
+// Comparison operators yield 1 when the relation holds and 0 otherwise
+// (the bool result converts to exactly those two values).
+__device__ int iop_add(int a, int b) { return a + b; }
+__device__ int iop_sub(int a, int b) { return a - b; }
+__device__ int iop_mul(int a, int b) { return a * b; }
+__device__ int iop_div(int a, int b) { return a / b; }
+__device__ int iop_gt(int a, int b) { return a > b; }
+__device__ int iop_lt(int a, int b) { return a < b; }
+__device__ int iop_eq(int a, int b) { return a == b; }
+__device__ int iop_ge(int a, int b) { return a >= b; }
+__device__ int iop_le(int a, int b) { return a <= b; }
+__device__ int iop_ne(int a, int b) { return a != b; }
+
+// Pointer types for the binary float and binary int device operators.
+typedef float (*optype)(float,float);
+typedef int (*ioptype)(int,int);
+
+// Dispatch table for float binary operators. Kernels select an entry with
+// operators[opn], so the position of each entry is the operator code:
+// 0 add, 1 sub, 2 mul, 3 div, 4 gt, 5 lt, 6 eq, 7 ge, 8 le, 9 ne, 10 max, 11 min.
+// Reordering entries changes the meaning of every opn passed by callers.
+// No kernel in this file range-checks opn before indexing.
+__device__ const optype operators[] = {
+    op_add, 
+    op_sub, 
+    op_mul,
+    op_div,
+    op_gt,
+    op_lt,
+    op_eq,
+    op_ge,
+    op_le,
+    op_ne,
+    op_max,
+    op_min};
+
+// Dispatch table for int binary operators, indexed by opn in the int kernels.
+// Codes 0..9 follow the same order as the float table; there are no int
+// max/min entries, so codes 10 and 11 are out of range here.
+__device__ const ioptype ioperators[] = {
+    iop_add, 
+    iop_sub, 
+    iop_mul,
+    iop_div,
+    iop_gt,
+    iop_lt,
+    iop_eq,
+    iop_ge,
+    iop_le,
+    iop_ne};
+
+// Unary float functions. Each is a thin device wrapper so that its address
+// can be stored in the fctns dispatch table.
+__device__ float fn_abs(float a) {return abs(a);}
+__device__ float fn_exp(float a) {return expf(a);}
+// fn_log and fn_ln below are identical; only fn_ln appears in the fctns table.
+__device__ float fn_log(float a) {return logf(a);}
+__device__ float fn_expm1(float a) {return expm1f(a);}
+__device__ float fn_sqrt(float a) {return sqrtf(a);}
+__device__ float fn_ln(float a) {return logf(a);}
+__device__ float fn_log10(float a) {return log10f(a);}
+__device__ float fn_log1p(float a) {return log1pf(a);}
+__device__ float fn_cos(float a) {return cosf(a);}
+__device__ float fn_sin(float a) {return sinf(a);}
+__device__ float fn_tan(float a) {return tanf(a);}
+__device__ float fn_cosh(float a) {return coshf(a);}
+__device__ float fn_sinh(float a) {return sinhf(a);}
+__device__ float fn_tanh(float a) {return tanhf(a);}
+__device__ float fn_acos(float a) {return acosf(a);}
+__device__ float fn_asin(float a) {return asinf(a);}
+__device__ float fn_atan(float a) {return atanf(a);}
+__device__ float fn_acosh(float a) {return acoshf(a);}
+__device__ float fn_asinh(float a) {return asinhf(a);}
+__device__ float fn_atanh(float a) {return atanhf(a);}
+__device__ float fn_erf(float a) {return erff(a);}
+__device__ float fn_erfinv(float a) {return erfinvf(a);}
+__device__ float fn_erfc(float a) {return erfcf(a);}
+__device__ float fn_erfcinv(float a) {return erfcinvf(a);}
+__device__ float fn_gammaln(float a) {return lgammaf(a);}
+__device__ float fn_gamma(float a) {return tgammaf(a);}
+__device__ float fn_ceil(float a) {return ceilf(a);}
+__device__ float fn_floor(float a) {return floorf(a);}
+__device__ float fn_round(float a) {return roundf(a);}
+__device__ float fn_trunc(float a) {return truncf(a);}
+// Sign of a: 1 for positive, -1 for negative, 0 otherwise.
+__device__ float fn_sign(float a) {return (a>0) ? 1.0f : ((a<0) ? -1.0f : 0);}
+__device__ float fn_j0(float a) {return j0f(a);}
+__device__ float fn_j1(float a) {return j1f(a);}
+//__device__ float fn_jn(float a) {return jnf(a);}
+__device__ float fn_y0(float a) {return y0f(a);}
+__device__ float fn_y1(float a) {return y1f(a);}
+//__device__ float fn_yn(float a) {return ynf(a);}
+// Piecewise function: a*a/2 for a < 1, a - 1/2 otherwise (both branches give 1/2 at a == 1).
+__device__ float fn_exppsi(float a) {return (a<1.0f) ? 0.5f*a*a : a-0.5f;}
+
+// Two-argument float functions, stored in the fctns2 dispatch table.
+__device__ float fn_atan2(float a, float b) {return atan2f(a, b);}
+__device__ float fn_pow(float a, float b) {return powf(a, b);}
+
+// Pointer type for the unary float device functions.
+typedef float (*fntype)(float);
+
+// Dispatch table for unary functions; __apply_gfun selects fctns[opn], so the
+// position of each entry (0 = abs ... 34 = exppsi) is its function code.
+// Reordering or inserting entries changes the meaning of every opn passed by
+// callers. opn is not range-checked before indexing.
+__device__ const fntype fctns[35] = {
+    fn_abs,
+    fn_exp,
+    fn_expm1,
+    fn_sqrt,
+    fn_ln,
+    fn_log10,
+    fn_log1p,
+    fn_cos,
+    fn_sin,
+    fn_tan,
+    fn_cosh,
+    fn_sinh,
+    fn_tanh,
+    fn_acos,
+    fn_asin,
+    fn_atan,
+    fn_acosh,
+    fn_asinh,
+    fn_atanh,
+    fn_erf,
+    fn_erfinv,
+    fn_erfc,
+    fn_erfcinv,
+    fn_gammaln,
+    fn_gamma,
+    fn_ceil,
+    fn_floor,
+    fn_round,
+    fn_trunc,
+    fn_sign,
+    fn_j0,
+    fn_j1,
+    fn_y0,
+    fn_y1,
+    fn_exppsi};
+
+// Dispatch table for two-argument functions used by __apply_gfun2:
+// 0 = atan2, 1 = pow.
+__device__ const optype fctns2[2] = {
+    fn_atan2,
+    fn_pow};
+
+
+// Unary map kernel: B[k] = fctns[opn](A[k]) for every k in [0, N).
+// Each thread starts at its global index and advances by the total number of
+// launched threads, so any launch size covers all N elements.
+__global__ void __apply_gfun(float *A, float *B, int N, int opn) {
+  const fntype f = fctns[opn];
+  const unsigned int step = blockDim.x * gridDim.x;
+  for (int k = threadIdx.x + blockDim.x * blockIdx.x; k < N; k += step) {
+    B[k] = f(A[k]);
+  }
+}
+
+// Host launcher for __apply_gfun: B[i] = fctns[opn](A[i]) for i in [0, N).
+// A and B are dereferenced by the kernel, so they must be device-accessible.
+// Returns the cudaError_t reported by cudaGetLastError() as an int
+// (0 == cudaSuccess). The launch is asynchronous; nothing is synchronized here.
+int apply_gfun(float *A, float *B, int N, int opn) {
+  // Grow the launch until it covers N: first up to 16 blocks, then up to 1024
+  // threads per block, then more blocks. The product is evaluated in 64 bits
+  // so it cannot overflow for N close to INT_MAX, and the grid stops at 32768
+  // blocks, which stays below the 65535-block x-dimension limit of compute
+  // capability 2.x devices. The kernel strides over the data, so a capped
+  // grid still processes every element.
+  const int maxblocks = 32768;
+  int nthreads = 32;
+  int nblocks = 1;
+  while ((long long)nblocks * nthreads < N) {
+    if (nblocks < 16) {
+      nblocks = 2*nblocks;
+    } else if (nthreads < 1024) {
+      nthreads = 2*nthreads;
+    } else if (nblocks < maxblocks) {
+      nblocks = 2*nblocks;
+    } else {
+      break;
+    }
+  }
+  __apply_gfun<<<nblocks,nthreads>>>(A, B, N, opn);
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
+// Binary map kernel: C[k] = fctns2[opn](A[k], B[k]) for every k in [0, N).
+// Threads advance by the total number of launched threads.
+__global__ void __apply_gfun2(float *A, float *B, float *C, int N, int opn) {
+  const optype f = fctns2[opn];
+  const unsigned int step = blockDim.x * gridDim.x;
+  for (int k = threadIdx.x + blockDim.x * blockIdx.x; k < N; k += step) {
+    C[k] = f(A[k], B[k]);
+  }
+}
+
+// Host launcher for __apply_gfun2: C[i] = fctns2[opn](A[i], B[i]) for i in [0, N).
+// A, B and C are dereferenced by the kernel, so they must be device-accessible.
+// Returns the cudaError_t reported by cudaGetLastError() as an int
+// (0 == cudaSuccess). The launch is asynchronous; nothing is synchronized here.
+int apply_gfun2(float *A, float *B, float *C, int N, int opn) {
+  // Grow the launch until it covers N: first up to 16 blocks, then up to 1024
+  // threads per block, then more blocks. The product is evaluated in 64 bits
+  // so it cannot overflow for N close to INT_MAX, and the grid stops at 32768
+  // blocks, which stays below the 65535-block x-dimension limit of compute
+  // capability 2.x devices. The kernel strides over the data, so a capped
+  // grid still processes every element.
+  const int maxblocks = 32768;
+  int nthreads = 32;
+  int nblocks = 1;
+  while ((long long)nblocks * nthreads < N) {
+    if (nblocks < 16) {
+      nblocks = 2*nblocks;
+    } else if (nthreads < 1024) {
+      nthreads = 2*nthreads;
+    } else if (nblocks < maxblocks) {
+      nblocks = 2*nblocks;
+    } else {
+      break;
+    }
+  }
+  __apply_gfun2<<<nblocks,nthreads>>>(A, B, C, N, opn);
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
+// Same-shape element-wise kernel: C[k] = op(A[k], B[k]) for k in [0, N),
+// with op = operators[opn].
+__global__ void __apply_full(float *A, float *B, float *C, int N, int opn) {
+  const optype f = operators[opn];
+  const unsigned int step = blockDim.x * gridDim.x;
+  for (int k = threadIdx.x + blockDim.x * blockIdx.x; k < N; k += step) {
+    C[k] = f(A[k], B[k]);
+  }
+}
+
+// Broadcast kernel, right operand indexed by k % nrows:
+// C[k] = op(A[k], B[k % nrows]) for k in [0, nrows*ncols).
+// The host dispatcher selects it when B has the same row count as A and one column.
+__global__ void __apply_right_col(float *A, float *B, float *C, int nrows, int ncols, int opn) {
+  const optype f = operators[opn];
+  const int total = nrows * ncols;
+  const unsigned int step = blockDim.x * gridDim.x;
+  for (int k = threadIdx.x + blockDim.x * blockIdx.x; k < total; k += step) {
+    C[k] = f(A[k], B[k % nrows]);
+  }
+}
+
+// Broadcast kernel, right operand indexed by k / nrows:
+// C[k] = op(A[k], B[k / nrows]) for k in [0, nrows*ncols).
+// The host dispatcher selects it when B has the same column count as A and one row.
+__global__ void __apply_right_row(float *A, float *B, float *C, int nrows, int ncols, int opn) {
+  const optype f = operators[opn];
+  const int total = nrows * ncols;
+  const unsigned int step = blockDim.x * gridDim.x;
+  for (int k = threadIdx.x + blockDim.x * blockIdx.x; k < total; k += step) {
+    C[k] = f(A[k], B[k / nrows]);
+  }
+}
+
+// Broadcast kernel, left operand indexed by k % nrows:
+// C[k] = op(A[k % nrows], B[k]) for k in [0, nrows*ncols).
+// The host dispatcher selects it when A has the same row count as B and one column.
+__global__ void __apply_left_col(float *A, float *B, float *C, int nrows, int ncols, int opn) {
+  const optype f = operators[opn];
+  const int total = nrows * ncols;
+  const unsigned int step = blockDim.x * gridDim.x;
+  for (int k = threadIdx.x + blockDim.x * blockIdx.x; k < total; k += step) {
+    C[k] = f(A[k % nrows], B[k]);
+  }
+}
+
+// Broadcast kernel, left operand indexed by k / nrows:
+// C[k] = op(A[k / nrows], B[k]) for k in [0, nrows*ncols).
+// The host dispatcher selects it when A has the same column count as B and one row.
+__global__ void __apply_left_row(float *A, float *B, float *C, int nrows, int ncols, int opn) {
+  const optype f = operators[opn];
+  const int total = nrows * ncols;
+  const unsigned int step = blockDim.x * gridDim.x;
+  for (int k = threadIdx.x + blockDim.x * blockIdx.x; k < total; k += step) {
+    C[k] = f(A[k / nrows], B[k]);
+  }
+}
+
+// Scalar-on-the-right kernel: C[k] = op(A[k], B[0]) for k in [0, nrows*ncols).
+// B[0] is read once per thread before the loop.
+__global__ void __apply_right_val(float *A, float *B, float *C, int nrows, int ncols, int opn) {
+  const optype f = operators[opn];
+  const int first = threadIdx.x + blockDim.x * blockIdx.x;
+  const float scalar = B[0];
+  const int total = nrows * ncols;
+  const unsigned int step = blockDim.x * gridDim.x;
+  for (int k = first; k < total; k += step) {
+    C[k] = f(A[k], scalar);
+  }
+}
+
+// Scalar-on-the-left kernel: C[k] = op(A[0], B[k]) for k in [0, nrows*ncols).
+// A[0] is read once per thread before the loop.
+__global__ void __apply_left_val(float *A, float *B, float *C, int nrows, int ncols, int opn) {
+  const optype f = operators[opn];
+  const int first = threadIdx.x + blockDim.x * blockIdx.x;
+  const float scalar = A[0];
+  const int total = nrows * ncols;
+  const unsigned int step = blockDim.x * gridDim.x;
+  for (int k = first; k < total; k += step) {
+    C[k] = f(scalar, B[k]);
+  }
+}
+
+// Element-wise float binary operation C = A op B with broadcasting.
+// op is operators[opn]. Supported shape combinations, tested in this order:
+//   same shape; B a single column; B a single row; A a single column;
+//   A a single row; B 1x1; A 1x1.
+// A, B and C are dereferenced by the kernels, so they must be device-accessible.
+// Returns a cudaError_t value as an int (0 == cudaSuccess). Shapes matching
+// none of the rules above return cudaErrorInvalidValue and launch nothing.
+// The launch is asynchronous; nothing is synchronized here.
+int apply_binop(float *A, int Anrows, int Ancols, 
+     float *B, int Bnrows, int Bncols, float *C, int opn) {
+  int N = max(Anrows, Bnrows)*max(Ancols, Bncols);
+  // Grow the launch until it covers N: first up to 16 blocks, then up to 1024
+  // threads per block, then more blocks. The product is evaluated in 64 bits
+  // so it cannot overflow for N close to INT_MAX, and the grid stops at 32768
+  // blocks, which stays below the 65535-block x-dimension limit of compute
+  // capability 2.x devices. The kernels stride over the data, so a capped
+  // grid still processes every element.
+  const int maxblocks = 32768;
+  int nthreads = 32;
+  int nblocks = 1;
+  while ((long long)nblocks * nthreads < N) {
+    if (nblocks < 16) {
+      nblocks = 2*nblocks;
+    } else if (nthreads < 1024) {
+      nthreads = 2*nthreads;
+    } else if (nblocks < maxblocks) {
+      nblocks = 2*nblocks;
+    } else {
+      break;
+    }
+  }
+  if (Anrows == Bnrows && Ancols == Bncols) {
+    __apply_full<<<nblocks,nthreads>>>(A, B, C, N, opn);
+  } else if (Anrows == Bnrows && Bncols == 1) {
+    __apply_right_col<<<nblocks,nthreads>>>(A, B, C, Anrows, Ancols, opn);
+  } else if (Ancols == Bncols && Bnrows == 1) {
+    __apply_right_row<<<nblocks,nthreads>>>(A, B, C, Anrows, Ancols, opn);
+  } else if (Anrows == Bnrows && Ancols == 1) {
+    __apply_left_col<<<nblocks,nthreads>>>(A, B, C, Bnrows, Bncols, opn);
+  } else if (Ancols == Bncols && Anrows == 1) {
+    __apply_left_row<<<nblocks,nthreads>>>(A, B, C, Bnrows, Bncols, opn);
+  } else if (Bnrows == 1 && Bncols == 1) {
+    __apply_right_val<<<nblocks,nthreads>>>(A, B, C, Anrows, Ancols, opn);
+  } else if (Anrows == 1 && Ancols == 1) {
+    __apply_left_val<<<nblocks,nthreads>>>(A, B, C, Bnrows, Bncols, opn);
+  } else {
+    // No broadcasting rule matches these shapes. Report it instead of
+    // returning success with C left unwritten.
+    return cudaErrorInvalidValue;
+  }
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
+// Same-shape element-wise int kernel: C[k] = op(A[k], B[k]) for k in [0, N),
+// with op = ioperators[opn].
+__global__ void __apply_full_int(int *A, int *B, int *C, int N, int opn) {
+  const ioptype f = ioperators[opn];
+  const unsigned int step = blockDim.x * gridDim.x;
+  for (int k = threadIdx.x + blockDim.x * blockIdx.x; k < N; k += step) {
+    C[k] = f(A[k], B[k]);
+  }
+}
+
+// Int broadcast kernel, right operand indexed by k % nrows:
+// C[k] = op(A[k], B[k % nrows]) for k in [0, nrows*ncols).
+__global__ void __apply_right_col_int(int *A, int *B, int *C, int nrows, int ncols, int opn) {
+  const ioptype f = ioperators[opn];
+  const int total = nrows * ncols;
+  const unsigned int step = blockDim.x * gridDim.x;
+  for (int k = threadIdx.x + blockDim.x * blockIdx.x; k < total; k += step) {
+    C[k] = f(A[k], B[k % nrows]);
+  }
+}
+
+// Int broadcast kernel, right operand indexed by k / nrows:
+// C[k] = op(A[k], B[k / nrows]) for k in [0, nrows*ncols).
+__global__ void __apply_right_row_int(int *A, int *B, int *C, int nrows, int ncols, int opn) {
+  const ioptype f = ioperators[opn];
+  const int total = nrows * ncols;
+  const unsigned int step = blockDim.x * gridDim.x;
+  for (int k = threadIdx.x + blockDim.x * blockIdx.x; k < total; k += step) {
+    C[k] = f(A[k], B[k / nrows]);
+  }
+}
+
+// Int broadcast kernel, left operand indexed by k % nrows:
+// C[k] = op(A[k % nrows], B[k]) for k in [0, nrows*ncols).
+__global__ void __apply_left_col_int(int *A, int *B, int *C, int nrows, int ncols, int opn) {
+  const ioptype f = ioperators[opn];
+  const int total = nrows * ncols;
+  const unsigned int step = blockDim.x * gridDim.x;
+  for (int k = threadIdx.x + blockDim.x * blockIdx.x; k < total; k += step) {
+    C[k] = f(A[k % nrows], B[k]);
+  }
+}
+
+// Int broadcast kernel, left operand indexed by k / nrows:
+// C[k] = op(A[k / nrows], B[k]) for k in [0, nrows*ncols).
+__global__ void __apply_left_row_int(int *A, int *B, int *C, int nrows, int ncols, int opn) {
+  const ioptype f = ioperators[opn];
+  const int total = nrows * ncols;
+  const unsigned int step = blockDim.x * gridDim.x;
+  for (int k = threadIdx.x + blockDim.x * blockIdx.x; k < total; k += step) {
+    C[k] = f(A[k / nrows], B[k]);
+  }
+}
+
+// Int scalar-on-the-right kernel: C[k] = op(A[k], B[0]) for k in [0, nrows*ncols).
+// B[0] is read once per thread before the loop.
+__global__ void __apply_right_val_int(int *A, int *B, int *C, int nrows, int ncols, int opn) {
+  const ioptype f = ioperators[opn];
+  const int first = threadIdx.x + blockDim.x * blockIdx.x;
+  const int scalar = B[0];
+  const int total = nrows * ncols;
+  const unsigned int step = blockDim.x * gridDim.x;
+  for (int k = first; k < total; k += step) {
+    C[k] = f(A[k], scalar);
+  }
+}
+
+// Int scalar-on-the-left kernel: C[k] = op(A[0], B[k]) for k in [0, nrows*ncols).
+// A[0] is read once per thread before the loop.
+__global__ void __apply_left_val_int(int *A, int *B, int *C, int nrows, int ncols, int opn) {
+  const ioptype f = ioperators[opn];
+  const int first = threadIdx.x + blockDim.x * blockIdx.x;
+  const int scalar = A[0];
+  const int total = nrows * ncols;
+  const unsigned int step = blockDim.x * gridDim.x;
+  for (int k = first; k < total; k += step) {
+    C[k] = f(scalar, B[k]);
+  }
+}
+
+// Element-wise int binary operation C = A op B with broadcasting.
+// op is ioperators[opn]. Supported shape combinations, tested in this order:
+//   same shape; B a single column; B a single row; A a single column;
+//   A a single row; B 1x1; A 1x1.
+// A, B and C are dereferenced by the kernels, so they must be device-accessible.
+// Returns a cudaError_t value as an int (0 == cudaSuccess). Shapes matching
+// none of the rules above return cudaErrorInvalidValue and launch nothing.
+// The launch is asynchronous; nothing is synchronized here.
+int apply_biniop(int *A, int Anrows, int Ancols, 
+     int *B, int Bnrows, int Bncols, 
+     int *C, int opn) {
+  int N = max(Anrows, Bnrows)*max(Ancols, Bncols);
+  // Grow the launch until it covers N: first up to 16 blocks, then up to 1024
+  // threads per block, then more blocks. The product is evaluated in 64 bits
+  // so it cannot overflow for N close to INT_MAX, and the grid stops at 32768
+  // blocks, which stays below the 65535-block x-dimension limit of compute
+  // capability 2.x devices. The kernels stride over the data, so a capped
+  // grid still processes every element.
+  const int maxblocks = 32768;
+  int nthreads = 32;
+  int nblocks = 1;
+  while ((long long)nblocks * nthreads < N) {
+    if (nblocks < 16) {
+      nblocks = 2*nblocks;
+    } else if (nthreads < 1024) {
+      nthreads = 2*nthreads;
+    } else if (nblocks < maxblocks) {
+      nblocks = 2*nblocks;
+    } else {
+      break;
+    }
+  }
+  if (Anrows == Bnrows && Ancols == Bncols) {
+    __apply_full_int<<<nblocks,nthreads>>>(A, B, C, N, opn);
+  } else if (Anrows == Bnrows && Bncols == 1) {
+    __apply_right_col_int<<<nblocks,nthreads>>>(A, B, C, Anrows, Ancols, opn);
+  } else if (Ancols == Bncols && Bnrows == 1) {
+    __apply_right_row_int<<<nblocks,nthreads>>>(A, B, C, Anrows, Ancols, opn);
+  } else if (Anrows == Bnrows && Ancols == 1) {
+    __apply_left_col_int<<<nblocks,nthreads>>>(A, B, C, Bnrows, Bncols, opn);
+  } else if (Ancols == Bncols && Anrows == 1) {
+    __apply_left_row_int<<<nblocks,nthreads>>>(A, B, C, Bnrows, Bncols, opn);
+  } else if (Bnrows == 1 && Bncols == 1) {
+    __apply_right_val_int<<<nblocks,nthreads>>>(A, B, C, Anrows, Ancols, opn);
+  } else if (Anrows == 1 && Ancols == 1) {
+    __apply_left_val_int<<<nblocks,nthreads>>>(A, B, C, Bnrows, Bncols, opn);
+  } else {
+    // No broadcasting rule matches these shapes. Report it instead of
+    // returning success with C left unwritten.
+    return cudaErrorInvalidValue;
+  }
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
+// Dense-times-sparse accumulation kernel.
+// For every stored entry j of B (value Bdata[j], indices Bir[j] and Bic[j]) and
+// every row i < nrows it adds A[i + nrows*Bir[j]] * Bdata[j] into
+// C[i + nrows*Bic[j]].
+// Work split: block b handles the contiguous slice [b*nnz/gridDim.x,
+// (b+1)*nnz/gridDim.x) of the entries; threads of a block stride over rows i.
+// C is only ever added to (atomicAdd), never assigned, so its prior contents
+// are part of the result -- presumably the caller zeroes it first; TODO confirm.
+__global__ void __dsmult(int nrows, int nnz, float *A, float *Bdata, int *Bir, int *Bic, float *C) {
+  // 64-bit intermediate keeps blockIdx.x * nnz from overflowing int.
+  int jstart = ((long long)blockIdx.x) * nnz / gridDim.x;
+  int jend = ((long long)(blockIdx.x + 1)) * nnz / gridDim.x;
+  for (int i = threadIdx.x; i < nrows; i += blockDim.x) {
+    float sum = 0;
+    for (int j = jstart; j < jend ; j++) {
+      sum += A[i + nrows * Bir[j]] * Bdata[j];
+      // Flush the running sum when the next entry has a different Bic value or
+      // the slice ends. Bic[j+1] is only read when j+1 < jend (short-circuit).
+      // Atomic because other blocks may hold entries with the same Bic value.
+      if (j == jend-1 || Bic[j] != Bic[j+1]) {
+        atomicAdd(&C[i + nrows * Bic[j]], sum);
+        sum = 0;
+      }
+    }
+  }
+}
+
+// Host launcher for __dsmult (accumulates a dense-times-sparse product into C).
+// All pointers are dereferenced by the kernel, so they must be device-accessible.
+// Returns a cudaError_t value as an int (0 == cudaSuccess). The launch is
+// asynchronous; nothing is synchronized here.
+int dsmult(int nrows, int ncols, int nnz, float *A, float *Bdata, int *Bir, int *Bic, float *C) {
+  // Nothing to accumulate. Returning here also avoids a launch with zero
+  // threads or zero blocks, which is an invalid launch configuration.
+  if (nrows <= 0 || ncols <= 0 || nnz <= 0) return cudaSuccess;
+  int nthreads = min(1024, nrows);
+  // The kernel divides the nnz entries over however many blocks are launched,
+  // so the grid can stop at the 65535-block x-dimension limit of compute
+  // capability 2.x devices and still visit every entry.
+  int nblocks = min(65535, ncols);
+  __dsmult<<<nblocks,nthreads>>>(nrows, nnz, A, Bdata, Bir, Bic, C);
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
+// Dense-times-sparse-transpose accumulation kernel.
+// For every stored entry j of B and every row i < nrows it adds
+// A[i + nrows*Bic[j]] * Bdata[j] into C[i + nrows*Bir[j]] (the roles of Bir and
+// Bic are swapped relative to __dsmult).
+// Work split: block b handles the contiguous slice [b*nnz/gridDim.x,
+// (b+1)*nnz/gridDim.x) of the entries; threads of a block stride over rows i.
+// C is only ever added to (atomicAdd), never assigned -- presumably the caller
+// zeroes it first; TODO confirm.
+__global__ void __dsmultT(int nrows, int nnz, float *A, float *Bdata, int *Bir, int *Bic, float *C) {
+  // 64-bit intermediate keeps blockIdx.x * nnz from overflowing int.
+  int jstart = ((long long)blockIdx.x) * nnz / gridDim.x;
+  int jend = ((long long)(blockIdx.x + 1)) * nnz / gridDim.x;
+  for (int i = threadIdx.x; i < nrows; i += blockDim.x) {
+    float aval = 0;
+    for (int j = jstart; j < jend ; j++) {
+      // Reload the A element only when the Bic value changes (or at the start
+      // of the slice); consecutive entries with equal Bic reuse it.
+      if (j == jstart || Bic[j-1] != Bic[j]) {
+        aval = A[i + nrows * Bic[j]];
+      }
+      atomicAdd(&C[i + nrows * Bir[j]], aval * Bdata[j]);
+    }
+  }
+}
+
+// Host launcher for __dsmultT (accumulates a dense-times-sparse-transpose
+// product into C). All pointers are dereferenced by the kernel, so they must be
+// device-accessible. Returns a cudaError_t value as an int (0 == cudaSuccess).
+// The launch is asynchronous; nothing is synchronized here.
+int dsmultT(int nrows, int ncols, int nnz, float *A, float *Bdata, int *Bir, int *Bic, float *C) {
+  // Nothing to accumulate. Returning here also avoids a launch with zero
+  // threads or zero blocks, which is an invalid launch configuration.
+  if (nrows <= 0 || ncols <= 0 || nnz <= 0) return cudaSuccess;
+  int nthreads = min(1024, nrows);
+  // The kernel divides the nnz entries over however many blocks are launched,
+  // so the grid can stop at the 65535-block x-dimension limit of compute
+  // capability 2.x devices and still visit every entry.
+  int nblocks = min(65535, ncols);
+  __dsmultT<<<nblocks,nthreads>>>(nrows, nnz, A, Bdata, Bir, Bic, C);
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
+__global__ void __dds(int nrows, int nnz, float *A, float *B, int *Cir, int *Cic, float *P);
+
+__global__ void __reduce1op(int nrows, int ncols, float *A, float *B, int opn);
+
+#ifdef __CUDA_ARCH__
+#if __CUDA_ARCH__ > 200
+
+// Warp-shuffle variant of __dds (compiled when __CUDA_ARCH__ > 200).
+// For each index pair j it writes
+//   P[j] = sum over i < nrows of A[i + nrows*Cir[j]] * B[i + nrows*Cic[j]].
+// Work split: block b handles the contiguous slice [b*nnz/gridDim.x,
+// (b+1)*nnz/gridDim.x) of the pairs; threads of the block stride over i.
+// NOTE(review): the shuffle reduction reads lane threadIdx.x + i without
+// checking that lane is below blockDim.x, so it appears to rely on being
+// launched with a full 32-thread warp -- confirm against the launcher.
+__global__ void __dds(int nrows, int nnz, float *A, float *B, int *Cir, int *Cic, float *P) {
+  // 64-bit intermediate keeps blockIdx.x * nnz from overflowing int.
+  int jstart = ((long long)blockIdx.x) * nnz / gridDim.x;
+  int jend = ((long long)(blockIdx.x + 1)) * nnz / gridDim.x;
+  for (int j = jstart; j < jend ; j++) {
+    float sum = 0;
+    int aoff = nrows * Cir[j];
+    int boff = nrows * Cic[j];
+    // Per-thread partial dot product over a strided subset of i.
+    for (int i = threadIdx.x; i < nrows; i += blockDim.x) {
+      sum += A[i + aoff] * B[i + boff];
+    }
+    // Combine partial sums with offsets 1, 2, 4, ...; only thread 0's value is used.
+    for (int i = 1; i < blockDim.x; i *= 2) {
+      sum = sum + __shfl_down(sum, i);
+    }
+    if (threadIdx.x == 0) {
+      P[j] = sum;
+    }
+  }
+}
+
+// Warp-shuffle variant of __reduce1op (compiled when __CUDA_ARCH__ > 200).
+// Folds each column of A with op = operators[opn]:
+//   B[icol] = op(... op(op(A[0,icol], A[1,icol]), ...) ...), A indexed as
+//   A[row + icol*nrows].
+// Columns are distributed over threadIdx.y and the y dimension of the grid,
+// matching the (1, nblks) grid used by the launcher in this file and the
+// indexing of the shared-memory variant. Indexing by blockIdx.x would make
+// every block of such a grid repeat all columns.
+// NOTE(review): the shuffle reduction reads lane + i without checking it
+// belongs to the same column, so it appears to assume blockDim.x is a power
+// of two no larger than 32 -- confirm against the launcher.
+__global__ void __reduce1op(int nrows, int ncols, float *A, float *B, int opn) {
+  optype op = operators[opn];
+  int basecol = threadIdx.y + blockDim.y * blockIdx.y;
+  for (int icol = basecol; icol < ncols; icol += blockDim.y * gridDim.y) {
+    // Per-thread partial fold over a strided subset of the rows.
+    float v = A[threadIdx.x + icol * nrows];
+    for (int i = threadIdx.x + blockDim.x; i < nrows; i += blockDim.x) {
+      v = op(v, A[i + icol * nrows]);
+    }
+    // Combine partial results with offsets 1, 2, 4, ...; only threadIdx.x == 0 is used.
+    for (int i = 1; i < blockDim.x; i *= 2) {
+      v = op(v, __shfl_down(v, i));
+    }
+    if (threadIdx.x == 0) {
+      B[icol] = v;
+    }
+  }
+}
+#else
+
+// Shared-memory variant of __dds (compiled when __CUDA_ARCH__ <= 200).
+// For each index pair j it writes
+//   P[j] = sum over i < nrows of A[i + nrows*Cir[j]] * B[i + nrows*Cic[j]].
+// Work split: block b handles the contiguous slice [b*nnz/gridDim.x,
+// (b+1)*nnz/gridDim.x) of the pairs; threads of the block stride over i.
+// The scratch row holds one partial sum per thread, so blockDim.x must not
+// exceed 33.
+__global__ void __dds(int nrows, int nnz, float *A, float *B, int *Cir, int *Cic, float *P) {
+  __shared__ float parts[1][33];
+  // 64-bit intermediate keeps blockIdx.x * nnz from overflowing int.
+  int jstart = ((long long)blockIdx.x) * nnz / gridDim.x;
+  int jend = ((long long)(blockIdx.x + 1)) * nnz / gridDim.x;
+  // jstart/jend depend only on blockIdx, so every thread of the block runs the
+  // same number of iterations and the barriers below are reached uniformly.
+  for (int j = jstart; j < jend ; j++) {
+    float sum = 0;
+    int aoff = nrows * Cir[j];
+    int boff = nrows * Cic[j];
+    for (int i = threadIdx.x; i < nrows; i += blockDim.x) {
+      sum += A[i + aoff] * B[i + boff];
+    }
+    parts[0][threadIdx.x] = sum;
+    __syncthreads();
+    // Tree reduction with offsets 1, 2, 4, ... Each step first reads the
+    // partner value, then (after a barrier) updates its own slot, so no thread
+    // reads a slot that another thread is rewriting in the same step.
+    for (int i = 1; i < blockDim.x; i *= 2) {
+      bool take = (i + threadIdx.x < blockDim.x);
+      float other = 0;
+      if (take) other = parts[0][i + threadIdx.x];
+      __syncthreads();
+      if (take) parts[0][threadIdx.x] = parts[0][threadIdx.x] + other;
+      __syncthreads();
+    }
+    if (threadIdx.x == 0) {
+      P[j] = parts[0][0];
+    }
+    // Keep the next iteration from overwriting parts before thread 0 has read it.
+    __syncthreads();
+  }
+}
+
+// Shared-memory variant of __reduce1op (compiled when __CUDA_ARCH__ <= 200).
+// Folds each column of A with op = operators[opn] and writes the result to
+// B[icol]; A is indexed as A[row + icol*nrows].
+// Columns are distributed over threadIdx.y and the y dimension of the grid;
+// rows of a column are distributed over threadIdx.x. The scratch array limits
+// the block to 32 x 33 threads.
+__global__ void __reduce1op(int nrows, int ncols, float *A, float *B, int opn) {
+  __shared__ float parts[32][33];
+  optype op = operators[opn];
+  // Iterate on a per-block column base so every thread of the block runs the
+  // same number of iterations; __syncthreads() must be reached by all of them.
+  // Threads whose column falls past ncols stay in the loop but do no work.
+  int colstep = blockDim.y * gridDim.y;
+  for (int colbase = blockIdx.y * blockDim.y; colbase < ncols; colbase += colstep) {
+    int icol = colbase + threadIdx.y;
+    bool active = (icol < ncols);
+    if (active) {
+      // Per-thread partial fold over a strided subset of the rows.
+      float v = A[threadIdx.x + icol * nrows];
+      for (int irow = threadIdx.x + blockDim.x; irow < nrows; irow += blockDim.x) {
+        v = op(v, A[irow + icol * nrows]);
+      }
+      parts[threadIdx.x][threadIdx.y] = v;
+    }
+    __syncthreads();
+    // Tree reduction along x with offsets 1, 2, 4, ... Each step reads the
+    // partner value, then (after a barrier) updates its own slot.
+    for (int i = 1; i < blockDim.x; i *= 2) {
+      bool take = active && (i + threadIdx.x < blockDim.x);
+      float other = 0;
+      if (take) other = parts[i + threadIdx.x][threadIdx.y];
+      __syncthreads();
+      if (take) parts[threadIdx.x][threadIdx.y] = op(parts[threadIdx.x][threadIdx.y], other);
+      __syncthreads();
+    }
+    if (active && threadIdx.x == 0) {
+      B[icol] = parts[0][threadIdx.y];
+    }
+    // Keep the next iteration from overwriting parts before it has been read.
+    __syncthreads();
+  }
+}
+#endif
+#endif
+
+#define BLOCKDIM 32
+
+// Tiled transpose kernel: out[c + r*outstride] = in[r + c*instride] for
+// r < nrows and c < ncols.
+// Each block walks BLOCKDIM x BLOCKDIM tiles, stepping by the grid size in both
+// directions, and stages every tile in shared memory. The tile has BLOCKDIM+1
+// columns (one more than it stores per row).
+// threadIdx.x indexes directly into the tile, so blockDim.x must not exceed BLOCKDIM.
+__global__ void __transpose(float *in, int instride, float *out, int outstride, int nrows, int ncols) {
+  int nx = BLOCKDIM * gridDim.x;
+  int ny = BLOCKDIM * gridDim.y;
+  int ix = BLOCKDIM * blockIdx.x;
+  int iy = BLOCKDIM * blockIdx.y;
+  __shared__ float tile[BLOCKDIM][BLOCKDIM+1];
+
+  // Loop bounds depend only on blockIdx, so all threads of a block reach the
+  // barriers below the same number of times.
+  for (int yb = iy; yb < ncols; yb += ny) {
+    for (int xb = ix; xb < nrows; xb += nx) {
+      // Load phase: threadIdx.x selects the input row within the tile,
+      // threadIdx.y strides over the tile's columns.
+      if (xb + threadIdx.x < nrows) {
+        int ylim = min(ncols, yb + BLOCKDIM);
+        for (int y = threadIdx.y + yb; y < ylim; y += blockDim.y) {
+          tile[threadIdx.x][y-yb] = in[threadIdx.x+xb + y*instride];
+        }
+      }
+      __syncthreads();
+      // Store phase: threadIdx.x now selects the input column (output row),
+      // threadIdx.y strides over the tile's rows.
+      if (yb + threadIdx.x < ncols) {
+        int xlim = min(nrows, xb + BLOCKDIM);
+        for (int x = threadIdx.y + xb; x < xlim; x += blockDim.y) {
+          out[threadIdx.x + yb + x*outstride] = tile[x-xb][threadIdx.x];
+        }
+      }
+      // The tile is reused by the next iteration.
+      __syncthreads();
+    }
+  } 
+}
+
+// Host launcher for __transpose on a fixed 32x32 grid of BLOCKDIM x 16 thread
+// blocks. Waits for the kernel to finish, then returns 0 on success or the
+// cudaError_t value (after printing a message to stderr) on failure.
+int transpose(float *in, int instride, float *out, int outstride, int nrows, int ncols) {
+  const dim3 grid(32, 32);
+  const dim3 block(BLOCKDIM, 16, 1);
+  __transpose<<<grid, block>>>(in, instride, out, outstride, nrows, ncols);
+  cudaDeviceSynchronize();
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess) {
+    return 0;
+  }
+  fprintf(stderr, "cuda error in transpose");
+  return status;
+}
+
+
+
+ // Host launcher for __dds: for each of the nnz index pairs (Cir[j], Cic[j])
+ // the kernel writes into P[j] the dot product of the nrows-long vectors
+ // starting at A + nrows*Cir[j] and B + nrows*Cic[j].
+ // All pointers are dereferenced by the kernel, so they must be device-accessible.
+ // Returns a cudaError_t value as an int (0 == cudaSuccess). The launch is
+ // asynchronous; nothing is synchronized here.
+ int dds(int nrows, int nnz, float *A, float *B, int *Cir, int *Cic, float *P) {
+  // Always launch one full warp. Threads with threadIdx.x >= nrows contribute
+  // a zero partial sum, so the shuffle-based reduction never reads a lane that
+  // is not running, and nrows == 0 no longer asks for a zero-thread block.
+  int nthreads = 32;
+  // The kernel divides the nnz pairs over however many blocks are launched, so
+  // the grid can stop at the 65535-block x-dimension limit of compute
+  // capability 2.x devices and still visit every pair.
+  int nblocks = min(65535, max(1,nnz/8));
+  __dds<<<nblocks,nthreads>>>(nrows, nnz, A, B, Cir, Cic, P);
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
+// Host launcher for __reduce1op: folds each column of the nrows x ncols matrix
+// A (indexed A[row + col*nrows]) with operators[opn] and stores one value per
+// column in B. A and B are dereferenced by the kernel, so they must be
+// device-accessible. Waits for the kernel to finish.
+// Returns a cudaError_t value as an int (0 == cudaSuccess).
+int reduce1op(int nrows, int ncols, float *A, float *B, int opn) {
+  // An empty matrix has nothing to reduce. Returning here also avoids the
+  // division by a zero block dimension below.
+  if (nrows <= 0 || ncols <= 0) return cudaSuccess;
+  int blkx = min(32, nrows);
+  int blky = min(32, ncols);
+  // Roughly 16 elements per thread, clamped to [1, 65535]: 65535 is the
+  // largest y grid dimension, and the kernel strides over columns by the grid
+  // size so a clamped grid still covers every column.
+  long long nblks64 = ((long long)nrows) * ncols / blkx / blky / 16;
+  if (nblks64 < 1) nblks64 = 1;
+  if (nblks64 > 65535) nblks64 = 65535;
+  int nblks = (int)nblks64;
+  const dim3 blkdims(blkx,blky,1);
+  const dim3 griddims(1,nblks,1);
+  __reduce1op<<<griddims,blkdims>>>(nrows, ncols, A, B, opn);
+  cudaDeviceSynchronize();
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
+// Row-wise reduction kernel: folds each row of A with op = operators[opn] and
+// writes the result to B[irow]; A is indexed as A[irow + icol*nrows].
+// Rows are distributed over threadIdx.x and the x dimension of the grid;
+// columns of a row are distributed over threadIdx.y. The scratch array limits
+// the block to 32 x 33 threads.
+__global__ void __reduce2op(int nrows, int ncols, float *A, float *B, int opn) {
+  __shared__ float parts[32][33];
+  optype op = operators[opn];
+  // Iterate on a per-block row base so every thread of the block runs the same
+  // number of iterations; __syncthreads() must be reached by all of them.
+  // Threads whose row falls past nrows stay in the loop but do no work.
+  int rowstep = blockDim.x * gridDim.x;
+  for (int rowbase = blockDim.x * blockIdx.x; rowbase < nrows; rowbase += rowstep) {
+    int irow = rowbase + threadIdx.x;
+    bool active = (irow < nrows);
+    if (active) {
+      // Per-thread partial fold over a strided subset of the columns.
+      float v = A[irow + threadIdx.y * nrows];
+      for (int icol = threadIdx.y + blockDim.y; icol < ncols; icol += blockDim.y) {
+        v = op(v, A[irow + icol * nrows]);
+      }
+      parts[threadIdx.x][threadIdx.y] = v;
+    }
+    __syncthreads();
+    // Tree reduction along y with offsets 1, 2, 4, ... Each step reads the
+    // partner value, then (after a barrier) updates its own slot.
+    float newv = 0;
+    for (int i = 1; i < blockDim.y; i *= 2) {
+      bool take = active && (i + threadIdx.y < blockDim.y);
+      if (take) newv = parts[threadIdx.x][i+threadIdx.y];
+      __syncthreads();
+      if (take) parts[threadIdx.x][threadIdx.y] = op(parts[threadIdx.x][threadIdx.y], newv);
+      __syncthreads();
+    }
+    if (active && threadIdx.y == 0) {
+      B[irow] = parts[threadIdx.x][0];
+    }
+    // Keep the next iteration from overwriting parts before it has been read.
+    __syncthreads();
+  }
+}
+
+// Host launcher for __reduce2op: folds each row of the nrows x ncols matrix A
+// (indexed A[row + col*nrows]) with operators[opn] and stores one value per row
+// in B. A and B are dereferenced by the kernel, so they must be
+// device-accessible. Waits for the kernel to finish.
+// Returns a cudaError_t value as an int (0 == cudaSuccess).
+int reduce2op(int nrows, int ncols, float *A, float *B, int opn) {
+  // An empty matrix has nothing to reduce. Returning here also avoids the
+  // division by a zero block dimension below.
+  if (nrows <= 0 || ncols <= 0) return cudaSuccess;
+  int blkx = min(32, nrows);
+  int blky = min(32, ncols);
+  // Roughly 16 elements per thread, clamped to [1, 65535]: 65535 is the x
+  // grid-dimension limit of compute capability 2.x devices, and the kernel
+  // strides over rows by the grid size so a clamped grid still covers every row.
+  long long nblks64 = ((long long)nrows) * ncols / blkx / blky / 16;
+  if (nblks64 < 1) nblks64 = 1;
+  if (nblks64 > 65535) nblks64 = 65535;
+  int nblks = (int)nblks64;
+  const dim3 blkdims(blkx,blky,1);
+  const dim3 griddims(nblks,1,1);
+  __reduce2op<<<griddims,blkdims>>>(nrows, ncols, A, B, opn);
+  cudaDeviceSynchronize();
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
+
+#ifdef TEST
+#include <stdlib.h>
+
+// Stand-alone smoke test, built only when TEST is defined.
+// Usage: prog [opn [m [n]]]
+// Fills two m x n matrices with 1.0 and 2.0, applies operators[opn] to them on
+// the device through apply_binop, and prints the first four elements of the
+// inputs and of the result. Exits non-zero on any allocation or CUDA failure.
+int main(int argc, char **argv) {
+  int m=8, n=8, opn = 0;
+  float *dA = NULL, *dB = NULL, *dC = NULL;
+  float *A = NULL, *B = NULL, *C = NULL;
+  int status = 1;
+  if (argc > 1) sscanf(argv[1], "%d", &opn);
+  if (argc > 2) sscanf(argv[2], "%d", &m);
+  if (argc > 3) sscanf(argv[3], "%d", &n);
+  // The printouts below read elements 0..3 of every matrix.
+  if (m < 1 || n < 1 || (long long)m * n < 4) {
+    fprintf(stderr, "m and n must be positive with m*n >= 4\n");
+    return 1;
+  }
+  size_t nbytes = (size_t)m * n * sizeof(float);
+
+  A = (float *)malloc(nbytes);
+  B = (float *)malloc(nbytes);
+  C = (float *)malloc(nbytes);
+  if (A == NULL || B == NULL || C == NULL) {
+    fprintf(stderr, "host allocation failed\n");
+    goto done;
+  }
+  if (cudaMalloc((void**)&dA, nbytes) != cudaSuccess ||
+      cudaMalloc((void**)&dB, nbytes) != cudaSuccess ||
+      cudaMalloc((void**)&dC, nbytes) != cudaSuccess) {
+    fprintf(stderr, "device allocation failed\n");
+    goto done;
+  }
+
+  for (int i = 0; i < m*n; i++) {
+    A[i] = 1.0f;
+    B[i] = 2.0f;
+  }
+
+  cudaMemcpy(dA, A, nbytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(dB, B, nbytes, cudaMemcpyHostToDevice);
+
+  printf("A %f %f %f %f\n", A[0], A[1], A[2], A[3]);
+  printf("B %f %f %f %f\n", B[0], B[1], B[2], B[3]);
+
+  {
+    // apply_binop returns the launch status; the blocking copy back to the
+    // host then surfaces any error raised while the kernel ran.
+    int err = apply_binop(dA, m, n, dB, m, n, dC, opn);
+    if (err == 0) {
+      err = cudaMemcpy(C, dC, nbytes, cudaMemcpyDeviceToHost);
+    }
+    if (err != 0) {
+      fprintf(stderr, "CUDA error %d\n", err);
+      goto done;
+    }
+  }
+
+  printf("C %f %f %f %f\n", C[0], C[1], C[2], C[3]);
+  printf("A %f %f %f %f\n", A[0], A[1], A[2], A[3]);
+  printf("B %f %f %f %f\n", B[0], B[1], B[2], B[3]);
+  status = 0;
+
+done:
+  if (dA != NULL) cudaFree(dA);
+  if (dB != NULL) cudaFree(dB);
+  if (dC != NULL) cudaFree(dC);
+  free(A);
+  free(B);
+  free(C);
+  return status;
+}
+#endif
diff --git a/jni/src/MatKernel.hpp b/jni/src/MatKernel.hpp
new file mode 100755
index 00000000..ef1acdf6
--- /dev/null
+++ b/jni/src/MatKernel.hpp
@@ -0,0 +1,20 @@
+
+int apply_binop(float *nativeA, int Anrows, int Ancols, float *nativeB, int Bnrows, int Bncols, float *nativeC, int opn);
+
+int apply_biniop(int *nativeA, int Anrows, int Ancols, int *nativeB, int Bnrows, int Bncols, int *nativeC, int opn);
+
+int apply_gfun(float *nativeA, float *nativeB, int N, int opn);
+
+int apply_gfun2(float *nativeA, float *nativeB, float *nativeC, int N, int opn);
+
+int dsmult(int nrows, int ncols, int nnz, float *A, float *Bdata, int *Bir, int *Bic, float *C);
+
+int dsmultT(int nrows, int ncols, int nnz, float *A, float *Bdata, int *Bir, int *Bic, float *C);
+
+int dds(int nrows, int nnz, float *A, float *B, int *Cir, int *Cic, float *P);
+
+int reduce1op(int nrows, int ncols, float *A, float *B, int opn);
+
+int reduce2op(int nrows, int ncols, float *A, float *B, int opn);
+
+int transpose(float *in, int instride, float *out, int outstride, int nrows, int ncols);
diff --git a/jni/src/configure b/jni/src/configure
new file mode 100755
index 00000000..586c4ce1
--- /dev/null
+++ b/jni/src/configure
@@ -0,0 +1,161 @@
+#!/bin/bash
+
+OS=`uname`
+PARLIB=$1
+ARCH="x86_64"
+ 
+VERSION="dev"
+
+# standardise the OS and ARCH names
+if [ "$OS" = "Darwin" ] ; then
+	OS="apple"
+elif [ "$OS" = "Linux" ] ; then
+	OS="linux"
+elif [ "$OS" = "SunOS" ] ; then
+	OS="sun"
+elif [[ "$OS" == CYGWIN* ]] ; then
+	OS="windows"
+else
+	echo "OS not supported" $OS
+	exit 1
+fi
+
+if [ "$ARCH" = "x86" ] || [ "$ARCH" = "i686" ] || [ "$ARCH" = "i586" ] \
+	|| [ "$ARCH" = "i486" ] || [ "$ARCH" = "i386" ] ; then
+	ARCH="x86"
+elif [ "$ARCH" = "Power Macintosh" ] ; then
+	ARCH="ppc"
+elif [ "$ARCH" = "amd64" ] || [ "$ARCH" = "x86_64" ] ; then
+	ARCH="x86_64"
+elif [ "$ARCH" = "sun4u" ] ; then
+	ARCH="sparc"
+else
+	echo "ARCH not supported"
+	exit 1
+fi
+
+# Per-OS toolchain settings. Everything assigned here is written to
+# Makefile.incl further down and consumed by the Makefile.
+if [ "$OS" = "apple" ] ; then
+	CC="gcc -Wall"
+	OBJ="o"
+	OUTFLG="-o "
+	CPPFLAGS="$CPPFLAGS -I/System/Library/Frameworks/JavaVM.framework/Home/include"
+	CFLAGS="-fPIC -fno-common $CFLAGS"
+	LB="ar rc"
+	LD="gcc -dynamiclib"
+	LDFLAGS="$LDFLAGS -framework JavaVM"
+	LIBPREPEND="lib"
+	LIBAPPEND="-apple-"${ARCH}".jnilib"
+	FC="g95"
+	FFLAGS="$CFLAGS"
+	LAPACK_INCLUDES="-I/System/Library/Frameworks/vecLib.framework/Headers"
+	FORTRAN_LIBS="-lg95 -Wl,-single_module"
+	MKL_LIBS="-framework veclib"
+elif [ "$OS" = "linux" ] ; then
+	# Install locations: a value already present in the environment wins,
+	# otherwise the historical default is used.
+	MKL_ROOT="${MKL_ROOT:-/opt/intel/mkl}"
+	JAVA_HOME="${JAVA_HOME:-/usr/java/default}"
+	CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
+	JCUDA_HOME="${JCUDA_HOME:-/home/jfc/code/JCUDA5}"
+	CC="icc"
+	GCC="gcc"
+	NVCC="nvcc"
+	NVCCFLAGS="-c -arch=compute_20 -code=sm_20,sm_30 --machine 64  -Xcompiler \"-fPIC -c -O2 -DNDEBUG\""
+	SUBLIB=linux64
+	OBJ="o"
+	OUTFLG="-o "
+	CPPFLAGS="$CPPFLAGS -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -I$MKL_ROOT/include \
+                 -I$MKL_ROOT/include/intel64/lp64 -I$JCUDA_HOME/CommonJNI/src -I$CUDA_HOME/include"
+	CFLAGS="-fPIC -c -O2 -DNDEBUG -std=c99 $CFLAGS"
+	LB="ar rc"
+	GLD="gcc -shared"
+	LD="icc -shared -static-intel"
+	LDFLAGS="$LDFLAGS"
+	LIBPREPEND="lib"
+	LIBAPPEND=".so"
+	FC="gfortran"
+	FFLAGS="$CFLAGS"
+	LAPACK_INCLUDES=""
+	FORTRAN_LIBS="-lgfortran"
+	# First script argument "threaded" links the threaded MKL layer,
+	# anything else the sequential one.
+	if [ "$PARLIB" = "threaded" ] ; then
+		MKL_LIBS="-L$JAVA_HOME/lib -L/opt/intel/composerxe/lib/intel64 $MKL_ROOT/lib/intel64/libmkl_intel_lp64.a -Wl,--start-group  \
+            $MKL_ROOT/lib/intel64/libmkl_intel_thread.a $MKL_ROOT/lib/intel64/libmkl_core.a \
+            -Wl,--end-group -liomp5 -lpthread -lm"
+	else
+		MKL_LIBS="-L$JAVA_HOME/lib -L/opt/intel/composerxe/lib/intel64 $MKL_ROOT/lib/intel64/libmkl_intel_lp64.a -Wl,--start-group  \
+            $MKL_ROOT/lib/intel64/libmkl_sequential.a $MKL_ROOT/lib/intel64/libmkl_core.a \
+            -Wl,--end-group -liomp5 -lpthread -lm"
+	fi
+	CUDA_LIBS="-L${CUDA_HOME}/lib64 -L${JCUDA_HOME}/lib -lcudart -lCommonJNI"
+elif [ "$OS" = "windows" ] ; then
+	# Windows paths stay fixed: they are written in 8.3 short form without
+	# spaces, which an environment-provided path would not guarantee.
+	MKL_ROOT="c:/Intel/MKL"
+#	JAVA_HOME=""
+#	CUDA_HOME="C:/Progra~1/NVIDIA~2/CUDA/v4.2"
+#	JCUDA_HOME="/code/JCUDA"
+	CUDA_HOME="C:/Progra~1/NVIDIA~2/CUDA/v5.0"
+	JCUDA_HOME="/code/JCUDA5"
+	JAVA_HOME="C:/Progra~1/Java/jdk1.6.0_29"
+	CC="icl"
+	GCC="icl"
+	NVCC="nvcc"
+	SUBLIB=win64
+	OBJ="obj"
+	OUTFLG="/OUT:"
+	CPPFLAGS=""
+	NVCCFLAGS="-c -arch=compute_20 -code=sm_20,sm_30 --machine 64  -Xcompiler \"/EHsc /W3 /nologo /O2 /Zi  /MT\""
+#	NVCCFLAGS="-c -arch=compute_30 -code=sm_30 --machine 64  -Xcompiler \"/EHsc /W3 /nologo /O2 /Zi  /MT\""
+	CFLAGS="/c /MT /DNDEBUG /O2 /Qstd=c99 $CFLAGS"    # static linking
+#	CFLAGS="/c /MT /DMKL_ILP64 /DNDEBUG /O2 /Qstd=c99 $CFLAGS" # static link, 64bit ints
+	LB="lib"
+	LD="link"
+	GLD="link"
+	LDFLAGS="/DLL /MACHINE:AMD64 $LDFLAGS"
+	LIBPREPEND=""
+	LIBAPPEND=".dll"
+	FC="ifort"
+	FFLAGS="-c $FFLAGS"
+	LAPACK_INCLUDES=""
+	FORTRAN_LIBS=""
+	if [ "$PARLIB" = "threaded" ] ; then
+#		MKL_LIBS="mkl_intel_lp64_dll.lib mkl_intel_thread_dll.lib mkl_core_dll.lib"     # threaded, 32bit ints, dll
+		MKL_LIBS="mkl_intel_lp64.lib mkl_intel_thread.lib mkl_core.lib libiomp5md.lib"  # threaded, 32bit integer, static link
+	else
+#		MKL_LIBS="mkl_intel_lp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib"       # sequential, 32bit ints, dll
+		MKL_LIBS="mkl_intel_lp64.lib mkl_sequential.lib mkl_core.lib libiomp5md.lib"    # sequential, 32bit int, static link
+	fi
+#	MKL_LIBS="mkl_intel_ilp64.lib mkl_intel_thread.lib mkl_core.lib libiomp5md.lib" # threaded, 64bit integer, static link
+#	MKL_LIBS="mkl_intel_ilp64.lib mkl_sequential.lib mkl_core.lib libiomp5md.lib"   # sequential, 64bit integer, static link
+	CUDA_LIBS="cudart.lib CommonJNI.lib"
+	LIB="$MKL_ROOT/mkl/lib/intel64;$MKL_ROOT/compiler/lib/intel64;$JAVA_HOME/lib;$CUDA_HOME/lib/x64;$JCUDA_HOME/lib;$LIB"
+	INCLUDE="$JAVA_HOME/include;$JAVA_HOME/include/win32;c:/Intel/MKL/mkl/include;c:/codeh/BIDMat/jni/include;$JCUDA_HOME/CommonJNI/src;$CUDA_HOME/include;$INCLUDE"
+else
+	echo "OS not supported"
+	exit 1
+fi
+
+echo "Creating config for $OS $ARCH"
+
+# Write one NAME=value line per setting; Makefile.incl is recreated from
+# scratch on every run.
+printf '%s\n' \
+	"CC=$CC" \
+	"GCC=$GCC" \
+	"NVCC=$NVCC" \
+	"NVCCFLAGS=$NVCCFLAGS" \
+	"SUBLIB=$SUBLIB" \
+	"OBJ=$OBJ" \
+	"OUTFLG=$OUTFLG" \
+	"CPPFLAGS=$CPPFLAGS" \
+	"CFLAGS=$CFLAGS" \
+	"LB=$LB" \
+	"LD=$LD" \
+	"GLD=$GLD" \
+	"LDFLAGS=$LDFLAGS" \
+	"LIBPREPEND=$LIBPREPEND" \
+	"LIBAPPEND=$LIBAPPEND" \
+	"LAPACK_INCLUDES=$LAPACK_INCLUDES" \
+	"MKL_LIBS=$MKL_LIBS" \
+	"CUDA_LIBS=$CUDA_LIBS" \
+	"FORTRAN_LIBS=$FORTRAN_LIBS" \
+	"FC=$FC" \
+	"FFLAGS=$FFLAGS" \
+	"LIB=$LIB" \
+	"INCLUDE=$INCLUDE" \
+	"JCUDA_COMMON=$JCUDA_HOME/CommonJNI/src" \
+	> Makefile.incl
+
+
diff --git a/lib/HDF5_Copyright.html b/lib/HDF5_Copyright.html
new file mode 100644
index 00000000..07a71f45
--- /dev/null
+++ b/lib/HDF5_Copyright.html
@@ -0,0 +1,160 @@
+<html><head>
+<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
+    <title>
+      HDF5 Copyright Notice and License Terms
+    </title>
+  </head>
+
+<body bgcolor="#FFFFFF">
+<!-- NEW PAGE -->
+
+
+<hr>               
+
+<h3>Copyright Notice and License Terms for
+<br>
+HDF5 (Hierarchical Data Format 5) Software Library and Utilities</h3>
+<hr>               
+<p>
+
+
+HDF5 (Hierarchical Data Format 5) Software Library and Utilities
+<br>
+Copyright 2006-2012 by The HDF Group.
+</p><p>
+NCSA HDF5 (Hierarchical Data Format 5) Software Library and Utilities
+<br>
+Copyright 1998-2006 by the Board of Trustees of the University of Illinois.
+</p><p>
+<strong>All rights reserved.</strong>
+</p><p>
+
+</p><p>
+Redistribution and use in source and binary forms, with or without 
+modification, are permitted for any purpose (including commercial purposes) 
+provided that the following conditions are met:
+
+</p><p>
+</p><ol>
+<li>
+Redistributions of source code must retain the above copyright notice, 
+this list of conditions, and the following disclaimer.
+
+</li><li>
+Redistributions in binary form must reproduce the above copyright notice, 
+this list of conditions, and the following disclaimer in the documentation 
+and/or materials provided with the distribution.
+
+</li><li>
+In addition, redistributions of modified forms of the source or binary code 
+must carry prominent notices stating that the original code was changed and 
+the date of the change.
+
+</li><li>
+All publications or advertising materials mentioning features or use of this 
+software are asked, but not required, to acknowledge that it was developed 
+by The HDF Group and by the National Center for Supercomputing Applications 
+at the University of Illinois at Urbana-Champaign and credit the contributors.
+
+</li><li>
+Neither the name of The HDF Group, the name of the University, nor the name 
+of any Contributor may be used to endorse or promote products derived from 
+this software without specific prior written permission from The HDF Group, 
+the University, or the Contributor, respectively.
+</li></ol>
+
+<p>
+<b>DISCLAIMER:</b>
+THIS SOFTWARE IS PROVIDED BY THE HDF GROUP AND THE CONTRIBUTORS 
+"AS IS" WITH NO WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED.  
+In no event shall The HDF Group or the Contributors be liable for any damages 
+suffered by the users arising out of the use of this software, even if advised 
+of the possibility of such damage. 
+
+
+</p><hr>
+<hr>
+
+<p>
+Contributors:   National Center for Supercomputing Applications  (NCSA) at 
+the University of Illinois, Fortner Software, Unidata Program Center (netCDF), 
+The Independent JPEG Group (JPEG), Jean-loup Gailly and Mark Adler (gzip), 
+and Digital Equipment Corporation (DEC).
+
+</p><hr>
+
+<p>
+Portions of HDF5 were developed with support from the Lawrence Berkeley 
+National Laboratory (LBNL) and the United States Department of Energy 
+under Prime Contract No. DE-AC02-05CH11231.
+
+</p><hr>
+
+<p>
+Portions of HDF5 were developed with support from the University of 
+California, Lawrence Livermore National Laboratory (UC LLNL). 
+The following statement applies to those portions of the product and must 
+be retained in any redistribution of source code, binaries, documentation, 
+and/or accompanying materials:
+</p><dir>
+        This work was partially produced at the University of California, 
+        Lawrence Livermore National Laboratory (UC LLNL) under contract 
+        no. W-7405-ENG-48 (Contract 48) between the U.S. Department of 
+        Energy (DOE) and The Regents of the University of California 
+        (University) for the operation of UC LLNL.
+	<p>
+	<b>DISCLAIMER:</b>
+        This work was prepared as an account of work sponsored by an agency 
+        of the United States Government. Neither the United States Government 
+        nor the University of California nor any of their employees, makes 
+        any warranty, express or implied, or assumes any liability or 
+        responsibility for the accuracy, completeness, or usefulness of any 
+        information, apparatus, product, or process disclosed, or represents 
+        that its use would not infringe privately- owned rights. Reference 
+        herein to any specific commercial products, process, or service by 
+        trade name, trademark, manufacturer, or otherwise, does not 
+        necessarily constitute or imply its endorsement, recommendation, or 
+        favoring by the United States Government or the University of 
+        California. The views and opinions of authors expressed herein do not 
+        necessarily state or reflect those of the United States Government or 
+        the University of California, and shall not be used for advertising 
+        or product endorsement purposes.
+</p></dir>
+
+<hr>
+
+<!--    DO NOT EDIT THE FOLLOWING 8 LINES;                           -->
+<!--    THEY ARE AUTOMATICALLY UPDATED BY DOCUMENTATION SOFTWARE.    -->
+
+<!-- #BeginLibraryItem "/ed_libs/Footer.lbi" -->
+<address>
+<table border="0" width="100%">
+  <tbody><tr valign="top">
+      <td align="left">
+          <address>
+          The HDF Group Help Desk: <img src="HDF5_help.png" align="top" height="16">
+          <br>
+          Describes HDF5 Release 1.8.9, May 2012.
+          </address>
+      </td><td width="5%">&nbsp;</td>
+      <td align="right">
+          <a href="http://www.hdfgroup.org/HDF5/doc/Copyright.html">Copyright</a> by
+          <a href="http://www.hdfgroup.org/">The HDF Group</a>
+          <br>
+          and the Board of Trustees of the University of Illinois
+      </td>   
+  </tr>   
+</tbody></table>
+</address>
+<!-- #EndLibraryItem --><script language="JAVASCRIPT">
+<!--
+document.writeln("Last modified: 5 March 2012")
+-->
+</script>Last modified: 5 March 2012
+
+
+
+
+
+
+</body></html>
diff --git a/lib/JCUDA_Copyright.txt b/lib/JCUDA_Copyright.txt
new file mode 100644
index 00000000..a47ba681
--- /dev/null
+++ b/lib/JCUDA_Copyright.txt
@@ -0,0 +1,24 @@
+JCuda - Java bindings for NVIDIA CUDA
+
+Copyright (c) 2008-2012 Marco Hutter - http://www.jcuda.org
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
diff --git a/lib/PtPlot_Copyright.txt b/lib/PtPlot_Copyright.txt
new file mode 100755
index 00000000..7da2f50e
--- /dev/null
+++ b/lib/PtPlot_Copyright.txt
@@ -0,0 +1,27 @@
+Below is the copyright agreement for the Ptolemy II system.
+Version: $Id: copyright.txt 57469 2010-03-10 22:04:46Z cxh $
+
+Copyright (c) 1995-2010 The Regents of the University of California.
+All rights reserved.
+
+Permission is hereby granted, without written agreement and without
+license or royalty fees, to use, copy, modify, and distribute this
+software and its documentation for any purpose, provided that the above
+copyright notice and the following two paragraphs appear in all copies
+of this software.
+
+IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
+THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
+
+THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE
+PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF
+CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+ENHANCEMENTS, OR MODIFICATIONS.
+
+Ptolemy II includes the work of others, to see those copyrights, follow
+the copyright link on the splash page or see copyright.htm.
diff --git a/lib/bidmat_init.scala b/lib/bidmat_init.scala
new file mode 100755
index 00000000..94a40ca5
--- /dev/null
+++ b/lib/bidmat_init.scala
@@ -0,0 +1,7 @@
+import BIDMat.{Mat, FMat, DMat, IMat, CMat, BMat, CSMat, SMat, SDMat, GMat, GIMat, GSMat, HMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+import BIDMat.Solvers._
+import BIDMat.Plotting._
+
+Mat.checkCUDA  // runs when this init script is loaded; NOTE(review): presumably probes for a usable CUDA/JCuda setup - confirm in BIDMat.Mat
diff --git a/lib/jcublas-0.4.2.jar b/lib/jcublas-0.4.2.jar
new file mode 100644
index 00000000..8183047a
Binary files /dev/null and b/lib/jcublas-0.4.2.jar differ
diff --git a/lib/jcublas-0.5.0RC.jar b/lib/jcublas-0.5.0RC.jar
new file mode 100755
index 00000000..139ed978
Binary files /dev/null and b/lib/jcublas-0.5.0RC.jar differ
diff --git a/lib/jcuda-0.4.2.jar b/lib/jcuda-0.4.2.jar
new file mode 100644
index 00000000..6b73cf5b
Binary files /dev/null and b/lib/jcuda-0.4.2.jar differ
diff --git a/lib/jcuda-0.5.0RC.jar b/lib/jcuda-0.5.0RC.jar
new file mode 100755
index 00000000..b0e89f1a
Binary files /dev/null and b/lib/jcuda-0.5.0RC.jar differ
diff --git a/lib/jcudpp-0.4.2.jar b/lib/jcudpp-0.4.2.jar
new file mode 100644
index 00000000..3c329bf0
Binary files /dev/null and b/lib/jcudpp-0.4.2.jar differ
diff --git a/lib/jcufft-0.4.2.jar b/lib/jcufft-0.4.2.jar
new file mode 100644
index 00000000..4f55ba01
Binary files /dev/null and b/lib/jcufft-0.4.2.jar differ
diff --git a/lib/jcufft-0.5.0RC.jar b/lib/jcufft-0.5.0RC.jar
new file mode 100755
index 00000000..5a26a2e4
Binary files /dev/null and b/lib/jcufft-0.5.0RC.jar differ
diff --git a/lib/jcurand-0.4.2.jar b/lib/jcurand-0.4.2.jar
new file mode 100644
index 00000000..69b674ca
Binary files /dev/null and b/lib/jcurand-0.4.2.jar differ
diff --git a/lib/jcurand-0.5.0RC.jar b/lib/jcurand-0.5.0RC.jar
new file mode 100755
index 00000000..1399969d
Binary files /dev/null and b/lib/jcurand-0.5.0RC.jar differ
diff --git a/lib/jcusparse-0.4.2.jar b/lib/jcusparse-0.4.2.jar
new file mode 100644
index 00000000..c58917db
Binary files /dev/null and b/lib/jcusparse-0.4.2.jar differ
diff --git a/lib/jcusparse-0.5.0RC.jar b/lib/jcusparse-0.5.0RC.jar
new file mode 100755
index 00000000..80be5937
Binary files /dev/null and b/lib/jcusparse-0.5.0RC.jar differ
diff --git a/lib/jhdf5.jar b/lib/jhdf5.jar
new file mode 100644
index 00000000..9d15b7d1
Binary files /dev/null and b/lib/jhdf5.jar differ
diff --git a/lib/linux64/HDF5_Copyright.html b/lib/linux64/HDF5_Copyright.html
new file mode 100755
index 00000000..07a71f45
--- /dev/null
+++ b/lib/linux64/HDF5_Copyright.html
@@ -0,0 +1,160 @@
+<html><head>
+<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
+    <title>
+      HDF5 Copyright Notice and License Terms
+    </title>
+  </head>
+
+<body bgcolor="#FFFFFF">
+<!-- NEW PAGE -->
+
+
+<hr>               
+
+<h3>Copyright Notice and License Terms for
+<br>
+HDF5 (Hierarchical Data Format 5) Software Library and Utilities</h3>
+<hr>               
+<p>
+
+
+HDF5 (Hierarchical Data Format 5) Software Library and Utilities
+<br>
+Copyright 2006-2012 by The HDF Group.
+</p><p>
+NCSA HDF5 (Hierarchical Data Format 5) Software Library and Utilities
+<br>
+Copyright 1998-2006 by the Board of Trustees of the University of Illinois.
+</p><p>
+<strong>All rights reserved.</strong>
+</p><p>
+
+</p><p>
+Redistribution and use in source and binary forms, with or without 
+modification, are permitted for any purpose (including commercial purposes) 
+provided that the following conditions are met:
+
+</p><p>
+</p><ol>
+<li>
+Redistributions of source code must retain the above copyright notice, 
+this list of conditions, and the following disclaimer.
+
+</li><li>
+Redistributions in binary form must reproduce the above copyright notice, 
+this list of conditions, and the following disclaimer in the documentation 
+and/or materials provided with the distribution.
+
+</li><li>
+In addition, redistributions of modified forms of the source or binary code 
+must carry prominent notices stating that the original code was changed and 
+the date of the change.
+
+</li><li>
+All publications or advertising materials mentioning features or use of this 
+software are asked, but not required, to acknowledge that it was developed 
+by The HDF Group and by the National Center for Supercomputing Applications 
+at the University of Illinois at Urbana-Champaign and credit the contributors.
+
+</li><li>
+Neither the name of The HDF Group, the name of the University, nor the name 
+of any Contributor may be used to endorse or promote products derived from 
+this software without specific prior written permission from The HDF Group, 
+the University, or the Contributor, respectively.
+</li></ol>
+
+<p>
+<b>DISCLAIMER:</b>
+THIS SOFTWARE IS PROVIDED BY THE HDF GROUP AND THE CONTRIBUTORS 
+"AS IS" WITH NO WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED.  
+In no event shall The HDF Group or the Contributors be liable for any damages 
+suffered by the users arising out of the use of this software, even if advised 
+of the possibility of such damage. 
+
+
+</p><hr>
+<hr>
+
+<p>
+Contributors:   National Center for Supercomputing Applications  (NCSA) at 
+the University of Illinois, Fortner Software, Unidata Program Center (netCDF), 
+The Independent JPEG Group (JPEG), Jean-loup Gailly and Mark Adler (gzip), 
+and Digital Equipment Corporation (DEC).
+
+</p><hr>
+
+<p>
+Portions of HDF5 were developed with support from the Lawrence Berkeley 
+National Laboratory (LBNL) and the United States Department of Energy 
+under Prime Contract No. DE-AC02-05CH11231.
+
+</p><hr>
+
+<p>
+Portions of HDF5 were developed with support from the University of 
+California, Lawrence Livermore National Laboratory (UC LLNL). 
+The following statement applies to those portions of the product and must 
+be retained in any redistribution of source code, binaries, documentation, 
+and/or accompanying materials:
+</p><dir>
+        This work was partially produced at the University of California, 
+        Lawrence Livermore National Laboratory (UC LLNL) under contract 
+        no. W-7405-ENG-48 (Contract 48) between the U.S. Department of 
+        Energy (DOE) and The Regents of the University of California 
+        (University) for the operation of UC LLNL.
+	<p>
+	<b>DISCLAIMER:</b>
+        This work was prepared as an account of work sponsored by an agency 
+        of the United States Government. Neither the United States Government 
+        nor the University of California nor any of their employees, makes 
+        any warranty, express or implied, or assumes any liability or 
+        responsibility for the accuracy, completeness, or usefulness of any 
+        information, apparatus, product, or process disclosed, or represents 
+        that its use would not infringe privately- owned rights. Reference 
+        herein to any specific commercial products, process, or service by 
+        trade name, trademark, manufacturer, or otherwise, does not 
+        necessarily constitute or imply its endorsement, recommendation, or 
+        favoring by the United States Government or the University of 
+        California. The views and opinions of authors expressed herein do not 
+        necessarily state or reflect those of the United States Government or 
+        the University of California, and shall not be used for advertising 
+        or product endorsement purposes.
+</p></dir>
+
+<hr>
+
+<!--    DO NOT EDIT THE FOLLOWING 8 LINES;                           -->
+<!--    THEY ARE AUTOMATICALLY UPDATED BY DOCUMENTATION SOFTWARE.    -->
+
+<!-- #BeginLibraryItem "/ed_libs/Footer.lbi" -->
+<address>
+<table border="0" width="100%">
+  <tbody><tr valign="top">
+      <td align="left">
+          <address>
+          The HDF Group Help Desk: <img src="HDF5_help.png" align="top" height="16">
+          <br>
+          Describes HDF5 Release 1.8.9, May 2012.
+          </address>
+      </td><td width="5%">&nbsp;</td>
+      <td align="right">
+          <a href="http://www.hdfgroup.org/HDF5/doc/Copyright.html">Copyright</a> by
+          <a href="http://www.hdfgroup.org/">The HDF Group</a>
+          <br>
+          and the Board of Trustees of the University of Illinois
+      </td>   
+  </tr>   
+</tbody></table>
+</address>
+<!-- #EndLibraryItem --><script language="JAVASCRIPT">
+<!--
+document.writeln("Last modified: 5 March 2012")
+-->
+</script>Last modified: 5 March 2012
+
+
+
+
+
+
+</body></html>
diff --git a/lib/linux64/JCUDA4.2/libJCublas-linux-x86_64.so b/lib/linux64/JCUDA4.2/libJCublas-linux-x86_64.so
new file mode 100755
index 00000000..50637794
Binary files /dev/null and b/lib/linux64/JCUDA4.2/libJCublas-linux-x86_64.so differ
diff --git a/lib/linux64/JCUDA4.2/libJCublas2-linux-x86_64.so b/lib/linux64/JCUDA4.2/libJCublas2-linux-x86_64.so
new file mode 100755
index 00000000..ff0797ba
Binary files /dev/null and b/lib/linux64/JCUDA4.2/libJCublas2-linux-x86_64.so differ
diff --git a/lib/linux64/JCUDA4.2/libJCudaDriver-linux-x86_64.so b/lib/linux64/JCUDA4.2/libJCudaDriver-linux-x86_64.so
new file mode 100755
index 00000000..d197de2d
Binary files /dev/null and b/lib/linux64/JCUDA4.2/libJCudaDriver-linux-x86_64.so differ
diff --git a/lib/linux64/JCUDA4.2/libJCudaRuntime-linux-x86_64.so b/lib/linux64/JCUDA4.2/libJCudaRuntime-linux-x86_64.so
new file mode 100755
index 00000000..9a5aa6a0
Binary files /dev/null and b/lib/linux64/JCUDA4.2/libJCudaRuntime-linux-x86_64.so differ
diff --git a/lib/linux64/JCUDA4.2/libJCufft-linux-x86_64.so b/lib/linux64/JCUDA4.2/libJCufft-linux-x86_64.so
new file mode 100755
index 00000000..750b0f6b
Binary files /dev/null and b/lib/linux64/JCUDA4.2/libJCufft-linux-x86_64.so differ
diff --git a/lib/linux64/JCUDA4.2/libJCurand-linux-x86_64.so b/lib/linux64/JCUDA4.2/libJCurand-linux-x86_64.so
new file mode 100755
index 00000000..5db8c4ed
Binary files /dev/null and b/lib/linux64/JCUDA4.2/libJCurand-linux-x86_64.so differ
diff --git a/lib/linux64/JCUDA4.2/libJCusparse-linux-x86_64.so b/lib/linux64/JCUDA4.2/libJCusparse-linux-x86_64.so
new file mode 100755
index 00000000..215ebae6
Binary files /dev/null and b/lib/linux64/JCUDA4.2/libJCusparse-linux-x86_64.so differ
diff --git a/lib/linux64/JCUDA4.2/libJCusparse2-linux-x86_64.so b/lib/linux64/JCUDA4.2/libJCusparse2-linux-x86_64.so
new file mode 100755
index 00000000..b20485ce
Binary files /dev/null and b/lib/linux64/JCUDA4.2/libJCusparse2-linux-x86_64.so differ
diff --git a/lib/linux64/JCUDA4.2/libbidmatcuda.so b/lib/linux64/JCUDA4.2/libbidmatcuda.so
new file mode 100755
index 00000000..1b2c0e0b
Binary files /dev/null and b/lib/linux64/JCUDA4.2/libbidmatcuda.so differ
diff --git a/lib/linux64/JCUDA5.0/libJCublas-linux-x86_64.so b/lib/linux64/JCUDA5.0/libJCublas-linux-x86_64.so
new file mode 100755
index 00000000..cf3aeb39
Binary files /dev/null and b/lib/linux64/JCUDA5.0/libJCublas-linux-x86_64.so differ
diff --git a/lib/linux64/JCUDA5.0/libJCublas2-linux-x86_64.so b/lib/linux64/JCUDA5.0/libJCublas2-linux-x86_64.so
new file mode 100755
index 00000000..11d1ee13
Binary files /dev/null and b/lib/linux64/JCUDA5.0/libJCublas2-linux-x86_64.so differ
diff --git a/lib/linux64/JCUDA5.0/libJCudaDriver-linux-x86_64.so b/lib/linux64/JCUDA5.0/libJCudaDriver-linux-x86_64.so
new file mode 100755
index 00000000..bd6dfa53
Binary files /dev/null and b/lib/linux64/JCUDA5.0/libJCudaDriver-linux-x86_64.so differ
diff --git a/lib/linux64/JCUDA5.0/libJCudaRuntime-linux-x86_64.so b/lib/linux64/JCUDA5.0/libJCudaRuntime-linux-x86_64.so
new file mode 100755
index 00000000..6bfbdbcf
Binary files /dev/null and b/lib/linux64/JCUDA5.0/libJCudaRuntime-linux-x86_64.so differ
diff --git a/lib/linux64/JCUDA5.0/libJCufft-linux-x86_64.so b/lib/linux64/JCUDA5.0/libJCufft-linux-x86_64.so
new file mode 100755
index 00000000..90499083
Binary files /dev/null and b/lib/linux64/JCUDA5.0/libJCufft-linux-x86_64.so differ
diff --git a/lib/linux64/JCUDA5.0/libJCurand-linux-x86_64.so b/lib/linux64/JCUDA5.0/libJCurand-linux-x86_64.so
new file mode 100755
index 00000000..396e9274
Binary files /dev/null and b/lib/linux64/JCUDA5.0/libJCurand-linux-x86_64.so differ
diff --git a/lib/linux64/JCUDA5.0/libJCusparse-linux-x86_64.so b/lib/linux64/JCUDA5.0/libJCusparse-linux-x86_64.so
new file mode 100755
index 00000000..d6b8b827
Binary files /dev/null and b/lib/linux64/JCUDA5.0/libJCusparse-linux-x86_64.so differ
diff --git a/lib/linux64/JCUDA5.0/libJCusparse2-linux-x86_64.so b/lib/linux64/JCUDA5.0/libJCusparse2-linux-x86_64.so
new file mode 100755
index 00000000..917b3bba
Binary files /dev/null and b/lib/linux64/JCUDA5.0/libJCusparse2-linux-x86_64.so differ
diff --git a/lib/linux64/JCUDA5.0/libbidmatcuda.so b/lib/linux64/JCUDA5.0/libbidmatcuda.so
new file mode 100755
index 00000000..cfc001f0
Binary files /dev/null and b/lib/linux64/JCUDA5.0/libbidmatcuda.so differ
diff --git a/lib/linux64/JCUDA_Copyright.txt b/lib/linux64/JCUDA_Copyright.txt
new file mode 100755
index 00000000..a47ba681
--- /dev/null
+++ b/lib/linux64/JCUDA_Copyright.txt
@@ -0,0 +1,24 @@
+JCuda - Java bindings for NVIDIA CUDA
+
+Copyright (c) 2008-2012 Marco Hutter - http://www.jcuda.org
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
diff --git a/lib/linux64/libbidmatmkl.so b/lib/linux64/libbidmatmkl.so
new file mode 100755
index 00000000..c9c0e293
Binary files /dev/null and b/lib/linux64/libbidmatmkl.so differ
diff --git a/lib/linux64/libhdf4.settings b/lib/linux64/libhdf4.settings
new file mode 100644
index 00000000..6f6e7cb6
--- /dev/null
+++ b/lib/linux64/libhdf4.settings
@@ -0,0 +1,35 @@
+	    SUMMARY OF THE HDF4 CONFIGURATION
+	    =================================
+
+General Information:
+-------------------
+		   HDF4 Version: 4.2.6-post2
+		  Configured on: Tue Dec 13 17:07:13 CST 2011
+		  Configured by: hdftest@koala
+		 Configure mode: production
+		    Host system: x86_64-unknown-linux-gnu
+              Uname information: Linux koala 2.6.18-274.12.1.el5 #1 SMP Tue Nov 29 13:37:46 EST 2011 x86_64 x86_64 x86_64 GNU/Linux
+		      Libraries: 
+	     Installation point: /mnt/scr1/pre-release/hdf4/vdev/koalajava
+
+Compiling Options:
+------------------
+               Compilation Mode: production
+                     C compiler: /usr/bin/gcc ( gcc (GCC) 4.1.2 20080704 )
+                         CFLAGS: -fPIC -O3 -fomit-frame-pointer
+                       CPPFLAGS: -I/usr/include/rpc  -I/mnt/hdf/packages/jpeg-PIC/Linux2.6-x86_64-gcc/include -I/mnt/hdf/packages/szip/static/encoder/Linux2.6-x86_64-gcc/include -DBIG_LONGS -DSWAP 
+               Shared Libraries: no
+               Static Libraries: yes
+                        LDFLAGS:  -L/mnt/hdf/packages/jpeg-PIC/Linux2.6-x86_64-gcc/lib -L/mnt/hdf/packages/szip/static/encoder/Linux2.6-x86_64-gcc/lib
+ 	 	Extra libraries: -lsz -ljpeg -lz  -lm
+ 		       Archiver: ar
+ 		 	 Ranlib: ranlib
+
+Languages:
+----------
+                        Fortran: no
+
+Features:
+---------
+               SZIP compression: enabled with encoder
+   Support for netCDF API 2.3.2: yes
diff --git a/lib/linux64/libhdf5.settings b/lib/linux64/libhdf5.settings
new file mode 100644
index 00000000..afaae7fa
--- /dev/null
+++ b/lib/linux64/libhdf5.settings
@@ -0,0 +1,62 @@
+	    SUMMARY OF THE HDF5 CONFIGURATION
+	    =================================
+
+General Information:
+-------------------
+		   HDF5 Version: 1.8.8
+		  Configured on: Wed Nov 16 17:48:07 CST 2011
+		  Configured by: hdftest@koala
+		 Configure mode: production
+		    Host system: x86_64-unknown-linux-gnu
+	      Uname information: Linux koala 2.6.18-274.7.1.el5 #1 SMP Thu Oct 20 16:21:01 EDT 2011 x86_64 x86_64 x86_64 GNU/Linux
+		       Byte sex: little-endian
+		      Libraries: 
+	     Installation point: /mnt/scr1/pre-release/hdf5/v188/koalajava
+
+Compiling Options:
+------------------
+               Compilation Mode: production
+                     C Compiler: /usr/bin/ gcc -fPIC ( gcc (GCC) 4.1.2 20080704 )
+                         CFLAGS: 
+                      H5_CFLAGS: -std=c99 -pedantic -Wall -Wextra -Wundef -Wshadow -Wpointer-arith -Wbad-function-cast -Wcast-qual -Wcast-align -Wwrite-strings -Wconversion -Waggregate-return -Wstrict-prototypes -Wmissing-prototypes -Wmissing-declarations -Wredundant-decls -Wnested-externs -Winline -Wno-long-long -Wfloat-equal -Wmissing-format-attribute -Wmissing-noreturn -Wpacked -Wdisabled-optimization -Wformat=2 -Wunreachable-code -Wendif-labels -Wdeclaration-after-statement -Wold-style-definition -Winvalid-pch -Wvariadic-macros -Wnonnull -Winit-self -Wmissing-include-dirs -Wswitch-default -Wswitch-enum -Wunused-macros -Wunsafe-loop-optimizations -Wc++-compat -Wvolatile-register-var -O3 -fomit-frame-pointer -finline-functions
+                      AM_CFLAGS: 
+                       CPPFLAGS: 
+                    H5_CPPFLAGS: -D_POSIX_C_SOURCE=199506L   -DNDEBUG -UH5_DEBUG_API
+                    AM_CPPFLAGS: -I/mnt/hdf/packages/szip-PIC/static/encoder/Linux2.6-x86_64-gcc/include -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_BSD_SOURCE 
+               Shared C Library: no
+               Static C Library: yes
+  Statically Linked Executables: no
+                        LDFLAGS: 
+                     H5_LDFLAGS: 
+                     AM_LDFLAGS:  -L/mnt/hdf/packages/szip-PIC/static/encoder/Linux2.6-x86_64-gcc/lib
+ 	 	Extra libraries:  -lsz -lz -lrt -lm 
+ 		       Archiver: ar
+ 		 	 Ranlib: ranlib
+ 	      Debugged Packages: 
+		    API Tracing: no
+
+Languages:
+----------
+                        Fortran: no
+
+                            C++: no
+
+Features:
+---------
+                  Parallel HDF5: no
+             High Level library: yes
+                   Threadsafety: no
+            Default API Mapping: v18
+ With Deprecated Public Symbols: yes
+         I/O filters (external): deflate(zlib),szip(encoder)
+         I/O filters (internal): shuffle,fletcher32,nbit,scaleoffset
+                            MPE: no
+                     Direct VFD: no
+                        dmalloc: no
+Clear file buffers before write: yes
+           Using memory checker: no
+         Function Stack Tracing: no
+                           GPFS: no
+      Strict File Format Checks: no
+   Optimization Instrumentation: no
+       Large File Support (LFS): yes
diff --git a/lib/linux64/libiomp5.so b/lib/linux64/libiomp5.so
new file mode 100755
index 00000000..3b9e7257
Binary files /dev/null and b/lib/linux64/libiomp5.so differ
diff --git a/lib/linux64/libjhdf.so b/lib/linux64/libjhdf.so
new file mode 100755
index 00000000..ff6304c3
Binary files /dev/null and b/lib/linux64/libjhdf.so differ
diff --git a/lib/linux64/libjhdf5.so b/lib/linux64/libjhdf5.so
new file mode 100755
index 00000000..c3dcb2d3
Binary files /dev/null and b/lib/linux64/libjhdf5.so differ
diff --git a/lib/ptplot.jar b/lib/ptplot.jar
new file mode 100644
index 00000000..9582f1cb
Binary files /dev/null and b/lib/ptplot.jar differ
diff --git a/lib/ptplotapplication.jar b/lib/ptplotapplication.jar
new file mode 100755
index 00000000..cc32dd0c
Binary files /dev/null and b/lib/ptplotapplication.jar differ
diff --git a/lib/win64/HDF5_Copyright.html b/lib/win64/HDF5_Copyright.html
new file mode 100755
index 00000000..07a71f45
--- /dev/null
+++ b/lib/win64/HDF5_Copyright.html
@@ -0,0 +1,160 @@
+<html><head>
+<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
+    <title>
+      HDF5 Copyright Notice and License Terms
+    </title>
+  </head>
+
+<body bgcolor="#FFFFFF">
+<!-- NEW PAGE -->
+
+
+<hr>               
+
+<h3>Copyright Notice and License Terms for
+<br>
+HDF5 (Hierarchical Data Format 5) Software Library and Utilities</h3>
+<hr>               
+<p>
+
+
+HDF5 (Hierarchical Data Format 5) Software Library and Utilities
+<br>
+Copyright 2006-2012 by The HDF Group.
+</p><p>
+NCSA HDF5 (Hierarchical Data Format 5) Software Library and Utilities
+<br>
+Copyright 1998-2006 by the Board of Trustees of the University of Illinois.
+</p><p>
+<strong>All rights reserved.</strong>
+</p><p>
+
+</p><p>
+Redistribution and use in source and binary forms, with or without 
+modification, are permitted for any purpose (including commercial purposes) 
+provided that the following conditions are met:
+
+</p><p>
+</p><ol>
+<li>
+Redistributions of source code must retain the above copyright notice, 
+this list of conditions, and the following disclaimer.
+
+</li><li>
+Redistributions in binary form must reproduce the above copyright notice, 
+this list of conditions, and the following disclaimer in the documentation 
+and/or materials provided with the distribution.
+
+</li><li>
+In addition, redistributions of modified forms of the source or binary code 
+must carry prominent notices stating that the original code was changed and 
+the date of the change.
+
+</li><li>
+All publications or advertising materials mentioning features or use of this 
+software are asked, but not required, to acknowledge that it was developed 
+by The HDF Group and by the National Center for Supercomputing Applications 
+at the University of Illinois at Urbana-Champaign and credit the contributors.
+
+</li><li>
+Neither the name of The HDF Group, the name of the University, nor the name 
+of any Contributor may be used to endorse or promote products derived from 
+this software without specific prior written permission from The HDF Group, 
+the University, or the Contributor, respectively.
+</li></ol>
+
+<p>
+<b>DISCLAIMER:</b>
+THIS SOFTWARE IS PROVIDED BY THE HDF GROUP AND THE CONTRIBUTORS 
+"AS IS" WITH NO WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED.  
+In no event shall The HDF Group or the Contributors be liable for any damages 
+suffered by the users arising out of the use of this software, even if advised 
+of the possibility of such damage. 
+
+
+</p><hr>
+<hr>
+
+<p>
+Contributors:   National Center for Supercomputing Applications  (NCSA) at 
+the University of Illinois, Fortner Software, Unidata Program Center (netCDF), 
+The Independent JPEG Group (JPEG), Jean-loup Gailly and Mark Adler (gzip), 
+and Digital Equipment Corporation (DEC).
+
+</p><hr>
+
+<p>
+Portions of HDF5 were developed with support from the Lawrence Berkeley 
+National Laboratory (LBNL) and the United States Department of Energy 
+under Prime Contract No. DE-AC02-05CH11231.
+
+</p><hr>
+
+<p>
+Portions of HDF5 were developed with support from the University of 
+California, Lawrence Livermore National Laboratory (UC LLNL). 
+The following statement applies to those portions of the product and must 
+be retained in any redistribution of source code, binaries, documentation, 
+and/or accompanying materials:
+</p><dir>
+        This work was partially produced at the University of California, 
+        Lawrence Livermore National Laboratory (UC LLNL) under contract 
+        no. W-7405-ENG-48 (Contract 48) between the U.S. Department of 
+        Energy (DOE) and The Regents of the University of California 
+        (University) for the operation of UC LLNL.
+	<p>
+	<b>DISCLAIMER:</b>
+        This work was prepared as an account of work sponsored by an agency 
+        of the United States Government. Neither the United States Government 
+        nor the University of California nor any of their employees, makes 
+        any warranty, express or implied, or assumes any liability or 
+        responsibility for the accuracy, completeness, or usefulness of any 
+        information, apparatus, product, or process disclosed, or represents 
+        that its use would not infringe privately- owned rights. Reference 
+        herein to any specific commercial products, process, or service by 
+        trade name, trademark, manufacturer, or otherwise, does not 
+        necessarily constitute or imply its endorsement, recommendation, or 
+        favoring by the United States Government or the University of 
+        California. The views and opinions of authors expressed herein do not 
+        necessarily state or reflect those of the United States Government or 
+        the University of California, and shall not be used for advertising 
+        or product endorsement purposes.
+</p></dir>
+
+<hr>
+
+<!--    DO NOT EDIT THE FOLLOWING 8 LINES;                           -->
+<!--    THEY ARE AUTOMATICALLY UPDATED BY DOCUMENTATION SOFTWARE.    -->
+
+<!-- #BeginLibraryItem "/ed_libs/Footer.lbi" -->
+<address>
+<table border="0" width="100%">
+  <tbody><tr valign="top">
+      <td align="left">
+          <address>
+          The HDF Group Help Desk: <img src="HDF5_help.png" align="top" height="16">
+          <br>
+          Describes HDF5 Release 1.8.9, May 2012.
+          </address>
+      </td><td width="5%">&nbsp;</td>
+      <td align="right">
+          <a href="http://www.hdfgroup.org/HDF5/doc/Copyright.html">Copyright</a> by
+          <a href="http://www.hdfgroup.org/">The HDF Group</a>
+          <br>
+          and the Board of Trustees of the University of Illinois
+      </td>   
+  </tr>   
+</tbody></table>
+</address>
+<!-- #EndLibraryItem --><script language="JAVASCRIPT">
+<!--
+document.writeln("Last modified: 5 March 2012")
+-->
+</script>Last modified: 5 March 2012
+
+
+
+
+
+
+</body></html>
diff --git a/lib/win64/JCUDA4.2/JCublas-windows-x86_64.dll b/lib/win64/JCUDA4.2/JCublas-windows-x86_64.dll
new file mode 100755
index 00000000..812bf249
Binary files /dev/null and b/lib/win64/JCUDA4.2/JCublas-windows-x86_64.dll differ
diff --git a/lib/win64/JCUDA4.2/JCublas2-windows-x86_64.dll b/lib/win64/JCUDA4.2/JCublas2-windows-x86_64.dll
new file mode 100755
index 00000000..66d70142
Binary files /dev/null and b/lib/win64/JCUDA4.2/JCublas2-windows-x86_64.dll differ
diff --git a/lib/win64/JCUDA4.2/JCudaDriver-windows-x86_64.dll b/lib/win64/JCUDA4.2/JCudaDriver-windows-x86_64.dll
new file mode 100755
index 00000000..5993832c
Binary files /dev/null and b/lib/win64/JCUDA4.2/JCudaDriver-windows-x86_64.dll differ
diff --git a/lib/win64/JCUDA4.2/JCudaRuntime-windows-x86_64.dll b/lib/win64/JCUDA4.2/JCudaRuntime-windows-x86_64.dll
new file mode 100755
index 00000000..3aba265f
Binary files /dev/null and b/lib/win64/JCUDA4.2/JCudaRuntime-windows-x86_64.dll differ
diff --git a/lib/win64/JCUDA4.2/JCufft-windows-x86_64.dll b/lib/win64/JCUDA4.2/JCufft-windows-x86_64.dll
new file mode 100755
index 00000000..7fbad0dd
Binary files /dev/null and b/lib/win64/JCUDA4.2/JCufft-windows-x86_64.dll differ
diff --git a/lib/win64/JCUDA4.2/JCurand-windows-x86_64.dll b/lib/win64/JCUDA4.2/JCurand-windows-x86_64.dll
new file mode 100755
index 00000000..f189d062
Binary files /dev/null and b/lib/win64/JCUDA4.2/JCurand-windows-x86_64.dll differ
diff --git a/lib/win64/JCUDA4.2/JCusparse-windows-x86_64.dll b/lib/win64/JCUDA4.2/JCusparse-windows-x86_64.dll
new file mode 100755
index 00000000..0f483793
Binary files /dev/null and b/lib/win64/JCUDA4.2/JCusparse-windows-x86_64.dll differ
diff --git a/lib/win64/JCUDA4.2/JCusparse2-windows-x86_64.dll b/lib/win64/JCUDA4.2/JCusparse2-windows-x86_64.dll
new file mode 100755
index 00000000..5b66a121
Binary files /dev/null and b/lib/win64/JCUDA4.2/JCusparse2-windows-x86_64.dll differ
diff --git a/lib/win64/JCUDA4.2/bidmatcuda.dll b/lib/win64/JCUDA4.2/bidmatcuda.dll
new file mode 100755
index 00000000..7a506749
Binary files /dev/null and b/lib/win64/JCUDA4.2/bidmatcuda.dll differ
diff --git a/lib/win64/JCUDA5.0/JCublas-windows-x86_64.dll b/lib/win64/JCUDA5.0/JCublas-windows-x86_64.dll
new file mode 100755
index 00000000..e8f812aa
Binary files /dev/null and b/lib/win64/JCUDA5.0/JCublas-windows-x86_64.dll differ
diff --git a/lib/win64/JCUDA5.0/JCublas2-windows-x86_64.dll b/lib/win64/JCUDA5.0/JCublas2-windows-x86_64.dll
new file mode 100755
index 00000000..bb0cff7b
Binary files /dev/null and b/lib/win64/JCUDA5.0/JCublas2-windows-x86_64.dll differ
diff --git a/lib/win64/JCUDA5.0/JCudaDriver-windows-x86_64.dll b/lib/win64/JCUDA5.0/JCudaDriver-windows-x86_64.dll
new file mode 100755
index 00000000..cc72206e
Binary files /dev/null and b/lib/win64/JCUDA5.0/JCudaDriver-windows-x86_64.dll differ
diff --git a/lib/win64/JCUDA5.0/JCudaRuntime-windows-x86_64.dll b/lib/win64/JCUDA5.0/JCudaRuntime-windows-x86_64.dll
new file mode 100755
index 00000000..35715c0d
Binary files /dev/null and b/lib/win64/JCUDA5.0/JCudaRuntime-windows-x86_64.dll differ
diff --git a/lib/win64/JCUDA5.0/JCufft-windows-x86_64.dll b/lib/win64/JCUDA5.0/JCufft-windows-x86_64.dll
new file mode 100755
index 00000000..060f337a
Binary files /dev/null and b/lib/win64/JCUDA5.0/JCufft-windows-x86_64.dll differ
diff --git a/lib/win64/JCUDA5.0/JCurand-windows-x86_64.dll b/lib/win64/JCUDA5.0/JCurand-windows-x86_64.dll
new file mode 100755
index 00000000..f248ec61
Binary files /dev/null and b/lib/win64/JCUDA5.0/JCurand-windows-x86_64.dll differ
diff --git a/lib/win64/JCUDA5.0/JCusparse-windows-x86_64.dll b/lib/win64/JCUDA5.0/JCusparse-windows-x86_64.dll
new file mode 100755
index 00000000..5ecef03b
Binary files /dev/null and b/lib/win64/JCUDA5.0/JCusparse-windows-x86_64.dll differ
diff --git a/lib/win64/JCUDA5.0/JCusparse2-windows-x86_64.dll b/lib/win64/JCUDA5.0/JCusparse2-windows-x86_64.dll
new file mode 100755
index 00000000..854747cf
Binary files /dev/null and b/lib/win64/JCUDA5.0/JCusparse2-windows-x86_64.dll differ
diff --git a/lib/win64/JCUDA5.0/bidmatcuda.dll b/lib/win64/JCUDA5.0/bidmatcuda.dll
new file mode 100755
index 00000000..37ce271e
Binary files /dev/null and b/lib/win64/JCUDA5.0/bidmatcuda.dll differ
diff --git a/lib/win64/JCUDA_Copyright.txt b/lib/win64/JCUDA_Copyright.txt
new file mode 100755
index 00000000..a47ba681
--- /dev/null
+++ b/lib/win64/JCUDA_Copyright.txt
@@ -0,0 +1,24 @@
+JCuda - Java bindings for NVIDIA CUDA
+
+Copyright (c) 2008-2012 Marco Hutter - http://www.jcuda.org
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
diff --git a/lib/win64/bidmatmkl.dll b/lib/win64/bidmatmkl.dll
new file mode 100755
index 00000000..f616fe52
Binary files /dev/null and b/lib/win64/bidmatmkl.dll differ
diff --git a/lib/win64/jhdf.dll b/lib/win64/jhdf.dll
new file mode 100755
index 00000000..fd03758e
Binary files /dev/null and b/lib/win64/jhdf.dll differ
diff --git a/lib/win64/jhdf5.dll b/lib/win64/jhdf5.dll
new file mode 100755
index 00000000..3d47abfb
Binary files /dev/null and b/lib/win64/jhdf5.dll differ
diff --git a/lib/win64/libiomp5md.dll b/lib/win64/libiomp5md.dll
new file mode 100755
index 00000000..faf9ebcc
Binary files /dev/null and b/lib/win64/libiomp5md.dll differ
diff --git a/project/plugins.sbt b/project/plugins.sbt
new file mode 100755
index 00000000..bf5cb709
--- /dev/null
+++ b/project/plugins.sbt
@@ -0,0 +1,7 @@
+
+libraryDependencies <+= sbtVersion(v => "com.github.siasia" %% "xsbt-proguard-plugin" % (v+"-0.1.1")) // xsbt-proguard-plugin; its version string is the running sbt version plus "-0.1.1"
+
+resolvers += "Proguard plugin repo" at "http://siasia.github.com/maven2" // extra resolver; presumably where the plugin above is published (verify the URL still resolves)
+
+
+
diff --git a/src/main/java/edu/berkeley/bid/CBLAS.java b/src/main/java/edu/berkeley/bid/CBLAS.java
new file mode 100755
index 00000000..fba33ec2
--- /dev/null
+++ b/src/main/java/edu/berkeley/bid/CBLAS.java
@@ -0,0 +1,73 @@
+package edu.berkeley.bid;
+/** Static JNI entry points for dense BLAS-style kernels, plus the integer codes passed as their order/transpose/uplo/diag/side arguments. */
+public final class CBLAS {
+
+    private CBLAS() {}  // static-only holder: never instantiated
+    // Runs once when the class is initialized; loads the native library "bidmatmkl" by name.
+    static {
+        System.loadLibrary("bidmatmkl");
+    }
+    /** Storage-order codes; the numbering matches the CBLAS_ORDER enum of the reference cblas.h. */
+    public final static class ORDER {
+        private ORDER() {}
+        public final static int RowMajor=101;
+        public final static int ColMajor=102;
+    }
+    /** Transpose codes; the numbering matches the CBLAS_TRANSPOSE enum of the reference cblas.h. */
+    public final static class TRANSPOSE {
+        private TRANSPOSE() {}
+        public final static int NoTrans  =111;
+        public final static int Trans    =112;
+        public final static int ConjTrans=113;
+    }
+    /** Upper/lower triangle codes; the numbering matches the CBLAS_UPLO enum of the reference cblas.h. */
+    public final static class UPLO {
+        private UPLO() {}
+        public final static int Upper=121;
+        public final static int Lower=122;
+    }
+    /** Unit/non-unit diagonal codes; the numbering matches the CBLAS_DIAG enum of the reference cblas.h. */
+    public final static class DIAG {
+        private DIAG() {}
+        public final static int NonUnit=131;
+        public final static int Unit   =132;
+    }
+    /** Left/right side codes; the numbering matches the CBLAS_SIDE enum of the reference cblas.h. */
+    public final static class SIDE {
+        private SIDE() {}
+        public final static int Left =141;
+        public final static int Right=142;
+    }
+  // Double-precision routines. The "xx" variants take startX/startY where the plain ones take incX/incY (presumably array offsets — confirm). NOTE(review): BLAS axpy conventionally returns nothing; what the double returned by the axpy bindings means is not visible here — confirm in the native source.
+  public static native  double ddot( int N,  double []  X,  int incX,  double []  Y,  int incY);
+  public static native  double ddotxx( int N,  double []  X,  int startX,  double []  Y,  int startY);
+  public static native  double daxpy( int N, double a, double []  X,  int incX,  double []  Y,  int incY);
+  public static native  double daxpyxx( int N, double a, double []  X,  int startX,  double []  Y,  int startY);
+  public static native  void dgemv(  int order,   int TransA,  int M,  int N,  double alpha,  double []  A,  int lda,  
+  		                               double []  X,  int incX,  double beta, double []  Y,  int incY);
+  public static native  void dgemm(  int Order,   int TransA,   int TransB,  int M,  int N,  int K,  double alpha,  
+  		                               double []  A,  int lda,  double []  B,  int ldb,  double beta, double []  C,  int ldc);
+  public static native  void domatcopy( String Order, String TransA, int M, int N, double alpha, double [] A, int lda, double [] B, int ldb); // note: Order/TransA are Strings here, not the int codes used by dgemv/dgemm
+  public static native  void dmcscm( int m, int n, double [] a, int lda, double [] b, int [] ir, int [] jc, double [] c, int ldc); // NOTE(review): ir/jc are presumably sparse index arrays for b — confirm
+  public static native  void dmcsrm( int m, int n, double [] a, int lda, double [] b, int [] ir, int [] jc, double [] c, int ldc);
+  // Single-precision (float[]) versions with the same parameter lists as the double routines above; the axpy bindings still return double.
+  public static native  float sdot( int N,  float []  X,  int incX,  float []  Y,  int incY);
+  public static native  float sdotxx( int N,  float []  X,  int startX,  float []  Y,  int startY);
+  public static native  double saxpy( int N, float a, float []  X,  int incX,  float []  Y,  int incY);
+  public static native  double saxpyxx( int N, float a, float []  X,  int startX,  float []  Y,  int startY);
+  public static native  void sgemv(  int order,   int TransA,  int M,  int N,  float alpha,  float []  A,  int lda,  
+  		                               float []  X,  int incX,  float beta, float []  Y,  int incY);
+  public static native  void sgemm(  int Order,   int TransA,   int TransB,  int M,  int N,  int K,  float alpha,  
+  		                               float []  A,  int lda,  float []  B,  int ldb,  float beta, float []  C,  int ldc);
+  public static native  void somatcopy( String Order, String TransA, int M, int N, float alpha, float [] A, int lda, float [] B, int ldb);
+  // "c"-prefixed routines: the scalars (a, alpha, beta) are float[] rather than float. NOTE(review): presumably complex values stored as float pairs — confirm.
+  public static native  double caxpy( int N, float [] a, float [] X,  int incX,  float []  Y,  int incY);
+  public static native  double caxpyxx( int N, float [] a, float [] X,  int startX,  float []  Y,  int startY);
+  public static native  void cgemv(  int order,   int TransA,  int M,  int N,  float [] alpha,  float [] A,  int lda,  
+  		                               float []  X,  int incX,  float [] beta, float []  Y,  int incY);
+  public static native  void cgemm(  int Order,   int TransA,   int TransB,  int M,  int N,  int K,  float [] alpha,  
+  		                               float []  A,  int lda,  float []  B,  int ldb,  float [] beta, float []  C,  int ldc);
+  // float[] versions of dmcscm/dmcsrm (same parameter lists).
+  public static native  void smcscm( int m, int n, float [] a, int lda, float [] b, int [] ir, int [] jc, float [] c, int ldc);
+  public static native  void smcsrm( int m, int n, float [] a, int lda, float [] b, int [] ir, int [] jc, float [] c, int ldc);
+}
\ No newline at end of file
diff --git a/src/main/java/edu/berkeley/bid/CUMAT.java b/src/main/java/edu/berkeley/bid/CUMAT.java
new file mode 100755
index 00000000..9c8fcdd9
--- /dev/null
+++ b/src/main/java/edu/berkeley/bid/CUMAT.java
@@ -0,0 +1,32 @@
+package edu.berkeley.bid;
+import jcuda.*;
+import jcuda.runtime.*;
+/** Static JNI entry points into the native library "bidmatcuda"; all buffers are passed as Pointer handles (type supplied by the jcuda imports above). */
+public final class CUMAT {
+
+    private CUMAT() {}  // static-only holder: never instantiated
+    // Runs once when the class is initialized; loads the native library "bidmatcuda" by name.
+    static {
+        System.loadLibrary("bidmatcuda");
+    }
+    // Every method below returns an int. NOTE(review): presumably a status/error code from the native side, and "opn" presumably selects the operator or function to apply — confirm both in the native source.
+    public static native int applyop(Pointer A, int Anrows, int Ancols, Pointer B, int Bnrows, int Bncols, Pointer C, int opn);
+
+    public static native int applyiop(Pointer A, int Anrows, int Ancols, Pointer B, int Bnrows, int Bncols, Pointer C, int opn);
+    
+    public static native int applygfun(Pointer A, Pointer B, int N, int opn);
+    
+    public static native int applygfun2(Pointer A, Pointer B, Pointer C, int N, int opn);
+    
+    public static native int reduce1op(int nr, int nc, Pointer A, Pointer B, int opn);
+    
+    public static native int reduce2op(int nr, int nc, Pointer A, Pointer B, int opn);
+    
+    public static native int dsmult(int nr, int nc, int nnz, Pointer A, Pointer Bdata, Pointer Bir, Pointer Bic, Pointer C);
+    
+    public static native int dsmultT(int nr, int nc, int nnz, Pointer A, Pointer Bdata, Pointer Bir, Pointer Bic, Pointer C);
+    
+    public static native int dds(int nr, int nnz, Pointer A, Pointer B, Pointer Cir, Pointer Cic, Pointer P);
+    
+    public static native int transpose(Pointer A, int lda, Pointer B, int ldb, int nr, int nc);
+}
diff --git a/src/main/java/edu/berkeley/bid/Copyright.txt b/src/main/java/edu/berkeley/bid/Copyright.txt
new file mode 100755
index 00000000..21326596
--- /dev/null
+++ b/src/main/java/edu/berkeley/bid/Copyright.txt
@@ -0,0 +1,25 @@
+Copyright (c) 2012, Regents of the University of California
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Regents of the University of California nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE REGENTS OF THE UNIVERSITY OF CALIFORNIA BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/src/main/java/edu/berkeley/bid/LAPACK.java b/src/main/java/edu/berkeley/bid/LAPACK.java
new file mode 100755
index 00000000..0cea7828
--- /dev/null
+++ b/src/main/java/edu/berkeley/bid/LAPACK.java
@@ -0,0 +1,102 @@
+package edu.berkeley.bid;
+/** Static JNI entry points named after LAPACK routines; the s/c name prefixes pair with float[] arguments and the d/z prefixes with double[] arguments. */
+public final class LAPACK {
+
+    private LAPACK() {}  // static-only holder: never instantiated
+    // Runs once when the class is initialized; loads the native library "bidmatmkl" by name.
+    static {
+        System.loadLibrary("bidmatmkl");
+    }
+    // Disabled local copy of the storage-order codes (RowMajor=101, ColMajor=102). NOTE(review): presumably these are the values expected by the order/matrix_order arguments below — confirm.
+/*    public final static class ORDER {
+        private ORDER() {}
+        public final static int RowMajor=101;
+        public final static int ColMajor=102;
+    } */
+  // The group notes below give the standard LAPACK meaning of each routine name; what the bindings actually do is defined by the native library. NOTE(review): the int results are presumably LAPACK "info" status codes — confirm.
+  public static native  int sgetrf( int order, int M, int N, float [] A, int lda, int [] ipiv); // ?getrf: in LAPACK, LU factorization with partial pivoting
+  public static native  int dgetrf( int order, int M, int N, double [] A, int lda, int [] ipiv);
+  public static native  int cgetrf( int order, int M, int N, float [] A, int lda, int [] ipiv);
+  public static native  int zgetrf( int order, int M, int N, double [] A, int lda, int [] ipiv);
+  // ?getri: in LAPACK, matrix inverse computed from the LU factors produced by ?getrf.
+  public static native  int sgetri( int order, int N, float [] A, int lda, int [] ipiv);
+  public static native  int dgetri( int order, int N, double [] A, int lda, int [] ipiv);
+  public static native  int cgetri( int order, int N, float [] A, int lda, int [] ipiv);
+  public static native  int zgetri( int order, int N, double [] A, int lda, int [] ipiv);
+  // ?getrs: in LAPACK, solves a linear system using the LU factors produced by ?getrf.
+  public static native  int sgetrs( int order, String trans, int N, int nrhs, float [] A, int lda, int [] ipiv, float [] b, int ldb);
+  public static native  int dgetrs( int order, String trans, int N, int nrhs, double [] A, int lda, int [] ipiv, double [] b, int ldb);
+  public static native  int cgetrs( int order, String trans, int N, int nrhs, float [] A, int lda, int [] ipiv, float [] b, int ldb);
+  public static native  int zgetrs( int order, String trans, int N, int nrhs, double [] A, int lda, int [] ipiv, double [] b, int ldb);
+  // ?trtrs: in LAPACK, solves a triangular system. NOTE(review): a single String "mdata" stands where LAPACK takes uplo/trans/diag; its encoding is not visible here — confirm.
+  public static native  int strtrs( int order, String mdata, int n, int nrhs, float [] A, int lda, float [] b, int ldb);
+  public static native  int dtrtrs( int order, String mdata, int n, int nrhs, double [] A, int lda, double [] b, int ldb);
+  public static native  int ctrtrs( int order, String mdata, int n, int nrhs, float [] A, int lda, float [] b, int ldb);
+  public static native  int ztrtrs( int order, String mdata, int n, int nrhs, double [] A, int lda, double [] b, int ldb);
+  // ?steqr: in LAPACK, eigenvalues/eigenvectors of a symmetric tridiagonal matrix (implicit QL/QR method).
+  public static native  int ssteqr( int order, String compz, int n, float [] d, float [] e, float [] z, int ldz );
+  public static native  int dsteqr( int order, String compz, int n, double [] d, double [] e, double [] z, int ldz );
+  public static native  int csteqr( int order, String compz, int n, float [] d, float [] e, float [] z, int ldz );
+  public static native  int zsteqr( int order, String compz, int n, double [] d, double [] e, double [] z, int ldz );
+  // ?sytrd: in LAPACK, reduces a real symmetric matrix to tridiagonal form.
+  public static native  int ssytrd( int order, String uplo, int n, float [] a, int lda, float [] d, float [] e, float [] tau );
+  public static native  int dsytrd( int order, String uplo, int n, double [] a, int lda, double [] d, double [] e, double [] tau );
+  // ?orgtr: in LAPACK, generates the orthogonal matrix from the reduction performed by ?sytrd.
+  public static native  int sorgtr( int order, String uplo, int n, float [] a, int lda, float [] tau );
+  public static native  int dorgtr( int order, String uplo, int n, double [] a, int lda, double [] tau );
+  // ?stedc: in LAPACK, symmetric tridiagonal eigenproblem by divide and conquer.
+  public static native  int sstedc( int order, String compz, int n, float [] d, float [] e, float [] z, int ldz );
+  public static native  int dstedc( int order, String compz, int n, double [] d, double [] e, double [] z, int ldz );
+  // ?syevd: in LAPACK, eigenvalues (optionally eigenvectors) of a real symmetric matrix by divide and conquer.
+  public static native  int ssyevd( int order, String jobz, String uplo, int n, float [] a, int lda, float [] w );
+  public static native  int dsyevd( int order, String jobz, String uplo, int n, double [] a, int lda, double [] w );
+  // ?potrf: in LAPACK, Cholesky factorization.
+  public static native  int spotrf( int order, String uplo, int n, float [] a, int lda);
+  public static native  int dpotrf( int order, String uplo, int n, double [] a, int lda);
+  public static native  int cpotrf( int order, String uplo, int n, float [] a, int lda); 
+  public static native  int zpotrf( int order, String uplo, int n, double [] a, int lda); 
+  // ?gebal: in LAPACK, balances a general matrix. ilo/ihi are int[] here (presumably outputs — confirm).
+  public static native int sgebal(int matrix_order, String job, int n, float [] a, int lda, int [] ilo, int [] ihi, float [] scale);
+  public static native int dgebal(int matrix_order, String job, int n, double [] a, int lda, int [] ilo, int [] ihi, double [] scale);
+  public static native int cgebal(int matrix_order, String job, int n, float [] a, int lda, int [] ilo, int [] ihi, float [] scale);
+  public static native int zgebal(int matrix_order, String job, int n, double [] a, int lda, int [] ilo, int [] ihi, double [] scale);
+  // ?unghr: in LAPACK, generates the unitary matrix from the reduction performed by ?gehrd.
+  public static native int cunghr(int matrix_order, int n, int ilo, int ihi, float [] a, int lda, float [] tau);
+  public static native int zunghr(int matrix_order, int n, int ilo, int ihi, double [] a, int lda, double [] tau);
+  // ?trevc: in LAPACK, eigenvectors of an upper (quasi-)triangular matrix.
+  public static native int strevc(int matrix_order, String side, String howmny, int [] select, int n, float [] t, int ldt, float [] vl, int ldvl, float [] vr, int ldvr, int mm, int [] m);
+  public static native int dtrevc(int matrix_order, String side, String howmny, int [] select, int n, double [] t, int ldt, double [] vl, int ldvl, double [] vr, int ldvr, int mm, int [] m);
+  public static native int ctrevc(int matrix_order, String side, String howmny, int [] select, int n, float [] t, int ldt, float [] vl, int ldvl, float [] vr, int ldvr, int mm, int [] m);
+  public static native int ztrevc(int matrix_order, String side, String howmny, int [] select, int n, double [] t, int ldt, double [] vl, int ldvl, double [] vr, int ldvr, int mm, int [] m);
+  // ?gehrd: in LAPACK, reduces a general matrix to upper Hessenberg form.
+  public static native int sgehrd(int matrix_order, int n, int ilo, int ihi, float [] a, int lda, float [] tau);
+  public static native int dgehrd(int matrix_order, int n, int ilo, int ihi, double [] a, int lda, double [] tau);
+  public static native int cgehrd(int matrix_order, int n, int ilo, int ihi, float [] a, int lda, float [] tau);
+  public static native int zgehrd(int matrix_order, int n, int ilo, int ihi, double [] a, int lda, double [] tau);
+  // ?hseqr: in LAPACK, eigenvalues / Schur form of a Hessenberg matrix. The s/d bindings take wr and wi arrays; the c/z bindings take a single w array.
+  public static native int shseqr(int matrix_order, String job, String compz, int n, int ilo, int ihi, float [] h, int ldh, float [] wr, float [] wi, float [] z, int ldz);
+  public static native int dhseqr(int matrix_order, String job, String compz, int n, int ilo, int ihi, double [] h, int ldh, double [] wr, double [] wi, double [] z, int ldz);
+  public static native int chseqr(int matrix_order, String job, String compz, int n, int ilo, int ihi, float [] h, int ldh, float [] w, float [] z, int ldz);
+  public static native int zhseqr(int matrix_order, String job, String compz, int n, int ilo, int ihi, double [] h, int ldh, double [] w, double [] z, int ldz);
+  // ?gebak: in LAPACK, back-transforms eigenvectors of a matrix balanced by ?gebal.
+  public static native int sgebak(int matrix_order, String job, String side, int n, int ilo, int ihi, float [] scale, int m, float [] v, int ldv);
+  public static native int dgebak(int matrix_order, String job, String side, int n, int ilo, int ihi, double [] scale, int m, double [] v, int ldv);
+  public static native int cgebak(int matrix_order, String job, String side, int n, int ilo, int ihi, float [] scale, int m, float [] v, int ldv);
+  public static native int zgebak(int matrix_order, String job, String side, int n, int ilo, int ihi, double [] scale, int m, double [] v, int ldv);
+  // ?geqrf: in LAPACK, QR factorization.
+  public static native int sgeqrf(int matrix_order, int m, int n, float [] a, int lda, float [] tau);
+  public static native int dgeqrf(int matrix_order, int m, int n, double [] a, int lda, double [] tau);
+  public static native int cgeqrf(int matrix_order, int m, int n, float [] a, int lda, float [] tau);
+  public static native int zgeqrf(int matrix_order, int m, int n, double [] a, int lda, double [] tau);
+  // ?geqp3: in LAPACK, QR factorization with column pivoting.
+  public static native int sgeqp3(int matrix_order, int m, int n, float [] a, int lda, int [] jpvt, float [] tau);
+  public static native int dgeqp3(int matrix_order, int m, int n, double [] a, int lda, int [] jpvt, double [] tau);
+  public static native int cgeqp3(int matrix_order, int m, int n, float [] a, int lda, int [] jpvt, float [] tau);
+  public static native int zgeqp3(int matrix_order, int m, int n, double [] a, int lda, int [] jpvt, double [] tau);
+  // ?orgqr: in LAPACK, generates the orthogonal matrix Q from the factorization produced by ?geqrf (real case).
+  public static native int sorgqr(int matrix_order, int m, int n, int k, float [] a, int lda, float [] tau);
+  public static native int dorgqr(int matrix_order, int m, int n, int k, double [] a, int lda, double [] tau);
+  // ?ungqr: the complex counterpart of ?orgqr in LAPACK.
+  public static native int cungqr(int matrix_order, int m, int n, int k, float [] a, int lda, float [] tau);
+  public static native int zungqr(int matrix_order, int m, int n, int k, double [] a, int lda, double [] tau);
+}
\ No newline at end of file
diff --git a/src/main/java/edu/berkeley/bid/SPBLAS.java b/src/main/java/edu/berkeley/bid/SPBLAS.java
new file mode 100755
index 00000000..6effca69
--- /dev/null
+++ b/src/main/java/edu/berkeley/bid/SPBLAS.java
@@ -0,0 +1,35 @@
+package edu.berkeley.bid;
+/** Static JNI entry points for sparse-matrix kernels: "s" names take float data, "d" names take double data. */
+public final class SPBLAS {
+
+    private SPBLAS() {}  // static-only holder: never instantiated
+    // Runs once when the class is initialized; loads the native library "bidmatmkl" by name.
+    static {
+        System.loadLibrary("bidmatmkl");
+    }
+  // Names parallel the Intel MKL Sparse BLAS routines mkl_?csrmm / mkl_?cscmm / mkl_?csrmv / mkl_?cscmv. NOTE(review): val/ir/jc are presumably the stored values and index arrays of the sparse operand; how they map onto MKL's arguments is decided in native code — confirm.
+  public static native  void scsrmm(String transa, int m, int n, int k, float alpha, String matdescra, 
+  		float [] val, int [] ir, int [] jc,  float []  b, int ldb, float beta, float []  c, int ldc);
+  
+  public static native  void scscmm(String transa, int m, int n, int k, float alpha, String matdescra, 
+  		float [] val, int [] ir, int [] jc,  float []  b, int ldb, float beta, float []  c, int ldc);
+  
+  public static native  void scsrmv (String transa, int m, int k, float alpha, String matdescra, 
+  		float [] val, int [] ir, int [] jc,  float []  x, float beta, float []  y);
+  
+  public static native  void scscmv (String transa, int m, int k, float alpha, String matdescra, 
+  		float [] val, int [] ir, int [] jc,  float []  x, float beta, float []  y);
+  // double versions: same parameter lists as the float routines above.
+  public static native  void dcsrmm(String transa, int m, int n, int k, double alpha, String matdescra, 
+  		double [] val, int [] ir, int [] jc,  double []  b, int ldb, double beta, double []  c, int ldc);
+  
+  public static native  void dcscmm(String transa, int m, int n, int k, double alpha, String matdescra, 
+  		double [] val, int [] ir, int [] jc,  double []  b, int ldb, double beta, double []  c, int ldc);
+  
+  public static native  void dcsrmv (String transa, int m, int k, double alpha, String matdescra, 
+  		double [] val, int [] ir, int [] jc,  double []  x, double beta, double []  y);
+  
+  public static native  void dcscmv (String transa, int m, int k, double alpha, String matdescra, 
+  		double [] val, int [] ir, int [] jc,  double []  x, double beta, double []  y);
+
+}
\ No newline at end of file
diff --git a/src/main/java/edu/berkeley/bid/UTILS.java b/src/main/java/edu/berkeley/bid/UTILS.java
new file mode 100755
index 00000000..5a20f2cd
--- /dev/null
+++ b/src/main/java/edu/berkeley/bid/UTILS.java
@@ -0,0 +1,41 @@
+package edu.berkeley.bid;
+import java.io.*;
+import java.util.zip.*;
+
+public final class UTILS {
+
+    private UTILS() {}
+
+    static {
+        System.loadLibrary("bidmatmkl");
+    }
+
+    // Native copies between a byte[] and an int[]/float[]/double[]. NOTE(review): copy direction and the units of n/startA/startB are defined in the JNI source - confirm there.
+    public static native  void memcpybi( int n, byte [] a, int startA, int [] b, int startB );
+    public static native  void memcpybf( int n, byte [] a, int startA, float [] b, int startB );
+    public static native  void memcpybd( int n, byte [] a, int startA, double [] b, int startB );
+
+    public static native  void memcpyib( int n, int [] a, int startA, byte [] b, int startB );
+    public static native  void memcpyfb( int n, float [] a, int startA, byte [] b, int startB );
+    public static native  void memcpydb( int n, double [] a, int startA, byte [] b, int startB );
+
+    /**
+     * Opens fname for writing through a 1 MB buffer, gzip-compressing the output when requested.
+     *
+     * @param fname            path of the file to create or truncate
+     * @param compressed       true for a GZIPOutputStream, false for a plain BufferedOutputStream
+     * @param compressionLevel deflate level 1..9; only consulted when compressed is true
+     * @return the stream to write to; the caller is responsible for closing it
+     * @throws RuntimeException if compressed is true and compressionLevel is outside 1..9
+     * @throws IOException      if the file cannot be opened or the gzip header cannot be written
+     */
+    public static OutputStream _getOutputStream(String fname, Boolean compressed, int compressionLevel) throws IOException {
+        // Validate before opening: a bad level must not truncate the file or leak the descriptor.
+        if (compressed && (compressionLevel < 1 || compressionLevel > 9)) throw new RuntimeException("Unsupported compression level "+compressionLevel);
+        final int level = compressionLevel;   // the anonymous subclass below can only capture a final local
+        FileOutputStream fout = new FileOutputStream(fname);
+        if (!compressed) return new BufferedOutputStream(fout, 1024*1024);
+        try { return new GZIPOutputStream(fout, 1024*1024){{def.setLevel(level);}}; }
+        catch (IOException e) { fout.close(); throw e; }   // header write failed: release the file
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/edu/berkeley/bid/VML.java b/src/main/java/edu/berkeley/bid/VML.java
new file mode 100755
index 00000000..8bb0b482
--- /dev/null
+++ b/src/main/java/edu/berkeley/bid/VML.java
@@ -0,0 +1,144 @@
+package edu.berkeley.bid;
+
+public final class VML {
+    // Static-only class: the private constructor prevents instantiation.
+    private VML() {}
+    // Class initialization loads the "bidmatmkl" native library, which must supply every native method declared below.
+    static {
+        System.loadLibrary("bidmatmkl");
+    }
+    // Integer flag constants. NOTE(review): presumably the values accepted by vmlSetMode - confirm against the MKL VML headers.
+    public final static class VMLMODE {
+      private VMLMODE() {}
+      public final static int VML_LA = 0x00000001;
+      public final static int VML_HA = 0x00000002;
+      public final static int VML_EP = 0x00000003;
+      // Error-mode flags: each of the first five is a distinct single bit; DEFAULT ORs three of them together.
+      public final static int VML_ERRMODE_IGNORE = 0x00000100;
+      public final static int VML_ERRMODE_ERRNO  = 0x00000200;
+      public final static int VML_ERRMODE_STDERR = 0x00000400;
+      public final static int VML_ERRMODE_EXCEPT = 0x00000800;
+      public final static int VML_ERRMODE_CALLBACK = 0x00001000;
+      public final static int VML_ERRMODE_DEFAULT  = VML_ERRMODE_ERRNO | VML_ERRMODE_CALLBACK | VML_ERRMODE_EXCEPT;
+      // NOTE(review): by their names these switch flush-to-zero / denormals-are-zero handling - confirm against the MKL documentation.
+      public final static int VML_FTZDAZ_ON  = 0x00280000;
+      public final static int VML_FTZDAZ_OFF = 0x00140000;
+    }
+
+  // Native entry points. By signature, vs* methods take float arrays and vd* methods take double arrays. NOTE(review): n is presumably the element count and r/y the output array - confirm.
+  public static native   void  vsAbs ( int n,  float [] a, float [] r);
+  public static native   void  vdAbs ( int n,  double [] a, double [] r);
+  public static native   void  vsAdd ( int n,  float [] a,  float [] b, float [] r);
+  public static native   void  vdAdd ( int n,  double [] a,  double [] b, double [] r);
+  public static native   void  vsSub ( int n,  float [] a,  float [] b, float [] r);
+  public static native   void  vdSub ( int n,  double [] a,  double [] b, double [] r);
+  public static native   void  vsInv ( int n,  float [] a, float [] r);
+  public static native   void  vdInv ( int n,  double [] a, double [] r);
+  public static native   void  vsSqrt ( int n,  float [] a, float [] r);
+  public static native   void  vdSqrt ( int n,  double [] a, double [] r);
+  public static native   void  vsExp ( int n,  float [] a, float [] r);
+  public static native   void  vdExp ( int n,  double [] a, double [] r);
+  public static native   void  vsExpm1 ( int n,  float [] a, float [] r);
+  public static native   void  vdExpm1 ( int n,  double [] a, double [] r);
+  public static native   void  vsLn ( int n,  float [] a, float [] r);
+  public static native   void  vdLn ( int n,  double [] a, double [] r);
+  public static native   void  vsLog10 ( int n,  float [] a, float [] r);
+  public static native   void  vdLog10 ( int n,  double [] a, double [] r);
+  public static native   void  vsLog1p ( int n,  float [] a, float [] r);
+  public static native   void  vdLog1p ( int n,  double [] a, double [] r);
+  public static native   void  vsCos ( int n,  float [] a, float [] r);
+  public static native   void  vdCos ( int n,  double [] a, double [] r);
+  public static native   void  vsSin ( int n,  float [] a, float [] r);
+  public static native   void  vdSin ( int n,  double [] a, double [] r);
+  public static native   void  vsTan ( int n,  float [] a, float [] r);
+  public static native   void  vdTan ( int n,  double [] a, double [] r);
+  public static native   void  vsCosh ( int n,  float [] a, float [] r);
+  public static native   void  vdCosh ( int n,  double [] a, double [] r);
+  public static native   void  vsSinh ( int n,  float [] a, float [] r);
+  public static native   void  vdSinh ( int n,  double [] a, double [] r);
+  public static native   void  vsTanh ( int n,  float [] a, float [] r);
+  public static native   void  vdTanh ( int n,  double [] a, double [] r);
+  public static native   void  vsAcos ( int n,  float [] a, float [] r);
+  public static native   void  vdAcos ( int n,  double [] a, double [] r);
+  public static native   void  vsAsin ( int n,  float [] a, float [] r);
+  public static native   void  vdAsin ( int n,  double [] a, double [] r);
+  public static native   void  vsAtan ( int n,  float [] a, float [] r);
+  public static native   void  vdAtan ( int n,  double [] a, double [] r);
+  public static native   void  vsAcosh ( int n,  float [] a, float [] r);
+  public static native   void  vdAcosh ( int n,  double [] a, double [] r);
+  public static native   void  vsAsinh ( int n,  float [] a, float [] r);
+  public static native   void  vdAsinh ( int n,  double [] a, double [] r);
+  public static native   void  vsAtanh ( int n,  float [] a, float [] r);
+  public static native   void  vdAtanh ( int n,  double [] a, double [] r);
+  public static native   void  vsErf ( int n,  float [] a, float [] r);
+  public static native   void  vdErf ( int n,  double [] a, double [] r);
+  public static native   void  vsErfInv ( int n,  float [] a, float [] r);
+  public static native   void  vdErfInv ( int n,  double [] a, double [] r);
+  public static native   void  vsHypot ( int n,  float [] a,  float [] b, float [] r);
+  public static native   void  vdHypot ( int n,  double [] a,  double [] b, double [] r);
+  public static native   void  vsErfc ( int n,  float [] a, float [] r);
+  public static native   void  vdErfc ( int n,  double [] a, double [] r);
+  public static native   void  vsErfcInv ( int n,  float [] a, float [] r);
+  public static native   void  vdErfcInv ( int n,  double [] a, double [] r);
+  public static native   void  vsCdfNorm ( int n,  float [] a, float [] r);
+  public static native   void  vdCdfNorm ( int n,  double [] a, double [] r);
+  public static native   void  vsCdfNormInv ( int n,  float [] a, float [] r);
+  public static native   void  vdCdfNormInv ( int n,  double [] a, double [] r);
+  public static native   void  vsLGamma ( int n,  float [] a, float [] r);
+  public static native   void  vdLGamma ( int n,  double [] a, double [] r);
+  public static native   void  vsTGamma ( int n,  float [] a, float [] r);
+  public static native   void  vdTGamma ( int n,  double [] a, double [] r);
+  public static native   void  vsAtan2 ( int n,  float [] a,  float [] b, float [] r);
+  public static native   void  vdAtan2 ( int n,  double [] a,  double [] b, double [] r);
+  public static native   void  vsMul ( int n,  float [] a,  float [] b, float [] r);
+  public static native   void  vdMul ( int n,  double [] a,  double [] b, double [] r);
+  public static native   void  vsDiv ( int n,  float [] a,  float [] b, float [] r);
+  public static native   void  vdDiv ( int n,  double [] a,  double [] b, double [] r);
+  public static native   void  vsPow ( int n,  float [] a,  float [] b, float [] r);
+  public static native   void  vdPow ( int n,  double [] a,  double [] b, double [] r);
+  public static native   void  vsPow3o2 ( int n,  float [] a, float [] r);
+  public static native   void  vdPow3o2 ( int n,  double [] a, double [] r);
+  public static native   void  vsPow2o3 ( int n,  float [] a, float [] r);
+  public static native   void  vdPow2o3 ( int n,  double [] a, double [] r);
+  public static native   void  vsPowx ( int n,  float [] a,  float b, float [] r);
+  public static native   void  vdPowx ( int n,  double [] a,  double b, double [] r);
+  public static native   void  vsSinCos ( int n,  float [] a, float [] r1, float [] r2);
+  public static native   void  vdSinCos ( int n,  double [] a, double [] r1, double [] r2);
+  public static native   void  vsLinearFrac ( int n,  float [] a,  float [] b,  float scalea,  float shifta,  float scaleb,  float shiftb, float [] r);
+  public static native   void  vdLinearFrac ( int n,  double [] a,  double [] b,  double scalea,  double shifta,  double scaleb,  double shiftb, double [] r);
+  public static native   void  vsCeil ( int n,  float [] a, float [] r);
+  public static native   void  vdCeil ( int n,  double [] a, double [] r);;
+  public static native   void  vsFloor ( int n,  float [] a, float [] r);
+  public static native   void  vdFloor ( int n,  double [] a, double [] r);
+  public static native   void  vsModf ( int n,  float [] a, float [] r1, float [] r2);
+  public static native   void  vdModf ( int n,  double [] a, double [] r1, double [] r2);
+  public static native   void  vmsModf ( int n,  float [] a, float [] r1, float [] r2, long mode);
+  public static native   void  vmdModf ( int n,  double [] a, double [] r1, double [] r2, long mode);
+  public static native   void  vsNearbyInt ( int n,  float [] a, float [] r);
+  public static native   void  vdNearbyInt ( int n,  double [] a, double [] r);
+  public static native   void  vsRint ( int n,  float [] a, float [] r);
+  public static native   void  vdRint ( int n,  double [] a, double [] r);
+  public static native   void  vsRound ( int n,  float [] a, float [] r);
+  public static native   void  vdRound ( int n,  double [] a, double [] r);
+  public static native   void  vsTrunc ( int n,  float [] a, float [] r);
+  public static native   void  vdTrunc ( int n,  double [] a, double [] r);
+  public static native   void  vsPackI ( int n,  float [] a,  int incra, float [] y);
+  public static native   void  vdPackI ( int n,  double [] a,  int incra, double [] y);
+  public static native   void  vsPackV ( int n,  float [] a,  int [] ia, float [] y);
+  public static native   void  vdPackV ( int n,  double [] a,  int [] ia, double [] y);
+  public static native   void  vsPackM ( int n,  float [] a,  int [] ma, float [] y);
+  public static native   void  vdPackM ( int n,  double [] a,  int [] ma, double [] y);
+  public static native   void  vsUnpackI ( int n,  float [] a, float [] y,  int incry );
+  public static native   void  vdUnpackI ( int n,  double [] a, double [] y,  int incry );
+  public static native   void  vsUnpackV ( int n,  float [] a, float [] y,  int [] iy );
+  public static native   void  vdUnpackV ( int n,  double [] a, double [] y,  int [] iy );
+  public static native   void  vsUnpackM ( int n,  float [] a, float [] y,  int [] my );
+  public static native   void  vdUnpackM ( int n,  double [] a, double [] y,  int [] my );
+  public static native   int  vmlSetErrStatus ( int status);
+  public static native   int  vmlGetErrStatus ();
+  public static native   int  vmlClearErrStatus ();
+  public static native    int  vmlSetMode ( int newmode);
+  public static native    int  vmlGetMode ();
+  public static native   void  MKLFreeTls ( int fdwReason);
+
+}
diff --git a/src/main/java/edu/berkeley/bid/VSL.java b/src/main/java/edu/berkeley/bid/VSL.java
new file mode 100755
index 00000000..e4994b8e
--- /dev/null
+++ b/src/main/java/edu/berkeley/bid/VSL.java
@@ -0,0 +1,118 @@
+package edu.berkeley.bid;
+
+public final class VSL {
+    // Class initialization loads the "bidmatmkl" native library, which must supply the native methods declared below.
+    static { System.loadLibrary( "bidmatmkl" ); }
+    // Starts at 0 and is never assigned from Java except in finalize. NOTE(review): presumably set by native vslNewStream to the stream pointer - confirm in the JNI code.
+    private long handle = 0;
+
+    public VSL() {}
+    // Finalizer: if handle is non-zero, calls vslDeleteStream(this) and then zeroes handle so the delete is not repeated.
+    protected void finalize() {
+        if (handle != 0) {
+            vslDeleteStream(this);
+            handle = 0;
+        }
+    }
+    // Stream creation and deletion. Every native below returns an int (NOTE(review): presumably an MKL status code - confirm).
+    public static native int vslNewStream(VSL stream, int brng, int seed);
+
+    public static native int vslDeleteStream(VSL stream);
+    // Generators: vd* take a double[] r, vs* a float[] r, vi* an int[] r. NOTE(review): r is presumably the output buffer of at least n entries - confirm.
+    public static native int vdRngCauchy(int method, VSL stream, int n, double[] r, double a, double beta);
+
+    public static native int vsRngCauchy(int method, VSL stream, int n, float[] r, float a, float beta);
+
+    public static native int vdRngUniform(int method, VSL stream, int n, double[] r, double a, double b);
+
+    public static native int vsRngUniform(int method, VSL stream, int n, float[] r, float a, float b);
+
+    public static native int vdRngGaussian(int method, VSL stream, int n, double[] r, double a, double sigma);
+
+    public static native int vsRngGaussian(int method, VSL stream, int n, float[] r, float a, float sigma);
+
+    public static native int vdRngGaussianMV(int method, VSL stream, int n, double[] r, int dimen, int mstorage, double[] a, double[] t);
+
+    public static native int vsRngGaussianMV(int method, VSL stream, int n, float[] r, int dimen, int mstorage, float[] a, float[] t);
+
+    public static native int vdRngExponential(int method, VSL stream, int n, double[] r, double a, double beta);
+
+    public static native int vsRngExponential(int method, VSL stream, int n, float[] r, float a, float beta);
+
+    public static native int vdRngLaplace(int method, VSL stream, int n, double[] r, double a, double beta);
+
+    public static native int vsRngLaplace(int method, VSL stream, int n, float[] r, float a, float beta);
+
+    public static native int vdRngWeibull(int method, VSL stream, int n, double[] r, double alpha, double a, double beta);
+
+    public static native int vsRngWeibull(int method, VSL stream, int n, float[] r, float alpha, float a, float beta);
+
+    public static native int vdRngRayleigh(int method, VSL stream, int n, double[] r, double a, double beta);
+
+    public static native int vsRngRayleigh(int method, VSL stream, int n, float[] r, float a, float beta);
+
+    public static native int vdRngLognormal(int method, VSL stream, int n, double[] r, double a, double sigma, double b, double beta);
+
+    public static native int vsRngLognormal(int method, VSL stream, int n, float[] r, float a, float sigma, float b, float beta);
+
+    public static native int vdRngGumbel(int method, VSL stream, int n, double[] r, double a, double beta);
+
+    public static native int vsRngGumbel(int method, VSL stream, int n, float[] r, float a, float beta);
+
+    public static native int vdRngGamma(int method, VSL stream, int n, double[] r, double alpha, double a, double beta);
+
+    public static native int vsRngGamma(int method, VSL stream, int n, float[] r, float alpha, float a, float beta);
+
+    public static native int vdRngBeta(int method, VSL stream, int n, double[] r, double p, double q, double a, double beta);
+
+    public static native int vsRngBeta(int method, VSL stream, int n, float[] r, float p, float q, float a, float beta);
+
+    public static native int viRngBernoulli(int method, VSL stream, int n, int[] r, double p);
+
+    public static native int viRngUniform(int method, VSL stream, int n, int[] r, int a, int b);
+
+    public static native int viRngUniformBits(int method, VSL stream, int n, int[] r);
+
+    public static native int viRngGeometric(int method, VSL stream, int n, int[] r, double p);
+
+    public static native int viRngBinomial(int method, VSL stream, int n, int[] r, int ntrial, double p);
+
+    public static native int viRngHypergeometric(int method, VSL stream, int n, int[] r, int l, int s, int m);
+
+    public static native int viRngNegbinomial(int method, VSL stream, int n, int[] r, double a, double p);
+
+    public static native int viRngPoisson(int method, VSL stream, int n, int[] r, double lambda);
+
+    public static native int viRngPoissonV(int method, VSL stream, int n, int[] r, double[] lambda);
+
+    public static native int vslSkipAheadStream(VSL stream, int nskip);
+
+    public static native int vslGetStreamStateBrng(VSL stream);
+
+    public static native int vslGetNumRegBrngs();
+    // Generator identifiers. NOTE(review): presumably the brng argument of vslNewStream, mirroring MKL's VSL_BRNG_* values - confirm.
+    public final static int BRNG_MCG31 = 0x100000;
+
+    public final static int BRNG_R250 = 0x200000;
+
+    public final static int BRNG_MRG32K3A = 0x300000;
+
+    public final static int BRNG_MCG59 = 0x400000;
+
+    public final static int BRNG_WH = 0x500000;
+
+    public final static int BRNG_SOBOL = 0x600000;
+
+    public final static int BRNG_NIEDERR = 0x700000;
+
+    public final static int BRNG_MT19937 = 0x800000;
+
+    public final static int BRNG_MT2203 = 0x900000;
+
+    public final static int BRNG_IABSTRACT = 0xa00000;
+
+    public final static int BRNG_DABSTRACT = 0xb00000;
+
+    public final static int BRNG_SABSTRACT = 0xc00000;
+
+}
diff --git a/src/main/scala/BIDMat/BMat.scala b/src/main/scala/BIDMat/BMat.scala
new file mode 100755
index 00000000..443a1e84
--- /dev/null
+++ b/src/main/scala/BIDMat/BMat.scala
@@ -0,0 +1,172 @@
+package BIDMat
+import edu.berkeley.bid.CBLAS._
+import edu.berkeley.bid.LAPACK._
+
+case class BMat(nr:Int, nc:Int, nnz1:Int, ir0:Array[Int], jc0:Array[Int], data0:Array[Byte]) extends SparseMat[Byte](nr, nc, nnz1, ir0, jc0, data0) {
+  // Sparse matrix of bytes; toCSMat and toString decode each column's stored bytes as one string.
+  def size() = length;
+  // Returns m as a BMat, or throws RuntimeException naming operator s when m is of another type.
+  def tryForBMat(m:Mat, s:String):BMat = 
+    m match {
+    case mm:BMat => mm
+    case _ => throw new RuntimeException("wrong type for operator "+s+" arg "+m)
+  }
+  // Same check for an output argument, except that a null out is returned as null.
+  def tryForOutBMat(out:Mat):BMat = 
+    if (out.asInstanceOf[AnyRef] == null) {
+      null
+    } else {
+      out match {
+      case outmat:BMat => outmat
+      case _ => throw new RuntimeException("wrong type for LHS matrix "+out)
+      }
+    }
+  
+  override def mytype = "BMat"
+      
+  override def t:BMat = BMat(gt)
+  
+  def horzcat(b: BMat) = BMat(super.horzcat(b))
+  
+  def vertcat(b: BMat) = BMat(super.vertcat(b))
+  // The three results of gfind3, with the byte values copied into an IMat by Mat.copyToIntArray.
+  def find3:(IMat, IMat, IMat) = { 
+    val (ii, jj, vv) = gfind3 
+    val vi = IMat(vv.length, 1)
+    Mat.copyToIntArray(vv.data, 0, vi.data, 0, vv.length)
+    (IMat(ii), IMat(jj), vi)
+  }
+  
+  override def apply(a:IMat, b:IMat):BMat = BMat(gapply(a, b))	
+  
+  override def apply(a:IMat, b:Int):BMat = BMat(gapply(a, IMat.ielem(b)))	
+  
+  override def apply(a:Int, b:IMat):BMat = BMat(gapply(IMat.ielem(a), b))
+  
+  def bbMatOp(b: BMat, f:(Byte, Byte) => Byte, out:Mat):BMat = BMat(sgMatOp(b, f, out))
+  
+  def bbMatOpScalar(b: Byte, f:(Byte, Byte) => Byte, out:Mat):BMat = BMat(sgMatOpScalar(b, f, out))
+  
+  def bbReduceOp(n:Int, f1:(Byte) => Byte, f2:(Byte, Byte) => Byte) = IMat(sgReduceOp(n, f1, f2, null))
+  // One string per column: the jc(i+1)-jc(i) bytes starting at jc(i)-ioff, decoded with BMat.encoding.
+  def toCSMat:CSMat = {
+    val out = CSMat(ncols, 1)
+    val ioff = Mat.ioneBased
+    var i = 0
+    while (i < ncols) {
+      out.data(i) = new String(data, jc(i)-ioff, jc(i+1)-jc(i), BMat.encoding)
+      i += 1
+    }
+    out
+  }
+  // Renders the decoded column strings in padded cells, wrapped to Mat.terminalWidth; "..." marks omitted columns.
+  override def toString:String = { 
+    val ioff = Mat.ioneBased
+    val ss = new StringBuilder
+    val nChars = Mat.terminalWidth-4
+    val totchars = 10*nChars
+    var nelems = 0
+    var maxlen = 0
+    val lbuf = new scala.collection.mutable.ListBuffer[String]
+    // Decode columns until roughly ten terminal rows' worth of padded cells have been collected.
+    while (maxlen * nelems < totchars && nelems < ncols) {
+      val str = new String(data, jc(nelems)-ioff, jc(nelems+1)-jc(nelems), BMat.encoding)
+      lbuf.append(str)
+      maxlen = math.max(maxlen, 1+str.length)
+      nelems += 1
+    }
+    // nelems is the number of columns rendered, so the "..." test below is true only
+    // when some columns were left out (it must not be decremented before that test).
+    var thisrow = 0
+    lbuf.foreach((str:String) => {
+      // Pad each cell to maxlen; the pad is built on demand so its width has no fixed upper limit.
+      ss.append(str + (" " * (maxlen - str.length)))
+      thisrow += 1
+      if ((thisrow + 1) * maxlen >= nChars) {
+        ss.append("\n")
+        thisrow = 0
+      }
+    })
+    if (nelems < ncols) ss.append("...")
+    ss.toString
+  }
+  // Scalar comparisons: each forwards to bbMatOpScalar a function that yields 1 when the relation holds and 0 otherwise.
+  def > (b : Byte) = bbMatOpScalar(b, (x:Byte, y:Byte) => if (x > y) 1 else 0, null)
+  def < (b : Byte) = bbMatOpScalar(b, (x:Byte, y:Byte) => if (x < y) 1 else 0, null)
+  def == (b : Byte) = bbMatOpScalar(b, (x:Byte, y:Byte) => if (x == y) 1 else 0, null)
+  def === (b : Byte) = bbMatOpScalar(b, (x:Byte, y:Byte) => if (x == y) 1 else 0, null)
+  def >= (b : Byte) = bbMatOpScalar(b, (x:Byte, y:Byte) => if (x >= y) 1 else 0, null)
+  def <= (b : Byte) = bbMatOpScalar(b, (x:Byte, y:Byte) => if (x <= y) 1 else 0, null)
+  def != (b : Byte) = bbMatOpScalar(b, (x:Byte, y:Byte) => if (x != y) 1 else 0, null) 
+  // Horizontal concatenation; a non-BMat operand is reported the same way operator ~ reports it.
+  override def \ (b: Mat) = b match {
+    case fb:BMat => horzcat(fb)
+    case _ => throw new RuntimeException("mismatched types for operator \\")
+  }
+  // Vertical concatenation; a non-BMat operand is reported the same way operator ~ reports it.
+  override def on (b: Mat) = b match {
+    case fb:BMat => vertcat(fb)
+    case _ => throw new RuntimeException("mismatched types for operator on")
+  }
+  // Pairs this matrix (as the BPair's omat) with the BMat operand b.
+  override def ~ (b: Mat):Pair = 
+    b match {
+    case db:BMat => new BPair(this, db)
+    case _ => throw new RuntimeException("mismatched types for operator ~")
+  }
+}
+
+class BPair (val omat:Mat, val mat:BMat) extends Pair {
+  // Shared body of the comparisons: forwards to mat.bbMatOpScalar a function yielding 1 when test(x, y) holds and 0 otherwise, with omat as the output argument.
+  private def flag(b:Byte, test:(Byte, Byte) => Boolean) = mat.bbMatOpScalar(b, (x:Byte, y:Byte) => if (test(x, y)) 1 else 0, omat)
+  def > (b : Byte) = flag(b, _ > _)
+  def < (b : Byte) = flag(b, _ < _)
+  def == (b : Byte) = flag(b, _ == _)
+  def >= (b : Byte) = flag(b, _ >= _)
+  def <= (b : Byte) = flag(b, _ <= _)
+  def != (b : Byte) = flag(b, _ != _)
+}
+
+object BMat {
+  // Allocates an nr x nc BMat with fresh arrays: nnz0 row indices, nc+1 column pointers, nnz0 bytes.
+  def apply(nr:Int, nc:Int, nnz0:Int):BMat = new BMat(nr, nc, nnz0, new Array[Int](nnz0), new Array[Int](nc+1), new Array[Byte](nnz0)) 
+  // Wraps an existing sparse byte matrix as a BMat, sharing its ir/jc/data arrays (no copy).
+  def apply(a:SparseMat[Byte]):BMat = new BMat(a.nrows, a.ncols, a.nnz, a.ir, a.jc, a.data) 
+  // Like apply(nr, nc, nnz0) but with a null row-index array.
+  def SnoRows(nr:Int, nc:Int, nnz0:Int):BMat = new BMat(nr, nc, nnz0, null, new Array[Int](nc+1), new Array[Byte](nnz0))
+  // Charset name used whenever strings are converted to or from column bytes.
+  var encoding = "UTF8"
+//  	var encoding = "UTF_16LE"
+  // Packs the strings of cc into a row-index-free BMat: column i holds the bytes of cc(i) in the current encoding.
+  def apply(cc:CSMat):BMat = {
+    val ioff = Mat.ioneBased
+    val ncolsx = cc.length
+    var nrowsx = 0
+    var nnzx = 0
+    var i = 0
+    while (i < ncolsx) {   // first pass: total byte count and longest string, to size the matrix
+      val len = cc(i).getBytes(encoding).length
+      nnzx += len
+      nrowsx = math.max(nrowsx, 1+len)
+      i += 1
+    }
+    val out = SnoRows(nrowsx, ncolsx, nnzx)
+    nnzx = 0
+    i = 0
+    while (i < ncolsx) {   // second pass: record each column start and copy its bytes
+      out.jc(i) = nnzx + ioff
+      val bytes = cc(i).getBytes(encoding)
+      System.arraycopy(bytes, 0, out.data, nnzx, bytes.length)
+      nnzx += bytes.length
+      i += 1
+    } 
+    out.jc(i) = nnzx + ioff   // end pointer uses the same index base as the starts, so jc(i+1)-jc(i) is the last column's length
+    out
+  }
+}
+
+
+
+
+
+
diff --git a/src/main/scala/BIDMat/CMat.scala b/src/main/scala/BIDMat/CMat.scala
new file mode 100755
index 00000000..081f120a
--- /dev/null
+++ b/src/main/scala/BIDMat/CMat.scala
@@ -0,0 +1,1056 @@
+package BIDMat
+import edu.berkeley.bid.CBLAS._
+import edu.berkeley.bid.LAPACK._
+import java.util.Arrays
+
+case class CMat(nr:Int, nc:Int, data0:Array[Float]) extends DenseMat[Float](nr, nc, data0) {
+
+  def size() = { length }   // same value as length
+  
+  // Scalar view: fails unless the matrix has at most one row and one column, then yields the first stored float as a Double.
+  override def dv:Double = {
+    val tooBig = !(nrows <= 1 && ncols <= 1)
+    if (tooBig) throw new RuntimeException("Matrix should be 1x1 to extract value")
+    data(0)
+  }
+  
+  override def mytype = { "CMat" }   // type tag string for this matrix class
+   
+  def get(r0:Int, c0:Int):CMat = {
+    // Returns the two-float element at row r0, column c0 (both shifted by Mat.oneBased) via CMat.celem.
+    val off = Mat.oneBased
+    val r = r0 - off
+    val c = c0 - off
+    // Negative indices are rejected as well; left unchecked they could address a different element.
+    if (r < 0 || c < 0 || r >= nrows || c >= ncols) {
+      throw new IndexOutOfBoundsException("("+(r+off)+","+(c+off)+") >= ("+nrows+","+ncols+")");
+    }
+    CMat.celem(data(2*(r+c*nrows)), data(2*(r+c*nrows)+1))
+  }
+  
+  // Linear-index read: the two floats stored at position i0 (shifted by Mat.oneBased), wrapped by CMat.celem.
+  def get(i0:Int):CMat = {
+    val base = Mat.oneBased
+    val pos = i0 - base
+    val inRange = 0 <= pos && pos < length
+    if (!inRange) throw new IndexOutOfBoundsException(""+(pos+base)+" >= ("+nrows+","+ncols+")");
+    val at = 2*pos
+    CMat.celem(data(at), data(at+1))
+  }
+
+  def update(r0:Int, c0:Int, v:CMat):CMat = {
+    // Stores the first two floats of v at row r0, column c0 (both shifted by Mat.oneBased); returns v.
+    val off = Mat.oneBased
+    val r = r0 - off
+    val c = c0 - off
+    // Negative indices are rejected as well; left unchecked they could overwrite a different element.
+    if (r < 0 || c < 0 || r >= nrows || c >= ncols) {
+      throw new IndexOutOfBoundsException("("+(r+off)+","+(c+off)+") >= ("+nrows+","+ncols+")");
+    }
+    data(2*(r+c*nrows)) = v.data(0)
+    data(2*(r+c*nrows)+1) = v.data(1)
+    v
+  }
+
+  // Linear-index write: copies the first two floats of v into position i0 (shifted by Mat.oneBased); returns v.
+  def update(i0:Int, v:CMat):CMat = {
+    val base = Mat.oneBased
+    val pos = i0 - base
+    val inRange = 0 <= pos && pos < length
+    if (!inRange) throw new IndexOutOfBoundsException(""+(pos+base)+" >= ("+nrows+","+ncols+")");
+    val at = 2*pos
+    data(at) = v.data(0)
+    data(at+1) = v.data(1)
+    v
+  }
+  
+  def t(oldmat:Mat):CMat  = {
+    val result = CMat.newOrCheckCMat(ncols, nrows, oldmat)   // target has the transposed shape
+    var row = 0
+    while (row < nrows) {
+      var col = 0
+      while (col < ncols) {
+        val src = 2*(row+col*nrows); val dst = 2*(col+row*ncols)
+        result.data(dst) = data(src); result.data(dst+1) = data(src+1)   // both floats copied unchanged
+        col += 1
+      }
+      row += 1
+    }
+    result
+  }
+  
+  override def t:CMat = { val none:CMat = null; t(none) }   // t(oldmat) called with a null oldmat
+  
+  def h(oldmat:Mat):CMat  = {
+    val result = CMat.newOrCheckCMat(ncols, nrows, oldmat)   // target has the transposed shape
+    var row = 0
+    while (row < nrows) {
+      var col = 0
+      while (col < ncols) {
+        val src = 2*(row+col*nrows); val dst = 2*(col+row*ncols)
+        result.data(dst) = data(src); result.data(dst+1) = -data(src+1)   // second float of each pair is negated
+        col += 1
+      }
+      row += 1
+    }
+    result
+  }
+  
+  def h:CMat = { val none:CMat = null; h(none) }   // h(oldmat) called with a null oldmat
+
+  def vertcat(a:CMat):CMat = {   // stacks a below this matrix, column by column
+    if (ncols != a.ncols) throw new RuntimeException("ncols must match")
+    val tall = nrows + a.nrows
+    val stacked = CMat(tall, ncols)
+    var col = 0
+    while (col < ncols) {
+      val top = 2*col*tall   // float offset at which output column col begins
+      System.arraycopy(data, 2*col*nrows, stacked.data, top, 2*nrows)
+      System.arraycopy(a.data, 2*col*a.nrows, stacked.data, top + 2*nrows, 2*a.nrows)
+      col += 1
+    }
+    stacked
+  }
+
+  def horzcat(a:CMat):CMat = {
+    // Places a's columns after this matrix's columns: two bulk copies into the wider result.
+    if (nrows != a.nrows) throw new RuntimeException("nrows must match")
+    val ownFloats = 2*nrows*ncols
+    val wide = CMat(nrows, ncols+a.ncols)
+    System.arraycopy(data, 0, wide.data, 0, ownFloats)
+    System.arraycopy(a.data, 0, wide.data, ownFloats, 2*nrows*a.ncols)
+    wide
+  }
+
+  // Counts the elements whose two floats are not both zero.
+  override def nnz:Int = {
+    var nonzero = 0
+    var elem = 0
+    while (elem < length) {
+      val bothZero = data(2*elem) == 0 && data(2*elem+1) == 0
+      if (!bothZero) nonzero += 1
+      elem += 1
+    }
+    nonzero
+  }
+  
+  // Writes (linear index + off) of every element whose two floats are not both zero into
+  // out.data, in increasing index order starting at out.data(0); returns out.
+  override def findInds(out:IMat, off:Int):IMat = {
+    var written = 0
+    var elem = 0
+    while (elem < length) {
+      val isZero = data(2*elem) == 0 && data(2*elem+1) == 0
+      if (!isZero) { out.data(written) = elem + off; written += 1 }
+      elem += 1
+    }
+    out
+  }
+  
+  def find3:(IMat, IMat, CMat) = {   // (row indices, column indices, values) of the nonzero elements
+    val off = Mat.oneBased
+    val nz = nnz   // nnz scans the whole matrix, so evaluate it once rather than once per output
+    val iout = IMat(nz, 1); val jout = IMat(nz, 1)
+    val vout = CMat(nz, 1)
+    findInds(iout, 0)   // iout temporarily holds zero-based linear indices
+    var i = 0
+    while (i < iout.length) {
+      val ival:Int = iout.data(i)
+      vout.data(2*i) = data(2*ival)
+      vout.data(2*i+1) = data(2*ival+1)
+      jout.data(i) = (ival / nrows) + off
+      iout.data(i) = (ival % nrows) + off   // overwrite the linear index with the row index
+      i += 1
+    }
+    (iout, jout, vout)
+  } 
+  
+  // Linear-index read a(iv).
+  //  - iv is a MatrixWildcard: returns all elements as a length x 1 column (bulk copy).
+  //  - otherwise: returns a matrix of iv's shape whose k-th element is this matrix's element
+  //    at index iv.data(k), shifted by Mat.oneBased.
+  // An index outside [0, length) after the shift throws RuntimeException.
+  override def apply(iv:IMat):CMat = iv match {
+    case _:MatrixWildcard =>
+      val whole = CMat(length, 1)
+      System.arraycopy(data, 0, whole.data, 0, 2*whole.length)
+      whole
+    case _ =>
+      val base = Mat.oneBased
+      val picked = CMat(iv.nrows, iv.ncols)
+      var k = 0
+      while (k < picked.length) {
+        val src = iv.data(k) - base
+        val valid = 0 <= src && src < length
+        if (!valid) throw new RuntimeException("bad linear index "+(src+base)+" vs "+length)
+        picked.data(2*k) = data(2*src)
+        picked.data(2*k+1) = data(2*src+1)
+        k += 1
+      }
+      picked
+  }
+  
+  // Linear-index assignment a(iv) = b (the result type is Unit, as inferred).
+  //
+  //  - iv is a MatrixWildcard: b must be a length x 1 column, copied in bulk, or a single
+  //    element, stored at every position.
+  //  - any other iv: b must have iv's shape, stored element by element, or be a single
+  //    element, stored at every index listed in iv.
+  //
+  // Any other shape of b throws RuntimeException("dims mismatch"). Indices in iv are shifted
+  // by Mat.oneBased; an out-of-range index throws RuntimeException, and positions visited
+  // before it have already been written.
+  def update(iv:IMat, b:CMat) = 
+    iv match {
+      case _:MatrixWildcard =>
+        val wholeColumn = length == b.length && b.ncols == 1
+        if (wholeColumn) {
+          System.arraycopy(b.data, 0, data, 0, 2*length)
+        } else if (b.length == 1) {
+          val first = b.data(0)
+          val second = b.data(1)
+          var k = 0
+          while (k < length) {
+            data(2*k) = first
+            data(2*k+1) = second
+            k += 1
+          }
+        } else {
+          throw new RuntimeException("dims mismatch")
+        }
+      case _ =>
+        val base = Mat.oneBased
+        val sameShape = iv.nrows == b.nrows && iv.ncols == b.ncols
+        if (!sameShape && b.length != 1) throw new RuntimeException("dims mismatch")
+        // In fill mode the two source floats are fixed, so they are read once before the loop.
+        val first = if (sameShape) 0f else b.data(0)
+        val second = if (sameShape) 0f else b.data(1)
+        var k = 0
+        while (k < iv.length) {
+          val dst = iv.data(k) - base
+          if (dst < 0 || dst >= length) {
+            throw new RuntimeException("bad linear index "+(dst+base)+" vs "+length)
+          }
+          if (sameShape) {
+            data(2*dst) = b.data(2*k)
+            data(2*dst+1) = b.data(2*k+1)
+          } else {
+            data(2*dst) = first
+            data(2*dst+1) = second
+          }
+          k += 1
+        }
+    }
+  
+  override def apply(iv:IMat, jv:IMat):CMat = {   // submatrix read: rows from iv, columns from jv
+    val base = Mat.oneBased
+    val rows = DenseMat.getInds(iv, nrows)
+    val cols = DenseMat.getInds(jv, ncols)
+    val sub = CMat(rows.length, cols.length)
+    var ocol = 0
+    while (ocol < sub.ncols) {
+      val srcCol = cols(ocol) - base
+      var orow = 0
+      while (orow < sub.nrows) {
+        val src = 2*((rows(orow) - base)+nrows*srcCol); val dst = 2*(orow+ocol*sub.nrows)
+        sub.data(dst) = data(src)
+        sub.data(dst+1) = data(src+1)
+        orow += 1
+      }
+      ocol += 1
+    }
+    sub
+  }
+  
+  override def apply(iv:IMat, j:Int):CMat = {
+    val oneCol = IMat.ielem(j); apply(iv, oneCol)   // column index wrapped by IMat.ielem, then the two-IMat apply
+  } 
+  
+  override def apply(i:Int, jv:IMat):CMat = {
+    val oneRow = IMat.ielem(i); apply(oneRow, jv)   // row index wrapped by IMat.ielem, then the two-IMat apply
+  }
+  
+  /**
+   * Submatrix assignment a(iv, jv) = b.
+   *
+   * The selected rows and columns come from DenseMat.getInds and are shifted by Mat.oneBased.
+   * b must either have the selection's shape (rowinds.length x colinds.length) or contain a
+   * single element, which is then stored at every selected position.
+   *
+   * @param b values to store
+   * @return b
+   * @throws RuntimeException with message "dims mismatch in assignment" for any other shape of b
+   */
+  def update(iv:IMat, jv:IMat, b:CMat):CMat = {
+    val off = Mat.oneBased
+    val rowinds = DenseMat.getInds(iv, nrows)
+    val colinds = DenseMat.getInds(jv, ncols) 
+    val sameShape = rowinds.length == b.nrows && colinds.length == b.ncols
+    if (!sameShape && b.length != 1) {
+      throw new RuntimeException("dims mismatch in assignment")
+    }
+    // Always iterate over the selection itself. Bounding the fill case by b's own 1x1 shape
+    // would touch only the first selected position instead of all of them.
+    var i = 0
+    while (i < colinds.length) {
+      val c = colinds(i) - off
+      var j = 0
+      while (j < rowinds.length) {
+        val r = rowinds(j) - off
+        // Fill mode keeps reading b's only element (offset 0); otherwise read b(j, i).
+        val src = if (sameShape) 2*(j+i*b.nrows) else 0
+        data(2*(r+nrows*c)) = b.data(src)
+        data(2*(r+nrows*c)+1) = b.data(src+1)
+        j += 1
+      }
+      i += 1
+    }
+    b
+  }
+
+  /** Assign b into rows iv of column j; wraps j with IMat.ielem and delegates to update(IMat, IMat, CMat). */
+  def update(iv:IMat, j:Int, b:CMat):CMat = update(iv, IMat.ielem(j), b)
+
+  /** Assign b into columns jv of row i; wraps i with IMat.ielem and delegates to update(IMat, IMat, CMat). */
+  def update(i:Int, jv:IMat, b:CMat):CMat = update(IMat.ielem(i), jv, b)
+  
+  /*
+   * The update methods above implement sliced assignment, a(iv,jv) = b, where iv and jv
+   * are index vectors, using ? as wildcard. Element-wise binary operators follow.
+   */
+  
+  /**
+   * Apply the complex binary operator op2 element-wise to this matrix and a, broadcasting
+   * when one operand is a single row or column that matches the other's shape.
+   *
+   * op2 is called as op2(thisRe, thisIm, aRe, aIm) and returns the (re, im) result.
+   * Cases are tried in this order: this is a column, this is a row, a is a column,
+   * a is a row; any other shape combination is forwarded to ccMatOpStrict.
+   *
+   * @param a      right-hand operand; must be a CMat
+   * @param op2    operator on (re, im) pairs
+   * @param oldmat candidate output passed to CMat.newOrCheckCMat
+   * @return the result matrix
+   * @throws RuntimeException if a is not a CMat
+   */
+  def ccMatOp(a:Mat, op2:(Float,Float,Float,Float) => (Float,Float), oldmat:Mat):CMat = {
+    a match {
+      case aa:CMat => {
+        if (nrows==a.nrows && ncols==1) {
+          // this is a column vector: pair it with every column of a
+          val out = CMat.newOrCheckCMat(nrows, a.ncols, oldmat)
+          Mat.nflops += aa.length
+          var i = 0
+          while (i < a.ncols) {
+            var j = 0
+            while (j < nrows) {
+              // real part at 2*j, imaginary part at 2*j+1
+              val (v0, v1) = op2(data(2*j), data(2*j+1), aa.data(2*(j+i*a.nrows)), aa.data(2*(j+i*a.nrows)+1))
+              out.data(2*(j+i*nrows)) = v0
+              out.data(2*(j+i*nrows)+1) = v1
+              j += 1
+            }
+            i += 1
+          }
+          out
+        } else if (ncols==a.ncols && nrows==1) {
+          // this is a row vector: entry i is paired with every element of column i of a
+          val out = CMat.newOrCheckCMat(a.nrows, ncols, oldmat)
+          Mat.nflops += aa.length
+          var i = 0
+          while (i < ncols) {
+            var j = 0
+            while (j < a.nrows) {
+              val (v0, v1) = op2(data(2*i), data(2*i+1), aa.data(2*(j+i*a.nrows)), aa.data(2*(j+i*a.nrows)+1))
+              out.data(2*(j+i*a.nrows)) = v0
+              out.data(2*(j+i*a.nrows)+1) = v1
+              j += 1
+            }
+            i += 1
+          }
+          out
+        } else if (nrows==a.nrows && a.ncols==1) {
+          // a is a column vector: pair it with every column of this
+          val out = CMat.newOrCheckCMat(nrows, ncols, oldmat)
+          Mat.nflops += length
+          var i = 0
+          while (i < ncols) {
+            var j = 0
+            while (j < nrows) {
+              val (v0, v1) = op2(data(2*(j+i*nrows)), data(2*(j+i*nrows)+1), aa.data(2*j), aa.data(2*j+1))
+              out.data(2*(j+i*nrows)) = v0
+              out.data(2*(j+i*nrows)+1) = v1
+              j += 1
+            }
+            i += 1
+          }
+          out
+        } else if (ncols==a.ncols && a.nrows==1) {
+          // a is a row vector: entry i is paired with every element of column i of this
+          val out = CMat.newOrCheckCMat(nrows, ncols, oldmat)
+          Mat.nflops += length
+          var i = 0
+          while (i < ncols) {
+            var j = 0
+            while (j < nrows) {
+              val (v0, v1) = op2(data(2*(j+i*nrows)), data(2*(j+i*nrows)+1), aa.data(2*i), aa.data(2*i+1))
+              out.data(2*(j+i*nrows)) = v0
+              out.data(2*(j+i*nrows)+1) = v1
+              j += 1
+            }
+            i += 1
+          }
+          out
+        } else ccMatOpStrict(a, op2, oldmat)
+      }
+      case _ => throw new RuntimeException("arg must be dense")
+    }
+  }
+  /**
+   * Apply op2 element-wise with strict dimension checking: either both operands have
+   * identical dimensions, or one of them is a 1 x 1 scalar.
+   *
+   * op2 is called as op2(thisRe, thisIm, aRe, aIm) and returns the (re, im) result.
+   *
+   * @param a      right-hand operand; must be a CMat
+   * @param op2    operator on (re, im) pairs
+   * @param oldmat candidate output passed to CMat.newOrCheckCMat
+   * @throws RuntimeException if a is not a CMat or the dimensions are incompatible
+   */
+  def ccMatOpStrict(a:Mat, op2:(Float,Float,Float,Float) => (Float,Float), oldmat:Mat):CMat =
+    a match {
+      case aa:CMat => {
+        if (nrows==a.nrows && ncols==a.ncols) {
+          val out = CMat.newOrCheckCMat(nrows, ncols, oldmat)
+          Mat.nflops += length
+          var i = 0
+          while (i < aa.length) {
+            val (v0, v1) = op2(data(2*i), data(2*i+1), aa.data(2*i), aa.data(2*i+1))
+            out.data(2*i) = v0
+            out.data(2*i+1) = v1
+            i += 1
+          }
+          out
+        } else if (a.nrows == 1 && a.ncols == 1) {
+          // a is the scalar: it is the right-hand argument for every element of this
+          val out = CMat.newOrCheckCMat(nrows, ncols, oldmat)
+          Mat.nflops += length
+          val a0 = aa.data(0)
+          val a1 = aa.data(1)
+          var i = 0
+          while (i < length) {
+            val (v0, v1) = op2(data(2*i), data(2*i+1), a0, a1)
+            out.data(2*i) = v0
+            out.data(2*i+1) = v1
+            i += 1
+          }
+          out
+        } else if (nrows == 1 && ncols == 1) {
+          // this is the scalar: read it from this matrix's own storage, not from a
+          val out = CMat.newOrCheckCMat(a.nrows, a.ncols, oldmat)
+          Mat.nflops += aa.length
+          val s0 = data(0)
+          val s1 = data(1)
+          var i = 0
+          while (i < aa.length) {
+            val (v0, v1) = op2(s0, s1, aa.data(2*i), aa.data(2*i+1))
+            out.data(2*i) = v0
+            out.data(2*i+1) = v1
+            i += 1
+          }
+          out
+        } else throw new RuntimeException("dims incompatible")
+      }
+      case _ => throw new RuntimeException("arg must be dense")
+    }
+  
+  /**
+   * Vectorized element-wise operation with row/column broadcasting.
+   *
+   * opv is called as opv(x, x0, xinc, y, y0, yinc, z, z0, zinc, n): x is this matrix's
+   * storage, y is a's storage, z is the output storage. Offsets, increments and n are
+   * passed in units of complex elements (the CMat.vec* kernels double them to address
+   * floats). An increment of 0 repeats a single element.
+   *
+   * Cases are tried in this order: this is a column, this is a row, a is a column,
+   * a is a row; any other shape combination is forwarded to ccMatOpStrictv.
+   *
+   * @param a      right-hand operand; must be a CMat
+   * @param opv    strided kernel; its Float return value is ignored here
+   * @param oldmat candidate output passed to CMat.newOrCheckCMat
+   * @throws RuntimeException if a is not a CMat
+   */
+  def ccMatOpv(a:Mat, opv:(Array[Float],Int,Int,Array[Float],Int,Int,Array[Float],Int,Int,Int) => Float, oldmat:Mat):CMat = {
+    a match {
+      case aa:CMat => {
+        if (nrows==a.nrows && ncols==1) {
+          val out = CMat.newOrCheckCMat(nrows, a.ncols, oldmat)
+          Mat.nflops += aa.length
+          var i = 0
+          while (i < a.ncols) {
+            opv(data, 0, 1, aa.data, i*a.nrows, 1, out.data, i*nrows, 1, nrows)
+            i += 1
+          }
+          out
+        } else if (ncols==a.ncols && nrows==1) {
+          val out = CMat.newOrCheckCMat(a.nrows, ncols, oldmat)
+          Mat.nflops += aa.length
+          var i = 0
+          while (i < ncols) {
+            opv(data, i, 0, aa.data, i*a.nrows, 1, out.data, i*a.nrows, 1, a.nrows)
+            i += 1
+          }
+          out
+        } else if (nrows==a.nrows && a.ncols==1) {
+          val out = CMat.newOrCheckCMat(nrows, ncols, oldmat)
+          Mat.nflops += length
+          var i = 0
+          while (i < ncols) {
+            opv(data, i*nrows, 1, aa.data, 0, 1, out.data, i*nrows, 1, nrows)
+            i += 1
+          }
+          out
+        } else if (ncols==a.ncols && a.nrows==1) {
+          val out = CMat.newOrCheckCMat(nrows, ncols, oldmat)
+          Mat.nflops += length
+          var i = 0
+          while (i < ncols) {
+            // a contributes one repeated element per column; the count is this matrix's
+            // column height (a.nrows is 1 in this branch)
+            opv(data, i*nrows, 1, aa.data, i, 0, out.data, i*nrows, 1, nrows)
+            i += 1
+          }
+          out
+        } else ccMatOpStrictv(a, opv, oldmat)
+      }
+      case _ => throw new RuntimeException("arg must be dense")
+    }
+  }
+
+  /**
+   * Vectorized element-wise operation with strict shape rules: the operands must have
+   * identical dimensions, or one of them must be 1 x 1. The scalar side is passed to opv
+   * with increment 0 so its single element is reused for every output position.
+   *
+   * @param a      right-hand operand; must be a CMat
+   * @param opv    strided kernel called as opv(this, off, inc, a, off, inc, out, off, inc, n)
+   * @param oldmat candidate output passed to CMat.newOrCheckCMat
+   * @throws RuntimeException if a is not a CMat or the dimensions are incompatible
+   */
+  def ccMatOpStrictv(a:Mat, opv:(Array[Float],Int,Int,Array[Float],Int,Int,Array[Float],Int,Int,Int) => Float, oldmat:Mat):CMat = {
+    a match {
+      case other:CMat => {
+        val sameShape = nrows == a.nrows && ncols == a.ncols
+        val argIsScalar = a.nrows == 1 && a.ncols == 1
+        val selfIsScalar = nrows == 1 && ncols == 1
+        if (!sameShape && !argIsScalar && !selfIsScalar) {
+          throw new RuntimeException("dims incompatible")
+        }
+        if (sameShape) {
+          val result = CMat.newOrCheckCMat(nrows, ncols, oldmat)
+          Mat.nflops += length
+          opv(data, 0, 1, other.data, 0, 1, result.data, 0, 1, other.length)
+          result
+        } else if (argIsScalar) {
+          val result = CMat.newOrCheckCMat(nrows, ncols, oldmat)
+          Mat.nflops += length
+          opv(data, 0, 1, other.data, 0, 0, result.data, 0, 1, length)
+          result
+        } else {
+          val result = CMat.newOrCheckCMat(a.nrows, a.ncols, oldmat)
+          Mat.nflops += other.length
+          opv(data, 0, 0, other.data, 0, 1, result.data, 0, 1, other.length)
+          result
+        }
+      }
+      case _ => throw new RuntimeException("arg must be dense")
+    }
+  }
+  
+  /**
+   * Apply the strided kernel opv between every element of this matrix and the complex
+   * scalar (a0, a1). The scalar is placed in a two-float array and passed with
+   * increment 0.
+   *
+   * @param a0   first float of the scalar
+   * @param a1   second float of the scalar
+   * @param opv  strided kernel; its Float return value is ignored
+   * @param omat candidate output passed to CMat.newOrCheckCMat
+   */
+  def ccMatOpScalarv(a0:Float, a1:Float, opv:(Array[Float],Int,Int,Array[Float],Int,Int,Array[Float],Int,Int,Int) => Float, omat:Mat):CMat = {
+    val result = CMat.newOrCheckCMat(nrows, ncols, omat)
+    Mat.nflops += length
+    val scalar = Array(a0, a1)
+    opv(data, 0, 1, scalar, 0, 0, result.data, 0, 1, length)
+    result
+  }
+  
+  /** Forwards to ggReduceOp with the same arguments and converts the result with CMat(...). */
+  def ffReduceOp(n:Int, f1:(Float) => Float, f2:(Float, Float) => Float, out:Mat) = {
+    val reduced = ggReduceOp(n, f1, f2, out)
+    CMat(reduced)
+  }
+  
+  /** Forwards to ggReduceOpv with the same arguments and converts the result with CMat(...). */
+  def ffReduceOpv(n:Int, f:(Array[Float],Int,Int,Array[Float],Int,Int,Array[Float],Int,Int,Int) => Float, out:Mat) = {
+    val reduced = ggReduceOpv(n, f, out)
+    CMat(reduced)
+  }
+    
+  /**
+   * Reduce this matrix along one dimension with the strided kernel opv.
+   *
+   * dim0 == 0 selects dimension 2 when this matrix has a single row and dimension 1
+   * otherwise; other values below 1 are treated as 1.
+   *   - dim 1: produces a 1 x ncols row; each output is seeded with the first entry of
+   *     its column and opv folds in the remaining nrows-1 entries.
+   *   - dim 2: produces an nrows x 1 column; it is seeded with column 0 and opv folds in
+   *     each later column.
+   *
+   * @param dim0   requested dimension (0, 1 or 2)
+   * @param opv    strided kernel called as opv(x, x0, xinc, acc, a0, ainc, acc, a0, ainc, n)
+   * @param oldmat candidate output passed to CMat.newOrCheckCMat
+   * @throws RuntimeException if the resolved dimension is neither 1 nor 2
+   */
+  def ccReduceOpv(dim0:Int, opv:(Array[Float],Int,Int,Array[Float],Int,Int,Array[Float],Int,Int,Int) => Float, oldmat:Mat):CMat = {
+    val dim = if (nrows == 1 && dim0 == 0) 2 else math.max(1, dim0)
+    if (dim == 1) {
+      val out = CMat.newOrCheckCMat(1, ncols, oldmat)
+      Mat.nflops += length
+      var i = 0
+      while (i < ncols) {
+        // Seed with the complex entry at the top of column i: element offsets are in
+        // complex units, so the floats live at 2*offset and 2*offset+1.
+        out.data(2*i) = data(2*i*nrows)
+        out.data(2*i+1) = data(2*i*nrows+1)
+        // NOTE(review): the accumulator is passed with increment 0, so opv must iterate
+        // by element count rather than by advancing the output index.
+        opv(data, i*nrows+1, 1, out.data, i, 0, out.data, i, 0, nrows-1)
+        i += 1
+      }
+      out
+    } else if (dim == 2) {
+      val out = CMat.newOrCheckCMat(nrows, 1, oldmat)
+      Mat.nflops += length
+      // copy column 0 (2*nrows floats) as the starting accumulator
+      var j = 0
+      while (j < 2*nrows) {
+        out.data(j) = data(j)
+        j += 1
+      }
+      var i = 1
+      while (i < ncols) {
+        opv(data, i*nrows, 1, out.data, 0, 1, out.data, 0, 1, nrows)
+        i += 1
+      }
+      out
+    } else
+      throw new RuntimeException("index must 1 or 2");
+  }
+  
+  /** Forwards to ggReduceAll with the same arguments and converts the result with CMat(...). */
+  def ffReduceAll(n:Int, f1:(Float) => Float, f2:(Float, Float) => Float, out:Mat) = {
+    val reduced = ggReduceAll(n, f1, f2, out)
+    CMat(reduced)
+  }
+  
+  /** Forwards to ggReduceAllv with the same arguments and converts the result with CMat(...). */
+  def ffReduceAllv(n:Int, f:(Array[Float],Int,Int,Array[Float],Int,Int,Array[Float],Int,Int,Int) => Float, out:Mat) = {
+    val reduced = ggReduceAllv(n, f, out)
+    CMat(reduced)
+  }
+  
+  /**
+   * Format the i-th complex entry as text.
+   *
+   * Each component prints as an integer when it has no fractional part and magnitude
+   * below 1e10, otherwise with "%.5g". A zero second component prints the first
+   * component alone; a zero first component prints only the second followed by "i";
+   * otherwise both are printed, with an explicit "+" when the second is positive.
+   */
+  override def printOne(i:Int):String = {
+    def show(x:Float):String =
+      if (x % 1 == 0 && math.abs(x) < 1e10) "%d".format(x.intValue) else "%.5g".format(x)
+    val re = data(2*i)
+    val im = data(2*i+1)
+    val reText = show(re)
+    if (im == 0) {
+      reText
+    } else {
+      val imText = show(im)
+      if (re == 0) imText+"i"
+      else if (im > 0) reText+"+"+imText+"i"
+      else reText+imText+"i"
+    }
+  }
+  
+  /**
+   * Copy this matrix's 2*length floats into out's storage.
+   *
+   * @param out destination; must be a CMat whose data array holds at least 2*length floats
+   * @return out
+   * @throws RuntimeException if out is not a CMat
+   */
+  override def copyTo(out:Mat) = {
+    out match {
+      case cout:CMat => System.arraycopy(data, 0, cout.data, 0, 2*length)
+      // explicit error instead of a bare MatchError, matching the other type checks in this class
+      case _ => throw new RuntimeException("unsupported arg to copyTo "+out)
+    }
+    out
+  }
+  
+  /** Return a newly allocated CMat of the same dimensions holding a copy of this matrix's 2*length floats. */
+  override def copy = {
+    val duplicate = CMat(nrows, ncols)
+    val floatCount = 2*length
+    System.arraycopy(data, 0, duplicate.data, 0, floatCount)
+    duplicate
+  }
+  
+  /** A freshly allocated nr x nc CMat; the backing float array is new and therefore all zero. */
+  override def zeros(nr:Int, nc:Int) = CMat(nr, nc)
+  
+  /**
+   * Build an nr x nc matrix whose every entry is 1 + 0i.
+   *
+   * The first float of each pair is written directly in the backing array; the second
+   * float keeps its initial zero.
+   */
+  override def ones(nr:Int, nc:Int) = {
+    val out = CMat(nr, nc)
+    var i = 0
+    while (i < out.length) {
+      // write the raw float slot: out(2*i) = 1 would go through the element update
+      // method, which addresses elements rather than floats
+      out.data(2*i) = 1
+      i += 1
+    }
+    out
+  }
+  
+  /**
+   * Complex matrix product this * aa, with scalar fallbacks.
+   *
+   * When ncols == aa.nrows the full product is computed: by an explicit triple loop if
+   * Mat.noMKL is set, otherwise through cgemv (one operand is a vector) or cgemm.
+   * When the inner dimensions do not match but this matrix is 1 x 1, or aa is 1 x 1,
+   * the scalar is multiplied into every element of the other operand.
+   *
+   * @param aa     right-hand operand
+   * @param outmat candidate output passed to CMat.newOrCheckCMat
+   * @throws RuntimeException if the dimensions fit none of the cases above
+   */
+  def fDMult(aa:CMat, outmat:Mat):CMat = { 
+  		if (ncols == aa.nrows) {
+  			val out = CMat.newOrCheckCMat(nrows, aa.ncols, outmat)
+  			Mat.nflops += 2L * length * aa.ncols
+  			if (Mat.noMKL) {
+  				// the loop below accumulates with +=, so a supplied output must be zeroed first
+  				if (outmat.asInstanceOf[AnyRef] != null) out.clear
+  				var i = 0
+  				while (i < aa.ncols) {
+  					var j = 0
+  					while (j < aa.nrows) {
+  						var k = 0
+  						// (u0, u1) is aa(j, i); ncols equals aa.nrows in this branch
+  						val u0 = aa.data(2*(j + i*ncols))
+  						val u1 = aa.data(2*(j + i*ncols)+1)
+  						while (k < nrows) {
+  							// (v0, v1) is this(k, j); add the complex product into out(k, i)
+  							val v0 = data(2*(k+j*nrows))
+  							val v1 = data(2*(k+j*nrows)+1)
+  							out.data(2*(k+i*nrows)) += u0*v0-u1*v1
+  							out.data(2*(k+i*nrows)+1) += u1*v0+u0*v1
+  							k += 1
+  						}
+  						j += 1
+  					}
+  					i += 1									
+  				}
+  			} else {
+  				// complex scalars passed to the BLAS routines as (re, im) pairs: alpha = 1, beta = 0
+  				val alpha = List(1.0f,0f).toArray
+  				val beta = List(0f,0f).toArray
+  				if (nrows == 1) {
+  					cgemv(ORDER.ColMajor, TRANSPOSE.Trans, aa.nrows, aa.ncols, alpha, aa.data, aa.nrows, data, 1, beta, out.data, 1)
+  				} else if (aa.ncols == 1) {
+  					cgemv(ORDER.ColMajor, TRANSPOSE.NoTrans, nrows, ncols, alpha, data, nrows, aa.data, 1, beta, out.data, 1)
+  				} else {
+  					cgemm(ORDER.ColMajor, TRANSPOSE.NoTrans, TRANSPOSE.NoTrans,
+  							nrows, aa.ncols, ncols, alpha, data, nrows, aa.data, aa.nrows, beta, out.data, nrows)
+  				}
+  			}
+  			out
+  		} else if (ncols == 1 && nrows == 1){
+  			// this is a scalar: scale every element of aa
+  			val out = CMat.newOrCheckCMat(aa.nrows, aa.ncols, outmat)
+  			Mat.nflops += aa.length
+  			var i = 0
+  			val u0 = data(0)
+  			val u1 = data(1)
+  			while (i < aa.length) {
+  				val v0 = aa.data(2*i)
+  				val v1 = aa.data(2*i+1)
+  				out.data(2*i) = u0*v0-u1*v1
+  				out.data(2*i+1) = u0*v1+u1*v0
+  				i += 1
+  			}			    
+  			out			  
+  		} else if (aa.ncols == 1 && aa.nrows == 1){
+  			// aa is a scalar: scale every element of this
+  			val out = CMat.newOrCheckCMat(nrows, ncols, outmat)
+  			Mat.nflops += length
+  			var i = 0
+  			val u0 = aa.data(0)
+  			val u1 = aa.data(1)
+  			while (i < length) {
+  				val v0 = data(2*i)
+  				val v1 = data(2*i+1)
+  				out.data(2*i) = u0*v0-u1*v1
+  				out.data(2*i+1) = u0*v1+u1*v0
+  				i += 1
+  			}			    
+  			out			  
+  		}	else throw new RuntimeException("dimensions mismatch")
+  }
+ 
+  
+  /**
+   * Sum of element-wise complex products of two vectors of equal length, returned as a
+   * 1 x 1 CMat. Neither operand is conjugated. The sums are accumulated in Double and
+   * narrowed to Float at the end.
+   *
+   * @throws RuntimeException if either operand is not a vector or the lengths differ
+   */
+  def dot (b : CMat):CMat = {
+    val bothVectors = math.min(nrows, ncols) == 1 && math.min(b.nrows,b.ncols) == 1
+    if (bothVectors && length == b.length) {
+      Mat.nflops += 2 * length
+      var re = 0.0
+      var im = 0.0
+      var k = 0
+      while (k < length) {
+        val xr = data(2*k)
+        val xi = data(2*k+1)
+        val yr = b.data(2*k)
+        val yi = b.data(2*k+1)
+        re += xr*yr-xi*yi
+        im += xr*yi+xi*yr
+        k += 1
+      }
+      CMat.celem(re.asInstanceOf[Float], im.asInstanceOf[Float])
+    } else {
+      throw new RuntimeException("vector dims not compatible")
+    }
+  }
+
+  /**
+   * Implementation of this / a0 (used by the / operator).
+   *
+   * a0 must be a square CMat whose size equals this matrix's column count. a0 is copied
+   * into a scratch buffer, LU-factorized with cgetrf, and cgetrs then overwrites a copy
+   * of this matrix with the solution. Both LAPACK calls are issued with ORDER.RowMajor.
+   * NOTE(review): presumably the row-major view of the column-major buffers is what turns
+   * this into a right-division; confirm against the LAPACK binding.
+   *
+   * @param a0 divisor matrix
+   * @return a new nrows x ncols CMat holding the solution
+   * @throws RuntimeException if a0 is not a CMat, is not square, or does not match ncols
+   */
+  def solvel(a0:Mat):CMat =
+    a0 match {
+      case a:CMat => {
+        Mat.nflops += 2L*a.nrows*a.nrows*a.nrows/3 + 2L*nrows*a.nrows*a.nrows
+        if (a.nrows != a.ncols || ncols != a.nrows) {
+          throw new RuntimeException("solve needs a square matrix")
+        } else {
+          val out = CMat(nrows, ncols)
+          // Scratch copy of a for the factorization. It must be sized from a, not from
+          // this matrix: this has nrows*ncols entries, which is smaller when nrows < ncols.
+          val tmp = new Array[Float](2*a.length)
+          System.arraycopy(a.data, 0, tmp, 0, 2*a.length)
+          System.arraycopy(data, 0, out.data, 0, 2*length)
+          val ipiv = new Array[Int](ncols)
+          cgetrf(ORDER.RowMajor, ncols, ncols, tmp, ncols, ipiv)
+          cgetrs(ORDER.RowMajor, "N", ncols, nrows, tmp, ncols, ipiv, out.data, nrows)
+          out
+        }
+      }
+      case _ => throw new RuntimeException("unsupported arg to / "+a0)
+    }
+  
+  /**
+   * Implementation of this \\ a0 (used by the \\ operator).
+   *
+   * This matrix must be square with ncols == a0.nrows. It is copied into a scratch
+   * buffer, LU-factorized with cgetrf, and cgetrs then overwrites a copy of a0 with the
+   * solution. Both LAPACK calls are issued with ORDER.ColMajor.
+   *
+   * @param a0 right-hand side; must be a CMat
+   * @return a new a0.nrows x a0.ncols CMat holding the solution
+   * @throws RuntimeException if a0 is not a CMat or the dimension check fails
+   */
+  def solver(a0:Mat):CMat = 
+    a0 match {
+      case a:CMat => { 
+        Mat.nflops += 2L*nrows*nrows*nrows/3 + 2L*nrows*nrows*a.ncols
+        if (nrows != ncols || ncols != a.nrows) {
+          throw new RuntimeException("solve needs a square matrix")
+        } else {
+          val out = CMat(a.nrows, a.ncols)
+          // tmp receives the factorization of this matrix; out starts as a copy of a
+          val tmp = new Array[Float](2*length)
+          System.arraycopy(data, 0, tmp, 0, 2*length)
+          System.arraycopy(a.data, 0, out.data, 0, 2*a.length)
+          val ipiv = new Array[Int](ncols)
+          cgetrf(ORDER.ColMajor, ncols, ncols, tmp, ncols, ipiv)
+          cgetrs(ORDER.ColMajor, "N", ncols, a.ncols, tmp, nrows, ipiv, out.data, nrows)
+          out
+        }
+      }
+      case _ => throw new RuntimeException("unsupported arg to \\ "+a0)
+    }
+  
+  /**
+   * Matrix inverse, computed on a copy of this matrix with cgetrf followed by cgetri.
+   *
+   * @return a new CMat holding the inverse
+   * @throws RuntimeException if this matrix is not square
+   */
+  def inv:CMat = {
+    import edu.berkeley.bid.LAPACK._
+    if (nrows != ncols) {
+      throw new RuntimeException("inv method needs a square matrix")
+    } else {
+      val out = CMat(nrows, ncols)
+      // each entry is two floats, so the whole matrix is 2*length floats
+      System.arraycopy(data, 0, out.data, 0, 2*length)
+      val ipiv = new Array[Int](nrows)
+      cgetrf(ORDER.ColMajor, nrows, ncols, out.data, nrows, ipiv)
+      cgetri(ORDER.ColMajor, nrows, out.data, nrows, ipiv)
+      out
+    }
+  }
+  
+  /** Zero the first 2*length floats of the backing array in place and return this matrix. */
+  override def clear = {
+    val floatCount = 2*length
+    Arrays.fill(data, 0, floatCount, 0f)
+    this
+  }
+  
+  /**
+   * Zero, in place, rows 0 until i+off of every column i >= 1 (both floats of each
+   * entry), and return this matrix.
+   *
+   * @param off shift applied to the exclusive row bound of each column
+   * @throws RuntimeException if the matrix is not square
+   */
+  override def clearUpper(off:Int) = {
+    if (nrows != ncols) {
+      throw new RuntimeException("clearUpper assumes a square matrix")
+    }
+    var col = 1
+    while (col < ncols) {
+      val stop = col + off
+      var row = 0
+      while (row < stop) {
+        val k = 2*(row + col*nrows)
+        data(k) = 0
+        data(k+1) = 0
+        row += 1
+      }
+      col += 1
+    }
+    this
+  }
+  /** clearUpper with offset 0. */
+  override def clearUpper = clearUpper(0)
+  
+  /**
+   * Zero, in place, rows i+1+off until nrows of every column i except the last (both
+   * floats of each entry), and return this matrix.
+   *
+   * @param off shift applied to the first cleared row of each column
+   * @throws RuntimeException if the matrix is not square
+   */
+  override def clearLower(off:Int):CMat = {
+    if (nrows != ncols) {
+      throw new RuntimeException("clearLower assumes a square matrix")
+    }
+    var col = 0
+    while (col < ncols-1) {
+      var row = col + 1 + off
+      while (row < nrows) {
+        val k = 2*(row + col*nrows)
+        data(k) = 0
+        data(k+1) = 0
+        row += 1
+      }
+      col += 1
+    }
+    this
+  }
+  
+  /** clearLower with offset 0. */
+  override def clearLower:CMat = clearLower(0)
+  
+  /**
+   * Build an n x n matrix with this vector on the diagonal and zeros elsewhere, where
+   * n is the larger of nrows and ncols.
+   *
+   * @throws RuntimeException if this matrix has more than one row and more than one column
+   */
+  override def mkdiag = {
+    if (math.min(nrows, ncols) > 1) {
+      throw new RuntimeException("mkdiag needs a vector input")
+    }
+    val n = math.max(nrows, ncols)
+    val result = CMat(n,n)
+    var k = 0
+    while (k < n) {
+      // entry (k, k) of an n x n column-major matrix sits at element k*(n+1)
+      val dst = 2*k*(n+1)
+      result.data(dst) = data(2*k)
+      result.data(dst+1) = data(2*k+1)
+      k += 1
+    }
+    result
+  }
+  
+  /** Copy the main diagonal into a new n x 1 column, where n is the smaller of nrows and ncols. */
+  override def getdiag = {
+    val n = math.min(nrows, ncols)
+    val result = CMat(n,1)
+    var k = 0
+    while (k < n) {
+      // entry (k, k) sits at element k*(nrows+1) in column-major storage
+      val src = 2*k*(nrows+1)
+      result.data(2*k) = data(src)
+      result.data(2*k+1) = data(src+1)
+      k += 1
+    }
+    result
+  }
+
+  // Operators with a CMat right-hand side; all allocate a new result (null output).
+  //   *          matrix product (fDMult)
+  //   + - *@ /@  element-wise add, subtract, multiply, divide (ccMatOpv with the CMat.vec* kernels)
+  //   /  \\      delegate to solvel and solver respectively
+  def *  (b : CMat) = fDMult(b, null)
+  def +  (b : CMat) = ccMatOpv(b, CMat.vecAdd _, null)
+  def -  (b : CMat) = ccMatOpv(b, CMat.vecSub _, null)
+  def *@ (b : CMat) = ccMatOpv(b, CMat.vecMul _, null)
+  def /@ (b : CMat) = ccMatOpv(b, CMat.vecDiv _, null)
+  def /  (b : CMat) = solvel(b)
+  def \\ (b : CMat) = solver(b)
+  
+  // Element-wise comparisons: each result entry is 1+0i where the test holds, else 0+0i.
+  // Two entries are equal only when both floats match.
+  def == (b : CMat) = ccMatOp(b, (ar:Float, ai:Float, br:Float, bi:Float) => if (ar == br && ai == bi) (1f, 0f) else (0f, 0f), null)
+  def != (b : CMat) = ccMatOp(b, (ar:Float, ai:Float, br:Float, bi:Float) => if (ar != br || ai != bi) (1f, 0f) else (0f, 0f), null)
+  
+  // Operators with a real scalar right-hand side (Float, Double, Int). The scalar is
+  // used as a complex value whose second component is 0; Double and Int are converted
+  // to Float first. Both * and *@ use the element-wise multiply kernel. Comparisons
+  // wrap the scalar in a 1 x 1 CMat and yield 1+0i / 0+0i per entry.
+  override def *  (b : Float) = ccMatOpScalarv(b, 0, CMat.vecMul _, null)
+  override def +  (b : Float) = ccMatOpScalarv(b, 0, CMat.vecAdd _, null)
+  override def -  (b : Float) = ccMatOpScalarv(b, 0, CMat.vecSub _, null)
+  override def *@ (b : Float) = ccMatOpScalarv(b, 0, CMat.vecMul _, null)
+  override def /@ (b : Float) = ccMatOpScalarv(b, 0, CMat.vecDiv _, null)
+  
+  override def == (b : Float) = ccMatOp(CMat.celem(b, 0), (ar:Float, ai:Float, br:Float, bi:Float) => if (ar == br && ai == bi) (1f, 0f) else (0f, 0f), null)
+  override def != (b : Float) = ccMatOp(CMat.celem(b, 0), (ar:Float, ai:Float, br:Float, bi:Float) => if (ar != br || ai != bi) (1f, 0f) else (0f, 0f), null)
+  
+  override def *  (b : Double) = ccMatOpScalarv(b.asInstanceOf[Float], 0, CMat.vecMul _, null)
+  override def +  (b : Double) = ccMatOpScalarv(b.asInstanceOf[Float], 0, CMat.vecAdd _, null)
+  override def -  (b : Double) = ccMatOpScalarv(b.asInstanceOf[Float], 0, CMat.vecSub _, null)
+  override def *@ (b : Double) = ccMatOpScalarv(b.asInstanceOf[Float], 0, CMat.vecMul _, null)
+  override def /@ (b : Double) = ccMatOpScalarv(b.asInstanceOf[Float], 0, CMat.vecDiv _, null)
+  
+  override def == (b : Double) = ccMatOp(CMat.celem(b.asInstanceOf[Float], 0), (ar:Float, ai:Float, br:Float, bi:Float) => if (ar == br && ai == bi) (1f, 0f) else (0f, 0f), null)
+  override def != (b : Double) = ccMatOp(CMat.celem(b.asInstanceOf[Float], 0), (ar:Float, ai:Float, br:Float, bi:Float) => if (ar != br || ai != bi) (1f, 0f) else (0f, 0f), null)
+  
+  override def *  (b : Int) = ccMatOpScalarv(b.asInstanceOf[Float], 0, CMat.vecMul _, null)
+  override def +  (b : Int) = ccMatOpScalarv(b.asInstanceOf[Float], 0, CMat.vecAdd _, null)
+  override def -  (b : Int) = ccMatOpScalarv(b.asInstanceOf[Float], 0, CMat.vecSub _, null)
+  override def *@ (b : Int) = ccMatOpScalarv(b.asInstanceOf[Float], 0, CMat.vecMul _, null)
+  override def /@ (b : Int) = ccMatOpScalarv(b.asInstanceOf[Float], 0, CMat.vecDiv _, null)
+  
+  override def == (b : Int) = ccMatOp(CMat.celem(b.asInstanceOf[Float], 0), (ar:Float, ai:Float, br:Float, bi:Float) => if (ar == br && ai == bi) (1f, 0f) else (0f, 0f), null)
+  override def != (b : Int) = ccMatOp(CMat.celem(b.asInstanceOf[Float], 0), (ar:Float, ai:Float, br:Float, bi:Float) => if (ar != br || ai != bi) (1f, 0f) else (0f, 0f), null)
+  
+  // Concatenation: \ delegates to horzcat, on delegates to vertcat.
+  def \ (b: CMat) = horzcat(b)  
+  def on (b: CMat) = vertcat(b)
+  
+  // a ~ b builds a CPair with this matrix as the output slot (omat) and b as the operand
+  // (mat), so a following operator on the pair computes with b and targets this matrix.
+  def ~ (b : CMat):CPair = new CPair(this, b)
+
+  // Generic form of ~: only a CMat right-hand side is accepted.
+  override def ~ (b: Mat):Pair = 
+    b match {
+    case db:CMat => new CPair(this, db)
+    case _ => throw new RuntimeException("mismatched types for operator ~")
+  }
+  
+   /*
+  * Operators whose second arg is generic. 
+  */ 
+  import Operator._
+  // Each operator hands this matrix, b, a null output and an operator code (Mop_*)
+  // to applyMat.
+  override def +  (b : Mat):Mat = applyMat(this, b, null, Mop_Plus)
+  override def -  (b : Mat):Mat = applyMat(this, b, null, Mop_Minus)
+  override def *  (b : Mat):Mat = applyMat(this, b, null, Mop_Times)
+  override def /  (b : Mat):Mat = applyMat(this, b, null, Mop_Div)
+  override def \\ (b : Mat):Mat = applyMat(this, b, null, Mop_RSolve)
+  override def *@ (b : Mat):Mat = applyMat(this, b, null, Mop_ETimes)
+  override def /@ (b : Mat):Mat = applyMat(this, b, null, Mop_EDiv)
+  override def \  (b : Mat):Mat = applyMat(this, b, null, Mop_HCat)
+  override def on (b : Mat):Mat = applyMat(this, b, null, Mop_VCat)
+  
+  override def == (b : Mat):Mat = applyMat(this, b, null, Mop_EQ)
+  override def != (b : Mat):Mat = applyMat(this, b, null, Mop_NE)
+  
+  /**
+   * Provide an nr x nc matrix, reusing storage where possible: this matrix itself when
+   * the shape already matches, a new CMat sharing this data array when the array holds
+   * at least 2*nr*nc floats, and a fresh allocation otherwise. nnz is not used.
+   */
+  override def recycle(nr:Int, nc:Int, nnz:Int):CMat = {
+    val sameShape = nrows == nr && nc == ncols
+    if (sameShape) this
+    else if (data.size >= 2*nr*nc) new CMat(nr, nc, data)
+    else CMat(nr, nc)
+  }
+}
+
+/**
+ * An (output, operand) pair: every operator below runs the corresponding CMat routine
+ * on mat and passes omat as the candidate output matrix.
+ *
+ * Real scalar arguments (Float, Double, Int) are used as complex values whose second
+ * component is 0.
+ */
+class CPair (val omat:Mat, val mat:CMat) extends Pair {
+
+  // Entry-wise equality test: 1+0i when both floats match, else 0+0i.
+  private def pairEq(ar:Float, ai:Float, br:Float, bi:Float):(Float,Float) =
+    if (ar == br && ai == bi) (1f, 0f) else (0f, 0f)
+
+  // Entry-wise inequality test: 1+0i when either float differs, else 0+0i.
+  private def pairNe(ar:Float, ai:Float, br:Float, bi:Float):(Float,Float) =
+    if (ar != br || ai != bi) (1f, 0f) else (0f, 0f)
+
+  override def t:CMat = CMat(mat.gt(omat))
+
+  // CMat right-hand side
+  def * (b : CMat) = mat.fDMult(b, omat)
+  def + (b : CMat) = mat.ccMatOpv(b, CMat.vecAdd _, omat)
+  def - (b : CMat) = mat.ccMatOpv(b, CMat.vecSub _, omat)
+  def *@ (b : CMat) = mat.ccMatOpv(b, CMat.vecMul _, omat)
+  def /@ (b : CMat) = mat.ccMatOpv(b, CMat.vecDiv _, omat)
+//  override def ^ (b : Mat) = mat.ccMatOp(b, (x:Float, y:Float) => math.pow(x,y).toFloat, null)
+
+  def == (b : CMat) = mat.ccMatOp(b, pairEq _, omat)
+  def != (b : CMat) = mat.ccMatOp(b, pairNe _, omat)
+
+  // Float right-hand side
+  override def * (b : Float) = mat.ccMatOpScalarv(b, 0, CMat.vecMul _, omat)
+  override def + (b : Float) = mat.ccMatOpScalarv(b, 0, CMat.vecAdd _, omat)
+  override def - (b : Float) = mat.ccMatOpScalarv(b, 0, CMat.vecSub _, omat)
+  override def *@ (b : Float) = mat.ccMatOpScalarv(b, 0, CMat.vecMul _, omat)
+  override def /@ (b : Float) = mat.ccMatOpScalarv(b, 0, CMat.vecDiv _, omat)
+
+  override def == (b : Float) = mat.ccMatOp(CMat.celem(b, 0), pairEq _, omat)
+  override def != (b : Float) = mat.ccMatOp(CMat.celem(b, 0), pairNe _, omat)
+
+  // Double right-hand side (narrowed to Float)
+  override def *  (b : Double) = mat.ccMatOpScalarv(b.toFloat, 0, CMat.vecMul _, omat)
+  override def +  (b : Double) = mat.ccMatOpScalarv(b.toFloat, 0, CMat.vecAdd _, omat)
+  override def -  (b : Double) = mat.ccMatOpScalarv(b.toFloat, 0, CMat.vecSub _, omat)
+  override def *@ (b : Double) = mat.ccMatOpScalarv(b.toFloat, 0, CMat.vecMul _, omat)
+  override def /@ (b : Double) = mat.ccMatOpScalarv(b.toFloat, 0, CMat.vecDiv _, omat)
+
+  override def == (b : Double) = mat.ccMatOp(CMat.celem(b.toFloat, 0), pairEq _, omat)
+  override def != (b : Double) = mat.ccMatOp(CMat.celem(b.toFloat, 0), pairNe _, omat)
+
+  // Int right-hand side (converted to Float)
+  override def *  (b : Int) = mat.ccMatOpScalarv(b.toFloat, 0, CMat.vecMul _, omat)
+  override def +  (b : Int) = mat.ccMatOpScalarv(b.toFloat, 0, CMat.vecAdd _, omat)
+  override def -  (b : Int) = mat.ccMatOpScalarv(b.toFloat, 0, CMat.vecSub _, omat)
+  override def *@ (b : Int) = mat.ccMatOpScalarv(b.toFloat, 0, CMat.vecMul _, omat)
+  override def /@ (b : Int) = mat.ccMatOpScalarv(b.toFloat, 0, CMat.vecDiv _, omat)
+
+  override def == (b : Int) = mat.ccMatOp(CMat.celem(b.toFloat, 0), pairEq _, omat)
+  override def != (b : Int) = mat.ccMatOp(CMat.celem(b.toFloat, 0), pairNe _, omat)
+
+}
+
+/**
+ * Factory methods and strided element-wise kernels for CMat.
+ *
+ * Storage convention used throughout: complex element k occupies floats 2*k (real
+ * part) and 2*k+1 (imaginary part) of the data array.
+ */
+object CMat {
+
+  /** Allocate an nr x nc complex matrix backed by a new array of 2*nr*nc floats. */
+  def apply(nr:Int, nc:Int) = new CMat(nr, nc, new Array[Float](2*nr*nc))
+
+  /** Complex matrix whose real parts are the entries of a and whose imaginary parts are 0. */
+  def real(a:FMat):CMat = {
+    val out = CMat(a.nrows, a.ncols)
+    var i = 0
+    while (i < a.length) {
+      out.data(2*i) = a.data(i)
+      i += 1
+    }
+    out
+  }
+
+  /** Complex matrix whose imaginary parts are the entries of a and whose real parts are 0. */
+  def imag(a:FMat):CMat = {
+    val out = CMat(a.nrows, a.ncols)
+    var i = 0
+    while (i < a.length) {
+      out.data(2*i+1) = a.data(i)
+      i += 1
+    }
+    out
+  }
+
+  /**
+   * Convert a matrix to a new CMat. DMat and IMat are converted through FMat and become
+   * the real part; FMat becomes the real part; CMat is copied.
+   *
+   * @throws RuntimeException for any other source type
+   */
+  def apply(x:Mat):CMat = {
+    x match {
+      case dd:DMat => real(FMat(dd))
+      case cc:CMat => {
+        val out = CMat(x.nrows, x.ncols)
+        System.arraycopy(cc.data, 0, out.data, 0, 2*cc.length)
+        out
+      }
+      case ii:IMat => real(FMat(ii))
+      case ff:FMat => real(ff)
+//      case xx:DenseMat[Float] => new CMat(xx.nrows, xx.ncols, xx.data)
+      case _ => throw new RuntimeException("Unsupported source type")
+    }
+  }
+
+  /** 1 x 1 complex matrix holding x + y*i. */
+  def celem(x:Float, y:Float) = {
+    val out = CMat(1,1)
+    out.data(0) = x
+    out.data(1) = y
+    out
+  }
+
+  /*
+   * Strided kernels. All offsets (a0, b0, c0), increments (ainc, binc, cinc) and the
+   * count n are in units of complex elements. Exactly n elements are processed; an
+   * increment of 0 reuses the same element each time, which is how scalars are
+   * broadcast and how reductions accumulate into a single output element. The Float
+   * return value is always 0.
+   */
+
+  /** c = a + b, element-wise over n complex elements. */
+  def vecAdd(a:Array[Float], a0:Int, ainc:Int, b:Array[Float], b0:Int, binc:Int, c:Array[Float], c0:Int, cinc:Int, n:Int):Float = {
+    var ai = a0; var bi = b0; var ci = c0; var i = 0
+    // count iterations explicitly so a zero output increment still terminates
+    while (i < n) {
+      c(2*ci) = a(2*ai) + b(2*bi)
+      c(2*ci+1) = a(2*ai+1) + b(2*bi+1)
+      ai += ainc; bi += binc; ci += cinc
+      i += 1
+    }
+    0
+  }
+
+  /** c = a - b, element-wise over n complex elements. */
+  def vecSub(a:Array[Float], a0:Int, ainc:Int, b:Array[Float], b0:Int, binc:Int, c:Array[Float], c0:Int, cinc:Int, n:Int):Float = {
+    var ai = a0; var bi = b0; var ci = c0; var i = 0
+    while (i < n) {
+      c(2*ci) = a(2*ai) - b(2*bi)
+      c(2*ci+1) = a(2*ai+1) - b(2*bi+1)
+      ai += ainc; bi += binc; ci += cinc
+      i += 1
+    }
+    0
+  }
+
+  /** c = a * b (complex product), element-wise over n complex elements. */
+  def vecMul(a:Array[Float], a0:Int, ainc:Int, b:Array[Float], b0:Int, binc:Int, c:Array[Float], c0:Int, cinc:Int, n:Int):Float = {
+    var ai = a0; var bi = b0; var ci = c0; var i = 0
+    while (i < n) {
+      // read both operands before writing: c may alias a or b
+      val u0 = a(2*ai)
+      val u1 = a(2*ai+1)
+      val v0 = b(2*bi)
+      val v1 = b(2*bi+1)
+      c(2*ci) = u0*v0-u1*v1
+      c(2*ci+1) = u0*v1+v0*u1
+      ai += ainc; bi += binc; ci += cinc
+      i += 1
+    }
+    0
+  }
+
+  /** c = a / b (complex quotient), element-wise over n complex elements. */
+  def vecDiv(a:Array[Float], a0:Int, ainc:Int, b:Array[Float], b0:Int, binc:Int, c:Array[Float], c0:Int, cinc:Int, n:Int):Float = {
+    var ai = a0; var bi = b0; var ci = c0; var i = 0
+    while (i < n) {
+      // read both operands before writing: c may alias a or b
+      val u0 = a(2*ai)
+      val u1 = a(2*ai+1)
+      val v0 = b(2*bi)
+      val v1 = b(2*bi+1)
+      // a/b = a * conj(b) / |b|^2
+      val denom = v0*v0 + v1*v1
+      c(2*ci) = (u0*v0+u1*v1)/denom
+      c(2*ci+1) = (u1*v0-v1*u0)/denom
+      ai += ainc; bi += binc; ci += cinc
+      i += 1
+    }
+    0
+  }
+
+  /**
+   * Choose the output matrix for an nr x nc result: a fresh CMat when outmat is null or
+   * 0 x 0, outmat itself when its dimensions already match, and otherwise whatever
+   * outmat.recycle(nr, nc, 0) returns. A non-null outmat must be a CMat.
+   */
+  def newOrCheckCMat(nr:Int, nc:Int, outmat:Mat):CMat = {
+    if (outmat.asInstanceOf[AnyRef] == null || (outmat.nrows == 0 && outmat.ncols == 0)) {
+      CMat(nr, nc)
+    } else {
+      if (outmat.nrows != nr || outmat.ncols != nc) {
+        outmat.recycle(nr, nc, 0).asInstanceOf[CMat]
+      } else {
+        outmat.asInstanceOf[CMat]
+      }
+    }
+  }
+}
+
+
+
+
+
+
diff --git a/src/main/scala/BIDMat/CSMat.scala b/src/main/scala/BIDMat/CSMat.scala
new file mode 100755
index 00000000..b1c0a1b8
--- /dev/null
+++ b/src/main/scala/BIDMat/CSMat.scala
@@ -0,0 +1,165 @@
+package BIDMat
+import Mat._
+
+/**
+ * Dense matrix of Strings stored column by column (element (j, i) is data(j + i*nrows)).
+ * Most operations wrap the generic DenseMat routines and re-tag the result as CSMat.
+ */
+case class CSMat(override val nrows:Int, override val ncols:Int, override val data:Array[String]) extends DenseMat[String](nrows, ncols, data) {
+
+  /** Same value as length. */
+  def size() = length
+
+  override def t:CSMat = CSMat(gt(null))
+
+  override def mytype = "CSMat"
+
+  def horzcat(b: CSMat) = CSMat(ghorzcat(b))
+
+  def vertcat(b: CSMat) = CSMat(gvertcat(b))
+
+  /** Result triple of gfind3 with the first two parts converted to IMat and the third to CSMat. */
+  def find3:(IMat, IMat, CSMat) = {
+    val triple = gfind3
+    (IMat(triple._1), IMat(triple._2), CSMat(triple._3))
+  }
+
+  override def apply(a:IMat):CSMat = CSMat(gapply(a))
+
+  override def apply(a:IMat, b:IMat):CSMat = CSMat(gapply(a, b))
+
+  override def apply(a:Int, b:IMat):CSMat = CSMat(gapply(a, b))
+
+  override def apply(a:IMat, b:Int):CSMat = CSMat(gapply(a, b))
+
+  def ccMatOp(b: CSMat, f:(String, String) => String, old:CSMat) = CSMat(ggMatOp(b, f, old))
+
+  def ccMatOpScalar(b: String, f:(String, String) => String, old:CSMat) = CSMat(ggMatOpScalar(b, f, old))
+
+  def ccReduceOp(n:Int, f1:(String) => String, f2:(String, String) => String, old:CSMat) = CSMat(ggReduceOp(n, f1, f2, old))
+
+  /** Text of element i, or "NULL" when the slot holds null. */
+  override def printOne(i:Int):String = {
+    val elem = data(i)
+    if (elem == null) "NULL" else elem.toString()
+  }
+
+  /*
+   * Trait to implement binary operations on dense matrices.
+   * op2 combines one element of this matrix with one element of the argument; op1 is
+   * applied to the first element of each reduction.
+   */
+  trait DCSMatOp {
+    @inline def op1(x:String):String = x;
+    def op2(x:String, y:String):String;
+
+    /** Element-wise op2 with row/column broadcasting; other shapes go to dCSMatOpStrict. */
+    def dCSMatOp(a:CSMat):CSMat =
+      if (nrows==a.nrows && ncols==1) {
+        // this is a column: pair it with every column of a
+        val result = CSMat(nrows, a.ncols)
+        for (col <- 0 until a.ncols; row <- 0 until nrows) {
+          result.data(row+col*nrows) = op2(data(row), a.data(row+col*a.nrows))
+        }
+        result
+      } else if (ncols==a.ncols && nrows==1) {
+        // this is a row: entry col is paired with column col of a
+        val result = CSMat(a.nrows, ncols)
+        for (col <- 0 until ncols; row <- 0 until a.nrows) {
+          result.data(row+col*a.nrows) = op2(data(col), a.data(row+col*a.nrows))
+        }
+        result
+      } else if (nrows==a.nrows && a.ncols==1) {
+        // a is a column: pair it with every column of this
+        val result = CSMat(nrows, ncols)
+        for (col <- 0 until ncols; row <- 0 until nrows) {
+          result.data(row+col*nrows) = op2(data(row+col*nrows), a.data(row))
+        }
+        result
+      } else if (ncols==a.ncols && a.nrows==1) {
+        // a is a row: entry col is paired with column col of this
+        val result = CSMat(nrows, ncols)
+        for (col <- 0 until ncols; row <- 0 until nrows) {
+          result.data(row+col*nrows) = op2(data(row+col*nrows), a.data(col))
+        }
+        result
+      } else dCSMatOpStrict(a)
+
+    /** Element-wise op2 requiring identical dimensions or one 1 x 1 operand. */
+    def dCSMatOpStrict(a:CSMat):CSMat =
+      if (nrows==a.nrows && ncols==a.ncols) {
+        val result = CSMat(nrows, ncols)
+        for (k <- 0 until a.length) {
+          result.data(k) = op2(data(k), a.data(k))
+        }
+        result
+      } else if (a.nrows == 1 && a.ncols == 1) {
+        val result = CSMat(nrows, ncols)
+        val scalar = a.data(0)
+        var k = 0
+        while (k < length) {
+          result.data(k) = op2(data(k), scalar)
+          k += 1
+        }
+        result
+      } else if (nrows == 1 && ncols == 1) {
+        val result = CSMat(a.nrows, a.ncols)
+        val scalar = data(0)
+        var k = 0
+        while (k < a.length) {
+          result.data(k) = op2(scalar, a.data(k))
+          k += 1
+        }
+        result
+      } else throw new RuntimeException("dims incompatible")
+
+    /** Fold each column (dim 1) or each row (dim 2) with op2, starting from op1 of its first element. */
+    def dCSMatReduceOp(dim:Int):CSMat =
+      if (dim == 1) {
+        val result = CSMat(1, ncols)
+        var col = 0
+        while (col < ncols) {
+          val base = col*nrows
+          var acc = op1(data(base))
+          var row = 1
+          while (row < nrows) {
+            acc = op2(acc, data(row+base))
+            row += 1
+          }
+          result.data(col) = acc
+          col += 1
+        }
+        result
+      } else if (dim == 2) {
+        val result = CSMat(nrows, 1)
+        for (row <- 0 until nrows) {
+          result.data(row) = op1(data(row))
+        }
+        for (col <- 1 until ncols; row <- 0 until nrows) {
+          result.data(row) = op2(result.data(row), data(row+col*nrows))
+        }
+        result
+      } else
+        throw new RuntimeException("index must 1 or 2")
+  }
+
+  /** Element-wise string concatenation. */
+  def + (b : CSMat) = ccMatOp(b, (x:String, y:String) => x + y, null)
+
+  def \ (b: CSMat) = horzcat(b)
+  def \ (b: String) = horzcat(CSMat.cselem(b))
+  def on (b: CSMat) = vertcat(b)
+  def on (b: String) = vertcat(CSMat.cselem(b))
+}
+
+/** Factory methods for CSMat. */
+object CSMat {
+
+  /** New nr x nc string matrix backed by a fresh array of nr*nc slots. */
+  def apply(nr:Int, nc:Int):CSMat = new CSMat(nr, nc, new Array[String](nr*nc))
+
+  /** Re-tag a generic dense string matrix as a CSMat over the same data array. */
+  def apply(a:DenseMat[String]):CSMat = new CSMat(a.nrows, a.ncols, a.data)
+
+  /** 1 x 1 matrix holding x. */
+  def cselem(x:String) = {
+    val single = CSMat(1,1)
+    single.data(0) = x
+    single
+  }
+
+}
+
+
+
+
+
+
diff --git a/src/main/scala/BIDMat/Copyright.txt b/src/main/scala/BIDMat/Copyright.txt
new file mode 100755
index 00000000..21326596
--- /dev/null
+++ b/src/main/scala/BIDMat/Copyright.txt
@@ -0,0 +1,25 @@
+Copyright (c) 2012, Regents of the University of California
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/src/main/scala/BIDMat/DMat.scala b/src/main/scala/BIDMat/DMat.scala
new file mode 100755
index 00000000..00928b4d
--- /dev/null
+++ b/src/main/scala/BIDMat/DMat.scala
@@ -0,0 +1,668 @@
+package BIDMat
+
+import edu.berkeley.bid.CBLAS._
+import edu.berkeley.bid.LAPACK._
+import edu.berkeley.bid.SPBLAS._
+import java.util.Arrays
+
+case class DMat(nr:Int, nc:Int, data0:Array[Double]) extends DenseMat[Double](nr, nc, data0) {
+
+  def size() = length;
+
+  def getdata() = data
+  
+  override def set(v:Float):DMat = {
+    Arrays.fill(data,0,length,v)
+    this
+  }
+ 
+  override def t:DMat = if (Mat.noMKL) { 
+    DMat(gt(null))
+  } else { 
+    val out = DMat(ncols, nrows)
+    domatcopy("C", "T", nrows, ncols, 1.0, data, nrows, out.data, ncols)
+    out
+  }
+  
+  override def dv:Double =
+    if (nrows > 1 || ncols > 1) {
+      throw new RuntimeException("Matrix should be 1x1 to extract value")
+    } else {
+      data(0)
+    }
+
+  override def mytype = "DMat"
+      
+  def horzcat(b: DMat) = DMat(ghorzcat(b))
+
+  def vertcat(b: DMat) = DMat(gvertcat(b))
+
+  def find3:(IMat, IMat, DMat) = { val (ii, jj, vv) = gfind3 ; (ii, jj, DMat(vv)) }
+
+  override def apply(a:IMat):DMat = DMat(gapply(a))
+
+  override def apply(a:IMat, b:IMat):DMat = DMat(gapply(a, b))	
+
+  override def apply(a:IMat, b:Int):DMat = DMat(gapply(a, b))	
+
+  override def apply(a:Int, b:IMat):DMat = DMat(gapply(a, b))
+     
+  def update(iv:IMat, jv:IMat, b:DMat):DMat = DMat(_update(iv, jv, b))
+
+  def update(iv:IMat, j:Int, b:DMat):DMat = DMat(_update(iv, IMat.ielem(j), b))
+
+  def update(i:Int, jv:IMat, b:DMat):DMat = DMat(_update(IMat.ielem(i), jv, b))
+  
+  def ddMatOp(b: Mat, f:(Double, Double) => Double, out:Mat) = 
+    b match {
+      case bb:DMat => DMat(ggMatOp(bb, f, out))
+      case _ => throw new RuntimeException("unsupported operation "+f+" on "+this+" and "+b)	
+    }
+
+  def ddMatOpv(b: Mat, f:(Array[Double],Int,Int,Array[Double],Int,Int,Array[Double],Int,Int,Int) => Double, out:Mat) = 
+    b match {
+      case bb:DMat => DMat(ggMatOpv(bb, f, out))
+      case _ => throw new RuntimeException("unsupported operation "+f+" on "+this+" and "+b)	
+    }
+
+  def ddMatOpScalar(b: Double, f:(Double, Double) => Double, out:Mat) = DMat(ggMatOpScalar(b, f, out))
+
+  def ddMatOpScalarv(b: Double, f:(Array[Double],Int,Int,Array[Double],Int,Int,Array[Double],Int,Int,Int) => Double, out:Mat) = 
+  	DMat(ggMatOpScalarv(b, f, out))
+
+  def ddReduceOp(n:Int, f1:(Double) => Double, f2:(Double, Double) => Double, out:Mat) = DMat(ggReduceOp(n, f1, f2, out))
+
+  def ddReduceOpv(n:Int, f:(Array[Double],Int,Int,Array[Double],Int,Int,Array[Double],Int,Int,Int) => Double, out:Mat) = 
+  	DMat(ggReduceOpv(n, f, out))
+  	
+  def ddReduceAll(n:Int, f1:(Double) => Double, f2:(Double, Double) => Double, out:Mat) = 
+  	DMat(ggReduceAll(n, f1, f2, out))  
+
+  def ddReduceAllv(n:Int, f:(Array[Double],Int,Int,Array[Double],Int,Int,Array[Double],Int,Int,Int) => Double, out:Mat) = 
+  	DMat(ggReduceAllv(n, f, out))
+
+  /** Renders element i (a 0-based offset into data) for display: whole values with
+   *  magnitude below 1e10 print as integers, everything else with five significant digits. */
+  override def printOne(i:Int):String = {
+    val v = data(i)
+    // Go through Long: whole values between 2^31 and 1e10 pass the test below but do not
+    // fit in an Int, so intValue printed them clamped to Int.MaxValue / Int.MinValue.
+    if (v % 1 == 0 && math.abs(v) < 1e10) "%d".format(v.toLong) else "%.5g".format(v)
+  }
+  
+  /** Copies this matrix's `length` elements into `a` and returns `a`; only DMat targets are supported. */
+  override def copyTo(a:Mat) = a match {
+    case out:DMat => System.arraycopy(data, 0, out.data, 0, length); a
+    // A non-DMat target used to fall out of the match as an uninformative scala.MatchError.
+    case _ => throw new RuntimeException("copyTo: unsupported target type "+a.mytype)
+  }
+  
+  override def copy = {
+  	val out = DMat(nrows, ncols)
+  	System.arraycopy(data, 0, out.data, 0, length)
+  	out
+  }
+  
+  override def zeros(nr:Int, nc:Int) = {
+  	DMat(nr, nc)
+  }
+  
+  /** Returns a new nr x nc matrix with every element set to 1.
+   *  The backing array is filled directly: the previous element-wise `out(i) = 1` went
+   *  through the base-aware `update`, which subtracts Mat.oneBased from the index and
+   *  therefore rejected index 0 whenever 1-based indexing was enabled. */
+  override def ones(nr:Int, nc:Int) = {
+    val out = DMat(nr, nc)
+    Arrays.fill(out.data, 0, out.length, 1.0)
+    out
+  }
+  
+  override def clearUpper(off:Int) = setUpper(0, off)
+  override def clearUpper = setUpper(0, 0)
+  
+  override def clearLower(off:Int) = setLower(0, off)
+  override def clearLower = setLower(0, 0)
+
+
+  def fDMult(aa:DMat, outmat:Mat):DMat = {  // this * aa; a 1x1 operand on either side scales the other. Throws RuntimeException on a shape mismatch.
+	if (ncols == aa.nrows) {  // regular matrix product
+	  val out = DMat.newOrCheckDMat(nrows, aa.ncols, outmat)  // result buffer: outmat when usable, else newly allocated
+	  Mat.nflops += 2 * length.toLong * aa.ncols.toLong  // one multiply and one add per (element of this, column of aa) pair
+	  if (Mat.noMKL) {  // pure-JVM path
+	  	out.clear
+	  	var i = 0
+	  	while (i < aa.ncols) {
+	  		var j = 0
+	  		while (j < aa.nrows) {
+	  			var k = 0
+	  			val dval = aa.data(j + i*ncols)  // aa(j,i); ncols equals aa.nrows in this branch
+	  			while (k < nrows) {
+	  				out.data(k+i*nrows) += data(k+j*nrows)*dval  // out(:,i) += this(:,j) * aa(j,i)
+	  				k += 1
+	  			}
+	  			j += 1
+	  		}
+	  		i += 1									
+	  	}
+	  } else {
+	    if (nrows == 1) {  // row vector times matrix, computed as a transposed matrix-vector product on aa
+	      dgemv(ORDER.ColMajor, TRANSPOSE.Trans, aa.nrows, aa.ncols, 1.0, aa.data, aa.nrows, data, 1, 0, out.data, 1)
+	    } else if (aa.ncols == 1) {  // matrix times column vector
+	      dgemv(ORDER.ColMajor, TRANSPOSE.NoTrans, nrows, ncols, 1.0, data, nrows, aa.data, 1, 0, out.data, 1)
+	    } else {  // general matrix-matrix product
+	      dgemm(ORDER.ColMajor, TRANSPOSE.NoTrans, TRANSPOSE.NoTrans,
+		    nrows, aa.ncols, ncols, 1.0, data, nrows, aa.data, aa.nrows, 0, out.data, nrows)
+	    }
+	  }
+	  out
+	} else if (ncols == 1 && nrows == 1) {  // this is a scalar: the result is aa scaled by it
+	  val out = DMat.newOrCheckDMat(aa.nrows, aa.ncols, outmat)
+	  Mat.nflops += aa.length
+	  var i = 0
+	  val dvar = data(0)
+	  while (i < aa.length) {
+	    out.data(i) = dvar * aa.data(i)
+	    i += 1						
+	  }			    
+	  out			  
+	} else if (aa.ncols == 1 && aa.nrows == 1) {  // aa is a scalar: the result is this scaled by it
+	  val out = DMat.newOrCheckDMat(nrows, ncols, outmat)
+	  Mat.nflops += length
+	  var i = 0
+	  val dvar = aa.data(0)
+	  while (i < length) {
+	    out.data(i) = dvar * data(i)
+	    i += 1
+	  }			    
+	  out			  
+	} else throw new RuntimeException("dimensions mismatch")
+  }
+  
+  /** Dense-times-sparse product this * ss for a column-compressed ss (data / ir / jc arrays).
+   *
+   *  @param ss     right operand; ss.nrows must equal this.ncols
+   *  @param outmat optional result buffer, resolved through DMat.newOrCheckDMat (null allocates)
+   *  @return the nrows x ss.ncols product
+   *  @throws RuntimeException if ncols != ss.nrows
+   */
+  def fSMult(ss:SDMat, outmat:Mat):DMat = {
+    if (ncols != ss.nrows) {
+      throw new RuntimeException("dimensions mismatch")
+    } else {
+      val out = DMat.newOrCheckDMat(nrows, ss.ncols, outmat)
+      Mat.nflops += 2 * nrows.toLong * ss.nnz
+      val ioff = Mat.ioneBased
+      if (nrows == 1 && !Mat.noMKL) {
+        // Row vector times sparse matrix: a single native sparse matrix-vector call.
+        // Only this path passes index arrays through SparseMat.incInds, so they are built
+        // here; previously they were created on every call and discarded by the other paths.
+        val jc0 = if (ioff == 0) SparseMat.incInds(ss.jc) else ss.jc
+        val ir0 = if (ioff == 0) SparseMat.incInds(ss.ir) else ss.ir
+        dcscmv("T", ss.nrows, ss.ncols, 1.0, "GLNF", ss.data, ir0, jc0, data, 0.0, out.data)
+      } else {
+        out.clear
+        if (nrows < 20 || Mat.noMKL) {
+          // JVM path: for every stored ss(row, col) = dval, out(:,col) += this(:,row) * dval.
+          var col = 0
+          while (col < ss.ncols) {
+            var p = ss.jc(col) - ioff
+            val pend = ss.jc(col+1) - ioff
+            while (p < pend) {
+              val dval = ss.data(p)
+              val row = ss.ir(p) - ioff
+              var k = 0
+              while (k < nrows) {
+                out.data(k+col*nrows) += data(k+row*nrows)*dval
+                k += 1
+              }
+              p += 1
+            }
+            col += 1
+          }
+        } else {
+          dmcscm(nrows, ss.ncols, data, nrows, ss.data, ss.ir, ss.jc, out.data, nrows)
+        }
+      }
+      out
+    }
+  }
+  
+  def multT(a:SDMat, outmat:Mat):DMat = {  // implements the xT operator on a sparse right operand, via the native dmcsrm kernel
+    import edu.berkeley.bid.CBLAS._
+    if (ncols == a.nrows) {  // NOTE(review): for a product with a's transpose one would expect ncols == a.ncols and an nrows x a.nrows result -- confirm against dmcsrm
+    	val out = DMat.newOrCheckDMat(nrows, a.ncols, outmat)
+    	if (outmat.asInstanceOf[AnyRef] != null) out.clear  // a reused buffer may hold old values (presumably dmcsrm accumulates -- confirm); a newly allocated one is already zero
+    	dmcsrm(nrows, a.ncols, data, nrows, a.data, a.ir, a.jc, out.data, nrows)
+    	Mat.nflops += 2L * a.nnz * nrows  // one multiply and one add per stored entry of a, per row of this
+    	out
+    } else {
+      throw new RuntimeException("xT dimensions mismatch")
+    }
+  }
+  
+  /*
+   * Very slow, row-and-column multiply
+   */
+  def sDMult(a:Mat):DMat = a match {
+    // Only dense operands are handled; the type is checked before the shapes.
+    case rhs:DMat =>
+      if (ncols != rhs.nrows) throw new RuntimeException("dimensions mismatch")
+      val result = DMat(nrows, rhs.ncols)
+      var c = 0
+      while (c < rhs.ncols) {
+        val rhsCol = c * rhs.nrows      // start of column c in rhs
+        val outCol = c * result.nrows   // start of column c in the result
+        var r = 0
+        while (r < nrows) {
+          var acc = 0.0
+          var k = 0
+          while (k < ncols) {
+            acc += data(r + k*nrows) * rhs.data(k + rhsCol)
+            k += 1
+          }
+          result.data(r + outCol) = acc
+          r += 1
+        }
+        c += 1
+      }
+      result
+    case _ => throw new RuntimeException("argument must be dense")
+  }
+  
+  /*
+  * Weka multiply
+  */
+
+  /** Row-at-a-time dense product this * a; the result buffer comes from DMat.newOrCheckDMat.
+   *  Fixed: a's column and the output cell were both indexed by the row counter i. */
+  def wDMult(a:Mat, omat:Mat):DMat = a match {
+    case aa:DMat =>
+      if (ncols != aa.nrows) throw new RuntimeException("dimensions mismatch")
+      val out = DMat.newOrCheckDMat(nrows, aa.ncols, omat)
+      val tmp = new Array[Double](ncols)
+      var i = 0
+      while (i < nrows) {
+        // Gather row i of this into tmp so the dot products below read a contiguous buffer.
+        var j = 0
+        while (j < ncols) {
+          tmp(j) = data(i+j*nrows)
+          j += 1
+        }
+        j = 0
+        while (j < aa.ncols) {
+          var k = 0
+          var sum = 0.0
+          while (k < ncols) {
+            sum += tmp(k) * aa.data(k+j*aa.nrows)   // a(k,j); the old code read a(k,i)
+            k += 1
+          }
+          out.data(i + j*out.nrows) = sum           // out(i,j); the old code wrote slot j + i*nrows
+          j += 1
+        }
+        i += 1
+      }
+      out
+    case _ => throw new RuntimeException("argument must be dense")
+  }
+  
+  def dot(a:DMat):Double = super.dot(a)
+  
+  override def dot(a:Mat):Double = super.dot(a.asInstanceOf[DMat])
+ 
+  def solvel(a0:Mat):DMat =   // implements `this / a` for a DMat a, via an LU factorisation of a
+    a0 match {
+      case a:DMat => { 
+        Mat.nflops += 2L*a.nrows*a.nrows*a.nrows/3 + 2L*nrows*a.nrows*a.nrows  // added before the shape check, so also on calls that then throw
+        if (a.nrows != a.ncols || ncols != a.nrows) {  // a must be square and as wide as this
+          throw new RuntimeException("solve needs a square matrix")
+        } else {
+          val out = DMat(nrows, ncols)
+          val tmp = new Array[Double](ncols*ncols)
+          System.arraycopy(a.data, 0, tmp, 0, a.length)  // dgetrf receives this copy, not a.data
+          System.arraycopy(data, 0, out.data, 0, length)  // out starts as a copy of this and is handed to dgetrs as the right-hand side
+          val ipiv = new Array[Int](ncols)
+          dgetrf(ORDER.RowMajor, ncols, ncols, tmp, ncols, ipiv)  // NOTE(review): the buffers are filled column by column but declared RowMajor, presumably to operate on the transposes -- confirm
+          dgetrs(ORDER.RowMajor, "N", ncols, nrows, tmp, ncols, ipiv, out.data, nrows)
+          out
+        }
+      }
+      case _ => throw new RuntimeException("unsupported arg to / "+a0)
+    }
+  
+  def solver(a0:Mat):DMat =   // implements `this \\ a` for a DMat a, via an LU factorisation of this
+    a0 match {
+      case a:DMat => { 
+        Mat.nflops += 2L*nrows*nrows*nrows/3 + 2L*nrows*nrows*a.ncols  // added before the shape check, so also on calls that then throw
+        if (nrows != ncols || ncols != a.nrows) {  // this must be square and a must have matching rows
+          throw new RuntimeException("solve needs a square matrix")
+        } else {
+          val out = DMat(a.nrows, a.ncols)
+          val tmp = new Array[Double](ncols*ncols)
+          System.arraycopy(data, 0, tmp, 0, length)  // dgetrf receives this copy, not this.data
+          System.arraycopy(a.data, 0, out.data, 0, a.length)  // out starts as a copy of a and is handed to dgetrs as the right-hand side
+          val ipiv = new Array[Int](ncols)
+          dgetrf(ORDER.ColMajor, ncols, ncols, tmp, ncols, ipiv)
+          dgetrs(ORDER.ColMajor, "N", ncols, a.ncols, tmp, nrows, ipiv, out.data, nrows)
+          out
+        }
+      }
+      case _ => throw new RuntimeException("unsupported arg to / "+a0)  // NOTE(review): the message names "/" although this method backs the \\ operator
+    }
+  
+  override def clear = {
+    Arrays.fill(this.data,0,length,0)
+    this
+  }
+  
+  /** Returns a DMat of shape nr x nc with as little allocation as possible: this matrix when
+   *  the shape already matches, a new header over the same data array when that array is
+   *  large enough, otherwise a newly allocated matrix. The nnz argument is not consulted. */
+  override def recycle(nr:Int, nc:Int, nnz:Int):DMat = {
+    val needed = nr*nc
+    if (nr == nrows && nc == ncols) this
+    else if (needed <= data.size) new DMat(nr, nc, data)
+    else DMat(nr, nc)
+  }
+  /*
+   * Routines to operate on two DMats. These are the compute routines.
+   */
+  def *  (b : DMat) = fDMult(b, null)
+  def *  (b : SDMat) = fSMult(b, null)
+  def xT (b : SDMat) = multT(b, null)
+  def /  (b : DMat) = solvel(b)
+  def \\ (b : DMat) = solver(b)
+  def ^  (b : DMat) = ddMatOp(b, (x:Double, y:Double) => math.pow(x,y), null)
+
+  def +  (b : DMat) = ddMatOpv(b, DMat.vecAdd _, null)
+  def -  (b : DMat) = ddMatOpv(b, DMat.vecSub _, null)
+  def *@ (b : DMat) = ddMatOpv(b, DMat.vecMul _, null)
+  def /@ (b : DMat) = ddMatOpv(b, DMat.dVecDiv _, null)
+
+  def >   (b : DMat) = ddMatOp(b, (x:Double, y:Double) => if (x > y) 1.0 else 0.0, null)
+  def <   (b : DMat) = ddMatOp(b, (x:Double, y:Double) => if (x < y) 1.0 else 0.0, null)
+  def ==  (b : DMat) = ddMatOp(b, (x:Double, y:Double) => if (x == y) 1.0 else 0.0, null)
+  def === (b : DMat) = ddMatOp(b, (x:Double, y:Double) => if (x == y) 1.0 else 0.0, null)
+  def >=  (b : DMat) = ddMatOp(b, (x:Double, y:Double) => if (x >= y) 1.0 else 0.0, null)
+  def <=  (b : DMat) = ddMatOp(b, (x:Double, y:Double) => if (x <= y) 1.0 else 0.0, null)
+  def !=  (b : DMat) = ddMatOp(b, (x:Double, y:Double) => if (x != y) 1.0 else 0.0, null)
+
+  override def *  (b : Double) = fDMult(DMat.elem(b), null)
+  override def +  (b : Double) = ddMatOpScalarv(b, DMat.vecAdd _, null)
+  override def -  (b : Double) = ddMatOpScalarv(b, DMat.vecSub _, null)
+  override def *@ (b : Double) = ddMatOpScalarv(b, DMat.vecMul _, null)
+  override def /@ (b : Double) = ddMatOpScalarv(b, DMat.dVecDiv _, null)
+  override def ^  (b : Double) = ddMatOpScalar(b, (x:Double, y:Double) => math.pow(x,y), null)
+
+  override def >   (b : Double) = ddMatOpScalar(b, (x:Double, y:Double) => if (x > y) 1.0 else 0.0, null)
+  override def <   (b : Double) = ddMatOpScalar(b, (x:Double, y:Double) => if (x < y) 1.0 else 0.0, null)
+  override def ==  (b : Double) = ddMatOpScalar(b, (x:Double, y:Double) => if (x == y) 1.0 else 0.0, null)
+  override def >=  (b : Double) = ddMatOpScalar(b, (x:Double, y:Double) => if (x >= y) 1.0 else 0.0, null)
+  override def <=  (b : Double) = ddMatOpScalar(b, (x:Double, y:Double) => if (x <= y) 1.0 else 0.0, null)
+  override def !=  (b : Double) = ddMatOpScalar(b, (x:Double, y:Double) => if (x != y) 1.0 else 0.0, null) 
+  
+  override def *  (b : Float) = fDMult(DMat.elem(b), null)
+  override def +  (b : Float) = ddMatOpScalarv(b, DMat.vecAdd _, null)
+  override def -  (b : Float) = ddMatOpScalarv(b, DMat.vecSub _, null)
+  override def *@ (b : Float) = ddMatOpScalarv(b, DMat.vecMul _, null)
+  override def /@ (b : Float) = ddMatOpScalarv(b, DMat.dVecDiv _, null)
+  override def ^  (b : Float) = ddMatOpScalar(b, (x:Double, y:Double) => math.pow(x,y), null)
+
+  override def >   (b : Float) = ddMatOpScalar(b, (x:Double, y:Double) => if (x > y) 1.0 else 0.0, null)
+  override def <   (b : Float) = ddMatOpScalar(b, (x:Double, y:Double) => if (x < y) 1.0 else 0.0, null)
+  override def ==  (b : Float) = ddMatOpScalar(b, (x:Double, y:Double) => if (x == y) 1.0 else 0.0, null)
+  override def >=  (b : Float) = ddMatOpScalar(b, (x:Double, y:Double) => if (x >= y) 1.0 else 0.0, null)
+  override def <=  (b : Float) = ddMatOpScalar(b, (x:Double, y:Double) => if (x <= y) 1.0 else 0.0, null)
+  override def !=  (b : Float) = ddMatOpScalar(b, (x:Double, y:Double) => if (x != y) 1.0 else 0.0, null)
+
+  def \ (b: DMat) = DMat(ghorzcat(b))
+  def \ (b:Double) = DMat(ghorzcat(DMat.elem(b)))
+
+  def on (b: DMat) = DMat(gvertcat(b))
+  def on (b: Double) = vertcat(DMat.elem(b))
+  
+  def ~ (b : DMat):DPair = new DPair(this, b)
+  def ~ (b : SDMat):SDPair = new SDPair(this, b)
+
+  override def ~ (b: Mat):Pair = b match {
+    case db:DMat => new DPair(this, db)
+    case sb:SDMat => new SDPair(this, sb)
+    case _ => throw new RuntimeException("wrong types for operator ~ ")
+  } 
+ /*
+  * Specialize to IMats to help the type system. 
+  */ 
+  def +  (b : IMat):DMat = this + DMat(b)
+  def -  (b : IMat):DMat = this - DMat(b)
+  def *  (b : IMat):DMat = this * DMat(b)
+  def /  (b : IMat):DMat = this / DMat(b)
+  def \\ (b : IMat):DMat = this \\ DMat(b)
+  def *@ (b : IMat):DMat = this *@ DMat(b)
+  def /@ (b : IMat):DMat = this /@ DMat(b)
+  def \  (b : IMat):DMat = this \ DMat(b)
+  def on (b : IMat):DMat = this on DMat(b) 
+  
+  def >   (b : IMat):DMat = this > DMat(b)
+  def <   (b : IMat):DMat = this < DMat(b)
+  def >=  (b : IMat):DMat = this >= DMat(b)
+  def <=  (b : IMat):DMat = this <= DMat(b)
+  def ==  (b : IMat):DMat = this == DMat(b)
+  def === (b : IMat):DMat = this === DMat(b) 
+  def !=  (b : IMat):DMat = this != DMat(b)
+  
+ /*
+  * Specialize to FMats to help the type system. 
+  */ 
+  def +  (b : FMat):DMat = this + DMat(b)
+  def -  (b : FMat):DMat = this - DMat(b)
+  def *  (b : FMat):DMat = this * DMat(b)
+  def /  (b : FMat):DMat = this / DMat(b)
+  def \\ (b : FMat):DMat = this \\ DMat(b)
+  def *@ (b : FMat):DMat = this *@ DMat(b)
+  def /@ (b : FMat):DMat = this /@ DMat(b)
+  def \  (b : FMat):DMat = this \ DMat(b)
+  def on (b : FMat):DMat = this on DMat(b) 
+  
+  def >   (b : FMat):DMat = this > DMat(b)
+  def <   (b : FMat):DMat = this < DMat(b)
+  def >=  (b : FMat):DMat = this >= DMat(b)
+  def <=  (b : FMat):DMat = this <= DMat(b)
+  def ==  (b : FMat):DMat = this == DMat(b)
+  def === (b : FMat):DMat = this === DMat(b) 
+  def !=  (b : FMat):DMat = this != DMat(b)
+  
+ /*
+  * Specialize to CMats to help the type system. 
+  */ 
+  def +  (b : CMat):CMat = CMat(this) + b
+  def -  (b : CMat):CMat = CMat(this) - b
+  def *  (b : CMat):CMat = CMat(this) * b
+  def /  (b : CMat):CMat = CMat(this) / b
+  def \\ (b : CMat):CMat = CMat(this) \\ b
+  def *@ (b : CMat):CMat = CMat(this) *@ b
+  def /@ (b : CMat):CMat = CMat(this) /@ b
+  def \  (b : CMat):CMat = CMat(this) \ b
+  def on (b : CMat):CMat = CMat(this) on b 
+  
+ /*
+  * Operators whose second arg is generic. 
+  */ 
+  import Operator._
+  override def +  (b : Mat):Mat = applyMat(this, b, null, Mop_Plus)
+  override def -  (b : Mat):Mat = applyMat(this, b, null, Mop_Minus)
+  override def *  (b : Mat):Mat = applyMat(this, b, null, Mop_Times)
+  override def /  (b : Mat):Mat = applyMat(this, b, null, Mop_Div)
+  override def \\ (b : Mat):Mat = applyMat(this, b, null, Mop_RSolve)
+  override def *@ (b : Mat):Mat = applyMat(this, b, null, Mop_ETimes)
+  override def /@ (b : Mat):Mat = applyMat(this, b, null, Mop_EDiv)
+  override def \  (b : Mat):Mat = applyMat(this, b, null, Mop_HCat)
+  override def on (b : Mat):Mat = applyMat(this, b, null, Mop_VCat)
+  
+  override def >   (b : Mat):Mat = applyMat(this, b, null, Mop_GT)
+  override def <   (b : Mat):Mat = applyMat(this, b, null, Mop_LT)
+  override def >=  (b : Mat):Mat = applyMat(this, b, null, Mop_GE)
+  override def <=  (b : Mat):Mat = applyMat(this, b, null, Mop_LE)
+  override def ==  (b : Mat):Mat = applyMat(this, b, null, Mop_EQ)
+  override def === (b : Mat):Mat = applyMat(this, b, null, Mop_EQ) 
+  override def !=  (b : Mat):Mat = applyMat(this, b, null, Mop_NE)
+  
+}
+
+class DPair (val omat:Mat, val mat:DMat) extends Pair{
+  override def t:DMat = if (Mat.noMKL) {
+  	DMat(mat.gt(omat))
+  } else { 
+    val out = DMat.newOrCheckDMat(mat.ncols, mat.nrows, omat)
+    domatcopy("C", "T", mat.nrows, mat.ncols, 1.0, mat.data, mat.nrows, out.data, mat.ncols)
+    out
+  }
+
+  def * (b : DMat) = mat.fDMult(b, omat) 
+  def * (b : SDMat) = mat.fSMult(b, omat)
+  def xT (b : SDMat) = mat.multT(b, omat)
+  def + (b : DMat) = mat.ddMatOpv(b, DMat.vecAdd _, omat)
+  def - (b : DMat) = mat.ddMatOpv(b, DMat.vecSub _, omat)
+  def *@ (b : DMat) = mat.ddMatOpv(b, DMat.vecMul _, omat)
+  def /@ (b : DMat) = mat.ddMatOpv(b, DMat.dVecDiv _, omat)
+  def ^ (b : DMat) = mat.ddMatOp(b, (x:Double, y:Double) => math.pow(x,y), omat)  // element-wise power; writes into omat (null was passed before, so the pair's output matrix was ignored)
+
+  def > (b : DMat) = mat.ddMatOp(b, (x:Double, y:Double) => if (x > y) 1.0 else 0.0, omat)
+  def < (b : DMat) = mat.ddMatOp(b, (x:Double, y:Double) => if (x < y) 1.0 else 0.0, omat)
+  def == (b : DMat) = mat.ddMatOp(b, (x:Double, y:Double) => if (x == y) 1.0 else 0.0, omat)
+  def === (b : DMat) = mat.ddMatOp(b, (x:Double, y:Double) => if (x == y) 1.0 else 0.0, omat)
+  def >= (b : DMat) = mat.ddMatOp(b, (x:Double, y:Double) => if (x >= y) 1.0 else 0.0, omat)
+  def <= (b : DMat) = mat.ddMatOp(b, (x:Double, y:Double) => if (x <= y) 1.0 else 0.0, omat)
+  def != (b : DMat) = mat.ddMatOp(b, (x:Double, y:Double) => if (x != y) 1.0 else 0.0, omat) 
+
+  override def * (b : Double) = mat.fDMult(DMat.elem(b), omat) 
+  override def * (b : Float) = mat.fDMult(DMat.elem(b), omat)
+  override def + (b : Double) = mat.ddMatOpScalarv(b, DMat.vecAdd _, omat)
+  override def - (b : Double) = mat.ddMatOpScalarv(b, DMat.vecSub _, omat)
+  override def *@ (b : Double) = mat.ddMatOpScalarv(b, DMat.vecMul _, omat)
+  override def /@ (b : Double) = mat.ddMatOpScalarv(b, DMat.dVecDiv _, omat)  
+  override def ^ (b : Double) = mat.ddMatOpScalar(b, (x:Double, y:Double) => math.pow(x,y), omat)
+
+  override def > (b : Double) = mat.ddMatOpScalar(b, (x:Double, y:Double) => if (x > y) 1.0 else 0.0, omat)
+  override def < (b : Double) = mat.ddMatOpScalar(b, (x:Double, y:Double) => if (x < y) 1.0 else 0.0, omat)
+  override def == (b : Double) = mat.ddMatOpScalar(b, (x:Double, y:Double) => if (x == y) 1.0 else 0.0, omat)
+  override def === (b : Double) = mat.ddMatOpScalar(b, (x:Double, y:Double) => if (x == y) 1.0 else 0.0, omat)
+  override def >= (b : Double) = mat.ddMatOpScalar(b, (x:Double, y:Double) => if (x >= y) 1.0 else 0.0, omat)
+  override def <= (b : Double) = mat.ddMatOpScalar(b, (x:Double, y:Double) => if (x <= y) 1.0 else 0.0, omat)
+  override def != (b : Double) = mat.ddMatOpScalar(b, (x:Double, y:Double) => if (x != y) 1.0 else 0.0, omat) 
+  
+  import Operator._
+  override def +  (b : Mat):Mat = applyMat(mat, b, omat, Mop_Plus)
+  override def -  (b : Mat):Mat = applyMat(mat, b, omat, Mop_Minus)
+  override def *  (b : Mat):Mat = applyMat(mat, b, omat, Mop_Times)
+  override def /  (b : Mat):Mat = applyMat(mat, b, omat, Mop_Div)
+  override def \\ (b : Mat):Mat = applyMat(mat, b, omat, Mop_RSolve)
+  override def *@ (b : Mat):Mat = applyMat(mat, b, omat, Mop_ETimes)
+  override def /@ (b : Mat):Mat = applyMat(mat, b, omat, Mop_EDiv)
+  override def \  (b : Mat):Mat = applyMat(mat, b, omat, Mop_HCat)
+  override def on (b : Mat):Mat = applyMat(mat, b, omat, Mop_VCat)
+  
+  override def >   (b : Mat):Mat = applyMat(mat, b, omat, Mop_GT)
+  override def <   (b : Mat):Mat = applyMat(mat, b, omat, Mop_LT)
+  override def >=  (b : Mat):Mat = applyMat(mat, b, omat, Mop_GE)
+  override def <=  (b : Mat):Mat = applyMat(mat, b, omat, Mop_LE)
+  override def ==  (b : Mat):Mat = applyMat(mat, b, omat, Mop_EQ)
+  override def === (b : Mat):Mat = applyMat(mat, b, omat, Mop_EQ) 
+  override def !=  (b : Mat):Mat = applyMat(mat, b, omat, Mop_NE)
+}
+
+object DMat {
+
+  /** Strided element-wise quotient: c(ic) = a(ia) / b(ib), stepping by cinc/ainc/binc while ic < c0 + n. Returns 0. */
+  def dVecDiv(a:Array[Double], a0:Int, ainc:Int, b:Array[Double], b0:Int, binc:Int, c:Array[Double], c0:Int, cinc:Int, n:Int):Double = {
+    val stop = c0 + n
+    var ia = a0; var ib = b0; var ic = c0
+    while (ic < stop) { c(ic) = a(ia) / b(ib); ia += ainc; ib += binc; ic += cinc }
+    0
+  }
+  
+  /** Picks the matrix an nr x nc result should be written into.
+   *  A null or 0x0 omat produces a newly allocated DMat; a DMat of the requested shape is
+   *  returned unchanged; a DMat of any other shape is passed through recycle(nr, nc, 0);
+   *  every other matrix type raises a RuntimeException. */
+  def newOrCheckDMat(nr:Int, nc:Int, omat:Mat):DMat = {
+    // The AnyRef cast forces a plain reference comparison with null.
+    val absent = omat.asInstanceOf[AnyRef] == null || (omat.nrows == 0 && omat.ncols == 0)
+    if (absent) {
+      DMat(nr, nc)
+    } else omat match {
+      case reuse:DMat if reuse.nrows == nr && reuse.ncols == nc => reuse
+      case reuse:DMat => reuse.recycle(nr, nc, 0)
+      case _ => throw new RuntimeException("wrong type for out matrix "+omat)
+    }
+  }
+    
+  def apply(nr:Int, nc:Int) = new DMat(nr, nc, new Array[Double](nr*nc))
+
+  def apply(a:DenseMat[Double]):DMat = new DMat(a.nrows, a.ncols, a.data) 
+
+  /** Builds a DMat from another matrix.
+   *  DMat, FMat and IMat sources are copied element by element into newly allocated
+   *  storage of the same shape; an SDMat is converted through its `full` method.
+   *  Any other matrix type raises a RuntimeException. */
+  def apply(x:Mat):DMat = x match {
+    case dd:DMat => { val dst = DMat(x.nrows, x.ncols); System.arraycopy(dd.data, 0, dst.data, 0, dd.length); dst }
+    case ff:FMat => { val dst = DMat(x.nrows, x.ncols); Mat.copyToDoubleArray(ff.data, 0, dst.data, 0, ff.length); dst }
+    case ii:IMat => { val dst = DMat(x.nrows, x.ncols); Mat.copyToDoubleArray(ii.data, 0, dst.data, 0, ii.length); dst }
+    case ss:SDMat => DMat(ss.full)
+    case _ => throw new RuntimeException("Unsupported source type")
+  }
+  
+    
+  /** Strided element-wise sum: c(ic) = a(ia) + b(ib), stepping by cinc/ainc/binc while ic < c0 + n. Returns 0. */
+  def vecAdd(a:Array[Double], a0:Int, ainc:Int, b:Array[Double], b0:Int, binc:Int, c:Array[Double], c0:Int, cinc:Int, n:Int):Double = {
+    val stop = c0 + n
+    var ia = a0; var ib = b0; var ic = c0
+    while (ic < stop) { c(ic) = a(ia) + b(ib); ia += ainc; ib += binc; ic += cinc }
+    0
+  }
+  
+  /** Strided element-wise difference: c(ic) = a(ia) - b(ib), stepping by cinc/ainc/binc while ic < c0 + n. Returns 0. */
+  def vecSub(a:Array[Double], a0:Int, ainc:Int, b:Array[Double], b0:Int, binc:Int, c:Array[Double], c0:Int, cinc:Int, n:Int):Double = {
+    val stop = c0 + n
+    var ia = a0; var ib = b0; var ic = c0
+    while (ic < stop) { c(ic) = a(ia) - b(ib); ia += ainc; ib += binc; ic += cinc }
+    0
+  }
+  
+  /** Strided element-wise product: c(ic) = a(ia) * b(ib), stepping by cinc/ainc/binc while ic < c0 + n. Returns 0. */
+  def vecMul(a:Array[Double], a0:Int, ainc:Int, b:Array[Double], b0:Int, binc:Int, c:Array[Double], c0:Int, cinc:Int, n:Int):Double = {
+    val stop = c0 + n
+    var ia = a0; var ib = b0; var ic = c0
+    while (ic < stop) { c(ic) = a(ia) * b(ib); ia += ainc; ib += binc; ic += cinc }
+    0
+  }
+  
+  /** Strided element-wise maximum: c(ic) = math.max(a(ia), b(ib)), stepping by cinc/ainc/binc while ic < c0 + n. Returns 0. */
+  def vecMax(a:Array[Double], a0:Int, ainc:Int, b:Array[Double], b0:Int, binc:Int, c:Array[Double], c0:Int, cinc:Int, n:Int):Double = {
+    val stop = c0 + n
+    var ia = a0; var ib = b0; var ic = c0
+    while (ic < stop) { c(ic) = math.max(a(ia), b(ib)); ia += ainc; ib += binc; ic += cinc }
+    0
+  }
+  
+ /** Strided element-wise minimum: c(ic) = math.min(a(ia), b(ib)), stepping by cinc/ainc/binc while ic < c0 + n. Returns 0. */
+ def vecMin(a:Array[Double], a0:Int, ainc:Int, b:Array[Double], b0:Int, binc:Int, c:Array[Double], c0:Int, cinc:Int, n:Int):Double = {
+    val stop = c0 + n
+    var ia = a0; var ib = b0; var ic = c0
+    while (ic < stop) { c(ic) = math.min(a(ia), b(ib)); ia += ainc; ib += binc; ic += cinc }
+    0
+  }
+
+
+  def elem(x:Double) = {
+    val out = DMat(1,1)
+    out.data(0) = x
+    out
+  }
+  
+}
+
+
+
+
+
+
diff --git a/src/main/scala/BIDMat/DenseMat.scala b/src/main/scala/BIDMat/DenseMat.scala
new file mode 100755
index 00000000..54f1077a
--- /dev/null
+++ b/src/main/scala/BIDMat/DenseMat.scala
@@ -0,0 +1,1313 @@
+package BIDMat
+import scala.math.Numeric._
+import java.util.Arrays
+import java.util.Comparator
+import scala.actors._
+import scala.actors.Actor._
+
+class DenseMat[@specialized(Double,Float,Int,Byte) T]
+(nr: Int, nc: Int, val data:Array[T])(implicit manifest:ClassManifest[T]) extends Mat(nr, nc) {
+  
+  def this(nr:Int, nc:Int)(implicit manifest:ClassManifest[T]) = this(nr, nc, new Array[T](nr*nc))
+
+  /*
+   * Return the (0,0) value as a scalar
+   */
+  def v:T = {
+    // Guard first: a scalar can only be read from a matrix with at most one row and one column.
+    val scalarShaped = nrows <= 1 && ncols <= 1
+    if (!scalarShaped) throw new RuntimeException("Matrix should be 1x1 to extract value")
+    data(0)
+  }
+  
+  override def mytype = "DenseMat"
+  /*
+   * Test if this matrix is a row or column vector
+   */
+  def isvector(): Boolean = {
+    // A single row or a single column qualifies; a 1x1 matrix therefore counts as well.
+    val singleRow = nrows == 1
+    val singleCol = ncols == 1
+    val vectorShaped = singleRow || singleCol
+    vectorShaped
+  }
+  /*
+   * Bounds-checked matrix access, 0- or 1-based 
+   */ 
+  def apply(r0:Int, c0:Int):T = {
+    // Translate the caller's coordinates (0- or 1-based, per Mat.oneBased) to 0-based.
+    val base = Mat.oneBased
+    val row = r0 - base
+    val col = c0 - base
+    val inside = row >= 0 && row < nrows && col >= 0 && col < ncols
+    // The message reports the coordinates in the caller's index base.
+    if (!inside) throw new IndexOutOfBoundsException("("+(row+base)+","+(col+base)+") vs ("+nrows+","+ncols+")")
+    data(row + col*nrows)
+  }
+  /*
+   * Bounds-checked linear access, 0- or 1-based 
+   */ 
+  def apply(i0:Int):T = {
+    // Linear access into the backing array; the index base follows Mat.oneBased.
+    val base = Mat.oneBased
+    val pos = i0 - base
+    val inside = pos >= 0 && pos < length
+    // The message reports the index in the caller's index base.
+    if (!inside) throw new IndexOutOfBoundsException(""+(pos+base)+" >= ("+length+")")
+    data(pos)
+  }
+  /*
+   * Unchecked 0-based matrix access
+   */ 
+  def get_(r:Int, c:Int):T = {
+    data(r+c*nrows)
+  }
+  
+  /*
+   * Update a matrix value, m(r,c) = v, 0- or 1-based 
+   */
+  def update(r0:Int, c0:Int, v:T):T = {
+    val base = Mat.oneBased
+    val row = r0 - base
+    val col = c0 - base
+    // Reject coordinates outside the matrix before touching the storage.
+    val inside = row >= 0 && row < nrows && col >= 0 && col < ncols
+    if (!inside) throw new IndexOutOfBoundsException("("+(row+base)+","+(col+base)+") vs ("+nrows+","+ncols+")")
+    data(row + col*nrows) = v
+    // The stored value is handed back to the caller.
+    v
+  }
+  /*
+   * Update a matrix value with linear access, m(i) = v
+   */
+  def update(i0:Int, v:T):T = {
+    // The index base follows Mat.oneBased; pos is the 0-based slot in data.
+    val base = Mat.oneBased
+    val pos = i0 - base
+    val inside = pos >= 0 && pos < length
+    if (!inside) throw new IndexOutOfBoundsException(""+(pos+base)+" vs ("+length+")")
+    data(pos) = v
+    // The stored value is also the result.
+    v
+  }
+  /*
+  * Unchecked 0-based set
+  */ 
+  def set_(r:Int, c:Int, v:T):T = {
+    data(r+c*nrows) = v
+    v
+  } 
+  /*
+  * Transpose
+  */
+  def gt(oldmat:Mat):DenseMat[T]  = {
+    // The destination is ncols x nrows and is obtained from DenseMat.newOrCheck, which is given oldmat.
+    val flipped:DenseMat[T] = DenseMat.newOrCheck(ncols, nrows, oldmat)
+    var r = 0
+    while (r < nrows) {
+      // Row r of this becomes column r of the result, which starts at offset r*ncols.
+      val dstBase = r*ncols; var c = 0
+      while (c < ncols) { flipped.data(c + dstBase) = data(r + c*nrows); c += 1 }
+      r += 1
+    }
+    // Rows are visited outermost, columns innermost.
+    flipped
+  }
+  /*
+  * Stack matrices vertically
+  */
+  def gvertcat(a:DenseMat[T]):DenseMat[T] = {
+    if (ncols != a.ncols) throw new RuntimeException("ncols must match")
+    val tall = nrows + a.nrows              // number of rows in the stacked result
+    val stacked = new DenseMat[T](tall, ncols)
+    var col = 0
+    while (col < ncols) {                   // result column = this's column followed by a's column
+      val dst = col*tall
+      System.arraycopy(data, col*nrows, stacked.data, dst, nrows)
+      System.arraycopy(a.data, col*a.nrows, stacked.data, nrows + dst, a.nrows)
+      col += 1
+    }
+    stacked
+  }
+  /*
+  * Stack matrices horizontally
+  */ 
+  def ghorzcat(a:DenseMat[T]):DenseMat[T]= {
+    if (nrows != a.nrows) throw new RuntimeException("nrows must match")
+    // Columns are stored back to back, so the result is this's elements followed by a's.
+    val mine = nrows*ncols
+    val joined = new DenseMat[T](nrows, ncols+a.ncols)
+    System.arraycopy(data, 0, joined.data, 0, mine)
+    System.arraycopy(a.data, 0, joined.data, mine, nrows*a.ncols)
+    joined
+  }
+  /*
+  * Count number of non-zero entries
+  */
+  override def nnz:Int = {
+    // Linear scan over the first `length` slots of the backing array;
+    // a slot counts when it compares unequal to 0.
+    var nonzero = 0
+    var pos = 0
+    while (pos < length) {
+      if (data(pos) != 0) nonzero += 1
+      pos += 1
+    }
+    nonzero
+  }
+  /*
+  * Helper function for find functions
+  */ 
+  /** Stores the position of every non-zero element in out.data, in increasing order and with
+   *  `off` added to each (0 gives 0-based, 1 gives 1-based positions); returns out.
+   *  Fixed: `off` also shifted the subscript into data, so with off = 1 element 0 was never
+   *  examined and the slot after the last element was read. */
+  def findInds(out:IMat, off:Int):IMat = {
+    var count = 0; var i = 0
+    while (i < length) {
+      if (data(i) != 0) { out.data(count) = i + off; count += 1 }
+      i += 1
+    }
+    out
+  }
+  /*
+  * Find indices (linear) for all non-zero elements. The result is an nnz x 1 IMat
+  * filled by findInds using Mat.oneBased as the index offset.
+  */
+  def find:IMat = findInds(IMat(nnz, 1), Mat.oneBased)
+  /*
+  * Find indices (i,j) for non-zero elements. Returns two nnz x 1 IMats holding the row
+  * and column index of each non-zero, both shifted by Mat.oneBased.
+  */ 
+  def find2:(IMat, IMat) = {
+    // nnz scans the whole matrix, so evaluate it once and reuse the count.
+    val count = nnz
+    val iout = IMat(count, 1)
+    val jout = IMat(count, 1)
+    findInds(iout, 0)
+    val off = Mat.oneBased
+    var i = 0
+    while (i < iout.length) {
+      // Split the 0-based linear index into column (quotient) and row (remainder).
+      val ival:Int = iout.data(i)
+      jout.data(i) = (ival / nrows) + off
+      iout.data(i) = (ival % nrows) + off
+      i += 1
+    }
+    (iout, jout)
+  } 
+  /*
+  * Find tuples (i,j,v) for non-zero elements. Returns nnz x 1 matrices with the row
+  * index, column index (both shifted by Mat.oneBased) and value of each non-zero.
+  */ 
+  def gfind3:(IMat, IMat, DenseMat[T]) = {
+    // nnz scans the whole matrix, so evaluate it once and reuse the count.
+    val count = nnz
+    val iout = IMat(count, 1)
+    val jout = IMat(count, 1)
+    val vout = new DenseMat[T](count, 1)
+    findInds(iout, 0)
+    val off = Mat.oneBased
+    var i = 0
+    while (i < iout.length) {
+      // ival is the 0-based linear position; read the value before overwriting iout.
+      val ival:Int = iout.data(i)
+      vout.data(i) = data(ival)
+      jout.data(i) = (ival / nrows) + off
+      iout.data(i) = (ival % nrows) + off
+      i += 1
+    }
+    (iout, jout, vout)
+  }  
+  /*
+  * Return a(im) where im is a matrix of linear indices. A MatrixWildcard yields all
+  * elements as a length x 1 column; otherwise the result has the shape of im.
+  * Throws a RuntimeException for an index outside the matrix.
+  */
+  def gapply(im:IMat):DenseMat[T] = 
+    im match {
+      case _:MatrixWildcard => {
+        val flat = new DenseMat[T](length, 1)
+        System.arraycopy(data, 0, flat.data, 0, flat.length)
+        flat
+      }
+      case _ => {
+        val picked = new DenseMat[T](im.nrows, im.ncols)
+        val base = Mat.oneBased
+        var k = 0
+        while (k < picked.length) {
+          val pos = im.data(k) - base
+          if (pos >= 0 && pos < length) {
+            picked.data(k) = data(pos)
+          } else {
+            throw new RuntimeException("bad linear index "+(pos+base)+" vs "+length)
+          }
+          k += 1
+        }
+        picked
+      }
+    } 
+  
+  /*
+  * Implement a(im) = b where im is a matrix of linear indices into a, and im and b have
+  * the same shape. With a MatrixWildcard, b must be a column with `length` elements and
+  * replaces all elements. Returns b. Throws a RuntimeException on a shape mismatch or
+  * an index outside the matrix.
+  */
+  def update(im:IMat, b:DenseMat[T]):DenseMat[T] = {
+    im match {
+      case _:MatrixWildcard => {
+        if (length != b.length || b.ncols != 1) {
+          throw new RuntimeException("dims mismatch")
+        }
+        System.arraycopy(b.data, 0, data, 0, length)
+      }
+      case _ => {
+        if (im.nrows != b.nrows || im.ncols != b.ncols) {
+          throw new RuntimeException("dims mismatch")
+        }
+        val base = Mat.oneBased
+        var k = 0
+        while (k < im.length) {
+          val pos = im.data(k) - base
+          if (pos < 0 || pos >= length) {
+            throw new RuntimeException("bad linear index "+(pos+base)+" vs "+length)
+          }
+          data(pos) = b.data(k)
+          k += 1
+        }
+      }
+    }
+    b
+  }
+  
+ /*
+  * Implement a(im) = b where im is a matrix of linear indices into a, and b is a constant.
+  * A MatrixWildcard assigns b to every element. Returns b.
+  */
+  def update(a:IMat, b:T):T = {
+    a match {
+      case _:MatrixWildcard => {
+        var k = 0
+        while (k < length) {
+          data(k) = b
+          k += 1
+        }
+      }
+      case _ => {
+        val base = Mat.oneBased
+        var k = 0
+        while (k < a.length) {
+          val pos = a.data(k) - base
+          if (pos >= 0 && pos < length) {
+            data(pos) = b
+          } else {
+            throw new RuntimeException("bad linear index "+(pos+base)+" vs "+length)
+          }
+          k += 1
+        }
+      }
+    }
+    b
+  }
+  /*
+  * Implement slicing, a(iv,jv) where iv and jv are vectors, using ? as wildcard.
+  * Row and column index lists are resolved (and range-checked) by DenseMat.getInds.
+  */
+  def gapply(iv:IMat, jv:IMat):DenseMat[T] = {
+    val rowinds = DenseMat.getInds(iv, nrows)
+    val colinds = DenseMat.getInds(jv, ncols)
+    val slice = new DenseMat[T](rowinds.length, colinds.length)
+    val base = Mat.oneBased
+    var cj = 0
+    while (cj < slice.ncols) {
+      // Offsets of the selected source column and of the destination column.
+      val srcCol = nrows*(colinds(cj) - base)
+      val dstCol = cj*slice.nrows
+      var ri = 0
+      while (ri < slice.nrows) {
+        slice.data(dstCol+ri) = data(rowinds(ri)-base+srcCol)
+        ri += 1
+      }
+      cj += 1
+    }
+    slice
+  }
+  /*
+  * Implement slicing, a(iv,j) where iv is a vector and j an integer, using ? as wildcard.
+  * The column index is wrapped with IMat.ielem and passed to the vector/vector form.
+  */
+  def gapply(iv:IMat, jv:Int):DenseMat[T] =
+    gapply(iv, IMat.ielem(jv))
+  /*
+  * Implement slicing, a(i,jv) where i is an integer and jv a vector, using ? as wildcard.
+  * The row index is wrapped with IMat.ielem and passed to the vector/vector form.
+  */
+  def gapply(i:Int, jv:IMat):DenseMat[T] =
+    gapply(IMat.ielem(i), jv)
+
+  /*
+  * Implement sliced assignment, a(iv,jv) = b where iv and jv are vectors, using ? as wildcard.
+  * b must have one row per selected row and one column per selected column. Returns b.
+  */ 
+  def _update(iv:IMat, jv:IMat, b:DenseMat[T]):DenseMat[T] = {
+    val rowinds = DenseMat.getInds(iv, nrows)
+    val colinds = DenseMat.getInds(jv, ncols) 
+    if (rowinds.length != b.nrows || colinds.length != b.ncols) {
+      throw new RuntimeException("dims mismatch in assignment")
+    }
+    val base = Mat.oneBased
+    var cj = 0
+    while (cj < b.ncols) {
+      // Offsets of the destination column in this matrix and of the source column in b.
+      val dstCol = nrows*(colinds(cj) - base)
+      val srcCol = cj*b.nrows
+      var ri = 0
+      while (ri < b.nrows) {
+        data(rowinds(ri)-base+dstCol) = b.data(srcCol+ri)
+        ri += 1
+      }
+      cj += 1
+    }
+    b
+  }
+  
+  /*
+  * Sliced assignment a(iv,jv) = b for a generic Mat argument. Dispatches on the runtime
+  * types of this matrix and b to the typed update of FMat, DMat, IMat or CMat.
+  * Throws a RuntimeException when the pair is not one of these matching combinations.
+  */
+  override def update(iv:IMat, jv:IMat, b:Mat):Mat = {
+    (this, b) match {
+      case (me:FMat, bb:FMat) => me.update(iv, jv, bb):FMat
+      case (me:DMat, bb:DMat) => me.update(iv, jv, bb):DMat
+      case (me:IMat, bb:IMat) => me.update(iv, jv, bb):IMat
+      case (me:CMat, bb:CMat) => me.update(iv, jv, bb):CMat
+      // Explicit failure instead of an opaque scala.MatchError for mixed or unknown types.
+      case _ => throw new RuntimeException("unsupported matrix types in sliced assignment")
+    }
+  }
+  
+ /*
+  * Implement sliced assignment, a(iv,jv) = b:T where iv and jv are vectors, using ? as wildcard.
+  * Every selected (row, column) cell is set to b. Returns b.
+  */ 
+  def update(iv:IMat, jv:IMat, b:T):T = {
+    val rowinds = DenseMat.getInds(iv, nrows)
+    val colinds = DenseMat.getInds(jv, ncols) 
+    val base = Mat.oneBased
+    var cj = 0
+    while (cj < colinds.length) {
+      val colStart = nrows*(colinds(cj) - base)
+      var ri = 0
+      while (ri < rowinds.length) {
+        data(rowinds(ri) - base + colStart) = b
+        ri += 1
+      }
+      cj += 1
+    }
+    b
+  }
+  /*
+  * Implement sliced assignment, a(iv,j) = b where iv is a vector and j an integer,
+  * using ? as wildcard. The column index is wrapped with IMat.ielem.
+  */ 
+  def update(iv:IMat, j:Int, b:T):T =
+    update(iv, IMat.ielem(j), b)
+  /*
+  * Implement sliced assignment, a(i,jv) = b where i is an integer and jv a vector,
+  * using ? as wildcard. The row index is wrapped with IMat.ielem.
+  */ 
+  def update(i:Int, jv:IMat, b:T):T =
+    update(IMat.ielem(i), jv, b)
+  
+  // Text for the element at linear index i, as used by toString. This version always returns one space.
+  def printOne(i:Int):String = {
+    " "
+  }
+  
+  /*
+  * Render the matrix as text, one line per row, using printOne for each element.
+  * At most maxRows rows and as many leading columns as fit in Mat.terminalWidth-4
+  * characters are shown; "..." marks omitted columns and a row of ".." marks omitted rows.
+  */
+  override def toString:String = {
+    val nChars = Mat.terminalWidth-4
+    val maxRows = 640/nChars
+    var maxCols = nChars
+    var fieldWidth = 4
+    var icols = 0
+    // Grow the set of displayed columns one at a time. fieldWidth is the common width of
+    // all displayed columns: the widest printed element seen so far plus 2, at least 4.
+    while (icols < math.min(ncols, maxCols)) {
+    	var newWidth = fieldWidth
+    	for (j <- 0 until math.min(nrows,maxRows)) newWidth = math.max(newWidth, 2+(printOne(j+nrows*icols).length))
+    	if ((icols+1)*newWidth < nChars) {
+    		fieldWidth = newWidth
+    		icols += 1
+    	} else {
+    		// This column does not fit: stop here and keep the previous fieldWidth.
+    		maxCols = icols
+    	}
+    }
+    val sb:StringBuilder = new StringBuilder
+    // Padding source; NOTE(review): substring below assumes the needed padding never exceeds this string's length.
+    val somespaces = "                                             "
+    for (i <- 0 until math.min(nrows, maxRows)) {
+      for (j <- 0 until math.min(ncols, icols)) {
+      	val str = printOne(i+j*nrows)
+      	sb.append(somespaces.substring(0,fieldWidth-str.length)+str)
+      }
+      if (ncols > icols) {
+      	sb.append("...")
+      }
+      sb.append("\n")
+    }
+    // Trailing marker line when some rows were left out.
+    if (nrows > maxRows) {
+    	for (j <- 0 until math.min(ncols, maxCols)) {
+    		sb.append(somespaces.substring(0, fieldWidth-2)+"..")
+    	}
+    	sb.append("\n")
+    }
+    sb.toString()
+  }
+  
+  /*
+  * Reset the first `length` elements to zero (null for reference element types) and
+  * return this matrix. The element type is detected from the runtime type of data(0),
+  * so an empty matrix is returned untouched.
+  */
+  override def clear:DenseMat[T] ={
+    if (length != 0) {
+      data(0) match {
+        case _:Float => Arrays.fill(data.asInstanceOf[Array[Float]], 0, length, 0)
+        case _:Double => Arrays.fill(data.asInstanceOf[Array[Double]], 0, length, 0)
+        case _:Int => Arrays.fill(data.asInstanceOf[Array[Int]], 0, length, 0)
+        case _:AnyRef => Arrays.fill(data.asInstanceOf[Array[AnyRef]], 0, length, null)
+      }
+    }
+    this
+  }
+  
+  /*
+  * Set the elements in rows 0 until i+off of every column i to v, i.e. the part of the
+  * matrix above the diagonal shifted by off. Returns this matrix.
+  */
+  def setUpper(v:T, off:Int) = {
+    var i = 0
+    while (i < ncols) {
+      // Clamp to nrows: without it a wide matrix (or a large off) would write into the
+      // following columns and eventually past the end of data.
+      val jend = math.min(nrows, i+off)
+      val colStart = i*nrows
+      var j = 0
+      while (j < jend) {
+        data(j + colStart) = v
+        j += 1
+      }
+      i += 1
+    }
+    this
+  }
+  
+  /*
+  * Set the elements in rows max(0, i+1+off) until nrows of every column i to v.
+  * Returns this matrix.
+  */
+  def setLower(v:T, off:Int) = {
+    var col = 0
+    while (col < ncols) {
+      val colStart = col*nrows
+      var row = math.max(0, col+1+off)
+      while (row < nrows) {
+        data(row + colStart) = v
+        row += 1
+      }
+      col += 1
+    }
+    this
+  }
+
+  /*
+  * General operation between two matrices. Apply op2 to corresponding elements from the input matrices.
+  * When one operand is a single column (or single row) whose length matches the other
+  * operand's rows (or columns), that vector is reused for every column (or row) of the
+  * other operand. All remaining shapes are delegated to ggMatOpStrict.
+  * The result matrix is obtained from DenseMat.newOrCheck using oldmat, and Mat.nflops
+  * is increased by the number of result elements.
+  */
+  def ggMatOp(aa:DenseMat[T], op2:(T,T) => T, oldmat:Mat):DenseMat[T] = {
+        // Case 1: this is a column vector, combined with each column of aa.
+        if (nrows==aa.nrows && ncols==1) {
+          val out = DenseMat.newOrCheck(nrows, aa.ncols, oldmat)
+          Mat.nflops += aa.length
+          var i = 0
+          while (i < aa.ncols) {
+            var j = 0
+            while (j < nrows) {
+              out.data(j+i*nrows) = op2(data(j), aa.data(j+i*aa.nrows))
+              j += 1
+            }
+            i += 1
+          }
+          out
+        // Case 2: this is a row vector; element i is combined with every entry of column i of aa.
+        } else if (ncols==aa.ncols && nrows==1) {
+          val out = DenseMat.newOrCheck[T](aa.nrows, ncols, oldmat)
+          Mat.nflops += aa.length
+          var i = 0
+          while (i < ncols) {
+            var j = 0
+            while (j < aa.nrows) {
+              out.data(j+i*aa.nrows) = op2(data(i), aa.data(j+i*aa.nrows))
+              j += 1
+            }
+            i += 1
+          }
+          out
+        // Case 3: aa is a column vector, combined with each column of this matrix.
+        } else if (nrows==aa.nrows && aa.ncols==1) {
+          val out = DenseMat.newOrCheck[T](nrows, ncols, oldmat)
+          Mat.nflops += length
+          var i = 0
+          while (i < ncols) {
+            var j = 0
+            while (j < nrows) {
+              out.data(j+i*nrows) = op2(data(j+i*nrows), aa.data(j))
+              j += 1
+            }
+            i += 1
+          }
+          out
+        // Case 4: aa is a row vector; its element i is combined with every entry of column i.
+        } else if (ncols==aa.ncols && aa.nrows==1) {
+          val out = DenseMat.newOrCheck[T](nrows, ncols, oldmat)
+          Mat.nflops += length
+          var i = 0
+          while (i <  ncols) {
+            var j = 0
+            while (j < nrows) {
+              out.data(j+i*nrows) = op2(data(j+i*nrows), aa.data(i))
+              j += 1
+            }
+            i += 1   
+          }
+          out
+        } else ggMatOpStrict(aa, op2, oldmat)
+      }
+
+  /*
+   * Apply op2 element-wise with strict dimension checking: either both operands have
+   * the same shape, or one of them is 1x1 and is combined with every element of the
+   * other. Any other combination throws a RuntimeException.
+   */
+  def ggMatOpStrict(aa:DenseMat[T], op2:(T,T) => T, oldmat:Mat):DenseMat[T] = {
+    val sameDims = nrows==aa.nrows && ncols==aa.ncols
+    if (sameDims) {
+      val res = DenseMat.newOrCheck[T](nrows, ncols, oldmat)
+      Mat.nflops += length
+      var k = 0
+      while (k < aa.length) {
+        res.data(k) = op2(data(k), aa.data(k))
+        k += 1
+      }
+      res
+    } else if (aa.nrows == 1 && aa.ncols == 1) {
+      // aa is a scalar on the right-hand side.
+      val res = DenseMat.newOrCheck[T](nrows, ncols, oldmat)
+      Mat.nflops += length
+      val rhs = aa.data(0)
+      var k = 0
+      while (k < length) {
+        res.data(k) = op2(data(k), rhs)
+        k += 1
+      }
+      res
+    } else if (nrows == 1 && ncols == 1) {
+      // this is a scalar on the left-hand side.
+      val res = DenseMat.newOrCheck[T](aa.nrows, aa.ncols, oldmat)
+      Mat.nflops += aa.length
+      val lhs = data(0)
+      var k = 0
+      while (k < aa.length) {
+        res.data(k) = op2(lhs, aa.data(k))
+        k += 1
+      }
+      res
+    } else {
+      throw new RuntimeException("dims incompatible")
+    }
+  }
+
+  /*
+   * Apply the binary operation op2 to every element of the matrix (left operand) and
+   * the scalar a (right operand). The result comes from DenseMat.newOrCheck with oldmat.
+   */  
+  def ggMatOpScalar(a:T, op2:(T,T) => T, oldmat:Mat):DenseMat[T] = {
+    val res = DenseMat.newOrCheck[T](nrows, ncols, oldmat)
+    Mat.nflops += length
+    var k = 0
+    while (k < length) {
+      res.data(k) = op2(data(k), a)
+      k += 1
+    }
+    res
+  }
+  /*
+  * General operation between two matrices. Apply op2 to corresponding elements from the input matrices.
+  * Implemented with vector operation primitives.
+  * opv is called as opv(a, a0, ainc, b, b0, binc, c, c0, cinc, n): the first operand array with
+  * start offset and stride, the second operand likewise, then the output array with offset and
+  * stride, and an element count. A stride of 0 is passed to reuse one element for a whole call.
+  * The vector/matrix cases below mirror ggMatOp; other shapes go to ggMatOpStrictv.
+  */
+  def ggMatOpv(aa:DenseMat[T], opv:(Array[T],Int,Int,Array[T],Int,Int,Array[T],Int,Int,Int) => T, oldmat:Mat):DenseMat[T] = 
+        // this is a column vector: combine it with each column of aa.
+        if (nrows==aa.nrows && ncols==1) {
+          val out = DenseMat.newOrCheck[T](nrows, aa.ncols, oldmat)
+          Mat.nflops += aa.length
+          var i = 0          
+          while (i < aa.ncols) {
+            opv(data, 0, 1, aa.data, i*aa.nrows, 1, out.data, i*nrows, 1, nrows)
+            i += 1
+          }
+          out
+        // this is a row vector: element i (stride 0) is combined with column i of aa.
+        } else if (ncols==aa.ncols && nrows==1) {
+          val out = DenseMat.newOrCheck[T](aa.nrows, ncols, oldmat)
+          Mat.nflops += aa.length
+          var i = 0
+          while (i < ncols) {
+            opv(data, i, 0, aa.data, i*aa.nrows, 1, out.data, i*aa.nrows, 1, aa.nrows)
+            i += 1
+          }
+          out
+        // aa is a column vector: combine each column of this matrix with it.
+        } else if (nrows==aa.nrows && aa.ncols==1) {
+          val out = DenseMat.newOrCheck[T](nrows, ncols, oldmat)
+          Mat.nflops += length
+          var i = 0
+          while (i < ncols) {
+            opv(data, i*nrows, 1, aa.data, 0, 1, out.data, i*nrows, 1, nrows)
+            i += 1
+          }
+          out
+        // aa is a row vector: its element i (stride 0) is combined with column i.
+        } else if (ncols==aa.ncols && aa.nrows==1) {
+          val out = DenseMat.newOrCheck[T](nrows, ncols, oldmat)
+          Mat.nflops += length
+          var i = 0
+          while (i < ncols) {
+            opv(data, i*nrows, 1, aa.data, i, 0, out.data, i*nrows, 1, nrows)
+            i += 1   
+          }
+          out
+        } else ggMatOpStrictv(aa, opv, oldmat);    
+
+
+  /*
+   * Vector-primitive version of the strict element-wise operation: both operands must
+   * have the same shape, or one of them must be 1x1 (it is then passed to opv with
+   * stride 0). Any other combination throws a RuntimeException.
+   * opv is called as opv(a, a0, ainc, b, b0, binc, c, c0, cinc, n).
+   * When more than 100000 elements are processed and Mat.numThreads > 1, the range is
+   * split into Mat.numThreads contiguous pieces that are run in actors; this method
+   * waits until every piece has flagged completion.
+   */
+  def ggMatOpStrictv(aa:DenseMat[T], opv:(Array[T],Int,Int,Array[T],Int,Int,Array[T],Int,Int,Int) => T, oldmat:Mat):DenseMat[T] = {
+    val sameDims = nrows==aa.nrows && ncols==aa.ncols
+    val aaScalar = aa.nrows == 1 && aa.ncols == 1
+    val meScalar = nrows == 1 && ncols == 1
+    if (!sameDims && !aaScalar && !meScalar) throw new RuntimeException("dims incompatible")
+    // The result has this matrix's shape unless only this matrix is the scalar operand.
+    // (A shadowing local `val out` used to leave the result null in the latter case.)
+    val myShape = sameDims || aaScalar
+    val out:DenseMat[T] =
+      if (myShape) DenseMat.newOrCheck[T](nrows, ncols, oldmat)
+      else DenseMat.newOrCheck[T](aa.nrows, aa.ncols, oldmat)
+    val mylen = if (myShape) length else aa.length
+    // Run opv on the slice [start, start+len) of the result, with stride 0 on a scalar operand.
+    def runSlice(start:Int, len:Int):Unit = {
+      if (sameDims) {
+        opv(data, start, 1, aa.data, start, 1, out.data, start, 1, len)
+      } else if (aaScalar) {
+        opv(data, start, 1, aa.data, 0, 0, out.data, start, 1, len)
+      } else {
+        opv(data, 0, 0, aa.data, start, 1, out.data, start, 1, len)
+      }
+    }
+    if (mylen > 100000 && Mat.numThreads > 1) {
+      val done = IMat(1, Mat.numThreads)
+      for (ithread<- 0 until Mat.numThreads) {
+        // Long arithmetic: ithread*mylen can exceed Int range for very large matrices.
+        val istart = (ithread.toLong*mylen/Mat.numThreads).toInt
+        val len = ((ithread+1).toLong*mylen/Mat.numThreads).toInt - istart
+        actor {
+          runSlice(istart, len)
+          done(ithread) = 1
+        }
+      }
+      while (SciFunctions.sum(done).v < Mat.numThreads) {Thread.`yield`()}
+    } else {
+      runSlice(0, mylen)
+    }
+    Mat.nflops += mylen
+    out
+  }
+  
+  /*
+   * Apply the vector primitive opv to this matrix and the scalar a. The scalar is put
+   * in a one-element array and passed with stride 0, so it is reused for every element.
+   */
+  def ggMatOpScalarv(a:T, opv:(Array[T],Int,Int,Array[T],Int,Int,Array[T],Int,Int,Int) => T, oldmat:Mat):DenseMat[T] = {
+    val res = DenseMat.newOrCheck[T](nrows, ncols, oldmat)
+    Mat.nflops += length
+    val scalarBox = new Array[T](1)
+    scalarBox(0) = a
+    opv(data, 0, 1, scalarBox, 0, 0, res.data, 0, 1, length)    
+    res
+  }
+
+  /*
+   * Reduce along one dimension. op1 is applied to the first element of each column
+   * (dim 1) or row (dim 2); op2 then folds in the remaining elements. dim0 == 0 selects
+   * dim 2 for a single-row matrix and dim 1 otherwise. Dim 1 gives a 1 x ncols result,
+   * dim 2 an nrows x 1 result.
+   */
+  def ggReduceOp(dim0:Int, op1:(T) => T, op2:(T,T) => T, oldmat:Mat):DenseMat[T] = {
+    val dim = if (nrows == 1 && dim0 == 0) 2 else math.max(1, dim0)
+    dim match {
+      case 1 => {
+        val res = DenseMat.newOrCheck[T](1, ncols, oldmat)
+        Mat.nflops += length
+        var col = 0
+        while (col < ncols) {
+          val colStart = col*nrows
+          var acc = op1(data(colStart))
+          var row = 1
+          while (row < nrows) {
+            acc = op2(acc, data(row+colStart))
+            row += 1
+          }
+          res.data(col) = acc
+          col += 1
+        }
+        res
+      }
+      case 2 => {
+        val res = DenseMat.newOrCheck[T](nrows, 1, oldmat)
+        Mat.nflops += length
+        // Seed each row accumulator from the first column.
+        var row = 0
+        while (row < nrows) {
+          res.data(row) = op1(data(row))
+          row += 1
+        }
+        var col = 1
+        while (col < ncols) {
+          val colStart = col*nrows
+          row = 0
+          while (row < nrows) {
+            res.data(row) = op2(res.data(row), data(row+colStart))
+            row += 1
+          }
+          col += 1
+        }
+        res
+      }
+      case _ => throw new RuntimeException("index must 1 or 2")
+    }
+  }
+  
+  /*
+   * Select one element per column (dim 1) or per row (dim 2) together with its position.
+   * A candidate v replaces the current choice acc whenever op2(v, acc) is true; the first
+   * element is the initial choice. Returns the chosen values and their 0-based row
+   * (dim 1) or column (dim 2) positions. dim0 == 0 selects dim 2 for a single-row
+   * matrix and dim 1 otherwise.
+   * NOTE(review): positions are not shifted by Mat.oneBased here - confirm callers expect that.
+   */
+  def ggOpt2(dim0:Int, op2:(T,T) => Boolean):(DenseMat[T],IMat) = {
+    var dim = if (nrows == 1 && dim0 == 0) 2 else math.max(1, dim0)
+    if (dim == 1) {
+      val out = new DenseMat[T](1, ncols)
+      val iout = IMat(1, ncols)
+      Mat.nflops += length
+      var i = 0
+      while (i < ncols) { 
+        var j = 1
+        var acc = data(i*nrows)
+        var iacc = 0
+        while (j < nrows) { 
+          val v = data(j+i*nrows)
+          if (op2(v, acc)) {
+            acc = v
+            iacc = j            
+          }
+          j += 1
+        }
+        out.data(i) = acc
+        iout.data(i) = iacc
+        i += 1
+      }
+      (out, iout)
+    } else if (dim == 2) { 
+      val out = new DenseMat[T](nrows, 1)
+      val iout = IMat(nrows, 1)
+      Mat.nflops += length
+      // Start every row with its first-column element, position 0.
+      var j = 0
+      while (j < nrows) { 
+        out.data(j) = data(j)
+        iout.data(j) = 0
+        j += 1
+      }
+      var i = 1
+      while (i < ncols) { 
+        var j = 0
+        while (j < nrows) { 
+          val v = data(j+i*nrows)
+          if (op2(v, out.data(j))) {
+          	out.data(j) = v
+          	iout.data(j) = i
+          }
+          j += 1
+        }
+        i += 1
+      }
+      (out, iout)
+    } else
+      throw new RuntimeException("index must 1 or 2");
+  }
+  
+  /*
+   * Reduce along one dimension using a vector primitive, called as
+   * opv(a, a0, ainc, b, b0, binc, c, c0, cinc, n). dim0 == 0 selects dim 2 for a
+   * single-row matrix and dim 1 otherwise. Dim 1 gives a 1 x ncols result, dim 2 an
+   * nrows x 1 result.
+   */
+  def ggReduceOpv(dim0:Int, opv:(Array[T],Int,Int,Array[T],Int,Int,Array[T],Int,Int,Int) => T, oldmat:Mat):DenseMat[T] = {
+    var dim = if (nrows == 1 && dim0 == 0) 2 else math.max(1, dim0)
+    if (dim == 1) {
+      val out = DenseMat.newOrCheck[T](1, ncols, oldmat)
+      Mat.nflops += length
+      var i = 0
+      while (i < ncols) { 
+        // Seed with the first element of the column, then pass the remaining nrows-1
+        // elements to opv with out.data(i) as both second operand and output (stride 0).
+        // NOTE(review): presumably opv iterates n times even when the output stride is 0 - confirm.
+        out.data(i) = data(i*nrows)
+        opv(data, i*nrows+1, 1, out.data, i, 0, out.data, i, 0, nrows-1)
+        i += 1
+      }
+      out
+    } else if (dim == 2) { 
+      val out = DenseMat.newOrCheck[T](nrows, 1, oldmat)
+      Mat.nflops += length
+      // Copy the first column, then combine every later column into out in place.
+      var j = 0
+      while (j < nrows) { 
+        out.data(j) = data(j)
+        j += 1
+      }
+      var i = 1
+      while (i < ncols) { 
+        opv(data, i*nrows, 1, out.data, 0, 1, out.data, 0, 1, nrows)
+        i += 1
+      }
+      out
+    } else
+      throw new RuntimeException("index must 1 or 2");
+  }
+
+  /*
+   * Running (cumulative) reduction along one dimension; the result has the same shape
+   * as this matrix. The first element of each column (dim 1) or row (dim 2) is mapped
+   * through op1; every later element is op2(previous result, element). dim0 == 0
+   * selects dim 2 for a single-row matrix and dim 1 otherwise.
+   */
+  def ggReduceAll(dim0:Int, op1:(T) => T, op2:(T,T) => T, oldmat:Mat):DenseMat[T] = {
+    val dim = if (nrows == 1 && dim0 == 0) 2 else math.max(1, dim0)
+    dim match {
+      case 1 => {
+        val res = DenseMat.newOrCheck[T](nrows, ncols, oldmat)
+        Mat.nflops += length
+        var col = 0
+        while (col < ncols) {
+          val colStart = col*nrows
+          var acc = op1(data(colStart))
+          res.data(colStart) = acc
+          var row = 1
+          while (row < nrows) {
+            acc = op2(acc, data(row+colStart))
+            res.data(row+colStart) = acc
+            row += 1
+          }
+          col += 1
+        }
+        res
+      }
+      case 2 => {
+        val res = DenseMat.newOrCheck[T](nrows, ncols, oldmat)
+        Mat.nflops += length
+        var row = 0
+        while (row < nrows) {
+          res.data(row) = op1(data(row))
+          row += 1
+        }
+        var col = 1
+        while (col < ncols) {
+          val colStart = col*nrows
+          row = 0
+          while (row < nrows) {
+            // Previous result of the same row sits one column (nrows slots) earlier.
+            res.data(row+colStart) = op2(res.data(row+colStart-nrows), data(row+colStart))
+            row += 1
+          }
+          col += 1
+        }
+        res
+      }
+      case _ => throw new RuntimeException("index must 1 or 2")
+    }
+  }
+  
+  /*
+   * Running (cumulative) reduction along one dimension using a vector primitive, called
+   * as opv(a, a0, ainc, b, b0, binc, c, c0, cinc, n). The result has the same shape as
+   * this matrix. dim0 == 0 selects dim 2 for a single-row matrix and dim 1 otherwise.
+   */
+  def ggReduceAllv(dim0:Int, opv:(Array[T],Int,Int,Array[T],Int,Int,Array[T],Int,Int,Int) => T, oldmat:Mat):DenseMat[T] = {
+    var dim = if (nrows == 1 && dim0 == 0) 2 else math.max(1, dim0)
+    if (dim == 1) {
+      val out = DenseMat.newOrCheck[T](nrows, ncols, oldmat)
+      Mat.nflops += length
+      var i = 0
+      while (i < ncols) { 
+        val i0 = i*nrows
+        // Copy the first element of the column; the output range (from i0+1) overlaps
+        // the second operand range (from i0) shifted by one element.
+        // NOTE(review): a running result requires opv to process elements strictly in order - confirm.
+        out.data(i0) = data(i0)
+        opv(data, i0+1, 1, out.data, i0, 1, out.data, i0+1, 1, nrows-1)
+        i += 1
+      }
+      out
+    } else if (dim == 2) { 
+      val out = DenseMat.newOrCheck[T](nrows, ncols, oldmat)
+      Mat.nflops += length
+      // First column is copied unchanged.
+      var j = 0
+      while (j < nrows) { 
+        out.data(j) = data(j)
+        j += 1
+      }
+      var i = 1
+      while (i < ncols) { 
+        val i0 = i*nrows
+        // Combine column i of the input with the previous result column into result column i.
+        opv(data, i0, 1, out.data, i0-nrows, 1, out.data, i0, 1, nrows)
+        i += 1
+      }
+      out
+    } else
+      throw new RuntimeException("index must 1 or 2")  
+  }
+    
+  /*
+   * Sum of the element-wise products of this matrix and a, as a Double. Each product is
+   * formed in T via numeric.times and then converted with numeric.toDouble.
+   * Throws a RuntimeException when the shapes differ.
+   */
+  def dot (a : DenseMat[T])(implicit numeric:Numeric[T]):Double = {
+    if (nrows != a.nrows || ncols != a.ncols) {
+      throw new RuntimeException("dot dims not compatible")
+    }
+    Mat.nflops += 2 * length
+    var sum = 0.0
+    var k = 0
+    while (k < length) {
+      val prod = numeric.times(data(k), a.data(k))
+      sum += numeric.toDouble(prod)
+      k += 1
+    }
+    sum
+  }
+ 
+  /*
+   * Build an n x n matrix whose diagonal holds the n elements of this vector.
+   * Throws a RuntimeException when both dimensions are larger than 1.
+   */
+  def mkdiag = {
+    if (math.min(nrows, ncols) > 1) {
+      throw new RuntimeException("mkdiag needs a vector input")
+    }
+    val n = math.max(nrows, ncols)
+    val diag = new DenseMat[T](n,n)
+    // Consecutive diagonal entries are n+1 slots apart in column-major storage.
+    val step = n+1
+    var k = 0
+    while (k < n) {
+      diag.data(k*step) = data(k)
+      k += 1
+    }
+    diag
+  }
+  
+  /*
+   * Return the main diagonal as a column vector with min(nrows, ncols) elements.
+   */
+  def getdiag = {
+    val n = math.min(nrows, ncols)
+    val diag = new DenseMat[T](n,1)
+    // Consecutive diagonal entries are nrows+1 slots apart in column-major storage.
+    val step = nrows+1
+    var k = 0
+    while (k < n) {
+      diag.data(k) = data(k*step)
+      k += 1
+    }
+    diag
+  }
+ 
+}
+
+object DenseMat {
+  
+  /*
+   * Element-wise comparison primitive. For each step, numeric.compare(a(ai), b(bi)) + 1
+   * is used as an index into xmap and the looked-up value is stored in c(ci).
+   * The loop runs while the output index is below c0 + n, advancing the three indices
+   * by ainc, binc and cinc. Always returns numeric.zero.
+   * NOTE(review): assumes compare returns -1, 0 or 1 and that cinc is 1 - confirm.
+   */
+  def vecCmp[@specialized(Double, Float, Int, Byte) T](xmap:Array[T])(a:Array[T], a0:Int, ainc:Int, b:Array[T], b0:Int, binc:Int, c:Array[T], c0:Int, cinc:Int, n:Int)
+  (implicit numeric:Numeric[T]):T = {
+    val stop = c0 + n
+    var ia = a0
+    var ib = b0
+    var ic = c0
+    while (ic < stop) {
+      val slot = numeric.compare(a(ia), b(ib)) + 1
+      c(ic) = xmap(slot)
+      ia += ainc
+      ib += binc
+      ic += cinc
+    }
+    numeric.zero
+  }
+
+  
+  /*
+   * Return an nr x nc matrix, reusing oldmat where possible:
+   *  - oldmat null or 0 x 0: a new matrix is allocated;
+   *  - oldmat already nr x nc: oldmat itself is returned;
+   *  - otherwise, if oldmat's data array holds at least nr*nc elements, a new matrix
+   *    wrapping that same array is returned, else a new matrix is allocated.
+   * A non-null, non-empty oldmat is cast to DenseMat[T].
+   */
+  def newOrCheck[T](nr:Int, nc:Int, oldmat:Mat)
+  (implicit classManifest:ClassManifest[T]):DenseMat[T] = {
+    val noOld = oldmat.asInstanceOf[AnyRef] == null || (oldmat.nrows == 0 && oldmat.ncols == 0)
+    if (noOld) {
+      new DenseMat[T](nr, nc)
+    } else {
+      val omat = oldmat.asInstanceOf[DenseMat[T]]
+      if (oldmat.nrows == nr && oldmat.ncols == nc) {
+        omat
+      } else if (nr*nc <= omat.data.size) {
+        new DenseMat[T](nr, nc, omat.data)
+      } else {
+        new DenseMat[T](nr, nc)
+      }
+    }
+  }
+  
+  /*
+   * Resolve an index argument against a dimension of size n. A MatrixWildcard expands
+   * to all n indices (shifted by Mat.oneBased). Any other IMat is range-checked over
+   * its first `length` entries and its data array is returned as is (not copied).
+   * Throws a RuntimeException for an index outside the dimension.
+   */
+  def getInds(ii:IMat, n:Int):Array[Int] = {
+    val off = Mat.oneBased
+    ii match {
+      case _:MatrixWildcard => {
+        val all = new Array[Int](n)
+        var k = 0
+        while (k < n) {
+          all(k) = k + off
+          k += 1
+        }
+        all
+      }
+      case _ => {
+        var k = 0
+        while (k < ii.length) {
+          val pos = ii.data(k) - off
+          if (pos < 0 || pos >= n) {
+            throw new RuntimeException("index out of range "+(pos+off)+" vs "+n)
+          }
+          k += 1
+        }
+        ii.data
+      }
+    }
+  }
+  
+  /*
+   * Convert a sequence of indices into an array after checking each one against a
+   * dimension of size n. Indices are interpreted relative to Mat.oneBased and are
+   * returned unshifted, like the non-wildcard case of getInds.
+   * Throws a RuntimeException for an index outside the dimension.
+   */
+  def getSInds(in:Seq[Int], n:Int):Array[Int] = {
+    // Copy the values: the result used to be allocated but never filled (all zeros),
+    // and was truncated to n entries even though indices may legitimately repeat.
+    val inds:Array[Int] = in.toArray
+    val off = Mat.oneBased
+    var i = 0
+    while (i < inds.length) {
+      val ind = inds(i) - off
+      if (ind < 0 || ind >= n) {
+        throw new RuntimeException("index out of range "+(ind+off)+" vs "+n)
+      }
+      i += 1
+    }
+    inds
+  }
+   
+  /*
+   * Sort the range [from, to) of a in place with java.util.Arrays.sort, dispatching on
+   * the runtime array type. Only Double, Float, Int and Byte arrays are matched.
+   */
+  def genSort[@specialized(Double, Float, Int, Byte) T](a:Array[T],from:Int,to:Int):Unit = { 
+    a match { 
+      case doubles:Array[Double] => Arrays.sort(doubles, from, to)
+      case floats:Array[Float] => Arrays.sort(floats, from, to)
+      case ints:Array[Int] => Arrays.sort(ints, from, to)
+      case bytes:Array[Byte] => Arrays.sort(bytes, from, to)
+    }
+  }
+  
+  // Sort the whole array in place (range 0 until a.size).
+  def genSort[@specialized(Double, Float, Int, Byte) T](a:Array[T]):Unit =
+    genSort(a, 0, a.size)
+  
+  /*
+   * Reverse the range [from, to) of a in place by swapping elements from both ends
+   * towards the middle.
+   */
+  def reverse[@specialized(Double, Float, Int, Byte) T](a:Array[T],from:Int,to:Int) = {
+    var lo = from
+    var hi = to - 1
+    while (lo < hi) {
+      val held = a(lo)
+      a(lo) = a(hi)
+      a(hi) = held
+      lo += 1
+      hi -= 1
+    }
+  }
+  
+  // Reverse the whole array in place (range 0 until a.size).
+  def reverse[@specialized(Double, Float, Int, Byte) T](a:Array[T]):Unit =
+    reverse(a, 0, a.size)
+
+  /*
+   * Return a sorted copy of a; a itself is not modified.
+   *  - A vector (single row or single column) is sorted as a whole.
+   *  - Otherwise ik0 == 1 sorts every column and any other non-zero ik0 sorts every
+   *    row; ik0 == 0 picks 2 for a single-row matrix and 1 otherwise.
+   * asc selects ascending (true) or descending (false) order.
+   */
+  def sort[@specialized(Double, Float, Int, Byte) T](a:DenseMat[T], ik0:Int, asc:Boolean)
+  (implicit classManifest:ClassManifest[T], ordering:Ordering[T]):DenseMat[T] = {
+    import BIDMat.Sorting._
+    val out = new DenseMat[T](a.nrows, a.ncols)
+    val ik = if (ik0 != 0) ik0 else if (a.nrows == 1) 2 else 1
+    if (a.nrows == 1 || a.ncols == 1) {
+      System.arraycopy(a.data, 0, out.data, 0, a.length)
+      genSort(out.data)
+      if (!asc) {
+        // Reverse the sorted copy. Reversing a.data here mutated the caller's input
+        // and left the returned matrix in ascending order.
+        reverse(out.data)
+      }
+      out
+    } else if (ik == 1) {
+      // Sort each column through a scratch buffer.
+      val thiscol = new Array[T](a.nrows)
+      var i = 0
+      while (i < a.ncols) {
+        val colStart = i*a.nrows
+        var j = 0
+        while (j < a.nrows) {
+          thiscol(j) = a.data(j+colStart)
+          j += 1
+        }
+        genSort(thiscol)
+        j = 0
+        while (j < a.nrows) {
+          // Descending order reads the ascending scratch buffer back to front.
+          out.data(j+colStart) = if (asc) thiscol(j) else thiscol(a.nrows-j-1)
+          j += 1
+        }
+        i += 1
+      }
+      out
+    } else {
+      // Sort each row through a scratch buffer.
+      val thisrow = new Array[T](a.ncols)
+      var i = 0
+      while (i < a.nrows) {
+        var j = 0
+        while (j < a.ncols) {
+          thisrow(j) = a.data(i+j*a.nrows)
+          j += 1
+        }
+        genSort(thisrow)
+        j = 0
+        while (j < a.ncols) {
+          out.data(i+j*out.nrows) = if (asc) thisrow(j) else thisrow(a.ncols-j-1)
+          j += 1
+        }
+        i += 1
+      }
+      out
+    }
+  }
+  
+  /*
+   * Comparator over positions of the array a: two positions are ordered by the values
+   * stored there, and positions holding equal values are ordered by position.
+   */
+  class MyComparator[@specialized(Double, Float, Int, Byte) T](a:Array[T])
+  	(implicit ordering:Ordering[T]) extends java.util.Comparator[Int] {
+    def compare(ii:Int, jj:Int):Int = {
+      val byValue = ordering.compare(a(ii), a(jj))
+      if (byValue == 0) ii compare jj else byValue
+    }
+  }
+  
+   def sort2[@specialized(Double, Float, Int, Byte) T](a:DenseMat[T], asc:Boolean)
+  (implicit classManifest:ClassManifest[T], ord:Ordering[T]): (DenseMat[T], IMat) = {
+    // Default dimension: along the row (2) for a single-row matrix, down the columns (1) otherwise.
+    val ik = if (a.nrows == 1) 2 else 1
+    sort2(a, ik, asc)
+  }
+
+  /*
+   * Sort a along dimension ik (1: every column, anything else: every row) and return
+   * the sorted values together with, for each result slot, the 0-based position the
+   * value had within its column (ik == 1) or row. a itself is not modified.
+   * The work is done by quickSort2 from BIDMat.Sorting, which receives the key array,
+   * the index array, two range bounds and a stride of 1 (asc) or -1 (descending).
+   * NOTE(review): presumably quickSort2 with stride -1 treats the range as running
+   * from the first bound down to (exclusive) the second - confirm against Sorting.
+   */
+  def sort2[@specialized(Double, Float, Int, Byte) T](a:DenseMat[T], ik:Int, asc:Boolean)
+  (implicit classManifest:ClassManifest[T], ord:Ordering[T]):(DenseMat[T], IMat) = {
+    import BIDMat.Sorting._
+    val out = new DenseMat[T](a.nrows, a.ncols)
+    val iout = IMat(a.nrows, a.ncols)
+    if (ik == 1) {
+      // Copy the values and record each element's row number, then sort column by column in place.
+      var i = 0
+      while (i < a.ncols) {
+        var j = 0
+        while (j < a.nrows) {
+        	iout.data(j+i*a.nrows) = j
+        	out.data(j+i*a.nrows) = a.data(j+i*a.nrows)
+        	j += 1
+        }
+        i += 1
+      }
+      i = 0
+      while (i < a.ncols) {
+      	if (asc) {
+      		quickSort2(out.data, iout.data, i*a.nrows, (i+1)*a.nrows, 1)
+      	} else {
+      		quickSort2(out.data, iout.data, (i+1)*a.nrows-1, i*a.nrows-1, -1)       
+      	}
+      	i += 1
+      } 
+      (out, iout)
+    } else {
+      // Rows are not contiguous, so each row is gathered into scratch arrays, sorted, and scattered back.
+      val vcols = new Array[T](a.ncols)
+      val icols = new Array[Int](a.ncols)
+      var i = 0
+      while (i < a.nrows) {
+        var j = 0
+        while (j < a.ncols) {
+          vcols(j) = a.data(i + j*a.nrows)
+          icols(j) = j
+          j += 1
+        }
+        if (asc) {
+          quickSort2(vcols, icols, 0, icols.length, 1)
+        } else {
+          quickSort2(vcols, icols, icols.length-1, -1, -1)      
+        }
+        j = 0
+        while (j < a.ncols) {
+          out.data(i+j*out.nrows) = vcols(j)
+          iout.data(i+j*iout.nrows) = icols(j)
+          j += 1
+        }
+        i += 1
+      }     
+      (out, iout)
+    }
+  }
+  
+  /*
+   * Lexicographic row sort. Returns an nrows x 1 IMat of 0-based row numbers of a,
+   * arranged by quickSort (from BIDMat.Sorting) using the comparison below; a itself
+   * is not modified. Rows are compared column by column from the left, and rows that
+   * agree in every column are ordered by row number. With asc == false the comparison
+   * arguments are swapped.
+   */
+  def sortlex[@specialized(Double, Float, Int, Byte) T](a:DenseMat[T], asc:Boolean)(implicit ordering:Ordering[T]):IMat = {
+    import BIDMat.Sorting._
+    val out = IMat(a.nrows,1)
+    val ii = out.data
+    val aa = a.data
+    val nr = a.nrows
+    // Start from the identity permutation.
+    var i = 0
+    while (i < a.nrows) {
+      out.data(i) = i
+      i += 1
+    }
+    // Compare the rows currently stored at permutation slots i and j.
+    def comp(i:Int, j:Int):Int = {
+      var k = 0
+      val ip = ii(i)
+      val jp = ii(j)
+      var c0 = 0
+      while (k < a.ncols && c0 == 0) {
+        c0 = ordering.compare(aa(ip+k*nr), aa(jp+k*nr))
+        k += 1
+      }
+      if (c0 != 0) {
+        c0
+      } else {
+        ip compare jp
+      }
+    }
+    // Exchange two permutation slots; the matrix data is never moved.
+    def swap(i:Int, j:Int):Unit = {
+      val tmp = ii(i)
+      ii(i) = ii(j)
+      ii(j) = tmp
+    }
+    if (asc) {
+      quickSort(comp, swap, 0, a.nrows)
+    } else {
+      quickSort((i:Int,j:Int)=>comp(j,i), swap, 0, a.nrows)
+    }
+    out
+  }
+  
+  /*
+   * Group equal values of a. Returns (bptrs, iptrs):
+   *  - iptrs (a.length x 1) gives, for every element position, the 0-based rank of its
+   *    value among the distinct values in ascending order;
+   *  - bptrs (one entry per distinct value) gives the smallest element position holding
+   *    that value.
+   * An empty input yields two empty 0 x 1 matrices.
+   * NOTE(review): the positions come from sort2(a, true), which sorts per column when a
+   * has several rows - presumably a is expected to be a vector; confirm.
+   */
+  def unique2[@specialized(Double, Float, Int) T](a:DenseMat[T])
+  (implicit manifest:Manifest[T], numeric:Numeric[T],  ord:Ordering[T]):(IMat, IMat) = {
+    if (a.length == 0) {
+      // Nothing to group; avoids reading iss.data(0) of an empty sort result.
+      (IMat(0,1), IMat(0,1))
+    } else {
+      val (vss, iss) = sort2(a, true)  
+      val iptrs = IMat(a.length,1)
+      var lastpos = 0
+      iptrs.data(iss.data(0)) = lastpos
+      var i = 1
+      while (i < iss.length) {
+        // A change between neighbouring sorted values starts a new group.
+        if (vss.data(i-1) !=  vss.data(i)) {
+          lastpos += 1
+        }
+        iptrs.data(iss.data(i)) = lastpos
+        i += 1
+      }
+      val bptrs = IMat(lastpos+1,1)
+      // Walk backwards so the smallest position of each group is written last.
+      i = iss.length
+      while (i > 0) {
+        bptrs.data(iptrs.data(i-1)) = i-1
+        i = i - 1
+      }
+      (bptrs, iptrs)    
+    }
+  } 
+  
+  /*
+   * Group equal rows of a. Returns (bptrs, iptrs):
+   *  - iptrs (a.nrows x 1) gives, for every row, the 0-based rank of that row among the
+   *    distinct rows in the order produced by sortlex(a, true);
+   *  - bptrs (one entry per distinct row) gives the smallest row number holding it.
+   * A matrix with no rows yields two empty 0 x 1 matrices.
+   */
+  def uniquerows2[@specialized(Double, Float, Int) T](a:DenseMat[T])(implicit ordering:Ordering[T]):(IMat, IMat) = {
+    if (a.nrows == 0) {
+      // Nothing to group; avoids reading iss.data(0) of an empty sort result.
+      (IMat(0,1), IMat(0,1))
+    } else {
+      val iss = sortlex(a, true)
+      // True when rows i and j agree in every column.
+      def compeq(i:Int, j:Int):Boolean = {
+        var k:Int = 0;
+        while (k < a.ncols && ordering.equiv(a(i,k):T, a(j,k):T)) {
+          k += 1
+        }
+        k == a.ncols
+      }
+      val iptrs = IMat(a.nrows, 1)
+      var lastpos = 0
+      iptrs.data(iss.data(0)) = lastpos
+      var i = 1
+      while (i < iss.length) {
+        // A change between neighbouring sorted rows starts a new group.
+        if (!compeq(iss.data(i-1), iss.data(i))) {
+          lastpos += 1
+        }
+        iptrs.data(iss.data(i)) = lastpos
+        i += 1
+      }
+      val bptrs = IMat(lastpos+1,1)
+      // Walk backwards so the smallest row number of each group is written last.
+      i = iss.length
+      while (i > 0) {
+        bptrs.data(iptrs.data(i-1)) = i-1
+        i = i - 1
+      }
+      (bptrs, iptrs)    
+    }
+  }    
+  
+  /*
+   * Accumulate values into a new nr x nc matrix. Each row of inds addresses one cell:
+   * with one column it is a linear index into the result, with two columns it is a
+   * (row, column) pair. The indices are used as given (0-based, no Mat.oneBased shift).
+   * vals supplies one value per index row, or a single value that is added for every
+   * index row. Values addressed to the same cell are summed with numeric.plus.
+   * Throws a RuntimeException when inds has more than two columns, when vals has more
+   * than one element but a different row count than inds, or when a (row, column) pair
+   * lies outside the result.
+   */
+  def accum[@specialized(Double, Float, Int) T](inds:IMat, vals:DenseMat[T], nr:Int, nc:Int)
+  (implicit numeric:Numeric[T], classManifest:ClassManifest[T]):DenseMat[T] = { 
+    if (inds.ncols > 2 || (vals.length > 1 && (inds.nrows != vals.nrows)))
+      throw new RuntimeException("mismatch in array dimensions")
+    else { 
+      val out = new DenseMat[T](nr, nc)
+      Mat.nflops += inds.nrows
+      // One value per index row, or the single value of vals reused for all of them.
+      val perRowVals = vals.length > 1
+      var i = 0
+      if (inds.ncols == 1) {
+        while (i < inds.nrows) { 
+          val indx = inds.data(i)
+          val v = if (perRowVals) vals.data(i) else vals.data(0)
+          out.data(indx) = numeric.plus(out.data(indx), v)
+          i += 1
+        }
+      } else { 
+        while (i < inds.nrows) { 
+          val r = inds.data(i)
+          val c = inds.data(i+inds.nrows)
+          // Check both bounds: a negative row combined with a positive column would
+          // otherwise map to a valid offset and silently update the wrong cell.
+          if (r < 0 || r >= nr || c < 0 || c >= nc)
+            throw new RuntimeException("indices out of bounds "+r+" "+c)
+          val indx = r + nr*c
+          val v = if (perRowVals) vals.data(i) else vals.data(0)
+          out.data(indx) = numeric.plus(out.data(indx), v)
+          i += 1
+        }
+      }
+      out
+    }
+  }
+
+}
+
+// Marker trait for the wildcard index argument: the indexing routines in DenseMat match on it
+// (case _:MatrixWildcard) and treat it as "all indices" instead of reading explicit index values.
+trait MatrixWildcard extends Mat
+
diff --git a/src/main/scala/BIDMat/FMat.scala b/src/main/scala/BIDMat/FMat.scala
new file mode 100755
index 00000000..868172fe
--- /dev/null
+++ b/src/main/scala/BIDMat/FMat.scala
@@ -0,0 +1,720 @@
+package BIDMat
+import edu.berkeley.bid.CBLAS._
+import edu.berkeley.bid.LAPACK._
+import edu.berkeley.bid.SPBLAS._
+import scala.actors.Actor._
+import java.util.Arrays
+
+
+case class FMat(nr:Int, nc:Int, data0:Array[Float]) extends DenseMat[Float](nr, nc, data0) {
+
+  /** Element count of this matrix; simply returns `length`. */
+  def size() = length;
+   
+  /** Transpose: calls the inherited `gt` with a null output and wraps the result as an FMat. */
+  override def t:FMat = FMat(gt(null))
+  
+  /**
+   * Value of a 1x1 matrix, widened to Double.
+   * Throws RuntimeException when the matrix has more than one row or column.
+   */
+  override def dv:Double =
+    if (nrows > 1 || ncols > 1) {
+      throw new RuntimeException("Matrix should be 1x1 to extract value")
+    } else {
+      data(0)
+    }
+  
+  /** Type tag string for this class. */
+  override def mytype = "FMat"
+  
+  /** Delegates to `CMat.imag(this)`. */
+  def i:CMat = CMat.imag(this)
+  
+  /** Horizontal concatenation via the inherited `ghorzcat`, wrapped as an FMat. */
+  def horzcat(b: FMat) = FMat(ghorzcat(b))
+  
+  /** Vertical concatenation via the inherited `gvertcat`, wrapped as an FMat. */
+  def vertcat(b: FMat) = FMat(gvertcat(b))
+  
+  /** Calls the inherited `gfind3` and wraps its three results as (IMat, IMat, FMat). */
+  def find3:(IMat, IMat, FMat) = { val (ii, jj, vv) = gfind3 ; (IMat(ii), IMat(jj), FMat(vv)) }
+  
+  // Indexing: each overload forwards to the inherited `gapply` and re-wraps the result as an FMat.
+  override def apply(a:IMat):FMat = FMat(gapply(a))
+  
+  override def apply(a:IMat, b:IMat):FMat = FMat(gapply(a, b))	
+  
+  override def apply(a:IMat, b:Int):FMat = FMat(gapply(a, b))	
+  
+  override def apply(a:Int, b:IMat):FMat = FMat(gapply(a, b))
+  
+  // Indexed assignment: each overload forwards to the inherited `_update`; a scalar
+  // row/column index is first wrapped with `IMat.ielem`.
+  def update(iv:IMat, jv:IMat, b:FMat):FMat = FMat(_update(iv, jv, b))
+
+  def update(iv:IMat, j:Int, b:FMat):FMat = FMat(_update(iv, IMat.ielem(j), b))
+
+  def update(i:Int, jv:IMat, b:FMat):FMat = FMat(_update(IMat.ielem(i), jv, b))
+  
+  /**
+   * Binary operation with a scalar function `f`, forwarded to the inherited `ggMatOp`.
+   * Only FMat right-hand operands are accepted; anything else throws RuntimeException.
+   */
+  def ffMatOp(b: Mat, f:(Float, Float) => Float, out:Mat):FMat = 
+    b match {
+      case bb:FMat => FMat(ggMatOp(bb, f, out))
+      case _ => throw new RuntimeException("unsupported operation "+f+" on "+this+" and "+b)	
+    }
+  
+  /**
+   * Binary operation with a vector kernel `f` (see the `vec*` functions in the FMat
+   * companion), forwarded to the inherited `ggMatOpv`.
+   * Only FMat right-hand operands are accepted; anything else throws RuntimeException.
+   */
+  def ffMatOpv(b: Mat, f:(Array[Float],Int,Int,Array[Float],Int,Int,Array[Float],Int,Int,Int) => Float, out:Mat) = 
+    b match {
+      case bb:FMat => FMat(ggMatOpv(bb, f, out))
+      case _ => throw new RuntimeException("unsupported operation "+f+" on "+this+" and "+b)	
+    }
+  
+  /** Matrix-with-scalar operation using a scalar function; forwards to `ggMatOpScalar`. */
+  def ffMatOpScalar(b: Float, f:(Float, Float) => Float, out:Mat):FMat = FMat(ggMatOpScalar(b, f, out))
+  
+  /** Matrix-with-scalar operation using a vector kernel; forwards to `ggMatOpScalarv`. */
+  def ffMatOpScalarv(b: Float, f:(Array[Float],Int,Int,Array[Float],Int,Int,Array[Float],Int,Int,Int) => Float, out:Mat) = 
+    FMat(ggMatOpScalarv(b, f, out))
+  
+  /** Forwards to the inherited `ggReduceOp` and wraps the result as an FMat. */
+  def ffReduceOp(n:Int, f1:(Float) => Float, f2:(Float, Float) => Float, out:Mat) = 
+    FMat(ggReduceOp(n, f1, f2, out))
+  
+  /** Forwards to the inherited `ggReduceOpv` and wraps the result as an FMat. */
+  def ffReduceOpv(n:Int, f:(Array[Float],Int,Int,Array[Float],Int,Int,Array[Float],Int,Int,Int) => Float, out:Mat) = 
+    FMat(ggReduceOpv(n, f, out))
+  
+  /** Forwards to the inherited `ggReduceAll` and wraps the result as an FMat. */
+  def ffReduceAll(n:Int, f1:(Float) => Float, f2:(Float, Float) => Float, out:Mat) = 
+    FMat(ggReduceAll(n, f1, f2, out))
+  
+  /** Forwards to the inherited `ggReduceAllv` and wraps the result as an FMat. */
+  def ffReduceAllv(n:Int, f:(Array[Float],Int,Int,Array[Float],Int,Int,Array[Float],Int,Int,Int) => Float, out:Mat) = 
+    FMat(ggReduceAllv(n, f, out))
+  
+  /**
+   * Formats element `i` of the backing array for display.
+   *
+   * Whole-number values with magnitude below 1e10 are printed as integers; every
+   * other value is printed with five significant digits ("%.5g").
+   *
+   * @param i linear index into `data`
+   * @return the formatted element
+   */
+  override def printOne(i:Int):String = {
+    val v = data(i)
+    if (v % 1 == 0 && math.abs(v) < 1e10) {
+      // Convert through Long: the 1e10 bound exceeds Int.MaxValue, so an Int
+      // conversion would saturate and print 2147483647 for values such as 3e9.
+      "%d" format v.toLong
+    } else {
+      "%.5g" format v
+    }
+  }
+  
+  /** Returns a newly allocated FMat of the same shape holding a copy of the first `length` elements. */
+  override def copy = {
+    val duplicate = FMat(nrows, ncols)
+    System.arraycopy(data, 0, duplicate.data, 0, length)
+    duplicate
+  }
+  
+  /**
+   * Copies this matrix into `a`, after asking `a` to `recycle` itself to this shape.
+   *
+   * @return the matrix returned by `recycle`, which now holds the copy (it may or
+   *         may not be the same object as `a`)
+   */
+  def copyTo(a:FMat) = {
+    val dest = a.recycle(nrows, ncols, 0)
+    System.arraycopy(data, 0, dest.data, 0, length)
+    dest
+  }
+  
+  /** Overwrites the first `length` elements of the backing array with `v` and returns this matrix. */
+  override def set(v:Float):FMat = {
+    Arrays.fill(data,0,length,v)
+    this
+  }
+  
+  /**
+   * Copies this matrix into `a` and returns the matrix that holds the copy.
+   *
+   * For an FMat destination the typed `copyTo(FMat)` may recycle `a` into a
+   * different FMat; that recycled matrix is what gets returned, so callers never
+   * receive a destination that was not written to. For a GMat destination the
+   * copy is delegated to `copyFrom` and the destination itself is returned.
+   *
+   * @param a destination matrix (FMat or GMat)
+   * @return the matrix containing the copied data
+   * @throws RuntimeException if the destination type is not supported
+   */
+  override def copyTo(a:Mat):Mat =
+    a match {
+      case out:FMat => copyTo(out):FMat
+      case aa:GMat => { aa.copyFrom(this); aa }
+      case _ => throw new RuntimeException("unsupported destination type for copyTo: "+a.mytype)
+    }
+  
+  /** Returns a newly allocated `nr` x `nc` FMat (a fresh JVM Float array, hence all zeros). */
+  override def zeros(nr:Int, nc:Int) = {
+  	FMat(nr, nc)
+  }
+  
+  /**
+   * Returns a newly allocated `nr` x `nc` FMat with every element equal to 1.
+   *
+   * The backing array is filled directly, the same way `set` and `clear` do,
+   * rather than going through the element-wise `update` path once per element.
+   */
+  override def ones(nr:Int, nc:Int) = {
+    val out = FMat(nr, nc)
+    Arrays.fill(out.data, 0, out.length, 1f)
+    out
+  }
+   
+  // Zero-fill helpers: each forwards to setUpper / setLower with value 0,
+  // using the given offset or an offset of 0.
+  override def clearUpper(off:Int) = setUpper(0, off)
+  override def clearUpper = setUpper(0, 0)
+  
+  override def clearLower(off:Int) = setLower(0, off)
+  override def clearLower = setLower(0, 0)
+
+  
+  /**
+   * Dense product `this * a`, with scalar fallbacks.
+   *
+   * - If `ncols == a.nrows`, computes the matrix product: hand-written loops when
+   *   `Mat.noMKL` is set, otherwise sgemv (when either side is a vector) or sgemm.
+   * - Otherwise, if either operand is 1x1, scales the other operand by that value.
+   *
+   * @param a      right-hand operand
+   * @param outmat passed to `FMat.newOrCheckFMat` to obtain the result container
+   * @return the result matrix
+   * @throws RuntimeException ("dimensions mismatch") when none of the cases applies
+   */
+  def fDMult(a:FMat, outmat:Mat):FMat = { 
+  	if (ncols == a.nrows) {
+  		val out = FMat.newOrCheckFMat(nrows, a.ncols, outmat)
+  		Mat.nflops += 2L * length * a.ncols
+  		if (Mat.noMKL) {
+  			// Pure-JVM path: accumulate into a cleared output, one column of `a` at a time.
+  			out.clear
+  			var i = 0
+  			while (i < a.ncols) {
+  				var j = 0
+  				while (j < a.nrows) {
+  					var k = 0
+  					// `ncols` equals `a.nrows` in this branch, so this indexes element (j, i) of `a`.
+  					val dval = a.data(j + i*ncols)
+  					while (k < nrows) {
+  						out.data(k+i*nrows) += data(k+j*nrows)*dval
+  						k += 1
+  					}
+  					j += 1
+  				}
+  				i += 1									
+  			}
+  		} else if (nrows == 1) {
+  			// Row vector times matrix, expressed as a transposed matrix-vector product on `a`.
+  			sgemv(ORDER.ColMajor, TRANSPOSE.Trans, a.nrows, a.ncols, 1.0f, a.data, a.nrows, data, 1, 0, out.data, 1)
+  		} else if (a.ncols == 1) {
+  			// Matrix times column vector.
+  			sgemv(ORDER.ColMajor, TRANSPOSE.NoTrans, nrows, ncols, 1.0f, data, nrows, a.data, 1, 0, out.data, 1)
+  		} else {
+  			sgemm(ORDER.ColMajor, TRANSPOSE.NoTrans, TRANSPOSE.NoTrans,
+  					nrows, a.ncols, ncols, 1.0f, data, nrows, a.data, a.nrows, 0, out.data, nrows)
+  		}
+  		out
+  	} else if (ncols == 1 && nrows == 1){
+  		// This matrix is a scalar: scale every element of `a`.
+  		val out = FMat.newOrCheckFMat(a.nrows, a.ncols, outmat)
+  		Mat.nflops += a.length
+  		var i = 0
+  		val dvar = data(0)
+  		while (i < a.length) {
+  			out.data(i) = dvar * a.data(i)
+  			i += 1
+  		}			    
+  		out			  
+  	} else if (a.ncols == 1 && a.nrows == 1){
+  		// `a` is a scalar: scale every element of this matrix.
+  		val out = FMat.newOrCheckFMat(nrows, ncols, outmat)
+  		Mat.nflops += length
+  		var i = 0
+  		val dvar = a.data(0)
+  		while (i < length) {
+  			out.data(i) = dvar * data(i)
+  			i += 1
+  		}			    
+  		out			  
+  	}	else throw new RuntimeException("dimensions mismatch")
+  }
+  
+  /**
+   * Accumulates into `out` the columns `istart` (inclusive) to `iend` (exclusive)
+   * of the product of this dense matrix with the sparse matrix `a`.
+   *
+   * For every stored entry of `a` in those columns, the matching column of this
+   * matrix, scaled by the entry's value, is added to the output column. `out` is
+   * only accumulated into, never cleared.
+   *
+   * @param ioff offset subtracted from the entries of `a.jc` and `a.ir` before use
+   */
+  def fSMultHelper(a:SMat, out:FMat, istart:Int, iend:Int, ioff:Int) = {
+    var col = istart
+    while (col < iend) {
+      val outBase = col * nrows
+      var p = a.jc(col) - ioff
+      val pend = a.jc(col+1) - ioff
+      while (p < pend) {
+        val scale = a.data(p)
+        val srcBase = (a.ir(p) - ioff) * nrows
+        if (Mat.noMKL || nrows < 220) {
+          // Short columns (or no native library): plain JVM loop.
+          var r = 0
+          while (r < nrows) {
+            out.data(outBase + r) += data(srcBase + r) * scale
+            r += 1
+          }
+        } else {
+          saxpyxx(nrows, scale, data, srcBase, out.data, outBase)
+        }
+        p += 1
+      }
+      col += 1
+    }
+  }
+  
+  /**
+   * Row-sliced variant of the dense-by-sparse accumulation: walks every column of
+   * `a`, but only updates output rows `istart` (inclusive) to `iend` (exclusive).
+   * `out` is only accumulated into, never cleared.
+   *
+   * @param ioff offset subtracted from the entries of `a.jc` and `a.ir` before use
+   */
+  def fSMultHelper2(a:SMat, out:FMat, istart:Int, iend:Int, ioff:Int) = {
+    var col = 0
+    while (col < a.ncols) {
+      val outBase = col * nrows
+      var p = a.jc(col) - ioff
+      val pend = a.jc(col+1) - ioff
+      while (p < pend) {
+        val scale = a.data(p)
+        val srcBase = (a.ir(p) - ioff) * nrows
+        var r = istart
+        while (r < iend) {
+          out.data(outBase + r) += data(srcBase + r) * scale
+          r += 1
+        }
+        p += 1
+      }
+      col += 1
+    }
+  }
+  
+  /**
+   * Product of this dense matrix with the sparse matrix `a`.
+   *
+   * When `Mat.noMKL` is set or more than one thread is configured, the work is
+   * done by `fSMultHelper`, split by columns of `a` across `Mat.numThreads` actors
+   * if the problem is large enough. Otherwise the native sparse routines are used.
+   *
+   * @param a      sparse right-hand operand; `a.nrows` must equal `ncols`
+   * @param outmat passed to `FMat.newOrCheckFMat` to obtain the result container
+   * @return the `nrows` x `a.ncols` result
+   * @throws RuntimeException ("dimensions mismatch") if the inner dimensions differ
+   */
+  def fSMult(a:SMat, outmat:Mat):FMat = {
+    if (ncols != a.nrows) {
+      throw new RuntimeException("dimensions mismatch")
+    } else {
+      val out = FMat.newOrCheckFMat(nrows, a.ncols, outmat)
+      out.clear
+      Mat.nflops += 2L * nrows * a.nnz
+      val ioff = Mat.ioneBased;
+      if (Mat.noMKL || Mat.numThreads > 1) {
+        if (1L*nrows*a.nnz > 100000L && Mat.numThreads > 1) {
+          val done = IMat(1,Mat.numThreads)
+          for (ithread <- 0 until Mat.numThreads) {
+            // Long arithmetic: ithread*a.ncols can exceed the Int range for very
+            // wide matrices, which would yield negative or wrapped column bounds.
+            val istart = (1L*ithread*a.ncols/Mat.numThreads).toInt
+            val iend = (1L*(ithread+1)*a.ncols/Mat.numThreads).toInt
+            actor {
+              fSMultHelper(a, out, istart, iend, ioff)
+              done(ithread) = 1
+            }
+          }
+          // Spin until every worker has flagged completion.
+          while (SciFunctions.sum(done).v < Mat.numThreads) {Thread.`yield`()}
+        } else {
+          fSMultHelper(a, out, 0, a.ncols, ioff)
+        }
+      } else {
+        // The native routines are handed incremented index arrays when the
+        // matrix indices are zero-based.
+        val jc0 = if (ioff == 0) SparseMat.incInds(a.jc) else a.jc
+        val ir0 = if (ioff == 0) SparseMat.incInds(a.ir) else a.ir
+        if (nrows == 1) {
+          scscmv("T", a.nrows, a.ncols, 1.0f, "GLNF", a.data, ir0, jc0, data, 0f, out.data)
+        } else {
+          smcscm(nrows, a.ncols, data, nrows, a.data, ir0, jc0, out.data, nrows)
+        }
+      }
+      out
+    }
+  }
+  
+  /**
+   * Product of this matrix with the transpose of the sparse matrix `a`,
+   * delegated to the native `smcsrm` routine.
+   *
+   * @param a      sparse operand; must have the same number of columns as this matrix
+   * @param outmat passed to `FMat.newOrCheckFMat` to obtain the result container
+   * @return the `nrows` x `a.nrows` result
+   * @throws RuntimeException ("xT dimensions mismatch") if the column counts differ
+   */
+  def multT(a:SMat, outmat:Mat):FMat = {
+    import edu.berkeley.bid.CBLAS._
+    if (ncols != a.ncols) {
+      throw new RuntimeException("xT dimensions mismatch")
+    }
+    val result = FMat.newOrCheckFMat(nrows, a.nrows, outmat)
+    result.clear
+    smcsrm(nrows, a.ncols, data, nrows, a.data, a.ir, a.jc, result.data, nrows)
+    Mat.nflops += 2L * a.nnz * nrows
+    result
+  }
+  
+  /**
+   * Product of this matrix with the transpose of the dense matrix `a`,
+   * computed by a single sgemm call with the second operand transposed.
+   *
+   * @param a      dense operand; must have the same number of columns as this matrix
+   * @param outmat passed to `FMat.newOrCheckFMat` to obtain the result container
+   * @return the `nrows` x `a.nrows` result
+   * @throws RuntimeException ("xT dimensions mismatch") if the column counts differ
+   */
+  def multT(a:FMat, outmat:Mat):FMat = {
+    import edu.berkeley.bid.CBLAS._
+    if (ncols != a.ncols) {
+      throw new RuntimeException("xT dimensions mismatch")
+    }
+    val result = FMat.newOrCheckFMat(nrows, a.nrows, outmat)
+    sgemm(ORDER.ColMajor, TRANSPOSE.NoTrans, TRANSPOSE.Trans,
+        nrows, a.nrows, ncols, 1.0f, data, nrows, a.data, a.nrows, 0, result.data, nrows)
+    Mat.nflops += 2L * length * a.nrows
+    result
+  }
+  /*
+  * Column-based (Streaming) multiply
+  */
+  
+  /**
+   * Pure-JVM dense product `this * aa`, accumulated column by column.
+   *
+   * The output is cleared first; then each element of `aa` scales one column of
+   * this matrix into the corresponding output column.
+   *
+   * @throws RuntimeException ("dimensions mismatch") unless `ncols == aa.nrows`
+   */
+  def DMult(aa:FMat, omat:Mat):FMat = {
+    if (ncols != aa.nrows) {
+      throw new RuntimeException("dimensions mismatch")
+    }
+    val out = FMat.newOrCheckFMat(nrows, aa.ncols, omat) // Needs to be cleared
+    out.clear
+    var col = 0
+    while (col < aa.ncols) {
+      val outBase = col * nrows
+      var inner = 0
+      while (inner < aa.nrows) {
+        // `ncols` equals `aa.nrows` here, so this indexes element (inner, col) of `aa`.
+        val scale = aa.data(inner + col*ncols)
+        val srcBase = inner * nrows
+        var row = 0
+        while (row < nrows) {
+          out.data(outBase + row) += data(srcBase + row) * scale
+          row += 1
+        }
+        inner += 1
+      }
+      col += 1
+    }
+    out
+  }
+
+  /*
+   * Very slow, row-and-column multiply
+   */
+  
+  /**
+   * Pure-JVM dense product `this * aa`, computed one output element at a time
+   * as the dot product of a row of this matrix with a column of `aa`.
+   *
+   * @throws RuntimeException ("dimensions mismatch") unless `ncols == aa.nrows`
+   */
+  def sDMult(aa:FMat, omat:Mat):FMat = {
+    if (ncols != aa.nrows) {
+      throw new RuntimeException("dimensions mismatch")
+    }
+    val out = FMat.newOrCheckFMat(nrows, aa.ncols, omat)
+    var col = 0
+    while (col < aa.ncols) {
+      var row = 0
+      while (row < nrows) {
+        var acc = 0f
+        var inner = 0
+        while (inner < ncols) {
+          acc += data(row + inner*nrows) * aa.data(inner + col*aa.nrows)
+          inner += 1
+        }
+        out.data(row + col*out.nrows) = acc
+        row += 1
+      }
+      col += 1
+    }
+    out
+  }
+  
+  /** Delegates the product of this matrix and `b` to `GMat.GPUmult`, passing `out` through. */
+  def GPUmult(b:FMat, out:Mat) = GMat.GPUmult(this, b, out)
+  
+  /** Dot product with another FMat, via the inherited `dot`. */
+  def dot(a:FMat):Double = super.dot(a)
+  
+  /** Dot product with a generic Mat; `a` is cast to FMat, so any other type fails with ClassCastException. */
+  override def dot(a:Mat):Double = super.dot(a.asInstanceOf[FMat])
+  
+  /**
+   * Backs the `/` operator: LU-factorises a copy of the square matrix `a0` with
+   * sgetrf and then solves against a copy of this matrix with sgetrs.
+   * Neither operand is modified.
+   *
+   * Both LAPACK calls use RowMajor order on the column-major buffers.
+   * NOTE(review): presumably this makes LAPACK see the transposed system so that
+   * the result is `this * inverse(a0)` — confirm.
+   *
+   * @param a0 must be a square FMat whose size equals `ncols`
+   * @return a new FMat with the shape of this matrix
+   * @throws RuntimeException if `a0` is not an FMat, or the shapes are unsuitable
+   */
+  def solvel(a0:Mat):FMat =
+    a0 match {
+      case a:FMat => {
+        Mat.nflops += 2L*a.nrows*a.nrows*a.nrows/3 + 2L*nrows*a.nrows*a.nrows
+        if (a.nrows != a.ncols || ncols != a.nrows) {
+          throw new RuntimeException("solve needs a square matrix")
+        }
+        val n = ncols
+        val result = FMat(nrows, n)
+        val lu = new Array[Float](n*n)
+        System.arraycopy(a.data, 0, lu, 0, a.length)
+        System.arraycopy(data, 0, result.data, 0, length)
+        val pivots = new Array[Int](n)
+        sgetrf(ORDER.RowMajor, n, n, lu, n, pivots)
+        sgetrs(ORDER.RowMajor, "N", n, nrows, lu, n, pivots, result.data, nrows)
+        result
+      }
+      case _ => throw new RuntimeException("unsupported arg to / "+a0)
+    }
+  
+  /**
+   * Backs the `\\` operator: LU-factorises a copy of this (square) matrix with
+   * sgetrf and then solves against a copy of `a0` with sgetrs.
+   * Neither operand is modified.
+   *
+   * @param a0 right-hand side; must be an FMat with `a0.nrows == ncols`
+   * @return a new FMat with the shape of `a0`
+   * @throws RuntimeException if `a0` is not an FMat, or the shapes are unsuitable
+   */
+  def solver(a0:Mat):FMat =
+    a0 match {
+      case a:FMat => {
+        Mat.nflops += 2L*nrows*nrows*nrows/3 + 2L*nrows*nrows*a.ncols
+        if (nrows != ncols || ncols != a.nrows) {
+          throw new RuntimeException("solve needs a square matrix")
+        }
+        val n = ncols
+        val result = FMat(a.nrows, a.ncols)
+        val lu = new Array[Float](n*n)
+        System.arraycopy(data, 0, lu, 0, length)
+        System.arraycopy(a.data, 0, result.data, 0, a.length)
+        val pivots = new Array[Int](n)
+        sgetrf(ORDER.ColMajor, n, n, lu, n, pivots)
+        sgetrs(ORDER.ColMajor, "N", n, a.ncols, lu, nrows, pivots, result.data, nrows)
+        result
+      }
+      case _ => throw new RuntimeException("unsupported arg to \\ "+a0)
+    }
+  
+  /**
+   * Matrix inverse computed on a copy of this matrix with LAPACK sgetrf followed
+   * by sgetri; this matrix is not modified.
+   *
+   * @return a new FMat holding the inverse
+   * @throws RuntimeException if this matrix is not square
+   */
+  def inv:FMat = {
+    import edu.berkeley.bid.LAPACK._
+    if (nrows != ncols) {
+      throw new RuntimeException("inv method needs a square matrix")
+    }
+    val inverse = FMat(nrows, ncols)
+    System.arraycopy(data, 0, inverse.data, 0, length)
+    val pivots = new Array[Int](nrows)
+    sgetrf(ORDER.ColMajor, nrows, ncols, inverse.data, nrows, pivots)
+    sgetri(ORDER.ColMajor, nrows, inverse.data, nrows, pivots)
+    inverse
+  }
+  
+  /** Zeroes the first `length` elements of the backing array and returns this matrix. */
+  override def clear = {
+    Arrays.fill(this.data,0,length,0)
+    this
+  }
+  
+  /**
+   * Returns an FMat of shape `nr` x `nc`, reusing storage where possible:
+   * this matrix itself if the shape already matches, a new FMat sharing this
+   * backing array if the array is large enough, otherwise a fresh allocation.
+   *
+   * @param nnz not used by this implementation
+   */
+  override def recycle(nr:Int, nc:Int, nnz:Int):FMat = {
+    val sameShape = nrows == nr && nc == ncols
+    if (sameShape) this
+    else if (data.size >= nr*nc) new FMat(nr, nc, data)
+    else FMat(nr, nc)
+  }
+
+  /*
+   * Basic operators on pairs of FMats. These are the compute routines.
+   */
+  // Matrix-matrix operators. All allocate their result (null output argument).
+  //   xG  -> GPUmult             xT -> multT (product with the transpose of b)
+  //   *   -> fDMult / fSMult     /  -> solvel        \\ -> solver
+  //   + - *@ /@ -> element-wise kernels from the FMat companion
+  def xG (b :FMat) = GPUmult(b, null)
+  def +  (b : FMat) = ffMatOpv(b, FMat.vecAdd _, null)
+  def -  (b : FMat) = ffMatOpv(b, FMat.vecSub _, null)
+  def *  (b : FMat) = fDMult(b, null)
+  def *  (b : SMat) = fSMult(b, null)
+  def xT  (b : SMat) = multT(b, null)
+  def xT  (b : FMat) = multT(b, null)
+  def /  (b : FMat) = solvel(b)
+  def \\ (b : FMat) = solver(b)
+  def *@ (b : FMat) = ffMatOpv(b, FMat.vecMul _, null)
+  def /@ (b : FMat) = ffMatOpv(b, FMat.fVecDiv _, null)
+
+  // Float scalar on the right; `*` wraps the scalar in a 1x1 FMat and reuses fDMult.
+  override def *  (b : Float) = fDMult(FMat.felem(b), null)
+  override def +  (b : Float) = ffMatOpScalarv(b, FMat.vecAdd _, null)
+  override def -  (b : Float) = ffMatOpScalarv(b, FMat.vecSub _, null)
+  override def *@ (b : Float) = ffMatOpScalarv(b, FMat.vecMul _, null)
+  override def /@ (b : Float) = ffMatOpScalarv(b, FMat.fVecDiv _, null)
+
+  // Int scalar on the right (widened to Float at the call).
+  override def *  (b : Int) = fDMult(FMat.felem(b), null)
+  override def +  (b : Int) = ffMatOpScalarv(b, FMat.vecAdd _, null)
+  override def -  (b : Int) = ffMatOpScalarv(b, FMat.vecSub _, null)
+  override def *@ (b : Int) = ffMatOpScalarv(b, FMat.vecMul _, null)
+  override def /@ (b : Int) = ffMatOpScalarv(b, FMat.fVecDiv _, null)
+
+  // Double scalar on the right; the value is narrowed to Float before use.
+  override def *  (b : Double) = fDMult(FMat.felem(b.asInstanceOf[Float]), null)
+  override def +  (b : Double) = ffMatOpScalarv(b.asInstanceOf[Float], FMat.vecAdd _, null)
+  override def -  (b : Double) = ffMatOpScalarv(b.asInstanceOf[Float], FMat.vecSub _, null)
+  override def *@ (b : Double) = ffMatOpScalarv(b.asInstanceOf[Float], FMat.vecMul _, null)
+  override def /@ (b : Double) = ffMatOpScalarv(b.asInstanceOf[Float], FMat.fVecDiv _, null)
+
+  // Element-wise comparisons against another FMat. The result is an FMat holding
+  // 1f where the comparison is true and 0f where it is false. `==` and `===` use
+  // the same predicate.
+  def >   (b : FMat) = ffMatOp(b, (x:Float, y:Float) => if (x > y) 1f else 0f, null)
+  def <   (b : FMat) = ffMatOp(b, (x:Float, y:Float) => if (x < y) 1f else 0f, null)
+  def ==  (b : FMat) = ffMatOp(b, (x:Float, y:Float) => if (x == y) 1f else 0f, null)
+  def === (b : FMat) = ffMatOp(b, (x:Float, y:Float) => if (x == y) 1f else 0f, null)
+  def >=  (b : FMat) = ffMatOp(b, (x:Float, y:Float) => if (x >= y) 1f else 0f, null)
+  def <=  (b : FMat) = ffMatOp(b, (x:Float, y:Float) => if (x <= y) 1f else 0f, null)
+  def !=  (b : FMat) = ffMatOp(b, (x:Float, y:Float) => if (x != y) 1f else 0f, null)
+
+  // Comparisons against a Double scalar; the scalar is narrowed to Float first.
+  override def >   (b : Double) = ffMatOpScalar(b.asInstanceOf[Float], (x:Float, y:Float) => if (x > y) 1f else 0f, null)
+  override def <   (b : Double) = ffMatOpScalar(b.asInstanceOf[Float], (x:Float, y:Float) => if (x < y) 1f else 0f, null)
+  override def ==  (b : Double) = ffMatOpScalar(b.asInstanceOf[Float], (x:Float, y:Float) => if (x == y) 1f else 0f, null)
+  override def === (b : Double) = ffMatOpScalar(b.asInstanceOf[Float], (x:Float, y:Float) => if (x == y) 1f else 0f, null)
+  override def >=  (b : Double) = ffMatOpScalar(b.asInstanceOf[Float], (x:Float, y:Float) => if (x >= y) 1f else 0f, null)
+  override def <=  (b : Double) = ffMatOpScalar(b.asInstanceOf[Float], (x:Float, y:Float) => if (x <= y) 1f else 0f, null)
+  override def !=  (b : Double) = ffMatOpScalar(b.asInstanceOf[Float], (x:Float, y:Float) => if (x != y) 1f else 0f, null) 
+  
+  // Comparisons against an Int scalar (widened to Float at the call).
+  override def >   (b : Int) = ffMatOpScalar(b, (x:Float, y:Float) => if (x > y) 1f else 0f, null)
+  override def <   (b : Int) = ffMatOpScalar(b, (x:Float, y:Float) => if (x < y) 1f else 0f, null)
+  override def ==  (b : Int) = ffMatOpScalar(b, (x:Float, y:Float) => if (x == y) 1f else 0f, null)
+  override def === (b : Int) = ffMatOpScalar(b, (x:Float, y:Float) => if (x == y) 1f else 0f, null)
+  override def >=  (b : Int) = ffMatOpScalar(b, (x:Float, y:Float) => if (x >= y) 1f else 0f, null)
+  override def <=  (b : Int) = ffMatOpScalar(b, (x:Float, y:Float) => if (x <= y) 1f else 0f, null)
+  override def !=  (b : Int) = ffMatOpScalar(b, (x:Float, y:Float) => if (x != y) 1f else 0f, null) 
+  
+  // `\` is horizontal concatenation and `on` is vertical concatenation; a Float
+  // operand is first wrapped in a 1x1 FMat.
+  def \ (b: FMat) = horzcat(b)
+  def \ (b: Float) = horzcat(FMat.felem(b))
+  
+  def on (b: FMat) = vertcat(b)
+  def on (b: Float) = vertcat(FMat.felem(b))
+  
+  // `~` builds a pair whose first component is this matrix and whose second is `b`.
+  def ~ (b : FMat):FPair = new FPair(this, b)
+  def ~ (b : SMat):SPair = new SPair(this, b)
+  
+  // Generic version: dispatches on the runtime type of `b`; only FMat and SMat are accepted.
+  override def ~ (b: Mat):Pair = 
+    b match {
+    case db:FMat => new FPair(this, db)
+    case sb:SMat => new SPair(this, sb)
+    case _ => throw new RuntimeException("mismatched types for operator ~")
+  }
+  
+ /*
+  * Specialize to IMats to help the type system. 
+  */ 
+  // Each operator converts the IMat operand with `FMat(b)` and then applies the
+  // corresponding FMat-FMat operator.
+  def +  (b : IMat):FMat = this + FMat(b)
+  def -  (b : IMat):FMat = this - FMat(b)
+  def *  (b : IMat):FMat = this * FMat(b)
+  def /  (b : IMat):FMat = this / FMat(b)
+  def \\ (b : IMat):FMat = this \\ FMat(b)
+  def *@ (b : IMat):FMat = this *@ FMat(b)
+  def /@ (b : IMat):FMat = this /@ FMat(b)
+  def \  (b : IMat):FMat = this \ FMat(b)
+  def on (b : IMat):FMat = this on FMat(b) 
+  
+  def >   (b : IMat):FMat = this > FMat(b)
+  def <   (b : IMat):FMat = this < FMat(b)
+  def >=  (b : IMat):FMat = this >= FMat(b)
+  def <=  (b : IMat):FMat = this <= FMat(b)
+  def ==  (b : IMat):FMat = this == FMat(b)
+  def === (b : IMat):FMat = this === FMat(b) 
+  def !=  (b : IMat):FMat = this != FMat(b)
+  
+ /*
+  * Specialize to DMats to help the type system. 
+  */ 
+  // Each operator converts this matrix with `DMat(this)` and then applies the
+  // corresponding DMat operator, so the result is a DMat.
+  def +  (b : DMat):DMat = DMat(this) + b
+  def -  (b : DMat):DMat = DMat(this) - b
+  def *  (b : DMat):DMat = DMat(this) * b
+  def /  (b : DMat):DMat = DMat(this) / b
+  def \\ (b : DMat):DMat = DMat(this) \\ b
+  def *@ (b : DMat):DMat = DMat(this) *@ b
+  def /@ (b : DMat):DMat = DMat(this) /@ b
+  def \  (b : DMat):DMat = DMat(this) \ b
+  def on (b : DMat):DMat = DMat(this) on b 
+  
+  def >   (b : DMat):DMat = DMat(this) > b
+  def <   (b : DMat):DMat = DMat(this) < b
+  def >=  (b : DMat):DMat = DMat(this) >= b
+  def <=  (b : DMat):DMat = DMat(this) <= b
+  def ==  (b : DMat):DMat = DMat(this) == b
+  def === (b : DMat):DMat = DMat(this) === b 
+  def !=  (b : DMat):DMat = DMat(this) != b
+  
+ /*
+  * Specialize to CMats to help the type system. 
+  */ 
+  // Each operator converts this matrix with `CMat(this)` and then applies the
+  // corresponding CMat operator, so the result is a CMat.
+  def +  (b : CMat):CMat = CMat(this) + b
+  def -  (b : CMat):CMat = CMat(this) - b
+  def *  (b : CMat):CMat = CMat(this) * b
+  def /  (b : CMat):CMat = CMat(this) / b
+  def \\ (b : CMat):CMat = CMat(this) \\ b
+  def *@ (b : CMat):CMat = CMat(this) *@ b
+  def /@ (b : CMat):CMat = CMat(this) /@ b
+  def \  (b : CMat):CMat = CMat(this) \ b
+  def on (b : CMat):CMat = CMat(this) on b 
+  
+ /*
+  * Operators whose second arg is generic. 
+  */ 
+  import Operator._
+  // Generic right-hand operand: dispatch through `applyMat` with the matching
+  // operator code and no preallocated output.
+  override def +  (b : Mat):Mat = applyMat(this, b, null, Mop_Plus)
+  override def -  (b : Mat):Mat = applyMat(this, b, null, Mop_Minus)
+  override def *  (b : Mat):Mat = applyMat(this, b, null, Mop_Times)
+  // Product with the transpose of a generic operand. Only SMat and FMat are
+  // supported; any other type is reported with an explicit RuntimeException (as
+  // the generic `~` does) instead of surfacing as a MatchError.
+  override def xT  (b : Mat) = b match {
+    case bb:SMat => multT(bb, null)
+    case bb:FMat => multT(bb, null)
+    case _ => throw new RuntimeException("mismatched types for operator xT")
+  }
+  // Generic right-hand operand: dispatch through `applyMat` with the matching
+  // operator code and no preallocated output. `==` and `===` share Mop_EQ.
+  override def /  (b : Mat):Mat = applyMat(this, b, null, Mop_Div)
+  override def \\ (b : Mat):Mat = applyMat(this, b, null, Mop_RSolve)
+  override def *@ (b : Mat):Mat = applyMat(this, b, null, Mop_ETimes)
+  override def /@ (b : Mat):Mat = applyMat(this, b, null, Mop_EDiv)
+  override def \  (b : Mat):Mat = applyMat(this, b, null, Mop_HCat)
+  override def on (b : Mat):Mat = applyMat(this, b, null, Mop_VCat)
+  
+  override def >   (b : Mat):Mat = applyMat(this, b, null, Mop_GT)
+  override def <   (b : Mat):Mat = applyMat(this, b, null, Mop_LT)
+  override def >=  (b : Mat):Mat = applyMat(this, b, null, Mop_GE)
+  override def <=  (b : Mat):Mat = applyMat(this, b, null, Mop_LE)
+  override def ==  (b : Mat):Mat = applyMat(this, b, null, Mop_EQ)
+  override def === (b : Mat):Mat = applyMat(this, b, null, Mop_EQ) 
+  override def !=  (b : Mat):Mat = applyMat(this, b, null, Mop_NE)
+
+}
+
+/**
+ * Pairs an output matrix with an FMat operand.
+ *
+ * Every operator applies the corresponding FMat operation to `mat` and the
+ * right-hand operand, passing `omat` as the output argument instead of null.
+ *
+ * @param omat matrix handed to the underlying operation as its output container
+ * @param mat  left-hand operand of every operator
+ */
+class FPair(val omat:Mat, val mat:FMat) extends Pair {
+  
+  override def t:FMat = FMat(mat.gt(omat))
+  
+  // Matrix-matrix operators.
+  def xG (b :FMat) = mat.GPUmult(b, omat)
+  def * (b : FMat) = mat.fDMult(b, omat) 
+  def * (b : SMat) = mat.fSMult(b, omat) 
+  def xT  (b : SMat) = mat.multT(b, omat)
+  def xT  (b : FMat) = mat.multT(b, omat)
+  def + (b : FMat) = mat.ffMatOpv(b, FMat.vecAdd _, omat)
+  def - (b : FMat) = mat.ffMatOpv(b, FMat.vecSub _, omat)
+  def *@ (b : FMat) = mat.ffMatOpv(b, FMat.vecMul _, omat)
+  def /@ (b : FMat) = mat.ffMatOpv(b, FMat.fVecDiv _, omat)  
+  def ^ (b : FMat) = mat.ffMatOp(b, (x:Float, y:Float) => math.pow(x,y).toFloat, omat)  
+
+  // Element-wise comparisons; results hold 1.0f for true and 0.0f for false.
+  def > (b : FMat) = mat.ffMatOp(b, (x:Float, y:Float) => if (x > y) 1.0f else 0.0f, omat)
+  def < (b : FMat) = mat.ffMatOp(b, (x:Float, y:Float) => if (x < y) 1.0f else 0.0f, omat)
+  def == (b : FMat) = mat.ffMatOp(b, (x:Float, y:Float) => if (x == y) 1.0f else 0.0f, omat)
+  def === (b : FMat) = mat.ffMatOp(b, (x:Float, y:Float) => if (x == y) 1.0f else 0.0f, omat)
+  def >= (b : FMat) = mat.ffMatOp(b, (x:Float, y:Float) => if (x >= y) 1.0f else 0.0f, omat)
+  def <= (b : FMat) = mat.ffMatOp(b, (x:Float, y:Float) => if (x <= y) 1.0f else 0.0f, omat)
+  def != (b : FMat) = mat.ffMatOp(b, (x:Float, y:Float) => if (x != y) 1.0f else 0.0f, omat) 
+  
+  // Float (and, for `*`, Double) scalar on the right.
+  override def * (b : Float) = mat.fDMult(FMat.felem(b), omat)
+  override def * (b : Double) = mat.fDMult(FMat.felem(b.asInstanceOf[Float]), omat)
+  override def + (b : Float) = mat.ffMatOpScalarv(b, FMat.vecAdd _, omat)
+  override def - (b : Float) = mat.ffMatOpScalarv(b, FMat.vecSub _, omat)
+  override def *@ (b : Float) = mat.ffMatOpScalarv(b, FMat.vecMul _, omat)
+  override def /@ (b : Float) = mat.ffMatOpScalarv(b, FMat.fVecDiv _, omat)
+  override def ^ (b : Float) = mat.ffMatOpScalar(b, (x:Float, y:Float) => math.pow(x,y).toFloat, omat)
+
+  override def > (b : Float) = mat.ffMatOpScalar(b, (x:Float, y:Float) => if (x > y) 1.0f else 0.0f, omat)
+  override def < (b : Float) = mat.ffMatOpScalar(b, (x:Float, y:Float) => if (x < y) 1.0f else 0.0f, omat)
+  override def == (b : Float) = mat.ffMatOpScalar(b, (x:Float, y:Float) => if (x == y) 1.0f else 0.0f, omat)
+  override def >= (b : Float) = mat.ffMatOpScalar(b, (x:Float, y:Float) => if (x >= y) 1.0f else 0.0f, omat)
+  override def <= (b : Float) = mat.ffMatOpScalar(b, (x:Float, y:Float) => if (x <= y) 1.0f else 0.0f, omat)
+  override def != (b : Float) = mat.ffMatOpScalar(b, (x:Float, y:Float) => if (x != y) 1.0f else 0.0f, omat)  
+    
+  // Int scalar on the right (widened to Float at the call).
+  override def * (b : Int) = mat.fDMult(FMat.felem(b), omat)
+  override def + (b : Int) = mat.ffMatOpScalarv(b, FMat.vecAdd _, omat)
+  override def - (b : Int) = mat.ffMatOpScalarv(b, FMat.vecSub _, omat)
+  override def *@ (b : Int) = mat.ffMatOpScalarv(b, FMat.vecMul _, omat)
+  override def /@ (b : Int) = mat.ffMatOpScalarv(b, FMat.fVecDiv _, omat)
+  override def ^ (b : Int) = mat.ffMatOpScalar(b, (x:Float, y:Float) => math.pow(x,y).toFloat, omat)
+
+  override def > (b : Int) = mat.ffMatOpScalar(b, (x:Float, y:Float) => if (x > y) 1.0f else 0.0f, omat)
+  override def < (b : Int) = mat.ffMatOpScalar(b, (x:Float, y:Float) => if (x < y) 1.0f else 0.0f, omat)
+  override def == (b : Int) = mat.ffMatOpScalar(b, (x:Float, y:Float) => if (x == y) 1.0f else 0.0f, omat)
+  override def >= (b : Int) = mat.ffMatOpScalar(b, (x:Float, y:Float) => if (x >= y) 1.0f else 0.0f, omat)
+  override def <= (b : Int) = mat.ffMatOpScalar(b, (x:Float, y:Float) => if (x <= y) 1.0f else 0.0f, omat)
+  override def != (b : Int) = mat.ffMatOpScalar(b, (x:Float, y:Float) => if (x != y) 1.0f else 0.0f, omat) 
+  
+  // Generic right-hand operand: dispatch through `applyMat` with `omat` as output.
+  import Operator._
+  override def +  (b : Mat):Mat = applyMat(mat, b, omat, Mop_Plus)
+  override def -  (b : Mat):Mat = applyMat(mat, b, omat, Mop_Minus)
+  override def *  (b : Mat):Mat = applyMat(mat, b, omat, Mop_Times)
+  // Only SMat and FMat operands are supported; report anything else explicitly
+  // rather than letting a MatchError escape.
+  override def xT  (b : Mat) = b match {
+    case bb:SMat => mat.multT(bb, omat)
+    case bb:FMat => mat.multT(bb, omat)
+    case _ => throw new RuntimeException("mismatched types for operator xT")
+    }
+  override def /  (b : Mat):Mat = applyMat(mat, b, omat, Mop_Div)
+  override def \\ (b : Mat):Mat = applyMat(mat, b, omat, Mop_RSolve)
+  override def *@ (b : Mat):Mat = applyMat(mat, b, omat, Mop_ETimes)
+  override def /@ (b : Mat):Mat = applyMat(mat, b, omat, Mop_EDiv)
+  override def \  (b : Mat):Mat = applyMat(mat, b, omat, Mop_HCat)
+  override def on (b : Mat):Mat = applyMat(mat, b, omat, Mop_VCat)
+  
+  override def >   (b : Mat):Mat = applyMat(mat, b, omat, Mop_GT)
+  override def <   (b : Mat):Mat = applyMat(mat, b, omat, Mop_LT)
+  override def >=  (b : Mat):Mat = applyMat(mat, b, omat, Mop_GE)
+  override def <=  (b : Mat):Mat = applyMat(mat, b, omat, Mop_LE)
+  override def ==  (b : Mat):Mat = applyMat(mat, b, omat, Mop_EQ)
+  override def === (b : Mat):Mat = applyMat(mat, b, omat, Mop_EQ) 
+  override def !=  (b : Mat):Mat = applyMat(mat, b, omat, Mop_NE)
+}
+
+object FMat {
+  
+  /**
+   * Strided element-wise division kernel: `c(ci) = a(ai) / b(bi)`.
+   *
+   * Starting at `a0`, `b0`, `c0`, the three cursors advance by `ainc`, `binc`,
+   * `cinc` after each element; the loop runs while the output cursor is below
+   * `c0 + n`.
+   *
+   * @return always 0f
+   */
+  def fVecDiv(a:Array[Float], a0:Int, ainc:Int, b:Array[Float], b0:Int, binc:Int, c:Array[Float], c0:Int, cinc:Int, n:Int):Float = {
+    val stop = c0 + n
+    var ia = a0
+    var ib = b0
+    var ic = c0
+    while (ic < stop) {
+      c(ic) = a(ia) / b(ib)
+      ia += ainc
+      ib += binc
+      ic += cinc
+    }
+    0f
+  }
+  
+  /** Allocates an `nr` x `nc` FMat backed by a new Float array of `nr*nc` elements. */
+  def apply(nr:Int, nc:Int) = new FMat(nr, nc, new Array[Float](nr*nc))
+  
+  /** Wraps a DenseMat[Float] as an FMat; the backing array is shared, not copied. */
+  def apply(a:DenseMat[Float]):FMat = new FMat(a.nrows, a.ncols, a.data) 
+
+  /**
+   * Converts a matrix of another supported type to an FMat.
+   *
+   * DMat, FMat and IMat sources are copied element by element into a new FMat;
+   * an SMat is expanded with `full`; a GMat is converted with `toFMat`.
+   *
+   * @throws RuntimeException ("Unsupported source type") for any other type
+   */
+  def apply(x:Mat):FMat =
+    x match {
+      case dd:DMat => {
+        val converted = FMat(x.nrows, x.ncols)
+        Mat.copyToFloatArray(dd.data, 0, converted.data, 0, dd.length)
+        converted
+      }
+      case ff:FMat => {
+        val converted = FMat(x.nrows, x.ncols)
+        System.arraycopy(ff.data, 0, converted.data, 0, ff.length)
+        converted
+      }
+      case ii:IMat => {
+        val converted = FMat(x.nrows, x.ncols)
+        Mat.copyToFloatArray(ii.data, 0, converted.data, 0, ii.length)
+        converted
+      }
+      case ss:SMat => FMat(ss.full)
+      case gg:GMat => gg.toFMat
+      case _ => throw new RuntimeException("Unsupported source type")
+    }
+  
+  /**
+   * Strided element-wise addition kernel: `c(ci) = a(ai) + b(bi)`.
+   *
+   * The cursors start at `a0`, `b0`, `c0` and advance by `ainc`, `binc`, `cinc`;
+   * the loop runs while the output cursor is below `c0 + n`.
+   *
+   * @return always 0
+   */
+  def vecAdd(a:Array[Float], a0:Int, ainc:Int, b:Array[Float], b0:Int, binc:Int, c:Array[Float], c0:Int, cinc:Int, n:Int):Float = {
+    val stop = c0 + n
+    var ia = a0
+    var ib = b0
+    var ic = c0
+    while (ic < stop) {
+      c(ic) = a(ia) + b(ib)
+      ia += ainc
+      ib += binc
+      ic += cinc
+    }
+    0f
+  }
+  
+  /**
+   * Strided element-wise subtraction kernel: `c(ci) = a(ai) - b(bi)`.
+   *
+   * The cursors start at `a0`, `b0`, `c0` and advance by `ainc`, `binc`, `cinc`;
+   * the loop runs while the output cursor is below `c0 + n`.
+   *
+   * @return always 0
+   */
+  def vecSub(a:Array[Float], a0:Int, ainc:Int, b:Array[Float], b0:Int, binc:Int, c:Array[Float], c0:Int, cinc:Int, n:Int):Float = {
+    val stop = c0 + n
+    var ia = a0
+    var ib = b0
+    var ic = c0
+    while (ic < stop) {
+      c(ic) = a(ia) - b(ib)
+      ia += ainc
+      ib += binc
+      ic += cinc
+    }
+    0f
+  }
+  
+  /**
+   * Strided element-wise multiplication kernel: `c(ci) = a(ai) * b(bi)`.
+   *
+   * The cursors start at `a0`, `b0`, `c0` and advance by `ainc`, `binc`, `cinc`;
+   * the loop runs while the output cursor is below `c0 + n`.
+   *
+   * @return always 0
+   */
+  def vecMul(a:Array[Float], a0:Int, ainc:Int, b:Array[Float], b0:Int, binc:Int, c:Array[Float], c0:Int, cinc:Int, n:Int):Float = {
+    val stop = c0 + n
+    var ia = a0
+    var ib = b0
+    var ic = c0
+    while (ic < stop) {
+      c(ic) = a(ia) * b(ib)
+      ia += ainc
+      ib += binc
+      ic += cinc
+    }
+    0f
+  }
+  
+  /**
+   * Strided element-wise maximum kernel: `c(ci) = math.max(a(ai), b(bi))`.
+   *
+   * The cursors start at `a0`, `b0`, `c0` and advance by `ainc`, `binc`, `cinc`;
+   * the loop runs while the output cursor is below `c0 + n`.
+   *
+   * @return always 0
+   */
+  def vecMax(a:Array[Float], a0:Int, ainc:Int, b:Array[Float], b0:Int, binc:Int, c:Array[Float], c0:Int, cinc:Int, n:Int):Float = {
+    val stop = c0 + n
+    var ia = a0
+    var ib = b0
+    var ic = c0
+    while (ic < stop) {
+      c(ic) = math.max(a(ia), b(ib))
+      ia += ainc
+      ib += binc
+      ic += cinc
+    }
+    0f
+  }
+  
+ /**
+  * Strided element-wise minimum kernel: `c(ci) = math.min(a(ai), b(bi))`.
+  *
+  * The cursors start at `a0`, `b0`, `c0` and advance by `ainc`, `binc`, `cinc`;
+  * the loop runs while the output cursor is below `c0 + n`.
+  *
+  * @return always 0
+  */
+ def vecMin(a:Array[Float], a0:Int, ainc:Int, b:Array[Float], b0:Int, binc:Int, c:Array[Float], c0:Int, cinc:Int, n:Int):Float = {
+    val stop = c0 + n
+    var ia = a0
+    var ib = b0
+    var ic = c0
+    while (ic < stop) {
+      c(ic) = math.min(a(ia), b(ib))
+      ia += ainc
+      ib += binc
+      ic += cinc
+    }
+    0f
+ }
+
+  /** Builds a 1x1 FMat whose single element is `x`. */
+  def felem(x:Float) = {
+    val scalar = FMat(1,1)
+    scalar.data(0) = x
+    scalar
+  }
+  
+  /**
+   * Produces an `nr` x `nc` FMat to receive a result.
+   *
+   * A new matrix is allocated when `outmat` is null or 0x0. Otherwise `outmat` is
+   * used: as is when its shape already matches, or after `recycle` when it does
+   * not. In both reuse cases the value is cast to FMat.
+   */
+  def newOrCheckFMat(nr:Int, nc:Int, outmat:Mat):FMat = {
+    val absent = outmat.asInstanceOf[AnyRef] == null || (outmat.nrows == 0 && outmat.ncols == 0)
+    if (absent) {
+      FMat(nr, nc)
+    } else if (outmat.nrows == nr && outmat.ncols == nc) {
+      outmat.asInstanceOf[FMat]
+    } else {
+      outmat.recycle(nr, nc, 0).asInstanceOf[FMat]
+    }
+  }
+}
+
+
+
+
+
+
diff --git a/src/main/scala/BIDMat/GIMat.scala b/src/main/scala/BIDMat/GIMat.scala
new file mode 100755
index 00000000..cdf1eb18
--- /dev/null
+++ b/src/main/scala/BIDMat/GIMat.scala
@@ -0,0 +1,120 @@
+package BIDMat
+import jcuda._;
+import jcuda.jcublas.JCublas;
+import jcuda.runtime.JCuda;
+import edu.berkeley.bid.CUMAT;
+
+/**
+ * Integer matrix whose elements live behind a JCuda `Pointer`.
+ *
+ * @param nr       number of rows
+ * @param nc       number of columns
+ * @param data     pointer to the element storage
+ * @param realsize number of elements the storage was allocated for; `recycle`
+ *                 compares it against the requested size before reusing `data`
+ */
+class GIMat(nr:Int, nc:Int, val data:Pointer, val realsize:Int) extends Mat(nr, nc) {
+  
+  /** Copies at most the leading 10 rows and 50 columns into an IMat and prints that. */
+  override def toString:String = {
+    val nr = scala.math.min(nrows,10)
+    val nc = scala.math.min(ncols,50)        
+    val tmpMat = IMat(nr, nc)
+    JCublas.cublasGetMatrix(nr, nc, Sizeof.INT, data, nrows, Pointer.to(tmpMat.data), nr)
+    tmpMat.toString
+  }
+
+  override def mytype = "GIMat"
+  
+  /**
+   * Applies the element-wise operation selected by `op` to this matrix and `a`.
+   *
+   * Accepted shapes: identical shapes; equal row counts where one side has a
+   * single column; equal column counts where one side has a single row; or
+   * either side 1x1.
+   *
+   * The output is sized to the larger extent in each dimension so that it can
+   * hold the result for every accepted shape combination, not only for
+   * identically shaped operands.
+   *
+   * @param a      right-hand operand
+   * @param oldmat output container passed to `GIMat.newOrCheckGIMat`; may be null
+   * @param op     operation code handed to `CUMAT.applyiop`
+   * @throws RuntimeException ("dimensions mismatch") for any other shape combination
+   */
+  def GIop(a:GIMat, oldmat:GIMat, op:Int):GIMat = {
+    if ((nrows == a.nrows && ncols == a.ncols) ||
+        (nrows == a.nrows && (a.ncols == 1 || ncols == 1)) ||
+        (ncols == a.ncols && (a.nrows == 1 || nrows == 1)) ||
+        (a.ncols == 1 && a.nrows == 1) ||
+        (ncols == 1 && nrows == 1)) {
+      val out = GIMat.newOrCheckGIMat(scala.math.max(nrows, a.nrows), scala.math.max(ncols, a.ncols), oldmat)
+      Mat.nflops += scala.math.max(length, a.length)
+      CUMAT.applyiop(data, nrows, ncols, a.data, a.nrows, a.ncols, out.data, op)
+      JCuda.cudaDeviceSynchronize()
+      out
+    } else throw new RuntimeException("dimensions mismatch")
+  }
+
+  /** Copies all `nrows*ncols` elements into a newly allocated IMat. */
+  def toIMat():IMat = {
+    val out = IMat(nrows, ncols)
+    JCublas.cublasGetVector(nrows*ncols, Sizeof.INT, data, 1, Pointer.to(out.data), 1);
+    out
+  }
+  
+  /** Releases the storage behind `data` with `cublasFree`. */
+  def free() = {
+    JCublas.cublasFree(data)
+  }
+
+  // Element-wise operators; each allocates its result and selects a GIop code.
+  def + (a : GIMat) = GIop(a, null, 0)
+  def - (a : GIMat) = GIop(a, null, 1)
+  def *@ (a : GIMat) = GIop(a, null, 2)
+  def /@ (a : GIMat) = GIop(a, null, 3)
+  def > (b : GIMat) = GIop(b, null, 4)
+  def < (b : GIMat) = GIop(b, null, 5)
+  def == (b : GIMat) = GIop(b, null, 6)
+  def === (b : GIMat) = GIop(b, null, 6)
+  def >= (b : GIMat) = GIop(b, null, 7)
+  def <= (b : GIMat) = GIop(b, null, 8)
+  def != (b : GIMat) = GIop(b, null, 9)
+  
+  /** Builds a pair whose first component is this matrix and whose second is `b`. */
+  def ~ (b: GIMat) = new GIPair(this, b)
+
+  /**
+   * Returns a GIMat of shape `nr` x `nc`: this matrix if the shape matches, a
+   * new GIMat over the same storage if `realsize` is large enough, otherwise the
+   * current storage is freed and a new matrix is allocated.
+   *
+   * @param nnz not used by this implementation
+   */
+  override def recycle(nr:Int, nc:Int, nnz:Int):GIMat = {
+    if (nrows == nr && nc == ncols) {
+      this
+    } else if (realsize >= nr*nc) {
+      new GIMat(nr, nc, data, realsize)
+    } else {
+      free
+      GIMat(nr, nc)
+    }  
+  }
+}
+
+/**
+ * Pairs an output GIMat with a GIMat operand: every operator calls `mat.GIop`
+ * with `omat` as the output container and the same operation codes that the
+ * GIMat operators use.
+ */
+class GIPair (val omat:GIMat, val mat:GIMat){
+
+    def + (a : GIMat) = mat.GIop(a, omat, 0)
+    def - (a : GIMat) = mat.GIop(a, omat, 1)
+    def *@ (a : GIMat) = mat.GIop(a, omat, 2)
+    def /@ (a : GIMat) = mat.GIop(a, omat, 3)
+    def > (b : GIMat) = mat.GIop(b, omat, 4)
+    def < (b : GIMat) = mat.GIop(b, omat, 5)
+    def == (b : GIMat) = mat.GIop(b, omat, 6)
+    def === (b : GIMat) = mat.GIop(b, omat, 6)
+    def >= (b : GIMat) = mat.GIop(b, omat, 7)
+    def <= (b : GIMat) = mat.GIop(b, omat, 8)
+    def != (b : GIMat) = mat.GIop(b, omat, 9)
+}
+
+
+/** Factory methods for GIMat. */
+object GIMat {
+  
+  /** Allocates storage for `nr*nc` ints with `cublasAlloc` and wraps it as an `nr` x `nc` GIMat. */
+  def apply(nr:Int, nc:Int):GIMat = {
+    val count = nr*nc
+    val fresh = new GIMat(nr, nc, new Pointer(), count)
+    JCublas.cublasAlloc(count, Sizeof.INT, fresh.data)
+    fresh
+  }
+  
+  /** Allocates a GIMat with the shape of `a` and copies the elements of `a` into it. */
+  def apply(a:IMat):GIMat = {
+    val fresh = new GIMat(a.nrows, a.ncols, new Pointer(), a.length)
+    val count = a.nrows*a.ncols
+    JCublas.cublasAlloc(count, Sizeof.INT, fresh.data)
+    JCublas.cublasSetVector(count, Sizeof.INT, Pointer.to(a.data), 1, fresh.data, 1)
+    fresh
+  }
+
+  /**
+   * Produces an `nr` x `nc` GIMat to receive a result: a new allocation when
+   * `oldmat` is null, `oldmat` itself when its shape matches, otherwise the
+   * result of `oldmat.recycle`.
+   */
+  def newOrCheckGIMat(nr:Int, nc:Int, oldmat:GIMat):GIMat = {
+    if (oldmat.asInstanceOf[AnyRef] == null) {
+      GIMat(nr, nc)
+    } else if (oldmat.nrows == nr && oldmat.ncols == nc) {
+      oldmat
+    } else {
+      oldmat.recycle(nr, nc, 0)
+    }
+  }
+}
+
+
+
+
+
+
+
+
diff --git a/src/main/scala/BIDMat/GMat.scala b/src/main/scala/BIDMat/GMat.scala
new file mode 100755
index 00000000..0daab9a8
--- /dev/null
+++ b/src/main/scala/BIDMat/GMat.scala
@@ -0,0 +1,545 @@
+package BIDMat
+import jcuda._
+import jcuda.jcublas._
+import jcuda.jcublas.JCublas._
+import jcuda.runtime.JCuda._
+import jcuda.runtime._
+import scala.actors.Actor._
+import edu.berkeley.bid.CUMAT
+
+
+class GMat(nr:Int, nc:Int, val data:Pointer, val realsize:Int) extends Mat(nr, nc) {
+  
+  override def dv:Double = {
+    // anything larger than 1x1 cannot be read as a scalar
+    val tooBig = nrows > 1 || ncols > 1
+    if (tooBig) throw new RuntimeException("Matrix should be 1x1 to extract value")
+    toFMat.data(0)  // copy to the host and read the first element
+  }
+  
+  override def mytype = "GMat"
+    
+  override def nnz = length
+  
+  override def clear = {
+  	cudaMemset(data, 0, Sizeof.FLOAT*length)
+  	cudaDeviceSynchronize
+  	this    
+  }
+  
+  override def t = {
+    val out = GMat(ncols, nrows)
+    CUMAT.transpose(this.data, nrows, out.data, ncols, nrows, ncols)
+    cudaDeviceSynchronize()
+    out
+  }
+  
+  override def set(v:Float):GMat = {  // intended to fill this device matrix with v; returns this
+    val a = MatFunctions.row(v)  // host-side matrix built from the single value v
+    JCublas.cublasSetVector(length, Sizeof.FLOAT, Pointer.to(a.data), 0, data, 1);  // NOTE(review): host stride is 0 -- confirm cuBLAS accepts a non-positive incx here
+    cudaDeviceSynchronize()
+    this
+  }
+  
+  
+  override def toString:String = {
+    val nr = scala.math.min(nrows,10)
+    val nc = scala.math.min(ncols,50)        
+    val tmpMat = FMat(nr, nc)
+    cublasGetMatrix(nr, nc, Sizeof.FLOAT, data, nrows, Pointer.to(tmpMat.data), nr)
+    cudaDeviceSynchronize()
+    tmpMat.toString
+  }
+  
+  override def zeros(nr:Int, nc:Int) = GMat.gzeros(nr, nc)
+  
+  override def ones(nt:Int, nc:Int) = GMat.gones(nt, nc)  // nt x nc GPU matrix of ones; rows come from the nt argument, not this matrix's own nr
+
+  // Dense product this * a, or scalar scaling when either operand is 1x1; the result goes into oldmat when it can be reused.
+  def GMult(a:GMat, oldmat:Mat):GMat = {
+    if (ncols == a.nrows) {
+      val out = GMat.newOrCheckGMat(nrows, a.ncols, oldmat)
+      Mat.nflops += 2L * length * a.ncols
+      cublasSgemm('n', 'n', nrows, a.ncols, ncols, 1.0f, data, nrows, a.data, a.nrows, 0f, out.data, nrows)
+      cudaDeviceSynchronize()
+      val ee = cublasGetError  // read once: a second call would report the already-reset status
+      if (ee != 0) {
+        println("device is %d" format SciFunctions.device)
+        throw new RuntimeException("Cublas error in * "+ee)
+      }
+      out
+    } else if (ncols == 1 && nrows == 1) {
+      val out = GMat.newOrCheckGMat(a.nrows, a.ncols, oldmat).clear  // zeroed, so the saxpy below leaves this.dv * a
+      Mat.nflops += 1L * a.length
+      cublasSaxpy(a.length, this.dv.asInstanceOf[Float], a.data, 1, out.data, 1)
+      cudaDeviceSynchronize()
+      out
+    } else if (a.ncols == 1 && a.nrows == 1) {
+      val out = GMat.newOrCheckGMat(nrows, ncols, oldmat).clear  // zeroed, so the saxpy below leaves a.dv * this
+      Mat.nflops += 1L * length
+      cublasSaxpy(length, a.dv.asInstanceOf[Float], data, 1, out.data, 1)
+      cudaDeviceSynchronize()
+      out
+    } else throw new RuntimeException("dimensions mismatch")
+  }
+  
+  def GMultT(a:GMat, oldmat:Mat):GMat = {
+    if (ncols == a.ncols) {
+      val out = GMat.newOrCheckGMat(nrows, a.nrows, oldmat)
+      Mat.nflops += 2L * length * a.nrows
+      cublasSgemm('n', 't', nrows, a.nrows, ncols, 1.0f, data, nrows, a.data, a.nrows, 0f, out.data, nrows)
+      cudaDeviceSynchronize()
+      val ee = cublasGetError
+      if (ee != 0) {
+        println("device is %d" format SciFunctions.device)
+        throw new RuntimeException("Cublas error in xT "+ee)
+      }
+      out
+    } else throw new RuntimeException("dimensions mismatch")
+  }
+  
+  def GSMult(a:GSMat, oldmat:Mat):GMat = {
+    if (ncols == a.nrows) {
+      val out = GMat.newOrCheckGMat(nrows, a.ncols, oldmat)
+      Mat.nflops += 2L * nrows * a.nnz
+      out.clear
+      CUMAT.dsmult(nrows, ncols, a.nnz, data, a.data, a.ir, a.ic, out.data)
+      cudaDeviceSynchronize()
+      out
+    }	else throw new RuntimeException("dimensions mismatch")
+  }
+  
+  def GSMultT(a:GSMat, oldmat:Mat):GMat = {
+    if (ncols == a.ncols) {
+      val out = GMat.newOrCheckGMat(nrows, a.nrows, oldmat)
+      Mat.nflops += 2L * nrows * a.nnz
+      out.clear
+      CUMAT.dsmultT(nrows, ncols, a.nnz, data, a.data, a.ir, a.ic, out.data)
+      cudaDeviceSynchronize()
+      out
+    }	else throw new RuntimeException("dimensions mismatch")
+  }
+  
+  def gOp(a:GMat, oldmat:Mat, op:Int):GMat = {
+    if ((nrows == a.nrows && ncols == a.ncols) ||
+        (nrows == a.nrows && (a.ncols == 1 || ncols == 1)) ||
+        (ncols == a.ncols && (a.nrows == 1 || nrows == 1)) ||
+        (a.ncols == 1 && a.nrows == 1) ||
+        (ncols == 1 && nrows == 1)) {
+    	val out = GMat.newOrCheckGMat(math.max(nrows, a.nrows), math.max(ncols, a.ncols), oldmat)
+      Mat.nflops += scala.math.max(length, a.length)
+      CUMAT.applyop(data, nrows, ncols, a.data, a.nrows, a.ncols, out.data, op)
+      cudaDeviceSynchronize()
+      out
+    }	else throw new RuntimeException("dimensions mismatch")
+  }
+  
+  def dot (a : GMat):Double = 
+  	if (nrows != a.nrows || ncols != a.ncols) {
+  		throw new RuntimeException("dot dims not compatible")
+  	} else {
+  	  cublasSdot(length, data, 1, a.data, 1)
+  	}
+  
+  override def dot (a : Mat):Double = 
+    if (nrows != a.nrows || ncols != a.ncols) {
+      throw new RuntimeException("dot dims not compatible")
+    } else a match {
+      // only another GMat is supported; other operand types are reported explicitly rather than as a bare MatchError
+      case aa:GMat => cublasSdot(length, data, 1, aa.data, 1)
+      case _ => throw new RuntimeException("unsupported operand type "+a.mytype+" for GMat dot")
+    }
+  
+  def reduceOp(oldmat:Mat, dir:Int, op:Int):GMat = {
+    // dir 1 yields a 1 x ncols result via reduce1op, dir 2 an nrows x 1 result via reduce2op;
+    // dir 0 behaves like 1 when there is more than one row and like 2 otherwise
+    val firstDim = dir == 1 || (dir == 0 && nrows > 1)
+    val secondDim = !firstDim && (dir == 2 || dir == 0)
+    if (!firstDim && !secondDim) {
+      throw new RuntimeException("dimension must be 1 or 2")
+    }
+    val result = if (firstDim) GMat.newOrCheckGMat(1, ncols, oldmat) else GMat.newOrCheckGMat(nrows, 1, oldmat)
+    result.clear
+    if (firstDim) {
+      CUMAT.reduce1op(nrows, ncols, data, result.data, op)
+    } else {
+      CUMAT.reduce2op(nrows, ncols, data, result.data, op)
+    }
+    Mat.nflops += length
+    cudaDeviceSynchronize()
+    result
+  }
+
+  def toFMat():FMat = {
+    val out = FMat(nrows, ncols)
+    cublasGetVector(nrows*ncols, Sizeof.FLOAT, data, 1, Pointer.to(out.data), 1)
+    cudaDeviceSynchronize()
+    out
+  }
+  
+  def copyTo(out:FMat):FMat = {
+  		val a = out.recycle(nrows, ncols, 0)
+  		cublasGetVector(nrows*ncols, Sizeof.FLOAT, data, 1, Pointer.to(a.data), 1)
+  		cudaDeviceSynchronize()
+  		a
+  }
+  
+  def copyFrom(in:FMat):GMat = {
+  		cublasSetVector(nrows*ncols, Sizeof.FLOAT, Pointer.to(in.data), 1, data, 1)
+  		cudaDeviceSynchronize()
+  		this
+  }
+  
+  def copyTo(out:GMat):GMat = {
+    val a = out.recycle(nrows, ncols, 0)
+    cudaMemcpy(a.data, data, length*Sizeof.FLOAT, cudaMemcpyKind.cudaMemcpyDeviceToDevice)
+    cudaDeviceSynchronize()
+    a
+  }
+  
+  override def copyTo(out:Mat):Mat = out match {
+    // dispatch on the destination: host FMat or device GMat
+    case a:FMat => copyTo(a)
+    case a:GMat => copyTo(a)
+    case _ => throw new RuntimeException("unsupported destination matrix type in GMat copyTo")
+  }
+  
+  def free() = {
+    JCublas.cublasFree(data)
+  }
+
+  import GMat.BinOp._
+  def * (a : GMat) = GMult(a, null)
+  def * (a : GSMat) = GSMult(a, null)
+  def xT (a : GMat) = GMultT(a, null)
+  def xT (a : GSMat) = GSMultT(a, null)
+  def + (a : GMat) = gOp(a, null, op_add)
+  def - (a : GMat) = gOp(a, null, op_sub)
+  def *@ (a : GMat) = gOp(a, null, op_mul)
+  def /@ (a : GMat) = gOp(a, null, op_div)
+  
+  def > (b : GMat) = gOp(b, null, op_gt)
+  def < (b : GMat) = gOp(b, null, op_lt)
+  def == (b : GMat) = gOp(b, null, op_eq)
+  def === (b : GMat) = gOp(b, null, op_eq)
+  def >= (b : GMat) = gOp(b, null, op_ge)
+  def <= (b : GMat) = gOp(b, null, op_le)
+  def != (b : GMat) = gOp(b, null, op_ne)
+  
+  override def +  (b : Float):Mat = gOp(GMat(b), null, op_add)
+  override def -  (b : Float):Mat = gOp(GMat(b), null, op_sub)
+  override def *@  (b : Float):Mat = gOp(GMat(b), null, op_mul)
+  override def /@  (b : Float):Mat = gOp(GMat(b), null, op_div)
+  
+  override def > (b : Float) = gOp(GMat(b), null, op_gt)
+  override def < (b : Float) = gOp(GMat(b), null, op_lt)
+  override def == (b : Float) = gOp(GMat(b), null, op_eq)
+  override def === (b : Float) = gOp(GMat(b), null, op_eq)
+  override def >= (b : Float) = gOp(GMat(b), null, op_ge)
+  override def <= (b : Float) = gOp(GMat(b), null, op_le)
+  override def != (b : Float) = gOp(GMat(b), null, op_ne)
+
+  def ~ (b: GMat) = new GPair(this, b)
+  def ~ (b: GSMat) = new GSPair(this, b)
+  override def ~ (b: Mat):Pair = b match {
+    case bb:GMat => new GPair(this, bb)
+    case bb:GSMat => new GSPair(this, bb)
+  }
+  
+  import Operator._
+  override def +  (b : Mat):Mat = applyMat(this, b, null, Mop_Plus)
+  override def -  (b : Mat):Mat = applyMat(this, b, null, Mop_Minus)
+  override def *  (b : Mat):Mat = applyMat(this, b, null, Mop_Times)
+  override def *  (b : Float):Mat = applyMat(this, GMat(FMat.felem(b)), null, Mop_Times)
+  override def *  (b : Int):Mat = applyMat(this, GMat(FMat.felem(b)), null, Mop_Times)
+  override def *  (b : Double):Mat = applyMat(this, GMat(FMat.felem(b.asInstanceOf[Float])), null, Mop_Times)
+  override def xT  (b : Mat) = b match {
+    case bb:GSMat => GSMultT(bb, null)
+    case bb:GMat => GMultT(bb, null)
+    }
+  override def /  (b : Mat):Mat = applyMat(this, b, null, Mop_Div)
+  override def \\ (b : Mat):Mat = applyMat(this, b, null, Mop_RSolve)
+  override def *@ (b : Mat):Mat = applyMat(this, b, null, Mop_ETimes)
+  override def /@ (b : Mat):Mat = applyMat(this, b, null, Mop_EDiv)
+  override def \  (b : Mat):Mat = applyMat(this, b, null, Mop_HCat)
+  override def on (b : Mat):Mat = applyMat(this, b, null, Mop_VCat)
+  
+  override def >   (b : Mat):Mat = applyMat(this, b, null, Mop_GT)
+  override def <   (b : Mat):Mat = applyMat(this, b, null, Mop_LT)
+  override def >=  (b : Mat):Mat = applyMat(this, b, null, Mop_GE)
+  override def <=  (b : Mat):Mat = applyMat(this, b, null, Mop_LE)
+  override def ==  (b : Mat):Mat = applyMat(this, b, null, Mop_EQ)
+  override def === (b : Mat):Mat = applyMat(this, b, null, Mop_EQ) 
+  override def !=  (b : Mat):Mat = applyMat(this, b, null, Mop_NE)
+  
+  override def recycle(nr:Int, nc:Int, nnz:Int):GMat = {
+    // nnz is not consulted here; only the requested dense shape nr x nc matters
+    val sameShape = nr == nrows && nc == ncols
+    if (sameShape) this
+    else if (nr*nc <= realsize) new GMat(nr, nc, data, realsize)  // new header over the same device buffer
+    else {
+      free            // buffer too small: release it before allocating a fresh matrix
+      GMat(nr, nc)
+    }
+  }
+}
+
+class GPair(val omat:Mat, val mat:GMat) extends Pair{
+	import GMat.BinOp._
+	
+	override def t = {  // transpose of mat, written into omat when newOrCheckGMat can reuse it
+    val out = GMat.newOrCheckGMat(mat.ncols, mat.nrows, omat)
+    CUMAT.transpose(mat.data, mat.nrows, out.data, mat.ncols, mat.nrows, mat.ncols)
+    cudaDeviceSynchronize()  // wait for the transpose, as GMat.t does after the same call
+    out
+  }
+	def + (a : GMat) = mat.gOp(a, omat, op_add)
+	def - (a : GMat) = mat.gOp(a, omat, op_sub)
+	def *@ (a : GMat) = mat.gOp(a, omat, op_mul)
+	def /@ (a : GMat) = mat.gOp(a, omat, op_div)
+	def > (b : GMat) = mat.gOp(b, omat, op_gt)
+	def < (b : GMat) = mat.gOp(b, omat, op_lt)
+	def == (b : GMat) = mat.gOp(b, omat, op_eq)
+	def === (b : GMat) = mat.gOp(b, omat, op_eq)
+	def >= (b : GMat) = mat.gOp(b, omat, op_ge)
+	def <= (b : GMat) = mat.gOp(b, omat, op_le)
+	def != (b : GMat) = mat.gOp(b, omat, op_ne)
+	
+	override def +  (b : Float):Mat = mat.gOp(GMat(b), omat, op_add)
+  override def -  (b : Float):Mat = mat.gOp(GMat(b), omat, op_sub)
+  override def *@  (b : Float):Mat = mat.gOp(GMat(b), omat, op_mul)
+  override def *  (b : Float):Mat = mat.gOp(GMat(b), omat, op_mul)
+  override def /@  (b : Float):Mat = mat.gOp(GMat(b), omat, op_div)
+  
+  override def > (b : Float) = mat.gOp(GMat(b), omat, op_gt)
+  override def < (b : Float) = mat.gOp(GMat(b), omat, op_lt)
+  override def == (b : Float) = mat.gOp(GMat(b), omat, op_eq)
+  override def === (b : Float) = mat.gOp(GMat(b), omat, op_eq)
+  override def >= (b : Float) = mat.gOp(GMat(b), omat, op_ge)
+  override def <= (b : Float) = mat.gOp(GMat(b), omat, op_le)
+  override def != (b : Float) = mat.gOp(GMat(b), omat, op_ne)
+
+	def * (a : GMat) = mat.GMult(a, omat)
+	def * (a : GSMat) = mat.GSMult(a, omat)
+
+	override def * (b: Mat):Mat = b match {
+	case bb:GMat => mat.GMult(bb, omat)
+	case bb:GSMat => mat.GSMult(bb, omat)
+	}
+
+	def xT (a : GSMat) = mat.GSMultT(a, omat)
+	def xT (a : GMat) = mat.GMultT(a, omat)
+	override def xT (b: Mat):Mat = b match {
+	case bb:GSMat => mat.GSMultT(bb, omat)
+	case bb:GMat => mat.GMultT(bb, omat)
+	}
+    
+  import Operator._
+  override def +  (b : Mat):Mat = applyMat(mat, b, omat, Mop_Plus)
+  override def -  (b : Mat):Mat = applyMat(mat, b, omat, Mop_Minus)
+  override def /  (b : Mat):Mat = applyMat(mat, b, omat, Mop_Div)
+  override def \\ (b : Mat):Mat = applyMat(mat, b, omat, Mop_RSolve)
+  override def *@ (b : Mat):Mat = applyMat(mat, b, omat, Mop_ETimes)
+  override def /@ (b : Mat):Mat = applyMat(mat, b, omat, Mop_EDiv)
+  override def \  (b : Mat):Mat = applyMat(mat, b, omat, Mop_HCat)
+  override def on (b : Mat):Mat = applyMat(mat, b, omat, Mop_VCat)
+  
+  override def >   (b : Mat):Mat = applyMat(mat, b, omat, Mop_GT)
+  override def <   (b : Mat):Mat = applyMat(mat, b, omat, Mop_LT)
+  override def >=  (b : Mat):Mat = applyMat(mat, b, omat, Mop_GE)
+  override def <=  (b : Mat):Mat = applyMat(mat, b, omat, Mop_LE)
+  override def ==  (b : Mat):Mat = applyMat(mat, b, omat, Mop_EQ)
+  override def === (b : Mat):Mat = applyMat(mat, b, omat, Mop_EQ) 
+  override def !=  (b : Mat):Mat = applyMat(mat, b, omat, Mop_NE)
+  
+}
+
+
+object GMat {
+  
+  object BinOp {
+  	val op_add=0
+  	val op_sub=1
+	  val op_mul=2
+	  val op_div=3
+	  val op_gt=4
+	  val op_lt=5
+	  val op_eq=6
+	  val op_ge=7
+	  val op_le=8
+	  val op_ne=9
+	  val op_max=10
+	  val op_min=11
+  }  
+  
+  object TransF {
+    val abs=0
+    val exp=1 
+    val expm1=2
+    val sqrt=3
+    val ln=4
+    val log10=5 
+    val log1p=6
+    val cos=7
+    val sin=8
+    val tan=9
+    val cosh=10 
+    val sinh=11 
+    val tanh=12 
+    val acos=13 
+    val asin=14
+    val atan=15 
+    val acosh=16 
+    val asinh=17 
+    val atanh=18
+    val erf=19
+    val erfinv=20 
+    val erfc=21
+    val erfcinv=22 
+    val gammaln=23
+    val gamma=24
+    val ceil=25
+    val floor=26
+    val round=27
+    val trunc=28
+    val sign=29
+    val exppsi=34
+  }
+  
+  object TransF2 {
+    val atan2=0
+    val pow=1 
+  }  
+  
+  def gzeros(nr:Int, nc:Int) = {
+    val out = GMat(nr, nc)
+    cudaMemset(out.data, 0, Sizeof.FLOAT*out.length)
+    cudaDeviceSynchronize()
+    out
+  }
+  
+  def gones(nr:Int, nc:Int) = {  // nr x nc device matrix with every element copied from a 1x1 matrix holding 1
+    val out = GMat(nr, nc)
+    val one = GMat(FMat.felem(1))
+    cublasScopy(out.length, one.data, 0, out.data, 1)  // source increment 0: every output element comes from the same source element
+    cudaDeviceSynchronize()
+    one.free()  // the 1x1 source is a device allocation of its own and was previously never released
+    out
+  }
+  def apply(nr:Int, nc:Int):GMat = {
+//  	println("nr, nc = %d,%d" format (nr,nc))
+    val retv = new GMat(nr, nc, new Pointer(), nr*nc)        
+    val status = cublasAlloc(nr*nc, Sizeof.FLOAT, retv.data)
+    if (status != cublasStatus.CUBLAS_STATUS_SUCCESS) throw new RuntimeException("CUDA alloc failed "+status)
+    retv        
+  }
+
+  def toFMat(a:GMat):FMat = a.toFMat()     
+  
+  def apply(a:FMat):GMat = {
+  	val rsize = a.nrows*a.ncols
+    val retv = GMat(a.nrows, a.ncols)
+    JCublas.cublasSetVector(rsize, Sizeof.FLOAT, Pointer.to(a.data), 1, retv.data, 1);
+  	cudaDeviceSynchronize()
+    retv
+  }
+  
+  def apply(a:Mat):GMat = a match {
+    case aa:GMat => aa
+    case aa:FMat => GMat(aa)
+    case aa:DMat => GMat(FMat(aa))
+  }
+  
+  def apply(a:Float):GMat = {
+    GMat(FMat.felem(a))
+  }
+  
+  def fromFMat(a:FMat, b:GMat):GMat = {
+    val bb = b.recycle(a.nrows, a.ncols, 0)
+    JCublas.cublasSetVector(a.length, Sizeof.FLOAT, Pointer.to(a.data), 1, bb.data, 1)
+    cudaDeviceSynchronize()
+    bb
+  }
+
+  def DDS(A:GMat, B:GMat, C:GSMat, oldmat:Mat):GSMat = {
+    if (A.nrows != B.nrows || C.nrows != A.ncols || C.ncols != B.ncols) {
+      throw new RuntimeException("dimensions mismatch")
+    }
+    val out = GSMat.newOrCheckGSMat(C, oldmat)
+    cudaMemcpy(out.ir, C.ir, Sizeof.INT * C.nnz, cudaMemcpyKind.cudaMemcpyDeviceToDevice)
+    cudaMemcpy(out.ic, C.ic, Sizeof.INT * C.nnz, cudaMemcpyKind.cudaMemcpyDeviceToDevice)
+    CUMAT.dds(A.nrows, C.nnz, A.data, B.data, C.ir, C.ic, out.data)
+    cudaDeviceSynchronize()
+    Mat.nflops += 2L * C.nnz * A.nrows
+    out    
+  }
+  
+  // Computes a * b for host matrices by splitting b's columns across Mat.hasCUDA worker actors, one device each.
+  def GPUmult(a:FMat, b:FMat, omat:Mat):FMat = 
+    if (a.ncols != b.nrows) {
+      throw new RuntimeException("dimensions mismatch in xG")
+    } else {
+      val out = FMat.newOrCheckFMat(a.nrows, b.ncols, omat)
+      val nthreads = Mat.hasCUDA
+      val done = IMat(nthreads,1)
+      val errs = new Array[Throwable](nthreads)  // failure of each worker, rethrown by the caller below
+      val nncols = b.ncols/nthreads
+      for (i <- 0 until nthreads) {
+        actor {
+          // the last worker also takes the b.ncols % nthreads columns that the integer division leaves over
+          val mycols = if (i == nthreads-1) b.ncols - i*nncols else nncols
+          var owned = List[Pointer]()  // device buffers allocated so far, most recent first
+          def check(status:Int, what:String) = if (status != cublasStatus.CUBLAS_STATUS_SUCCESS) throw new RuntimeException(what+status)
+          def alloc(n:Int):Pointer = { val p = new Pointer; check(cublasAlloc(n, Sizeof.FLOAT, p), "CUDA alloc failed "); owned = p :: owned; p }
+          try {
+            if (SciFunctions.device(i) != 0) throw new RuntimeException("Couldnt set device "+i)
+            if (mycols > 0) {
+              val aa = alloc(a.nrows*a.ncols)
+              val bb = alloc(b.nrows*mycols)
+              val cc = alloc(a.nrows*mycols)
+              check(cublasSetVector(a.nrows*a.ncols, Sizeof.FLOAT, Pointer.to(a.data), 1, aa, 1), "CUDA copy a failed ")
+              cudaDeviceSynchronize
+              check(cublasSetVector(b.nrows*mycols, Sizeof.FLOAT, Pointer.to(b.data).withByteOffset(1L*Sizeof.FLOAT*i*b.nrows*nncols), 1, bb, 1), "CUDA copy b failed ")
+              cudaDeviceSynchronize
+              cublasSgemm('n', 'n', a.nrows, mycols, a.ncols, 1.0f, aa, a.nrows, bb, b.nrows, 0f, cc, a.nrows)
+              cudaDeviceSynchronize
+              check(cublasGetError, "Cublas error in xG, sgemm ")
+              check(cublasGetVector(a.nrows*mycols, Sizeof.FLOAT, cc, 1, Pointer.to(out.data).withByteOffset(1L*Sizeof.FLOAT*i*a.nrows*nncols), 1), "CUDA copy c failed ")
+              cudaDeviceSynchronize
+            }
+          } catch {
+            case e:Exception => errs(i) = e
+          } finally {
+            owned.foreach(p => cublasFree(p))
+            done(i) = 1  // always signal completion: a failing worker used to leave the caller spinning forever
+          }
+        }
+      }
+      // spin until every worker has reported in, then surface the first recorded failure
+      while (SciFunctions.sum(done,1).dv < nthreads) {Thread.`yield`};
+      errs.find(_ != null).foreach(e => throw new RuntimeException("GPUmult worker failed: "+e, e))
+      Mat.nflops += 2L * a.nrows * a.ncols * b.ncols
+      out
+    }
+
+  // Returns an nr x nc GMat to hold a result: a new one when outmat is null or 0x0, otherwise outmat
+  // itself, or whatever its recycle yields when the shape differs.
+  def newOrCheckGMat(nr:Int, nc:Int, outmat:Mat):GMat = {
+    if (outmat.asInstanceOf[AnyRef] == null || (outmat.nrows == 0 && outmat.ncols == 0)) {
+      GMat(nr, nc)
+    } else {
+      outmat match {
+        case omat:GMat => if (omat.nrows != nr || omat.ncols != nc) omat.recycle(nr, nc, 0) else omat
+        // a non-GMat output used to surface as a bare MatchError
+        case _ => throw new RuntimeException("output matrix must be a GMat but is a "+outmat.mytype)
+      }
+    }
+  }
+}
+
+
+
+
+
+
+
diff --git a/src/main/scala/BIDMat/GSMat.scala b/src/main/scala/BIDMat/GSMat.scala
new file mode 100755
index 00000000..dffa3cb4
--- /dev/null
+++ b/src/main/scala/BIDMat/GSMat.scala
@@ -0,0 +1,120 @@
+package BIDMat
+import jcuda._
+import jcuda.jcublas.JCublas
+import jcuda.runtime.JCuda
+import jcuda.runtime._
+import edu.berkeley.bid.CUMAT
+
+case class GSMat(nr:Int, nc:Int, val nnz0:Int, val ir:Pointer, val ic:Pointer, val data:Pointer, val realnnz:Int) extends Mat(nr, nc) {
+	
+  def getdata() = data;	
+
+  override def mytype = "GSMat"
+    
+  override def nnz = nnz0
+  
+  override def contents:GMat = new GMat(nnz, 1, data, realnnz)
+    
+  override def toString:String = {  // renders at most the first 12 stored entries through a host-side SMat
+    val nnz0 = scala.math.min(nnz,12)       // number of entries copied back from the device
+    val tmpMat = SMat(nnz0, nnz0, nnz0)
+    val tmpcols = new Array[Int](nnz0)
+    JCublas.cublasGetVector(nnz0, Sizeof.INT, ir, 1, Pointer.to(tmpMat.ir), 1)
+    JCublas.cublasGetVector(nnz0, Sizeof.FLOAT, data, 1, Pointer.to(tmpMat.data), 1)
+    JCublas.cublasGetVector(nnz0, Sizeof.INT, ic, 1, Pointer.to(tmpcols), 1)
+    SparseMat.compressInds(tmpcols, math.min(ncols, tmpcols(nnz0-1)+1), tmpMat.jc, nnz0)  // NOTE(review): tmpcols(nnz0-1) indexes -1 when nnz is 0 -- confirm empty matrices never reach here
+    if (Mat.ioneBased == 1) {
+      SparseMat.incInds(tmpMat.ir, tmpMat.ir)  // presumably shifts row indices to the one-based convention -- TODO confirm
+    }
+    tmpMat.toString
+  }
+      
+  def toSMat():SMat = { 
+    val out = SMat(nrows, ncols, nnz)
+    val tmpcols = new Array[Int](nnz)
+    JCublas.cublasGetVector(nnz, Sizeof.INT, ir, 1, Pointer.to(out.ir), 1)
+    JCublas.cublasGetVector(nnz, Sizeof.FLOAT, data, 1, Pointer.to(out.data), 1)
+    JCublas.cublasGetVector(nnz, Sizeof.INT, ic, 1, Pointer.to(tmpcols), 1)
+    SparseMat.compressInds(tmpcols, ncols, out.jc, nnz)
+    if (Mat.ioneBased == 1) {
+      SparseMat.incInds(out.ir, out.ir)
+    }
+    out
+  }
+  
+  def free() = {
+    JCublas.cublasFree(data)
+    JCublas.cublasFree(ic)
+    JCublas.cublasFree(ir)
+  }
+  
+  override def recycle(nr:Int, nc:Int, nnz:Int):GSMat = {
+    if (realnnz >= nnz) {  
+      new GSMat(nr, nc, nnz, ir, ic, data, realnnz)
+    } else {
+      free
+      GSMat(nr, nc, nnz)
+    }
+  }
+}
+
+class GSPair (val omat:GMat, val mat:GSMat) extends Pair {
+
+}
+
+object GSMat {
+
+  // Allocates device storage for an nr x nc sparse matrix with room for nnz0 entries; contents are uninitialized.
+  def apply(nr:Int, nc:Int, nnz0:Int):GSMat = { 
+//  		println("nr, nc, nnz = %d,%d,%d" format (nr,nc,nnz0))
+    val out = new GSMat(nr, nc, nnz0, new Pointer(), new Pointer(), new Pointer(), nnz0) 
+    checkAlloc(JCublas.cublasAlloc(out.nnz, Sizeof.INT, out.ir))
+    checkAlloc(JCublas.cublasAlloc(out.nnz, Sizeof.INT, out.ic))
+    checkAlloc(JCublas.cublasAlloc(out.nnz, Sizeof.FLOAT, out.data))
+    out
+  }
+  
+  // Throws unless a cublasAlloc status reports success; these statuses were previously ignored.
+  private def checkAlloc(status:Int) {
+    if (status != jcuda.jcublas.cublasStatus.CUBLAS_STATUS_SUCCESS) throw new RuntimeException("CUDA alloc failed "+status)
+  }
+
+  // Copies a's values, row indices and per-entry column indices (from uncompressInds) into out's
+  // device buffers and returns out. Shared by apply(SMat) and fromSMat, which used to duplicate it.
+  private def upload(a:SMat, out:GSMat):GSMat = {
+    JCublas.cublasSetVector(a.nnz, Sizeof.FLOAT, Pointer.to(a.data), 1, out.data, 1)
+    if (Mat.ioneBased == 1) {
+      JCublas.cublasSetVector(a.nnz, Sizeof.INT, Pointer.to(SparseMat.decInds(a.ir)), 1, out.ir, 1)
+    } else {
+      JCublas.cublasSetVector(a.nnz, Sizeof.INT, Pointer.to(a.ir), 1, out.ir, 1)
+    }
+    JCublas.cublasSetVector(a.nnz, Sizeof.INT, Pointer.to(SparseMat.uncompressInds(a.jc, a.ir)), 1, out.ic, 1)
+    out
+  }
+
+  // Uploads a into a newly allocated device matrix.
+  def apply(a:SMat):GSMat = upload(a, GSMat(a.nrows, a.ncols, a.nnz))
+ 
+  // Uploads a into whatever b.recycle returns for a's shape and nnz.
+  def fromSMat(a:SMat, b:GSMat):GSMat = upload(a, b.recycle(a.nrows, a.ncols, a.nnz))
+
+  // Returns a GSMat with mat's shape and nnz: new when oldmat is null or 0x0, else oldmat or its recycle result.
+  def newOrCheckGSMat(mat:GSMat, oldmat:Mat):GSMat = {
+    if (oldmat.asInstanceOf[AnyRef] == null || (oldmat.nrows ==0 && oldmat.ncols == 0)) {
+      GSMat(mat.nrows, mat.ncols, mat.nnz)
+    } else {
+      oldmat match {
+        case omat:GSMat =>
+          if (oldmat.nrows == mat.nrows && oldmat.ncols == mat.ncols && oldmat.nnz == mat.nnz) omat else omat.recycle(mat.nrows, mat.ncols, mat.nnz)
+        case _ => throw new RuntimeException("output matrix must be a GSMat but is a "+oldmat.mytype)
+      }
+    }
+  }
+}
+  
+
+
+
+
+
+
diff --git a/src/main/scala/BIDMat/HMat.scala b/src/main/scala/BIDMat/HMat.scala
new file mode 100755
index 00000000..1b604107
--- /dev/null
+++ b/src/main/scala/BIDMat/HMat.scala
@@ -0,0 +1,344 @@
+package BIDMat
+
+import java.io._
+import java.util.zip._
+import scala.util.matching.Regex
+import Regex._
+import scala.collection.mutable._
+import scala.actors._
+import scala.actors.Actor._
+import MatFunctions._
+import MatHDF5._
+import edu.berkeley.bid.UTILS._
+
+case class HMat(nr:Int, nc:Int, fileList:List[String], varname:String, blkinds:Array[Int], catdim:Int) extends Mat(nr, nc) {
+
+  var fnameCache:String = null
+
+  var fmatCache:Mat = null
+  
+  override def mytype = "HMat"
+  
+// Implement slicing from a hard disk matrix
+  override def apply(a:IMat, b:IMat):Mat = { 
+    var ilast:Int = 0
+    // Block k with blkinds(k) <= ind < blkinds(k+1), or -1 if there is none; ilast is kept between calls as the scan start
+    def findindx(ind:Int):Int = {
+      while (ilast > 0 && ind < blkinds(ilast)) ilast -= 1
+      while (ilast+2 < blkinds.length && ind >= blkinds(ilast+1)) ilast += 1
+      if (ilast+1 < blkinds.length && blkinds(ilast) <= ind && blkinds(ilast+1) > ind) ilast else -1
+    }
+    val locs = IMat(1,b.length)
+    var i = 0
+    var iblk = 0                      // start of the current run of indices that fall in the same block
+    var out:Mat = null
+    while (i <= b.length) {
+      if (i < b.length) locs(i) = findindx(b(i))
+      // flush the run [iblk, i) at the end of b or when element i belongs to a different block
+      if (i > iblk && (i == b.length || locs(i) != locs(iblk))) {
+        if (locs(iblk) < 0) throw new RuntimeException("index "+b(iblk)+" is outside every block")
+        if (fnameCache == null || fileList(locs(iblk)) != fnameCache) {
+          fmatCache = MatHDF5.hload(fileList(locs(iblk)), varname).asInstanceOf[Mat] 
+          fnameCache = fileList(locs(iblk))
+        }
+        // NOTE(review): b holds indices into the concatenated matrix but is applied to one file's matrix as is;
+        // an offset of blkinds(locs(iblk)) probably needs subtracting -- confirm
+        val newmat = fmatCache(a, b(MatFunctions.irow(iblk->i)))
+        if (out.asInstanceOf[AnyRef] != null) {
+          out = out \ newmat
+        } else {
+          out = newmat
+        }
+        iblk = i                      // element i opens the next run (it was skipped when this was i + 1)
+      }
+      i += 1
+    }
+    out
+  }
+}
+
+object HMat {
+  
+  def readSomeInts(din:InputStream, a:Array[Int], buf:Array[Byte], n:Int) {  // reads 4*n bytes from din into a, staging each chunk in buf
+    var nread = 0
+    while (nread < 4*n) {
+      val readnow = din.read(buf, 0, math.min(buf.length, 4*n-nread))
+      if (readnow < 0) throw new EOFException("stream ended after "+nread+" of "+(4*n)+" bytes")  // read returns -1 at end of stream
+      memcpybi(readnow, buf, 0, a, nread)
+      nread += readnow
+    }
+  }
+  def readSomeFloats(din:InputStream, a:Array[Float], buf:Array[Byte], n:Int) {  // reads 4*n bytes from din into a, staging each chunk in buf
+    var nread = 0
+    while (nread < 4*n) {
+      val readnow = din.read(buf, 0, math.min(buf.length, 4*n-nread))
+      if (readnow < 0) throw new EOFException("stream ended after "+nread+" of "+(4*n)+" bytes")  // read returns -1 at end of stream
+      memcpybf(readnow, buf, 0, a, nread)
+      nread += readnow
+    }
+  }
+  def readSomeDoubles(din:InputStream, a:Array[Double], buf:Array[Byte], n:Int) {  // reads 8*n bytes from din into a, staging each chunk in buf
+    var nread = 0
+    while (nread < 8*n) {
+      val readnow = din.read(buf, 0, math.min(buf.length, 8*n-nread))
+      if (readnow < 0) throw new EOFException("stream ended after "+nread+" of "+(8*n)+" bytes")  // read returns -1 at end of stream
+      memcpybd(readnow, buf, 0, a, nread)
+      nread += readnow
+    }
+  }
+  def writeSomeInts(dout:OutputStream, a:Array[Int], buf:Array[Byte], n:Int) {
+    var nwritten = 0
+    while (nwritten < 4*n) {
+      val todo = math.min(4*n-nwritten, buf.length)
+    	memcpyib(todo, a, nwritten, buf, 0)
+      dout.write(buf, 0, todo)
+      nwritten += todo
+    }
+  }
+  
+  def writeSomeFloats(dout:OutputStream, a:Array[Float], buf:Array[Byte], n:Int) {
+    var nwritten = 0
+    while (nwritten < 4*n) {
+      val todo = math.min(4*n-nwritten, buf.length)
+    	memcpyfb(todo, a, nwritten, buf, 0)
+      dout.write(buf, 0, todo)
+      nwritten += todo
+    }
+  }
+  
+  def writeSomeDoubles(dout:OutputStream, a:Array[Double], buf:Array[Byte], n:Int) {
+    var nwritten = 0
+    while (nwritten < 8*n) {
+      val todo = math.min(8*n-nwritten, buf.length)
+    	memcpydb(todo, a, nwritten, buf, 0)
+      dout.write(buf, 0, todo)
+      nwritten += todo
+    }
+  }
+  
+  def getInputStream(fname:String, compressed:Boolean):InputStream = {
+    val fin = new FileInputStream(fname)
+    if (compressed) {
+      new GZIPInputStream(fin, 1024*1024)
+    } else {
+      new BufferedInputStream(fin, 1024*1024)
+    }
+  }
+  
+  def getOutputStream(fname:String, compressed:Boolean):OutputStream = {
+    import edu.berkeley.bid.UTILS._
+  	_getOutputStream(fname, compressed, Mat.compressionLevel)
+  }
+  
+  def loadFMat(fname:String, compressed:Boolean=true):FMat = {
+    val gin = getInputStream(fname, compressed)
+    val buff = new Array[Byte](1024*1024)
+    val hints = new Array[Int](4)
+    readSomeInts(gin, hints, buff, 4)
+    val ftype = hints(0)
+    val nrows = hints(1)
+    val ncols = hints(2)
+    val out = FMat(nrows, ncols)
+    readSomeFloats(gin, out.data, buff, ncols*nrows)
+    gin.close
+    out
+  }
+   
+  def loadIMat(fname:String, compressed:Boolean=true):IMat = {
+    val gin = getInputStream(fname, compressed)
+    val buff = new Array[Byte](1024*1024)
+    val hints = new Array[Int](4)
+    readSomeInts(gin, hints, buff, 4)
+    val ftype = hints(0)
+    val nrows = hints(1)
+    val ncols = hints(2)
+    val out = IMat(nrows, ncols)
+    readSomeInts(gin, out.data, buff, ncols*nrows)
+    gin.close
+    out
+  }
+   
+  def loadDMat(fname:String, compressed:Boolean=true):DMat = {
+    val gin = getInputStream(fname, compressed)
+    val buff = new Array[Byte](1024*1024)
+    val hints = new Array[Int](4)
+    readSomeInts(gin, hints, buff, 4)
+    val ftype = hints(0)
+    val nrows = hints(1)
+    val ncols = hints(2)
+    val out = DMat(nrows, ncols)
+    readSomeDoubles(gin, out.data, buff, ncols*nrows)
+    gin.close
+    out
+  }
+  
+  def saveFMat(fname:String, m:FMat, compressed:Boolean=true):Unit = {
+    val gout = getOutputStream(fname, compressed)
+    val hints = new Array[Int](4)
+    val tbuf = new Array[Byte](16)
+    hints(0) = 130 // 1=dense, 3=float
+    hints(1) = m.nrows
+    hints(2) = m.ncols
+    hints(3) = 0
+    writeSomeInts(gout, hints, tbuf, 4)
+    val buff = new Array[Byte](math.min(1024*1024, 4*m.ncols*m.nrows))
+    writeSomeFloats(gout, m.data, buff, m.nrows*m.ncols)
+    gout.close
+  }
+  
+  def saveIMat(fname:String, m:IMat, compressed:Boolean=true):Unit = {
+  	val gout = getOutputStream(fname, compressed)
+    val hints = new Array[Int](4)
+    val tbuf = new Array[Byte](16)
+    hints(0) = 110 // 1=dense, 1=int
+    hints(1) = m.nrows
+    hints(2) = m.ncols
+    hints(3) = 0
+    writeSomeInts(gout, hints, tbuf, 4)
+    val buff = new Array[Byte](math.min(1024*1024, 4*m.ncols*m.nrows))
+    writeSomeInts(gout, m.data, buff, m.nrows*m.ncols)
+    gout.close
+  }
+  
+  def saveDMat(fname:String, m:DMat, compressed:Boolean=true):Unit = {
+    val gout = getOutputStream(fname, compressed)
+    val hints = new Array[Int](4)
+    val tbuf = new Array[Byte](16)
+    hints(0) = 140 // 1=dense, 4=double
+    hints(1) = m.nrows
+    hints(2) = m.ncols
+    hints(3) = 0
+    writeSomeInts(gout, hints, tbuf, 4)
+    val buff = new Array[Byte](math.min(1024*1024, 4*m.ncols*m.nrows))
+    writeSomeDoubles(gout, m.data, buff, m.nrows*m.ncols)
+    gout.close
+  }
+  
+  def loadSMat(fname:String, compressed:Boolean=true):SMat = {
+    val gin = getInputStream(fname, compressed)
+    val buff = new Array[Byte](1024*1024)
+    val hints = new Array[Int](4)
+    readSomeInts(gin, hints, buff, 4)
+    val ftype = hints(0)
+    val nrows = hints(1)
+    val ncols = hints(2)
+    val nnz = hints(3)
+    val out = SMat(nrows, ncols, nnz)
+    readSomeInts(gin, out.jc, buff, ncols+1)
+    readSomeInts(gin, out.ir, buff, nnz)
+    readSomeFloats(gin, out.data, buff, nnz)
+    MatHDF5.addOne(out.jc)
+    MatHDF5.addOne(out.ir)
+    gin.close
+    out
+  }
+  
+  // Writes m to fname as a 4-int header (type code 231, nrows, ncols, nnz) followed by jc, ir and data.
+  // jc and ir are decremented in place while they are written and restored afterwards, also on failure.
+  // The stream is closed on every path, and only Exceptions (not Errors) are rewrapped.
+  def saveSMat(fname:String, m:SMat, compressed:Boolean=true):Unit = {
+    val gout = getOutputStream(fname, compressed)
+    try {
+      val hints = new Array[Int](4)
+      val tbuf = new Array[Byte](16)
+      hints(0) = 231 // 2=sparse, 3=float, 1=int
+      hints(1) = m.nrows
+      hints(2) = m.ncols
+      hints(3) = m.nnz
+      writeSomeInts(gout, hints, tbuf, 4)
+      val buff = new Array[Byte](math.min(1024*1024, 4*math.max(m.ncols+1, m.nnz)))
+      try {
+        MatHDF5.subOne(m.jc)
+        MatHDF5.subOne(m.ir)
+        writeSomeInts(gout, m.jc, buff, m.ncols+1)
+        writeSomeInts(gout, m.ir, buff, m.nnz)
+        writeSomeFloats(gout, m.data, buff, m.nnz)
+      } catch {
+        case e:Exception => throw new RuntimeException("Exception in saveSMat "+e, e)
+      } finally {
+        // runs on success and failure alike, so the caller's matrix always gets its indices back
+        MatHDF5.addOne(m.jc)
+        MatHDF5.addOne(m.ir)
+      }
+    } finally {
+      // previously skipped whenever anything above threw, leaking the file handle
+      gout.close
+    }
+  } 
+  
+  def testLoad(fname:String, varname:String, n:Int) = {
+    val a = new Array[SMat](n)
+    var ndone = izeros(n,1)
+    for (i <- 0 until n) {
+      actor {
+        a(i) = loadSMat(("/disk%02d/" format i)+fname)
+        ndone(i) = 1
+      }
+    }
+    while (SciFunctions.sum(ndone).v < n) {Thread.sleep(10)}
+    a
+  }
+  
+  // Recursively scans dirname for files matching filepat and builds an HMat over variable varname in them,
+  // concatenated along dimension catd (1 or 2). The result type is now declared: with the former procedure
+  // syntax this method returned Unit and the HMat built at the end was discarded.
+  def apply(dirname:String, filepat:String, varname:String, catd:Int):HMat = { 
+    val files:ListBuffer[String] = new ListBuffer[String]
+    val dir:File = new File(dirname)
+    val slen = dir.getName.length + 1
+    // NOTE(review): getName is only the last path component, so substring(slen) cuts into the file name itself and
+    // the collected names carry no directory; getPath may have been intended -- confirm against callers
+    def searchDir(dir:File) {
+      for (f <- dir.listFiles) {
+        if (f.isDirectory) {
+          searchDir(f)
+        } else if (f.getName.substring(slen).matches(filepat)) { 
+          files.append(f.getName)
+        }
+      }
+    }
+
+    searchDir(dir)
+    val blkinds = new Array[Int](files.length+1)
+    var i = 0
+    var nrows = -1
+    var ncols = -1
+    files.foreach((fn:String) => { 
+      val (nr, nc) = MatHDF5.readMatDims(fn, varname)
+      if (catd == 2) { 
+        if (nrows >= 0) { 
+          if (nr != nrows) { 
+            throw new RuntimeException("incorrect number of rows in file "+fn)
+          }
+        } else { 
+          nrows = nr.asInstanceOf[Int]
+        }
+        blkinds(i+1) = blkinds(i) + nc.asInstanceOf[Int]
+        i += 1
+      } else if (catd == 1) { 
+        if (ncols >= 0) { 
+          if (nc != 1) { 
+            throw new RuntimeException("incorrect number of cols in file "+fn)
+          }
+        } else { 
+          ncols = 1
+        }
+        blkinds(i+1) = blkinds(i) + nr.asInstanceOf[Int]
+        i += 1
+      }
+    })
+    if (catd == 2) { 
+      HMat(nrows, blkinds(files.length), files.toList, varname, blkinds, 2)
+    } else if (catd == 1) { 
+      HMat(blkinds(files.length), ncols, files.toList, varname, blkinds, 1)
+    } else {
+      throw new RuntimeException("cat dimension must be 1 or 2")
+    }
+  }
+}
+
+
+
+
+
+
diff --git a/src/main/scala/BIDMat/IMat.scala b/src/main/scala/BIDMat/IMat.scala
new file mode 100755
index 00000000..206d3eaa
--- /dev/null
+++ b/src/main/scala/BIDMat/IMat.scala
@@ -0,0 +1,433 @@
+package BIDMat
+
+import java.util.Arrays
+
+/**
+ * Dense matrix of Int values stored in `data0`. iMult addresses element (r, c) as
+ * data(r + c*nrows), i.e. the backing array is read column by column.
+ *
+ * Most operations delegate to the generic helpers inherited from DenseMat (gapply,
+ * ggMatOp, ...) and re-wrap the result as an IMat. Operators taking FMat/DMat/CMat
+ * first convert this matrix to the other type; operators taking a plain Mat are
+ * routed through Operator.applyMat.
+ *
+ * @param nr    number of rows
+ * @param nc    number of columns
+ * @param data0 backing array
+ */
+case class IMat(nr:Int, nc:Int, data0:Array[Int]) extends DenseMat[Int](nr, nc, data0) { 
+  
+  def size() = length;
+  
+  /** Transpose, computed by DenseMat.gt into a fresh matrix. */
+  override def t:IMat = IMat(gt(null))
+  
+  /**
+   * Value of a 1x1 matrix as a Double.
+   * @throws RuntimeException if the matrix is not exactly 1x1
+   */
+  override def dv:Double =
+    // Reject every shape other than 1x1, including empty matrices.
+    if (nrows != 1 || ncols != 1) {
+      throw new RuntimeException("Matrix should be 1x1 to extract value")
+    } else {
+      data(0)
+    }
+  
+  override def mytype = "IMat"
+    
+  /** Overwrites the first `length` entries with `v` converted to Int; returns this. */
+  override def set(v:Float):IMat = {
+    Arrays.fill(data,0,length,v.asInstanceOf[Int])
+    this
+  }
+  
+  def horzcat(b: IMat) = IMat(ghorzcat(b))
+  
+  def vertcat(b: IMat) = IMat(gvertcat(b))
+  
+  def find3:(IMat, IMat, IMat) = { val (ii, jj, vv) = gfind3 ; (ii, jj, IMat(vv)) }
+  
+  // Indexed reads: delegate to DenseMat.gapply and re-wrap as IMat.
+  override def apply(a:IMat):IMat = IMat(gapply(a))
+  
+  override def apply(a:IMat, b:IMat):IMat = IMat(gapply(a, b))
+  
+  override def apply(a:IMat, b:Int):IMat = IMat(gapply(a, b))
+  
+  override def apply(a:Int, b:IMat):IMat = IMat(gapply(a, b))
+  
+  // Indexed writes: scalar indices are wrapped into 1x1 index matrices.
+  def update(iv:IMat, jv:IMat, b:IMat):IMat = IMat(_update(iv, jv, b))
+
+  def update(iv:IMat, j:Int, b:IMat):IMat = IMat(_update(iv, IMat.ielem(j), b))
+
+  def update(i:Int, jv:IMat, b:IMat):IMat = IMat(_update(IMat.ielem(i), jv, b))
+  
+  // Typed wrappers around the generic DenseMat kernels. `old` is forwarded unchanged
+  // to the DenseMat helper; a non-IMat right operand is rejected.
+  def iiMatOp(b: Mat, f:(Int, Int) => Int, old:Mat):IMat = 
+    b match {
+      case bb:IMat => IMat(ggMatOp(bb, f, old))
+      case _ => throw new RuntimeException("unsupported operation "+f+" on "+this+" and "+b)
+    }
+  
+  def iiMatOpv(b: Mat, f:(Array[Int],Int,Int,Array[Int],Int,Int,Array[Int],Int,Int,Int) => Int, old:Mat):IMat = 
+    b match {
+      case bb:IMat => IMat(ggMatOpv(bb, f, old))
+      case _ => throw new RuntimeException("unsupported operation "+f+" on "+this+" and "+b)
+    }
+  
+  def iiMatOpScalar(b: Int, f:(Int, Int) => Int, old:Mat) = IMat(ggMatOpScalar(b, f, old))
+  
+  def iiMatOpScalarv(b: Int, f:(Array[Int],Int,Int,Array[Int],Int,Int,Array[Int],Int,Int,Int) => Int, old:Mat) = IMat(ggMatOpScalarv(b, f, old))
+  
+  def iiReduceOp(n:Int, f1:(Int) => Int, f2:(Int, Int) => Int, old:Mat) = IMat(ggReduceOp(n, f1, f2, old))
+  
+  def iiReduceOpv(n:Int, f:(Array[Int],Int,Int,Array[Int],Int,Int,Array[Int],Int,Int,Int) => Int, old:Mat) = IMat(ggReduceOpv(n, f, old))
+  
+  def iiReduceAll(n:Int, f1:(Int) => Int, f2:(Int, Int) => Int, old:Mat) = IMat(ggReduceAll(n, f1, f2, old))
+  
+  def iiReduceAllv(n:Int, f:(Array[Int],Int,Int,Array[Int],Int,Int,Array[Int],Int,Int,Int) => Int, old:Mat) = IMat(ggReduceAllv(n, f, old))
+  
+  /** Formats the i-th entry of the backing array with "%d". */
+  override def printOne(i:Int):String = {
+    val v = data(i)
+    "%d" format v
+  }
+  
+  /**
+   * Copies the first `length` entries of this matrix into `a` and returns `a`.
+   * @throws RuntimeException if `a` is not an IMat
+   */
+  override def copyTo(a:Mat) = {
+    a match {
+      case out:IMat => System.arraycopy(data, 0, out.data, 0, length)
+      // Report the unsupported target explicitly instead of a bare MatchError.
+      case _ => throw new RuntimeException("copyTo not implemented for "+mytype+" and "+a.mytype)
+    }
+    a
+  }
+  
+  /** Fresh nrows x ncols matrix holding a copy of the first `length` entries. */
+  override def copy = {
+    val out = IMat(nrows, ncols)
+    System.arraycopy(data, 0, out.data, 0, length)
+    out
+  }
+  
+  override def zeros(nr:Int, nc:Int) = {
+    IMat(nr, nc)
+  }
+  
+  /** Fresh nr x nc matrix with every entry set to 1. */
+  override def ones(nr:Int, nc:Int) = {
+    val out = IMat(nr, nc)
+    // Fill the backing array directly, as set and clear do.
+    Arrays.fill(out.data, 0, out.length, 1)
+    out
+  }
+    
+  override def clearUpper(off:Int) = setUpper(0, off)
+  override def clearUpper = setUpper(0, 0)
+  
+  override def clearLower(off:Int) = setLower(0, off)
+  override def clearLower = setLower(0, 0)
+
+  /**
+   * Product of this matrix with `a0`.
+   *
+   * When ncols == a0.nrows a full matrix product is accumulated into an output obtained
+   * from IMat.newOrCheckIMat(nrows, a0.ncols, omat). When either operand is 1x1 the
+   * other one is scaled element-wise into a freshly allocated matrix.
+   *
+   * @param a0   right operand; must be an IMat
+   * @param omat candidate output matrix for the matrix-product case (may be null)
+   * @throws RuntimeException if a0 is not an IMat or the shapes are incompatible
+   */
+  def iMult(a0:Mat, omat:Mat):IMat = 
+    a0 match {
+    case a:IMat =>
+      if (ncols == a.nrows) {
+        val out = IMat.newOrCheckIMat(nrows, a.ncols, omat)
+        out.clear
+        Mat.nflops += 2L * length * a.ncols
+        for (i <- 0 until a.ncols)
+          for (j <- 0 until a.nrows) {
+            var k = 0
+            // Element (j, i) of a; ncols equals a.nrows in this branch.
+            val dval = a.data(j + i*ncols)
+            // Add dval times column j of this matrix to column i of the output.
+            while (k < nrows) {
+              out.data(k+i*nrows) += data(k+j*nrows)*dval
+              k += 1
+            }
+          }
+        out
+      } else if (ncols == 1 && nrows == 1) {
+        // NOTE(review): omat is not consulted in the two scalar branches -- confirm intent.
+        val out = IMat(a.nrows, a.ncols)
+        Mat.nflops += a.length
+        var i = 0
+        val dvar = data(0)
+        while (i < a.length) {
+          out.data(i) = dvar * a.data(i)
+          i += 1
+        }
+        out
+      } else if (a.ncols == 1 && a.nrows == 1) {
+        val out = IMat(nrows, ncols)
+        Mat.nflops += length
+        var i = 0
+        val dvar = a.data(0)
+        while (i < length) {
+          out.data(i) = dvar * data(i)
+          i += 1
+        }
+        out
+      } else throw new RuntimeException("dimensions mismatch")
+    case _ => throw new RuntimeException("unsupported arg to * "+a0)
+  }
+  
+  def dot(a:IMat):Double = super.dot(a)
+  
+  override def dot(a:Mat):Double = super.dot(a.asInstanceOf[IMat])
+
+  // Arithmetic with another IMat; results go to a new matrix (old == null).
+  def *  (b : IMat) = iMult(b, null)
+  def +  (b : IMat) = iiMatOpv(b, IMat.vecAdd _, null)
+  def -  (b : IMat) = iiMatOpv(b, IMat.vecSub _, null)
+  def *@ (b : IMat) = iiMatOpv(b, IMat.vecMul _, null)
+  def /@ (b : IMat) = iiMatOpv(b, IMat.iVecDiv _, null)
+  
+  // Arithmetic with an Int scalar.
+  override def +  (b : Int) = iiMatOpScalarv(b, IMat.vecAdd _, null)
+  override def -  (b : Int) = iiMatOpScalarv(b, IMat.vecSub _, null)
+  override def *@ (b : Int) = iiMatOpScalarv(b, IMat.vecMul _, null)
+  override def /@ (b : Int) = iiMatOpScalarv(b, IMat.iVecDiv _, null)
+
+  // Element-wise comparisons; each entry of the result is 1 (true) or 0 (false).
+  def >   (b : IMat) = iiMatOp(b, (x:Int, y:Int) => if (x > y) 1 else 0, null)
+  def <   (b : IMat) = iiMatOp(b, (x:Int, y:Int) => if (x < y) 1 else 0, null)
+  def ==  (b : IMat) = iiMatOp(b, (x:Int, y:Int) => if (x == y) 1 else 0, null)
+  def === (b : IMat) = iiMatOp(b, (x:Int, y:Int) => if (x == y) 1 else 0, null)
+  def >=  (b : IMat) = iiMatOp(b, (x:Int, y:Int) => if (x >= y) 1 else 0, null)
+  def <=  (b : IMat) = iiMatOp(b, (x:Int, y:Int) => if (x <= y) 1 else 0, null)
+  def !=  (b : IMat) = iiMatOp(b, (x:Int, y:Int) => if (x != y) 1 else 0, null)
+
+  override def >  (b : Int) = iiMatOpScalar(b, (x:Int, y:Int) => if (x > y) 1 else 0, null)
+  override def <  (b : Int) = iiMatOpScalar(b, (x:Int, y:Int) => if (x < y) 1 else 0, null)
+  override def == (b : Int) = iiMatOpScalar(b, (x:Int, y:Int) => if (x == y) 1 else 0, null)
+  override def === (b : Int) = iiMatOpScalar(b, (x:Int, y:Int) => if (x == y) 1 else 0, null)
+  override def >= (b : Int) = iiMatOpScalar(b, (x:Int, y:Int) => if (x >= y) 1 else 0, null)
+  override def <= (b : Int) = iiMatOpScalar(b, (x:Int, y:Int) => if (x <= y) 1 else 0, null)
+  override def != (b : Int) = iiMatOpScalar(b, (x:Int, y:Int) => if (x != y) 1 else 0, null) 
+
+  // Concatenation: \ is horizontal, on is vertical.
+  def \ (b: IMat) = horzcat(b)
+  def \ (b: Int) = horzcat(IMat.ielem(b))
+  def on (b: IMat) = vertcat(b)
+  def on (b: Int) = vertcat(IMat.ielem(b))
+  
+ /*
+  * Specialize to FMats to help the type system. 
+  */ 
+  def +  (b : FMat):FMat = FMat(this) + b
+  def -  (b : FMat):FMat = FMat(this) - b
+  def *  (b : FMat):FMat = FMat(this) * b
+  def /  (b : FMat):FMat = FMat(this) / b
+  def \\ (b : FMat):FMat = FMat(this) \\ b
+  def *@ (b : FMat):FMat = FMat(this) *@ b
+  def /@ (b : FMat):FMat = FMat(this) /@ b
+  def \  (b : FMat):FMat = FMat(this) \ b
+  def on (b : FMat):FMat = FMat(this) on b 
+  
+  def >   (b : FMat):FMat = FMat(this) > b
+  def <   (b : FMat):FMat = FMat(this) < b
+  def >=  (b : FMat):FMat = FMat(this) >= b
+  def <=  (b : FMat):FMat = FMat(this) <= b
+  def ==  (b : FMat):FMat = FMat(this) == b
+  def === (b : FMat):FMat = FMat(this) === b 
+  def !=  (b : FMat):FMat = FMat(this) != b
+  
+ /*
+  * Specialize to DMats to help the type system. 
+  */ 
+  def +  (b : DMat):DMat = DMat(this) + b
+  def -  (b : DMat):DMat = DMat(this) - b
+  def *  (b : DMat):DMat = DMat(this) * b
+  def /  (b : DMat):DMat = DMat(this) / b
+  def \\ (b : DMat):DMat = DMat(this) \\ b
+  def *@ (b : DMat):DMat = DMat(this) *@ b
+  def /@ (b : DMat):DMat = DMat(this) /@ b
+  def \  (b : DMat):DMat = DMat(this) \ b
+  def on (b : DMat):DMat = DMat(this) on b 
+  
+  def >   (b : DMat):DMat = DMat(this) > b
+  def <   (b : DMat):DMat = DMat(this) < b
+  def >=  (b : DMat):DMat = DMat(this) >= b
+  def <=  (b : DMat):DMat = DMat(this) <= b
+  def ==  (b : DMat):DMat = DMat(this) == b
+  def === (b : DMat):DMat = DMat(this) === b 
+  def !=  (b : DMat):DMat = DMat(this) != b
+
+ /*
+  * Specialize to CMats to help the type system. 
+  */ 
+  def +  (b : CMat):CMat = CMat(this) + b
+  def -  (b : CMat):CMat = CMat(this) - b
+  def *  (b : CMat):CMat = CMat(this) * b
+  def /  (b : CMat):CMat = CMat(this) / b
+  def \\ (b : CMat):CMat = CMat(this) \\ b
+  def *@ (b : CMat):CMat = CMat(this) *@ b
+  def /@ (b : CMat):CMat = CMat(this) /@ b
+  def \  (b : CMat):CMat = CMat(this) \ b
+  def on (b : CMat):CMat = CMat(this) on b 
+
+ /*
+  * Operators whose second arg is generic. 
+  */ 
+  import Operator._
+  override def +  (b : Mat):Mat = applyMat(this, b, null, Mop_Plus)
+  override def -  (b : Mat):Mat = applyMat(this, b, null, Mop_Minus)
+  override def *  (b : Mat):Mat = applyMat(this, b, null, Mop_Times)
+  override def /  (b : Mat):Mat = applyMat(this, b, null, Mop_Div)
+  override def \\ (b : Mat):Mat = applyMat(this, b, null, Mop_RSolve)
+  override def *@ (b : Mat):Mat = applyMat(this, b, null, Mop_ETimes)
+  override def /@ (b : Mat):Mat = applyMat(this, b, null, Mop_EDiv)
+  override def \  (b : Mat):Mat = applyMat(this, b, null, Mop_HCat)
+  override def on (b : Mat):Mat = applyMat(this, b, null, Mop_VCat)
+  
+  override def >   (b : Mat):Mat = applyMat(this, b, null, Mop_GT)
+  override def <   (b : Mat):Mat = applyMat(this, b, null, Mop_LT)
+  override def >=  (b : Mat):Mat = applyMat(this, b, null, Mop_GE)
+  override def <=  (b : Mat):Mat = applyMat(this, b, null, Mop_LE)
+  override def ==  (b : Mat):Mat = applyMat(this, b, null, Mop_EQ)
+  override def === (b : Mat):Mat = applyMat(this, b, null, Mop_EQ) 
+  override def !=  (b : Mat):Mat = applyMat(this, b, null, Mop_NE)
+  
+  /** Pairs this matrix (as output candidate) with `b` for a subsequent IPair operator. */
+  def ~ (b : IMat):IPair = new IPair(this, b)
+  
+  override def ~ (b: Mat):Pair = 
+    b match {
+    case db:IMat => new IPair(this, db)
+    case _ => throw new RuntimeException("mismatched types for operator ~")
+  }
+  
+  /** Zeroes the first `length` entries of the backing array; returns this. */
+  override def clear = {
+    Arrays.fill(this.data,0,length,0)
+    this
+  }
+  
+  /**
+   * Returns an nr x nc IMat, reusing storage when possible: this matrix if the shape
+   * already matches, a new view over the same array if it is large enough, otherwise
+   * a newly allocated matrix. `nnz` is not used.
+   */
+  override def recycle(nr:Int, nc:Int, nnz:Int):IMat = {
+    if (nrows == nr && nc == ncols) {
+      this
+    } else if (data.size >= nr*nc) {
+      new IMat(nr, nc, data)
+    } else {
+      IMat(nr, nc)
+    }  
+  }
+}
+
+/**
+ * Result of `omat ~ mat`: binds an IMat operand to a candidate output matrix.
+ * Every operator forwards to the corresponding IMat kernel (or Operator.applyMat)
+ * with `omat` passed as the output/`old` argument.
+ *
+ * @param omat candidate output matrix
+ * @param mat  left operand of the operators defined here
+ */
+class IPair(val omat:Mat, val mat:IMat) extends Pair {
+  
+  override def t:IMat = IMat(mat.gt(omat))
+  
+  def * (b : IMat) = mat.iMult(b, omat) 
+  // NOTE(review): iMult rejects every non-IMat argument, so this overload always throws.
+  def * (b : SMat) = mat.iMult(b, omat) 
+//  def xT  (b : SMat) = mat.multT(b, omat)
+  def + (b : IMat) = mat.iiMatOpv(b, IMat.vecAdd _, omat)
+  def - (b : IMat) = mat.iiMatOpv(b, IMat.vecSub _, omat)
+  def *@ (b : IMat) = mat.iiMatOpv(b, IMat.vecMul _, omat)
+  // Integer element-wise division, matching IMat./@ (IMat.iVecDiv).
+  def /@ (b : IMat) = mat.iiMatOpv(b, IMat.iVecDiv _, omat)
+//  def ^ (b : IMat) = mat.iiMatOp(b, (x:Float, y:Float) => math.pow(x,y).toFloat, omat)  
+
+  // Element-wise comparisons producing 1/0 entries.
+  def > (b : IMat) = mat.iiMatOp(b, (x:Int, y:Int) => if (x > y) 1 else 0, omat)
+  def < (b : IMat) = mat.iiMatOp(b, (x:Int, y:Int) => if (x < y) 1 else 0, omat)
+  def == (b : IMat) = mat.iiMatOp(b, (x:Int, y:Int) => if (x == y) 1 else 0, omat)
+  def === (b : IMat) = mat.iiMatOp(b, (x:Int, y:Int) => if (x == y) 1 else 0, omat)
+  def >= (b : IMat) = mat.iiMatOp(b, (x:Int, y:Int) => if (x >= y) 1 else 0, omat)
+  def <= (b : IMat) = mat.iiMatOp(b, (x:Int, y:Int) => if (x <= y) 1 else 0, omat)
+  def != (b : IMat) = mat.iiMatOp(b, (x:Int, y:Int) => if (x != y) 1 else 0, omat) 
+  
+  // Int scalar operands.
+  override def * (b : Int) = mat.iMult(IMat.ielem(b), omat)
+  override def + (b : Int) = mat.iiMatOpScalarv(b, IMat.vecAdd _, omat)
+  override def - (b : Int) = mat.iiMatOpScalarv(b, IMat.vecSub _, omat)
+  override def *@ (b : Int) = mat.iiMatOpScalarv(b, IMat.vecMul _, omat)
+  override def /@ (b : Int) = mat.iiMatOpScalarv(b, IMat.iVecDiv _, omat)
+//  override def ^ (b : Int) = mat.iiMatOpScalar(b, (x:Float, y:Float) => math.pow(x,y).toFloat, omat)
+
+  override def > (b : Int) = mat.iiMatOpScalar(b, (x:Int, y:Int) => if (x > y) 1 else 0, omat)
+  override def < (b : Int) = mat.iiMatOpScalar(b, (x:Int, y:Int) => if (x < y) 1 else 0, omat)
+  override def == (b : Int) = mat.iiMatOpScalar(b, (x:Int, y:Int) => if (x == y) 1 else 0, omat)
+  // Same as ==, mirroring IMat.=== (Int); without it Pair's "not implemented" stub is hit.
+  override def === (b : Int) = mat.iiMatOpScalar(b, (x:Int, y:Int) => if (x == y) 1 else 0, omat)
+  override def >= (b : Int) = mat.iiMatOpScalar(b, (x:Int, y:Int) => if (x >= y) 1 else 0, omat)
+  override def <= (b : Int) = mat.iiMatOpScalar(b, (x:Int, y:Int) => if (x <= y) 1 else 0, omat)
+  override def != (b : Int) = mat.iiMatOpScalar(b, (x:Int, y:Int) => if (x != y) 1 else 0, omat) 
+  
+  // Generic right operand: dispatch through Operator.applyMat.
+  import Operator._
+  override def +  (b : Mat):Mat = applyMat(mat, b, omat, Mop_Plus)
+  override def -  (b : Mat):Mat = applyMat(mat, b, omat, Mop_Minus)
+  override def *  (b : Mat):Mat = applyMat(mat, b, omat, Mop_Times)
+  override def /  (b : Mat):Mat = applyMat(mat, b, omat, Mop_Div)
+  override def \\ (b : Mat):Mat = applyMat(mat, b, omat, Mop_RSolve)
+  override def *@ (b : Mat):Mat = applyMat(mat, b, omat, Mop_ETimes)
+  override def /@ (b : Mat):Mat = applyMat(mat, b, omat, Mop_EDiv)
+  override def \  (b : Mat):Mat = applyMat(mat, b, omat, Mop_HCat)
+  override def on (b : Mat):Mat = applyMat(mat, b, omat, Mop_VCat)
+  
+  override def >   (b : Mat):Mat = applyMat(mat, b, omat, Mop_GT)
+  override def <   (b : Mat):Mat = applyMat(mat, b, omat, Mop_LT)
+  override def >=  (b : Mat):Mat = applyMat(mat, b, omat, Mop_GE)
+  override def <=  (b : Mat):Mat = applyMat(mat, b, omat, Mop_LE)
+  override def ==  (b : Mat):Mat = applyMat(mat, b, omat, Mop_EQ)
+  override def === (b : Mat):Mat = applyMat(mat, b, omat, Mop_EQ) 
+  override def !=  (b : Mat):Mat = applyMat(mat, b, omat, Mop_NE)
+}
+
+
+/**
+ * Factories and strided element-wise kernels for IMat.
+ *
+ * The vec* kernels share one signature: read a(a0), a(a0+ainc), ... and
+ * b(b0), b(b0+binc), ..., write c(c0), c(c0+cinc), ... while the output index is
+ * below c0 + n, and return 0.
+ */
+object IMat {
+  
+  /** c = a / b (Int division); a zero divisor raises ArithmeticException. */
+  def iVecDiv(a:Array[Int], a0:Int, ainc:Int, b:Array[Int], b0:Int, binc:Int, c:Array[Int], c0:Int, cinc:Int, n:Int):Int = {
+    var ai = a0; var bi = b0; var ci = c0; var cend = c0 + n
+    while (ci < cend) {
+      c(ci) = a(ai) / b(bi);  ai += ainc; bi += binc;  ci += cinc
+    }
+    0
+  }
+  
+  /** New nr x nc matrix over a freshly allocated (zero-filled) array. */
+  def apply(nr:Int, nc:Int):IMat = new IMat(nr, nc, new Array[Int](nr*nc))
+  
+  /** Wraps `a` as an IMat sharing its data array (no copy). */
+  def apply(a:DenseMat[Int]):IMat = new IMat(a.nrows, a.ncols, a.data)
+
+  /**
+   * Converts `x` to a new IMat. DMat and FMat entries go through Mat.copyToIntArray,
+   * IMat data is copied, GIMat uses its own toIMat.
+   * @throws RuntimeException for any other source type
+   */
+  def apply(x:Mat):IMat = x match {
+    case dd:DMat => {
+      val out = IMat(x.nrows, x.ncols)
+      Mat.copyToIntArray(dd.data, 0, out.data, 0, dd.length)
+      out
+    }
+    case ff:FMat => {
+      val out = IMat(x.nrows, x.ncols)
+      Mat.copyToIntArray(ff.data, 0, out.data, 0, ff.length)
+      out
+    }
+    case ii:IMat => {
+      val out = IMat(x.nrows, x.ncols)
+      System.arraycopy(ii.data, 0, out.data, 0, ii.length)
+      out
+    }
+    case gg:GIMat => gg.toIMat
+    case _ => throw new RuntimeException("Unsupported source type")
+  }
+       
+  /** c = a + b. */
+  def vecAdd(a:Array[Int], a0:Int, ainc:Int, b:Array[Int], b0:Int, binc:Int, c:Array[Int], c0:Int, cinc:Int, n:Int):Int = {
+    var ai = a0; var bi = b0; var ci = c0; var cend = c0 + n
+    while (ci < cend) {
+      c(ci) = a(ai) + b(bi);  ai += ainc; bi += binc;  ci += cinc
+    }
+    0
+  }
+  
+  /** c = a - b. */
+  def vecSub(a:Array[Int], a0:Int, ainc:Int, b:Array[Int], b0:Int, binc:Int, c:Array[Int], c0:Int, cinc:Int, n:Int):Int = {
+    var ai = a0; var bi = b0; var ci = c0; var cend = c0 + n
+    while (ci < cend) {
+      c(ci) = a(ai) - b(bi);  ai += ainc; bi += binc;  ci += cinc
+    }
+    0
+  }
+  
+  /** c = a * b (element-wise). */
+  def vecMul(a:Array[Int], a0:Int, ainc:Int, b:Array[Int], b0:Int, binc:Int, c:Array[Int], c0:Int, cinc:Int, n:Int):Int = {
+    var ai = a0; var bi = b0; var ci = c0; var cend = c0 + n
+    while (ci < cend) {
+      c(ci) = a(ai) * b(bi);  ai += ainc; bi += binc;  ci += cinc
+    }
+    0
+  }
+  
+  /** c = max(a, b). */
+  def vecMax(a:Array[Int], a0:Int, ainc:Int, b:Array[Int], b0:Int, binc:Int, c:Array[Int], c0:Int, cinc:Int, n:Int):Int = {
+    var ai = a0; var bi = b0; var ci = c0; var cend = c0 + n
+    while (ci < cend) {
+      c(ci) = math.max(a(ai), b(bi));  ai += ainc; bi += binc;  ci += cinc
+    }
+    0
+  }
+  
+  /** c = min(a, b). */
+  def vecMin(a:Array[Int], a0:Int, ainc:Int, b:Array[Int], b0:Int, binc:Int, c:Array[Int], c0:Int, cinc:Int, n:Int):Int = {
+    var ai = a0; var bi = b0; var ci = c0; var cend = c0 + n
+    while (ci < cend) {
+      c(ci) = math.min(a(ai), b(bi));  ai += ainc; bi += binc;  ci += cinc
+    }
+    0
+  }
+
+  /** 1x1 matrix holding `x`. */
+  def ielem(x:Int) = {
+    val out = IMat(1,1)
+    out.data(0) = x
+    out
+  }
+  
+  /**
+   * Picks the output matrix for an nr x nc result: a new IMat when `omat` is null or
+   * 0x0, `omat` itself when its shape already matches, otherwise omat.recycle(nr, nc, 0).
+   * @throws RuntimeException if `omat` is a non-empty matrix that is not an IMat
+   */
+  def newOrCheckIMat(nr:Int, nc:Int, omat:Mat):IMat = {
+    if (omat.asInstanceOf[AnyRef] == null || (omat.nrows == 0 && omat.ncols == 0)) {
+      IMat(nr, nc)
+    } else {
+      omat match {
+        case outmat:IMat =>
+          if (outmat.nrows != nr || outmat.ncols != nc) {
+            outmat.recycle(nr, nc, 0)
+          } else {
+            outmat
+          }
+        // Explicit failure instead of a bare MatchError for a wrongly typed output.
+        case _ => throw new RuntimeException("output matrix must be an IMat, found "+omat.mytype)
+      }
+    }
+  }
+}
+
+
+
+
+
+
diff --git a/src/main/scala/BIDMat/Mat.scala b/src/main/scala/BIDMat/Mat.scala
new file mode 100755
index 00000000..a4f29f3a
--- /dev/null
+++ b/src/main/scala/BIDMat/Mat.scala
@@ -0,0 +1,445 @@
+package BIDMat
+
+/**
+ * Root of the matrix hierarchy. Stores the dimensions and declares every operator
+ * as a stub that throws RuntimeException("operator ... not implemented for ...");
+ * concrete matrix types override the ones they support.
+ *
+ * @param nr number of rows
+ * @param nc number of columns
+ */
+class Mat(nr:Int, nc:Int) {
+  val nrows = nr
+  val ncols = nc
+
+  def length = nr*nc
+  
+  /** Always throws; the Mat result type only lets stubs be written as expressions. */
+  def notImplemented0(s:String):Mat = { 
+    throw new RuntimeException("operator "+s+" not implemented for "+this.mytype)
+  }
+  
+  /** Binary variant of notImplemented0: the message names both operand types. */
+  def notImplemented1(s:String,that:Mat):Mat = { 
+    throw new RuntimeException("operator "+s+" not implemented for "+this.mytype+" and "+that.mytype)
+  }
+  
+  def t = notImplemented0("t")  
+  def dv:Double = throw new RuntimeException("operator dv not implemented for "+this.mytype)
+  
+  /** Type name used in error messages; overridden by subclasses. */
+  def mytype = "Mat"
+  def copyTo(a:Mat) = notImplemented0("copyTo");
+  def copy = notImplemented0("copy");
+  def set(v:Float) = notImplemented0("set")
+  def zeros(nr:Int, nc:Int) = notImplemented0("zeros");
+  def ones(nr:Int, nc:Int) = notImplemented0("ones");
+  def clearUpper(i:Int) = notImplemented0("clearUpper");
+  def clearLower(i:Int) = notImplemented0("clearLower"); 
+  def clearUpper = notImplemented0("clearUpper");
+  def clearLower = notImplemented0("clearLower");
+    
+  def nnz:Int = {notImplemented0("nnz"); 0}
+  def clear = notImplemented0("clear");
+  /** Default ignores nnz and forwards to zeros(nr, nc). */
+  def zeros(nr:Int, nc:Int, nnz:Int):Mat = zeros(nr, nc)
+  def recycle(nr:Int, nc:Int, nnz:Int):Mat = notImplemented0("recycle");
+  def contents:Mat = notImplemented0("contents");
+  
+  // Indexed reads.
+  def apply(a:IMat):Mat = notImplemented0("linear array access");
+  def apply(a:IMat, b:IMat):Mat = notImplemented0("block array access");
+  def apply(a:IMat, b:Int):Mat = notImplemented0("block array access");
+  def apply(a:Int, b:IMat):Mat = notImplemented0("block array access");
+  
+  // Indexed writes.
+  def update(a:IMat, b:Mat) = notImplemented0("linear update");
+  def update(a:IMat, b:IMat, m:Mat) = notImplemented0("block update");
+  def update(a:IMat, b:Int, m:Mat) = notImplemented0("block update");
+  def update(a:Int, b:IMat, m:Mat) = notImplemented0("block update");
+  
+  // Operators with a matrix operand.
+  def + (b : Mat):Mat = notImplemented1("+", b)
+  def - (b : Mat):Mat = notImplemented1("-", b)
+  def * (b : Mat):Mat = notImplemented1("*", b)
+  def xT (b : Mat):Mat = notImplemented1("xT", b)
+  def Tx (b : Mat):Mat = notImplemented1("Tx", b)
+  def / (b : Mat):Mat = notImplemented1("/", b)
+  def *@ (b : Mat):Mat = notImplemented1("*@", b)
+  def /@ (b : Mat):Mat = notImplemented1("/@", b)
+  def \\ (b : Mat):Mat = notImplemented1("\\\\", b)
+  def ^ (b : Mat):Mat = notImplemented1("^", b) 
+  
+  def > (b : Mat):Mat = notImplemented1(">", b)
+  def < (b : Mat):Mat = notImplemented1("<", b)
+  def >= (b : Mat):Mat = notImplemented1(">=", b)
+  def <= (b : Mat):Mat = notImplemented1("<=", b)
+  def == (b : Mat):Mat = notImplemented1("==", b)
+  def === (b : Mat):Mat = notImplemented1("===", b)
+  def != (b : Mat):Mat = notImplemented1("!=", b)
+  
+  /** Copies `b` into this matrix by calling b.copyTo(this). */
+  def <-- (b : Mat):Mat = b.copyTo(this)
+  
+  // Operators with an Int operand.
+  def + (b : Int):Mat = notImplemented0("+")
+  def - (b : Int):Mat = notImplemented0("-")
+  def * (b : Int):Mat = notImplemented0("*")
+  def / (b : Int):Mat = notImplemented0("/")
+  def *@ (b : Int):Mat = notImplemented0("*@")
+  def /@ (b : Int):Mat = notImplemented0("/@")
+  def \\ (b : Int):Mat = notImplemented0("\\\\")
+  def ^ (b : Int):Mat = notImplemented0("^") 
+  
+  def > (b : Int):Mat = notImplemented0(">")
+  def < (b : Int):Mat = notImplemented0("<")
+  def >= (b : Int):Mat = notImplemented0(">=")
+  def <= (b : Int):Mat = notImplemented0("<=")
+  def == (b : Int):Mat = notImplemented0("==")
+  def === (b : Int):Mat = notImplemented0("===")
+  def != (b : Int):Mat = notImplemented0("!=")
+  
+  // Operators with a Float operand.
+  def + (b : Float):Mat = notImplemented0("+")
+  def - (b : Float):Mat = notImplemented0("-")
+  def * (b : Float):Mat = notImplemented0("*")
+  def / (b : Float):Mat = notImplemented0("/")
+  def *@ (b : Float):Mat = notImplemented0("*@")
+  def /@ (b : Float):Mat = notImplemented0("/@")
+  def \\ (b : Float):Mat = notImplemented0("\\\\")
+  def ^ (b : Float):Mat = notImplemented0("^") 
+  
+  def > (b : Float):Mat = notImplemented0(">")
+  def < (b : Float):Mat = notImplemented0("<")
+  def >= (b : Float):Mat = notImplemented0(">=")
+  def <= (b : Float):Mat = notImplemented0("<=")
+  def == (b : Float):Mat = notImplemented0("==")
+  def === (b : Float):Mat = notImplemented0("===")
+  def != (b : Float):Mat = notImplemented0("!=")
+  
+  // Operators with a Double operand.
+  def + (b : Double):Mat = notImplemented0("+")
+  def - (b : Double):Mat = notImplemented0("-")
+  def * (b : Double):Mat = notImplemented0("*")
+  def / (b : Double):Mat = notImplemented0("/")
+  def *@ (b : Double):Mat = notImplemented0("*@")
+  def /@ (b : Double):Mat = notImplemented0("/@")
+  def \\ (b : Double):Mat = notImplemented0("\\\\")
+  def ^ (b : Double):Mat = notImplemented0("^") 
+  
+  def > (b : Double):Mat = notImplemented0(">")
+  def < (b : Double):Mat = notImplemented0("<")
+  def >= (b : Double):Mat = notImplemented0(">=")
+  def <= (b : Double):Mat = notImplemented0("<=")
+  def == (b : Double):Mat = notImplemented0("==")
+  def === (b : Double):Mat = notImplemented0("===")
+  def != (b : Double):Mat = notImplemented0("!=")  
+  
+  // Concatenation.
+  def \ (b : Mat):Mat = notImplemented1("\\", b)
+  def on (b : Mat):Mat = notImplemented1("on", b)
+
+  /**
+   * Pairs this matrix, as the first Pair argument, with `b`; the Pair subclass is
+   * chosen from the runtime type of `b`.
+   * @throws RuntimeException if no Pair type exists for `b`
+   */
+  def ~ (b : Mat):Pair = b match {
+    case bb:FMat => new FPair(this, bb)
+    case bb:DMat => new DPair(this, bb)
+    case bb:IMat => new IPair(this, bb)
+    case bb:SMat => new SPair(this, bb)
+//    case bb:SDMat => new SDPair(this, bb)
+    case bb:CMat => new CPair(this, bb)
+    case bb:GMat => new GPair(this, bb)
+    // Name the unsupported operand instead of failing with a bare MatchError.
+    case _ => throw new RuntimeException("operator ~ not implemented for "+this.mytype+" and "+b.mytype)
+  }
+  
+  def dot (b : Mat):Double = {notImplemented1("dot", b); 0}
+
+}
+
+/**
+ * Base class of the objects produced by the `~` operator. Every operator declared
+ * here is a stub that throws RuntimeException; the concrete pair classes override
+ * the combinations they support.
+ */
+abstract class Pair {
+
+  /** Always throws; reports operator `s` as unsupported for this pair. */
+  def notImplemented0(s: String): Mat = {
+    val msg = "operator "+s+" not implemented for "+this
+    throw new RuntimeException(msg)
+  }
+
+  /** Always throws; reports operator `s` as unsupported for this pair and `that`. */
+  def notImplemented1(s: String, that: Mat): Mat = {
+    val msg = "operator "+s+" not implemented for "+this+" and "+that
+    throw new RuntimeException(msg)
+  }
+
+  def t = notImplemented0("t")
+
+  // ---- matrix operand: arithmetic ----
+  def +(b: Mat): Mat = notImplemented1("+", b)
+  def -(b: Mat): Mat = notImplemented1("-", b)
+  def *(b: Mat): Mat = notImplemented1("*", b)
+  def xT(b: Mat): Mat = notImplemented1("xT", b)
+  def Tx(b: Mat): Mat = notImplemented1("Tx", b)
+  def /(b: Mat): Mat = notImplemented1("/", b)
+  def *@(b: Mat): Mat = notImplemented1("*@", b)
+  def /@(b: Mat): Mat = notImplemented1("/@", b)
+  def \\(b: Mat): Mat = notImplemented1("\\\\", b)
+  def ^(b: Mat): Mat = notImplemented1("^", b)
+
+  // ---- matrix operand: comparison ----
+  def >(b: Mat): Mat = notImplemented1(">", b)
+  def <(b: Mat): Mat = notImplemented1("<", b)
+  def >=(b: Mat): Mat = notImplemented1(">=", b)
+  def <=(b: Mat): Mat = notImplemented1("<=", b)
+  def ==(b: Mat): Mat = notImplemented1("==", b)
+  def ===(b: Mat): Mat = notImplemented1("===", b)
+  def !=(b: Mat): Mat = notImplemented1("!=", b)
+
+  // ---- matrix operand: concatenation ----
+  def \(b: Mat): Mat = notImplemented1("\\", b)
+  def on(b: Mat): Mat = notImplemented1("on", b)
+
+  // ---- Int operand ----
+  def +(b: Int): Mat = notImplemented0("+")
+  def -(b: Int): Mat = notImplemented0("-")
+  def *(b: Int): Mat = notImplemented0("*")
+  def /(b: Int): Mat = notImplemented0("/")
+  def *@(b: Int): Mat = notImplemented0("*@")
+  def /@(b: Int): Mat = notImplemented0("/@")
+  def \\(b: Int): Mat = notImplemented0("\\\\")
+  def ^(b: Int): Mat = notImplemented0("^")
+
+  def >(b: Int): Mat = notImplemented0(">")
+  def <(b: Int): Mat = notImplemented0("<")
+  def >=(b: Int): Mat = notImplemented0(">=")
+  def <=(b: Int): Mat = notImplemented0("<=")
+  def ==(b: Int): Mat = notImplemented0("==")
+  def ===(b: Int): Mat = notImplemented0("===")
+  def !=(b: Int): Mat = notImplemented0("!=")
+
+  // ---- Float operand ----
+  def +(b: Float): Mat = notImplemented0("+")
+  def -(b: Float): Mat = notImplemented0("-")
+  def *(b: Float): Mat = notImplemented0("*")
+  def /(b: Float): Mat = notImplemented0("/")
+  def *@(b: Float): Mat = notImplemented0("*@")
+  def /@(b: Float): Mat = notImplemented0("/@")
+  def \\(b: Float): Mat = notImplemented0("\\\\")
+  def ^(b: Float): Mat = notImplemented0("^")
+
+  def >(b: Float): Mat = notImplemented0(">")
+  def <(b: Float): Mat = notImplemented0("<")
+  def >=(b: Float): Mat = notImplemented0(">=")
+  def <=(b: Float): Mat = notImplemented0("<=")
+  def ==(b: Float): Mat = notImplemented0("==")
+  def ===(b: Float): Mat = notImplemented0("===")
+  def !=(b: Float): Mat = notImplemented0("!=")
+
+  // ---- Double operand ----
+  def +(b: Double): Mat = notImplemented0("+")
+  def -(b: Double): Mat = notImplemented0("-")
+  def *(b: Double): Mat = notImplemented0("*")
+  def /(b: Double): Mat = notImplemented0("/")
+  def *@(b: Double): Mat = notImplemented0("*@")
+  def /@(b: Double): Mat = notImplemented0("/@")
+  def \\(b: Double): Mat = notImplemented0("\\\\")
+  def ^(b: Double): Mat = notImplemented0("^")
+
+  def >(b: Double): Mat = notImplemented0(">")
+  def <(b: Double): Mat = notImplemented0("<")
+  def >=(b: Double): Mat = notImplemented0(">=")
+  def <=(b: Double): Mat = notImplemented0("<=")
+  def ==(b: Double): Mat = notImplemented0("==")
+  def ===(b: Double): Mat = notImplemented0("===")
+  def !=(b: Double): Mat = notImplemented0("!=")
+}
+
+/**
+ * Library-wide settings (compression, threading, index base, CUDA state) plus
+ * array-conversion, binary-search and lexicographic-sort helpers.
+ */
+object Mat {
+  import Ordered._
+  import scala.tools.jline.TerminalFactory
+  
+  var compressType = 1            // 0=none, 1=zlib, 2=szip
+  
+  var compressionLevel = 3        // for zlib
+  
+  var chunkSize = 1024*1024         // for either method
+  
+  var szipBlock = 32              // szip block size
+  
+  var numThreads = 8
+  
+  var noMKL:Boolean = false
+  
+  var nflops = 0L
+  
+  var oneBased = 0
+  
+  var ioneBased = 1
+  
+  // 0 = not probed yet, -1 = CUDA runtime library not found, otherwise the device count.
+  var hasCUDA = 0
+  
+  /**
+   * Probes for CUDA. On the first call the CUDA runtime library is loaded ("cudart"
+   * on Linux, otherwise "cudart64_50_35" with "cudart64_42_9" as fallback); failure
+   * sets hasCUDA to -1. Unless that happened, the device count is queried through
+   * JCuda, stored in hasCUDA and printed together with the runtime version.
+   */
+  def checkCUDA:Unit = {
+    if (hasCUDA == 0) {
+      try {
+        val os = System.getProperty("os.name")
+        if (os.equals("Linux")) {
+          System.loadLibrary("cudart")
+        } else {
+          try {
+            System.loadLibrary("cudart64_50_35")
+          } catch {
+            // Fall back to the older runtime; a failure here reaches the outer handler.
+            case _ => System.loadLibrary("cudart64_42_9")
+          }
+        }
+      } catch {
+        case _ => {
+          println("Cant find CUDA SDK")
+          hasCUDA = -1
+        }
+      }
+    }
+    if (hasCUDA >= 0) {
+      try {
+        // One-element array used as an out-parameter by both JCuda calls.
+        var cudanum = new Array[Int](1)
+        jcuda.runtime.JCuda.cudaGetDeviceCount(cudanum)
+        hasCUDA = cudanum(0)
+        printf("%d CUDA device%s found", hasCUDA, if (hasCUDA == 1) "" else "s")
+        if (hasCUDA > 0) {
+          jcuda.runtime.JCuda.cudaRuntimeGetVersion(cudanum)
+          println(", CUDA version %d.%d" format (cudanum(0)/1000, (cudanum(0)%100) / 10))
+        } else {
+          println("")
+        }
+      } catch {
+        case e:NoClassDefFoundError => println("Couldn't load the JCUDA driver")
+        case e:Exception => println("Exception while initializing JCUDA driver")
+        case _ => println("Something went wrong while loading JCUDA driver")
+      }
+    }
+  }
+  
+  var terminal = TerminalFactory.create
+  
+  /** Terminal width as reported by jline, but never less than 80. */
+  def terminalWidth = math.max(terminal.getWidth,80)
+
+  /** Converts data(i0 until i0+n) with Numeric.toInt into idata starting at d0. */
+  def copyToIntArray[@specialized(Double, Float) T](data:Array[T], i0:Int, idata:Array[Int], d0:Int, n:Int)
+  (implicit numeric : Numeric[T]) = {
+    var i = 0 
+    while (i < n) {
+      idata(i+d0) = numeric.toInt(data(i+i0));
+      i += 1
+    }
+  }
+  
+  /** Converts data(i0 until i0+n) with Numeric.toDouble into ddata starting at d0. */
+  def copyToDoubleArray[@specialized(Int, Float) T](data:Array[T], i0:Int, ddata:Array[Double], d0:Int, n:Int)
+  (implicit numeric : Numeric[T]) = {
+    var i = 0 
+    while (i < n) {
+      ddata(i+d0) = numeric.toDouble(data(i+i0));
+      i += 1
+    }
+  }
+  
+  /** Converts data(i0 until i0+n) with Numeric.toFloat into fdata starting at d0. */
+  def copyToFloatArray[@specialized(Int, Double) T](data:Array[T], i0:Int, fdata:Array[Float], d0:Int, n:Int)
+  (implicit numeric : Numeric[T]) = {
+    var i = 0 
+    while (i < n) {
+      fdata(i+d0) = numeric.toFloat(data(i+i0));
+      i += 1
+    }
+  }
+  
+  /** Writes every element of list `a`, converted to Float, into b(0), b(1), ... */
+  def copyListToFloatArray[T](a:List[T], b:Array[Float])(implicit numeric : Numeric[T]) = {
+    var i = 0
+    val todo = a.iterator
+    // Drive the loop from the iterator: List.length is linear, so testing it on
+    // every pass made the copy quadratic.
+    while (todo.hasNext) {
+      b(i) = numeric.toFloat(todo.next)
+      i += 1
+    }
+  }
+  
+  /**
+   * Binary search for `v` in the ascending slice x(istartp until iendp).
+   * @return an index holding `v`, or -1 if absent or the slice is empty
+   */
+  def ibinsearch(v:Int, x:Array[Int], istartp:Int, iendp:Int):Int = {
+    var istart = istartp
+    var iend = iendp
+    while (iend - istart > 1) {
+      // Overflow-safe midpoint.
+      val mid:Int = istart + (iend - istart)/2
+      if (v < x(mid)) iend = mid else istart = mid
+    }
+    // iend is exclusive: for an empty slice x(istart) lies outside the searched range.
+    if (iend > istart && v == x(istart)) istart else -1
+  }
+
+  /** Generic counterpart of ibinsearch for any element type with an Ordering. */
+  def binsearch[T : Ordering](v:T, x:Array[T], istartp:Int, iendp:Int):Int = {
+    var istart = istartp
+    var iend = iendp
+    while (iend - istart > 1) {
+      // Overflow-safe midpoint.
+      val mid:Int = istart + (iend - istart)/2
+      if (v < x(mid)) iend = mid else istart = mid
+    }
+    // iend is exclusive: for an empty slice x(istart) lies outside the searched range.
+    if (iend > istart && v == x(istart)) istart else -1
+  }
+  
+  /**
+   * Lexicographic sort over parallel key arrays: a(0) is the primary key, a(1) the
+   * next, and so on; remaining ties are broken by original position. The inputs are
+   * left untouched.
+   * @return the permutation of 0 until a(0).length that orders the keys
+   */
+  def lexsort[T :Ordering](a:List[Array[T]]):Array[Int] = {
+    val n = a(0).length
+    val alen = a.length            // hoisted: List.length is linear
+    val ind = new Array[Int](n)
+    var i = 0; while(i < n) {ind(i) = i; i += 1}
+    def comp(i:Int, j:Int):Int = {
+      val ip = ind(i)
+      val jp = ind(j)
+      var c0 = 0
+      var k = 0;
+      while (k < alen && c0 == 0) {
+        c0 = a(k)(ip) compare a(k)(jp)
+        k += 1
+      }
+      if (c0 != 0) {
+        c0
+      } else {
+        ip compare jp
+      }
+    }
+    def swap(i:Int, j:Int):Unit = {
+      val tmp = ind(i)
+      ind(i) = ind(j)
+      ind(j) = tmp
+    }
+    BIDMat.Sorting.quickSort(comp, swap, 0, n)
+    ind
+  }
+
+  /** Int version of lexsort. */
+  def ilexsort(a:List[Array[Int]]):Array[Int] = {
+    val n = a(0).length
+    val alen = a.length            // hoisted: List.length is linear
+    val ind = new Array[Int](n)
+    var i = 0; while(i < n) {ind(i) = i; i += 1}
+    def comp(i:Int, j:Int):Int = {
+      var k = 0;
+      var c0 = 0
+      val ip = ind(i)
+      val jp = ind(j)
+      while (k < alen && c0 == 0) {
+        c0 = a(k)(ip) compare a(k)(jp)
+        k += 1
+      }
+      if (c0 != 0) {
+        c0
+      } else {
+        ip compare jp
+      }
+    }
+    def swap(i:Int, j:Int):Unit = {
+      val tmp = ind(i)
+      ind(i) = ind(j)
+      ind(j) = tmp
+    }
+    BIDMat.Sorting.quickSort(comp, swap, 0, n)
+    ind
+  }
+  
+  /**
+   * Sorts the pairs (a(i), b(i)) lexicographically IN PLACE -- both input arrays are
+   * reordered -- breaking ties by original position.
+   * @return for each sorted position, the original index of the pair now stored there
+   */
+  def ilexsort2(a:Array[Int], b:Array[Int]):Array[Int] = {
+    val n = a.length
+    val ind = new Array[Int](n)
+    var i = 0; while(i < n) {ind(i) = i; i += 1}
+    def comp(i:Int, j:Int):Int = {
+      val c0 = a(i) compare a(j)
+      if (c0 != 0) {
+        c0
+      } else {
+        val c1 = b(i) compare b(j)     
+        if (c1 != 0) {
+          c1
+        } else {
+          ind(i) compare ind(j)
+        }         
+      }
+    }
+    // Keys and index move together, so comp can address all three by position.
+    def swap(i:Int, j:Int):Unit = {
+      val tmpa = a(i)
+      a(i) = a(j)
+      a(j) = tmpa
+      val tmpb = b(i)
+      b(i) = b(j)
+      b(j) = tmpb
+      val tmpi = ind(i)
+      ind(i) = ind(j)
+      ind(j) = tmpi
+    }
+    BIDMat.Sorting.quickSort(comp, swap, 0, n)
+    ind
+  }
+  
+  /** Varargs convenience for ilexsort(List). */
+  def ilexsort(args:Array[Int]*):Array[Int] = {
+    ilexsort(args.toList)
+  }
+
+  /** Varargs convenience for lexsort(List). */
+  def lexsort[T : Ordering](args:Array[T]*):Array[Int] = {
+    lexsort(args.toList)
+  }
+  
+}
diff --git a/src/main/scala/BIDMat/MatFunctions.scala b/src/main/scala/BIDMat/MatFunctions.scala
new file mode 100755
index 00000000..36ec790a
--- /dev/null
+++ b/src/main/scala/BIDMat/MatFunctions.scala
@@ -0,0 +1,657 @@
+package BIDMat
+
+import scala.compat.Platform._ 
+import edu.berkeley.bid.CBLAS._
+import edu.berkeley.bid.LAPACK._
+import scala.actors.Actor._
+
+class IMatWildcard extends IMat(0,0,null) with MatrixWildcard   // 0x0 IMat with null data, tagged MatrixWildcard; instantiated as `?` at the end of MatFunctions
+
+object MatFunctions {
+
+  // Platform.currentTime value recorded by the most recent tic.
+  var currentTimeWasThen:Long = 0
+  
+  // Mat.nflops value recorded by the most recent flip.
+  var lastFlops:Long = 0
+
+  /** Start (or restart) the stopwatch. */
+  def tic = {
+    currentTimeWasThen = currentTime
+  }
+
+  /** Time since the last tic: the currentTime difference divided by 1000. */
+  def toc:Float = {
+    val elapsed = currentTime - currentTimeWasThen
+    elapsed/1000.0f
+  }
+  
+  /** Record the current flop counter and restart the stopwatch. */
+  def flip = {
+    lastFlops = Mat.nflops
+    tic
+  }
+  
+  /** (flop counter delta / toc, toc) since the last flip. */
+  def flop:(Float, Float) = {
+    val elapsed = toc
+    val rate = (Mat.nflops - lastFlops)/elapsed
+    (rate, elapsed)
+  }
+
+  /** Same as flop, with the rate additionally divided by 1e9. */
+  def gflop:(Float, Float) = {
+    val elapsed = toc
+    val rate = (Mat.nflops - lastFlops)/elapsed/1e9f
+    (rate, elapsed)
+  }
+  
+  /** Dimensions of a as (nrows, ncols). */
+  def size(a:Mat):(Int, Int) = (a.nrows, a.ncols)
+    
+  /**
+   * One dimension of a, selected by a 1-based axis number.
+   *
+   * @param n 1 for the row count, 2 for the column count
+   * @throws RuntimeException for any other n
+   */
+  def size(a:Mat, n:Int):Int = n match {
+    case 1 => a.nrows
+    case 2 => a.ncols
+    case _ => throw new RuntimeException("size arg must be 1 or 2")
+  }
+  
+  def length(a:DMat):Int = a.length   // length overloads: function-call form of the matrix's own length member
+
+  def length(a:FMat):Int = a.length
+
+  def length(a:IMat):Int = a.length
+  
+  def nnz(a:DMat):Int = a.nnz   // nnz overloads: function-call form of the matrix's own nnz member
+
+  def nnz(a:FMat):Int = a.nnz
+
+  def nnz(a:IMat):Int = a.nnz
+  
+  def nnz(a:SMat):Int = a.nnz
+
+  def nnz(a:SDMat):Int = a.nnz
+  
+  implicit def flt2FMat(x:Float):FMat = row(x)   // scalar promotion: builds a matrix from the single value via row(...)
+
+  implicit def dbl2FMat(x:Double):FMat = row(x)   // note: a Double is promoted to a single-precision FMat, not a DMat
+
+  implicit def int2IMat(x:Int):IMat = irow(x)
+  
+//  implicit def dbl2CMat(x:Double):CMat = CMat.celem(x.asInstanceOf[Float],0)
+
+  implicit def range2IMat(x:Range):IMat = irow(x)
+  
+  implicit def tuple2IMat(x:Tuple2[Int,Int]):IMat = irow(x._1 until x._2)   // (lo, hi) is read as the half-open range lo until hi
+
+  /** Implicit widening: copies x into a freshly allocated DMat of the same shape. */
+  implicit def fMat2DMat(x:FMat):DMat = {
+    val widened = DMat(x.nrows, x.ncols)
+    Mat.copyToDoubleArray(x.data, 0, widened.data, 0, x.length)
+    widened
+  }
+
+  /** Implicit conversion: copies x into a freshly allocated FMat of the same shape. */
+  implicit def iMat2FMat(x:IMat):FMat = {
+    val converted = FMat(x.nrows, x.ncols)
+    Mat.copyToFloatArray(x.data, 0, converted.data, 0, x.length)
+    converted
+  }
+  
+  def recycleTry(a:Mat, nr:Int, nc:Int, b:FMat, nnz:Int):FMat = recycleTry(a, nr, nc, b:Mat, nnz).asInstanceOf[FMat]   // typed overloads: ascribe b to Mat to select the generic recycleTry, then cast the result to b's static type
+  
+  def recycleTry(a:Mat, nr:Int, nc:Int, b:DMat, nnz:Int):DMat = recycleTry(a, nr, nc, b:Mat, nnz).asInstanceOf[DMat]
+  
+  def recycleTry(a:Mat, nr:Int, nc:Int, b:IMat, nnz:Int):IMat = recycleTry(a, nr, nc, b:Mat, nnz).asInstanceOf[IMat]
+  
+  def recycleTry(a:Mat, nr:Int, nc:Int, b:SMat, nnz:Int):SMat = recycleTry(a, nr, nc, b:Mat, nnz).asInstanceOf[SMat]
+  
+  def recycleTry(a:Mat, nr:Int, nc:Int, b:SDMat, nnz:Int):SDMat = recycleTry(a, nr, nc, b:Mat, nnz).asInstanceOf[SDMat]
+  
+  def recycleTry(a:Mat, nr:Int, nc:Int, b:GMat, nnz:Int):GMat = recycleTry(a, nr, nc, b:Mat, nnz).asInstanceOf[GMat]
+  
+  def recycleTry(a:Mat, nr:Int, nc:Int, b:GIMat, nnz:Int):GIMat = recycleTry(a, nr, nc, b:Mat, nnz).asInstanceOf[GIMat]
+  
+  def recycleTry(a:Mat, nr:Int, nc:Int, b:GSMat, nnz:Int):GSMat = recycleTry(a, nr, nc, b:Mat, nnz).asInstanceOf[GSMat]
+  
+  /**
+   * Obtain an nr x nc output matrix, reusing a when one was supplied.
+   *
+   * When a is null or 0x0 the result comes from b.zeros(nr, nc, nnz); otherwise it
+   * comes from a.recycle(nr, nc, nnz).
+   */
+  def recycleTry(a:Mat, nr:Int, nc:Int, b:Mat, nnz:Int):Mat = {
+    val nothingToReuse = a.asInstanceOf[AnyRef] == null || (a.nrows == 0 && a.ncols == 0)
+    if (nothingToReuse) b.zeros(nr, nc, nnz) else a.recycle(nr, nc, nnz)
+  }
+  
+  def recycleTry(a:Mat, b:FMat):FMat = recycleTry(a, b.nrows, b.ncols, b:Mat, b.nnz).asInstanceOf[FMat]   // two-argument forms: the requested shape and nnz are taken from b
+  
+  def recycleTry(a:Mat, b:DMat):DMat = recycleTry(a, b.nrows, b.ncols, b:Mat, b.nnz).asInstanceOf[DMat]
+  
+  def recycleTry(a:Mat, b:IMat):IMat = recycleTry(a, b.nrows, b.ncols, b:Mat, b.nnz).asInstanceOf[IMat]
+  
+  def recycleTry(a:Mat, b:SMat):SMat = recycleTry(a, b.nrows, b.ncols, b:Mat, b.nnz).asInstanceOf[SMat]
+  
+  def recycleTry(a:Mat, b:SDMat):SDMat = recycleTry(a, b.nrows, b.ncols, b:Mat, b.nnz).asInstanceOf[SDMat]
+  
+  def recycleTry(a:Mat, b:GMat):GMat = recycleTry(a, b.nrows, b.ncols, b:Mat, b.nnz).asInstanceOf[GMat]
+  
+  def recycleTry(a:Mat, b:GIMat):GIMat = recycleTry(a, b.nrows, b.ncols, b:Mat, b.nnz).asInstanceOf[GIMat]
+  
+  def recycleTry(a:Mat, b:GSMat):GSMat = recycleTry(a, b.nrows, b.ncols, b:Mat, b.nnz).asInstanceOf[GSMat]
+  
+  def recycleTry(a:Mat, b:Mat):Mat = recycleTry(a, b.nrows, b.ncols, b, b.nnz)
+  
+  def recycleTry(a:Mat, b:FMat, c:FMat):FMat =    // three-argument forms: each output dim is the max of b's and c's; nnz is taken from b only
+    recycleTry(a, math.max(b.nrows, c.nrows), math.max(b.ncols, c.ncols), b:Mat, b.nnz).asInstanceOf[FMat];
+  
+  def recycleTry(a:Mat, b:DMat, c:DMat):DMat = 
+    recycleTry(a, math.max(b.nrows, c.nrows), math.max(b.ncols, c.ncols), b:Mat, b.nnz).asInstanceOf[DMat];
+    
+  def recycleTry(a:Mat, b:IMat, c:IMat):IMat = 
+    recycleTry(a, math.max(b.nrows, c.nrows), math.max(b.ncols, c.ncols), b:Mat, b.nnz).asInstanceOf[IMat];
+      
+  def recycleTry(a:Mat, b:SMat, c:SMat):SMat = 
+    recycleTry(a, math.max(b.nrows, c.nrows), math.max(b.ncols, c.ncols), b:Mat, b.nnz).asInstanceOf[SMat];
+        
+  def recycleTry(a:Mat, b:SDMat, c:SDMat):SDMat = 
+    recycleTry(a, math.max(b.nrows, c.nrows), math.max(b.ncols, c.ncols), b:Mat, b.nnz).asInstanceOf[SDMat];
+          
+  def recycleTry(a:Mat, b:GMat, c:GMat):GMat = 
+    recycleTry(a, math.max(b.nrows, c.nrows), math.max(b.ncols, c.ncols), b:Mat, b.nnz).asInstanceOf[GMat];
+            
+  def recycleTry(a:Mat, b:GIMat, c:GIMat):GIMat = 
+    recycleTry(a, math.max(b.nrows, c.nrows), math.max(b.ncols, c.ncols), b:Mat, b.nnz).asInstanceOf[GIMat];
+  
+  def recycleTry(a:Mat, b:GSMat, c:GSMat):GSMat = 
+    recycleTry(a, math.max(b.nrows, c.nrows), math.max(b.ncols, c.ncols), b:Mat, b.nnz).asInstanceOf[GSMat];
+  
+  def recycleTry(a:Mat, b:Mat, c:Mat):Mat = recycleTry(a, math.max(b.nrows, c.nrows), math.max(b.ncols, c.ncols), b, b.nnz)
+
+  def find(a:DMat) = a.find   // DMat family: function-call wrappers over the matrix's own methods and DenseMat helpers
+  def find2(a:DMat) = a.find2    
+  def find3(a:DMat) = a.find3
+  def accum(inds:IMat, vals:DMat, nr:Int, nc:Int) = DMat(DenseMat.accum(inds, vals, nr, nc))
+  def accum(inds:IMat, vals:DMat, nr:Int) = DMat(DenseMat.accum(inds, vals, nr, 1))   // nc fixed at 1
+  def sort(a:DMat, ind:Int):DMat = DMat(DenseMat.sort(a, ind, true))   // sort* pass true as the final DenseMat flag, sortdown* pass false
+  def sort(a:DMat):DMat = DMat(DenseMat.sort(a, 0, true))
+  def sort2(a:DMat):(DMat, IMat) = {val (d,i) = DenseMat.sort2(a, true); (DMat(d), i)}   // *2 variants also return the IMat produced by DenseMat.sort2
+  def sort2(a:DMat,dir:Int):(DMat, IMat) = {val (d,i) = DenseMat.sort2(a, dir, true); (DMat(d), i)}
+  def sortdown(a:DMat, ind:Int):DMat = DMat(DenseMat.sort(a, ind, false))
+  def sortdown(a:DMat):DMat = DMat(DenseMat.sort(a, 0, false))
+  def sortdown2(a:DMat):(DMat, IMat) = {val (d,i) = DenseMat.sort2(a, false); (DMat(d), i)}
+  def sortdown2(a:DMat, dir:Int):(DMat, IMat) = {val (d,i) = DenseMat.sort2(a, dir, false); (DMat(d), i)}
+  def sortrows(a:DMat):(DMat, IMat) = { val ii = DenseMat.sortlex(a, true); (a(ii,?), ii) }   // gathers whole rows of a in the order given by sortlex
+  def sortrowsdown(a:DMat):(DMat, IMat) = { val ii = DenseMat.sortlex(a, false); (a(ii,?), ii) }
+  def sortlex(a:DMat):IMat = DenseMat.sortlex(a, true)
+  def sortlexdown(a:DMat):IMat = DenseMat.sortlex(a, false)
+  def uniquerows(a:DMat):(DMat, IMat, IMat) = { val (ii, jj) = DenseMat.uniquerows2(a) ; (a(ii,?), ii, jj)}   // returns rows a(ii,?) plus both index results of uniquerows2
+  /**
+   * Unique values of a, with a treated as a flat vector.
+   *
+   * DenseMat.unique2 is given a itself when a has a single row or column and the
+   * flattened a(?) otherwise, so its first result ii is used here as a linear
+   * element index. The values are therefore gathered with a(ii); the previous
+   * a(ii,?) selected whole rows, which only matched for column vectors.
+   *
+   * @return (a(ii), ii, jj) where (ii, jj) are the two results of DenseMat.unique2
+   */
+  def unique(a:DMat):(DMat, IMat, IMat) = {
+    val (ii, jj) = DenseMat.unique2(if (math.min(a.nrows,a.ncols)==1) a else a(?))
+    (a(ii), ii, jj)
+  }
+
+  def find(a:FMat) = a.find   // FMat family: function-call wrappers over the matrix's own methods and DenseMat helpers
+  def find2(a:FMat) = a.find2    
+  def find3(a:FMat) = a.find3
+  def accum(inds:IMat, vals:FMat, nr:Int, nc:Int) = FMat(DenseMat.accum(inds, vals, nr, nc))
+  def accum(inds:IMat, vals:FMat, nr:Int) = FMat(DenseMat.accum(inds, vals, nr, 1))   // nc fixed at 1
+  def sort(a:FMat, ind:Int):FMat = FMat(DenseMat.sort(a, ind, true))   // sort* pass true as the final DenseMat flag, sortdown* pass false
+  def sort(a:FMat):FMat = FMat(DenseMat.sort(a, 0, true))
+  def sort2(a:FMat):(FMat, IMat) = {val (d,i) = DenseMat.sort2(a, true); (FMat(d), i)}   // *2 variants also return the IMat produced by DenseMat.sort2
+  def sort2(a:FMat,dir:Int):(FMat, IMat) = {val (d,i) = DenseMat.sort2(a, dir, true); (FMat(d), i)}
+  def sortdown(a:FMat, ind:Int):FMat = FMat(DenseMat.sort(a, ind, false))
+  def sortdown(a:FMat):FMat = FMat(DenseMat.sort(a, 0, false))
+  def sortdown2(a:FMat):(FMat, IMat) = {val (d,i) = DenseMat.sort2(a, false); (FMat(d), i)}
+  def sortdown2(a:FMat, dir:Int):(FMat, IMat) = {val (d,i) = DenseMat.sort2(a, dir, false); (FMat(d), i)}
+  def sortrows(a:FMat):(FMat, IMat) = { val ii = DenseMat.sortlex(a, true); (a(ii,?), ii) }   // gathers whole rows of a in the order given by sortlex
+  def sortrowsdown(a:FMat):(FMat, IMat) = { val ii = DenseMat.sortlex(a, false); (a(ii,?), ii) }
+  def sortlex(a:FMat):IMat = DenseMat.sortlex(a, true)
+  def sortlexdown(a:FMat):IMat = DenseMat.sortlex(a, false)
+  def uniquerows(a:FMat):(FMat, IMat, IMat) = { val (ii, jj) = DenseMat.uniquerows2(a) ; (a(ii,?), ii, jj)}   // returns rows a(ii,?) plus both index results of uniquerows2
+  /**
+   * Unique values of a, with a treated as a flat vector.
+   *
+   * DenseMat.unique2 is given a itself when a has a single row or column and the
+   * flattened a(?) otherwise, so its first result ii is used here as a linear
+   * element index. The values are therefore gathered with a(ii); the previous
+   * a(ii,?) selected whole rows, which only matched for column vectors.
+   *
+   * @return (a(ii), ii, jj) where (ii, jj) are the two results of DenseMat.unique2
+   */
+  def unique(a:FMat):(FMat, IMat, IMat) = {
+    val (ii, jj) = DenseMat.unique2(if (math.min(a.nrows,a.ncols)==1) a else a(?))
+    (a(ii), ii, jj)
+  }
+
+  def find(a:IMat) = a.find   // IMat family: function-call wrappers over the matrix's own methods and DenseMat helpers
+  def find2(a:IMat) = a.find2    
+  def find3(a:IMat) = a.find3
+  def accum(inds:IMat, vals:IMat, nr:Int, nc:Int) = IMat(DenseMat.accum(inds, vals, nr, nc))
+  def accum(inds:IMat, vals:IMat, nr:Int) = IMat(DenseMat.accum(inds, vals, nr, 1))   // nc fixed at 1
+  def sort(a:IMat, ind:Int):IMat = IMat(DenseMat.sort(a, ind, true))   // sort* pass true as the final DenseMat flag, sortdown* pass false
+  def sort(a:IMat):IMat = IMat(DenseMat.sort(a, 0, true))
+  def sort2(a:IMat):(IMat, IMat) = {val (d,i) = DenseMat.sort2(a, true); (IMat(d), i)}   // *2 variants also return the IMat produced by DenseMat.sort2
+  def sort2(a:IMat,dir:Int):(IMat, IMat) = {val (d,i) = DenseMat.sort2(a, dir, true); (IMat(d), i)}
+  def sortdown(a:IMat, ind:Int):IMat = IMat(DenseMat.sort(a, ind, false))
+  def sortdown(a:IMat):IMat = IMat(DenseMat.sort(a, 0, false))
+  def sortdown2(a:IMat):(IMat, IMat) = {val (d,i) = DenseMat.sort2(a, false); (IMat(d), i)}
+  def sortdown2(a:IMat, dir:Int):(IMat, IMat) = {val (d,i) = DenseMat.sort2(a, dir, false); (IMat(d), i)}
+  def sortrows(a:IMat):(IMat, IMat) = { val ii = DenseMat.sortlex(a, true); (a(ii,?), ii) }   // gathers whole rows of a in the order given by sortlex
+  def sortrowsdown(a:IMat):(IMat, IMat) = { val ii = DenseMat.sortlex(a, false); (a(ii,?), ii) }
+  def sortlex(a:IMat):IMat = DenseMat.sortlex[Int](a, true)   // element type given explicitly here, unlike the sibling overloads
+  def sortlexdown(a:IMat):IMat = DenseMat.sortlex(a, false)
+  def uniquerows(a:IMat):(IMat, IMat, IMat) = { val (ii, jj) = DenseMat.uniquerows2(a) ; (a(ii,?), ii, jj)}   // returns rows a(ii,?) plus both index results of uniquerows2
+  /**
+   * Unique values of a, with a treated as a flat vector.
+   *
+   * DenseMat.unique2 is given a itself when a has a single row or column and the
+   * flattened a(?) otherwise, so its first result ii is used here as a linear
+   * element index. The values are therefore gathered with a(ii); the previous
+   * a(ii,?) selected whole rows, which only matched for column vectors.
+   *
+   * @return (a(ii), ii, jj) where (ii, jj) are the two results of DenseMat.unique2
+   */
+  def unique(a:IMat):(IMat, IMat, IMat) = {
+    val (ii, jj) = DenseMat.unique2(if (math.min(a.nrows,a.ncols)==1) a else a(?))
+    (a(ii), ii, jj)
+  }
+  
+  def find(a:CSMat) = a.find   // CSMat family: same wrappers as the numeric families, minus accum, the dir-taking sort2 forms and unique
+  def find2(a:CSMat) = a.find2    
+  def find3(a:CSMat) = a.find3
+  def sort(a:CSMat, ind:Int):CSMat = CSMat(DenseMat.sort(a, ind, true))   // sort* pass true as the final DenseMat flag, sortdown* pass false
+  def sort(a:CSMat):CSMat = CSMat(DenseMat.sort(a, 0, true))
+  def sort2(a:CSMat):(CSMat, IMat) = {val (d,i) = DenseMat.sort2(a, true); (CSMat(d), i)}
+  def sortdown(a:CSMat, ind:Int):CSMat = CSMat(DenseMat.sort(a, ind, false))
+  def sortdown(a:CSMat):CSMat = CSMat(DenseMat.sort(a, 0, false))
+  def sortdown2(a:CSMat):(CSMat, IMat) = {val (d,i) = DenseMat.sort2(a, false); (CSMat(d), i)}
+  def sortrows(a:CSMat):(CSMat, IMat) = { val ii = DenseMat.sortlex(a, true); (a(ii,?), ii) }   // gathers whole rows of a in the order given by sortlex
+  def sortrowsdown(a:CSMat):(CSMat, IMat) = { val ii = DenseMat.sortlex(a, false); (a(ii,?), ii) }
+  def sortlex(a:CSMat):IMat = DenseMat.sortlex(a, true)
+  def sortlexdown(a:CSMat):IMat = DenseMat.sortlex(a, false)
+  def uniquerows(a:CSMat):(CSMat, IMat, IMat) = { val (ii, jj) = DenseMat.uniquerows2(a) ; (a(ii,?), ii, jj)}
+  
+  def find(a:SDMat) = a.find   // sparse matrices: only the find wrappers are provided here
+  def find2(a:SDMat) = a.find2    
+  def find3(a:SDMat) = a.find3
+
+  def find(a:SMat) = a.find   
+  def find2(a:SMat) = a.find2    
+  def find3(a:SMat) = a.find3
+  
+  /**
+   * Column-wise inverse of index data: within each column, out(a(r)) = r.
+   *
+   * A matrix with a single row is handled as one column of length a.ncols.
+   * Entries of a are used directly as offsets within their own column.
+   *
+   * @return a new IMat with the same shape as a
+   */
+  def invperm(a:IMat):IMat = {
+    val out = IMat(a.nrows, a.ncols)
+    val (colLen, numCols) = if (a.nrows == 1) (a.ncols, 1) else (a.nrows, a.ncols)
+    var c = 0
+    while (c < numCols) {
+      val base = c*colLen
+      var r = 0
+      while (r < colLen) {
+        out.data(a.data(r + base) + base) = r
+        r += 1
+      }
+      c += 1
+    }
+    out
+  }
+
+  /** 1 x n DMat holding a copy of x. */
+  def drow(x:Array[Double]):DMat = {
+    val n = x.length
+    val result = DMat(1, n)
+    System.arraycopy(x, 0, result.data, 0, n)
+    result
+  }
+
+  /** 1 x n DMat holding the elements of x in order. */
+  def drow(x:List[Double]):DMat = {
+    val result = DMat(1, x.length)
+    x.copyToArray(result.data)
+    result
+  }
+
+  /** Varargs form of drow: goes through the Array overload. */
+  def drow(args:Double*):DMat = {
+    drow(args.toArray)
+  }
+  
+  /** 1 x n DMat holding the values of the range x. */
+  def drow(x:Range):DMat = {
+    val n = x.length
+    val result = DMat(1, n)
+    var k = 0
+    while (k < n) {
+      result.data(k) = x(k)
+      k += 1
+    }
+    result
+  }
+
+  /** n x 1 DMat holding the values of the range x. */
+  def dcol(x:Range):DMat = {
+    val n = x.length
+    val result = DMat(n, 1)
+    var k = 0
+    while (k < n) {
+      result.data(k) = x(k)
+      k += 1
+    }
+    result
+  }
+
+  /** n x 1 DMat holding the elements of x in order. */
+  def dcol(x:List[Double]):DMat = {
+    val result = DMat(x.length, 1)
+    x.copyToArray(result.data)
+    result
+  }
+
+  /** Varargs form of dcol: goes through the List overload. */
+  def dcol(args:Double*):DMat = dcol(args.toList)
+
+  /** nr x nc DMat exactly as returned by DMat(nr,nc). */
+  def dzeros(nr:Int, nc:Int):DMat = DMat(nr,nc)
+
+  /** nr x nc DMat with the first out.length data entries set to 1. */
+  def dones(nr:Int, nc:Int):DMat = {
+    val out = DMat(nr,nc)
+    for (k <- 0 until out.length) {
+      out.data(k) = 1
+    }
+    out
+  }
+
+  /** 1 x n FMat holding a copy of x. */
+  def row(x:Array[Float]):FMat = {
+    val n = x.length
+    val result = FMat(1, n)
+    System.arraycopy(x, 0, result.data, 0, n)
+    result
+  }
+
+  /** 1 x n FMat filled from x through Mat.copyToFloatArray. */
+  def row(x:Array[Double]):FMat = {
+    val n = x.length
+    val result = FMat(1, n)
+    Mat.copyToFloatArray(x, 0, result.data, 0, n)
+    result
+  }
+  
+  /** 1 x n FMat filled from x through Mat.copyToFloatArray. */
+  def row(x:Array[Int]):FMat = {
+    val n = x.length
+    val result = FMat(1, n)
+    Mat.copyToFloatArray(x, 0, result.data, 0, n)
+    result
+  }
+
+  /** 1 x n FMat filled from a list of any Numeric element type. */
+  def row[T](x:List[T])(implicit numeric : Numeric[T]):FMat = {
+    val result = FMat(1, x.length)
+    Mat.copyListToFloatArray(x, result.data)
+    result
+  }
+
+  /** Varargs form of row: goes through the List overload. */
+  def row[T](x:T*)(implicit numeric : Numeric[T]):FMat = {
+    row(x.toList)
+  }
+  
+  /** 1 x n FMat holding the values of the range x. */
+  def row(x:Range):FMat = {
+    val n = x.length
+    val result = FMat(1, n)
+    var k = 0
+    while (k < n) {
+      result.data(k) = x(k)
+      k += 1
+    }
+    result
+  }
+  
+  /** n x 1 FMat holding a copy of x. */
+  def col(x:Array[Float]):FMat = {
+    val n = x.length
+    val result = FMat(n, 1)
+    System.arraycopy(x, 0, result.data, 0, n)
+    result
+  }
+  
+  /** n x 1 FMat filled from x through Mat.copyToFloatArray. */
+  def col(x:Array[Double]):FMat = {
+    val n = x.length
+    val result = FMat(n, 1)
+    Mat.copyToFloatArray(x, 0, result.data, 0, n)
+    result
+  }
+  
+  /** n x 1 FMat filled from x through Mat.copyToFloatArray. */
+  def col(x:Array[Int]):FMat = {
+    val n = x.length
+    val result = FMat(n, 1)
+    Mat.copyToFloatArray(x, 0, result.data, 0, n)
+    result
+  }
+  
+  /** n x 1 FMat filled from a list of any Numeric element type. */
+  def col[T](x:List[T])(implicit numeric : Numeric[T]):FMat = {
+    val result = FMat(x.length, 1)
+    Mat.copyListToFloatArray(x, result.data)
+    result
+  }
+
+  /** Varargs form of col: goes through the List overload. */
+  def col[T](x:T*)(implicit numeric : Numeric[T]):FMat = {
+    col(x.toList)
+  }
+
+  /** n x 1 FMat holding the values of the range x. */
+  def col(x:Range):FMat = {
+    val n = x.length
+    val result = FMat(n, 1)
+    var k = 0
+    while (k < n) {
+      result.data(k) = x(k)
+      k += 1
+    }
+    result
+  }
+
+  /** nr x nc FMat exactly as returned by FMat(nr,nc). */
+  def zeros(nr:Int, nc:Int):FMat = {
+    FMat(nr,nc)
+  }
+
+  /** nr x nc FMat with the first out.length data entries set to 1. */
+  def ones(nr:Int, nc:Int):FMat = {
+    val out = FMat(nr,nc)
+    for (k <- 0 until out.length) {
+      out.data(k) = 1
+    }
+    out
+  }  
+
+  /** 1 x n IMat holding the values of the range x. */
+  def irow(x:Range):IMat = {
+    val n = x.length
+    val result = IMat(1, n)
+    var k = 0
+    while (k < n) {
+      result.data(k) = x(k)
+      k += 1
+    }
+    result
+  }
+  
+  /** Row of the half-open range x._1 until x._2. */
+  def irow(x:Tuple2[Int,Int]):IMat = {
+    val (lo, hi) = x
+    irow(lo until hi)
+  }
+
+  /** 1 x n IMat holding a copy of x. */
+  def irow(x:Array[Int]):IMat = {
+    val n = x.length
+    val result = IMat(1, n)
+    System.arraycopy(x, 0, result.data, 0, n)
+    result
+  }
+
+  /** 1 x n IMat holding the elements of x in order. */
+  def irow(x:List[Int]):IMat = {
+    val result = IMat(1, x.length)
+    x.copyToArray(result.data)
+    result
+  }
+
+  /** Varargs form of irow: goes through the List overload. */
+  def irow(args:Int*):IMat = irow(args.toList)
+
+  /** n x 1 IMat holding the values of the range x. */
+  def icol(x:Range):IMat = {
+    val n = x.length
+    val result = IMat(n, 1)
+    var k = 0
+    while (k < n) {
+      result.data(k) = x(k)
+      k += 1
+    }
+    result
+  }
+  
+  /** Column of the half-open range x._1 until x._2. */
+  def icol(x:Tuple2[Int,Int]):IMat = {
+    val (lo, hi) = x
+    icol(lo until hi)
+  }
+
+  /** n x 1 IMat holding the elements of x in order. */
+  def icol(x:List[Int]):IMat = {
+    val result = IMat(x.length, 1)
+    x.copyToArray(result.data)
+    result
+  }
+
+  /** Varargs form of icol: goes through the List overload. */
+  def icol(args:Int*):IMat = icol(args.toList)
+
+  /** nr x nc IMat exactly as returned by IMat(nr,nc). */
+  def izeros(nr:Int, nc:Int):IMat = IMat(nr,nc)
+
+  /** nr x nc IMat with the first out.length data entries set to 1. */
+  def iones(nr:Int, nc:Int):IMat = {
+    val out = IMat(nr,nc)
+    for (k <- 0 until out.length) {
+      out.data(k) = 1
+    }
+    out
+  }
+  
+  /** 1 x n CSMat holding the strings of x in order. */
+  def crow(x:List[String]):CSMat = {
+    val result = CSMat(1, x.length)
+    x.copyToArray(result.data)
+    result
+  }
+
+  /** Varargs form of crow: goes through the List overload. */
+  def crow(args:String*):CSMat = crow(args.toList)
+  
+  /** n x 1 CSMat holding the strings of x in order. */
+  def ccol(x:List[String]):CSMat = {
+    val result = CSMat(x.length,1)
+    x.copyToArray(result.data)
+    result
+  }
+
+  /** Varargs form of ccol: goes through the List overload. */
+  def ccol(args:String*):CSMat = ccol(args.toList)
+
+  def blank = new Mat(0,0)   // *blank factories: each call builds a new 0x0 matrix of the given type with null storage; recycleTry treats a 0x0 first argument as nothing to reuse
+  
+  def fblank = new FMat(0,0,null)
+  
+  def dblank = new DMat(0,0,null)
+  
+  def cblank = new CMat(0,0,null)
+  
+  def iblank = new IMat(0,0,null)
+  
+  def sblank = new SMat(0,0,0,null,null,null)
+  
+  def sdblank = new SDMat(0,0,0,null,null,null)
+  
+  def gblank = new GMat(0,0,null,0)
+  
+  def giblank = new GIMat(0,0,null,0)
+  
+  def gsblank = new GSMat(0,0,0,null,null,null,0)
+  
+  
+  /**
+   * Build an SDMat from the (row, column, value) triples reported by a.find3.
+   *
+   * Row indices are shifted by Mat.ioneBased; the column pointer array is filled
+   * by SparseMat.compressInds.
+   */
+  def sparse(a:DMat):SDMat = {
+    val (rowIdx, colIdx, vals) = a.find3
+    val count = rowIdx.nrows
+    val out = SDMat(a.nrows, a.ncols, count)
+    val ioff = Mat.ioneBased
+    for (k <- 0 until count) {
+      out.ir(k) = rowIdx.data(k) + ioff
+    }
+    SparseMat.compressInds(colIdx.data, a.ncols, out.jc, a.nnz)
+    System.arraycopy(vals.data, 0, out.data, 0, count)
+    out
+  }    
+
+  /**
+   * Build an SMat from the (row, column, value) triples reported by a.find3.
+   *
+   * Row indices are shifted by Mat.ioneBased; the column pointer array is filled
+   * by SparseMat.compressInds.
+   */
+  def sparse(a:FMat):SMat = {
+    val (rowIdx, colIdx, vals) = a.find3
+    val count = rowIdx.nrows
+    val out = SMat(a.nrows, a.ncols, count)
+    val ioff = Mat.ioneBased
+    for (k <- 0 until count) {
+      out.ir(k) = rowIdx.data(k) + ioff
+    }
+    SparseMat.compressInds(colIdx.data, a.ncols, out.jc, a.nnz)
+    System.arraycopy(vals.data, 0, out.data, 0, count)
+    out
+  }    
+  
+  /** SDMat from index/value triples with explicit dimensions nr x nc. */
+  def sparse(ii:IMat, jj:IMat, vv:DMat, nr:Int, nc:Int):SDMat = {
+    val impl = SparseMat.sparseImpl[Double](ii.data, jj.data, vv.data, nr, nc)
+    SDMat(impl)
+  } 
+  
+  // Max-reduction over an IMat via iiReduceOp; callers below read the scalar with .v
+  def _maxi(a:IMat) = a.iiReduceOp(0, (elem:Int) => elem, (lhs:Int, rhs:Int) => math.max(lhs,rhs), null)
+
+  /** SDMat from index/value triples; each dimension is the largest index plus one. */
+  def sparse(ii:IMat, jj:IMat, vv:DMat):SDMat = {
+    val nr = _maxi(ii).v+1
+    val nc = _maxi(jj).v+1
+    SDMat(SparseMat.sparseImpl[Double](ii.data, jj.data, vv.data, nr, nc))
+  } 
+
+  /** SMat from index/value triples with explicit dimensions nr x nc. */
+  def sparse(ii:IMat, jj:IMat, vv:FMat, nr:Int, nc:Int):SMat = {
+    val impl = SparseMat.sparseImpl[Float](ii.data, jj.data, vv.data, nr, nc)
+    SMat(impl)
+  } 
+
+  /** SMat from index/value triples; each dimension is the largest index plus one. */
+  def sparse(ii:IMat, jj:IMat, vv:FMat):SMat = {
+    val nr = _maxi(ii).v+1
+    val nc = _maxi(jj).v+1
+    SMat(SparseMat.sparseImpl[Float](ii.data, jj.data, vv.data, nr, nc))
+  } 
+
+  /** Already dense: returned as-is (same instance). */
+  def full(a:DMat):DMat = a
+
+  /** Already dense: returned as-is (same instance). */
+  def full(a:FMat):FMat = a
+
+  /** Dense DMat built from sd.full. */
+  def full(sd:SDMat):DMat = DMat(sd.full)
+
+  /** Dense FMat built from ss.full. */
+  def full(ss:SMat):FMat = FMat(ss.full)
+  
+  /**
+   * Runtime-typed form: DMat, FMat and IMat arguments are returned unchanged,
+   * SMat and SDMat go through the typed overloads. There is no default case,
+   * so any other Mat subtype raises scala.MatchError.
+   */
+  def full(a:Mat):Mat = a match {
+    case _:DMat => a
+    case _:FMat => a
+    case _:IMat => a
+    case sm:SMat => full(sm):FMat
+    case sdm:SDMat => full(sdm):DMat
+  }
+  
+  /**
+   * Worker for DDS over columns istart until iend of the pattern matrix c.
+   *
+   * For every stored entry (r, col) of c, writes the dot product of column r of a
+   * with column col of b into the same slot of out, and copies c's row index and
+   * column pointer into out. Short columns (or Mat.noMKL) use an inline loop;
+   * otherwise the product is delegated to sdotxx.
+   *
+   * @param ioff index base subtracted from the entries of c.jc and c.ir
+   */
+  def DDShelper(a:FMat, b:FMat, c:SMat, out:SMat, istart:Int, iend:Int, ioff:Int) = {
+    val depth = a.nrows
+    var col = istart
+    while (col < iend) {
+      val bBase = col*depth
+      val pEnd = c.jc(col+1)-ioff
+      var p = c.jc(col)-ioff
+      while (p < pEnd) {
+        val aBase = (c.ir(p)-ioff)*depth
+        val dot =
+          if (Mat.noMKL || depth < 256) {
+            var acc = 0.0f
+            var k = 0
+            while (k < depth) {
+              acc += a.data(k + aBase) * b.data(k + bBase)
+              k += 1
+            }
+            acc
+          } else {
+            sdotxx(depth, a.data, aBase, b.data, bBase)
+          }
+        out.data(p) = dot
+        out.ir(p) = c.ir(p)
+        p += 1
+      }
+      out.jc(col+1) = c.jc(col+1)
+      col += 1
+    }
+  }
+
+  /**
+   * Evaluate A' * B only at the stored positions of the sparse pattern c.
+   *
+   * The result has c's sparsity structure; each stored value is the dot product of
+   * the matching columns of a and b (see DDShelper). When c.nnz > 100000 and more
+   * than one thread is configured, the columns of c are split into contiguous
+   * slices, one actor per slice.
+   *
+   * Completion is signalled through a CountDownLatch rather than by polling a
+   * shared matrix: the latch makes the workers' writes to out visible to the caller,
+   * and a worker that fails still counts down, so the caller cannot spin forever.
+   * The first worker failure is rethrown here.
+   *
+   * @param omat output candidate handed to SMat.newOrCheckSMat
+   * @throws RuntimeException if the dimensions of a, b and c are inconsistent
+   */
+  def DDS(a:FMat,b:FMat,c:SMat,omat:Mat):SMat = {
+    if (a.nrows != b.nrows) {
+      throw new RuntimeException("nrows of dense A and B must match")
+    } else if (c.nrows != a.ncols || c.ncols != b.ncols) {
+      throw new RuntimeException("dims of C must match A'*B")
+    } else {
+      val out = SMat.newOrCheckSMat(c, omat)     
+      Mat.nflops += 2L * c.nnz * a.nrows
+      val ioff = Mat.ioneBased
+      out.jc(0) = ioff
+      // Read the thread count once so the slice bounds and the latch count agree.
+      val nthreads = Mat.numThreads
+      if (c.nnz > 100000 && nthreads > 1) {
+        val pending = new java.util.concurrent.CountDownLatch(nthreads)
+        val failure = new java.util.concurrent.atomic.AtomicReference[Throwable]()
+        for (i <- 0 until nthreads) {
+          actor {
+            try {
+              // Long arithmetic: i*c.ncols can exceed Int range for very wide c.
+              val istart = (i.toLong*c.ncols/nthreads).toInt
+              val iend = ((i+1).toLong*c.ncols/nthreads).toInt
+              DDShelper(a, b, c, out, istart, iend, ioff)
+            } catch {
+              case e:Throwable => {
+                failure.compareAndSet(null, e)
+                ()
+              }
+            } finally {
+              pending.countDown()
+            }
+          }
+        }
+        pending.await()
+        if (failure.get != null) throw failure.get
+      } else {
+        DDShelper(a, b, c, out, 0, c.ncols, ioff)
+      }
+      out
+    }
+  }
+  
+  def DDS(a:GMat,b:GMat,c:GSMat,omat:Mat):GSMat = GMat.DDS(a,b,c,omat)   // GMat/GSMat overload: forwards unchanged to GMat.DDS
+  
+  def DDS(a:Mat, b:Mat, c:Mat, omat:Mat=null):Mat = {   // runtime dispatch on the dynamic types of (a, b, c); omat defaults to null
+    (a, b, c) match {   // no default case: any other type combination raises scala.MatchError
+      case (a:FMat, b:FMat, c:SMat) => DDS(a, b, c, omat):SMat
+      case (a:GMat, b:GMat, c:GSMat) => GMat.DDS(a, b, c, omat):GSMat
+    }
+  }
+  
+  /**
+   * Like DDS, but each stored value is dsum / max(veps, dsum), where dsum is the
+   * dot product of the matching columns of a and b.
+   *
+   * Always runs on the calling thread and always allocates a new SMat with c's
+   * dimensions and nnz.
+   *
+   * @param veps lower bound applied to the divisor
+   * @throws RuntimeException if the dimensions of a, b and c are inconsistent
+   */
+  def DDSQ(a:FMat,b:FMat,c:SMat, veps:Float):SMat = {
+    if (a.nrows != b.nrows) {
+      throw new RuntimeException("nrows of dense A and B must match")
+    } else if (c.nrows != a.ncols || c.ncols != b.ncols) {
+      throw new RuntimeException("dims of C must match A'*B")
+    } else {
+      val out = SMat(c.nrows,c.ncols,c.nnz)
+      // Widen before multiplying: c.nnz * a.nrows as an Int product wraps past 2^31-1.
+      // NOTE(review): DDS counts 2L * nnz * nrows for the same dot products — confirm which factor is intended.
+      Mat.nflops += 1L * c.nnz * a.nrows
+      val ioff = Mat.ioneBased
+      var i = 0
+      out.jc(0) = ioff
+      while (i < c.ncols) {
+        var j = c.jc(i)-ioff
+        while (j < c.jc(i+1)-ioff) {
+          var dsum = 0.0f
+          var k = 0
+          val a0 = (c.ir(j)-ioff)*a.nrows
+          val b0 = i*a.nrows
+          if (Mat.noMKL) {
+            while (k < a.nrows) {
+              dsum += a.data(k + a0) * b.data(k + b0)
+              k += 1
+            }
+          } else {
+            dsum = sdotxx(a.nrows, a.data, a0, b.data, b0)
+          }
+          out.data(j) = dsum / math.max(veps, dsum)
+          out.ir(j) = c.ir(j)
+          j += 1
+        }
+        out.jc(i+1) = c.jc(i+1)
+        i += 1
+      }
+      out
+    }
+  }
+  
+  def mkdiag(a:DMat) = DMat(a.mkdiag)   // mkdiag/getdiag: call the matrix's own method and re-wrap the result in the argument's concrete type
+  def mkdiag(a:FMat) = FMat(a.mkdiag)
+  def mkdiag(a:IMat) = IMat(a.mkdiag)
+  def mkdiag(a:CMat) = CMat(a.mkdiag)
+
+  def getdiag(a:DMat) = DMat(a.getdiag)
+  def getdiag(a:FMat) = FMat(a.getdiag)
+  def getdiag(a:IMat) = IMat(a.getdiag)
+  def getdiag(a:CMat) = CMat(a.getdiag)  
+
+  def load[T](fname:String, vname:String):T = MatHDF5.hload(fname, vname).asInstanceOf[T]   // unchecked cast: T is erased, so a wrong T surfaces later at the use site
+
+  def load[A,B](fname:String, v1:String, v2:String):(A,B) = {   // loads two variables with one hload call; same unchecked casts as above
+    val a = MatHDF5.hload(fname, List(v1, v2));
+    (a(0).asInstanceOf[A], a(1).asInstanceOf[B])
+  }
+
+  def loadx(fname:String, vnames:String*):List[AnyRef] = MatHDF5.hload(fname, vnames.toList)   // untyped form: returns whatever hload produced
+
+  def saveAsHDF5(fname:String, args:AnyRef*) = MatHDF5.hsaveAsHDF5(fname, args.toList)
+
+  def saveAs(fname:String, args:AnyRef*) = MatHDF5.hsaveAs(fname, args.toList)
+
+  final val ? = new IMatWildcard   // the wildcard index used above as a(ii,?) and a(?)
+}
+
+
diff --git a/src/main/scala/BIDMat/MatHDF5.scala b/src/main/scala/BIDMat/MatHDF5.scala
new file mode 100755
index 00000000..c6fe3f0a
--- /dev/null
+++ b/src/main/scala/BIDMat/MatHDF5.scala
@@ -0,0 +1,510 @@
+package BIDMat
+import ncsa.hdf.hdf5lib.structs._
+import ncsa.hdf.hdf5lib.H5._
+import ncsa.hdf.hdf5lib.HDF5Constants._
+
+object MatHDF5 {
+  var refcount:Long = -1   // NOTE(review): not read or written in the part of this object visible here — confirm where it is used
+
+  /**
+   * Configure chunking and a compression filter on a dataset-creation property list.
+   *
+   * Nothing is changed unless Mat.compressType > 0 and the dataset has more than
+   * 1024 elements. Mat.compressType == 1 selects deflate at Mat.compressionLevel;
+   * any other positive value selects szip with Mat.szipBlock.
+   *
+   * HDF5 filters require a chunked layout, so the chunk shape is set before the
+   * filter in both the 1-D and the 2-D case. (The 2-D branch previously computed
+   * the chunk dims but never passed them to H5Pset_chunk.)
+   *
+   * @param dplist_id dataset-creation property list to modify
+   * @param dims dataset extent; length 1 is treated as 1-D, anything else as 2-D
+   */
+  def setCompressionPlist(dplist_id:Int, dims:Array[Long]) = {
+    // Install the filter selected by Mat.compressType.
+    def setFilter() = {
+      if (Mat.compressType == 1) {
+        H5Pset_deflate(dplist_id, Mat.compressionLevel)
+      } else {
+        H5Pset_szip(dplist_id, H5_SZIP_EC_OPTION_MASK, Mat.szipBlock)
+      }
+    }
+    if (Mat.compressType > 0) {
+      if (dims.length == 1) {
+        if (dims(0) > 1024) {
+          val cdims = new Array[Long](1)
+          cdims(0) = math.max(1, math.min(dims(0), Mat.chunkSize))
+          H5Pset_chunk(dplist_id, 1, cdims)
+          setFilter()
+        }
+      } else {
+        if (dims(0)*dims(1) > 1024) {
+          val cdims = new Array[Long](2)
+          cdims(0) = math.max(1, math.min(dims(0), 1+Mat.chunkSize/dims(1)))
+          cdims(1) = math.max(1, dims(1))
+          H5Pset_chunk(dplist_id, 2, cdims)
+          setFilter()
+        }
+      }
+    }
+  }
+  
+  /**
+   * Read a string attribute of obj_name (resolved relative to id).
+   *
+   * The buffer is one byte longer than the attribute's type size; the bytes are
+   * decoded with the platform default charset and trimmed.
+   */
+  def getStringAttr(id:Int, obj_name:String, attr_name:String):String = { 
+    val attr_id = H5Aopen_by_name(id, obj_name, attr_name, H5P_DEFAULT, H5P_DEFAULT)
+    val attr_type_id = H5Aget_type(attr_id)
+    val nbytes = H5Tget_size(attr_type_id)
+    val raw = new Array[Byte](nbytes + 1)
+    H5Aread(attr_id, attr_type_id, raw)
+    H5Tclose(attr_type_id)
+    H5Aclose(attr_id)
+    val decoded = new String(raw)
+    decoded.trim()
+  }
+
+  /**
+   * Attach a scalar fixed-length string attribute to the object id.
+   *
+   * The string is encoded once with the platform default charset and the HDF5
+   * type size is set to the encoded byte count, so the declared size always
+   * matches the buffer that is written (String.length counts chars, which differs
+   * from the byte count for non-ASCII text).
+   */
+  def putStringAttr(id:Int, attr_name:String, attr_val:String) = { 
+    val bytes = attr_val.getBytes()
+    val space_id = H5Screate(H5S_SCALAR)
+    val memtype_id = H5Tcopy(H5T_FORTRAN_S1) 
+    H5Tset_size(memtype_id, bytes.length) 
+    val attr_id = H5Acreate(id, attr_name, memtype_id, space_id, H5P_DEFAULT, H5P_DEFAULT)
+    H5Awrite(attr_id, memtype_id, bytes)
+    H5Tclose(memtype_id)
+    H5Aclose(attr_id)
+    H5Sclose(space_id)
+  }
+
+  /**
+   * Read an integer attribute of obj_name (resolved relative to id) as a Long.
+   *
+   * The value is read with H5T_NATIVE_LLONG as the memory type, so HDF5 converts
+   * from whatever integer width is stored in the file. Sizing the buffer from the
+   * stored type (size/8) produced a zero-length array for attributes narrower than
+   * 8 bytes. The attribute handle is closed even if the read fails.
+   */
+  def getLongAttr(id:Int, obj_name:String, attr_name:String):Long = { 
+    val attr_id = H5Aopen_by_name(id, obj_name, attr_name, H5P_DEFAULT, H5P_DEFAULT)
+    val lbuf = new Array[Long](1)
+    try {
+      H5Aread(attr_id, H5T_NATIVE_LLONG, lbuf)
+    } finally {
+      H5Aclose(attr_id)
+    }
+    lbuf(0)
+  }
+
+  /** Attach a scalar H5T_NATIVE_INT attribute holding attr_val to the object id. */
+  def putIntAttr(id:Int, attr_name:String, attr_val:Int) = { 
+    val space_id = H5Screate(H5S_SCALAR)
+    val attr_id = H5Acreate(id, attr_name, H5T_NATIVE_INT, space_id, H5P_DEFAULT, H5P_DEFAULT)
+    // One-element buffer carrying the value.
+    val payload = new Array[Int](1)
+    payload(0) = attr_val
+    H5Awrite(attr_id, H5T_NATIVE_INT, payload)
+    H5Aclose(attr_id)
+    H5Sclose(space_id)
+  }
+
+  /** Attach a scalar H5T_NATIVE_LLONG attribute holding attr_val to the object id. */
+  def putLongAttr(id:Int, attr_name:String, attr_val:Long) = { 
+    val space_id = H5Screate(H5S_SCALAR)
+    val attr_id = H5Acreate(id, attr_name, H5T_NATIVE_LLONG, space_id, H5P_DEFAULT, H5P_DEFAULT)
+    // One-element buffer carrying the value.
+    val payload = new Array[Long](1)
+    payload(0) = attr_val
+    H5Awrite(attr_id, H5T_NATIVE_LLONG, payload)
+    H5Aclose(attr_id)
+    H5Sclose(space_id)
+  }
+
+  /** Attach a scalar H5T_NATIVE_UCHAR attribute holding attr_val to the object id. */
+  def putByteAttr(id:Int, attr_name:String, attr_val:Byte) = { 
+    val space_id = H5Screate(H5S_SCALAR)
+    val attr_id = H5Acreate(id, attr_name, H5T_NATIVE_UCHAR, space_id, H5P_DEFAULT, H5P_DEFAULT)
+    // One-element buffer carrying the value.
+    val payload = new Array[Byte](1)
+    payload(0) = attr_val
+    H5Awrite(attr_id, H5T_NATIVE_UCHAR, payload)
+    H5Aclose(attr_id)
+    H5Sclose(space_id)
+  }
+
+  /**
+   * Extent of an open dataset, in the order HDF5 reports it.
+   *
+   * The result always has two slots; a slot HDF5 does not fill stays 0.
+   */
+  def getMatDims(data_id:Int):Array[Long] = { 
+    val space_id = H5Dget_space(data_id)
+    val extent = new Array[Long](2)
+    H5Sget_simple_extent_dims(space_id, extent, null)
+    H5Sclose(space_id)
+    extent
+  }
+
+  /**
+   * Open fname read-only and report the dimensions of the dataset varname.
+   *
+   * The two stored extents are returned swapped, except when the second one is 0,
+   * in which case they are returned in stored order.
+   */
+  def readMatDims(fname:String, varname:String):(Long, Long) = { 
+    val fid = H5Fopen(fname,H5F_ACC_RDONLY,H5P_DEFAULT)
+    val data_id = H5Dopen(fid, varname, H5P_DEFAULT)
+    val dims = getMatDims(data_id)
+    H5Dclose(data_id)
+    H5Fclose(fid)
+    val (d0, d1) = (dims(0), dims(1))
+    if (d1 != 0) (d1, d0) else (d0, d1)
+  }
+
+  /**
+   * Read the dataset varname into a new DenseMat[T].
+   *
+   * The stored datatype must have class h5class and element size dsize. The matrix
+   * is allocated as dims(1) x dims(0) of the stored extent. The dataset and
+   * datatype handles are released in finally blocks, so they are no longer leaked
+   * when the type check (or the read) fails.
+   *
+   * @throws RuntimeException if the stored class or element size does not match
+   */
+  def getDenseMat[T : ClassManifest](fid:Int, varname:String, h5class:Int, dsize:Int):DenseMat[T] = {
+    val data_id = H5Dopen(fid, varname, H5P_DEFAULT)
+    try {
+      val data_type_id = H5Dget_type(data_id)
+      try {
+        val data_class = H5Tget_class(data_type_id)
+        val data_size = H5Tget_size(data_type_id)
+        val dims = getMatDims(data_id)
+        if (data_class != h5class || data_size != dsize) {
+          throw new RuntimeException("Bad class or data size "+data_class+" "+data_size)
+        }
+        val mdata = new DenseMat[T](dims(1).intValue, dims(0).intValue)
+        H5Dread(data_id, data_type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, mdata.data)
+        mdata
+      } finally {
+        H5Tclose(data_type_id)
+      }
+    } finally {
+      H5Dclose(data_id)
+    }
+  }
+
+  /**
+   * Read a dataset of object references into a CSMat.
+   *
+   * Each stored reference is dereferenced, loaded through getMat and cast to
+   * String; the result matrix is dims(1) x dims(0) of the stored extent.
+   */
+  def getCellMat(fid:Int, varname:String):CSMat = {
+    val data_id = H5Dopen(fid, varname, H5P_DEFAULT)
+    val data_type_id = H5Dget_type(data_id)
+    val data_class = H5Tget_class(data_type_id)   // queried but not used below
+    val refSize = H5Tget_size(data_type_id)
+    val dims = getMatDims(data_id)
+    val cells = CSMat(dims(1).intValue, dims(0).intValue)
+    // One refSize-byte buffer per stored reference.
+    val refs = new Array[Array[Byte]]((dims(0)*dims(1)).intValue)
+    var k = 0
+    while (k < refs.length) {
+      refs(k) = new Array[Byte](refSize)
+      k += 1
+    }
+    H5Dread(data_id, data_type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, refs)
+    val obj_type_out = new Array[Int](1)   // not used below
+    obj_type_out(0) = 1
+    k = 0
+    while (k < refs.length) {
+      val item_id = H5Rdereference(fid, H5R_OBJECT, refs(k))
+      cells.data(k) = getMat(item_id, ".").asInstanceOf[String]
+      H5Oclose(item_id)
+      k += 1
+    }
+    H5Tclose(data_type_id)
+    H5Dclose(data_id)
+    cells
+  }
+
+  /**
+   * Read the dataset varname as text: the raw bytes (element size times both
+   * extents) are decoded with the "UTF_16LE" charset name and trimmed.
+   */
+  def getMatString(fid:Int, varname:String):String = {
+    val data_id = H5Dopen(fid, varname, H5P_DEFAULT)
+    val data_type_id = H5Dget_type(data_id)
+    val data_class = H5Tget_class(data_type_id)   // queried but not used below
+    val elemSize = H5Tget_size(data_type_id)
+    val dims = getMatDims(data_id)
+    val extent0 = dims(0).intValue
+    val extent1 = dims(1).intValue
+    val raw = new Array[Byte](elemSize*extent0*extent1)
+    H5Dread(data_id, data_type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, raw)
+    H5Tclose(data_type_id)
+    H5Dclose(data_id)
+    val decoded = new String(raw, "UTF_16LE")
+    decoded.trim()
+  }
+
+  /**
+   * Read a sparse variable stored as a group with "jc", "data" and optionally
+   * "ir" datasets.
+   *
+   * The row count comes from the "MATLAB_sparse" attribute, the column count from
+   * the length of jc minus one, and nnz from the length of data. When "ir" cannot
+   * be opened the matrix is built with SparseMat.noRows. jc and ir are read as
+   * native ints and then passed through addOne.
+   *
+   * Only Exception is swallowed when probing for "ir"; the previous catch-all also
+   * hid fatal errors such as OutOfMemoryError.
+   */
+  def getSparseMat[T](fid:Int, varname:String)(implicit manifest:Manifest[T], numeric:Numeric[T]):SparseMat[T] = {
+    val nrows = getLongAttr(fid, varname, "MATLAB_sparse").intValue
+    val jc_id = H5Dopen(fid, varname+"/jc", H5P_DEFAULT)
+    val ncols = getMatDims(jc_id)(0).intValue - 1
+    val data_id = H5Dopen(fid, varname+"/data", H5P_DEFAULT)
+    val data_type_id = H5Dget_type(data_id)
+    val nnz = getMatDims(data_id)(0).intValue
+    var ir_id = -1
+    try {
+      ir_id = H5Dopen(fid, varname+"/ir", H5P_DEFAULT)
+    } catch {
+      // A missing "ir" dataset is expected for matrices stored without row indices.
+      case _:Exception => {}
+    }
+    val sdata = if (ir_id >= 0) {
+      SparseMat(nrows, ncols, nnz) 
+    } else {
+      SparseMat.noRows(nrows, ncols, nnz)
+    }
+    val convert_ints = H5Tcopy(H5T_NATIVE_INT)
+    H5Dread_int(jc_id, convert_ints, H5S_ALL, H5S_ALL, H5P_DEFAULT, sdata.jc)
+    addOne(sdata.jc)
+    H5Dclose(jc_id)
+    if (ir_id >= 0) {
+      H5Dread_int(ir_id, convert_ints, H5S_ALL, H5S_ALL, H5P_DEFAULT, sdata.ir)
+      addOne(sdata.ir)
+      H5Dclose(ir_id)
+    }
+    H5Tclose(convert_ints)
+    H5Dread(data_id, data_type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, sdata.data)
+    H5Tclose(data_type_id)
+    H5Dclose(data_id)
+    sdata
+  }
+
+  /**
+   * Load the variable varname, choosing the reader from its "MATLAB_class"
+   * attribute and from whether a "MATLAB_sparse" attribute is present.
+   *
+   * Supported: dense and sparse "double" and "single", dense "int32",
+   * sparse "int8", dense "char" (returned as String) and dense "cell".
+   *
+   * @throws RuntimeException if fid is not positive, the class attribute is
+   *         missing, or the class/sparsity combination is unsupported
+   */
+  def getMat(fid:Int, varname:String):AnyRef = {
+    if (!(fid > 0 && H5Aexists_by_name(fid, varname, "MATLAB_class", H5P_DEFAULT))) {
+      throw new RuntimeException("Couldnt find matlab var named "+varname)
+    }
+    // Evaluated only inside the branch for a recognised class.
+    def isSparse = H5Aexists_by_name(fid, varname, "MATLAB_sparse", H5P_DEFAULT)
+    getStringAttr(fid, varname, "MATLAB_class") match {
+      case "double" =>
+        if (isSparse) SDMat(getSparseMat[Double](fid, varname))
+        else DMat(getDenseMat[Double](fid, varname, H5T_FLOAT, 8))
+      case "single" =>
+        if (isSparse) SMat(getSparseMat[Float](fid, varname))
+        else FMat(getDenseMat[Float](fid, varname, H5T_FLOAT, 4))
+      case "int32" =>
+        if (isSparse) throw new RuntimeException("Sparse arrays of ints unsupported")
+        else IMat(getDenseMat[Int](fid, varname, H5T_INTEGER, 4))
+      case "int8" =>
+        if (isSparse) BMat(getSparseMat[Byte](fid, varname))
+        else throw new RuntimeException("Dense arrays of bytes unsupported")
+      case "char" =>
+        if (isSparse) throw new RuntimeException("Sparse arrays of char unsupported")
+        else getMatString(fid, varname)
+      case "cell" =>
+        if (isSparse) throw new RuntimeException("Sparse cell arrays unsupported")
+        else getCellMat(fid, varname)
+      case other =>
+        throw new RuntimeException("Couldnt read storage class "+other)
+    }
+  }
+
+  /**
+   * Write a 512-byte header block at the start of fname.
+   *
+   * Bytes 0-115 hold a space-padded description (truncated to 116 bytes),
+   * byte 125 is 2, bytes 126-127 are 0x49 0x4D, and everything else is zero.
+   *
+   * The copy length is taken from the encoded byte array (not the char count),
+   * missing system properties are rendered as "unknown" instead of throwing, and
+   * the file is closed even if the write fails.
+   */
+  def writeMatHeader(fname:String) = {
+    val sp = new scala.sys.SystemProperties()
+    def prop(key:String):String = sp.getOrElse(key, "unknown")
+    val hstring = "MATLAB 7.3 MAT-file, Platform: "+prop("os.arch")+" "+prop("os.name")+" "+prop("os.version")+ " "+
+      "Created by BIDMat on "+(new java.text.SimpleDateFormat("EEE MMM d HH:mm:ss yyyy")).format(new java.util.Date())+
+      " HDF5 Schema 1.0 ."
+    val hb = hstring.getBytes()
+    val hbytes = new Array[Byte](512)
+    for (i <- 0 until 116) hbytes(i) = 32
+    System.arraycopy(hb, 0, hbytes, 0, math.min(hb.length, 116))
+    val version:Byte = 2
+    hbytes(125) = version
+    hbytes(126) = 0x49
+    hbytes(127) = 0x4D
+    val ff = new java.io.RandomAccessFile(fname,"rws")
+    try {
+      ff.write(hbytes)
+      //		ff.write(emptyHDF5file)
+    } finally {
+      ff.close()
+    }
+  }
+
+  /**
+   * Write the dense matrix `a` as dataset "/aname" of `fid` and tag it with a "MATLAB_class"
+   * attribute.
+   *
+   * The dataspace extents are given as (ncols, nrows) -- presumably because the matrix data is
+   * column-major while HDF5 is row-major; confirm against DenseMat. Compression is not applied
+   * to this dataset (the call is disabled in the original code).
+   *
+   * @param fid      HDF5 location identifier to create the dataset under
+   * @param a        matrix whose `data` array is written
+   * @param aname    dataset name, prefixed with "/"
+   * @param h5class  HDF5 type id used both for the stored and the in-memory element type
+   * @param matclass value stored in the "MATLAB_class" attribute
+   * @return an HDF5 object reference to the new dataset
+   */
+  def putDenseMat[T](fid:Int, a:DenseMat[T], aname:String, h5class:Int, matclass:String):Array[Byte] = {
+    val extents = Array[Long](a.ncols, a.nrows)
+    val space = H5Screate_simple(2, extents, null)
+    val plist = H5Pcreate(H5P_DATASET_CREATE)
+    val dset = H5Dcreate(fid, "/"+aname, h5class, space, H5P_DEFAULT, plist, H5P_DEFAULT)
+    H5Dwrite(dset, h5class, H5S_ALL, H5S_ALL, H5P_DEFAULT, a.data)
+    H5Pclose(plist)
+    putStringAttr(dset, "MATLAB_class", matclass)
+    // The reference must be taken while the dataset is still open.
+    val objref = H5Rcreate(dset, ".", H5R_OBJECT, -1)
+    H5Dclose(dset)
+    H5Sclose(space)
+    objref
+  }
+
+  /**
+   * Create the dataset "0" under `id`, tagged as MATLAB's "canonical empty" value, and return
+   * an object reference to it.
+   *
+   * The dataset is a one-dimensional array of two unsigned 64-bit values. The buffer handed to
+   * H5Dwrite therefore needs two elements: the previous `Array[Long](2)` built a ONE-element
+   * array containing the value 2, which is shorter than the dataspace being written. Two zero
+   * entries are written instead.
+   *
+   * @param id HDF5 group (or file) identifier to create the dataset in
+   * @return an HDF5 object reference to the new dataset
+   */
+  def putEmptyRef(id:Int):Array[Byte] = {
+    val dims = new Array[Long](1)
+    dims(0) = 2
+    // Two zero-valued elements, matching the 2-element dataspace declared above.
+    val tmp = new Array[Long](2)
+    val dmatspace_id = H5Screate_simple(1, dims, null)
+    val dmat_id = H5Dcreate(id, "0", H5T_NATIVE_ULLONG, dmatspace_id, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT)
+    H5Dwrite(dmat_id, H5T_NATIVE_ULLONG, H5S_ALL, H5S_ALL, H5P_DEFAULT, tmp)
+    putStringAttr(dmat_id, "MATLAB_class", "canonical empty")
+    putByteAttr(dmat_id, "MATLAB_empty", 1)
+    val ref = H5Rcreate(dmat_id, ".", H5R_OBJECT, -1)
+    H5Dclose(dmat_id)
+    H5Sclose(dmatspace_id)
+    ref
+  }
+
+  /**
+   * Write the cell array `a` as dataset `varname` of `fid`.
+   *
+   * Every cell is stored through `putMat` inside the "/#refs#" group under a hexadecimal name
+   * taken from the shared `refcount` counter; the dataset itself holds the object references
+   * to those entries and carries the attribute MATLAB_class = "cell".
+   *
+   * A negative `refcount` means the "/#refs#" group does not exist yet: it is then created,
+   * seeded with the canonical empty entry, and the counter is reset to 1.
+   *
+   * @return an HDF5 object reference to the reference dataset
+   */
+  def putCellMat(fid:Int, varname:String, a:CSMat) = {
+    val group_id =
+      if (refcount < 0) {
+        val created = H5Gcreate(fid, "/#refs#", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT)
+        putEmptyRef(created)
+        refcount = 1
+        created
+      } else {
+        H5Gopen(fid, "/#refs#", H5P_DEFAULT)
+      }
+    val refIds = new Array[Array[Byte]](a.length)
+    var k = 0
+    while (k < refIds.length) {
+      // The name is fixed before the counter advances; nested cells advance it further.
+      val entryName = "%x" format refcount
+      refcount += 1
+      refIds(k) = putMat(group_id, a.data(k), entryName)
+      k += 1
+    }
+    val extents = Array[Long](a.ncols, a.nrows)
+    val plist = H5Pcreate(H5P_DATASET_CREATE)
+    setCompressionPlist(plist, extents)
+    val space = H5Screate_simple(2, extents, null)
+    val dset = H5Dcreate(fid, varname, H5T_STD_REF_OBJ, space, H5P_DEFAULT, plist, H5P_DEFAULT)
+    H5Dwrite(dset, H5T_STD_REF_OBJ, H5S_ALL, H5S_ALL, H5P_DEFAULT, refIds)
+    putStringAttr(dset, "MATLAB_class", "cell")
+    val objref = H5Rcreate(dset, ".", H5R_OBJECT, -1)
+    H5Dclose(dset)
+    H5Sclose(space)
+    H5Pclose(plist)
+    H5Gclose(group_id)
+    objref
+  }
+
+  /**
+   * Write the sparse matrix `a` as group "/varname" of `fid`, holding the datasets "jc",
+   * "ir" (only when `a.ir` is non-null) and "data".
+   *
+   * The group carries the attributes MATLAB_class = `className` and MATLAB_sparse = `a.nrows`.
+   * Index arrays are stored as 64-bit integers, converted by HDF5 from the native ints held in
+   * memory. While an index array is being written it is temporarily shifted by `subOne` and
+   * shifted back by `addOne`.
+   *
+   * Changes from the previous version: the shift back now sits in a `finally`, so it happens
+   * on every exit path; only `Exception`s are wrapped (fatal `Error`s propagate untouched);
+   * and the wrapping RuntimeException keeps the original exception as its cause.
+   *
+   * @param nativeClass HDF5 type id of the value array
+   * @param className   value stored in the "MATLAB_class" attribute
+   * @return an HDF5 object reference to the new group
+   * @throws RuntimeException if writing one of the index datasets fails
+   */
+  def putSparseMat[T](fid:Int, a:SparseMat[T], varname:String, nativeClass:Int, className:String):Array[Byte] = {
+    val dims = new Array[Long](1)
+    val group_id = H5Gcreate(fid, "/"+varname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT)
+    putStringAttr(group_id, "MATLAB_class", className)
+    putLongAttr(group_id, "MATLAB_sparse", a.nrows)
+    val convert_ints = H5Tcopy(H5T_NATIVE_INT)
+
+    // Writes one index array as dataset `name`, using the extent currently held in `dims`.
+    def putIndices(name:String, indices:Array[Int], plist_id:Int) {
+      val space_id = H5Screate_simple(1, dims, null)
+      val dset_id = H5Dcreate(group_id, name, H5T_NATIVE_LLONG, space_id, H5P_DEFAULT, plist_id, H5P_DEFAULT)
+      subOne(indices)
+      try {
+        H5Dwrite(dset_id, convert_ints, H5S_ALL, H5S_ALL, H5P_DEFAULT, indices)
+      } catch {
+        case e:Exception => throw new RuntimeException("Error writing sparse mat "+e, e)
+      } finally {
+        // The caller's array must never be left in its shifted state.
+        addOne(indices)
+      }
+      H5Dclose(dset_id)
+      H5Sclose(space_id)
+    }
+
+    dims(0) = a.ncols + 1
+    val jc_plist_id = H5Pcreate(H5P_DATASET_CREATE)
+    setCompressionPlist(jc_plist_id, dims)
+    putIndices("jc", a.jc, jc_plist_id)
+    H5Pclose(jc_plist_id)
+
+    // The row indices and the values share one extent and one creation property list.
+    dims(0) = a.nnz
+    val dplist_id = H5Pcreate(H5P_DATASET_CREATE)
+    setCompressionPlist(dplist_id, dims)
+    if (a.ir != null) {
+      putIndices("ir", a.ir, dplist_id)
+    }
+
+    val dataspace_id = H5Screate_simple(1, dims, null)
+    val data_id = H5Dcreate(group_id, "data", nativeClass, dataspace_id, H5P_DEFAULT, dplist_id, H5P_DEFAULT)
+    H5Dwrite(data_id, nativeClass, H5S_ALL, H5S_ALL, H5P_DEFAULT, a.data)
+    H5Dclose(data_id)
+    H5Sclose(dataspace_id)
+    H5Pclose(dplist_id)
+    H5Tclose(convert_ints)
+    val ref = H5Rcreate(group_id, ".", H5R_OBJECT, -1)
+    H5Gclose(group_id)
+    ref
+  }
+
+  /**
+   * Write `str` as dataset `varname` of `id`, in the form of a MATLAB char array.
+   *
+   * The characters are encoded as UTF-16LE and stored as unsigned 16-bit values in a
+   * (str.length x 1) dataspace. The dataset is tagged with MATLAB_class = "char" and
+   * MATLAB_int_decode = 2.
+   *
+   * @return an HDF5 object reference to the new dataset
+   */
+  def putMatString(id:Int, varname:String, str:String):Array[Byte] = {
+    val extents = Array[Long](str.length, 1)
+    val plist = H5Pcreate(H5P_DATASET_CREATE)
+    setCompressionPlist(plist, extents)
+    val encoded = str.getBytes("UTF_16LE")
+    val space = H5Screate_simple(2, extents, null)
+    val dset = H5Dcreate(id, varname, H5T_NATIVE_USHORT, space, H5P_DEFAULT, plist, H5P_DEFAULT)
+    putStringAttr(dset, "MATLAB_class", "char")
+    putIntAttr(dset, "MATLAB_int_decode", 2)
+    H5Dwrite(dset, H5T_NATIVE_USHORT, H5S_ALL, H5S_ALL, H5P_DEFAULT, encoded)
+    val objref = H5Rcreate(dset, ".", H5R_OBJECT, -1)
+    H5Dclose(dset)
+    H5Sclose(space)
+    H5Pclose(plist)
+    objref
+  }
+
+  /**
+   * Write `a` under the name `aname` at HDF5 location `fid`, choosing the writer from the
+   * run-time type of `a`.
+   *
+   * @return the HDF5 object reference returned by the selected writer
+   * @throws RuntimeException if `a` is none of the types listed below
+   */
+  def putMat(fid:Int, a:AnyRef, aname:String):Array[Byte] = { 
+	a match { 
+	  // Dense matrices: HDF5 native element type plus the MATLAB class name.
+	  case aa:DMat => putDenseMat[Double](fid, aa, aname, H5T_NATIVE_DOUBLE, "double")
+	  case aa:FMat => putDenseMat[Float](fid, aa, aname, H5T_NATIVE_FLOAT, "single")
+	  case aa:IMat => putDenseMat[Int](fid, aa, aname, H5T_NATIVE_INT, "int32")
+	  // Sparse matrices.
+	  case aa:BMat => putSparseMat[Byte](fid, aa, aname, H5T_NATIVE_CHAR, "int8")
+	  case aa:SMat => putSparseMat[Float](fid, aa, aname, H5T_NATIVE_FLOAT, "single")
+	  case aa:SDMat => putSparseMat[Double](fid, aa, aname, H5T_NATIVE_DOUBLE, "double")
+	  // Cell arrays and plain strings.
+	  case aa:CSMat => putCellMat(fid, aname, aa)
+	  case aa:String => putMatString(fid, aname, aa)
+	  case _ => throw new RuntimeException("unsupported matrix type to save")
+	}
+  }
+
+  /**
+   * Open `fname` read-only through the stdio file driver and load the variable `vname`.
+   *
+   * `getMat` throws when the variable is missing or has an unsupported class; the file and the
+   * access property list are now released on those paths too (previously the file id leaked).
+   *
+   * @param fname path of the HDF5 / MAT 7.3 file
+   * @param vname name of the variable to read
+   * @return the value produced by `getMat`
+   */
+  def hload(fname:String, vname:String):AnyRef = {
+    val fapl = H5Pcreate(H5P_FILE_ACCESS)
+    val fid = try {
+      H5Pset_fapl_stdio(fapl)
+      H5Fopen(fname,H5F_ACC_RDONLY,fapl)
+    } finally {
+      H5Pclose(fapl)
+    }
+    try {
+      getMat(fid, vname)
+    } finally {
+      H5Fclose(fid)
+    }
+  }
+
+  /**
+   * Open `fname` read-only through the stdio file driver and load every variable named in
+   * `vnames`, in order.
+   *
+   * `getMat` throws when a variable is missing or has an unsupported class; the file and the
+   * access property list are now released on those paths too (previously the file id leaked).
+   *
+   * @param fname  path of the HDF5 / MAT 7.3 file
+   * @param vnames names of the variables to read
+   * @return the loaded values, in the order of `vnames`
+   */
+  def hload(fname:String, vnames:List[String]):List[AnyRef] = {
+    val fapl = H5Pcreate(H5P_FILE_ACCESS)
+    val fid = try {
+      H5Pset_fapl_stdio(fapl)
+      H5Fopen(fname,H5F_ACC_RDONLY,fapl)
+    } finally {
+      H5Pclose(fapl)
+    }
+    try {
+      // List.map is strict, so every read completes before the file is closed.
+      vnames.map((vname) => getMat(fid, vname))
+    } finally {
+      H5Fclose(fid)
+    }
+  }
+
+  /**
+   * Create (truncating) the plain HDF5 file `fname` and write the matrix/name pairs in `args`.
+   *
+   * `refcount` is reset to -1 first, so the "/#refs#" group is created on demand.
+   * `saveAsImpl` throws on malformed arguments; the file is now closed on that path as well
+   * (previously the file id leaked) and the original failure is rethrown.
+   *
+   * @param args alternating values and variable names, as expected by `saveAsImpl`
+   * @return the result of the final `H5Fclose`
+   */
+  def hsaveAsHDF5(fname:String, args:List[AnyRef]) = {
+    refcount = -1
+    val fapl_id = H5Pcreate (H5P_FILE_ACCESS)
+    val fid = try {
+      H5Pset_fapl_stdio(fapl_id)
+      H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id)
+    } finally {
+      H5Pclose(fapl_id)
+    }
+    try {
+      saveAsImpl(fid, args)
+    } catch {
+      case e:Throwable => {
+        // Release the file first; a secondary close failure must not mask `e`.
+        try { H5Fclose(fid) } catch { case _:Exception => () }
+        throw e
+      }
+    }
+    H5Fclose(fid)
+  }
+
+  /**
+   * Create (truncating) the MATLAB 7.3 file `fname` and write the matrix/name pairs in `args`.
+   *
+   * The HDF5 file is created with a 512-byte user block, which `writeMatHeader` fills with the
+   * MAT-file header once the HDF5 file has been closed. `refcount` is reset to -1 first, so
+   * the "/#refs#" group is created on demand.
+   *
+   * `saveAsImpl` throws on malformed arguments; the file and both property lists are now
+   * released on failure paths too (previously the file id leaked). As before, the header is
+   * only written after a successful save.
+   *
+   * @param args alternating values and variable names, as expected by `saveAsImpl`
+   */
+  def hsaveAs(fname:String, args:List[AnyRef]) = {
+    refcount = -1
+    val fapl_id = H5Pcreate (H5P_FILE_ACCESS)
+    val fid = try {
+      H5Pset_fapl_stdio(fapl_id)
+      val fcplist_id = H5Pcreate(H5P_FILE_CREATE)
+      try {
+        H5Pset_userblock(fcplist_id, 512)
+        H5Fcreate(fname, H5F_ACC_TRUNC, fcplist_id, fapl_id)
+      } finally {
+        H5Pclose(fcplist_id)
+      }
+    } finally {
+      H5Pclose(fapl_id)
+    }
+    try {
+      saveAsImpl(fid, args)
+    } finally {
+      H5Fclose(fid)
+    }
+    writeMatHeader(fname)
+  }
+
+  /**
+   * Write each (matrix, name) pair of `argList` to `fid` through `putMat`.
+   *
+   * `argList` must alternate a `Mat` with the `String` name to store it under. Pairs are
+   * written in order, so pairs preceding a malformed entry have already been written when the
+   * exception is raised.
+   *
+   * The list is now consumed from the head instead of being indexed (indexing a `List` is
+   * linear per access), and a trailing matrix with no name is reported with the same
+   * RuntimeException as a non-String name instead of an IndexOutOfBoundsException.
+   *
+   * @throws RuntimeException if an even-positioned entry is not a Mat or the entry following
+   *         a Mat is missing or not a String
+   */
+  def saveAsImpl(fid:Int, argList:List[AnyRef]) = {
+    var rest = argList
+    while (!rest.isEmpty) {
+      rest match {
+        case (a:Mat) :: (str:String) :: tail => {
+          putMat(fid, a, str)
+          rest = tail
+        }
+        case (_:Mat) :: _ => throw new RuntimeException("odd numbered args must be String variable names")
+        case _ => throw new RuntimeException("even numbered args must be Mat variables")
+      }
+    }
+  }
+
+  /**
+   * Increment every element of `ii` in place when `Mat.ioneBased` is 1; otherwise leave the
+   * array untouched. Inverse of `subOne`.
+   */
+  def addOne(ii:Array[Int]) = {
+    if (Mat.ioneBased == 1) {
+      for (k <- 0 until ii.length) ii(k) += 1
+    }
+  }
+
+  /**
+   * Decrement every element of `ii` in place when `Mat.ioneBased` is 1; otherwise leave the
+   * array untouched. Inverse of `addOne`.
+   */
+  def subOne(ii:Array[Int]) = {
+    if (Mat.ioneBased == 1) {
+      for (k <- 0 until ii.length) ii(k) -= 1
+    }
+  }
+}
diff --git a/src/main/scala/BIDMat/MySorting.scala b/src/main/scala/BIDMat/MySorting.scala
new file mode 100755
index 00000000..e881fe11
--- /dev/null
+++ b/src/main/scala/BIDMat/MySorting.scala
@@ -0,0 +1,497 @@
+package BIDMat
+
+import scala.reflect.ClassManifest
+import scala.math.Ordering
+import scala.actors.Actor._
+
+object Sorting {
+  
+  /**
+   * Sort the keys `ga` together with their companion indices `ii`, dispatching on the
+   * run-time element type of `ga` to the Float, Double or Int implementation and allowing it
+   * `Mat.numThreads/2` as its thread budget.
+   *
+   * @param lo     first position to sort
+   * @param hi     end position (exclusive), `lo` plus a whole number of strides
+   * @param stride distance between consecutive elements
+   * @throws RuntimeException if `ga` is not an array of Float, Double or Int (this used to
+   *         surface as a bare MatchError)
+   */
+  def quickSort2[T](ga:Array[T], ii:Array[Int], lo:Int, hi:Int, stride:Int):Unit = {
+    ga match {
+      case a:Array[Float] => quickSort2(a, ii, lo, hi, stride, Mat.numThreads/2)
+      case a:Array[Double] => quickSort2(a, ii, lo, hi, stride, Mat.numThreads/2)
+      case a:Array[Int] => quickSort2(a, ii, lo, hi, stride, Mat.numThreads/2)
+      case _ => {
+        val what = if (ga == null) "null" else ga.asInstanceOf[AnyRef].getClass.getName
+        throw new RuntimeException("quickSort2 is not implemented for "+what)
+      }
+    }
+  }
+  
+  /**
+   * Quicksort of the Float keys `a` with companion indices `ii` over positions
+   * lo, lo+stride, ... up to (excluding) `hi`. Ties between equal keys are broken by `ii`.
+   *
+   * Ranges of at most 16 elements are handed to `isort`. Larger ranges are partitioned; when
+   * `nthreads` > 1 and the range holds more than 400 elements the two halves are sorted
+   * concurrently, otherwise sequentially. The thread budget is halved at every level.
+   *
+   * The concurrent branch used to start two actors and spin on two plain captured flags. Those
+   * flags were not volatile, so nothing guaranteed the waiting thread would ever see them set,
+   * and an exception inside an actor left the caller spinning forever. One half now runs on a
+   * dedicated thread while the other runs on the calling thread; `join()` provides the
+   * required visibility, and a failure in the helper thread is rethrown to the caller.
+   */
+  def quickSort2(a:Array[Float], ii:Array[Int], lo:Int, hi:Int, stride:Int, nthreads:Int):Unit = {
+    val nvals = (hi - lo)/stride
+    if (nvals > 0) {
+      if (nvals <= 16) {
+        isort(a, ii, lo, hi, stride)
+      } else {
+        val ip = partition(a, ii, lo, hi, stride)
+        if (nthreads > 1 && nvals > 400) {
+          // Written by the helper thread only; read after join(), which orders the accesses.
+          var failure:Throwable = null
+          val helper = new Thread(new Runnable {
+            def run() {
+              try {
+                quickSort2(a, ii, lo, ip, stride, nthreads/2)
+              } catch {
+                case t:Throwable => failure = t
+              }
+            }
+          })
+          helper.start()
+          try {
+            quickSort2(a, ii, ip, hi, stride, nthreads/2)
+          } finally {
+            helper.join()
+          }
+          if (failure != null) throw failure
+        } else {
+          quickSort2(a, ii, lo, ip, stride, nthreads/2)
+          quickSort2(a, ii, ip, hi, stride, nthreads/2)
+        }
+      }
+    }
+  }
+  
+  /**
+   * Selection sort of the Float keys `a` and their companion indices `ii` over positions
+   * lo, lo+stride, ... up to (excluding) `hi`, in ascending key order with equal keys ordered
+   * by ascending `ii`.
+   *
+   * Both loops stop on `!= hi`, so `hi` must be `lo` plus a whole number of strides.
+   */
+  def isort(a:Array[Float], ii:Array[Int], lo:Int, hi:Int, stride:Int):Unit = {
+    var slot = lo
+    while (slot != hi) {
+      var best = slot
+      var bestVal = a(slot)
+      var probe = slot + stride
+      while (probe != hi) {
+        val cand = a(probe)
+        // Strictly smaller key, or the same key carrying a smaller companion index.
+        if (cand < bestVal || (cand == bestVal && ii(probe) < ii(best))) {
+          best = probe
+          bestVal = cand
+        }
+        probe += stride
+      }
+      // Exchange the minimum of the remaining range with the element occupying `slot`.
+      a(best) = a(slot)
+      a(slot) = bestVal
+      val movedIdx = ii(best)
+      ii(best) = ii(slot)
+      ii(slot) = movedIdx
+      slot += stride
+    }
+  }
+  
+  /**
+   * Draw three random positions (with replacement) from lo, lo+stride, ... below `hi` and
+   * return the position holding the median of the three, comparing (key, companion index)
+   * pairs lexicographically.
+   */
+  def med3(a:Array[Float], ii:Array[Int], lo:Int, hi:Int, stride:Int):Int = {
+    val count = (hi - lo)/stride
+    def draw():Int = lo + stride*(math.floor(count*java.lang.Math.random()).asInstanceOf[Int])
+    val p1 = draw()
+    val p2 = draw()
+    val p3 = draw()
+    val x1 = a(p1)
+    val x2 = a(p2)
+    val x3 = a(p3)
+    val k1 = ii(p1)
+    val k2 = ii(p2)
+    val k3 = ii(p3)
+    // True when (xa, ka) sorts strictly after (xb, kb).
+    def after(xa:Float, ka:Int, xb:Float, kb:Int):Boolean = (xa >= xb) && ((xa > xb) || ka > kb)
+    if (after(x2, k2, x1, k1)) {
+      if (after(x3, k3, x2, k2)) p2
+      else if (after(x3, k3, x1, k1)) p3
+      else p1
+    } else {
+      if (after(x3, k3, x1, k1)) p1
+      else if (after(x3, k3, x2, k2)) p3
+      else p2
+    }
+  }
+  
+  /**
+   * Call `med3` three times on the same range and return the position holding the median of
+   * the three candidates, comparing (key, companion index) pairs lexicographically.
+   */
+  def med9(a:Array[Float], ii:Array[Int], lo:Int, hi:Int, stride:Int):Int = {
+    val p1 = med3(a, ii, lo, hi, stride)
+    val p2 = med3(a, ii, lo, hi, stride)
+    val p3 = med3(a, ii, lo, hi, stride)
+    val x1 = a(p1)
+    val x2 = a(p2)
+    val x3 = a(p3)
+    val k1 = ii(p1)
+    val k2 = ii(p2)
+    val k3 = ii(p3)
+    // True when (xa, ka) sorts strictly after (xb, kb).
+    def after(xa:Float, ka:Int, xb:Float, kb:Int):Boolean = (xa >= xb) && ((xa > xb) || ka > kb)
+    if (after(x2, k2, x1, k1)) {
+      if (after(x3, k3, x2, k2)) p2
+      else if (after(x3, k3, x1, k1)) p3
+      else p1
+    } else {
+      if (after(x3, k3, x1, k1)) p1
+      else if (after(x3, k3, x2, k2)) p3
+      else p2
+    }
+  }
+  
+  /**
+   * Partition positions lo, lo+stride, ... below `hi` of the Float keys `a` (and companion
+   * indices `ii`) around a randomly chosen pivot and return the split position.
+   *
+   * Elements are compared as (key, companion index) pairs. The returned value is `j + stride`
+   * for the final right cursor `j`; the caller sorts [lo, result) and [result, hi) separately.
+   *
+   * NOTE(review): when the pivot is the largest pair of the range the result equals `hi`, so
+   * the caller recurses on an unchanged range. With distinct pairs a later random pivot ends
+   * this, but a range made only of identical (key, index) pairs looks like it would never
+   * shrink -- confirm that callers always pass distinct companion indices.
+   */
+  def partition(a:Array[Float], ii:Array[Int], lo:Int, hi:Int, stride:Int):Int = {
+      val sstride = math.signum(stride)
+      val nvals = (hi - lo)/stride
+      // Pivot choice grows more careful with the range size: med9, med3, or one random element.
+  		val im = if (nvals > 600) {
+  			med9(a, ii, lo, hi, stride)	  
+  		} else if (nvals > 100) {
+  		  med3(a, ii, lo, hi, stride)
+  		} else {
+  		  lo + stride*(math.floor(nvals*java.lang.Math.random()).asInstanceOf[Int])
+  		}
+  		// The pivot is copied out because the swaps below may move the element at `im`.
+  		var v = a(im)
+  		var iv = ii(im)
+  		var done = false
+  		var i = lo - stride
+  		var j = hi 
+  		while (! done) { 
+  			i += stride
+  			j -= stride
+  			// Left cursor: skip pairs <= pivot, but never step past the last element of the range.
+  			while ((hi-i)*sstride > sstride*stride && ((a(i) <= v) && ((a(i) < v) || ii(i) <= iv))) {i += stride}
+  			// Right cursor: skip pairs > pivot. It has no explicit bound check.
+  			while (                                   ((a(j) >= v) && ((a(j) > v) || ii(j) > iv)))  {j -= stride}
+  			// Cursors met or crossed (taking the sign of the stride into account): finished.
+  			if ((i - j)*sstride >= 0) {
+  				done = true
+  			} else {
+  				val atmp = a(i)
+  				a(i) = a(j)
+  				a(j) = atmp
+  				val itmp = ii(i)
+  				ii(i) = ii(j)
+  				ii(j) = itmp
+  			}
+  		}
+  		j + stride
+   }
+    
+  /**
+   * Quicksort of the Double keys `a` with companion indices `ii` over positions
+   * lo, lo+stride, ... up to (excluding) `hi`. Ties between equal keys are broken by `ii`.
+   *
+   * Ranges of at most 16 elements are handed to `isort`. Larger ranges are partitioned; when
+   * `nthreads` > 1 and the range holds more than 400 elements the two halves are sorted
+   * concurrently, otherwise sequentially. The thread budget is halved at every level.
+   *
+   * The concurrent branch used to start two actors and spin on two plain captured flags. Those
+   * flags were not volatile, so nothing guaranteed the waiting thread would ever see them set,
+   * and an exception inside an actor left the caller spinning forever. One half now runs on a
+   * dedicated thread while the other runs on the calling thread; `join()` provides the
+   * required visibility, and a failure in the helper thread is rethrown to the caller.
+   */
+  def quickSort2(a:Array[Double], ii:Array[Int], lo:Int, hi:Int, stride:Int, nthreads:Int):Unit = {
+    val nvals = (hi - lo)/stride
+    if (nvals > 0) {
+      if (nvals <= 16) {
+        isort(a, ii, lo, hi, stride)
+      } else {
+        val ip = partition(a, ii, lo, hi, stride)
+        if (nthreads > 1 && nvals > 400) {
+          // Written by the helper thread only; read after join(), which orders the accesses.
+          var failure:Throwable = null
+          val helper = new Thread(new Runnable {
+            def run() {
+              try {
+                quickSort2(a, ii, lo, ip, stride, nthreads/2)
+              } catch {
+                case t:Throwable => failure = t
+              }
+            }
+          })
+          helper.start()
+          try {
+            quickSort2(a, ii, ip, hi, stride, nthreads/2)
+          } finally {
+            helper.join()
+          }
+          if (failure != null) throw failure
+        } else {
+          quickSort2(a, ii, lo, ip, stride, nthreads/2)
+          quickSort2(a, ii, ip, hi, stride, nthreads/2)
+        }
+      }
+    }
+  }
+  
+  /**
+   * Selection sort of the Double keys `a` and their companion indices `ii` over positions
+   * lo, lo+stride, ... up to (excluding) `hi`, in ascending key order with equal keys ordered
+   * by ascending `ii`.
+   *
+   * Both loops stop on `!= hi`, so `hi` must be `lo` plus a whole number of strides.
+   */
+  def isort(a:Array[Double], ii:Array[Int], lo:Int, hi:Int, stride:Int):Unit = {
+    var slot = lo
+    while (slot != hi) {
+      var best = slot
+      var bestVal = a(slot)
+      var probe = slot + stride
+      while (probe != hi) {
+        val cand = a(probe)
+        // Strictly smaller key, or the same key carrying a smaller companion index.
+        if (cand < bestVal || (cand == bestVal && ii(probe) < ii(best))) {
+          best = probe
+          bestVal = cand
+        }
+        probe += stride
+      }
+      // Exchange the minimum of the remaining range with the element occupying `slot`.
+      a(best) = a(slot)
+      a(slot) = bestVal
+      val movedIdx = ii(best)
+      ii(best) = ii(slot)
+      ii(slot) = movedIdx
+      slot += stride
+    }
+  }
+  
+  /**
+   * Draw three random positions (with replacement) from lo, lo+stride, ... below `hi` and
+   * return the position holding the median of the three, comparing (key, companion index)
+   * pairs lexicographically.
+   */
+  def med3(a:Array[Double], ii:Array[Int], lo:Int, hi:Int, stride:Int):Int = {
+    val count = (hi - lo)/stride
+    def draw():Int = lo + stride*(math.floor(count*java.lang.Math.random()).asInstanceOf[Int])
+    val p1 = draw()
+    val p2 = draw()
+    val p3 = draw()
+    val x1 = a(p1)
+    val x2 = a(p2)
+    val x3 = a(p3)
+    val k1 = ii(p1)
+    val k2 = ii(p2)
+    val k3 = ii(p3)
+    // True when (xa, ka) sorts strictly after (xb, kb).
+    def after(xa:Double, ka:Int, xb:Double, kb:Int):Boolean = (xa >= xb) && ((xa > xb) || ka > kb)
+    if (after(x2, k2, x1, k1)) {
+      if (after(x3, k3, x2, k2)) p2
+      else if (after(x3, k3, x1, k1)) p3
+      else p1
+    } else {
+      if (after(x3, k3, x1, k1)) p1
+      else if (after(x3, k3, x2, k2)) p3
+      else p2
+    }
+  }
+  
+  /**
+   * Call `med3` three times on the same range and return the position holding the median of
+   * the three candidates, comparing (key, companion index) pairs lexicographically.
+   */
+  def med9(a:Array[Double], ii:Array[Int], lo:Int, hi:Int, stride:Int):Int = {
+    val p1 = med3(a, ii, lo, hi, stride)
+    val p2 = med3(a, ii, lo, hi, stride)
+    val p3 = med3(a, ii, lo, hi, stride)
+    val x1 = a(p1)
+    val x2 = a(p2)
+    val x3 = a(p3)
+    val k1 = ii(p1)
+    val k2 = ii(p2)
+    val k3 = ii(p3)
+    // True when (xa, ka) sorts strictly after (xb, kb).
+    def after(xa:Double, ka:Int, xb:Double, kb:Int):Boolean = (xa >= xb) && ((xa > xb) || ka > kb)
+    if (after(x2, k2, x1, k1)) {
+      if (after(x3, k3, x2, k2)) p2
+      else if (after(x3, k3, x1, k1)) p3
+      else p1
+    } else {
+      if (after(x3, k3, x1, k1)) p1
+      else if (after(x3, k3, x2, k2)) p3
+      else p2
+    }
+  }
+   
+  /**
+   * Partition positions lo, lo+stride, ... below `hi` of the Double keys `a` (and companion
+   * indices `ii`) around a randomly chosen pivot and return the split position.
+   *
+   * Elements are compared as (key, companion index) pairs. The returned value is `j + stride`
+   * for the final right cursor `j`; the caller sorts [lo, result) and [result, hi) separately.
+   *
+   * NOTE(review): when the pivot is the largest pair of the range the result equals `hi`, so
+   * the caller recurses on an unchanged range. With distinct pairs a later random pivot ends
+   * this, but a range made only of identical (key, index) pairs looks like it would never
+   * shrink -- confirm that callers always pass distinct companion indices.
+   */
+  def partition(a:Array[Double], ii:Array[Int], lo:Int, hi:Int, stride:Int):Int = {
+      val sstride = math.signum(stride)
+      val nvals = (hi - lo)/stride
+      // Pivot choice grows more careful with the range size: med9, med3, or one random element.
+  		val im = if (nvals > 600) {
+  			med9(a, ii, lo, hi, stride)	  
+  		} else if (nvals > 100) {
+  		  med3(a, ii, lo, hi, stride)
+  		} else {
+  		  lo + stride*(math.floor(nvals*java.lang.Math.random()).asInstanceOf[Int])
+  		}
+  		// The pivot is copied out because the swaps below may move the element at `im`.
+  		var v = a(im)
+  		var iv = ii(im)
+  		var done = false
+  		var i = lo - stride
+  		var j = hi 
+  		while (! done) { 
+  			i += stride
+  			j -= stride
+  			// Left cursor: skip pairs <= pivot, but never step past the last element of the range.
+  			while ((hi-i)*sstride > sstride*stride && ((a(i) <= v) && ((a(i) < v) || ii(i) <= iv))) {i += stride}
+  			// Right cursor: skip pairs > pivot. It has no explicit bound check.
+  			while (                                   ((a(j) >= v) && ((a(j) > v) || ii(j) > iv)))  {j -= stride}
+  			// Cursors met or crossed (taking the sign of the stride into account): finished.
+  			if ((i - j)*sstride >= 0) {
+  				done = true
+  			} else {
+  				val atmp = a(i)
+  				a(i) = a(j)
+  				a(j) = atmp
+  				val itmp = ii(i)
+  				ii(i) = ii(j)
+  				ii(j) = itmp
+  			}
+  		}
+  		j + stride
+   }
+ 
+  
+    
+ /**
+  * Quicksort of the Int keys `a` with companion indices `ii` over positions
+  * lo, lo+stride, ... up to (excluding) `hi`. Ties between equal keys are broken by `ii`.
+  *
+  * Ranges of at most 16 elements are handed to `isort`. Larger ranges are partitioned; when
+  * `nthreads` > 1 and the range holds more than 400 elements the two halves are sorted
+  * concurrently, otherwise sequentially. The thread budget is halved at every level.
+  *
+  * The concurrent branch used to start two actors and spin on two plain captured flags. Those
+  * flags were not volatile, so nothing guaranteed the waiting thread would ever see them set,
+  * and an exception inside an actor left the caller spinning forever. One half now runs on a
+  * dedicated thread while the other runs on the calling thread; `join()` provides the
+  * required visibility, and a failure in the helper thread is rethrown to the caller.
+  */
+ def quickSort2(a:Array[Int], ii:Array[Int], lo:Int, hi:Int, stride:Int, nthreads:Int):Unit = {
+    val nvals = (hi - lo)/stride
+    if (nvals > 0) {
+      if (nvals <= 16) {
+        isort(a, ii, lo, hi, stride)
+      } else {
+        val ip = partition(a, ii, lo, hi, stride)
+        if (nthreads > 1 && nvals > 400) {
+          // Written by the helper thread only; read after join(), which orders the accesses.
+          var failure:Throwable = null
+          val helper = new Thread(new Runnable {
+            def run() {
+              try {
+                quickSort2(a, ii, lo, ip, stride, nthreads/2)
+              } catch {
+                case t:Throwable => failure = t
+              }
+            }
+          })
+          helper.start()
+          try {
+            quickSort2(a, ii, ip, hi, stride, nthreads/2)
+          } finally {
+            helper.join()
+          }
+          if (failure != null) throw failure
+        } else {
+          quickSort2(a, ii, lo, ip, stride, nthreads/2)
+          quickSort2(a, ii, ip, hi, stride, nthreads/2)
+        }
+      }
+    }
+  }
+  
+  /**
+   * Selection sort of the Int keys `a` and their companion indices `ii` over positions
+   * lo, lo+stride, ... up to (excluding) `hi`, in ascending key order with equal keys ordered
+   * by ascending `ii`.
+   *
+   * Both loops stop on `!= hi`, so `hi` must be `lo` plus a whole number of strides.
+   */
+  def isort(a:Array[Int], ii:Array[Int], lo:Int, hi:Int, stride:Int):Unit = {
+    var slot = lo
+    while (slot != hi) {
+      var best = slot
+      var bestVal = a(slot)
+      var probe = slot + stride
+      while (probe != hi) {
+        val cand = a(probe)
+        // Strictly smaller key, or the same key carrying a smaller companion index.
+        if (cand < bestVal || (cand == bestVal && ii(probe) < ii(best))) {
+          best = probe
+          bestVal = cand
+        }
+        probe += stride
+      }
+      // Exchange the minimum of the remaining range with the element occupying `slot`.
+      a(best) = a(slot)
+      a(slot) = bestVal
+      val movedIdx = ii(best)
+      ii(best) = ii(slot)
+      ii(slot) = movedIdx
+      slot += stride
+    }
+  }
+  
+  /**
+   * Draw three random positions (with replacement) from lo, lo+stride, ... below `hi` and
+   * return the position holding the median of the three, comparing (key, companion index)
+   * pairs lexicographically.
+   */
+  def med3(a:Array[Int], ii:Array[Int], lo:Int, hi:Int, stride:Int):Int = {
+    val count = (hi - lo)/stride
+    def draw():Int = lo + stride*(math.floor(count*java.lang.Math.random()).asInstanceOf[Int])
+    val p1 = draw()
+    val p2 = draw()
+    val p3 = draw()
+    val x1 = a(p1)
+    val x2 = a(p2)
+    val x3 = a(p3)
+    val k1 = ii(p1)
+    val k2 = ii(p2)
+    val k3 = ii(p3)
+    // True when (xa, ka) sorts strictly after (xb, kb).
+    def after(xa:Int, ka:Int, xb:Int, kb:Int):Boolean = (xa >= xb) && ((xa > xb) || ka > kb)
+    if (after(x2, k2, x1, k1)) {
+      if (after(x3, k3, x2, k2)) p2
+      else if (after(x3, k3, x1, k1)) p3
+      else p1
+    } else {
+      if (after(x3, k3, x1, k1)) p1
+      else if (after(x3, k3, x2, k2)) p3
+      else p2
+    }
+  }
+  
+  /**
+   * Call `med3` three times on the same range and return the position holding the median of
+   * the three candidates, comparing (key, companion index) pairs lexicographically.
+   */
+  def med9(a:Array[Int], ii:Array[Int], lo:Int, hi:Int, stride:Int):Int = {
+    val p1 = med3(a, ii, lo, hi, stride)
+    val p2 = med3(a, ii, lo, hi, stride)
+    val p3 = med3(a, ii, lo, hi, stride)
+    val x1 = a(p1)
+    val x2 = a(p2)
+    val x3 = a(p3)
+    val k1 = ii(p1)
+    val k2 = ii(p2)
+    val k3 = ii(p3)
+    // True when (xa, ka) sorts strictly after (xb, kb).
+    def after(xa:Int, ka:Int, xb:Int, kb:Int):Boolean = (xa >= xb) && ((xa > xb) || ka > kb)
+    if (after(x2, k2, x1, k1)) {
+      if (after(x3, k3, x2, k2)) p2
+      else if (after(x3, k3, x1, k1)) p3
+      else p1
+    } else {
+      if (after(x3, k3, x1, k1)) p1
+      else if (after(x3, k3, x2, k2)) p3
+      else p2
+    }
+  }
+   
+  /**
+   * Partition positions lo, lo+stride, ... below `hi` of the Int keys `a` (and companion
+   * indices `ii`) around a randomly chosen pivot and return the split position.
+   *
+   * Elements are compared as (key, companion index) pairs. The returned value is `j + stride`
+   * for the final right cursor `j`; the caller sorts [lo, result) and [result, hi) separately.
+   *
+   * NOTE(review): when the pivot is the largest pair of the range the result equals `hi`, so
+   * the caller recurses on an unchanged range. With distinct pairs a later random pivot ends
+   * this, but a range made only of identical (key, index) pairs looks like it would never
+   * shrink -- confirm that callers always pass distinct companion indices.
+   */
+  def partition(a:Array[Int], ii:Array[Int], lo:Int, hi:Int, stride:Int):Int = {
+      val sstride = math.signum(stride)
+      val nvals = (hi - lo)/stride
+      // Pivot choice grows more careful with the range size: med9, med3, or one random element.
+  		val im = if (nvals > 600) {
+  			med9(a, ii, lo, hi, stride)	  
+  		} else if (nvals > 100) {
+  		  med3(a, ii, lo, hi, stride)
+  		} else {
+  		  lo + stride*(math.floor(nvals*java.lang.Math.random()).asInstanceOf[Int])
+  		}
+  		// The pivot is copied out because the swaps below may move the element at `im`.
+  		var v = a(im)
+  		var iv = ii(im)
+  		var done = false
+  		var i = lo - stride
+  		var j = hi 
+  		while (! done) { 
+  			i += stride
+  			j -= stride
+  			// Left cursor: skip pairs <= pivot, but never step past the last element of the range.
+  			while ((hi-i)*sstride > sstride*stride && ((a(i) <= v) && ((a(i) < v) || ii(i) <= iv))) {i += stride}
+  			// Right cursor: skip pairs > pivot. It has no explicit bound check.
+  			while (                                   ((a(j) >= v) && ((a(j) > v) || ii(j) > iv)))  {j -= stride}
+  			// Cursors met or crossed (taking the sign of the stride into account): finished.
+  			if ((i - j)*sstride >= 0) {
+  				done = true
+  			} else {
+  				val atmp = a(i)
+  				a(i) = a(j)
+  				a(j) = atmp
+  				val itmp = ii(i)
+  				ii(i) = ii(j)
+  				ii(j) = itmp
+  			}
+  		}
+  		j + stride
+   }
+ 
+  
+  /**
+   * Sort the whole array `a` in place according to `ord`, by handing index-based compare and
+   * swap callbacks to `sort1`.
+   */
+  def quickSort[@specialized(Double, Float, Int, Byte) T](a:Array[T])(implicit ord:Ordering[T]) = {
+    val compareAt = (i:Int, j:Int) => ord.compare(a(i),a(j))
+    val exchange = (i:Int, j:Int) => {
+      val held = a(i)
+      a(i) = a(j)
+      a(j) = held
+    }
+    sort1(compareAt, exchange, 0, a.length)
+  }
+  
+  /**
+   * Sort `len` positions starting at `start` of an arbitrary indexable structure that is
+   * reachable only through `comp(i, j)` (sign of the comparison of positions i and j) and
+   * `swap(i, j)`. Delegates to `sort1`.
+   */
+  def quickSort(comp:(Int,Int)=>Int, swap: (Int,Int) => Unit, start:Int, len:Int):Unit =
+    sort1(comp, swap, start, len)
+  
+  /**
+   * Index-based quicksort: sorts `len` positions starting at `off` using only the callbacks
+   * `comp(i, j)` (negative, zero or positive for positions i and j) and `swap(i, j)`.
+   *
+   * Ranges shorter than 7 are insertion-sorted. Otherwise a pivot position `m` is chosen (the
+   * middle, a median of three for len > 30, a median of three medians for len > 300) and the
+   * range is split three ways: positions equal to the pivot are parked at both ends while
+   * scanning, moved next to the split point afterwards, and only the "less" and "greater"
+   * parts are sorted recursively.
+   *
+   * NOTE(review): the structure looks like scala.util.Sorting's sort1 adapted to callbacks --
+   * confirm before relying on that correspondence.
+   */
+  private def sort1(comp: (Int, Int) => Int, swap: (Int,Int) => Unit, off: Int, len: Int) {
+    
+    // Swap the n positions starting at _a with the n positions starting at _b.
+    def vecswap(_a: Int, _b: Int, n: Int) {
+      var a = _a
+      var b = _b
+      var i = 0
+      while (i < n) {
+        swap(a, b)
+        i += 1
+        a += 1
+        b += 1
+      }
+    }
+    // Position (among a, b, c) holding the median element.
+    def med3(a: Int, b: Int, c: Int) = {
+      if (comp(a,b) < 0) {
+        if (comp(b,c) < 0) b else if (comp(a,c) < 0) c else a
+      } else {
+        if (comp(b,c) > 0) b else if (comp(a,c) > 0) c else a
+      }
+    }
+    def sort2(off: Int, len: Int) {
+      if (len < 7) {
+        // Insertion sort for short ranges.
+        var i = off
+        while (i < len + off) {
+          var j = i
+          while (j > off && comp(j-1,j) > 0) {
+            swap(j, j-1)
+            j -= 1
+          }
+          i += 1
+        }
+      } else {
+        var m = off + (len >> 1) 
+        if (len > 30) {
+          var l = off
+          var n = off + len - 1
+          if (len > 300) { 
+            val s = len / 8
+            l = med3(l, l+s, l+2*s)
+            m = med3(m-s, m, m+s)
+            n = med3(n-2*s, n-s, n)
+          }
+          m = med3(l, m, n) 
+        }
+
+        // a..b scan upwards, c..d scan downwards; [off, a) and (d, end] collect pivot-equal items.
+        var a = off
+        var b = a
+        var c = off + len - 1
+        var d = c
+        var done = false
+        while (!done) {
+          var pp = -1
+          while (b <= c && pp <= 0) {
+            pp = comp(b, m) 
+            if (pp == 0) {
+              // The pivot is tracked by position, so follow it when an equal element is parked.
+              swap(a, b)
+              m = a
+              a += 1
+            }
+            if (pp <= 0) b += 1
+          }
+          pp = 1
+          while (c >= b &&  pp >= 0) {
+            pp = comp(c, m)
+            if (pp == 0) {
+              swap(c, d)
+              m = d
+              d -= 1
+            }
+            if (pp >= 0) c -= 1
+          }
+          if (b > c) {
+            done = true
+          } else {
+            swap(b, c)
+            c -= 1
+            b += 1
+          }
+        }
+
+        // Move the parked pivot-equal runs from both ends to the middle.
+        val n = off + len
+        var s = math.min(a-off, b-a)
+        vecswap(off, b-s, s)
+        s = math.min(d-c, n-d-1)
+        vecswap(b, n-s, s)
+
+        // Recurse on the strictly-less part, then on the strictly-greater part.
+        s = b - a
+        if (s > 1)
+          sort2(off, s)
+        s = d - c
+        if (s > 1)
+          sort2(n-s, s)
+      }
+    }
+    sort2(off, len)
+  }
+  
+  /**
+   * Command-line smoke test: args(0) is the number of elements n. Builds an n x 1 matrix with
+   * `SciFunctions.rand` and an index column with `MatFunctions.icol(0->n)`, sorts both with
+   * `quickSort2`, and prints a "check" count.
+   *
+   * NOTE(review): the count appears to be the number of positions whose element is smaller
+   * than its predecessor, i.e. 0 for a correctly sorted result -- confirm against `find` and
+   * the matrix comparison operator.
+   */
+  def main(args:Array[String]) = {
+    import BIDMat.SciFunctions._
+    import BIDMat.MatFunctions._
+    val n = args(0).toInt
+    val a = SciFunctions.rand(n, 1)
+    val ii = MatFunctions.icol(0->n)
+    // Five arguments select the generic overload, which dispatches on the element type.
+    quickSort2(a.data, ii.data, 0, n, 1)
+    println("check %d" format find(a(1->n,0) < a(0->(n-1),0)).length)
+  }
+}
diff --git a/src/main/scala/BIDMat/Operators.scala b/src/main/scala/BIDMat/Operators.scala
new file mode 100644
index 00000000..1876d9a4
--- /dev/null
+++ b/src/main/scala/BIDMat/Operators.scala
@@ -0,0 +1,320 @@
+package BIDMat
+import MatFunctions._
+
+/**
+ * Run-time dispatch helpers for binary matrix operators.
+ *
+ * `applyMat` inspects the dynamic type of the right operand, converts one side where the two
+ * types differ, and calls the matching method of the given `Mop`. The `getXPair` helpers wrap
+ * an optional output matrix `c` together with the left operand; when `c` is null a fresh
+ * matrix of the appropriate size is allocated in its place.
+ */
+object Operator {
+  /** Float left operand; right operands other than those listed raise a MatchError. */
+  def applyMat(a:FMat, b:Mat, c:Mat, op:Mop):Mat = b match {
+    case rhs:FMat => op.fop(a, rhs, c)
+    case rhs:SMat => op.fop(a, rhs, c)
+    case rhs:DMat => op.dop(DMat(a), rhs, c)
+    case rhs:IMat => op.fop(a, FMat(rhs), c)
+    case rhs:CMat => op.cop(CMat(a), rhs, c)
+  }
+
+  /** Double left operand; right operands other than those listed raise a MatchError. */
+  def applyMat(a:DMat, b:Mat, c:Mat, op:Mop):Mat = b match {
+    case rhs:FMat => op.dop(a, DMat(rhs), c)
+    case rhs:DMat => op.dop(a, rhs, c)
+    case rhs:IMat => op.dop(a, DMat(rhs), c)
+    case rhs:CMat => op.cop(CMat(a), rhs, c)
+  }
+
+  /** Int left operand; right operands other than those listed raise a MatchError. */
+  def applyMat(a:IMat, b:Mat, c:Mat, op:Mop):Mat = b match {
+    case rhs:FMat => op.fop(FMat(a), rhs, c)
+    case rhs:DMat => op.dop(DMat(a), rhs, c)
+    case rhs:IMat => op.iop(a, rhs, c)
+    case rhs:CMat => op.cop(CMat(a), rhs, c)
+  }
+
+  /** Complex left operand; right operands other than those listed raise a MatchError. */
+  def applyMat(a:CMat, b:Mat, c:Mat, op:Mop):Mat = b match {
+    case rhs:FMat => op.cop(a, CMat(rhs), c)
+    case rhs:DMat => op.cop(a, CMat(rhs), c)
+    case rhs:IMat => op.cop(a, CMat(rhs), c)
+    case rhs:CMat => op.cop(CMat(a), rhs, c)
+  }
+
+  /** GMat left operand; only a GMat right operand is accepted. */
+  def applyMat(a:GMat, b:Mat, c:Mat, op:Mop):Mat = b match {
+    case rhs:GMat => op.gop(a, rhs, c)
+  }
+
+  /** Sparse left operand; only a sparse right operand is accepted. */
+  def applyMat(a:SMat, b:Mat, c:Mat, op:Mop):Mat = b match {
+    case rhs:SMat => op.sop(a, rhs, c)
+  }
+
+  /** Row count of a product a*b, where a 1x1 `a` is treated as a scalar factor. */
+  def multDim1(a:Mat, b:Mat):Int = {
+    val aIsScalar = a.nrows == 1 && a.ncols == 1
+    if (aIsScalar) b.nrows else a.nrows
+  }
+
+  /** Column count of a product a*b, where a 1x1 `b` is treated as a scalar factor. */
+  def multDim2(a:Mat, b:Mat):Int = {
+    val bIsScalar = b.nrows == 1 && b.ncols == 1
+    if (bIsScalar) a.ncols else b.ncols
+  }
+
+  /** Pair `a` with `c`, or with a new matrix shaped like `a` when `c` is null. */
+  def getFPair(c:Mat, a:FMat):FPair = {
+    if (c.asInstanceOf[AnyRef] == null) new FPair(FMat(a.nrows, a.ncols), a)
+    else new FPair(c, a)
+  }
+
+  /** Pair `a` with `c`, or with a new matrix shaped like the product a*b when `c` is null. */
+  def getFPair(c:Mat, a:FMat, b:FMat):FPair = {
+    if (c.asInstanceOf[AnyRef] == null) new FPair(FMat(multDim1(a,b), multDim2(a,b)), a)
+    else new FPair(c, a)
+  }
+
+  /** Pair `a` with `c`, or with a new matrix shaped like `a` when `c` is null. */
+  def getDPair(c:Mat, a:DMat):DPair = {
+    if (c.asInstanceOf[AnyRef] == null) new DPair(DMat(a.nrows, a.ncols), a)
+    else new DPair(c, a)
+  }
+
+  /** Pair `a` with `c`, or with a new matrix shaped like the product a*b when `c` is null. */
+  def getDPair(c:Mat, a:DMat, b:DMat):DPair = {
+    if (c.asInstanceOf[AnyRef] == null) new DPair(DMat(multDim1(a,b), multDim2(a,b)), a)
+    else new DPair(c, a)
+  }
+
+  /** Pair `a` with `c`, or with a new matrix shaped like `a` when `c` is null. */
+  def getIPair(c:Mat, a:IMat):IPair = {
+    if (c.asInstanceOf[AnyRef] == null) new IPair(IMat(a.nrows, a.ncols), a)
+    else new IPair(c, a)
+  }
+
+  /** Pair `a` with `c`, or with a new matrix shaped like the product a*b when `c` is null. */
+  def getIPair(c:Mat, a:IMat, b:IMat):IPair = {
+    if (c.asInstanceOf[AnyRef] == null) new IPair(IMat(multDim1(a,b), multDim2(a,b)), a)
+    else new IPair(c, a)
+  }
+
+  /** Pair `a` with `c`, or with a new matrix shaped like `a` when `c` is null. */
+  def getCPair(c:Mat, a:CMat):CPair = {
+    if (c.asInstanceOf[AnyRef] == null) new CPair(CMat(a.nrows, a.ncols), a)
+    else new CPair(c, a)
+  }
+
+  /** Pair `a` with `c`, or with a new matrix shaped like the product a*b when `c` is null. */
+  def getCPair(c:Mat, a:CMat, b:CMat):CPair = {
+    if (c.asInstanceOf[AnyRef] == null) new CPair(CMat(multDim1(a,b), multDim2(a,b)), a)
+    else new CPair(c, a)
+  }
+
+  /** Pair `a` with `c`, or with a new matrix shaped like `a` when `c` is null. */
+  def getGPair(c:Mat, a:GMat):GPair = {
+    if (c.asInstanceOf[AnyRef] == null) new GPair(GMat(a.nrows, a.ncols), a)
+    else new GPair(c, a)
+  }
+
+  /** Pair `a` with `c`, or with a new matrix shaped like the product a*b when `c` is null. */
+  def getGPair(c:Mat, a:GMat, b:GMat):GPair = {
+    if (c.asInstanceOf[AnyRef] == null) new GPair(GMat(multDim1(a,b), multDim2(a,b)), a)
+    else new GPair(c, a)
+  }
+
+  /** Pair `a` with `c`, or with a new sparse matrix of a's shape and nnz when `c` is null. */
+  def getSPair(c:Mat, a:SMat):SPair = {
+    if (c.asInstanceOf[AnyRef] == null) new SPair(SMat(a.nrows, a.ncols, a.nnz), a)
+    else new SPair(c, a)
+  }
+}
+
+/**
+ * A binary matrix operator with one entry point per operand type combination. In every
+ * method `a` and `b` are the operands and `c` is the output matrix handed through by
+ * `Operator.applyMat`; how (and whether) `c` is used is up to each implementation.
+ */
+trait Mop {
+  // Float (op) Float, and Float (op) sparse.
+  def fop(a:FMat, b:FMat, c:Mat):FMat
+  def fop(a:FMat, b:SMat, c:Mat):FMat
+  // Double, Int, complex, GMat and sparse pairs.
+  def dop(a:DMat, b:DMat, c:Mat):DMat
+  def iop(a:IMat, b:IMat, c:Mat):IMat 
+  def cop(a:CMat, b:CMat, c:Mat):CMat
+  def gop(a:GMat, b:GMat, c:Mat):GMat
+  def sop(a:SMat, b:SMat, c:Mat):SMat
+  /** Always throws a RuntimeException naming the operator `s` and the type tag of `m`. */
+  def notImplemented0(s:String, m:Mat):Mat = { 
+    throw new RuntimeException("operator "+s+" not implemented for "+m.mytype)
+  }
+}
+
+/**
+ * The `+` operator. Each variant wraps the output `c` and the left operand in the matching
+ * pair from `Operator` and applies `+` to it; a sparse right operand of a Float matrix is
+ * first expanded with `full`.
+ */
+object Mop_Plus extends Mop { 
+  override def fop(a:FMat, b:FMat, c:Mat):FMat = Operator.getFPair(c, a) + b
+  override def fop(a:FMat, b:SMat, c:Mat):FMat = Operator.getFPair(c, a) + full(b)
+  override def dop(a:DMat, b:DMat, c:Mat):DMat = Operator.getDPair(c, a) + b
+  override def iop(a:IMat, b:IMat, c:Mat):IMat = Operator.getIPair(c, a) + b
+  override def cop(a:CMat, b:CMat, c:Mat):CMat = Operator.getCPair(c, a) + b
+  override def gop(a:GMat, b:GMat, c:Mat):GMat = Operator.getGPair(c, a) + b
+  override def sop(a:SMat, b:SMat, c:Mat):SMat = Operator.getSPair(c, a) + b
+}
+
+/**
+ * The `-` operator. Each variant wraps the output `c` and the left operand in the matching
+ * pair from `Operator` and applies `-` to it; a sparse right operand of a Float matrix is
+ * first expanded with `full`.
+ */
+object Mop_Minus extends Mop { 
+  override def fop(a:FMat, b:FMat, c:Mat):FMat = Operator.getFPair(c, a) - b
+  override def fop(a:FMat, b:SMat, c:Mat):FMat = Operator.getFPair(c, a) - full(b)
+  override def dop(a:DMat, b:DMat, c:Mat):DMat = Operator.getDPair(c, a) - b
+  override def iop(a:IMat, b:IMat, c:Mat):IMat = Operator.getIPair(c, a) - b
+  override def cop(a:CMat, b:CMat, c:Mat):CMat = Operator.getCPair(c, a) - b
+  override def gop(a:GMat, b:GMat, c:Mat):GMat = Operator.getGPair(c, a) - b
+  override def sop(a:SMat, b:SMat, c:Mat):SMat = Operator.getSPair(c, a) - b
+}
+
+/**
+ * The `*` operator. The dense variants use the three-argument pair helpers, so an output
+ * allocated for a null `c` is sized by `multDim1`/`multDim2`. Sparse * sparse is not
+ * implemented and throws.
+ */
+object Mop_Times extends Mop { 
+  override def fop(a:FMat, b:FMat, c:Mat):FMat = Operator.getFPair(c, a, b) * b
+  // NOTE(review): this uses the two-argument getFPair, so an output allocated for a null `c`
+  // has a's dimensions rather than those of the product -- confirm that FPair * SMat copes.
+  override def fop(a:FMat, b:SMat, c:Mat):FMat = Operator.getFPair(c, a) * b
+  override def dop(a:DMat, b:DMat, c:Mat):DMat = Operator.getDPair(c, a, b) * b
+  override def iop(a:IMat, b:IMat, c:Mat):IMat = Operator.getIPair(c, a, b) * b
+  override def cop(a:CMat, b:CMat, c:Mat):CMat = Operator.getCPair(c, a, b) * b
+  override def gop(a:GMat, b:GMat, c:Mat):GMat = Operator.getGPair(c, a, b) * b
+  // notImplemented0 always throws; the trailing `a` only satisfies the return type.
+  override def sop(a:SMat, b:SMat, c:Mat):SMat = {notImplemented0("*", a); a}
+}
+
+/**
+ * The `/` operator. Implemented for Float, Double and complex pairs by applying `/` to the
+ * operands directly (the output argument `c` is ignored); every other combination throws via
+ * `notImplemented0`.
+ */
+object Mop_Div extends Mop { 
+  override def fop(a:FMat, b:FMat, c:Mat):FMat = a / b
+  // notImplemented0 always throws; the trailing `a` only satisfies the return type.
+  override def fop(a:FMat, b:SMat, c:Mat):FMat = {notImplemented0("/", a); a}
+  override def dop(a:DMat, b:DMat, c:Mat):DMat = a / b
+  override def cop(a:CMat, b:CMat, c:Mat):CMat = a / b
+  override def iop(a:IMat, b:IMat, c:Mat):IMat = {notImplemented0("/", a); a}
+  override def gop(a:GMat, b:GMat, c:Mat):GMat = {notImplemented0("/", a); a}
+  override def sop(a:SMat, b:SMat, c:Mat):SMat = {notImplemented0("/", a); a}
+}
+
+/**
+ * The `\\` operator. Implemented for Float, Double and complex pairs by applying `\\` to the
+ * operands directly (the output argument `c` is ignored); every other combination throws via
+ * `notImplemented0`.
+ */
+object Mop_RSolve extends Mop { 
+  override def fop(a:FMat, b:FMat, c:Mat):FMat = a \\ b
+  // notImplemented0 always throws; the trailing `a` only satisfies the return type.
+  override def fop(a:FMat, b:SMat, c:Mat):FMat = {notImplemented0("\\\\", a); a}
+  override def dop(a:DMat, b:DMat, c:Mat):DMat = a \\ b
+  override def cop(a:CMat, b:CMat, c:Mat):CMat = a \\ b
+  override def iop(a:IMat, b:IMat, c:Mat):IMat = {notImplemented0("\\\\", a); a}
+  override def gop(a:GMat, b:GMat, c:Mat):GMat = {notImplemented0("\\\\", a); a}
+  override def sop(a:SMat, b:SMat, c:Mat):SMat = {notImplemented0("\\\\", a); a}
+}
+
+/**
+ * The `*@` operator. Each variant wraps the output `c` and the left operand in the matching
+ * pair from `Operator` and applies `*@` to it; a sparse right operand of a Float matrix is
+ * first expanded with `full`.
+ */
+object Mop_ETimes extends Mop { 
+  override def fop(a:FMat, b:FMat, c:Mat):FMat = Operator.getFPair(c, a) *@ b
+  override def fop(a:FMat, b:SMat, c:Mat):FMat = Operator.getFPair(c, a) *@ full(b)
+  override def dop(a:DMat, b:DMat, c:Mat):DMat = Operator.getDPair(c, a) *@ b
+  override def iop(a:IMat, b:IMat, c:Mat):IMat = Operator.getIPair(c, a) *@ b
+  override def cop(a:CMat, b:CMat, c:Mat):CMat = Operator.getCPair(c, a) *@ b
+  override def gop(a:GMat, b:GMat, c:Mat):GMat = Operator.getGPair(c, a) *@ b
+  override def sop(a:SMat, b:SMat, c:Mat):SMat = Operator.getSPair(c, a) *@ b
+}
+
+/**
+ * The `/@` operator. Each implemented variant wraps the output `c` and the left operand in
+ * the matching pair from `Operator` and applies `/@` to it; a sparse right operand of a Float
+ * matrix is first expanded with `full`. The Int variant is not implemented and throws.
+ */
+object Mop_EDiv extends Mop { 
+  override def fop(a:FMat, b:FMat, c:Mat):FMat = Operator.getFPair(c, a) /@ b
+  override def fop(a:FMat, b:SMat, c:Mat):FMat = Operator.getFPair(c, a) /@ full(b)
+  override def dop(a:DMat, b:DMat, c:Mat):DMat = Operator.getDPair(c, a) /@ b
+  // notImplemented0 always throws; the trailing `a` only satisfies the return type.
+  override def iop(a:IMat, b:IMat, c:Mat):IMat = {notImplemented0("/@", a); a}
+  override def cop(a:CMat, b:CMat, c:Mat):CMat = Operator.getCPair(c, a) /@ b
+  override def gop(a:GMat, b:GMat, c:Mat):GMat = Operator.getGPair(c, a) /@ b
+  override def sop(a:SMat, b:SMat, c:Mat):SMat = Operator.getSPair(c, a) /@ b
+}
+
+/** Type dispatch for the "\" operator.
+ *  FMat, DMat, IMat and CMat pairs apply it directly. Every other pairing calls
+ *  notImplemented0 and then yields the left operand. The `c` argument is not used here.
+ */
+object Mop_HCat extends Mop {
+  // Pairings that evaluate a \ b
+  override def fop(a: FMat, b: FMat, c: Mat): FMat = a \ b
+  override def dop(a: DMat, b: DMat, c: Mat): DMat = a \ b
+  override def iop(a: IMat, b: IMat, c: Mat): IMat = a \ b
+  override def cop(a: CMat, b: CMat, c: Mat): CMat = a \ b
+
+  // Pairings with no "\" implementation
+  override def fop(a: FMat, b: SMat, c: Mat): FMat = { notImplemented0("\\", a); a }
+  override def gop(a: GMat, b: GMat, c: Mat): GMat = { notImplemented0("\\", a); a }
+  override def sop(a: SMat, b: SMat, c: Mat): SMat = { notImplemented0("\\", a); a }
+}
+
+/** Type dispatch for the "on" operator.
+ *  FMat, DMat, IMat and CMat pairs apply it directly. Every other pairing calls
+ *  notImplemented0 and then yields the left operand. The `c` argument is not used here.
+ */
+object Mop_VCat extends Mop {
+  // Pairings that evaluate a on b
+  override def fop(a: FMat, b: FMat, c: Mat): FMat = a on b
+  override def dop(a: DMat, b: DMat, c: Mat): DMat = a on b
+  override def iop(a: IMat, b: IMat, c: Mat): IMat = a on b
+  override def cop(a: CMat, b: CMat, c: Mat): CMat = a on b
+
+  // Pairings with no "on" implementation
+  override def fop(a: FMat, b: SMat, c: Mat): FMat = { notImplemented0("on", a); a }
+  override def gop(a: GMat, b: GMat, c: Mat): GMat = { notImplemented0("on", a); a }
+  override def sop(a: SMat, b: SMat, c: Mat): SMat = { notImplemented0("on", a); a }
+}
+
+/** Type dispatch for the "<" operator.
+ *  Supported cases obtain a pair object from Operator.getXPair(c, a) and apply "<" to it.
+ *  CMat and SMat pairs call notImplemented0 and then yield the left operand.
+ */
+object Mop_LT extends Mop {
+  // Same-type pairings
+  override def fop(a: FMat, b: FMat, c: Mat): FMat = Operator.getFPair(c, a) < b
+  override def dop(a: DMat, b: DMat, c: Mat): DMat = Operator.getDPair(c, a) < b
+  override def iop(a: IMat, b: IMat, c: Mat): IMat = Operator.getIPair(c, a) < b
+  override def gop(a: GMat, b: GMat, c: Mat): GMat = Operator.getGPair(c, a) < b
+
+  // Mixed pairing: the SMat operand is converted with full(b) first
+  override def fop(a: FMat, b: SMat, c: Mat): FMat = Operator.getFPair(c, a) < full(b)
+
+  // Pairings with no "<" implementation
+  override def cop(a: CMat, b: CMat, c: Mat): CMat = { notImplemented0("<", a); a }
+  override def sop(a: SMat, b: SMat, c: Mat): SMat = { notImplemented0("<", a); a }
+}
+
+/** Type dispatch for the ">" operator.
+ *  Supported cases obtain a pair object from Operator.getXPair(c, a) and apply ">" to it.
+ *  CMat and SMat pairs call notImplemented0 and then yield the left operand.
+ */
+object Mop_GT extends Mop {
+  // Same-type pairings
+  override def fop(a: FMat, b: FMat, c: Mat): FMat = Operator.getFPair(c, a) > b
+  override def dop(a: DMat, b: DMat, c: Mat): DMat = Operator.getDPair(c, a) > b
+  override def iop(a: IMat, b: IMat, c: Mat): IMat = Operator.getIPair(c, a) > b
+  override def gop(a: GMat, b: GMat, c: Mat): GMat = Operator.getGPair(c, a) > b
+
+  // Mixed pairing: the SMat operand is converted with full(b) first
+  override def fop(a: FMat, b: SMat, c: Mat): FMat = Operator.getFPair(c, a) > full(b)
+
+  // Pairings with no ">" implementation
+  override def cop(a: CMat, b: CMat, c: Mat): CMat = { notImplemented0(">", a); a }
+  override def sop(a: SMat, b: SMat, c: Mat): SMat = { notImplemented0(">", a); a }
+}
+
+/** Type dispatch for the "<=" operator.
+ *  Supported cases obtain a pair object from Operator.getXPair(c, a) and apply "<=" to it.
+ *  CMat and SMat pairs call notImplemented0 and then yield the left operand.
+ */
+object Mop_LE extends Mop {
+  // Same-type pairings
+  override def fop(a: FMat, b: FMat, c: Mat): FMat = Operator.getFPair(c, a) <= b
+  override def dop(a: DMat, b: DMat, c: Mat): DMat = Operator.getDPair(c, a) <= b
+  override def iop(a: IMat, b: IMat, c: Mat): IMat = Operator.getIPair(c, a) <= b
+  override def gop(a: GMat, b: GMat, c: Mat): GMat = Operator.getGPair(c, a) <= b
+
+  // Mixed pairing: the SMat operand is converted with full(b) first
+  override def fop(a: FMat, b: SMat, c: Mat): FMat = Operator.getFPair(c, a) <= full(b)
+
+  // Pairings with no "<=" implementation
+  override def cop(a: CMat, b: CMat, c: Mat): CMat = { notImplemented0("<=", a); a }
+  override def sop(a: SMat, b: SMat, c: Mat): SMat = { notImplemented0("<=", a); a }
+}
+
+/** Type dispatch for the ">=" operator.
+ *  Supported cases obtain a pair object from Operator.getXPair(c, a) and apply ">=" to it.
+ *  CMat and SMat pairs call notImplemented0 and then yield the left operand.
+ */
+object Mop_GE extends Mop {
+  // Same-type pairings
+  override def fop(a: FMat, b: FMat, c: Mat): FMat = Operator.getFPair(c, a) >= b
+  override def dop(a: DMat, b: DMat, c: Mat): DMat = Operator.getDPair(c, a) >= b
+  override def iop(a: IMat, b: IMat, c: Mat): IMat = Operator.getIPair(c, a) >= b
+  override def gop(a: GMat, b: GMat, c: Mat): GMat = Operator.getGPair(c, a) >= b
+
+  // Mixed pairing: the SMat operand is converted with full(b) first
+  override def fop(a: FMat, b: SMat, c: Mat): FMat = Operator.getFPair(c, a) >= full(b)
+
+  // Pairings with no ">=" implementation
+  override def cop(a: CMat, b: CMat, c: Mat): CMat = { notImplemented0(">=", a); a }
+  override def sop(a: SMat, b: SMat, c: Mat): SMat = { notImplemented0(">=", a); a }
+}
+
+/** Type dispatch for the "==" operator.
+ *  Supported cases obtain a pair object from Operator.getXPair(c, a) and apply "==" to it.
+ *  SMat pairs call notImplemented0 and then yield the left operand.
+ */
+object Mop_EQ extends Mop {
+  // Same-type pairings
+  override def fop(a: FMat, b: FMat, c: Mat): FMat = Operator.getFPair(c, a) == b
+  override def dop(a: DMat, b: DMat, c: Mat): DMat = Operator.getDPair(c, a) == b
+  override def iop(a: IMat, b: IMat, c: Mat): IMat = Operator.getIPair(c, a) == b
+  override def cop(a: CMat, b: CMat, c: Mat): CMat = Operator.getCPair(c, a) == b
+  override def gop(a: GMat, b: GMat, c: Mat): GMat = Operator.getGPair(c, a) == b
+
+  // Mixed pairing: the SMat operand is converted with full(b) first
+  override def fop(a: FMat, b: SMat, c: Mat): FMat = Operator.getFPair(c, a) == full(b)
+
+  // No "==" implementation for SMat pairs
+  override def sop(a: SMat, b: SMat, c: Mat): SMat = { notImplemented0("==", a); a }
+}
+
+/** Type dispatch for the "!=" operator.
+ *  Supported cases obtain a pair object from Operator.getXPair(c, a) and apply "!=" to it.
+ *  SMat pairs call notImplemented0 and then yield the left operand.
+ */
+object Mop_NE extends Mop {
+  // Same-type pairings
+  override def fop(a: FMat, b: FMat, c: Mat): FMat = Operator.getFPair(c, a) != b
+  override def dop(a: DMat, b: DMat, c: Mat): DMat = Operator.getDPair(c, a) != b
+  override def iop(a: IMat, b: IMat, c: Mat): IMat = Operator.getIPair(c, a) != b
+  override def cop(a: CMat, b: CMat, c: Mat): CMat = Operator.getCPair(c, a) != b
+  override def gop(a: GMat, b: GMat, c: Mat): GMat = Operator.getGPair(c, a) != b
+
+  // Mixed pairing: the SMat operand is converted with full(b) first
+  override def fop(a: FMat, b: SMat, c: Mat): FMat = Operator.getFPair(c, a) != full(b)
+
+  // No "!=" implementation for SMat pairs
+  override def sop(a: SMat, b: SMat, c: Mat): SMat = { notImplemented0("!=", a); a }
+}
diff --git a/src/main/scala/BIDMat/Plotting.scala b/src/main/scala/BIDMat/Plotting.scala
new file mode 100755
index 00000000..44e33edf
--- /dev/null
+++ b/src/main/scala/BIDMat/Plotting.scala
@@ -0,0 +1,642 @@
+package BIDMat
+import ptolemy.plot._
+import java.awt._
+import java.awt.geom.AffineTransform
+import java.awt.image.BufferedImage
+import javax.swing._
+import javax.imageio.stream.FileImageOutputStream
+import javax.imageio.ImageIO
+import java.io._
+
+object Plotting { 
+  var ifigure:Int = 1
+  
+  /** Fills a new Plot from the given matrices and shows it in a new PlotFrame.
+   *
+   *  - One vector (a single row or column): plotted as dataset 0 against the element index.
+   *  - One matrix: every column becomes its own dataset, plotted against the row index.
+   *  - Several matrices: read as consecutive (x, y) pairs, one dataset per pair; the
+   *    length of the x matrix decides how many points are read from the pair.
+   *
+   *  Only FMat, DMat and IMat inputs are matched; any other type raises a MatchError.
+   *  Each call increments the shared `ifigure` counter used in the frame title.
+   *
+   *  @param mats        one matrix, or an even number of matrices forming (x, y) pairs
+   *  @param xlog        forwarded to Plot.setXLog
+   *  @param ylog        forwarded to Plot.setYLog
+   *  @param isconnected forwarded to every addPoint call
+   *  @return the populated Plot
+   *  @throws RuntimeException if more than one matrix is given and the count is odd
+   */
+  def _plot(mats:Mat*)(xlog:Boolean=false, ylog:Boolean=false, isconnected:Boolean=true):Plot = {
+    // The paired form reads mats(2*i) and mats(2*i+1). With an odd count the last read
+    // would run off the end of the argument list, so reject it before plotting anything.
+    if (mats.length > 1 && mats.length % 2 != 0) {
+      throw new RuntimeException("plot needs a single matrix or (x, y) pairs, but got "+mats.length+" matrices")
+    }
+    val p:Plot = new Plot
+    p.setXLog(xlog)
+    p.setYLog(ylog)
+    if (mats.length == 1) {
+      val m = mats(0)
+      if (m.nrows == 1 || m.ncols == 1) {
+        // Vector: a single dataset, x is the element index.
+        val dataset = 0
+        m match {
+          case mf:FMat => for (i <- 0 until m.length) p.addPoint(dataset, i, mf(i), isconnected)
+          case md:DMat => for (i <- 0 until m.length) p.addPoint(dataset, i, md(i), isconnected)
+          case mi:IMat => for (i <- 0 until m.length) p.addPoint(dataset, i, mi(i), isconnected)
+        }
+      } else {
+        // Matrix: column i is dataset i, x is the row index.
+        for (i <- 0 until m.ncols) {
+          m match {
+            case mf:FMat => for (j <- 0 until m.nrows) p.addPoint(i, j, mf(j,i), isconnected)
+            case md:DMat => for (j <- 0 until m.nrows) p.addPoint(i, j, md(j,i), isconnected)
+            case mi:IMat => for (j <- 0 until m.nrows) p.addPoint(i, j, mi(j,i), isconnected)
+          }
+        }
+      }
+    } else {
+      // (x, y) pairs: pair i is dataset i.
+      var i = 0
+      while (i*2 < mats.length) {
+        (mats(2*i), mats(2*i+1)) match {
+          case (a:FMat, b:FMat) => for (j <- 0 until a.length) p.addPoint(i, a(j), b(j), isconnected)
+          case (a:FMat, b:DMat) => for (j <- 0 until a.length) p.addPoint(i, a(j), b(j), isconnected)
+          case (a:DMat, b:FMat) => for (j <- 0 until a.length) p.addPoint(i, a(j), b(j), isconnected)
+          case (a:DMat, b:DMat) => for (j <- 0 until a.length) p.addPoint(i, a(j), b(j), isconnected)
+          case (a:FMat, b:IMat) => for (j <- 0 until a.length) p.addPoint(i, a(j), b(j), isconnected)
+          case (a:DMat, b:IMat) => for (j <- 0 until a.length) p.addPoint(i, a(j), b(j), isconnected)
+          case (a:IMat, b:FMat) => for (j <- 0 until a.length) p.addPoint(i, a(j), b(j), isconnected)
+          case (a:IMat, b:DMat) => for (j <- 0 until a.length) p.addPoint(i, a(j), b(j), isconnected)
+          case (a:IMat, b:IMat) => for (j <- 0 until a.length) p.addPoint(i, a(j), b(j), isconnected)
+        }
+        i += 1
+      }
+    }
+    val pframe:PlotFrame = new PlotFrame("Figure "+ifigure, p)
+    ifigure += 1
+    pframe.setVisible(true)
+    p
+  }
+  
+  // Convenience front ends for _plot. Each one fixes the axis scaling and whether
+  // consecutive points are connected; the flags left at their _plot defaults are
+  // spelled out here so every variant can be read on its own.
+
+  /** Connected plot, linear x and y axes. */
+  def plot(mats:Mat*) = _plot(mats: _*)(xlog=false, ylog=false, isconnected=true)
+
+  /** Connected plot, logarithmic x and y axes. */
+  def loglog(mats:Mat*) = _plot(mats: _*)(xlog=true, ylog=true, isconnected=true)
+
+  /** Connected plot, logarithmic x axis only. */
+  def semilogx(mats:Mat*) = _plot(mats: _*)(xlog=true, ylog=false, isconnected=true)
+
+  /** Connected plot, logarithmic y axis only. */
+  def semilogy(mats:Mat*) = _plot(mats: _*)(xlog=false, ylog=true, isconnected=true)
+
+  /** Unconnected points, linear x and y axes. */
+  def p_plot(mats:Mat*) = _plot(mats: _*)(xlog=false, ylog=false, isconnected=false)
+
+  /** Unconnected points, logarithmic x and y axes. */
+  def ploglog(mats:Mat*) = _plot(mats: _*)(xlog=true, ylog=true, isconnected=false)
+
+  /** Unconnected points, logarithmic x axis only. */
+  def psemilogx(mats:Mat*) = _plot(mats: _*)(xlog=true, ylog=false, isconnected=false)
+
+  /** Unconnected points, logarithmic y axis only. */
+  def psemilogy(mats:Mat*) = _plot(mats: _*)(xlog=false, ylog=true, isconnected=false)
+   
+  
+  /** Shows a histogram of a vector (single row or single column) in a new PlotFrame.
+   *
+   *  The bin width is (max - min) / nbars, with max and min taken from
+   *  maxi(_, 0).v and mini(_, 0).v. Only FMat, DMat and IMat vectors are matched.
+   *  An input that is neither a single row nor a single column adds no points, so
+   *  the frame opens empty. Each call increments the shared `ifigure` counter.
+   *
+   *  @param m     the data to bin
+   *  @param nbars divisor applied to the value range to obtain the bin width
+   */
+  def hist(m:Mat, nbars:Int=10) = {
+    import SciFunctions._
+    val histogram:Histogram = new Histogram
+    val series = 0
+    val isVector = m.nrows == 1 || m.ncols == 1
+    if (isVector) {
+      m match {
+        case mf:FMat =>
+          val hi = maxi(mf,0).v
+          val lo = mini(mf,0).v
+          histogram.setBinWidth((hi-lo)/nbars)
+          for (k <- 0 until m.length) histogram.addPoint(series, mf(k))
+        case md:DMat =>
+          val hi = maxi(md,0).v
+          val lo = mini(md,0).v
+          histogram.setBinWidth((hi-lo)/nbars)
+          for (k <- 0 until m.length) histogram.addPoint(series, md(k))
+        case mi:IMat =>
+          // The maximum is widened to Double so the division below is not integer division.
+          val hi = maxi(mi,0).v.asInstanceOf[Double]
+          val lo = mini(mi,0).v
+          histogram.setBinWidth((hi-lo)/nbars)
+          for (k <- 0 until m.length) histogram.addPoint(series, mi(k))
+      }
+    }
+    val frame:PlotFrame = new PlotFrame("Figure "+ifigure, histogram)
+    ifigure += 1
+    frame.setVisible(true)
+  }
+
+  /** Renders `m` as a heat chart, writes it to "heat_map_<ifigure>.jpg" (a relative
+   *  path), and shows the rendered image in a scrollable 800x600 frame.
+   *  Each call increments the shared `ifigure` counter.
+   */
+  def heatmap(m:Mat) = {
+    val chart = new HeatChart(m)
+    val rendered:BufferedImage = chart.getChartImage(true)
+    chart.saveToFile(new File("heat_map_"+ifigure+".jpg"))
+
+    // Image -> label -> panel -> scroll pane, so charts larger than the frame can be scrolled.
+    val imageLabel = new JLabel(new ImageIcon(rendered))
+    val panel = new JPanel
+    panel.add(imageLabel)
+    val scroller = new JScrollPane(panel)
+
+    // The title uses the figure number before it is advanced.
+    val frame = new JFrame("Figure "+ifigure)
+    frame.getContentPane().add(scroller)
+    frame.setSize(800, 600)
+    ifigure += 1
+    frame.setVisible(true)
+  }
+}
+
+class HeatChart(mat:Mat) {
+  // Axis tick values, one per column / row of `mat`; filled by setXValues / setYValues.
+  private var xValues:Array[Double] = new Array[Double](mat.ncols)
+  private var yValues:Array[Double] = new Array[Double](mat.nrows)
+  // Default ticks are 0, 1, 2, ... These calls must stay after the arrays above are allocated.
+  setXValues(0, 1)
+  setYValues(0, 1)
+  // Tick-value orientation: when false the value is drawn rotated by 270 degrees.
+  private var xValuesHorizontal:Boolean = false
+  private var yValuesHorizontal:Boolean = true
+
+
+  // Pixel size of one matrix cell, outer margin, and background fill.
+  private var cellSize:Dimension = new Dimension(10,10)
+  private var margin:Int = 20
+  private var backgroundColor = Color.WHITE
+  
+  // Placeholder; recomputed by measureComponents().
+  private var chartSize:Dimension = new Dimension(100,100)
+  
+  // Colours for the largest and smallest matrix values.
+  private var highValueColor:Color = Color.BLUE
+  private var lowValueColor:Color = Color.WHITE
+
+  // colorValueDistance is recomputed by updateColorDistance(); colorScale is the
+  // exponent used in getColorPosition().
+  private var colorValueDistance:Int = 1
+  private var colorScale:Double = 1.0
+  
+  // Placeholder; recomputed by measureComponents().
+  private var heatMapSize:Dimension = new Dimension(1000,1000)  
+
+  // Top-left, bottom-right and centre of the heat map within the chart image.
+  // Placeholders; recomputed by updateCoordinates().
+  private var heatMapTL:Point = new Point(0, 0)
+  private var heatMapBR:Point = new Point(800, 800)
+  private var heatMapC:Point = new Point(400, 400)
+
+  // Axis bars, axis titles and tick values: appearance settings.
+  private var axisThickness:Int = 2
+  private var axisColor:Color = Color.BLACK
+  private var axisLabelsFont:Font = new Font("Sans-Serif", Font.PLAIN, 12)
+  private var axisLabelColor:Color = Color.BLACK
+  private var xAxisLabel:String = "X Label"
+  private var yAxisLabel:String = "Y Label"
+  private var axisValuesColor:Color = Color.BLACK
+  private var axisValuesFont:Font = new Font("Sans-Serif", Font.PLAIN, 10)
+  // A tick value is drawn for every index i with i % frequency == 0.
+  private var xAxisValuesFrequency:Int = 1
+  private var yAxisValuesFrequency:Int = 1
+  private var showXAxisValues:Boolean = true
+  private var showYAxisValues:Boolean = true
+
+  // Font measurements below are all filled in by measureComponents().
+  private var xAxisValuesHeight:Int = 0
+  private var xAxisValuesWidthMax:Int = 0
+	
+  private var yAxisValuesHeight:Int = 0
+  private var yAxisValuesAscent:Int = 0
+  private var yAxisValuesWidthMax:Int = 0
+	
+  private var xAxisLabelSize:Dimension = new Dimension(0,0)
+  private var xAxisLabelDescent:Int = 0
+	
+  private var yAxisLabelSize:Dimension = new Dimension(0,0)
+  private var yAxisLabelAscent:Int = 0
+
+
+  // Value range of the matrix, computed once at construction; used to scale cell colours.
+  private var lowValue:Double = min(mat)
+  private var highValue:Double = max(mat)
+
+
+  // Must run after highValueColor / lowValueColor have been initialised above.
+  updateColorDistance()
+
+
+  // TODO: replace this and max with the library's built-in reductions for speed.
+  /** Smallest element of an IMat, FMat or DMat, widened to Double.
+   *  Any other matrix type raises a MatchError.
+   */
+  private def min(mat:Mat):Double = mat match {
+    case ints:IMat    => IMin(ints).toDouble
+    case floats:FMat  => FMin(floats).toDouble
+    case doubles:DMat => DMin(doubles)
+  }
+
+
+  /** Largest element of an IMat, FMat or DMat, widened to Double.
+   *  Any other matrix type raises a MatchError.
+   */
+  private def max(mat:Mat):Double = mat match {
+    case ints:IMat    => IMax(ints).toDouble
+    case floats:FMat  => FMax(floats).toDouble
+    case doubles:DMat => DMax(doubles)
+  }
+
+  /** Linear scan for the smallest element; reads mat(0) first, so `mat` must be non-empty. */
+  private def IMin(mat:IMat):Int = {
+    var smallest:Int = mat(0)
+    val count = mat.length
+    var idx = 1   // element 0 already seeds `smallest`
+    while (idx < count) {
+      val candidate = mat(idx)
+      if (candidate < smallest) smallest = candidate
+      idx += 1
+    }
+    smallest
+  }
+
+  /** Linear scan for the smallest element; reads mat(0) first, so `mat` must be non-empty. */
+  private def FMin(mat:FMat):Float = {
+    var smallest:Float = mat(0)
+    val count = mat.length
+    var idx = 1   // element 0 already seeds `smallest`
+    while (idx < count) {
+      val candidate = mat(idx)
+      if (candidate < smallest) smallest = candidate
+      idx += 1
+    }
+    smallest
+  }
+
+  /** Linear scan for the smallest element; reads mat(0) first, so `mat` must be non-empty. */
+  private def DMin(mat:DMat):Double = {
+    var smallest:Double = mat(0)
+    val count = mat.length
+    var idx = 1   // element 0 already seeds `smallest`
+    while (idx < count) {
+      val candidate = mat(idx)
+      if (candidate < smallest) smallest = candidate
+      idx += 1
+    }
+    smallest
+  }
+
+  /** Linear scan for the largest element; reads mat(0) first, so `mat` must be non-empty. */
+  private def IMax(mat:IMat):Int = {
+    var largest:Int = mat(0)
+    val count = mat.length
+    var idx = 1   // element 0 already seeds `largest`
+    while (idx < count) {
+      val candidate = mat(idx)
+      if (candidate > largest) largest = candidate
+      idx += 1
+    }
+    largest
+  }
+
+  /** Linear scan for the largest element; reads mat(0) first, so `mat` must be non-empty. */
+  private def FMax(mat:FMat):Float = {
+    var largest:Float = mat(0)
+    val count = mat.length
+    var idx = 1   // element 0 already seeds `largest`
+    while (idx < count) {
+      val candidate = mat(idx)
+      if (candidate > largest) largest = candidate
+      idx += 1
+    }
+    largest
+  }
+
+  /** Linear scan for the largest element; reads mat(0) first, so `mat` must be non-empty. */
+  private def DMax(mat:DMat):Double = {
+    var largest:Double = mat(0)
+    val count = mat.length
+    var idx = 1   // element 0 already seeds `largest`
+    while (idx < count) {
+      val candidate = mat(idx)
+      if (candidate > largest) largest = candidate
+      idx += 1
+    }
+    largest
+  }
+
+  /** Renders the whole chart (heat map, axis titles, axis bars, tick values) to a new image.
+   *
+   *  @param alpha true for a TYPE_4BYTE_ABGR image, false for TYPE_3BYTE_BGR
+   *  @return the freshly drawn chart image
+   */
+  def getChartImage(alpha:Boolean):BufferedImage = {
+    // Layout first: chartSize and the heat map coordinates are derived from font metrics.
+    measureComponents()
+    updateCoordinates()
+
+    val pixelFormat:Int =
+      if (alpha) BufferedImage.TYPE_4BYTE_ABGR
+      else BufferedImage.TYPE_3BYTE_BGR
+    val canvas = new BufferedImage(chartSize.width, chartSize.height, pixelFormat)
+    val g:Graphics2D = canvas.createGraphics()
+    g.setRenderingHint(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON)
+
+    // Background, then the layers in drawing order.
+    g.setColor(backgroundColor)
+    g.fillRect(0, 0, chartSize.width, chartSize.height)
+    drawHeatMap(g, mat)
+    drawXLabel(g)
+    drawYLabel(g)
+    drawAxisBars(g)
+    drawXValues(g)
+    drawYValues(g)
+    canvas
+  }
+
+  /** Paints one filled rectangle per matrix cell into an off-screen image, then copies
+   *  that image onto the chart at heatMapTL. Cells are read by linear index
+   *  row + column * nrows. Only IMat, FMat and DMat are matched.
+   */
+  private def drawHeatMap(chartGraphics:Graphics2D, mat:Mat) = {
+    val rowCount:Int = mat.nrows
+    val colCount:Int = mat.ncols
+
+    val mapImage = new BufferedImage(heatMapSize.width, heatMapSize.height, BufferedImage.TYPE_INT_ARGB)
+    val mapGraphics:Graphics2D = mapImage.createGraphics()
+
+    // Shared cell loop; `valueAt` reads the element at a linear index as a Double.
+    def paintCells(valueAt: Int => Double):Unit = {
+      for (col <- 0 until colCount; row <- 0 until rowCount) {
+        mapGraphics.setColor(getCellColor(valueAt(row + col*rowCount), lowValue, highValue))
+        mapGraphics.fillRect(col*cellSize.width, row*cellSize.height, cellSize.width, cellSize.height)
+      }
+    }
+
+    mat match {
+      case mi:IMat => paintCells(k => mi(k).toDouble)
+      case mf:FMat => paintCells(k => mf(k).toDouble)
+      case md:DMat => paintCells(k => md(k))
+    }
+
+    chartGraphics.drawImage(mapImage, heatMapTL.x, heatMapTL.y, heatMapSize.width, heatMapSize.height, null)
+  }
+
+
+
+  /** Colour for a value within [min, max].
+   *
+   *  Starts at lowValueColor and takes getColorPosition(fraction) single-unit steps
+   *  towards highValueColor. Each step moves the channel that is currently furthest
+   *  from its target; ties go to red, then green.
+   */
+  private def getCellColor(data:Double, min:Double, max:Double):Color = {
+    val fraction:Double = (data - min) / (max - min)
+    val steps:Int = getColorPosition(fraction)
+
+    var red:Int = lowValueColor.getRed()
+    var green:Int = lowValueColor.getGreen
+    var blue:Int = lowValueColor.getBlue()
+
+    var step = 0
+    while (step < steps) {
+      val redGap = red - highValueColor.getRed()
+      val greenGap = green - highValueColor.getGreen()
+      val blueGap = blue - highValueColor.getBlue()
+      val redAbs = math.abs(redGap)
+      val greenAbs = math.abs(greenGap)
+      val blueAbs = math.abs(blueGap)
+
+      if (redAbs >= greenAbs && redAbs >= blueAbs) {
+        red = changeColorValue(red, redGap)
+      } else if (greenAbs >= blueAbs) {
+        green = changeColorValue(green, greenGap)
+      } else {
+        blue = changeColorValue(blue, blueGap)
+      }
+      step += 1
+    }
+    new Color(red, green, blue)
+  }
+
+
+  /** Number of colour steps for a 0..1 position: round(colorValueDistance * position^colorScale). */
+  private def getColorPosition(percentPosition:Double):Int = {
+    val curved:Double = math.pow(percentPosition, colorScale)
+    val scaled:Double = colorValueDistance * curved
+    math.round(scaled).toInt
+  }
+
+  /** Sets colorValueDistance to the sum of the absolute per-channel differences
+   *  between lowValueColor and highValueColor.
+   */
+  private def updateColorDistance() = {
+    val low = lowValueColor
+    val high = highValueColor
+    val redGap = math.abs(low.getRed() - high.getRed())
+    val greenGap = math.abs(low.getGreen() - high.getGreen())
+    val blueGap = math.abs(low.getBlue() - high.getBlue())
+    colorValueDistance = redGap + greenGap + blueGap
+  }
+
+  /** Moves a colour channel one unit towards its target.
+   *
+   *  @param colorValue    current channel value
+   *  @param colorDistance current minus target; negative means the channel is below target
+   *  @return colorValue + 1, colorValue - 1, or colorValue unchanged when the distance is 0
+   */
+  private def changeColorValue(colorValue:Int, colorDistance:Int):Int = colorDistance match {
+    case d if d < 0 => colorValue + 1
+    case d if d > 0 => colorValue - 1
+    case _          => colorValue
+  }
+
+  /** Measures axis titles and tick values with a scratch graphics context, then derives
+   *  heatMapSize and chartSize from those measurements, the cell size and the margins.
+   */
+  private def measureComponents() = {
+    // A 1x1 image is enough to obtain FontMetrics.
+    val scratch:Graphics2D = new BufferedImage(1, 1, BufferedImage.TYPE_INT_ARGB).createGraphics()
+
+    // X axis title
+    if (xAxisLabel == null) {
+      xAxisLabelSize = new Dimension(0, 0)
+    } else {
+      scratch.setFont(axisLabelsFont)
+      val fm:FontMetrics = scratch.getFontMetrics()
+      xAxisLabelSize = new Dimension(fm.stringWidth(xAxisLabel), fm.getHeight())
+      xAxisLabelDescent = fm.getDescent()
+    }
+
+    // Y axis title
+    if (yAxisLabel == null) {
+      yAxisLabelSize = new Dimension(0, 0)
+    } else {
+      scratch.setFont(axisLabelsFont)
+      val fm:FontMetrics = scratch.getFontMetrics()
+      yAxisLabelSize = new Dimension(fm.stringWidth(yAxisLabel), fm.getHeight())
+      yAxisLabelAscent = fm.getAscent()
+    }
+
+    // X tick values: line height and the widest rendered value
+    if (showXAxisValues) {
+      scratch.setFont(axisValuesFont)
+      val fm:FontMetrics = scratch.getFontMetrics()
+      xAxisValuesHeight = fm.getHeight()
+      xAxisValuesWidthMax = xValues.foldLeft(0) { (widest, v) =>
+        math.max(widest, fm.stringWidth(v.toString()))
+      }
+    } else {
+      xAxisValuesHeight = 0
+    }
+
+    // Y tick values: line height, ascent and the widest rendered value
+    if (showYAxisValues) {
+      scratch.setFont(axisValuesFont)
+      val fm:FontMetrics = scratch.getFontMetrics()
+      yAxisValuesHeight = fm.getHeight()
+      yAxisValuesAscent = fm.getAscent()
+      yAxisValuesWidthMax = yValues.foldLeft(0) { (widest, v) =>
+        math.max(widest, fm.stringWidth(v.toString()))
+      }
+    } else {
+      yAxisValuesHeight = 0
+    }
+
+    val mapWidth:Int = mat.ncols * cellSize.width
+    val mapHeight:Int = mat.nrows * cellSize.height
+    heatMapSize = new Dimension(mapWidth, mapHeight)
+
+    // Room taken by tick values depends on whether they are drawn upright or rotated.
+    val yValuesExtent:Int = if (yValuesHorizontal) yAxisValuesWidthMax else yAxisValuesHeight
+    val xValuesExtent:Int = if (xValuesHorizontal) xAxisValuesHeight else xAxisValuesWidthMax
+
+    val totalWidth:Int = mapWidth + (2 * margin) + yAxisLabelSize.height + yValuesExtent + axisThickness
+    val totalHeight:Int = mapHeight + (2 * margin) + xAxisLabelSize.height + xValuesExtent + axisThickness
+    chartSize = new Dimension(totalWidth, totalHeight)
+  }
+
+  /** Recomputes the heat map's top-left, bottom-right and centre points from the
+   *  current margins and measurements.
+   */
+  private def updateCoordinates():Unit = {
+    val yValuesExtent:Int = if (yValuesHorizontal) yAxisValuesWidthMax else yAxisValuesHeight
+    val left:Int = margin + axisThickness + yAxisLabelSize.height + yValuesExtent
+    val top:Int = margin
+
+    heatMapTL = new Point(left, top)
+    heatMapBR = new Point(heatMapTL.x + heatMapSize.width, heatMapTL.y + heatMapSize.height)
+    heatMapC = new Point(heatMapTL.x + (heatMapSize.width / 2), heatMapTL.y + (heatMapSize.height / 2))
+  }
+
+  /** Draws the x axis title centred under the heat map; does nothing when the title is null. */
+  private def drawXLabel(chartGraphics:Graphics2D) = {
+    if (xAxisLabel != null) {
+      val baselineY:Int = chartSize.height - (margin / 2) - xAxisLabelDescent
+      val startX:Int = heatMapC.x - (xAxisLabelSize.width / 2)
+
+      chartGraphics.setFont(axisLabelsFont)
+      chartGraphics.setColor(axisLabelColor)
+      chartGraphics.drawString(xAxisLabel, startX, baselineY)
+    }
+  }
+  
+  /** Draws the y axis title rotated by 270 degrees, centred on the heat map's height;
+   *  does nothing when the title is null. The previous transform is restored afterwards.
+   */
+  private def drawYLabel(chartGraphics:Graphics2D) = {
+    if (yAxisLabel != null) {
+      val anchorY:Int = heatMapC.y + (yAxisLabelSize.width / 2)
+      val anchorX:Int = (margin / 2) + yAxisLabelAscent
+
+      chartGraphics.setFont(axisLabelsFont)
+      chartGraphics.setColor(axisLabelColor)
+
+      val saved:AffineTransform = chartGraphics.getTransform()
+      val rotated:AffineTransform = saved.clone().asInstanceOf[AffineTransform]
+      rotated.rotate(math.toRadians(270), anchorX, anchorY)
+
+      chartGraphics.setTransform(rotated)
+      chartGraphics.drawString(yAxisLabel, anchorX, anchorY)
+      chartGraphics.setTransform(saved)
+    }
+  }
+
+
+  /** Draws the horizontal bar under the heat map and the vertical bar to its left;
+   *  does nothing when axisThickness is not positive.
+   */
+  private def drawAxisBars(chartGraphics:Graphics2D) = {
+    if (axisThickness > 0) {
+      chartGraphics.setColor(axisColor)
+      val barLeft:Int = heatMapTL.x - axisThickness
+
+      // X axis: along the bottom edge, extended left to meet the y axis bar.
+      chartGraphics.fillRect(barLeft, heatMapBR.y, heatMapSize.width + axisThickness, axisThickness)
+
+      // Y axis: along the left edge.
+      chartGraphics.fillRect(barLeft, heatMapTL.y, axisThickness, heatMapSize.height)
+    }
+  }
+
+
+  /** Draws the x tick values below the heat map, one for every column index that is a
+   *  multiple of xAxisValuesFrequency. Values are drawn upright when xValuesHorizontal
+   *  is set, otherwise rotated by 270 degrees.
+   */
+  private def drawXValues(chartGraphics:Graphics2D) = {
+    // Draws `text` rotated by 270 degrees about (x, y), then restores the previous transform.
+    def drawRotated(text:String, x:Int, y:Int):Unit = {
+      val saved:AffineTransform = chartGraphics.getTransform()
+      val rotated:AffineTransform = saved.clone().asInstanceOf[AffineTransform]
+      rotated.rotate(math.toRadians(270), x, y)
+      chartGraphics.setTransform(rotated)
+      chartGraphics.drawString(text, x, y)
+      chartGraphics.setTransform(saved)
+    }
+
+    if (showXAxisValues) {
+      chartGraphics.setColor(axisValuesColor)
+      for (col <- 0 until mat.ncols if col % xAxisValuesFrequency == 0) {
+        val text:String = xValues(col).toString()
+        chartGraphics.setFont(axisValuesFont)
+        val fm:FontMetrics = chartGraphics.getFontMetrics()
+        val textWidth:Int = fm.stringWidth(text)
+        val cellLeft:Int = col * cellSize.width
+
+        if (xValuesHorizontal) {
+          // Centre the text under its cell.
+          val textX:Int = (cellLeft + ((cellSize.width / 2) - (textWidth / 2))) + heatMapTL.x
+          val textY:Int = heatMapBR.y + fm.getAscent() + 1
+          chartGraphics.drawString(text, textX, textY)
+        } else {
+          val textX:Int = heatMapTL.x + cellLeft + ((cellSize.width / 2) + (xAxisValuesHeight / 2))
+          val textY:Int = heatMapBR.y + axisThickness + textWidth
+          drawRotated(text, textX, textY)
+        }
+      }
+    }
+  }
+
+
+  /** Draws the y tick values left of the heat map, one for every row index that is a
+   *  multiple of yAxisValuesFrequency. Values are drawn upright and right-aligned when
+   *  yValuesHorizontal is set, otherwise rotated by 270 degrees.
+   */
+  private def drawYValues(chartGraphics:Graphics2D) = {
+    // Draws `text` rotated by 270 degrees about (x, y), then restores the previous transform.
+    def drawRotated(text:String, x:Int, y:Int):Unit = {
+      val saved:AffineTransform = chartGraphics.getTransform()
+      val rotated:AffineTransform = saved.clone().asInstanceOf[AffineTransform]
+      rotated.rotate(math.toRadians(270), x, y)
+      chartGraphics.setTransform(rotated)
+      chartGraphics.drawString(text, x, y)
+      chartGraphics.setTransform(saved)
+    }
+
+    if (showYAxisValues) {
+      chartGraphics.setColor(axisValuesColor)
+      for (row <- 0 until mat.nrows if row % yAxisValuesFrequency == 0) {
+        val text:String = yValues(row).toString()
+        chartGraphics.setFont(axisValuesFont)
+        val fm:FontMetrics = chartGraphics.getFontMetrics()
+        val textWidth:Int = fm.stringWidth(text)
+        val cellMiddle:Int = heatMapTL.y + (row * cellSize.height) + (cellSize.height/2)
+
+        if (yValuesHorizontal) {
+          // Right-align against the widest value.
+          val textX:Int = margin + yAxisLabelSize.height + (yAxisValuesWidthMax - textWidth)
+          val textY:Int = cellMiddle + (yAxisValuesAscent/2)
+          chartGraphics.drawString(text, textX, textY)
+        } else {
+          val textX:Int = margin + yAxisLabelSize.height + yAxisValuesAscent
+          val textY:Int = cellMiddle + (textWidth/2)
+          drawRotated(text, textX, textY)
+        }
+      }
+    }
+  }
+
+
+  /** Sets the x tick values to xOffset, xOffset + xInterval, ... (one per matrix column). */
+  def setXValues(xOffset:Double, xInterval:Double) = {
+    var col = 0
+    while (col < mat.ncols) {
+      xValues(col) = xOffset + (col * xInterval)
+      col += 1
+    }
+  }
+
+  /** Sets the y tick values to yOffset, yOffset + yInterval, ... (one per matrix row). */
+  def setYValues(yOffset:Double, yInterval:Double) = {
+    var row = 0
+    while (row < mat.nrows) {
+      yValues(row) = yOffset + (row * yInterval)
+      row += 1
+    }
+  }
+
+
+  /** Renders the chart and writes it to `outputFile`, using the file extension as the
+   *  ImageIO format name. "jpg" / "jpeg" (any case) are rendered without an alpha
+   *  channel; every other extension is rendered with one.
+   *
+   *  @return the result of ImageIO.write
+   *  @throws IOException if the file name contains no '.'
+   */
+  def saveToFile(outputFile:File) = {
+    val name:String = outputFile.getName()
+    val dotAt:Int = name.lastIndexOf('.')
+    if (dotAt < 0) {
+      throw new IOException("Illegal filename: need a extension.")
+    }
+
+    val ext:String = name.substring(dotAt + 1)
+    val lowered:String = ext.toLowerCase()
+    val isJpeg:Boolean = lowered.equals("jpg") || lowered.equals("jpeg")
+
+    val chart:BufferedImage = getChartImage(!isJpeg)
+    ImageIO.write(chart, ext, outputFile)
+  }
+
+}
+
+
diff --git a/src/main/scala/BIDMat/SDMat.scala b/src/main/scala/BIDMat/SDMat.scala
new file mode 100755
index 00000000..d6cdcb62
--- /dev/null
+++ b/src/main/scala/BIDMat/SDMat.scala
@@ -0,0 +1,225 @@
+package BIDMat
+
+import edu.berkeley.bid.SPBLAS._
+
+case class SDMat(nr:Int, nc:Int, nnz1:Int, ir0:Array[Int], jc0:Array[Int], data0:Array[Double]) extends SparseMat[Double](nr, nc, nnz1, ir0, jc0, data0) {
+
+  // Thin typed wrappers: each calls a generic routine (gt, gfind, gapply, sgMatOp, ...
+  // defined outside this view, presumably on SparseMat) and rewraps the result.
+
+  // Accessor for the value array `data`.
+  def getdata() = data;	
+  
+  // Wraps the result of gt as an SDMat.
+  override def t:SDMat = SDMat(gt)
+  
+  override def mytype = "SDMat"
+  
+  // Concatenation with another SDMat; the result stays sparse.
+  def horzcat(b: SDMat) = SDMat(super.horzcat(b))
+  
+  def vertcat(b: SDMat) = SDMat(super.vertcat(b))
+  
+  // The find variants wrap the results of gfind / gfind2 / gfind3 as IMat (and DMat) values.
+  def find:IMat = IMat(gfind)
+  
+  def find2:(IMat, IMat) = { val (ii, jj) = gfind2 ; (IMat(ii), IMat(jj)) }
+  
+  def find3:(IMat, IMat, DMat) = { val (ii, jj, vv) = gfind3 ; (IMat(ii), IMat(jj), DMat(vv)) }	
+  
+  // Indexing with two IMat arguments, via gapply.
+  override def apply(a:IMat, b:IMat):SDMat = SDMat(gapply(a, b))	
+  
+  // Binary op with another SDMat / with a scalar / reduction; `omat` is passed through
+  // to the generic routine unchanged.
+  def ssMatOp(b: SDMat, f:(Double, Double) => Double, omat:Mat) = SDMat(sgMatOp(b, f, omat))
+  
+  def ssMatOpScalar(b: Double, f:(Double, Double) => Double, omat:Mat) = SDMat(sgMatOpScalar(b, f, omat))
+  
+  def ssReduceOp(n:Int, f1:(Double) => Double, f2:(Double, Double) => Double, omat:Mat) = DMat(sgReduceOp(n, f1, f2, omat))
+  
+  // Concatenation with a dense DMat: this matrix is converted with MatFunctions.full
+  // first, so the result is dense.
+  def horzcat(a:DMat):DMat = MatFunctions.full(this).horzcat(a)
+  
+  def vertcat(a:DMat):DMat = MatFunctions.full(this).vertcat(a)
+
+  // Multiplies this sparse matrix by `a` (an SDMat or a DMat) and returns a dense DMat.
+  //
+  // a:    right operand; must have a.nrows == ncols, otherwise RuntimeException("dimensions mismatch").
+  //       Any type other than SDMat / DMat raises RuntimeException("unsupported arg").
+  // omat: passed to DMat.newOrCheckDMat to obtain the result matrix; when it is non-null
+  //       the result is cleared before accumulation.
+  //
+  // Storage as used here: jc(c) .. jc(c+1) delimits the entries of column c, ir(k) is the
+  // row of entry k, data(k) its value. `ioff` (Mat.ioneBased) is subtracted from every
+  // stored index before it is used as an array offset.
+  def SMult(a:Mat, omat:DMat):DMat = {
+    val ioff = Mat.ioneBased
+    if (ncols != a.nrows) {
+      throw new RuntimeException("dimensions mismatch")
+    } else {
+      a match {
+	// sparse * sparse: for each entry (row r, column i) of `a`, add column r of this
+	// matrix, scaled by that entry, into column i of the output.
+	case aa:SDMat => {
+	  val out = DMat.newOrCheckDMat(nrows, a.ncols, omat)
+	  if (omat.asInstanceOf[AnyRef] != null) out.clear
+	  var i = 0
+	  while (i < a.ncols) {
+	    var j =aa.jc(i)-ioff
+	    while (j < aa.jc(i+1)-ioff) {
+	      val dval = aa.data(j)
+	      var k = jc(aa.ir(j)-ioff)-ioff
+	      while (k < jc(aa.ir(j)+1-ioff)-ioff) {
+		out.data(ir(k)-ioff+nrows*i) +=  data(k) * dval
+		k += 1
+	      }
+	      j += 1
+	    }
+	    i += 1
+	  }
+	  out
+	}
+	// sparse * dense
+	case dd:DMat => {
+	  val out = DMat.newOrCheckDMat(nrows, a.ncols, omat)
+	  if (omat.asInstanceOf[AnyRef] != null) out.clear
+	  Mat.nflops += 2L * nnz * a.ncols
+	  if (Mat.noMKL) {
+	    // Pure-Scala path: dd is read column-major (element (j, i) at j + i*dd.nrows).
+	    var i = 0
+	    while (i < dd.ncols) {
+	      var j = 0
+	      while (j < ncols) {
+		val dval = dd.data(j + i*dd.nrows)
+		var k = jc(j)-ioff
+		while (k < jc(j+1)-ioff) {
+		  out.data(ir(k)-ioff + i*nrows) += dval * data(k);
+		  k += 1
+		}
+		j += 1
+	      }
+	      i += 1
+	    }
+	  } else {
+	    // Native path: when indices are stored zero-based (ioff == 0), incremented copies
+	    // are passed to dcscmm instead.
+	    val nc = dd.ncols
+            var jc0 = jc
+            var ir0 = ir
+	    if (ioff == 0) {
+	      jc0 = SparseMat.incInds(jc)
+              ir0 = SparseMat.incInds(ir)
+            }
+            //	    if (dd.ncols == 1) {
+              // Seg faults in Linux and Windows:
+              //                dcscmv("N", nrows, ncols, 1.0, "GLNF", data, ir, jc, dd.data, 0.0, out.data) 
+            //	    } else {
+	    dcscmm("N", nrows, nc, ncols, 1.0, "GLNF", data, ir0, jc0, dd.data, ncols, 0.0, out.data, nr)
+            //	    }
+	  }
+	  out
+	}
+	case _ => throw new RuntimeException("unsupported arg")
+      }
+    }	
+  }
+  
+  /** Computes (this transposed) * a with the native dcscmm routine and returns a dense
+   *  ncols x a.ncols result.
+   *
+   *  @param a    dense right operand; must have the same number of rows as this matrix
+   *  @param omat passed to DMat.newOrCheckDMat to obtain the result matrix; when it is
+   *              non-null the result is cleared first
+   *  @return the dense product
+   *  @throws RuntimeException("dimensions mismatch") if nrows != a.nrows
+   */
+  def Tmult(a:DMat, omat:DMat):DMat = {
+    // The product is only defined for equal row counts. Check here, as SMult does for its
+    // own operands, rather than handing mismatched buffers to native code.
+    if (nrows != a.nrows) {
+      throw new RuntimeException("dimensions mismatch")
+    }
+    val out = DMat.newOrCheckDMat(ncols, a.ncols, omat)
+    if (omat.asInstanceOf[AnyRef] != null) out.clear
+    // When indices are stored zero-based, incremented copies are passed to dcscmm.
+    val zeroBased = Mat.ioneBased == 0
+    val jcNative = if (zeroBased) SparseMat.incInds(jc) else jc
+    val irNative = if (zeroBased) SparseMat.incInds(ir) else ir
+    dcscmm("T", nrows, a.ncols, ncols, 1.0, "GLNF", data, irNative, jcNative, a.data, a.nrows, 0.0, out.data, out.nrows)
+    Mat.nflops += 2L * nnz * a.ncols
+    out
+  }
+  
+  // Sparse * sparse product returning a sparse SDMat.
+  // Throws RuntimeException("dimensions mismatch") unless ncols == a.nrows.
+  //
+  // Two passes over the entries of `a`:
+  //   1. count the partial products (one per entry in the matching column of this matrix);
+  //   2. emit each partial product as a (row, column, value) triple.
+  // The triples go to SparseMat.sparseImpl, which builds the result.
+  // NOTE(review): triples that share a (row, column) are not merged here — presumably
+  // sparseImpl combines them; confirm.
+  def SSMult(a:SDMat):SDMat = 
+    if (ncols != a.nrows) {
+      throw new RuntimeException("dimensions mismatch")
+    } else {
+      val ioff = Mat.ioneBased
+      // Pass 1: count partial products so the triple arrays can be sized exactly.
+      var numnz = 0
+      var i = 0
+      while (i < a.ncols) {
+	var j = a.jc(i)-ioff
+	while (j < a.jc(i+1)-ioff) {
+	  numnz += jc(a.ir(j)-ioff+1) - jc(a.ir(j)-ioff)
+	  j += 1
+	}
+	i += 1
+      }
+      val ii = new Array[Int](numnz)
+      val jj = new Array[Int](numnz)
+      val vv = new Array[Double](numnz)
+      // Pass 2: fill the triples; numnz is reused as the write cursor.
+      numnz = 0
+      i = 0
+      while (i < a.ncols) {
+	var j = a.jc(i)-ioff
+	while (j < a.jc(i+1)-ioff) {
+	  val dval = a.data(j)
+	  var k = jc(a.ir(j)-ioff)-ioff
+	  while (k < jc(a.ir(j)-ioff+1)-ioff) {
+	    vv(numnz) =  data(k) * dval
+	    ii(numnz) = ir(k)-ioff
+	    jj(numnz) = i
+	    numnz += 1
+	    k += 1
+	  }
+	  j += 1
+	}
+	i += 1
+      }
+      SDMat(SparseMat.sparseImpl[Double](ii, jj, vv, nrows, a.ncols)) 
+    }	
+  
+  // Operators against another SDMat go through ssMatOp with a null output matrix.
+  // Products with a dense or generic Mat go through SMult / Tmult and return a DMat.
+  def + (b : SDMat) = ssMatOp(b, (x:Double, y:Double) => x + y, null)
+  def - (b : SDMat) = ssMatOp(b, (x:Double, y:Double) => x - y, null)
+  def * (b : DMat):DMat = SMult(b, null)
+  def Tx (b : DMat):DMat = Tmult(b, null)
+  override def * (b : Mat):DMat = SMult(b, null)
+  // *! is the sparse * sparse product with a sparse result (SSMult).
+  def *! (b : SDMat) = SSMult(b)
+  def *@ (b : SDMat) = ssMatOp(b, (x:Double, y:Double) => x * y, null)
+  def /@ (b : SDMat) = ssMatOp(b, (x:Double, y:Double) => x / y, null)
+  
+  // Comparisons yield 1.0 where the predicate holds and 0.0 otherwise.
+  // == and === use the same function.
+  def > (b : SDMat) = ssMatOp(b, (x:Double, y:Double) => if (x > y) 1.0 else 0.0, null)
+  def < (b : SDMat) = ssMatOp(b, (x:Double, y:Double) => if (x < y) 1.0 else 0.0, null)
+  def == (b : SDMat) = ssMatOp(b, (x:Double, y:Double) => if (x == y) 1.0 else 0.0, null)
+  def === (b : SDMat) = ssMatOp(b, (x:Double, y:Double) => if (x == y) 1.0 else 0.0, null)
+  def >= (b : SDMat) = ssMatOp(b, (x:Double, y:Double) => if (x >= y) 1.0 else 0.0, null)
+  def <= (b : SDMat) = ssMatOp(b, (x:Double, y:Double) => if (x <= y) 1.0 else 0.0, null)
+  def != (b : SDMat) = ssMatOp(b, (x:Double, y:Double) => if (x != y) 1.0 else 0.0, null)
+  
+  // Scalar variants: the same functions applied through ssMatOpScalar.
+  override def + (b : Double) = ssMatOpScalar(b, (x:Double, y:Double) => x + y, null)
+  override def - (b : Double) = ssMatOpScalar(b, (x:Double, y:Double) => x - y, null)
+  override def *@ (b : Double) = ssMatOpScalar(b, (x:Double, y:Double) => x * y, null)
+  override def /@ (b : Double) = ssMatOpScalar(b, (x:Double, y:Double) => x / y, null)
+  
+  override def > (b : Double) = ssMatOpScalar(b, (x:Double, y:Double) => if (x > y) 1.0 else 0.0, null)
+  override def < (b : Double) = ssMatOpScalar(b, (x:Double, y:Double) => if (x < y) 1.0 else 0.0, null)
+  override def == (b : Double) = ssMatOpScalar(b, (x:Double, y:Double) => if (x == y) 1.0 else 0.0, null)
+  override def >= (b : Double) = ssMatOpScalar(b, (x:Double, y:Double) => if (x >= y) 1.0 else 0.0, null)
+  override def <= (b : Double) = ssMatOpScalar(b, (x:Double, y:Double) => if (x <= y) 1.0 else 0.0, null)
+  override def != (b : Double) = ssMatOpScalar(b, (x:Double, y:Double) => if (x != y) 1.0 else 0.0, null)
+  
+  // Concatenation operators: \ is horzcat, on is vertcat.
+  def \ (b: SDMat) = horzcat(b)
+  def on (b: SDMat) = vertcat(b)
+  
+  /** Copies this matrix into a new SMat: the index arrays are copied as-is and the
+   *  values are converted with Mat.copyToFloatArray.
+   */
+  def toSMat:SMat = {
+    val single = SMat(nrows, ncols, nnz)
+    // Values first, then the row indices (nnz entries) and column pointers (ncols+1 entries).
+    Mat.copyToFloatArray(data, 0, single.data, 0, nnz)
+    System.arraycopy(ir, 0, single.ir, 0, nnz)
+    System.arraycopy(jc, 0, single.jc, 0, ncols+1)
+    single
+  }
+  
+  // Allocates a fresh SDMat of the given shape with room for `nnz` entries
+  // (new, zero-filled arrays via the companion's three-argument apply).
+  override def zeros(nr:Int, nc:Int, nnz:Int) = SDMat(nr, nc, nnz)
+  
+  /** Returns an SDMat of the requested shape that reuses this matrix's arrays where they
+   *  are large enough, allocating new ones only where they are too small.
+   *
+   *  @param nr  number of rows of the result
+   *  @param nc  number of columns of the result
+   *  @param nnz number of stored entries the result must hold
+   */
+  override def recycle(nr:Int, nc:Int, nnz:Int):SDMat = {
+    val jcReuse = if (jc.size >= nc+1) jc else new Array[Int](nc+1)
+    val irReuse = if (ir.size >= nnz) ir else new Array[Int](nnz)
+    val dataReuse = if (data.size >= nnz) data else new Array[Double](nnz)
+    // The constructor takes the row indices (ir) before the column pointers (jc).
+    // Passing them the other way round swaps the two index arrays in the result.
+    new SDMat(nr, nc, nnz, irReuse, jcReuse, dataReuse)
+  }
+}
+
+/** Pairs a sparse matrix with a dense output matrix. Products taken through the pair
+ *  pass `omat` to SMult / Tmult as the output argument.
+ */
+class SDPair (val omat:DMat, val mat:SDMat) extends Pair{
+  // Product with a dense or generic right operand.
+  def * (b : DMat):DMat = {
+    mat.SMult(b, omat)
+  }
+  override def * (b : Mat):DMat = {
+    mat.SMult(b, omat)
+  }
+
+  // Transposed product with a dense right operand.
+  def Tx (b : DMat):DMat = {
+    mat.Tmult(b, omat)
+  }
+}
+
+/** Factory methods for SDMat. */
+object SDMat {
+
+  /** New nr x nc matrix with freshly allocated arrays sized for nnz0 entries. */
+  def apply(nr:Int, nc:Int, nnz0:Int):SDMat = {
+    val rowIndices = new Array[Int](nnz0)
+    val colPointers = new Array[Int](nc+1)
+    val values = new Array[Double](nnz0)
+    new SDMat(nr, nc, nnz0, rowIndices, colPointers, values)
+  }
+
+  /** Wraps a generic SparseMat[Double] as an SDMat; the arrays are shared, not copied. */
+  def apply(a:SparseMat[Double]):SDMat = {
+    new SDMat(a.nrows, a.ncols, a.nnz, a.ir, a.jc, a.data)
+  }
+
+  /** Converts an SMat via its toSDMat method. */
+  def apply(a:SMat) = a.toSDMat
+
+  /** Like the three-argument apply, but with a null row-index array. */
+  def SDnoRows(nr:Int, nc:Int, nnz0:Int):SDMat = {
+    val colPointers = new Array[Int](nc+1)
+    val values = new Array[Double](nnz0)
+    new SDMat(nr, nc, nnz0, null, colPointers, values)
+  }
+}
+
+
+
+
+
+
diff --git a/src/main/scala/BIDMat/SMat.scala b/src/main/scala/BIDMat/SMat.scala
new file mode 100755
index 00000000..8ed7164f
--- /dev/null
+++ b/src/main/scala/BIDMat/SMat.scala
@@ -0,0 +1,269 @@
+package BIDMat
+
+import edu.berkeley.bid.SPBLAS._
+
+/**
+ * Sparse matrix of Float values built on SparseMat[Float].
+ *
+ * The multiplication routines below read the storage as compressed columns: entries
+ * jc(j) until jc(j+1) belong to column j, ir(k) is the row of entry k and data(k) its
+ * value. Stored indices are offset by Mat.ioneBased, which is subtracted wherever the
+ * arrays are indexed directly.
+ */
+case class SMat(nr:Int, nc:Int, nnz1:Int, ir0:Array[Int], jc0:Array[Int], data0:Array[Float]) extends SparseMat[Float](nr, nc, nnz1, ir0, jc0, data0) {
+
+  /** Returns the value array of this matrix. */
+  def getdata() = data;
+
+  /** Wraps the result of the generic gt as an SMat. */
+  override def t:SMat = SMat(gt)
+
+  override def mytype = "SMat"
+
+  def horzcat(b: SMat) = SMat(super.horzcat(b))
+
+  def vertcat(b: SMat) = SMat(super.vertcat(b))
+
+  // find/find2/find3 wrap the results of the generic gfind/gfind2/gfind3 in IMat/FMat.
+  def find:IMat = IMat(gfind)
+
+  def find2:(IMat, IMat) = { val (ii, jj) = gfind2 ; (IMat(ii), IMat(jj)) }
+
+  def find3:(IMat, IMat, FMat) = { val (ii, jj, vv) = gfind3 ; (IMat(ii), IMat(jj), FMat(vv)) }
+
+  /** Exposes the value array as an nnz x 1 FMat. */
+  override def contents:FMat = FMat(nnz, 1, data)
+
+  override def apply(a:IMat, b:IMat):SMat = SMat(gapply(a, b))
+
+  // Typed wrappers around the generic sparse operations of SparseMat.
+  def ssMatOp(b: SMat, f:(Float, Float) => Float, omat:Mat) = SMat(sgMatOp(b, f, omat))
+
+  def ssMatOpScalar(b: Float, f:(Float, Float) => Float, omat:Mat) = SMat(sgMatOpScalar(b, f, omat))
+
+  def ssReduceOp(n:Int, f1:(Float) => Float, f2:(Float, Float) => Float, omat:Mat) = FMat(sgReduceOp(n, f1, f2, omat))
+
+  // Concatenation with a dense matrix goes through MatFunctions.full(this) and yields an FMat.
+  def horzcat(a:FMat):FMat = FMat(MatFunctions.full(this).ghorzcat(a))
+
+  def vertcat(a:FMat):FMat = FMat(MatFunctions.full(this).gvertcat(a))
+
+  /**
+   * Computes this * a into a dense FMat, for a sparse (SMat) or dense (FMat) right operand.
+   *
+   * @param a    right operand; must have a.nrows == ncols
+   * @param omat output candidate handed to FMat.newOrCheckFMat; when it is non-null the
+   *             result is cleared before accumulation
+   * @throws RuntimeException on a dimension mismatch or an unsupported operand type
+   */
+  def SMult(a:Mat, omat:Mat):FMat = {
+    val ioff = Mat.ioneBased
+    if (ncols != a.nrows) {
+      throw new RuntimeException("dimensions mismatch")
+    } else {
+      a match {
+        case aa:SMat => {
+          val out = FMat.newOrCheckFMat(nrows, a.ncols, omat)
+          if (omat.asInstanceOf[AnyRef] != null) out.clear
+          var i = 0
+          while (i < a.ncols) {
+            var j = aa.jc(i)-ioff
+            while (j < aa.jc(i+1)-ioff) {
+              // Entry (aa.ir(j), i) of a scales column aa.ir(j) of this matrix.
+              val dval = aa.data(j)
+              var k = jc(aa.ir(j)-ioff)-ioff
+              while (k < jc(aa.ir(j)+1-ioff)-ioff) {
+                out.data(ir(k)-ioff+nrows*i) +=  data(k) * dval
+                k += 1
+              }
+              j += 1
+            }
+            i += 1
+          }
+          out
+        }
+        case dd:FMat => {
+          val out = FMat.newOrCheckFMat(nrows, a.ncols, omat)
+          if (omat.asInstanceOf[AnyRef] != null) out.clear
+          Mat.nflops += 2L * nnz * a.ncols
+          if (Mat.noMKL) {
+            // Plain-Scala path: accumulate column j of this matrix scaled by dd(j, i).
+            var i = 0
+            while (i < dd.ncols) {
+              var j = 0
+              while (j < ncols) {
+                val dval = dd.data(j + i*dd.nrows)
+                var k = jc(j)-ioff
+                while (k < jc(j+1)-ioff) {
+                  out.data(ir(k)-ioff + i*nrows) += dval * data(k);
+                  k += 1
+                }
+                j += 1
+              }
+              i += 1
+            }
+          } else {
+            val nc = dd.ncols
+            var jc0 = jc
+            var ir0 = ir
+            // When indices are stored zero-based, hand the native routine the result of
+            // SparseMat.incInds instead (presumably one-based copies -- TODO confirm).
+            if (ioff == 0) {
+              jc0 = SparseMat.incInds(jc)
+              ir0 = SparseMat.incInds(ir)
+            }
+            //	    if (dd.ncols == 1) {
+            // Seg faults in linux and windows
+            //                scscmv("N", nrows, ncols, 1.0f, "GLNF", data, ir, jc, dd.data, 0f, out.data)
+            //	    } else {
+            scscmm("N", nrows, nc, ncols, 1.0f, "GLNF", data, ir0, jc0, dd.data, ncols, 0f, out.data, out.nrows)
+            //	  }
+          }
+          out
+        }
+        case _ => throw new RuntimeException("unsupported arg")
+      }
+    }
+  }
+
+  /**
+   * Computes (this transposed) * a into a dense ncols x a.ncols FMat.
+   *
+   * Mirrors SMult: the dimensions are validated first, and a plain-Scala loop is used
+   * when Mat.noMKL is set so that the native scscmm routine is not required.
+   *
+   * @param a    dense right operand; must have a.nrows == nrows
+   * @param omat output candidate handed to FMat.newOrCheckFMat
+   * @throws RuntimeException if nrows != a.nrows
+   */
+  def Tmult(a:FMat, omat:Mat):FMat = {
+    if (nrows != a.nrows) {
+      throw new RuntimeException("dimensions mismatch")
+    }
+    val out = FMat.newOrCheckFMat(ncols, a.ncols, omat)
+    if (omat.asInstanceOf[AnyRef] != null) out.clear
+    if (Mat.noMKL) {
+      val ioff = Mat.ioneBased
+      var i = 0
+      while (i < a.ncols) {
+        var j = 0
+        while (j < ncols) {
+          // out(j, i) is the dot product of column j of this matrix with column i of a.
+          var acc = 0f
+          var k = jc(j)-ioff
+          while (k < jc(j+1)-ioff) {
+            acc += data(k) * a.data(ir(k)-ioff + i*a.nrows)
+            k += 1
+          }
+          // Assign rather than accumulate, matching the 0f beta passed to scscmm below.
+          out.data(j + i*out.nrows) = acc
+          j += 1
+        }
+        i += 1
+      }
+    } else {
+      var jc0 = jc
+      var ir0 = ir
+      if (Mat.ioneBased == 0) {
+        jc0 = SparseMat.incInds(jc)
+        ir0 = SparseMat.incInds(ir)
+      }
+      scscmm("T", nrows, a.ncols, ncols, 1.0f, "GLNF", data, ir0, jc0, a.data, a.nrows, 0f, out.data, out.nrows)
+    }
+    Mat.nflops += 2L * nnz * a.ncols
+    out
+  }
+
+  /**
+   * Sparse * sparse product returning an SMat.
+   *
+   * A first pass counts the elementary products, a second pass records each one as a
+   * (row, column, value) triple, and SparseMat.sparseImpl assembles the result
+   * (presumably combining triples that share a position -- TODO confirm).
+   *
+   * @throws RuntimeException if ncols != a.nrows
+   */
+  def SSMult(a:SMat):SMat =
+    if (ncols != a.nrows) {
+      throw new RuntimeException("dimensions mismatch")
+    } else {
+      val ioff = Mat.ioneBased
+      var numnz = 0
+      var i = 0
+      while (i < a.ncols) {
+        var j = a.jc(i)-ioff
+        while (j < a.jc(i+1)-ioff) {
+          numnz += jc(a.ir(j)-ioff+1) - jc(a.ir(j)-ioff)
+          j += 1
+        }
+        i += 1
+      }
+      val ii = new Array[Int](numnz)
+      val jj = new Array[Int](numnz)
+      val vv = new Array[Float](numnz)
+      numnz = 0
+      i = 0
+      while (i < a.ncols) {
+        var j = a.jc(i)-ioff
+        while (j < a.jc(i+1)-ioff) {
+          val dval = a.data(j)
+          var k = jc(a.ir(j)-ioff)-ioff
+          while (k < jc(a.ir(j)-ioff+1)-ioff) {
+            vv(numnz) =  data(k) * dval
+            ii(numnz) = ir(k)-ioff
+            jj(numnz) = i
+            numnz += 1
+            k += 1
+          }
+          j += 1
+        }
+        i += 1
+      }
+      SMat(SparseMat.sparseImpl[Float](ii, jj, vv, nrows, a.ncols))
+    }
+
+  // Operators on two SMats; results are freshly produced (null output argument).
+  def + (b : SMat) = ssMatOp(b, (x:Float, y:Float) => x + y, null)
+  def - (b : SMat) = ssMatOp(b, (x:Float, y:Float) => x - y, null)
+  def * (b : FMat):FMat = SMult(b, null)
+  def Tx (b : FMat):FMat = Tmult(b, null)
+  def *! (b : SMat) = SSMult(b)
+  def *@ (b : SMat) = ssMatOp(b, (x:Float, y:Float) => x * y, null)
+  def /@ (b : SMat) = ssMatOp(b, (x:Float, y:Float) => x / y, null)
+
+  // Comparisons: the supplied function returns 1.0f when the test holds and 0f otherwise.
+  def > (b : SMat) = ssMatOp(b, (x:Float, y:Float) => if (x > y) 1.0f else 0f, null)
+  def < (b : SMat) = ssMatOp(b, (x:Float, y:Float) => if (x < y) 1.0f else 0f, null)
+  def == (b : SMat) = ssMatOp(b, (x:Float, y:Float) => if (x == y) 1.0f else 0f, null)
+  def === (b : SMat) = ssMatOp(b, (x:Float, y:Float) => if (x == y) 1.0f else 0f, null)
+  def >= (b : SMat) = ssMatOp(b, (x:Float, y:Float) => if (x >= y) 1.0f else 0f, null)
+  def <= (b : SMat) = ssMatOp(b, (x:Float, y:Float) => if (x <= y) 1.0f else 0f, null)
+  def != (b : SMat) = ssMatOp(b, (x:Float, y:Float) => if (x != y) 1.0f else 0f, null)
+
+  // Scalar variants of the arithmetic and comparison operators.
+  override def + (b : Float) = ssMatOpScalar(b, (x:Float, y:Float) => x + y, null)
+  override def - (b : Float) = ssMatOpScalar(b, (x:Float, y:Float) => x - y, null)
+  override def *@ (b : Float) = ssMatOpScalar(b, (x:Float, y:Float) => x * y, null)
+  override def /@ (b : Float) = ssMatOpScalar(b, (x:Float, y:Float) => x / y, null)
+
+  override def > (b : Float) = ssMatOpScalar(b, (x:Float, y:Float) => if (x > y) 1.0f else 0f, null)
+  override def < (b : Float) = ssMatOpScalar(b, (x:Float, y:Float) => if (x < y) 1.0f else 0f, null)
+  override def == (b : Float) = ssMatOpScalar(b, (x:Float, y:Float) => if (x == y) 1.0f else 0f, null)
+  override def >= (b : Float) = ssMatOpScalar(b, (x:Float, y:Float) => if (x >= y) 1.0f else 0f, null)
+  override def <= (b : Float) = ssMatOpScalar(b, (x:Float, y:Float) => if (x <= y) 1.0f else 0f, null)
+  override def != (b : Float) = ssMatOpScalar(b, (x:Float, y:Float) => if (x != y) 1.0f else 0f, null)
+
+  override def * (b : Mat):FMat = SMult(b, null)
+
+  /** Generic-typed transpose product; only FMat operands are supported. */
+  override def Tx (b : Mat):Mat = b match {
+    case bb:FMat => Tmult(bb, null)
+    // Same error style as SMult instead of a bare MatchError.
+    case _ => throw new RuntimeException("unsupported arg")
+  }
+
+  def \ (b: SMat) = horzcat(b)
+  def on (b: SMat) = vertcat(b)
+
+  /** Pairs this matrix (as the output argument) with b; see SPair. */
+  def ~ (b : SMat):SPair = new SPair(this, b)
+
+  override def ~ (b: Mat):Pair =
+    b match {
+    case sb:SMat => new SPair(this, sb)
+    case _ => throw new RuntimeException("mismatched types for operator ~")
+  }
+
+  /**
+   * Builds an SDMat with the same dimensions and nnz, copying jc and ir and converting
+   * the values with Mat.copyToDoubleArray.
+   */
+  def toSDMat:SDMat = {
+    val out = SDMat(nrows, ncols, nnz)
+    System.arraycopy(jc, 0, out.jc, 0, ncols+1)
+    System.arraycopy(ir, 0, out.ir, 0, nnz)
+    Mat.copyToDoubleArray(data, 0, out.data, 0, nnz)
+    out
+  }
+
+  override def zeros(nr:Int, nc:Int, nnz:Int) = SMat(nr, nc, nnz)
+
+  /**
+   * Returns an SMat of the requested shape that reuses this matrix's arrays whenever
+   * they are large enough, allocating a new array only for those that are not.
+   */
+  override def recycle(nr:Int, nc:Int, nnz:Int):SMat = {
+    val jc0 = if (jc.size >= nc+1) jc else new Array[Int](nc+1)
+    val ir0 = if (ir.size >= nnz) ir else new Array[Int](nnz)
+    val data0 = if (data.size >= nnz) data else new Array[Float](nnz)
+    new SMat(nr, nc, nnz, ir0, jc0, data0)
+  }
+}
+
+/**
+ * Couples a matrix `omat` with an SMat `mat`; every operator forwards to the
+ * corresponding routine and hands `omat` over as the output argument.
+ */
+class SPair (val omat:Mat, val mat:SMat) extends Pair{
+  def * (b : FMat):FMat = mat.SMult(b, omat)
+  def Tx (b : FMat):FMat = mat.Tmult(b, omat)
+  override def * (b : Mat):FMat = mat.SMult(b, omat)
+  override def Tx (b : Mat):Mat = b match {
+    case bb:FMat => mat.Tmult(bb, omat)
+    // Explicit error for operands Tmult cannot handle, instead of a bare MatchError.
+    case _ => throw new RuntimeException("unsupported arg")
+  }
+
+  def + (b : SMat) = mat.ssMatOp(b, (x:Float, y:Float) => x + y, omat)
+  def - (b : SMat) = mat.ssMatOp(b, (x:Float, y:Float) => x - y, omat)
+  def *@ (b : SMat) = mat.ssMatOp(b, (x:Float, y:Float) => x * y, omat)
+  def /@ (b : SMat) = mat.ssMatOp(b, (x:Float, y:Float) => x / y, omat)
+
+  // Generic-typed operators are dispatched through Operator.applyMat with the matching op code.
+  import Operator._
+  override def +  (b : Mat):Mat = applyMat(mat, b, omat, Mop_Plus)
+  override def -  (b : Mat):Mat = applyMat(mat, b, omat, Mop_Minus)
+  override def *@  (b : Mat):Mat = applyMat(mat, b, omat, Mop_ETimes)
+  override def /@  (b : Mat):Mat = applyMat(mat, b, omat, Mop_EDiv)
+}
+
+/** Factory and output-reuse helpers for SMat. */
+object SMat {
+
+  /** Allocates an SMat with nnz0-long ir and data arrays and an (nc+1)-long jc array. */
+  def apply(nr:Int, nc:Int, nnz0:Int):SMat = new SMat(nr, nc, nnz0, new Array[Int](nnz0), new Array[Int](nc+1), new Array[Float](nnz0))
+
+  /** Wraps an existing SparseMat[Float], sharing its ir, jc and data arrays. */
+  def apply(a:SparseMat[Float]):SMat = new SMat(a.nrows, a.ncols, a.nnz, a.ir, a.jc, a.data)
+
+  /** Converts a double-precision SDMat through its toSMat method. */
+  def apply(a:SDMat) = a.toSMat
+
+  /**
+   * Converts a generically typed matrix to SMat.
+   *
+   * @throws RuntimeException if a is not an SMat, GSMat or SDMat
+   */
+  def apply(a:Mat) = a match {
+    case aa:SMat => aa
+    case aa:GSMat => aa.toSMat
+    case aa:SDMat => aa.toSMat
+    // Explicit error instead of a bare MatchError for other matrix types.
+    case _ => throw new RuntimeException("unsupported arg")
+  }
+
+  /** Like apply(nr, nc, nnz0) but passes null for the ir array. */
+  def SnoRows(nr:Int, nc:Int, nnz0:Int):SMat = new SMat(nr, nc, nnz0, null, new Array[Int](nc+1), new Array[Float](nnz0))
+
+  /**
+   * Chooses an output SMat shaped like `mat`.
+   *
+   * A null or 0x0 `oldmat` yields a new allocation; an SMat whose nrows, ncols and nnz
+   * already match is returned unchanged; any other SMat is recycled to the new shape.
+   *
+   * @throws RuntimeException if oldmat is non-null, non-empty and not an SMat
+   */
+  def newOrCheckSMat(mat:SMat, oldmat:Mat):SMat = {
+    if (oldmat.asInstanceOf[AnyRef] == null || (oldmat.nrows == 0 && oldmat.ncols == 0)) {
+      SMat(mat.nrows, mat.ncols, mat.nnz)
+    } else {
+      oldmat match {
+        case omat:SMat =>
+          if (oldmat.nrows == mat.nrows && oldmat.ncols == mat.ncols && oldmat.nnz == mat.nnz) {
+            omat
+          } else {
+            omat.recycle(mat.nrows, mat.ncols, mat.nnz)
+          }
+        // Explicit error instead of a bare MatchError for a non-SMat output.
+        case _ => throw new RuntimeException("unsupported arg")
+      }
+    }
+  }
+}
+
+
+
+
+
+
diff --git a/src/main/scala/BIDMat/SciFunctions.scala b/src/main/scala/BIDMat/SciFunctions.scala
new file mode 100755
index 00000000..bbcf73fb
--- /dev/null
+++ b/src/main/scala/BIDMat/SciFunctions.scala
@@ -0,0 +1,1556 @@
+package BIDMat
+
+import edu.berkeley.bid.VML._
+import edu.berkeley.bid.VSL
+import edu.berkeley.bid.VSL._
+import edu.berkeley.bid.CBLAS._
+import jcuda._;
+import jcuda.jcublas.JCublas;
+import jcuda.runtime.JCuda;
+import jcuda.jcurand.JCurand._;
+import jcuda.jcurand.curandGenerator;
+import jcuda.jcurand.curandRngType._;
+import edu.berkeley.bid.CUMAT;
+import java.util.Random._;
+import MatFunctions._
+
+object SciFunctions {
+  // Seed shared by the Java, VSL and cuRAND generators created below.
+  final val SEED:Int = 1452462553 
+  // Java initialization
+  final val myrand = new java.util.Random(SEED)
+  // VSL random number generator initialization
+  final val BRNG:Int = BRNG_MCG31
+  // Method argument passed as the first parameter of every v?Rng* call in this object.
+  final val METHOD:Int = 0
+  final val stream = new VSL();
+  // Return value of vslNewStream; kept but not checked here.
+  // NOTE(review): the stream is created even when Mat.noMKL is set -- confirm this is safe without MKL.
+  final val errcode = vslNewStream(stream, BRNG, SEED)
+  // VML mode control, controlled with setVMLmode()
+  final val VMLdefault = VMLMODE.VML_ERRMODE_DEFAULT | VMLMODE.VML_HA   // Default
+  final val VMLfast =    VMLMODE.VML_ERRMODE_DEFAULT | VMLMODE.VML_LA   // Faster, Low accuracy, default error handling
+  final val VMLturbo =   VMLMODE.VML_ERRMODE_DEFAULT | VMLMODE.VML_EP   // Fastest, Lower accuracy, default error handling
+  // Curand initialization
+  // cudarng stays null unless Mat.hasCUDA > 0; the g* generators below use it without a null check.
+  var cudarng:curandGenerator = null
+  if (Mat.hasCUDA > 0) {
+  	jcuda.runtime.JCuda.initialize
+    cudarng = new curandGenerator
+    curandCreateGenerator(cudarng, CURAND_RNG_PSEUDO_DEFAULT) 
+    curandSetPseudoRandomGeneratorSeed(cudarng, SEED)
+  }
+  
+  /** Forwards to JCuda.cudaDeviceReset and returns its result. */
+  def resetCUDA = JCuda.cudaDeviceReset()
+  
+  /** Selects CUDA device i through JCuda.cudaSetDevice and returns that call's result. */
+  def device(i:Int) = JCuda.cudaSetDevice(i)
+
+  /** Returns the device index that JCuda.cudaGetDevice writes into a one-element buffer. */
+  def device:Int = {
+    // One-element buffer (initial content 1) that cudaGetDevice fills in.
+    val slot = Array(1)
+    JCuda.cudaGetDevice(slot)
+    slot.head
+  }
+  
+  /**
+   * Enables peer access in both directions between the current device and device i:
+   * first toward i from the current device, then, after switching to i, toward the
+   * original device. The original device is selected again before returning.
+   *
+   * @return the two cudaDeviceEnablePeerAccess results, in call order
+   */
+  def connect(i:Int) = {
+    val toPeer = JCuda.cudaDeviceEnablePeerAccess(i,0)
+    val home = device
+    device(i)
+    val fromPeer = JCuda.cudaDeviceEnablePeerAccess(home,0)
+    device(home)
+    (toPeer, fromPeer)
+  }
+  
+  /**
+   * Disables peer access in both directions between the current device and device i,
+   * using the same device-switching sequence as connect.
+   *
+   * @return the two cudaDeviceDisablePeerAccess results, in call order
+   */
+  def disconnect(i:Int) = {
+    val toPeer = JCuda.cudaDeviceDisablePeerAccess(i)
+    val home = device
+    device(i)
+    val fromPeer = JCuda.cudaDeviceDisablePeerAccess(home)
+    device(home)
+    (toPeer, fromPeer)
+  }
+  
+  /**
+   * Queries JCuda.cudaDeviceCanAccessPeer twice, with (i, current device) and then
+   * (current device, i), and returns the two flag values written into the buffer.
+   */
+  def canconnect(i:Int) = {
+    val flag = Array(1)
+    val home = device
+    JCuda.cudaDeviceCanAccessPeer(flag, i, home)
+    val first = flag(0)
+    JCuda.cudaDeviceCanAccessPeer(flag, home, i)
+    val second = flag(0)
+    (first, second)
+  }
+    
+  /** Square root of the dot product of a's data with itself (via sdot), as a Float. */
+  def norm(a:FMat) = math.sqrt(sdot(a.length, a.data, 1, a.data, 1)).toFloat
+
+  /** Square root of the dot product of a's data with itself (via ddot). */
+  def norm(a:DMat) = math.sqrt(ddot(a.length, a.data, 1, a.data, 1))
+
+  /** Square root of the dot product of a's data with itself (via JCublas.cublasSdot). */
+  def norm(a:GMat) = math.sqrt(JCublas.cublasSdot(a.length, a.data, 1, a.data, 1))
+
+  /**
+   * Dispatches to the typed norm for FMat, DMat and GMat operands.
+   *
+   * @throws RuntimeException for any other matrix type
+   */
+  def norm (a:Mat):Double = {
+    a match {
+      case aa:FMat => norm(aa)
+      case aa:DMat => norm(aa)
+      case aa:GMat => norm(aa)
+      // Explicit error instead of a bare MatchError.
+      case _ => throw new RuntimeException("unsupported arg type for norm")
+    }
+  }
+  
+  
+  /**
+   * Fills `out` with uniform random doubles between minv and maxv and returns it.
+   *
+   * With Mat.noMKL set the values come from java.util.Random; the previous code
+   * ignored minv/maxv on that path and always produced values in [0, 1). They are
+   * now rescaled so both paths honour the requested range.
+   *
+   * @param minv lower bound of the range
+   * @param maxv upper bound of the range
+   * @param out  matrix to fill
+   */
+  def drand(minv:Double, maxv:Double, out:DMat):DMat = {
+    if (Mat.noMKL) {
+      val span = maxv - minv
+      var i = 0; val len = out.length; val odata = out.data;
+      // nextDouble is uniform on [0, 1); scale and shift it into the requested range.
+      while (i < len) {odata(i) = minv + span * myrand.nextDouble; i += 1}
+    } else {
+      vdRngUniform( METHOD, stream, out.length, out.data, minv, maxv )
+    }
+    Mat.nflops += 10L*out.nrows*out.ncols
+    out
+  }
+
+  /** Allocates an m x n DMat and fills it with uniform values between minv and maxv. */
+  def drand(m:Int, n:Int, minv:Double, maxv:Double):DMat = drand(minv, maxv, DMat(m, n))
+
+  /** Allocates an m x n DMat and fills it with uniform values between 0 and 1. */
+  def drand(m:Int, n:Int):DMat = drand(m, n, 0, 1)
+
+  /** Fills `out` with uniform values between 0 and 1. */
+  def drand(out:DMat):DMat = drand(0.0, 1.0, out)
+
+  /**
+   * Fills `out` with uniform random floats between minv and maxv and returns it.
+   *
+   * With Mat.noMKL set the values come from java.util.Random; the previous code
+   * ignored minv/maxv on that path and always produced values in [0, 1). They are
+   * now rescaled so both paths honour the requested range.
+   *
+   * @param minv lower bound of the range
+   * @param maxv upper bound of the range
+   * @param out  matrix to fill
+   */
+  def rand(minv:Float, maxv:Float, out:FMat):FMat = {
+    if (Mat.noMKL) {
+      val span = maxv - minv
+      var i = 0; val len = out.length; val odata = out.data;
+      // nextFloat is uniform on [0, 1); scale and shift it into the requested range.
+      while (i < len) {odata(i) = minv + span * myrand.nextFloat; i += 1}
+    } else {
+      vsRngUniform( METHOD, stream, out.length, out.data, minv, maxv )
+    }
+    Mat.nflops += 10L*out.nrows*out.ncols
+    out
+  }
+
+  /** Allocates an m x n FMat and fills it with uniform values between minv and maxv. */
+  def rand(m:Int, n:Int, minv:Float, maxv:Float):FMat = rand(minv, maxv, FMat(m, n))
+
+  /** Allocates an m x n FMat and fills it with uniform values between 0 and 1. */
+  def rand(m:Int, n:Int):FMat = rand(m, n, 0, 1)
+
+  /** Fills `out` with uniform values between 0 and 1. */
+  def rand(out:FMat):FMat = rand(0.0f, 1.0f, out)
+
+  /**
+   * Fills the GMat `out` through curandGenerateUniform on the shared generator, then
+   * waits on cudaDeviceSynchronize. The nr and nc arguments are not read.
+   */
+  def grand(out:GMat, nr:Int, nc:Int):GMat = {
+    val count = out.length
+    Mat.nflops += 10L*count
+    curandGenerateUniform(cudarng, out.data, count)
+    JCuda.cudaDeviceSynchronize()
+    out
+  }
+
+  /** Fills `out`, forwarding its own dimensions. */
+  def grand(out:GMat):GMat = grand(out, out.nrows, out.ncols)
+
+  /** Allocates an nr x nc GMat and fills it. */
+  def grand(nr:Int, nc:Int):GMat = grand(GMat(nr, nc))
+ 
+  /**
+   * Fills `out` with Gaussian samples of mean mu and scale sig: from java.util.Random
+   * when Mat.noMKL is set, otherwise from vsRngGaussian.
+   */
+  def normrnd(mu:Float, sig:Float, out:FMat):FMat = {
+    if (!Mat.noMKL) {
+      vsRngGaussian(METHOD, stream, out.length, out.data, mu, sig )
+    } else {
+      val target = out.data
+      val count = out.length
+      var pos = 0
+      while (pos < count) {
+        target(pos) = mu + sig*myrand.nextGaussian.asInstanceOf[Float]
+        pos += 1
+      }
+    }
+    Mat.nflops += 10L*out.length
+    out
+  }
+
+  /** Allocates an m x n FMat and fills it with Gaussian samples. */
+  def normrnd(mu:Float, sig:Float, m:Int, n:Int):FMat = normrnd(mu, sig, FMat(m, n))
+  
+  /**
+   * Fills the CMat `out` with Gaussian samples; 2*out.length floats are written, from
+   * java.util.Random when Mat.noMKL is set and from vsRngGaussian otherwise.
+   */
+  def cnormrnd(mu:Float, sig:Float, out:CMat):CMat = {
+    if (!Mat.noMKL) {
+      vsRngGaussian(METHOD, stream, 2*out.length, out.data, mu, sig )
+    } else {
+      val target = out.data
+      val limit = 2*out.length
+      var pos = 0
+      while (pos < limit) {
+        target(pos) = mu + sig*myrand.nextGaussian.asInstanceOf[Float]
+        pos += 1
+      }
+    }
+    Mat.nflops += 10L*out.length
+    out
+  }
+
+  /** Allocates an m x n CMat and fills it with Gaussian samples. */
+  def cnormrnd(mu:Float, sig:Float, m:Int, n:Int):CMat = cnormrnd(mu, sig, CMat(m, n))
+  
+  /**
+   * Fills the GMat `out` through curandGenerateNormal on the shared generator, then
+   * waits on cudaDeviceSynchronize. The nr and nc arguments are not read.
+   */
+  def gnormrnd(mu:Float, sig:Float, out:GMat, nr:Int, nc:Int):GMat = {
+    val count = out.length
+    Mat.nflops += 10L*count
+    curandGenerateNormal(cudarng, out.data, count, mu, sig)
+    JCuda.cudaDeviceSynchronize()
+    out
+  }
+
+  /** Fills `out`, forwarding its own dimensions. */
+  def gnormrnd(mu:Float, sig:Float, out:GMat):GMat = gnormrnd(mu, sig, out, out.nrows, out.ncols)
+
+  /** Allocates an nr x nc GMat and fills it. */
+  def gnormrnd(mu:Float, sig:Float, nr:Int, nc:Int):GMat = gnormrnd(mu, sig, GMat(nr, nc))
+
+  /** Fills `out` via vsRngGamma, passing shape, 0 and scale as the distribution arguments. */
+  def gamrnd(shape:Float, scale:Float, out:FMat):FMat = {
+    val count = out.length
+    vsRngGamma( METHOD, stream, count, out.data, shape, 0, scale )
+    Mat.nflops += 20L*count
+    out
+  }
+
+  /** Allocates an m x n FMat and fills it via gamrnd. */
+  def gamrnd(shape:Float, scale:Float, m:Int, n:Int):FMat = gamrnd(shape, scale, FMat(m, n))
+  
+  /** Fills `out` via vsRngLaplace with parameters a and b. */
+  def laprnd(a:Float, b:Float, out:FMat):FMat = {
+    val count = out.length
+    vsRngLaplace( METHOD, stream, count, out.data, a, b )
+    Mat.nflops += 20L*count
+    out
+  }
+
+  /** Allocates an m x n FMat and fills it via laprnd. */
+  def laprnd(a:Float, b:Float, m:Int, n:Int):FMat = laprnd(a, b, FMat(m, n))
+
+  /** Fills `out` via vsRngCauchy with parameters a and b. */
+  def cauchyrnd(a:Float, b:Float, out:FMat):FMat = {
+    val count = out.length
+    vsRngCauchy( METHOD, stream, count, out.data, a, b )
+    Mat.nflops += 20L*count
+    out
+  }
+
+  /** Allocates an m x n FMat and fills it via cauchyrnd. */
+  def cauchyrnd(a:Float, b:Float, m:Int, n:Int):FMat = cauchyrnd(a, b, FMat(m, n))
+
+  /** Fills `out` via vsRngExponential with parameters a and b. */
+  def exprnd(a:Float, b:Float, out:FMat):FMat = {
+    val count = out.length
+    vsRngExponential( METHOD, stream, count, out.data, a, b )
+    Mat.nflops += 20L*count
+    out
+  }
+
+  /** Allocates an m x n FMat and fills it with b fixed to 1. */
+  def exprnd(a:Float, m:Int, n:Int):FMat = exprnd(a, 1, FMat(m, n))
+
+  /** Allocates an m x n FMat and fills it with parameters a and b. */
+  def exprnd(a:Float, b:Float, m:Int, n:Int):FMat = exprnd(a, b, FMat(m, n))
+
+  /** Fills `out` with b fixed to 1. */
+  def exprnd(a:Float, out:FMat):FMat = exprnd(a, 1, out)
+
+  /** Fills `out` via vsRngBeta, passing p, q, 0 and 1 as the distribution arguments. */
+  def betarnd(p:Float, q:Float, out:FMat):FMat = {
+    val count = out.length
+    vsRngBeta( METHOD, stream, count, out.data, p, q, 0, 1 )
+    Mat.nflops += 20L*count
+    out
+  }
+
+  /** Allocates an m x n FMat and fills it via betarnd. */
+  def betarnd(p:Float, q:Float, m:Int, n:Int):FMat = betarnd(p, q, FMat(m, n))
+
+  /**
+   * Fills `out` via viRngPoissonV, using the data of DMat(lambda) as the parameter
+   * array. checkSizes(lambda, out) is called first.
+   */
+  def poissrnd(lambda:FMat, out:IMat):IMat = {
+    checkSizes(lambda, out)
+    val rates = DMat(lambda).data
+    viRngPoissonV( METHOD, stream, out.length, out.data, rates )
+    Mat.nflops += 20L*out.length
+    out
+  }
+
+  /** Allocates an IMat shaped like lambda and fills it. */
+  def poissrnd(lambda:FMat):IMat = poissrnd(lambda, IMat(lambda.nrows, lambda.ncols))
+  
+  /**
+   * Fills `out` with Gaussian samples of mean mu and scale sig: from java.util.Random
+   * when Mat.noMKL is set, otherwise from vdRngGaussian.
+   */
+  def dnormrnd(mu:Double, sig:Double, out:DMat):DMat = {
+    if (!Mat.noMKL) {
+      vdRngGaussian( METHOD, stream, out.length, out.data, mu, sig )
+    } else {
+      val target = out.data
+      val count = out.length
+      var pos = 0
+      while (pos < count) {
+        target(pos) = mu + sig*myrand.nextGaussian
+        pos += 1
+      }
+    }
+    Mat.nflops += 10L*out.length
+    out
+  }
+
+  /** Allocates an m x n DMat and fills it with Gaussian samples. */
+  def dnormrnd(mu:Double, sig:Double, m:Int, n:Int):DMat = dnormrnd(mu, sig, DMat(m, n))
+  
+  /** Fills `out` via vdRngGamma, passing shape, 0 and scale as the distribution arguments. */
+  def dgamrnd(shape:Double, scale:Double, out:DMat):DMat = {
+    val count = out.length
+    vdRngGamma( METHOD, stream, count, out.data, shape, 0, scale )
+    Mat.nflops += 20L*count
+    out
+  }
+
+  /** Allocates an m x n DMat and fills it via dgamrnd. */
+  def dgamrnd(shape:Double, scale:Double, m:Int, n:Int):DMat = dgamrnd(shape, scale, DMat(m, n))
+  
+  /** Fills `out` via vdRngLaplace with parameters a and b. */
+  def dlaprnd(a:Double, b:Double, out:DMat):DMat = {
+    val count = out.length
+    vdRngLaplace( METHOD, stream, count, out.data, a, b )
+    Mat.nflops += 20L*count
+    out
+  }
+
+  /** Allocates an m x n DMat and fills it via dlaprnd. */
+  def dlaprnd(a:Double, b:Double, m:Int, n:Int):DMat = dlaprnd(a, b, DMat(m, n))
+
+  /** Fills `out` via vdRngCauchy with parameters a and b. */
+  def dcauchyrnd(a:Double, b:Double, out:DMat):DMat = {
+    val count = out.length
+    vdRngCauchy( METHOD, stream, count, out.data, a, b )
+    Mat.nflops += 20L*count
+    out
+  }
+
+  /** Allocates an m x n DMat and fills it via dcauchyrnd. */
+  def dcauchyrnd(a:Double, b:Double, m:Int, n:Int):DMat = dcauchyrnd(a, b, DMat(m, n))
+
+  /** Fills `out` via vdRngExponential with parameters a and b. */
+  def dexprnd(a:Double, b:Double, out:DMat):DMat = {
+    val count = out.length
+    vdRngExponential( METHOD, stream, count, out.data, a, b )
+    Mat.nflops += 20L*count
+    out
+  }
+
+  /** Allocates an m x n DMat and fills it with b fixed to 1. */
+  def dexprnd(a:Double, m:Int, n:Int):DMat = dexprnd(a, 1, DMat(m, n))
+
+  /** Allocates an m x n DMat and fills it with parameters a and b. */
+  def dexprnd(a:Double, b:Double, m:Int, n:Int):DMat = dexprnd(a, b, DMat(m, n))
+
+  /** Fills `out` with b fixed to 1. */
+  def dexprnd(a:Double, out:DMat):DMat = dexprnd(a, 1, out)
+
+  /** Fills `out` via vdRngBeta, passing p, q, 0 and 1 as the distribution arguments. */
+  def dbetarnd(p:Double, q:Double, out:DMat):DMat = {
+    val count = out.length
+    vdRngBeta( METHOD, stream, count, out.data, p, q, 0, 1 )
+    Mat.nflops += 20L*count
+    out
+  }
+
+  /** Allocates an m x n DMat and fills it via dbetarnd. */
+  def dbetarnd(p:Double, q:Double, m:Int, n:Int):DMat = dbetarnd(p, q, DMat(m, n))
+
+  /** Fills the IMat `out` via viRngBinomial with parameters k and p. */
+  def binornd(k:Int, p:Double, out:IMat):IMat = {
+    val count = out.length
+    viRngBinomial( METHOD, stream, count, out.data, k, p )
+    Mat.nflops += 20L*count
+    out
+  }
+
+  /** Allocates an m x n IMat and fills it via binornd. */
+  def binornd(k:Int, p:Double, m:Int, n:Int):IMat = binornd(k, p, IMat(m, n))
+  
+  /** Fills the IMat `out` via viRngBernoulli with parameter p. */
+  def bernrnd(p:Double, out:IMat):IMat = {
+    val count = out.length
+    viRngBernoulli( METHOD, stream, count, out.data, p )
+    Mat.nflops += 20L*count
+    out
+  }
+
+  /** Allocates an m x n IMat and fills it via bernrnd. */
+  def bernrnd(p:Double, m:Int, n:Int):IMat = bernrnd(p, IMat(m, n))
+  
+  /** Fills the IMat `out` via viRngGeometric with parameter p. */
+  def geornd(p:Double, out:IMat):IMat = {
+    val count = out.length
+    viRngGeometric( METHOD, stream, count, out.data, p )
+    Mat.nflops += 20L*count
+    out
+  }
+
+  /** Allocates an m x n IMat and fills it via geornd. */
+  def geornd(p:Double, m:Int, n:Int):IMat = geornd(p, IMat(m, n))
+  
+  /** Fills the IMat `out` via viRngNegbinomial with parameters a and p. */
+  def nbinrnd(a:Double, p:Double, out:IMat):IMat = {
+    val count = out.length
+    viRngNegbinomial( METHOD, stream, count, out.data, a, p )
+    Mat.nflops += 20L*count
+    out
+  }
+
+  /** Allocates an m x n IMat and fills it via nbinrnd. */
+  def nbinrnd(a:Double, p:Double, m:Int, n:Int):IMat = nbinrnd(a, p, IMat(m, n))
+  
+  /** Fills the IMat `out` via viRngPoisson with a single parameter lambda. */
+  def poissrnd(lambda:Double, out:IMat):IMat = {
+    val count = out.length
+    viRngPoisson( METHOD, stream, count, out.data, lambda )
+    Mat.nflops += 20L*count
+    out
+  }
+
+  /** Allocates an m x n IMat and fills it with a single parameter lambda. */
+  def poissrnd(lambda:Double, m:Int, n:Int):IMat = poissrnd(lambda, IMat(m, n))
+
+  /**
+   * Fills `out` via viRngPoissonV, using lambda's data as the parameter array.
+   * checkSizes(lambda, out) is called first.
+   */
+  def poissrnd(lambda:DMat, out:IMat):IMat = {
+    checkSizes(lambda, out)
+    val rates = lambda.data
+    viRngPoissonV( METHOD, stream, out.length, out.data, rates )
+    Mat.nflops += 20L*out.length
+    out
+  }
+
+  /** Allocates an IMat shaped like lambda and fills it. */
+  def poissrnd(lambda:DMat):IMat = poissrnd(lambda, IMat(lambda.nrows, lambda.ncols))
+  
+  // DMat min/max and reductions, delegating to ddMatOp/ddReduceOp/ddReduceAll with a null
+  // output argument; the variants without a dimension pass 0. maxi2/mini2 return ggOpt2's pair.
+  def min(a:DMat, b:DMat) = a.ddMatOp(b, (u:Double, v:Double) => math.min(u,v), null)
+  def max(a:DMat, b:DMat) = a.ddMatOp(b, (u:Double, v:Double) => math.max(u,v), null)
+  def sum(a:DMat, n:Int) = a.ddReduceOp(n, (u:Double) => u, (u:Double, v:Double) => u+v, null)
+  def cumsum(a:DMat, n:Int) = a.ddReduceAll(n, (u:Double) => u, (u:Double, v:Double) => u+v, null)
+  def maxi(a:DMat, n:Int) = a.ddReduceOp(n, (u:Double) => u, (u:Double, v:Double) => math.max(u,v), null)
+  def mini(a:DMat, n:Int):DMat = a.ddReduceOp(n, (u:Double) => u, (u:Double, v:Double) => math.min(u,v), null)
+  def sum(a:DMat) = a.ddReduceOp(0, (u:Double) => u, (u:Double, v:Double) => u+v, null)
+  def cumsum(a:DMat) = a.ddReduceAll(0, (u:Double) => u, (u:Double, v:Double) => u+v, null)
+  def maxi(a:DMat) = a.ddReduceOp(0, (u:Double) => u, (u:Double, v:Double) => math.max(u,v), null)
+  def mini(a:DMat):DMat = a.ddReduceOp(0, (u:Double) => u, (u:Double, v:Double) => math.min(u,v), null)
+  def maxi2(a:DMat,d:Int):(DMat,IMat) = {val (best,where)=a.ggOpt2(d,(u:Double,v:Double)=>(u>v)); (DMat(best), where)}
+  def mini2(a:DMat,d:Int):(DMat,IMat) = {val (best,where)=a.ggOpt2(d,(u:Double,v:Double)=>(u<v)); (DMat(best), where)}
+  def maxi2(a:DMat):(DMat,IMat) = {val (best,where)=a.ggOpt2(0,(u:Double,v:Double)=>(u>v)); (DMat(best), where)}
+  def mini2(a:DMat):(DMat,IMat) = {val (best,where)=a.ggOpt2(0,(u:Double,v:Double)=>(u<v)); (DMat(best), where)}
+  
+  // DMat min/max and reductions that pass a caller-supplied `out` as the output argument.
+  def min(a:DMat, b:DMat, out:Mat) = a.ddMatOp(b, (u:Double, v:Double) => math.min(u,v), out)
+  def max(a:DMat, b:DMat, out:Mat) = a.ddMatOp(b, (u:Double, v:Double) => math.max(u,v), out)
+  def sum(a:DMat, n:Int, out:Mat) = a.ddReduceOp(n, (u:Double) => u, (u:Double, v:Double) => u+v, out)
+  def cumsum(a:DMat, n:Int, out:Mat) = a.ddReduceAll(n, (u:Double) => u, (u:Double, v:Double) => u+v, out)
+  def maxi(a:DMat, n:Int, out:Mat) = a.ddReduceOp(n, (u:Double) => u, (u:Double, v:Double) => math.max(u,v), out)
+  def mini(a:DMat, n:Int, out:Mat):DMat = a.ddReduceOp(n, (u:Double) => u, (u:Double, v:Double) => math.min(u,v), out)
+  def sum(a:DMat, out:Mat) = a.ddReduceOp(0, (u:Double) => u, (u:Double, v:Double) => u+v, out)
+  def cumsum(a:DMat, out:Mat) = a.ddReduceAll(0, (u:Double) => u, (u:Double, v:Double) => u+v, out)
+  def maxi(a:DMat, out:Mat) = a.ddReduceOp(0, (u:Double) => u, (u:Double, v:Double) => math.max(u,v), out)
+  def mini(a:DMat, out:Mat):DMat = a.ddReduceOp(0, (u:Double) => u, (u:Double, v:Double) => math.min(u,v), out)
+  
+  // FMat min/max and reductions, delegating to ffMatOp/ffReduceOp/ffReduceAll with a null
+  // output argument; the variants without a dimension pass 0. maxi2/mini2 return ggOpt2's pair.
+  def min(a:FMat, b:FMat) = a.ffMatOp(b, (u:Float, v:Float) => math.min(u,v), null)
+  def max(a:FMat, b:FMat) = a.ffMatOp(b, (u:Float, v:Float) => math.max(u,v), null)
+  def sum(a:FMat, n:Int) = a.ffReduceOp(n, (u:Float) => u, (u:Float, v:Float) => u+v, null)
+  def cumsum(a:FMat, n:Int) = a.ffReduceAll(n, (u:Float) => u, (u:Float, v:Float) => u+v, null)
+  def maxi(a:FMat, n:Int) = a.ffReduceOp(n, (u:Float) => u, (u:Float, v:Float) => math.max(u,v), null)
+  def mini(a:FMat, n:Int):FMat = a.ffReduceOp(n, (u:Float) => u, (u:Float, v:Float) => math.min(u,v), null)
+  def sum(a:FMat) = a.ffReduceOp(0, (u:Float) => u, (u:Float, v:Float) => u+v, null)
+  def cumsum(a:FMat) = a.ffReduceAll(0, (u:Float) => u, (u:Float, v:Float) => u+v, null)
+  def maxi(a:FMat) = a.ffReduceOp(0, (u:Float) => u, (u:Float, v:Float) => math.max(u,v), null)
+  def mini(a:FMat):FMat = a.ffReduceOp(0, (u:Float) => u, (u:Float, v:Float) => math.min(u,v), null)
+  def maxi2(a:FMat,d:Int):(FMat,IMat) = {val (best,where)=a.ggOpt2(d,(u:Float,v:Float)=>(u>v)); (FMat(best), where)}
+  def mini2(a:FMat,d:Int):(FMat,IMat) = {val (best,where)=a.ggOpt2(d,(u:Float,v:Float)=>(u<v)); (FMat(best), where)}
+  def maxi2(a:FMat):(FMat,IMat) = {val (best,where)=a.ggOpt2(0,(u:Float,v:Float)=>(u>v)); (FMat(best), where)}
+  def mini2(a:FMat):(FMat,IMat) = {val (best,where)=a.ggOpt2(0,(u:Float,v:Float)=>(u<v)); (FMat(best), where)}
+  
+  // FMat min/max and reductions that pass a caller-supplied `out` as the output argument.
+  def min(a:FMat, b:FMat, out:Mat) = a.ffMatOp(b, (u:Float, v:Float) => math.min(u,v), out)
+  def max(a:FMat, b:FMat, out:Mat) = a.ffMatOp(b, (u:Float, v:Float) => math.max(u,v), out)
+  def sum(a:FMat, n:Int, out:Mat) = a.ffReduceOp(n, (u:Float) => u, (u:Float, v:Float) => u+v, out)
+  def cumsum(a:FMat, n:Int, out:Mat) = a.ffReduceAll(n, (u:Float) => u, (u:Float, v:Float) => u+v, out)
+  def maxi(a:FMat, n:Int, out:Mat) = a.ffReduceOp(n, (u:Float) => u, (u:Float, v:Float) => math.max(u,v), out)
+  def mini(a:FMat, n:Int, out:Mat):FMat = a.ffReduceOp(n, (u:Float) => u, (u:Float, v:Float) => math.min(u,v), out)
+  def sum(a:FMat, out:Mat) = a.ffReduceOp(0, (u:Float) => u, (u:Float, v:Float) => u+v, out)
+  def cumsum(a:FMat, out:Mat) = a.ffReduceAll(0, (u:Float) => u, (u:Float, v:Float) => u+v, out)
+  def maxi(a:FMat, out:Mat) = a.ffReduceOp(0, (u:Float) => u, (u:Float, v:Float) => math.max(u,v), out)
+  def mini(a:FMat, out:Mat):FMat = a.ffReduceOp(0, (u:Float) => u, (u:Float, v:Float) => math.min(u,v), out)
+  
+  // IMat min/max and reductions, delegating to iiMatOp/iiReduceOp/iiReduceAll with a null
+  // output argument; the variants without a dimension pass 0. maxi2/mini2 return ggOpt2's pair.
+  def min (a:IMat, b:IMat) = a.iiMatOp(b, (u:Int, v:Int) => math.min(u,v), null)
+  def max (a:IMat, b:IMat) = a.iiMatOp(b, (u:Int, v:Int) => math.max(u,v), null)
+  def sum(a:IMat, n:Int) = a.iiReduceOp(n, (u:Int) => u, (u:Int, v:Int) => u+v, null)
+  def cumsum(a:IMat, n:Int) = a.iiReduceAll(n, (u:Int) => u, (u:Int, v:Int) => u+v, null)
+  def maxi(a:IMat, n:Int) = a.iiReduceOp(n, (u:Int) => u, (u:Int, v:Int) => math.max(u,v), null)
+  def mini(a:IMat, n:Int):IMat = a.iiReduceOp(n, (u:Int) => u, (u:Int, v:Int) => math.min(u,v), null)
+  def sum(a:IMat) = a.iiReduceOp(0, (u:Int) => u, (u:Int, v:Int) => u+v, null)
+  def cumsum(a:IMat) = a.iiReduceAll(0, (u:Int) => u, (u:Int, v:Int) => u+v, null)
+  def maxi(a:IMat) = a.iiReduceOp(0, (u:Int) => u, (u:Int, v:Int) => math.max(u,v), null)
+  def mini(a:IMat):IMat = a.iiReduceOp(0, (u:Int) => u, (u:Int, v:Int) => math.min(u,v), null)
+  def maxi2(a:IMat,d:Int):(IMat,IMat) = {val (best,where)=a.ggOpt2(d,(u:Int,v:Int)=>(u>v)); (IMat(best), where)}
+  def mini2(a:IMat,d:Int):(IMat,IMat) = {val (best,where)=a.ggOpt2(d,(u:Int,v:Int)=>(u<v)); (IMat(best), where)}
+  def maxi2(a:IMat):(IMat,IMat) = {val (best,where)=a.ggOpt2(0,(u:Int,v:Int)=>(u>v)); (IMat(best), where)}
+  def mini2(a:IMat):(IMat,IMat) = {val (best,where)=a.ggOpt2(0,(u:Int,v:Int)=>(u<v)); (IMat(best), where)}
+  
+  // IMat min/max and reductions that pass a caller-supplied `out` as the output argument.
+  def min (a:IMat, b:IMat, out:Mat) = a.iiMatOp(b, (u:Int, v:Int) => math.min(u,v), out)
+  def max (a:IMat, b:IMat, out:Mat) = a.iiMatOp(b, (u:Int, v:Int) => math.max(u,v), out)
+  def sum(a:IMat, n:Int, out:Mat) = a.iiReduceOp(n, (u:Int) => u, (u:Int, v:Int) => u+v, out)
+  def cumsum(a:IMat, n:Int, out:Mat) = a.iiReduceAll(n, (u:Int) => u, (u:Int, v:Int) => u+v, out)
+  def maxi(a:IMat, n:Int, out:Mat) = a.iiReduceOp(n, (u:Int) => u, (u:Int, v:Int) => math.max(u,v), out)
+  def mini(a:IMat, n:Int, out:Mat):IMat = a.iiReduceOp(n, (u:Int) => u, (u:Int, v:Int) => math.min(u,v), out)
+  def sum(a:IMat, out:Mat) = a.iiReduceOp(0, (u:Int) => u, (u:Int, v:Int) => u+v, out)
+  def cumsum(a:IMat, out:Mat) = a.iiReduceAll(0, (u:Int) => u, (u:Int, v:Int) => u+v, out)
+  def maxi(a:IMat, out:Mat) = a.iiReduceOp(0, (u:Int) => u, (u:Int, v:Int) => math.max(u,v), out)
+  def mini(a:IMat, out:Mat):IMat = a.iiReduceOp(0, (u:Int) => u, (u:Int, v:Int) => math.min(u,v), out)
+  
+  // SDMat min/max and reductions, delegating to ssMatOp/ssReduceOp with a null output argument.
+  def min(a:SDMat, b:SDMat) = a.ssMatOp(b, (u:Double, v:Double) => math.min(u,v), null)
+  def max(a:SDMat, b:SDMat) = a.ssMatOp(b, (u:Double, v:Double) => math.max(u,v), null)
+  def sum(a:SDMat, n:Int) = a.ssReduceOp(n, (u:Double) => u, (u:Double, v:Double) => u+v, null)
+  def maxi(a:SDMat, n:Int) = a.ssReduceOp(n, (u:Double) => u, (u:Double, v:Double) => math.max(u,v), null)
+  def mini(a:SDMat, n:Int) = a.ssReduceOp(n, (u:Double) => u, (u:Double, v:Double) => math.min(u,v), null)
+  def sum(a:SDMat) = a.ssReduceOp(0, (u:Double) => u, (u:Double, v:Double) => u+v, null)
+  def maxi(a:SDMat) = a.ssReduceOp(0, (u:Double) => u, (u:Double, v:Double) => math.max(u,v), null)
+  def mini(a:SDMat) = a.ssReduceOp(0, (u:Double) => u, (u:Double, v:Double) => math.min(u,v), null)
+  
+  // SDMat reductions that pass a caller-supplied `omat` as the output argument.
+  def sum(a:SDMat, n:Int, omat:Mat) = a.ssReduceOp(n, (u:Double) => u, (u:Double, v:Double) => u+v, omat)
+  def maxi(a:SDMat, n:Int, omat:Mat) = a.ssReduceOp(n, (u:Double) => u, (u:Double, v:Double) => math.max(u,v), omat)
+  def mini(a:SDMat, n:Int, omat:Mat) = a.ssReduceOp(n, (u:Double) => u, (u:Double, v:Double) => math.min(u,v), omat)
+  def sum(a:SDMat, omat:Mat) = a.ssReduceOp(0, (u:Double) => u, (u:Double, v:Double) => u+v, omat)
+  def maxi(a:SDMat, omat:Mat) = a.ssReduceOp(0, (u:Double) => u, (u:Double, v:Double) => math.max(u,v), omat)
+  def mini(a:SDMat, omat:Mat) = a.ssReduceOp(0, (u:Double) => u, (u:Double, v:Double) => math.min(u,v), omat)
+  
+  // SMat min/max and reductions, delegating to ssMatOp/ssReduceOp/ssMatOpScalar. The scalar
+  // forms accept the scalar on either side and optionally a caller-supplied `omat`.
+  def min(a:SMat, b:SMat) = a.ssMatOp(b, (u:Float, v:Float) => math.min(u,v), null)
+  def max(a:SMat, b:SMat) = a.ssMatOp(b, (u:Float, v:Float) => math.max(u,v), null)
+  def sum(a:SMat, n:Int) = a.ssReduceOp(n, (u:Float) => u, (u:Float, v:Float) => u+v, null)
+  def maxi(a:SMat, n:Int) = a.ssReduceOp(n, (u:Float) => u, (u:Float, v:Float) => math.max(u,v), null)
+  def mini(a:SMat, n:Int) = a.ssReduceOp(n, (u:Float) => u, (u:Float, v:Float) => math.min(u,v), null)
+  def sum(a:SMat) = a.ssReduceOp(0, (u:Float) => u, (u:Float, v:Float) => u+v, null)
+  def maxi(a:SMat) = a.ssReduceOp(0, (u:Float) => u, (u:Float, v:Float) => math.max(u,v), null)
+  def mini(a:SMat) = a.ssReduceOp(0, (u:Float) => u, (u:Float, v:Float) => math.min(u,v), null)
+  def min(a:SMat, b:Float) = a.ssMatOpScalar(b, (u:Float, v:Float) => math.min(u,v), null)
+  def max(a:SMat, b:Float) = a.ssMatOpScalar(b, (u:Float, v:Float) => math.max(u,v), null)
+  def min(b:Float, a:SMat) = a.ssMatOpScalar(b, (u:Float, v:Float) => math.min(u,v), null)
+  def max(b:Float, a:SMat) = a.ssMatOpScalar(b, (u:Float, v:Float) => math.max(u,v), null)
+  def min(a:SMat, b:Float, omat:Mat) = a.ssMatOpScalar(b, (u:Float, v:Float) => math.min(u,v), omat)
+  def max(a:SMat, b:Float, omat:Mat) = a.ssMatOpScalar(b, (u:Float, v:Float) => math.max(u,v), omat)
+  def min(b:Float, a:SMat, omat:Mat) = a.ssMatOpScalar(b, (u:Float, v:Float) => math.min(u,v), omat)
+  def max(b:Float, a:SMat, omat:Mat) = a.ssMatOpScalar(b, (u:Float, v:Float) => math.max(u,v), omat)
+
+  // SMat reductions that pass a caller-supplied `omat` as the output argument.
+  def sum(a:SMat, n:Int, omat:Mat) = a.ssReduceOp(n, (u:Float) => u, (u:Float, v:Float) => u+v, omat)
+  def maxi(a:SMat, n:Int, omat:Mat) = a.ssReduceOp(n, (u:Float) => u, (u:Float, v:Float) => math.max(u,v), omat)
+  def mini(a:SMat, n:Int, omat:Mat) = a.ssReduceOp(n, (u:Float) => u, (u:Float, v:Float) => math.min(u,v), omat)
+  def sum(a:SMat, omat:Mat) = a.ssReduceOp(0, (u:Float) => u, (u:Float, v:Float) => u+v, omat)
+  def maxi(a:SMat, omat:Mat) = a.ssReduceOp(0, (u:Float) => u, (u:Float, v:Float) => math.max(u,v), omat)
+  def mini(a:SMat, omat:Mat) = a.ssReduceOp(0, (u:Float) => u, (u:Float, v:Float) => math.min(u,v), omat)
+  def min(a:SDMat, b:Double) = a.ssMatOpScalar(b, (x:Double, y:Double) => math.min(x,y), null)
+  def max(a:SDMat, b:Double) = a.ssMatOpScalar(b, (x:Double, y:Double) => math.max(x,y), null)
+  def min(b:Double, a:SDMat) = a.ssMatOpScalar(b, (x:Double, y:Double) => math.min(x,y), null)
+  def max(b:Double, a:SDMat) = a.ssMatOpScalar(b, (x:Double, y:Double) => math.max(x,y), null)
+  
+  def sum(a:CMat, n:Int) = a.ccReduceOpv(n, CMat.vecAdd _, null)
+  def sum(a:CMat, n:Int, c:Mat) = a.ccReduceOpv(n, CMat.vecAdd _, c)
+     
+  // Generic element-wise max: forwards to the typed overload when both operands share one of
+  // the listed concrete types; any other combination fails with a MatchError.
+  def max(a:Mat, b:Mat):Mat = (a, b) match {
+    case (fa:FMat, fb:FMat) => max(fa, fb):FMat
+    case (ia:IMat, ib:IMat) => max(ia, ib):IMat
+    case (da:DMat, db:DMat) => max(da, db):DMat
+    case (ga:GMat, gb:GMat) => max(ga, gb):GMat
+  }
+  
+  // Generic element-wise min; same dispatch rules as max above.
+  def min(a:Mat, b:Mat):Mat = (a, b) match {
+    case (fa:FMat, fb:FMat) => min(fa, fb):FMat
+    case (ia:IMat, ib:IMat) => min(ia, ib):IMat
+    case (da:DMat, db:DMat) => min(da, db):DMat
+    case (ga:GMat, gb:GMat) => min(ga, gb):GMat
+  }
+  
+  // Generic element-wise max with an output matrix c: forwards to the typed three-argument
+  // overload chosen by the runtime types of a and b; unmatched combinations raise MatchError.
+  def max(a:Mat, b:Mat, c:Mat):Mat = (a, b) match {
+    case (fa:FMat, fb:FMat) => max(fa, fb, c):FMat
+    case (ia:IMat, ib:IMat) => max(ia, ib, c):IMat
+    case (da:DMat, db:DMat) => max(da, db, c):DMat
+    case (ga:GMat, gb:GMat) => max(ga, gb, c):GMat
+  }
+  
+  // Generic element-wise min with an output matrix c; same dispatch rules as max.
+  def min(a:Mat, b:Mat, c:Mat):Mat = (a, b) match {
+    case (fa:FMat, fb:FMat) => min(fa, fb, c):FMat
+    case (ia:IMat, ib:IMat) => min(ia, ib, c):IMat
+    case (da:DMat, db:DMat) => min(da, db, c):DMat
+    case (ga:GMat, gb:GMat) => min(ga, gb, c):GMat
+  }
+  
+  // max of a Float scalar and a generic matrix, with output matrix c. The scalar is converted
+  // to suit the matrix type: Int for IMat, a DMat/GMat wrapper for DMat/GMat.
+  def max(a:Float, b:Mat, c:Mat):Mat = b match {
+    case fm:FMat => max(a, fm, c):FMat
+    case im:IMat => max(a.asInstanceOf[Int], im, c):IMat
+    case dm:DMat => max(DMat(a), dm, c):DMat
+    case gm:GMat => max(GMat(a), gm, c):GMat
+    case sm:SMat => max(a, sm, c):SMat
+  }
+  
+  // min of a Float scalar and a generic matrix; same conversions as max.
+  def min(a:Float, b:Mat, c:Mat):Mat = b match {
+    case fm:FMat => min(a, fm, c):FMat
+    case im:IMat => min(a.asInstanceOf[Int], im, c):IMat
+    case dm:DMat => min(DMat(a), dm, c):DMat
+    case gm:GMat => min(GMat(a), gm, c):GMat
+    case sm:SMat => min(a, sm, c):SMat
+  }
+  
+  // Matrix-first argument order; forwards to the scalar-first typed overloads.
+  def max(b:Mat, a:Float, c:Mat):Mat = b match {
+    case fm:FMat => max(a, fm, c):FMat
+    case im:IMat => max(a.asInstanceOf[Int], im, c):IMat
+    case dm:DMat => max(DMat(a), dm, c):DMat
+    case gm:GMat => max(GMat(a), gm, c):GMat
+    case sm:SMat => max(a, sm, c):SMat
+  }
+  
+  // Matrix-first argument order for min; forwards to the scalar-first typed overloads.
+  def min(b:Mat, a:Float, c:Mat):Mat = b match {
+    case fm:FMat => min(a, fm, c):FMat
+    case im:IMat => min(a.asInstanceOf[Int], im, c):IMat
+    case dm:DMat => min(DMat(a), dm, c):DMat
+    case gm:GMat => min(GMat(a), gm, c):GMat
+    case sm:SMat => min(a, sm, c):SMat
+  }
+  
+  // max of a Double scalar and a generic matrix, with output matrix c. The scalar is narrowed
+  // to Float for FMat/SMat, to Int for IMat, and wrapped in a DMat/GMat for DMat/GMat.
+  def max(a:Double, b:Mat, c:Mat):Mat = b match {
+    case fm:FMat => max(a.asInstanceOf[Float], fm, c):FMat
+    case im:IMat => max(a.asInstanceOf[Int], im, c):IMat
+    case dm:DMat => max(DMat(a), dm, c):DMat
+    case gm:GMat => max(GMat(a), gm, c):GMat
+    case sm:SMat => max(a.asInstanceOf[Float], sm, c):SMat
+  }
+  
+  // min of a Double scalar and a generic matrix; same conversions as max.
+  def min(a:Double, b:Mat, c:Mat):Mat = b match {
+    case fm:FMat => min(a.asInstanceOf[Float], fm, c):FMat
+    case im:IMat => min(a.asInstanceOf[Int], im, c):IMat
+    case dm:DMat => min(DMat(a), dm, c):DMat
+    case gm:GMat => min(GMat(a), gm, c):GMat
+    case sm:SMat => min(a.asInstanceOf[Float], sm, c):SMat
+  }
+  
+  // Matrix-first argument order; the SMat case forwards to the scalar-first overload.
+  def max(a:Mat, b:Double, c:Mat):Mat = a match {
+    case fm:FMat => max(fm, b.asInstanceOf[Float], c):FMat
+    case im:IMat => max(im, b.asInstanceOf[Int], c):IMat
+    case dm:DMat => max(dm, DMat(b), c):DMat
+    case gm:GMat => max(gm, GMat(b), c):GMat
+    case sm:SMat => max(b.asInstanceOf[Float], sm, c):SMat
+  }
+  
+  // Matrix-first argument order for min; the SMat case forwards to the scalar-first overload.
+  def min(a:Mat, b:Double, c:Mat):Mat = a match {
+    case fm:FMat => min(fm, b.asInstanceOf[Float], c):FMat
+    case im:IMat => min(im, b.asInstanceOf[Int], c):IMat
+    case dm:DMat => min(dm, DMat(b), c):DMat
+    case gm:GMat => min(gm, GMat(b), c):GMat
+    case sm:SMat => min(b.asInstanceOf[Float], sm, c):SMat
+  }
+   
+  // Generic minimum reduction along dimension b; forwards to the typed overload and fails
+  // with a MatchError for matrix types not listed.
+  def mini(a:Mat, b:Int):Mat = a match {
+    case fm:FMat => mini(fm, b):FMat
+    case im:IMat => mini(im, b):IMat
+    case dm:DMat => mini(dm, b):DMat
+    case gm:GMat => mini(gm, b):GMat
+  }
+  
+  // Generic maximum reduction along dimension b; same dispatch rules as mini.
+  def maxi(a:Mat, b:Int):Mat = a match {
+    case fm:FMat => maxi(fm, b):FMat
+    case im:IMat => maxi(im, b):IMat
+    case dm:DMat => maxi(dm, b):DMat
+    case gm:GMat => maxi(gm, b):GMat
+  }
+  
+  // Generic sum along dimension b; forwards to the typed overload. The SMat case is ascribed
+  // FMat. Matrix types not listed fail with a MatchError.
+  def sum(a:Mat, b:Int):Mat = a match {
+    case fm:FMat => sum(fm, b):FMat
+    case im:IMat => sum(im, b):IMat
+    case dm:DMat => sum(dm, b):DMat
+    case cm:CMat => sum(cm, b):CMat
+    case sm:SMat => sum(sm, b):FMat
+    case gm:GMat => sum(gm, b):GMat
+  }
+  
+  // Generic sum along dimension b with an output matrix c passed to the typed overload.
+  def sum(a:Mat, b:Int, c:Mat):Mat = a match {
+    case fm:FMat => sum(fm, b, c):FMat
+    case im:IMat => sum(im, b, c):IMat
+    case dm:DMat => sum(dm, b, c):DMat
+    case sm:SMat => sum(sm, b, c):FMat
+    case cm:CMat => sum(cm, b, c):CMat
+    case gm:GMat => sum(gm, b, c):GMat
+  }
+  
+  // Typed mean wrappers: each calls the generic _mean and casts the result to the declared
+  // return type. The one-argument forms pass 0 as the dimension.
+  def mean(a:FMat, dim0:Int):FMat = _mean(a, dim0).asInstanceOf[FMat]
+  
+  def mean(a:FMat):FMat = _mean(a, 0).asInstanceOf[FMat]
+   
+  def mean(a:DMat, dim0:Int):DMat = _mean(a, dim0).asInstanceOf[DMat]
+  
+  def mean(a:DMat):DMat = _mean(a, 0).asInstanceOf[DMat]
+  
+  // Integer matrices are declared to yield an FMat mean.
+  def mean(a:IMat, dim0:Int):FMat = _mean(a, dim0).asInstanceOf[FMat]
+  
+  def mean(a:IMat):FMat = _mean(a, 0).asInstanceOf[FMat]
+  
+  def mean(a:CMat, dim0:Int):CMat = _mean(a, dim0).asInstanceOf[CMat]
+  
+  def mean(a:CMat):CMat = _mean(a, 0).asInstanceOf[CMat]
+  
+  def mean(a:GMat, dim0:Int):GMat = _mean(a, dim0).asInstanceOf[GMat]
+  
+  def mean(a:GMat):GMat = _mean(a, 0).asInstanceOf[GMat]
+  
+  // Untyped entry points: return whatever _mean produces.
+  def mean(a:Mat, b:Int):Mat = _mean(a,b)
+  
+  def mean(a:Mat):Mat = _mean(a, 0):Mat
+  
+  /**
+   * Generic mean: a sum along the chosen dimension, scaled by the reciprocal of the count.
+   * A dim0 of 0 on a single-row matrix selects dimension 2; otherwise dim0 is clamped to at
+   * least 1. Dimension 1 divides by nrows; any other dimension divides by ncols.
+   */
+  def _mean(a:Mat, dim0:Int):Mat = {
+    val isDefaultOnRow = a.nrows == 1 && dim0 == 0
+    val dim = if (isDefaultOnRow) 2 else math.max(1, dim0)
+    dim match {
+      case 1 => sum(a, 1)*(1.0f/a.nrows)
+      case _ => sum(a, 2)*(1.0f/a.ncols)
+    }
+  }
+  
+  // Typed variance wrappers: each calls the generic _variance and casts the result to the
+  // declared return type. The one-argument forms pass 0 as the dimension.
+  def variance(a:FMat, dim0:Int):FMat = _variance(a, dim0).asInstanceOf[FMat]
+  
+  def variance(a:FMat):FMat = _variance(a, 0).asInstanceOf[FMat]
+   
+  def variance(a:DMat, dim0:Int):DMat = _variance(a, dim0).asInstanceOf[DMat]
+  
+  def variance(a:DMat):DMat = _variance(a, 0).asInstanceOf[DMat]
+  
+  // Integer matrices are declared to yield an FMat variance.
+  def variance(a:IMat, dim0:Int):FMat = _variance(a, dim0).asInstanceOf[FMat]
+  
+  def variance(a:IMat):FMat = _variance(a, 0).asInstanceOf[FMat]
+  
+  def variance(a:CMat, dim0:Int):CMat = _variance(a, dim0).asInstanceOf[CMat]
+  
+  def variance(a:CMat):CMat = _variance(a, 0).asInstanceOf[CMat]
+  
+  def variance(a:GMat, dim0:Int):GMat = _variance(a, dim0).asInstanceOf[GMat]
+  
+  def variance(a:GMat):GMat = _variance(a, 0).asInstanceOf[GMat]
+     
+  // Untyped entry points: return whatever _variance produces.
+  def variance(a:Mat, dim:Int) = _variance(a, dim)
+  
+  def variance(a:Mat):Mat = _variance(a, 0)
+ 
+  /**
+   * Generic variance computed as mean(a*a) - mean(a)*mean(a) along the chosen dimension
+   * (the divisor is the element count, not count-1). Dimension selection follows _mean:
+   * a dim0 of 0 on a single-row matrix selects dimension 2, otherwise dim0 is clamped to >= 1.
+   */
+  def _variance(a:Mat, dim0:Int):Mat = {
+    val isDefaultOnRow = a.nrows == 1 && dim0 == 0
+    val dim = if (isDefaultOnRow) 2 else math.max(1, dim0)
+    dim match {
+      case 1 =>
+        val mu = mean(a, 1)
+        val meanOfSquares = sum(a *@ a, 1)*(1.0f/a.nrows)
+        meanOfSquares - mu *@ mu
+      case _ =>
+        val mu = mean(a, 2)
+        val meanOfSquares = sum(a *@ a, 2)*(1.0f/a.ncols)
+        meanOfSquares - mu *@ mu
+    }
+  }
+
+  
+  /**
+   * Apply a unary function to every element of a DMat.
+   *
+   * @param a      input matrix
+   * @param omat   output matrix candidate handed to recycleTry
+   * @param vfn    vectorized routine (length, input array, output array); may be null
+   * @param efn    scalar Scala fallback; may be null
+   * @param nflops flop estimate per element, added to Mat.nflops
+   * @return the matrix obtained from recycleTry, filled with the results
+   * @throws RuntimeException if the scalar path is needed but efn is null
+   */
+  def applyDFun(a:DMat, omat:Mat, vfn:(Int, Array[Double], Array[Double])=>Unit, efn:(Double)=>Double, nflops:Long) ={
+    val result = recycleTry(omat, a)
+    val useVector = !Mat.noMKL && vfn != null
+    if (useVector) {
+      vfn(a.length, a.data, result.data)
+    } else {
+      if (efn == null) {
+        throw new RuntimeException("no Scala builtin version of this math function, sorry")
+      }
+      val src = a.data
+      val dst = result.data
+      val n = a.length
+      var k = 0
+      while (k < n) {
+        dst(k) = efn(src(k))
+        k += 1
+      }
+    }
+    Mat.nflops += nflops*a.length
+    result
+  }
+
+  /**
+   * Apply a unary function to every element of a DMat, where both the MKL routine and the
+   * Scala fallback operate on whole arrays.
+   *
+   * @param a      input matrix
+   * @param omat   output matrix candidate handed to recycleTry
+   * @param vfn    vectorized MKL routine (length, input array, output array); may be null
+   * @param efn    vectorized Scala fallback with the same shape; may be null
+   * @param nflops flop estimate per element, added to Mat.nflops
+   * @return the matrix obtained from recycleTry, filled with the results
+   * @throws RuntimeException if the fallback path is needed but efn is null
+   */
+  def applyDFunV(a:DMat, omat:Mat, vfn:(Int, Array[Double], Array[Double])=>Unit,
+                efn:(Int, Array[Double], Array[Double])=>Unit, nflops:Long) = {
+    val out = recycleTry(omat, a)
+    // Use the Scala routine when MKL is disabled or no MKL routine was supplied, as applyDFun
+    // does. Without the null check a missing vfn was invoked and raised a NullPointerException.
+    if (Mat.noMKL || vfn == null) {
+      if (efn == null) {
+        throw new RuntimeException("no Scala builtin version of this math function, sorry")
+      }
+      efn(a.length, a.data, out.data)
+    } else {
+      vfn(a.length, a.data, out.data)
+    }
+    Mat.nflops += nflops*a.length
+    out
+  }
+  
+   /**
+    * Apply a unary function to every element of an FMat.
+    *
+    * @param a      input matrix
+    * @param omat   output matrix candidate handed to recycleTry
+    * @param vfn    vectorized routine (length, input array, output array); may be null
+    * @param efn    scalar Scala fallback; may be null
+    * @param nflops flop estimate per element, added to Mat.nflops
+    * @return the matrix obtained from recycleTry, filled with the results
+    * @throws RuntimeException if the scalar path is needed but efn is null
+    */
+   def applySFun(a:FMat, omat:Mat, vfn:(Int, Array[Float], Array[Float])=>Unit, efn:(Float)=>Float, nflops:Long) ={
+    val result = recycleTry(omat, a)
+    val useVector = !Mat.noMKL && vfn != null
+    if (useVector) {
+      vfn(a.length, a.data, result.data)
+    } else {
+      if (efn == null) {
+        throw new RuntimeException("no Scala builtin version of this math function, sorry")
+      }
+      val src = a.data
+      val dst = result.data
+      val n = a.length
+      var k = 0
+      while (k < n) {
+        dst(k) = efn(src(k))
+        k += 1
+      }
+    }
+    Mat.nflops += nflops*a.length
+    result
+  }
+ 
+  /**
+   * Apply a unary function to every element of an FMat, where both the MKL routine and the
+   * Scala fallback operate on whole arrays.
+   *
+   * @param a      input matrix
+   * @param omat   output matrix candidate handed to recycleTry
+   * @param vfn    vectorized MKL routine (length, input array, output array); may be null
+   * @param efn    vectorized Scala fallback with the same shape; may be null
+   * @param nflops flop estimate per element, added to Mat.nflops
+   * @return the matrix obtained from recycleTry, filled with the results
+   * @throws RuntimeException if the fallback path is needed but efn is null
+   */
+  def applySFunV(a:FMat, omat:Mat, vfn:(Int, Array[Float], Array[Float])=>Unit, 
+                 efn:(Int, Array[Float], Array[Float])=>Unit, nflops:Long) ={
+    val out = recycleTry(omat, a)
+    // Use the Scala routine when MKL is disabled or no MKL routine was supplied, as applySFun
+    // does. Without the null check a missing vfn was invoked and raised a NullPointerException.
+    if (Mat.noMKL || vfn == null) {
+      if (efn == null) {
+        throw new RuntimeException("no Scala builtin version of this math function, sorry")
+      }
+      efn(a.length, a.data, out.data)
+    } else {
+      vfn(a.length, a.data, out.data)
+    }
+    Mat.nflops += nflops*a.length
+    out
+  }
+  
+  /**
+   * Apply a binary function element by element to two DMats.
+   *
+   * @param a      first input matrix; its length determines how many elements are processed
+   * @param b      second input matrix
+   * @param omat   output matrix candidate handed to recycleTry
+   * @param vfn    vectorized MKL routine (length, a array, b array, output array); may be null
+   * @param efn    scalar Scala fallback; may be null
+   * @param nflops flop estimate per element, added to Mat.nflops
+   * @return the matrix obtained from recycleTry, filled with the results
+   * @throws RuntimeException if the scalar path is needed but efn is null
+   */
+  def applyD2Fun(a:DMat, b:DMat, omat:Mat, 
+      vfn:(Int, Array[Double], Array[Double], Array[Double]) => Unit, 
+      efn:(Double, Double)=>Double, nflops:Long):DMat = {
+    val out = recycleTry(omat, a, b)
+    // Use the scalar routine when MKL is disabled or no MKL routine was supplied, as applyDFun
+    // does. Without the null check a missing vfn was invoked and raised a NullPointerException.
+    if (Mat.noMKL || vfn == null) {
+      if (efn == null) {
+        throw new RuntimeException("no Scala builtin version of this math function, sorry")
+      }
+      var i = 0; val len = a.length; val odata = out.data; val adata = a.data; val bdata = b.data
+      while (i < len) {odata(i) = efn(adata(i), bdata(i)); i += 1}
+    } else {
+      vfn(a.length, a.data, b.data, out.data)
+    }
+    Mat.nflops += nflops*a.length
+    out
+  }
+  
+  // Element-wise sign of a DMat. No vector routine is supplied (null), so applyDFun always
+  // uses the scalar math.signum path. The one-argument form passes a new DMat as output.
+  def sign(a:DMat, out:Mat) = applyDFun(a, out, null, math.signum _, 1L)
+  def sign(a:DMat):DMat = sign(a, DMat(a.nrows, a.ncols))
+  
+  // Element-wise absolute value of a DMat: vdAbs as the vector routine, math.abs as the scalar fallback.
+  def abs(a:DMat, out:Mat) = applyDFun(a, out, vdAbs _, math.abs _, 1L)
+  def abs(a:DMat):DMat = abs(a, DMat(a.nrows, a.ncols))
+
+  // Scala fallback for exp over arrays: writes math.exp(a(i)) into b(i) for i in [0, n).
+  def _vdexp(n:Int, a:Array[Double], b:Array[Double]) = {var i=0 ; while (i<n) {b(i) = math.exp(a(i)); i+=1}}
+  // Element-wise exponential of a DMat. `out` is typed Mat like every other two-argument
+  // function here. When it was typed DMat, a call exp(dmat, mat) from the generic
+  // exp(a:Mat, b:Mat) dispatcher could only resolve back to that dispatcher and recursed
+  // without end.
+  def exp(a:DMat, out:Mat) = applyDFunV(a, out, vdExp _, _vdexp _, 10L)
+
+  // Convenience form: passes a new DMat of the same dimensions as the output.
+  def exp(a:DMat):DMat = exp(a, DMat(a.nrows, a.ncols))
+  
+  // Element-wise DMat math functions. Each two-argument form calls applyDFun with a vd*
+  // vector routine, a scala.math scalar fallback, and a per-element flop estimate of 10.
+  // Each one-argument form passes a new DMat(a.nrows, a.ncols) as the output argument.
+  def expm1(a:DMat, out:Mat) = applyDFun(a, out, vdExpm1 _, math.expm1 _, 10L)
+  def expm1(a:DMat):DMat = expm1(a, DMat(a.nrows, a.ncols))
+  
+  def sqrt(a:DMat, out:Mat) = applyDFun(a, out, vdSqrt _, math.sqrt _, 10L)
+  def sqrt(a:DMat):DMat = sqrt(a, DMat(a.nrows, a.ncols))
+
+  // Natural logarithm (scalar fallback is math.log).
+  def ln(a:DMat, out:Mat) = applyDFun(a, out, vdLn _, math.log _, 10L)
+  def ln(a:DMat):DMat = ln(a, DMat(a.nrows, a.ncols))
+  
+  def log10(a:DMat, out:Mat) = applyDFun(a, out, vdLog10 _, math.log10 _, 10L)
+  def log10(a:DMat):DMat = log10(a, DMat(a.nrows, a.ncols))
+  
+  def log1p(a:DMat, out:Mat) = applyDFun(a, out, vdLog1p _, math.log1p _, 10L)
+  def log1p(a:DMat):DMat = log1p(a, DMat(a.nrows, a.ncols))
+  
+  // Trigonometric functions.
+  def cos(a:DMat, out:Mat) = applyDFun(a, out, vdCos _, math.cos _, 10L)
+  def cos(a:DMat):DMat = cos(a, DMat(a.nrows, a.ncols))
+  
+  def sin(a:DMat, out:Mat) = applyDFun(a, out, vdSin _, math.sin _, 10L)
+  def sin(a:DMat):DMat = sin(a, DMat(a.nrows, a.ncols))
+  
+  def tan(a:DMat, out:Mat) = applyDFun(a, out, vdTan _, math.tan _, 10L)
+  def tan(a:DMat):DMat = tan(a, DMat(a.nrows, a.ncols))
+  
+  // Hyperbolic functions.
+  def cosh(a:DMat, out:Mat) = applyDFun(a, out, vdCosh _, math.cosh _, 10L)
+  def cosh(a:DMat):DMat = cosh(a, DMat(a.nrows, a.ncols))
+  
+  def sinh(a:DMat, out:Mat) = applyDFun(a, out, vdSinh _, math.sinh _, 10L)
+  def sinh(a:DMat):DMat = sinh(a, DMat(a.nrows, a.ncols))
+  
+  def tanh(a:DMat, out:Mat) = applyDFun(a, out, vdTanh _, math.tanh _, 10L)
+  def tanh(a:DMat):DMat = tanh(a, DMat(a.nrows, a.ncols))
+  
+  // Inverse trigonometric functions.
+  def acos(a:DMat, out:Mat) = applyDFun(a, out, vdAcos _, math.acos _, 10L)
+  def acos(a:DMat):DMat = acos(a, DMat(a.nrows, a.ncols))
+
+  def asin(a:DMat, out:Mat) = applyDFun(a, out, vdAsin _, math.asin _, 10L)
+  def asin(a:DMat):DMat = asin(a, DMat(a.nrows, a.ncols))
+  
+  def atan(a:DMat, out:Mat) = applyDFun(a, out, vdAtan _, math.atan _, 10L)
+  def atan(a:DMat):DMat = atan(a, DMat(a.nrows, a.ncols))
+
+  // Inverse hyperbolic cosine of a DMat. Uses the inverse routine vdAcosh; this previously
+  // passed vdCosh and so computed cosh. There is no Scala fallback (efn is null), so
+  // applyDFun throws when Mat.noMKL is set.
+  def acosh(a:DMat, out:Mat) = applyDFun(a, out, vdAcosh _, null, 10L)
+  def acosh(a:DMat):DMat = acosh(a, DMat(a.nrows, a.ncols))
+  
+  // Inverse hyperbolic sine of a DMat. Uses vdAsinh; this previously passed vdSinh and so
+  // computed sinh. No Scala fallback.
+  def asinh(a:DMat, out:Mat) = applyDFun(a, out, vdAsinh _, null, 10L)
+  def asinh(a:DMat):DMat = asinh(a, DMat(a.nrows, a.ncols))
+  
+  // The functions from here through gamma pass null as the scalar fallback, so applyDFun
+  // throws a RuntimeException for them when Mat.noMKL is set. Each one-argument form passes
+  // a new DMat(a.nrows, a.ncols) as the output argument.
+  def atanh(a:DMat, out:Mat) = applyDFun(a, out, vdAtanh _, null, 10L)
+  def atanh(a:DMat):DMat = atanh(a, DMat(a.nrows, a.ncols))
+  
+  def erf(a:DMat, out:Mat) = applyDFun(a, out, vdErf _, null, 10L)
+  def erf(a:DMat):DMat = erf(a, DMat(a.nrows, a.ncols))
+  
+  def erfinv(a:DMat, out:Mat) = applyDFun(a, out, vdErfInv _, null, 10L)
+  def erfinv(a:DMat):DMat = erfinv(a, DMat(a.nrows, a.ncols))
+  
+  def erfc(a:DMat, out:Mat) = applyDFun(a, out, vdErfc _, null, 10L)
+  def erfc(a:DMat):DMat = erfc(a, DMat(a.nrows, a.ncols))
+  
+  def erfcinv(a:DMat, out:Mat) = applyDFun(a, out, vdErfcInv _, null, 10L)
+  def erfcinv(a:DMat):DMat = erfcinv(a, DMat(a.nrows, a.ncols))
+  
+  def normcdf(a:DMat, out:Mat) = applyDFun(a, out, vdCdfNorm _, null, 10L)
+  def normcdf(a:DMat):DMat = normcdf(a, DMat(a.nrows, a.ncols))
+  
+  def norminv(a:DMat, out:Mat) = applyDFun(a, out, vdCdfNormInv _, null, 10L)
+  def norminv(a:DMat):DMat = norminv(a, DMat(a.nrows, a.ncols))
+  
+  def gammaln(a:DMat, out:Mat) = applyDFun(a, out, vdLGamma _, null, 10L)
+  def gammaln(a:DMat):DMat = gammaln(a, DMat(a.nrows, a.ncols))
+  
+  def gamma(a:DMat, out:Mat) = applyDFun(a, out, vdTGamma _, null, 10L)
+  def gamma(a:DMat):DMat = gamma(a, DMat(a.nrows, a.ncols))
+  
+  // Rounding functions, counted as 1 flop per element.
+  def ceil(a:DMat, out:Mat) = applyDFun(a, out, vdCeil _, math.ceil, 1L)
+  def ceil(a:DMat):DMat = ceil(a, DMat(a.nrows, a.ncols))
+  
+  def floor(a:DMat, out:Mat) = applyDFun(a, out, vdFloor _, math.floor, 1L)
+  def floor(a:DMat):DMat = floor(a, DMat(a.nrows, a.ncols))
+
+  // The scalar fallback rounds via floor(x+0.5).
+  def round(a:DMat, out:Mat) = applyDFun(a, out, vdRound _, (x)=>(math.floor(x+0.5)), 1L)
+  def round(a:DMat):DMat = round(a, DMat(a.nrows, a.ncols))
+  
+  // No scalar fallback for trunc.
+  def trunc(a:DMat, out:Mat) = applyDFun(a, out, vdTrunc _, null, 1L)
+  def trunc(a:DMat):DMat = trunc(a, DMat(a.nrows, a.ncols))
+  
+  // Two-input functions go through applyD2Fun; the one-output form sizes the output from a.
+  def atan2(a:DMat, b:DMat, out:Mat) = applyD2Fun(a, b, out, vdAtan2 _, math.atan2, 10L)
+  def atan2(a:DMat, b:DMat):DMat = atan2(a, b, DMat(a.nrows, a.ncols))
+  
+  def pow(a:DMat, b:DMat, out:Mat) = applyD2Fun(a, b, out, vdPow _, math.pow, 10L)
+  def pow(a:DMat, b:DMat):DMat = pow(a, b, DMat(a.nrows, a.ncols))
+  
+  // Piecewise function: 0.5*x*x for x < 1, x - 0.5 otherwise. Scalar path only (vfn is null).
+  def exppsi(a:DMat, out:Mat) = applyDFun(a, out, null, (x:Double)=>if (x<1.0) 0.5*x*x else x-0.5, 1L)
+  def exppsi(a:DMat):DMat = exppsi(a, DMat(a.nrows, a.ncols))
+  
+  
+  // Standard deviation of a DMat along dim0: the element-wise square root of variance(a, dim0).
+  def sdev(a:DMat, dim0:Int):DMat = {
+    val v = variance(a, dim0)
+    sqrt(v)
+  }
+  // One-argument form passes 0 as the dimension.
+  def sdev(a:DMat):DMat = {
+    sdev(a, 0)
+  }
+
+  // Standard deviation of an FMat along dim0: the element-wise square root of variance(a, dim0).
+  def sdev(a:FMat, dim0:Int):FMat = {
+    val v = variance(a, dim0)
+    sqrt(v)
+  }
+  // One-argument form passes 0 as the dimension.
+  def sdev(a:FMat):FMat = {
+    sdev(a, 0)
+  }
+  
+  // Element-wise sign of an FMat. No vector routine is supplied (null), so applySFun always
+  // uses the scalar math.signum path. The one-argument form passes a new FMat as output.
+  def sign(a:FMat, out:Mat) = applySFun(a, out, null, math.signum _, 1L)
+  def sign(a:FMat):FMat = sign(a, FMat(a.nrows, a.ncols))
+  
+  // Element-wise absolute value of an FMat: vsAbs as the vector routine, math.abs as the scalar fallback.
+  def abs(a:FMat, out:Mat) = applySFun(a, out, vsAbs _, math.abs _, 1L)
+  def abs(a:FMat):FMat = abs(a, FMat(a.nrows, a.ncols))
+
+  // Scala array routine for single-precision exp: writes math.exp(a(i)) as a Float into b(i) for i in [0, n).
+  def _vsexp(n:Int, a:Array[Float], b:Array[Float]) = {var i=0 ; while (i<n) {b(i) = math.exp(a(i)).asInstanceOf[Float]; i+=1}}  
+  // Element-wise exponential of an FMat. The scalar fallback is math.exp; it previously
+  // called math.expm1, so exp returned exp(x)-1 whenever the scalar path was taken.
+  def exp(a:FMat, out:Mat) = applySFun(a, out, vsExp _, (x:Float) => math.exp(x).asInstanceOf[Float], 10L)
+  // Convenience form: passes a new FMat of the same dimensions as the output.
+  def exp(a:FMat):FMat = exp(a, FMat(a.nrows, a.ncols))
+  
+  // Element-wise FMat math functions. Each two-argument form calls applySFun with a vs*
+  // vector routine, a scalar fallback that evaluates the scala.math function and converts
+  // the result back to Float, and a per-element flop estimate of 10. Each one-argument form
+  // passes a new FMat(a.nrows, a.ncols) as the output argument.
+  def expm1(a:FMat, out:Mat) = applySFun(a, out, vsExpm1 _, (x:Float) => math.expm1(x).asInstanceOf[Float], 10L)
+  def expm1(a:FMat):FMat = expm1(a, FMat(a.nrows, a.ncols))
+  
+  def sqrt(a:FMat, out:Mat) = applySFun(a, out, vsSqrt _, (x:Float) => math.sqrt(x).asInstanceOf[Float], 10L)
+  def sqrt(a:FMat):FMat = sqrt(a, FMat(a.nrows, a.ncols))
+  
+  // Natural logarithm (scalar fallback is math.log).
+  def ln(a:FMat, out:Mat) = applySFun(a, out, vsLn _, (x:Float) => math.log(x).asInstanceOf[Float], 10L)
+  def ln(a:FMat):FMat = ln(a, FMat(a.nrows, a.ncols))
+  
+  def log10(a:FMat, out:Mat) = applySFun(a, out, vsLog10 _, (x:Float) => math.log10(x).asInstanceOf[Float], 10L)
+  def log10(a:FMat):FMat = log10(a, FMat(a.nrows, a.ncols))
+  
+  def log1p(a:FMat, out:Mat) = applySFun(a, out, vsLog1p _, (x:Float) => math.log1p(x).asInstanceOf[Float], 10L)
+  def log1p(a:FMat):FMat = log1p(a, FMat(a.nrows, a.ncols))
+  
+  // Trigonometric functions.
+  def cos(a:FMat, out:Mat) = applySFun(a, out, vsCos _, (x:Float) => math.cos(x).asInstanceOf[Float], 10L)
+  def cos(a:FMat):FMat = cos(a, FMat(a.nrows, a.ncols))
+  
+  def sin(a:FMat, out:Mat) = applySFun(a, out, vsSin _, (x:Float) => math.sin(x).asInstanceOf[Float], 10L)
+  def sin(a:FMat):FMat = sin(a, FMat(a.nrows, a.ncols))
+  
+  def tan(a:FMat, out:Mat) = applySFun(a, out, vsTan _, (x:Float) => math.tan(x).asInstanceOf[Float], 10L)
+  def tan(a:FMat):FMat = tan(a, FMat(a.nrows, a.ncols))
+  
+  // Hyperbolic functions.
+  def cosh(a:FMat, out:Mat) = applySFun(a, out, vsCosh _, (x:Float) => math.cosh(x).asInstanceOf[Float], 10L)
+  def cosh(a:FMat):FMat = cosh(a, FMat(a.nrows, a.ncols))
+  
+  def sinh(a:FMat, out:Mat) = applySFun(a, out, vsSinh _, (x:Float) => math.sinh(x).asInstanceOf[Float], 10L)
+  def sinh(a:FMat):FMat = sinh(a, FMat(a.nrows, a.ncols))
+  
+  def tanh(a:FMat, out:Mat) = applySFun(a, out, vsTanh _, (x:Float) => math.tanh(x).asInstanceOf[Float], 10L)
+  def tanh(a:FMat):FMat = tanh(a, FMat(a.nrows, a.ncols))
+  
+  // Inverse trigonometric functions.
+  def acos(a:FMat, out:Mat) = applySFun(a, out, vsAcos _, (x:Float) => math.acos(x).asInstanceOf[Float], 10L)
+  def acos(a:FMat):FMat = acos(a, FMat(a.nrows, a.ncols))
+
+  def asin(a:FMat, out:Mat) = applySFun(a, out, vsAsin _, (x:Float) => math.asin(x).asInstanceOf[Float], 10L)
+  def asin(a:FMat):FMat = asin(a, FMat(a.nrows, a.ncols))
+  
+  def atan(a:FMat, out:Mat) = applySFun(a, out, vsAtan _, (x:Float) => math.atan(x).asInstanceOf[Float], 10L)
+  def atan(a:FMat):FMat = atan(a, FMat(a.nrows, a.ncols))
+
+  // Inverse hyperbolic cosine of an FMat. Uses the inverse routine vsAcosh; this previously
+  // passed vsCosh and so computed cosh. There is no Scala fallback (efn is null), so
+  // applySFun throws when Mat.noMKL is set.
+  def acosh(a:FMat, out:Mat) = applySFun(a, out, vsAcosh _, null, 10L)
+  def acosh(a:FMat):FMat = acosh(a, FMat(a.nrows, a.ncols))
+  
+  // Inverse hyperbolic sine of an FMat. Uses vsAsinh; this previously passed vsSinh and so
+  // computed sinh. No Scala fallback.
+  def asinh(a:FMat, out:Mat) = applySFun(a, out, vsAsinh _, null, 10L)
+  def asinh(a:FMat):FMat = asinh(a, FMat(a.nrows, a.ncols))
+  
+  // The functions from here through gamma pass null as the scalar fallback, so applySFun
+  // throws a RuntimeException for them when Mat.noMKL is set. Each one-argument form passes
+  // a new FMat(a.nrows, a.ncols) as the output argument.
+  def atanh(a:FMat, out:Mat) = applySFun(a, out, vsAtanh _, null, 10L)
+  def atanh(a:FMat):FMat = atanh(a, FMat(a.nrows, a.ncols))
+  
+  def erf(a:FMat, out:Mat) = applySFun(a, out, vsErf _, null, 10L)
+  def erf(a:FMat):FMat = erf(a, FMat(a.nrows, a.ncols))
+  
+  def erfinv(a:FMat, out:Mat) = applySFun(a, out, vsErfInv _, null, 10L)
+  def erfinv(a:FMat):FMat = erfinv(a, FMat(a.nrows, a.ncols))
+  
+  def erfc(a:FMat, out:Mat) = applySFun(a, out, vsErfc _, null, 10L)
+  def erfc(a:FMat):FMat = erfc(a, FMat(a.nrows, a.ncols))
+  
+  def erfcinv(a:FMat, out:Mat) = applySFun(a, out, vsErfcInv _, null, 10L)
+  def erfcinv(a:FMat):FMat = erfcinv(a, FMat(a.nrows, a.ncols))
+  
+  def normcdf(a:FMat, out:Mat) = applySFun(a, out, vsCdfNorm _, null, 10L)
+  def normcdf(a:FMat):FMat = normcdf(a, FMat(a.nrows, a.ncols))
+  
+  def norminv(a:FMat, out:Mat) = applySFun(a, out, vsCdfNormInv _, null, 10L)
+  def norminv(a:FMat):FMat = norminv(a, FMat(a.nrows, a.ncols))
+  
+  def gammaln(a:FMat, out:Mat) = applySFun(a, out, vsLGamma _, null, 10L)
+  def gammaln(a:FMat):FMat = gammaln(a, FMat(a.nrows, a.ncols))
+  
+  def gamma(a:FMat, out:Mat) = applySFun(a, out, vsTGamma _, null, 10L)
+  def gamma(a:FMat):FMat = gamma(a, FMat(a.nrows, a.ncols))
+  
+  // Rounding functions, counted as 1 flop per element.
+  def ceil(a:FMat, out:Mat) = applySFun(a, out, vsCeil _, (x:Float) => math.ceil(x).asInstanceOf[Float], 1L)
+  def ceil(a:FMat):FMat = ceil(a, FMat(a.nrows, a.ncols))
+  
+  def floor(a:FMat, out:Mat) = applySFun(a, out, vsFloor _, (x:Float) => math.floor(x).asInstanceOf[Float], 1L)
+  def floor(a:FMat):FMat = floor(a, FMat(a.nrows, a.ncols))
+
+  // The scalar fallback rounds via floor(x+0.5).
+  def round(a:FMat, out:Mat) = applySFun(a, out, vsRound _, (x:Float)=>math.floor(x+0.5).asInstanceOf[Float], 1L)
+  def round(a:FMat):FMat = round(a, FMat(a.nrows, a.ncols))
+  
+  // No scalar fallback for trunc.
+  def trunc(a:FMat, out:Mat) = applySFun(a, out, vsTrunc _, null, 1L)
+  def trunc(a:FMat):FMat = trunc(a, FMat(a.nrows, a.ncols))
+  
+  // Piecewise function: 0.5*x*x for x < 1, x - 0.5 otherwise. Scalar path only (vfn is null).
+  def exppsi(a:FMat, out:Mat) = applySFun(a, out, null, (x:Float)=>if (x<1.0f) 0.5f*x*x else x-0.5f, 1L)
+  def exppsi(a:FMat):FMat = exppsi(a, FMat(a.nrows, a.ncols))
+
+  // Thin wrapper: forwards the mode value n to vmlSetMode and returns its result.
+  def setVMLmode(n:Int) = vmlSetMode(n)
+
+  // Thin wrapper: returns the value reported by vmlGetMode.
+  def getVMLmode():Int = vmlGetMode()
+
+  // Throws a RuntimeException unless a and b have identical row and column counts.
+  private def checkSizes(a:Mat, b:Mat) = {
+    val sameShape = a.nrows == b.nrows && a.ncols == b.ncols
+    if (!sameShape) {
+      throw new RuntimeException("argument dims mismatch")
+    }
+  }
+  
+  // Throws a RuntimeException unless a, b and c all have identical row and column counts.
+  private def checkSizes(a:Mat, b:Mat, c:DMat) = {
+    val abMatch = a.nrows == b.nrows && a.ncols == b.ncols
+    val acMatch = a.nrows == c.nrows && a.ncols == c.ncols
+    if (!(abMatch && acMatch)) {
+      throw new RuntimeException("argument dims mismatch")
+    }
+  }
+
+  /**
+   * Build a random sparse nrows x ncols SMat.
+   *
+   * Row gaps between consecutive stored entries are drawn from geornd(v, ...) and values from
+   * rand(...). Columns are filled in order, and a row index that runs past nrows carries over
+   * into the next column.
+   * NOTE(review): v looks like the target density (fraction of nonzeros) -- confirm against geornd.
+   *
+   * @param nrows number of rows
+   * @param ncols number of columns
+   * @param v     parameter passed to geornd; also scales the storage estimate and flop count
+   * @return the result of sparseTrim on the filled matrix, wrapped as an SMat
+   */
+  def sprand(nrows:Int, ncols:Int, v:Double):SMat = {
+    // Index offset applied to stored row indices and column pointers.
+    val ioff = Mat.ioneBased
+    // Storage estimate: 1.5x the expected count, but at least min(nrows*ncols, 200).
+    val out = SMat(nrows, ncols, math.max(math.min(nrows*ncols, 200),(1.5*v*nrows*ncols).intValue))
+    Mat.nflops += (5L*nrows*ncols*v).toLong
+    val vec = geornd(v, 1, out.nnz)
+    val vals = rand(1, out.nnz)
+    var irow = vec.data(0).intValue
+    var ipos = 0
+    var i = 0
+    out.jc(0) = ioff
+    while (i < ncols) {
+      // Fill column i until the row index leaves the column or storage is nearly exhausted.
+      // The nnz-1 bound keeps the vec.data(ipos) read below in range after ipos is incremented.
+      while (irow < nrows && ipos < out.nnz-1) {
+  	out.data(ipos) = vals.data(ipos)
+  	out.ir(ipos) = irow+ioff
+  	ipos += 1
+  	irow += 1 + vec.data(ipos).intValue
+      }    
+      // Carry the overshoot into the next column.
+      irow = irow - nrows
+      out.jc(i+1) = ipos+ioff
+      i += 1
+    }
+    SMat(out.sparseTrim)
+  }
+
+  /**
+   * Histogram counts of the values in a over the bins whose lower edges are given by b.
+   *
+   * Both a and b are scanned once in storage order, so they are expected to be sorted in
+   * ascending order. Bin i counts the values v with b(i) <= v < b(i+1); the last bin has no
+   * upper bound, and values below b(0) are counted in bin 0.
+   *
+   * Fixes relative to the previous version:
+   *  - the element that triggered a move to the next bin was dropped instead of counted;
+   *  - only one bin was advanced per element, so a value that skipped a bin was mis-binned;
+   *  - the running count was always written to the last bin, even when the scan ended earlier.
+   *
+   * @param a values to bin (ascending)
+   * @param b bin lower edges (ascending)
+   * @return a b.length x 1 IMat of counts
+   */
+  def histc(a:DMat, b:DMat):IMat = {
+    val out = IMat(b.length, 1)
+    val last = b.length - 1
+    var i = 0
+    var hc = 0
+    var j = 0
+    while (j < a.length) {
+      // Close every bin whose upper edge is at or below the current value.
+      while (i < last && a.data(j) >= b.data(i+1)) {
+        out.data(i) = hc
+        hc = 0
+        i += 1
+      }
+      hc += 1
+      j += 1
+    }
+    if (b.length > 0) {
+      // Store the count of the bin the scan ended in, then zero any bins never reached.
+      out.data(i) = hc
+      i += 1
+      while (i <= last) {
+        out.data(i) = 0
+        i += 1
+      }
+    }
+    out
+  }
+  
+  /**
+   * Compute an ROC curve sampled at nxvals points.
+   *
+   * @param score0 scores, as a row or column; transposed to a column when wider than tall
+   * @param vpos0  non-negative weights of positive examples, indexed like the scores
+   * @param vneg0  non-negative weights of negative examples, indexed like the scores
+   * @param nxvals number of sample points along the cumulative-negative axis
+   * @return a DMat holding 0 followed by the cumulative positive weight at each sample
+   *         point, scaled by 1/(total positive weight)
+   *
+   * Calls sys.error if any entry of vpos or vneg is negative.
+   */
+  def roc(score0:DMat, vpos0:DMat, vneg0:DMat, nxvals:Int):DMat = {
+    import BIDMat.MatFunctions._
+    var score:DMat = null
+    if (size(score0,2) > size(score0,1)) {
+      score = score0.t
+    } else {
+      score = score0
+    };
+    // Reorder both weight vectors by the index permutation sortdown2 returns for the scores.
+    // NOTE(review): presumably sortdown2 sorts descending -- confirm.
+    var (vv, ii) = sortdown2(score);
+    var vpos = vpos0(ii);
+    var vneg = vneg0(ii);
+    var n = length(vpos);
+    if (size(vpos,2) > 1) {
+      vpos = vpos.t
+    };
+    if (size(vneg,2) > 1) {
+      vneg = vneg.t;
+    };
+    if (nnz(vneg < 0.0) + nnz(vpos < 0.0) > 0) {
+      sys.error("ROCcurve assumes vneg & vpos >= 0");
+    };
+
+    // Running totals of positive and negative weight in the reordered sequence.
+    var tp = cumsum(vpos);
+    var fp = cumsum(vneg);
+    var npos = tp(n-1);
+    var nneg = fp(n-1);
+    // nxvals evenly spaced thresholds on the cumulative-negative axis, up to the total nneg.
+    var xvals:FMat = row(1 to nxvals)*(1.0*nneg/nxvals)
+    // Bin the cumulative negatives by those thresholds; cumulative bin counts (at least 1)
+    // give the index into tp for each sample point.
+    var nc:IMat = histc(fp, 0.0f \ xvals);
+    var loci = max(cumsum(nc(0 until nxvals)), 1);
+    val curve = (0.0 on tp(loci-1, 0))*(1.0/npos)
+    curve
+  }
+  
+  /**
+   * Apply the unary GPU function selected by opn to every element of `in`, writing into the
+   * matrix that recycleTry returns for omat. Adds kflops per element to Mat.nflops.
+   */
+  def applyGfun(in:GMat, omat:Mat, opn:Int, kflops:Long):GMat = {
+    val result = recycleTry(omat, in)
+    val nelem = in.nrows*in.ncols
+    CUMAT.applygfun(in.data, result.data, nelem, opn)
+    JCuda.cudaDeviceSynchronize()
+    Mat.nflops += kflops*in.length
+    result
+  }
+
+  /**
+   * Apply the unary GPU function selected by opn to every element of `in`, writing into a
+   * newly created GMat of the same dimensions. Adds kflops per element to Mat.nflops.
+   */
+  def applyGfun(in:GMat, opn:Int, kflops:Long):GMat = {
+    val result = GMat(in.nrows, in.ncols)
+    val nelem = in.nrows*in.ncols
+    CUMAT.applygfun(in.data, result.data, nelem, opn)
+    JCuda.cudaDeviceSynchronize()
+    Mat.nflops += kflops*in.length
+    result
+  }
+  
+  /**
+   * Apply the binary GPU function selected by opn element by element to a and b.
+   *
+   * The result is written into the matrix that recycleTry returns for omat, as the
+   * single-input applyGfun does. Previously omat was ignored and a new GMat was allocated on
+   * every call, so callers passing an output matrix never saw it filled.
+   *
+   * @param a      first operand
+   * @param b      second operand; must have the same dimensions as a
+   * @param omat   output matrix candidate handed to recycleTry
+   * @param opn    operation code passed to CUMAT.applygfun2
+   * @param kflops flop estimate per element, added to Mat.nflops
+   * @throws RuntimeException if a and b differ in dimensions
+   */
+  def applyGfun2(a:GMat, b:GMat, omat:Mat, opn:Int, kflops:Long):GMat = {   
+    if (a.nrows == b.nrows && a.ncols == b.ncols) {
+      // a and b have equal dimensions here, so sizing the output from a alone is sufficient.
+      val out = recycleTry(omat, a)
+      CUMAT.applygfun2(a.data, b.data, out.data, a.nrows*a.ncols, opn)
+      JCuda.cudaDeviceSynchronize()
+      Mat.nflops += kflops*a.length
+      out
+    } else {
+      throw new RuntimeException("Dimensions mismatch")
+    }
+  }
+
+  /**
+   * Apply the binary GPU function selected by opn element by element to a and b, writing into
+   * a newly created GMat. Throws a RuntimeException when a and b differ in dimensions.
+   * Adds kflops per element to Mat.nflops.
+   */
+  def applyGfun2(a:GMat, b:GMat, opn:Int, kflops:Long):GMat = {
+    val sameShape = a.nrows == b.nrows && a.ncols == b.ncols
+    if (!sameShape) {
+      throw new RuntimeException("Dimensions mismatch")
+    }
+    val result = GMat(a.nrows, a.ncols)
+    val nelem = a.nrows*a.ncols
+    CUMAT.applygfun2(a.data, b.data, result.data, nelem, opn)
+    JCuda.cudaDeviceSynchronize()
+    Mat.nflops += kflops*a.length
+    result
+  }
+  import GMat.TransF
+
+  // Element-wise GPU math functions with an output matrix. Each forwards to applyGfun with
+  // the matching TransF operation code and a per-element flop estimate.
+  def abs(in:GMat, out:Mat):GMat =     applyGfun(in, out, TransF.abs, 1L)
+  def exp(in:GMat, out:Mat):GMat =     applyGfun(in, out, TransF.exp, 10L)
+  def expm1(in:GMat, out:Mat):GMat =   applyGfun(in, out, TransF.expm1, 10L)
+  def sqrt(in:GMat, out:Mat):GMat =    applyGfun(in, out, TransF.sqrt, 10L)
+  def ln(in:GMat, out:Mat):GMat =      applyGfun(in, out, TransF.ln, 10L)
+  def log10(in:GMat, out:Mat):GMat =   applyGfun(in, out, TransF.log10, 10L)
+  def log1p(in:GMat, out:Mat):GMat =   applyGfun(in, out, TransF.log1p, 10L)
+  def cos(in:GMat, out:Mat):GMat =     applyGfun(in, out, TransF.cos, 10L)
+  def sin(in:GMat, out:Mat):GMat =     applyGfun(in, out, TransF.sin, 10L)
+  def tan(in:GMat, out:Mat):GMat =     applyGfun(in, out, TransF.tan, 10L)
+  def cosh(in:GMat, out:Mat):GMat =    applyGfun(in, out, TransF.cosh, 10L)
+  def sinh(in:GMat, out:Mat):GMat =    applyGfun(in, out, TransF.sinh, 10L)
+  def tanh(in:GMat, out:Mat):GMat =    applyGfun(in, out, TransF.tanh, 10L)
+  def acos(in:GMat, out:Mat):GMat =    applyGfun(in, out, TransF.acos, 10L)
+  def asin(in:GMat, out:Mat):GMat =    applyGfun(in, out, TransF.asin, 10L)
+  def atan(in:GMat, out:Mat):GMat =    applyGfun(in, out, TransF.atan, 10L)
+  def acosh(in:GMat, out:Mat):GMat =   applyGfun(in, out, TransF.acosh, 10L)
+  def asinh(in:GMat, out:Mat):GMat =   applyGfun(in, out, TransF.asinh, 10L)
+  def atanh(in:GMat, out:Mat):GMat =   applyGfun(in, out, TransF.atanh, 10L)
+  def erf(in:GMat, out:Mat):GMat =     applyGfun(in, out, TransF.erf, 10L)
+  def erfinv(in:GMat, out:Mat):GMat =  applyGfun(in, out, TransF.erfinv, 10L)
+  def erfc(in:GMat, out:Mat):GMat =    applyGfun(in, out, TransF.erfc, 10L)
+  // erfcinv matches the DMat/FMat function name. The misspelled ercinv is kept so existing
+  // callers continue to compile.
+  def erfcinv(in:GMat, out:Mat):GMat = applyGfun(in, out, TransF.erfcinv, 10L)
+  def ercinv(in:GMat, out:Mat):GMat =  applyGfun(in, out, TransF.erfcinv, 10L)
+  def gammaln(in:GMat, out:Mat):GMat = applyGfun(in, out, TransF.gammaln, 10L)
+  def gamma(in:GMat, out:Mat):GMat =   applyGfun(in, out, TransF.gamma, 10L)
+  def ceil(in:GMat, out:Mat):GMat =    applyGfun(in, out, TransF.ceil, 10L)
+  def floor(in:GMat, out:Mat):GMat =   applyGfun(in, out, TransF.floor, 10L)
+  def round(in:GMat, out:Mat):GMat =   applyGfun(in, out, TransF.round, 10L)
+  def trunc(in:GMat, out:Mat):GMat =   applyGfun(in, out, TransF.trunc, 10L)
+  def sign(in:GMat, out:Mat):GMat =    applyGfun(in, out, TransF.sign, 1L)
+  def exppsi(in:GMat, out:Mat):GMat =  applyGfun(in, out, TransF.exppsi, 1L)
+  
+  import GMat.TransF2
+  
+  // Two-input GPU functions with an output argument; each forwards to applyGfun2 with the
+  // matching TransF2 operation code.
+  def atan2(a:GMat, b:GMat, out:Mat):GMat =   applyGfun2(a, b, out, TransF2.atan2, 10L)
+  def pow(a:GMat, b:GMat, out:Mat):GMat =     applyGfun2(a, b, out, TransF2.pow, 10L)
+
+  // Element-wise GPU math functions without an output argument. Each forwards to the
+  // three-argument applyGfun, which creates the result GMat.
+  // abs is counted as 1 flop per element, matching abs(in, out); it was 10 here before.
+  def abs(in:GMat):GMat =     applyGfun(in, TransF.abs, 1L)
+  def exp(in:GMat):GMat =     applyGfun(in, TransF.exp, 10L)
+  def expm1(in:GMat):GMat =   applyGfun(in, TransF.expm1, 10L)
+  def sqrt(in:GMat):GMat =    applyGfun(in, TransF.sqrt, 10L)
+  def ln(in:GMat):GMat =      applyGfun(in, TransF.ln, 10L)
+  def log10(in:GMat):GMat =   applyGfun(in, TransF.log10, 10L)
+  def log1p(in:GMat):GMat =   applyGfun(in, TransF.log1p, 10L)
+  def cos(in:GMat):GMat =     applyGfun(in, TransF.cos, 10L)
+  def sin(in:GMat):GMat =     applyGfun(in, TransF.sin, 10L)
+  def tan(in:GMat):GMat =     applyGfun(in, TransF.tan, 10L)
+  def cosh(in:GMat):GMat =    applyGfun(in, TransF.cosh, 10L)
+  def sinh(in:GMat):GMat =    applyGfun(in, TransF.sinh, 10L)
+  def tanh(in:GMat):GMat =    applyGfun(in, TransF.tanh, 10L)
+  def acos(in:GMat):GMat =    applyGfun(in, TransF.acos, 10L)
+  def asin(in:GMat):GMat =    applyGfun(in, TransF.asin, 10L)
+  def atan(in:GMat):GMat =    applyGfun(in, TransF.atan, 10L)
+  def acosh(in:GMat):GMat =   applyGfun(in, TransF.acosh, 10L)
+  def asinh(in:GMat):GMat =   applyGfun(in, TransF.asinh, 10L)
+  def atanh(in:GMat):GMat =   applyGfun(in, TransF.atanh, 10L)
+  def erf(in:GMat):GMat =     applyGfun(in, TransF.erf, 10L)
+  def erfinv(in:GMat):GMat =  applyGfun(in, TransF.erfinv, 10L)
+  def erfc(in:GMat):GMat =    applyGfun(in, TransF.erfc, 10L)
+  // erfcinv matches the DMat/FMat function name. The misspelled ercinv is kept so existing
+  // callers continue to compile.
+  def erfcinv(in:GMat):GMat = applyGfun(in, TransF.erfcinv, 10L)
+  def ercinv(in:GMat):GMat =  applyGfun(in, TransF.erfcinv, 10L)
+  def gammaln(in:GMat):GMat = applyGfun(in, TransF.gammaln, 10L)
+  def gamma(in:GMat):GMat =   applyGfun(in, TransF.gamma, 10L)
+  def ceil(in:GMat):GMat =    applyGfun(in, TransF.ceil, 10L)
+  def floor(in:GMat):GMat =   applyGfun(in, TransF.floor, 10L)
+  def round(in:GMat):GMat =   applyGfun(in, TransF.round, 10L)
+  def trunc(in:GMat):GMat =   applyGfun(in, TransF.trunc, 10L)
+  def sign(in:GMat):GMat =    applyGfun(in, TransF.sign, 1L)
+  def exppsi(in:GMat):GMat =    applyGfun(in, TransF.exppsi, 1L)
+  
+  // Two-input GPU functions without an output argument; applyGfun2 creates the result GMat.
+  def atan2(a:GMat, b:GMat):GMat =   applyGfun2(a, b, TransF2.atan2, 10L)
+  def pow(a:GMat, b:GMat):GMat =     applyGfun2(a, b, TransF2.pow, 10L)
+  
+  import GMat.BinOp
+  // GPU element-wise max/min (gOp) and reductions (reduceOp) with no output matrix (null).
+  // The one-argument reductions pass 0 as the direction.
+  def max(a:GMat, b:GMat):GMat    = a.gOp(b, null, BinOp.op_max)
+  def min(a:GMat, b:GMat):GMat    = a.gOp(b, null, BinOp.op_min)
+  def maxi(a:GMat, dir:Int):GMat  = a.reduceOp(null, dir, BinOp.op_max)
+  def mini(a:GMat, dir:Int):GMat  = a.reduceOp(null, dir, BinOp.op_min)
+  def sum(a:GMat, dir:Int):GMat   = a.reduceOp(null, dir, BinOp.op_add)
+  def maxi(a:GMat):GMat           = a.reduceOp(null, 0, BinOp.op_max)
+  def mini(a:GMat):GMat           = a.reduceOp(null, 0, BinOp.op_min)
+  def sum(a:GMat):GMat            = a.reduceOp(null, 0, BinOp.op_add)
+  
+  // The same operations with a caller-supplied output matrix passed through to gOp/reduceOp.
+  def max(a:GMat, b:GMat, out:Mat):GMat    = a.gOp(b, out, BinOp.op_max)
+  def min(a:GMat, b:GMat, out:Mat):GMat    = a.gOp(b, out, BinOp.op_min)
+  def maxi(a:GMat, dir:Int, out:Mat):GMat  = a.reduceOp(out, dir, BinOp.op_max)
+  def mini(a:GMat, dir:Int, out:Mat):GMat  = a.reduceOp(out, dir, BinOp.op_min)
+  def sum(a:GMat, dir:Int, out:Mat):GMat   = a.reduceOp(out, dir, BinOp.op_add)
+  def maxi(a:GMat, out:Mat):GMat           = a.reduceOp(out, 0, BinOp.op_max)
+  def mini(a:GMat, out:Mat):GMat           = a.reduceOp(out, 0, BinOp.op_min)
+  def sum(a:GMat, out:Mat):GMat            = a.reduceOp(out, 0, BinOp.op_add)
+  
+  /*
+   * Type dispatch for abs(a, b): narrows a to FMat, DMat or GMat and calls abs again with b forwarded
+   * unchanged; each result is ascribed to the narrowed type. Other runtime types raise scala.MatchError.
+   */
+  def abs(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => abs(fa, b):FMat
+    case da:DMat => abs(da, b):DMat
+    case ga:GMat => abs(ga, b):GMat
+  }
+  
+  /*
+   * Type dispatch for sign(a, b): narrows a to FMat, DMat or GMat and calls sign again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def sign(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => sign(fa, b)
+    case da:DMat => sign(da, b)
+    case ga:GMat => sign(ga, b)
+  }
+       
+  /*
+   * Type dispatch for sqrt(a, b): narrows a to FMat, DMat or GMat and calls sqrt again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def sqrt(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => sqrt(fa, b)
+    case da:DMat => sqrt(da, b)
+    case ga:GMat => sqrt(ga, b)
+  }
+  
+  /*
+   * Type dispatch for exp(a, b): narrows a to FMat, DMat or GMat and calls exp again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def exp(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => exp(fa, b)
+    case da:DMat => exp(da, b)
+    case ga:GMat => exp(ga, b)
+  }
+  
+  /*
+   * Type dispatch for expm1(a, b): narrows a to FMat, DMat or GMat and calls expm1 again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def expm1(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => expm1(fa, b)
+    case da:DMat => expm1(da, b)
+    case ga:GMat => expm1(ga, b)
+  }
+  
+  /*
+   * Type dispatch for ln(a, b): narrows a to FMat, DMat or GMat and calls ln again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def ln(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => ln(fa, b)
+    case da:DMat => ln(da, b)
+    case ga:GMat => ln(ga, b)
+  }
+  
+  /*
+   * Type dispatch for log10(a, b): narrows a to FMat, DMat or GMat and calls log10 again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def log10(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => log10(fa, b)
+    case da:DMat => log10(da, b)
+    case ga:GMat => log10(ga, b)
+  }
+    
+  /*
+   * Type dispatch for log1p(a, b): narrows a to FMat, DMat or GMat and calls log1p again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def log1p(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => log1p(fa, b)
+    case da:DMat => log1p(da, b)
+    case ga:GMat => log1p(ga, b)
+  }
+  
+  /*
+   * Type dispatch for cos(a, b): narrows a to FMat, DMat or GMat and calls cos again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def cos(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => cos(fa, b)
+    case da:DMat => cos(da, b)
+    case ga:GMat => cos(ga, b)
+  }
+  
+  /*
+   * Type dispatch for sin(a, b): narrows a to FMat, DMat or GMat and calls sin again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def sin(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => sin(fa, b)
+    case da:DMat => sin(da, b)
+    case ga:GMat => sin(ga, b)
+  }
+  
+  /*
+   * Type dispatch for tan(a, b): narrows a to FMat, DMat or GMat and calls tan again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def tan(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => tan(fa, b)
+    case da:DMat => tan(da, b)
+    case ga:GMat => tan(ga, b)
+  }
+    
+  /*
+   * Type dispatch for cosh(a, b): narrows a to FMat, DMat or GMat and calls cosh again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def cosh(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => cosh(fa, b)
+    case da:DMat => cosh(da, b)
+    case ga:GMat => cosh(ga, b)
+  }
+     
+  /*
+   * Type dispatch for sinh(a, b): narrows a to FMat, DMat or GMat and calls sinh again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def sinh(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => sinh(fa, b)
+    case da:DMat => sinh(da, b)
+    case ga:GMat => sinh(ga, b)
+  }
+      
+  /*
+   * Type dispatch for tanh(a, b): narrows a to FMat, DMat or GMat and calls tanh again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def tanh(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => tanh(fa, b)
+    case da:DMat => tanh(da, b)
+    case ga:GMat => tanh(ga, b)
+  }
+    
+  /*
+   * Type dispatch for acos(a, b): narrows a to FMat, DMat or GMat and calls acos again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def acos(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => acos(fa, b)
+    case da:DMat => acos(da, b)
+    case ga:GMat => acos(ga, b)
+  }
+      
+  /*
+   * Type dispatch for asin(a, b): narrows a to FMat, DMat or GMat and calls asin again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def asin(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => asin(fa, b)
+    case da:DMat => asin(da, b)
+    case ga:GMat => asin(ga, b)
+  }
+  
+  /*
+   * Type dispatch for atan(a, b): narrows a to FMat, DMat or GMat and calls atan again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def atan(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => atan(fa, b)
+    case da:DMat => atan(da, b)
+    case ga:GMat => atan(ga, b)
+  }
+  
+  /*
+   * Type dispatch for acosh(a, b): narrows a to FMat, DMat or GMat and calls acosh again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def acosh(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => acosh(fa, b)
+    case da:DMat => acosh(da, b)
+    case ga:GMat => acosh(ga, b)
+  }
+  
+  /*
+   * Type dispatch for asinh(a, b): narrows a to FMat, DMat or GMat and calls asinh again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def asinh(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => asinh(fa, b)
+    case da:DMat => asinh(da, b)
+    case ga:GMat => asinh(ga, b)
+  }
+  
+  /*
+   * Type dispatch for erf(a, b): narrows a to FMat, DMat or GMat and calls erf again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def erf(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => erf(fa, b)
+    case da:DMat => erf(da, b)
+    case ga:GMat => erf(ga, b)
+  }
+   
+  /*
+   * Type dispatch for erfinv(a, b): narrows a to FMat, DMat or GMat and calls erfinv again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def erfinv(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => erfinv(fa, b)
+    case da:DMat => erfinv(da, b)
+    case ga:GMat => erfinv(ga, b)
+  }
+    
+  /*
+   * Type dispatch for erfc(a, b): narrows a to FMat, DMat or GMat and calls erfc again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def erfc(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => erfc(fa, b)
+    case da:DMat => erfc(da, b)
+    case ga:GMat => erfc(ga, b)
+  }
+   
+  /*
+   * Type dispatch for erfcinv(a, b): narrows a to FMat, DMat or GMat and calls erfcinv again with b
+   * forwarded unchanged. Other runtime types of a raise scala.MatchError.
+   * NOTE(review): confirm an erfcinv(GMat, Mat) overload exists; without one the GMat case would
+   * resolve back to this method.
+   */
+  def erfcinv(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => erfcinv(fa, b)
+    case da:DMat => erfcinv(da, b)
+    case ga:GMat => erfcinv(ga, b)
+  }
+  
+  /*
+   * Type dispatch for gamma(a, b): narrows a to FMat, DMat or GMat and calls gamma again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def gamma(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => gamma(fa, b)
+    case da:DMat => gamma(da, b)
+    case ga:GMat => gamma(ga, b)
+  }
+    
+  /*
+   * Type dispatch for gammaln(a, b): narrows a to FMat, DMat or GMat and calls gammaln again with b
+   * forwarded unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def gammaln(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => gammaln(fa, b)
+    case da:DMat => gammaln(da, b)
+    case ga:GMat => gammaln(ga, b)
+  }
+  
+  /*
+   * Type dispatch for floor(a, b): narrows a to FMat, DMat or GMat and calls floor again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def floor(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => floor(fa, b)
+    case da:DMat => floor(da, b)
+    case ga:GMat => floor(ga, b)
+  }
+  
+  /*
+   * Type dispatch for ceil(a, b): narrows a to FMat, DMat or GMat and calls ceil again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def ceil(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => ceil(fa, b)
+    case da:DMat => ceil(da, b)
+    case ga:GMat => ceil(ga, b)
+  }
+   
+  /*
+   * Type dispatch for round(a, b): narrows a to FMat, DMat or GMat and calls round again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def round(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => round(fa, b)
+    case da:DMat => round(da, b)
+    case ga:GMat => round(ga, b)
+  }
+  
+  /*
+   * Type dispatch for trunc(a, b): narrows a to FMat, DMat or GMat and calls trunc again with b forwarded
+   * unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def trunc(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => trunc(fa, b)
+    case da:DMat => trunc(da, b)
+    case ga:GMat => trunc(ga, b)
+  }
+  
+  /*
+   * Type dispatch for exppsi(a, b): narrows a to FMat, DMat or GMat and calls exppsi again with b
+   * forwarded unchanged. Other runtime types of a raise scala.MatchError.
+   */
+  def exppsi(a:Mat, b:Mat):Mat = a match {
+    case fa:FMat => exppsi(fa, b)
+    case da:DMat => exppsi(da, b)
+    case ga:GMat => exppsi(ga, b)
+  }
+  
+  /*
+   * Type dispatch for atan2(a, b, c): requires a and b to share one of the types FMat, DMat or GMat and
+   * calls atan2 again with both narrowed, c forwarded unchanged. Mixed or other types raise scala.MatchError.
+   */
+  def atan2(a:Mat, b:Mat, c:Mat):Mat = (a, b) match {
+    case (fa:FMat, fb:FMat) => atan2(fa, fb, c)
+    case (da:DMat, db:DMat) => atan2(da, db, c)
+    case (ga:GMat, gb:GMat) => atan2(ga, gb, c)
+  }
+  
+  /*
+   * Type dispatch for pow(a, b, c): requires a and b to share one of the types FMat, DMat or GMat and
+   * calls pow again with both narrowed, c forwarded unchanged. Mixed or other types raise scala.MatchError.
+   */
+  def pow(a:Mat, b:Mat, c:Mat):Mat = (a, b) match {
+    case (fa:FMat, fb:FMat) => pow(fa, fb, c)
+    case (da:DMat, db:DMat) => pow(da, db, c)
+    case (ga:GMat, gb:GMat) => pow(ga, gb, c)
+  }
+}
+
+
+
+
+
+
diff --git a/src/main/scala/BIDMat/Solvers.scala b/src/main/scala/BIDMat/Solvers.scala
new file mode 100755
index 00000000..00cbc126
--- /dev/null
+++ b/src/main/scala/BIDMat/Solvers.scala
@@ -0,0 +1,334 @@
+package BIDMat
+import edu.berkeley.bid.CBLAS._
+import edu.berkeley.bid.LAPACK._
+import MatFunctions._
+import SciFunctions._
+
+object Solvers { 
+
+  // Typed front-ends for _inv: each delegates to _inv and casts the result back to the argument's type.
+  def inv(a:FMat):FMat = _inv(a).asInstanceOf[FMat]
+  def inv(a:DMat):DMat = _inv(a).asInstanceOf[DMat]
+  def inv(a:CMat):CMat = _inv(a).asInstanceOf[CMat]
+  def inv(a:Mat):Mat = _inv(a)
+   
+  /*
+   * Matrix inverse on a copy of a: calls the *getrf routine and then *getri for DMat, FMat or CMat.
+   * The flop counter is bumped before validation. A non-square a raises RuntimeException; any other
+   * matrix type raises scala.MatchError. The values returned by the LAPACK calls are not inspected.
+   */
+  def _inv(a:Mat):Mat = {
+    Mat.nflops += 4L*a.nrows*a.nrows*a.nrows/3
+    if (a.nrows != a.ncols) throw new RuntimeException("inv needs a square matrix")
+    val n = a.nrows
+    val result = a.copy
+    val pivots = new Array[Int](n)
+    result match {
+      case dm:DMat =>
+        dgetrf(ORDER.ColMajor, n, a.ncols, dm.data, n, pivots)
+        dgetri(ORDER.ColMajor, n, dm.data, n, pivots)
+      case fm:FMat =>
+        sgetrf(ORDER.ColMajor, n, a.ncols, fm.data, n, pivots)
+        sgetri(ORDER.ColMajor, n, fm.data, n, pivots)
+      case cm:CMat =>
+        cgetrf(ORDER.ColMajor, n, a.ncols, cm.data, n, pivots)
+        cgetri(ORDER.ColMajor, n, cm.data, n, pivots)
+    }
+    result
+  }
+
+  // Typed front-ends for _seig: delegate and cast both elements of the returned pair to the argument's type.
+  def seig(a:FMat, getVecs:Boolean):(FMat, FMat) = {val (d,out) = _seig(a, getVecs); (d.asInstanceOf[FMat], out.asInstanceOf[FMat])}
+  def seig(a:DMat, getVecs:Boolean):(DMat, DMat) = {val (d,out) = _seig(a, getVecs); (d.asInstanceOf[DMat], out.asInstanceOf[DMat])}
+  def seig(a:Mat, getVecs:Boolean):(Mat, Mat) = _seig(a, getVecs)
+  
+  // Same, with getVecs fixed to true.
+  def seig(a:FMat):(FMat, FMat) = {val (d,out) = _seig(a, true); (d.asInstanceOf[FMat], out.asInstanceOf[FMat])} 
+  def seig(a:DMat):(DMat, DMat) = {val (d,out) = _seig(a, true); (d.asInstanceOf[DMat], out.asInstanceOf[DMat])}
+  def seig(a:Mat):(Mat, Mat) = _seig(a, true)
+  
+  /*
+   * Eigen-decomposition driver for DMat/FMat inputs, using the "U" (upper) triangle: *sytrd, then *orgtr,
+   * then *steqr on a copy of a. Returns (d, out), where d is the nrows x 1 vector filled by the LAPACK
+   * calls and out is the work copy. getVecs selects the "V" or "N" job flag of *steqr; with "N" the
+   * second element is whatever *orgtr left in the copy.
+   * A non-square a raises RuntimeException; other matrix types raise scala.MatchError.
+   */
+  def _seig(a:Mat, getVecs:Boolean):(Mat, Mat) = {
+    Mat.nflops += 6L*a.nrows*a.nrows*a.nrows 
+    if (a.nrows != a.ncols) throw new RuntimeException("eig needs a square matrix")
+    val n = a.nrows
+    val vecs = a.copy
+    val diag = a.zeros(n,1)
+    val offdiag = a.zeros(n,1)
+    val reflectors = a.zeros(n,1)
+    val job = if (getVecs) "V" else "N"
+    (vecs, diag, offdiag, reflectors) match {
+      case (dv:DMat, dd:DMat, de:DMat, dt:DMat) =>
+        dsytrd(ORDER.ColMajor, "U", n, dv.data, n, dd.data, de.data, dt.data)
+        dorgtr(ORDER.ColMajor, "U", n, dv.data, n, dt.data)
+        dsteqr(ORDER.ColMajor, job, n, dd.data, de.data, dv.data, n)
+      case (fv:FMat, fd:FMat, fe:FMat, ft:FMat) =>
+        ssytrd(ORDER.ColMajor, "U", n, fv.data, n, fd.data, fe.data, ft.data)
+        sorgtr(ORDER.ColMajor, "U", n, fv.data, n, ft.data)
+        ssteqr(ORDER.ColMajor, job, n, fd.data, fe.data, fv.data, n)
+    }
+    (diag, vecs)
+  }
+
+  // General eigen-decomposition with eigenvectors requested (getVecs = true).
+  def geig(a:Mat):(CMat, CMat) = geig(a, true)
+  
+  /*
+   * General (non-symmetric) eigen-decomposition. The input is converted to a CMat and passed through
+   * the LAPACK chain cgebal -> cgehrd -> cunghr -> chseqr; when getVecs is set, ctrevc and cgebak follow.
+   * Returns (w, z): w is the nrows x 1 vector filled by chseqr, z the matrix filled by chseqr and,
+   * when getVecs is set, overwritten by the final product below.
+   * Throws RuntimeException when the input is not square.
+   */
+  def geig(in:Mat, getVecs:Boolean):(CMat, CMat) = {
+    Mat.nflops += 10L*in.nrows*in.nrows*in.nrows 
+    if (in.nrows != in.ncols) {
+      throw new RuntimeException("eig needs a square matrix")
+    } else {
+      // ilo/ihi are one-element arrays so cgebal can write its results back.
+      val ilo = new Array[Int](1)
+      val ihi = new Array[Int](1)
+      val a = CMat(in)
+      val scale = ones(a.nrows,1)
+      val tau = a.zeros(a.nrows,1)
+      val w = a.zeros(a.nrows, 1)
+      val mm = a.nrows
+      ilo(0) = 1; ihi(0) = a.nrows;
+      cgebal(ORDER.ColMajor, "S", a.nrows, a.data, a.nrows, ilo, ihi, scale.data)
+      cgehrd(ORDER.ColMajor, a.nrows, ilo(0), ihi(0), a.data, a.nrows, tau.data)
+      // q starts as a copy of the cgehrd output and is handed to cunghr together with tau.
+      val q = a.copy
+      cunghr(ORDER.ColMajor, a.nrows, ilo(0), ihi(0), q.data, a.nrows, tau.data)
+      val z = q.copy
+      chseqr(ORDER.ColMajor, "S", "I", a.nrows, ilo(0), ihi(0), a.data, a.nrows, w.data, z.data, a.nrows)
+      if (getVecs) {
+      	Mat.nflops += 50L*in.nrows*in.nrows*in.nrows 
+      	val m = new Array[Int](1)
+      	val select = IMat(in.nrows, 1)
+      	val vl = a.zeros(a.nrows, 1)
+      	val ee = z.copy
+      	ctrevc(ORDER.ColMajor, "R", "A", select.data, a.nrows, a.data, a.nrows, vl.data, 1, ee.data, a.nrows,  mm, m)
+      	cgebak(ORDER.ColMajor, "S", "R", a.nrows, ilo(0), ihi(0), scale.data, mm, ee.data, a.nrows);
+      	// presumably `z ~ expr` stores the product into z in place -- confirm against the Mat `~` operator.
+      	z ~ q * (z * ee);
+      }
+      (w, z)
+    }
+  }
+  
+  // Typed front-ends for _feig: delegate and cast both elements of the returned pair to the argument's type.
+  def feig(a:FMat):(FMat, FMat) = {val (w,out) = _feig(a) ; (w.asInstanceOf[FMat], out.asInstanceOf[FMat])}
+  def feig(a:DMat):(DMat, DMat) = {val (w,out) = _feig(a) ; (w.asInstanceOf[DMat], out.asInstanceOf[DMat])}
+  def feig(a:Mat):(Mat, Mat) = _feig(a)
+  
+  /*
+   * Faster, divide-and-conquer eigen-decomposition (the original note says: for pos definite matrices).
+   * Calls *syevd with job "V" and triangle "U" on a copy of a. Returns (w, out): w is the nrows x 1
+   * vector filled by *syevd and out is the overwritten copy.
+   * A non-square a raises RuntimeException; types other than DMat/FMat raise scala.MatchError.
+   */
+  def _feig(a:Mat):(Mat, Mat) = {
+    Mat.nflops += 3L*a.nrows*a.nrows*a.nrows
+    if (a.nrows != a.ncols) throw new RuntimeException("feig needs a square matrix")
+    val n = a.nrows
+    val vecs = a.copy
+    val vals = a.zeros(n,1)
+    (vecs, vals) match {
+      case (dv:DMat, dw:DMat) => dsyevd(ORDER.ColMajor, "V", "U", n, dv.data, n, dw.data)
+      case (fv:FMat, fw:FMat) => ssyevd(ORDER.ColMajor, "V", "U", n, fv.data, n, fw.data)
+    }
+    (vals, vecs)
+  }
+  /*
+   * Standard QR decomposition. Given m x n input A, return m x m orthonormal Q and m x n upper-triangular R. 
+   */
+  
+  // Typed front-ends for _QRdecomp: delegate and cast (q, r) back to the argument's matrix type.
+  def QRdecomp(a:FMat):(FMat, FMat) = {val (q,r) = _QRdecomp(a); (q.asInstanceOf[FMat], r.asInstanceOf[FMat])}
+  def QRdecomp(a:DMat):(DMat, DMat) = {val (q,r) = _QRdecomp(a); (q.asInstanceOf[DMat], r.asInstanceOf[DMat])}
+  def QRdecomp(a:CMat):(CMat, CMat) = {val (q,r) = _QRdecomp(a); (q.asInstanceOf[CMat], r.asInstanceOf[CMat])}
+  def QRdecomp(a:Mat):(Mat, Mat) = _QRdecomp(a)
+  
+  /*
+   * Full QR factorization of an m x n matrix a (FMat, DMat or CMat): *geqrf factors a copy of a, the
+   * reflector columns are copied into an m x m matrix and *orgqr / cungqr expands them into Q.
+   * Returns (q, r) with q m x m and r the m x n factor with its lower part cleared.
+   *
+   * Only k = min(m, n) reflectors exist. The original code copied n columns into the m x m Q and passed
+   * K = n to *orgqr, which is out of range when n > m; wide inputs now use k. For n <= m the calls are
+   * unchanged. Other matrix types raise scala.MatchError.
+   */
+  def _QRdecomp(a:Mat):(Mat, Mat) = {
+    Mat.nflops += 4L*a.nrows*a.ncols*math.min(a.nrows, a.ncols)
+    val m = a.nrows
+    val n = a.ncols
+    val k = math.min(m, n)   // number of Householder reflectors produced by *geqrf
+    val r = a.copy
+    val q = a.zeros(m,m)
+    val tau = a.zeros(math.max(a.nrows, a.ncols), 1)
+    (r, q, tau) match {
+    case (fr:FMat, fq:FMat, ftau:FMat) => {
+      sgeqrf(ORDER.ColMajor, m, n, fr.data, m, ftau.data)
+      // Q only has m columns, so a wide input contributes just its first k columns.
+      if (n > m) { fq(?,0->k) = fr(?,0->k) } else { fq(?,0->n) = fr }
+      sorgqr(ORDER.ColMajor, m, m, k, fq.data, m, ftau.data)
+    }
+    case (dr:DMat, dq:DMat, dtau:DMat) => {
+      dgeqrf(ORDER.ColMajor, m, n, dr.data, m, dtau.data)
+      if (n > m) { dq(?,0->k) = dr(?,0->k) } else { dq(?,0->n) = dr }
+      dorgqr(ORDER.ColMajor, m, m, k, dq.data, m, dtau.data)
+    }
+    case (cr:CMat, cq:CMat, ctau:CMat) => {
+      cgeqrf(ORDER.ColMajor, m, n, cr.data, m, ctau.data)
+      if (n > m) { cq(?,0->k) = cr(?,0->k) } else { cq(?,0->n) = cr }
+      cungqr(ORDER.ColMajor, m, m, k, cq.data, m, ctau.data)
+    }
+    }
+    r.clearLower    
+    (q, r)
+  }
+  
+  /*
+   * Thin QR decomposition. Given m x n input A, return m x n orthonormal Q and n x n upper triangular R. 
+   */
+  
+  // Typed front-ends for _QRdecompt: delegate and cast (q, r) back to the argument's matrix type.
+  def QRdecompt(a:FMat):(FMat, FMat) = {val (q,r) = _QRdecompt(a); (q.asInstanceOf[FMat], r.asInstanceOf[FMat])}
+  def QRdecompt(a:DMat):(DMat, DMat) = {val (q,r) = _QRdecompt(a); (q.asInstanceOf[DMat], r.asInstanceOf[DMat])}
+  def QRdecompt(a:CMat):(CMat, CMat) = {val (q,r) = _QRdecompt(a); (q.asInstanceOf[CMat], r.asInstanceOf[CMat])}
+  def QRdecompt(a:Mat):(Mat, Mat) = _QRdecompt(a)
+  
+  /*
+   * Thin QR through the Gram matrix: forms a' * a with *gemm (n x n), takes r = chol(a' * a).t and
+   * q = a * inv(r), and returns (q, r). Matrix types other than FMat/DMat/CMat raise scala.MatchError.
+   * NOTE(review): the CMat branch uses TRANSPOSE.Trans (a plain transpose) and `.t`; a complex input
+   * would presumably need the conjugate transpose in both places -- confirm before relying on it.
+   */
+  def _QRdecompt(a:Mat):(Mat, Mat) = {
+    val rows = a.nrows
+    val cols = a.ncols
+    val gram = a.zeros(a.ncols, a.ncols)
+    (a, gram) match {
+      case (fa:FMat, fg:FMat) =>
+        sgemm(ORDER.ColMajor, TRANSPOSE.Trans, TRANSPOSE.NoTrans, cols, cols, rows, 1f, fa.data, rows, fa.data, rows, 0f, fg.data, cols)
+      case (da:DMat, dg:DMat) =>
+        dgemm(ORDER.ColMajor, TRANSPOSE.Trans, TRANSPOSE.NoTrans, cols, cols, rows, 1f, da.data, rows, da.data, rows, 0f, dg.data, cols)
+      case (ca:CMat, cg:CMat) =>
+        val alpha = CMat.celem(1,0)
+        val beta = CMat.celem(0,0)
+        cgemm(ORDER.ColMajor, TRANSPOSE.Trans, TRANSPOSE.NoTrans, cols, cols, rows, alpha.data, ca.data, rows, ca.data, rows, beta.data, cg.data, cols)
+    }
+    Mat.nflops += 2L*a.ncols*a.ncols*a.nrows
+    val r = chol(gram).t
+    val q = a * inv(r)
+    (q, r)
+  }
+  
+  // Typed front-ends for _chol: each delegates to _chol and casts the result back to the argument's type.
+  def chol(a:FMat):FMat = _chol(a).asInstanceOf[FMat]
+  def chol(a:DMat):DMat = _chol(a).asInstanceOf[DMat]
+  def chol(a:CMat):CMat = _chol(a).asInstanceOf[CMat]
+  def chol(a:Mat):Mat = _chol(a) 
+  
+  /*
+   * Cholesky factorization: runs *potrf with triangle "L" on a copy of a, clears the copy's upper part
+   * and returns it. The flop counter is bumped before validation.
+   * A non-square a raises RuntimeException; types other than DMat/FMat/CMat raise scala.MatchError.
+   */
+  def _chol(a:Mat):Mat = {
+    Mat.nflops += 1L*a.nrows*a.nrows*a.nrows/3
+    if (a.nrows != a.ncols) throw new RuntimeException("chol needs a square matrix")
+    val n = a.nrows
+    val factor = a.copy
+    factor match {
+      case dm:DMat => dpotrf(ORDER.ColMajor, "L", n, dm.data, n)
+      case fm:FMat => spotrf(ORDER.ColMajor, "L", n, fm.data, n)
+      case cm:CMat => cpotrf(ORDER.ColMajor, "L", n, cm.data, n)
+    }
+    factor.clearUpper
+    factor
+  }
+  
+  /*
+   * Trisolve solves A x = r, for triangular A. Mode string argument is 3 characters. 
+   * Char1 = "U" or "L" for upper or lower-triangular input.
+   * Char2 = "N", "T" or "C" for A not-transposed, transposed or conjugate-transposed respectively. 
+   * Char3 = "N" or "U" whether the leading diagonal is non-unit "N" or unit "U" respectively. 
+   */
+  // Typed front-ends for _trisolve with an explicit mode string; the result is cast to the argument's type.
+  def trisolve(a:DMat, r:DMat, mode:String):DMat = _trisolve(a, r, mode).asInstanceOf[DMat]
+  def trisolve(a:FMat, r:FMat, mode:String):FMat = _trisolve(a, r, mode).asInstanceOf[FMat]
+  def trisolve(a:CMat, r:CMat, mode:String):CMat = _trisolve(a, r, mode).asInstanceOf[CMat]
+  def trisolve(a:Mat, r:Mat, mode:String):Mat = _trisolve(a, r, mode)
+  
+  /*
+   * Triangular solve: copies the right-hand side r and passes the copy to *trtrs together with a and
+   * the mode string, then returns the copy (r itself is not passed to LAPACK).
+   * Throws RuntimeException when a is not square or when a.ncols differs from r.nrows; the second
+   * message used to say "same ncols", which did not describe the check being made.
+   * Type pairs other than DMat/FMat/CMat raise scala.MatchError.
+   */
+  def _trisolve(a:Mat, r:Mat, mode:String):Mat = {
+    if (a.nrows != a.ncols) {
+      throw new RuntimeException("tsolve a must be square")
+    }
+    if (a.ncols != r.nrows) {
+      throw new RuntimeException("tsolve matrix ncols must equal rhs nrows")
+    }
+    val out = r.copy
+    Mat.nflops += 1L*a.nrows*a.nrows*r.ncols
+    (a, out) match {
+      case (da:DMat, dout:DMat) => dtrtrs(ORDER.ColMajor, mode, a.nrows, r.ncols, da.data, a.nrows, dout.data, out.nrows)
+      case (fa:FMat, fout:FMat) => strtrs(ORDER.ColMajor, mode, a.nrows, r.ncols, fa.data, a.nrows, fout.data, out.nrows)
+      case (ca:CMat, cout:CMat) => ctrtrs(ORDER.ColMajor, mode, a.nrows, r.ncols, ca.data, a.nrows, cout.data, out.nrows)
+    }  
+    out
+  }
+
+  // Convenience overloads using mode "UNN" (upper-triangular, not transposed, non-unit diagonal).
+  def trisolve(a:DMat, r:DMat):DMat = _trisolve(a, r, "UNN").asInstanceOf[DMat]
+  def trisolve(a:FMat, r:FMat):FMat = _trisolve(a, r, "UNN").asInstanceOf[FMat]
+  def trisolve(a:CMat, r:CMat):CMat = _trisolve(a, r, "UNN").asInstanceOf[CMat]
+  def trisolve(a:Mat, r:Mat):Mat = _trisolve(a, r, "UNN")
+  
+  /*
+   * Shifts the columns of mat left by step, in place: column c is copied onto column c-step for
+   * c = step .. ncols-1, in ascending order. The last step columns keep their previous contents.
+   */
+  def shiftLeft(mat:FMat, step:Int) = {
+    val colLen = mat.nrows
+    for (src <- step until mat.ncols) {
+      System.arraycopy(mat.data, src*colLen, mat.data, (src-step)*colLen, colLen)
+    }
+  }
+  
+  /*
+   * Shifts the columns of mat right by step, in place: column c-step is copied onto column c for
+   * c = ncols-1 down to step, in descending order. The first step columns keep their previous contents.
+   */
+  def shiftRight(mat:FMat, step:Int) = {
+    val colLen = mat.nrows
+    for (dst <- (mat.ncols - 1) to step by -1) {
+      System.arraycopy(mat.data, (dst-step)*colLen, mat.data, dst*colLen, colLen)
+    }
+  }
+  
+  /*
+   * Block GMRES-style iterative solver with restarts (name-based reading; the structure below matches).
+   *   A, b  - system matrix and right-hand side; the residual is formed as b - A*x.
+   *   nrst  - maximum number of outer (restart) iterations.
+   *   m     - maximum number of inner iterations per restart.
+   *   s     - block size: number of columns in each block of V and H.
+   *   tol   - stopping threshold applied to |ex(i)| / norm(b).
+   * The starting x is the first column of a random n x s matrix R. Progress values are printed with printf.
+   * Returns (x, R, H, V).
+   */
+  def blgmres(A:FMat, b:FMat, nrst:Int, m:Int, s:Int, tol:Float) = {
+    val n = A.nrows
+    val R = normrnd(0, 1, n, s)
+    val H = A.zeros(s*(m+1), s*m)
+    val V = A.zeros(n, s*(m+1))
+    val e1 = A.zeros(s*(m+1),1)
+    e1(0,0) = 1
+    val rots = new Array[FMat](m)
+    val bnorm = norm(b)
+    var x = R(?,0)
+    var done = false
+    
+    // Index ranges covering block i, and blocks i and i+1 together.
+    def blk(i:Int) = i*s->(i+1)*s
+    def blk2(i:Int) = i*s->(i+2)*s
+    
+    var irestart = 0
+    while (irestart < nrst && !done) {
+      // The current residual replaces the first column of R before orthonormalization.
+      val res = b - A*x 
+      R(?,0) = res
+      var (vj, r) = QRdecompt(R)
+      V(?, 0 -> s) = vj
+      var ex = r(0,0)*e1
+      var j = 0
+      while (j < m && !done) {
+        var Uj = A * vj
+        var k = 0
+        // Orthogonalize the new block against all previous blocks of V, recording the coefficients in H.
+        while (k <= j) {
+        	val Vl = V(?, blk(k))
+        	val Hj = Vl.t * Uj
+        	Uj = Uj - Vl * Hj
+        	H(blk(k), blk(j)) = Hj
+        	k += 1
+        }
+        val (vjp, hjp) = QRdecompt(Uj)
+        H(blk(j+1), blk(j)) = hjp
+        V(?, blk(j+1)) = vjp
+        vj = vjp
+        k = 0
+        while (k < j) {     // Apply blocked Givens rotations
+          H(blk2(k), blk(j)) = rots(k) * H(blk2(k), blk(j))
+          k += 1
+        }
+        // The QR of the current 2s x s panel gives this step's rotation and the triangular panel.
+        var (rot, tri) = QRdecomp(H(blk2(j), blk(j)))
+        H(blk2(j), blk(j)) = tri
+        rots(j) = rot.t
+        ex(blk2(j),0) = rots(j) * ex(blk2(j),0)
+        k = 0
+        while (k < s && !done) {
+          val ihere = j*s+k
+          printf("%f ", ex(ihere,0));
+          // Stop as soon as one entry of ex, relative to norm(b), falls below tol.
+          if (math.abs(ex(ihere,0))/bnorm < tol) {
+            val ym = trisolve(H(0->ihere, 0->ihere), ex(0->ihere,0))
+            x = x + V(?,0->ihere) * ym
+            done = true;
+          }
+          k += 1
+        }
+        printf("\n");
+        j += 1
+      } 
+      if (!done) {
+      	// No entry met tol: update x from all s*m columns, and for s > 1 keep the correction in R.
+      	val ym = trisolve(H(0->s*m,?), ex(0->s*m,0))
+      	val zi = V(?,0->s*m) * ym
+      	x = x + zi
+      	if (s > 1) {
+      		shiftRight(R, 1)
+      		R(?, 1) = zi
+      	}
+      }
+      irestart += 1
+    }
+    (x, R, H, V)
+  }
+
+}
diff --git a/src/main/scala/BIDMat/SparseMat.scala b/src/main/scala/BIDMat/SparseMat.scala
new file mode 100755
index 00000000..5a4996ac
--- /dev/null
+++ b/src/main/scala/BIDMat/SparseMat.scala
@@ -0,0 +1,764 @@
+package BIDMat
+
+class SparseMat[@specialized(Double,Float) T]
+(nr: Int, nc: Int, var nnz0:Int, var ir:Array[Int], val jc:Array[Int], val data:Array[T])
+(implicit manifest:Manifest[T], numeric:Numeric[T]) extends Mat(nr, nc) {
+  
+  // Number of stored entries; backed by the mutable constructor field nnz0.
+  override def nnz = nnz0
+  
+  /*
+   * Bounds-checked matrix access
+   */	
+  def apply(r0:Int, c0:Int):T = {
+    // Convert the caller's indices (base Mat.oneBased) to zero-based, validate, then read via get_.
+    val base = Mat.oneBased
+    val row = r0 - base
+    val col = c0 - base
+    val inRange = row >= 0 && row < nrows && col >= 0 && col < ncols
+    if (!inRange) {
+      throw new IndexOutOfBoundsException("("+(row+base)+","+(col+base)+") vs ("+nrows+","+ncols+")")
+    }
+    get_(row, col)
+  }
+  /*
+   * Internal (unchecked) accessor
+   */
+  def get_(r:Int, c:Int):T = {
+    // r and c are zero-based and unchecked. Column c's entries occupy data(jc(c)-ioff) up to, but not
+    // including, data(jc(c+1)-ioff).
+    val ioff = Mat.ioneBased
+    val colStart = jc(c)-ioff
+    val colEnd = jc(c+1)-ioff
+    val ix = if (ir != null) {
+      // Explicit row indices: search this column's slice (a negative result is treated as "not stored").
+      Mat.ibinsearch(r+ioff, ir, colStart, colEnd)
+    } else if (r < colEnd - colStart) {
+      // Implicit rows: entry j of a column is row j, so row r sits r places after the column start.
+      // The original computed r+ioff-jc(c), which is only right for column 0 and was never bounded.
+      colStart + r
+    } else {
+      -1
+    }
+    if (ix >= 0) data(ix) else numeric.zero
+  }	
+  /*
+   * Update a matrix value, m(r,c) = v
+   */
+  def update(r0:Int, c0:Int, v:T):T = {
+    // Convert the caller's indices (base Mat.oneBased) to zero-based, validate, store via set_, return v.
+    val base = Mat.oneBased
+    val row = r0 - base
+    val col = c0 - base
+    val inRange = row >= 0 && row < nrows && col >= 0 && col < ncols
+    if (!inRange) {
+      throw new IndexOutOfBoundsException("("+(row+base)+","+(col+base)+") vs ("+nrows+","+ncols+")")
+    }
+    set_(row, col, v)
+    v
+  }
+  /*
+   * Internal (unchecked) setter
+   */
+  def set_(r:Int, c:Int, v:T) = {
+    // r and c are zero-based and unchecked. Only entries already stored can be overwritten; a missing
+    // position raises RuntimeException.
+    val ioff = Mat.ioneBased
+    val colStart = jc(c)-ioff
+    val colEnd = jc(c+1)-ioff
+    val ix = if (ir != null) {
+      Mat.ibinsearch(r+ioff, ir, colStart, colEnd)
+    } else if (r < colEnd - colStart) {
+      // Implicit rows: row r of column c is r places after the column start. The original computed
+      // r+ioff-jc(c), which is only right for column 0 and was never bounded.
+      colStart + r
+    } else {
+      -1
+    }
+    if (ix >= 0) data(ix) = v 
+    else throw new RuntimeException("Can't set missing values")
+  }		
+  
+  /*
+   * Materializes the row-index array when the matrix uses implicit rows (ir == null): entry j of each
+   * column gets row index j (plus the internal index base). Does nothing if ir is already present.
+   * The original inner bound was `jc(i)+1` instead of `jc(i+1)`, so only the first entry of every
+   * column received an index.
+   */
+  def explicitInds = {
+    if (ir == null) {
+      val ioff = Mat.ioneBased
+      val rows = new Array[Int](nnz)
+      var col = 0
+      while (col < ncols) {
+        val start = jc(col)-ioff
+        val count = jc(col+1)-jc(col)
+        var j = 0
+        while (j < count) {
+          rows(start+j) = j+ioff
+          j += 1
+        }
+        col += 1
+      }
+      // Publish the array only once it is completely filled.
+      ir = rows
+    }
+  }
+  /*
+   * Transpose
+   */
+  def gt:SparseMat[T] = {
+    // Rebuild the matrix through sparseImpl with the row and column index lists swapped and the
+    // dimensions reversed; row indices are shifted by decInds when the internal base is 1.
+    explicitInds
+    val colIdx = SparseMat.uncompressInds(jc, ir)
+    val rowIdx = if (Mat.ioneBased==1) SparseMat.decInds(ir) else ir
+    SparseMat.sparseImpl[T](colIdx, rowIdx, data, ncols, nrows)
+  }
+  /*
+   * Stack matrices vertically
+   */
+  def vertcat(a:SparseMat[T]):SparseMat[T] = {
+    // Column by column, this matrix's entries are written first, then a's entries with their row
+    // indices shifted by nrows. If either operand has explicit row indices, both are made explicit.
+    if (ncols != a.ncols) throw new RuntimeException("ncols must match")
+    if (ir != null) a.explicitInds
+    if (a.ir != null) explicitInds
+    val total = nnz+a.nnz
+    val out = if (ir != null) SparseMat[T](nrows+a.nrows, ncols, total) else SparseMat.noRows[T](nrows+a.nrows, ncols, total)
+    val ioff = Mat.ioneBased
+    var next = 0
+    var col = 0
+    out.jc(0) = ioff
+    while (col < ncols) {
+      var p = jc(col)-ioff
+      val pEnd = jc(col+1)-ioff
+      while (p < pEnd) {
+        if (out.ir != null) out.ir(next) = ir(p)
+        out.data(next) = data(p)
+        next += 1
+        p += 1
+      }
+      p = a.jc(col)-ioff
+      val qEnd = a.jc(col+1)-ioff
+      while (p < qEnd) {
+        if (out.ir != null) out.ir(next) = a.ir(p) + nrows
+        out.data(next) = a.data(p)
+        next += 1
+        p += 1
+      }
+      out.jc(col+1) = next+ioff
+      col += 1
+    }
+    out
+  }
+  
+  /*
+   * Stack matrices horizontally
+   */	
+  
+  def horzcat(a:SparseMat[T]):SparseMat[T] =
+    if (nrows != a.nrows) {
+      throw new RuntimeException("nrows must match")
+    } else {
+      // If either operand has explicit row indices, make both explicit so the arrays can be joined.
+      if (ir != null) a.explicitInds
+      if (a.ir != null) explicitInds
+      // The result has the same row count and ncols + a.ncols columns. The original allocated
+      // (nrows+a.nrows) x ncols, which left out.jc too short for the writes below.
+      val out = if (ir != null) {
+        SparseMat[T](nrows, ncols+a.ncols, nnz+a.nnz)
+      } else {
+        SparseMat.noRows[T](nrows, ncols+a.ncols, nnz+a.nnz)
+      }
+      // Values and row indices are plain concatenations: this matrix first, then a.
+      System.arraycopy(data, 0, out.data, 0, nnz)
+      System.arraycopy(a.data, 0, out.data, nnz, a.nnz)
+      if (out.ir != null) {
+        System.arraycopy(ir, 0, out.ir, 0, nnz)
+        System.arraycopy(a.ir, 0, out.ir, nnz, a.nnz)
+      }
+      // Column pointers: ours are copied as they are; a's are offset by our entry count.
+      System.arraycopy(jc, 0, out.jc, 0, ncols+1)
+      for (i <- 1 to a.ncols) {
+        out.jc(i+ncols) = a.jc(i) + nnz
+      }
+      out
+    }
+  
+  /*
+   * Find (single, linear) indices for all non-zero elements
+   */
+  def gfind:IMat = {
+    // One linear index per stored entry: row + col*nrows, reported in the external base Mat.oneBased.
+    // With implicit rows (ir == null) the row is the entry's position within its column.
+    val inds = IMat(nnz, 1)
+    val ioff = Mat.ioneBased
+    val off = Mat.oneBased
+    var col = 0
+    while (col < ncols) {
+      val start = jc(col)-ioff
+      val end = jc(col+1)-ioff
+      val colBase = col*nrows
+      val explicitRows = ir != null
+      var p = start
+      while (p < end) {
+        val row = if (explicitRows) ir(p)-ioff else p-start
+        inds.data(p) = row+off + colBase
+        p += 1
+      }
+      col += 1
+    }
+    inds
+  }
+  /*
+   * Find indices (i,j) for non-zero elements
+   */	
+  def gfind2:(IMat, IMat) = {
+    // Row indices are produced per stored entry (external base Mat.oneBased). Column indices come from
+    // uncompressInds and are copied as they are when the external base is 0, or passed through incInds
+    // otherwise.
+    val rowOut = IMat(nnz, 1)
+    val colOut = IMat(nnz, 1)
+    val ioff = Mat.ioneBased
+    val off = Mat.oneBased
+    var col = 0
+    while (col < ncols) {
+      val start = jc(col)-ioff
+      val end = jc(col+1)-ioff
+      val explicitRows = ir != null
+      var p = start
+      while (p < end) {
+        val row = if (explicitRows) ir(p)-ioff else p-start
+        rowOut.data(p) = row+off
+        p += 1
+      }
+      col += 1
+    }
+    val cols = SparseMat.uncompressInds(jc, ir)
+    if (off == 0) {
+      System.arraycopy(cols, 0, colOut.data, 0, nnz)
+    } else {
+      SparseMat.incInds(cols, colOut.data)
+    }
+    (rowOut, colOut)
+  }
+  /*
+   * Find indices and values (i,j,v) for non-zero elements
+   */	
+  def gfind3:(IMat, IMat, DenseMat[T]) = {
+    // Index pair from gfind2, plus a dense nnz x 1 copy of the stored values in storage order.
+    val vals = new DenseMat[T](nnz,1)
+    val (rows, cols) = gfind2
+    System.arraycopy(data, 0, vals.data, 0, nnz)
+    (rows, cols, vals)
+  }  
+  /*
+   * Implement a(im) = b where im is a matrix of indices to a and im and b are same-sized
+   */
+  // NOTE(review): not implemented -- the body is empty, so a(im) = b currently does nothing and returns Unit.
+  def update(im:IMat, b:SparseMat[T]) = {
+  }
+  
+  /*
+   * Implement slicing, a(iv,jv) where iv and jv are vectors, using ? as wildcard
+   */
+  def gapply(iv:IMat, jv:IMat):SparseMat[T] = 
+  	iv match {
+  	// Row wildcard: whole columns are selected, so each chosen column's slice is copied as a block.
+  	case aa:MatrixWildcard => {
+  		val colinds = DenseMat.getInds(jv, ncols) 
+  		val ioff = Mat.ioneBased
+  		val off = Mat.oneBased
+  		// First pass: total number of stored entries in the selected columns.
+  		var tnnz = 0
+  		for (i <- 0 until colinds.length) tnnz += jc(colinds(i)-off+1) - jc(colinds(i)-off)
+  		val out = if (ir != null) {
+      	SparseMat[T](nrows, colinds.length, tnnz)
+      } else {
+        SparseMat.noRows[T](nrows, colinds.length, tnnz)
+      }
+  		var inext = 0
+  		var i = 0
+  		out.jc(0) = ioff
+  		while (i < out.ncols) {
+  			val istep = jc(colinds(i)-off+1) - jc(colinds(i)-off)
+  			if (ir != null) System.arraycopy(ir, jc(colinds(i)-off)-ioff, out.ir, inext, istep)
+  			System.arraycopy(data, jc(colinds(i)-off)-ioff, out.data, inext, istep)
+  			inext += istep
+  			out.jc(i+1) = inext+ioff
+  			i += 1
+  		}
+      out
+  	}
+  	// Explicit row selection: smat is an iv.length x nrows selector built from the requested rows; its
+  	// column for a source row lists the output rows that source row maps to.
+  	case _ => {
+  	  explicitInds
+  	  val off = Mat.oneBased
+  	  val rowinds = if (off == 0) DenseMat.getInds(iv, nrows) else SparseMat.decInds(DenseMat.getInds(iv, nrows));
+  		val smat = SparseMat.sparseImpl[Int]((0 until iv.length).toArray, rowinds, Array.fill[Int](iv.length)(1), iv.length, nrows)
+  		val colinds = DenseMat.getInds(jv, ncols) 
+  		val ioff = Mat.ioneBased
+  		// First pass: count output entries (each stored entry contributes once per selection of its row).
+  		var tnnz = 0
+  		var i = 0
+  		while (i < colinds.length) {
+  			var j = jc(colinds(i)-off)-ioff
+  			while (j < jc(colinds(i)-off+1)-ioff) {
+  				tnnz += smat.jc(ir(j)+1-ioff) - smat.jc(ir(j)-ioff)
+  				j += 1
+  			}
+  			i += 1
+  		}
+  		val out = SparseMat[T](iv.length, colinds.length, tnnz)
+  		// Second pass: emit the entries; tnnz is reused as the write cursor.
+  		tnnz = 0
+  		i = 0
+  		out.jc(0) = ioff
+  		while (i < colinds.length) {
+  			var j = jc(colinds(i)-off)-ioff
+  			while (j < jc(colinds(i)-off+1)-ioff) {
+  				val dval = data(j)
+  				var k = smat.jc(ir(j)-ioff) - ioff
+  				while (k < smat.jc(ir(j)+1-ioff)-ioff) {
+  					out.ir(tnnz) = smat.ir(k)
+  					out.data(tnnz) = dval
+  					tnnz += 1
+  					k += 1
+  				}
+  				j += 1
+  			}
+  			out.jc(i+1) = tnnz+ioff
+  			i += 1
+  		}
+  		out
+  	}
+    }  
+
+  /*
+   * Formats one stored value. A Double or Float with no fractional part and magnitude below the cutoff
+   * (1e10 for Double, 1e5 for Float) prints as an integer, otherwise with "%.5g". Any other element
+   * type yields the empty string.
+   */
+  private def printOne(a:T):String = a match {
+    case d:Double =>
+      val wholeAndSmall = d % 1 == 0 && math.abs(d) < 1e10
+      if (wholeAndSmall) "%d".format(d.intValue) else "%.5g".format(d)
+    case f:Float =>
+      val wholeAndSmall = f % 1 == 0 && math.abs(f) < 1e5
+      if (wholeAndSmall) "%d".format(f.intValue) else "%.5g".format(f)
+    case _ => ""
+  }
+  
+  // Formats a zero-based index for display, shifted by the external index base Mat.oneBased.
+  private def printOne(v0:Int):String = "%d".format(v0 + Mat.oneBased)
+
+  
+  /*
+   * Renders up to 8 stored entries, one "(row, col) value" line each, with right-aligned fields; three
+   * "..." markers follow when more entries exist.
+   * Changes from the original: matrices with implicit rows (ir == null) no longer throw
+   * NullPointerException, because the row is derived from the entry's position within its column; and
+   * the width pass now measures exactly the strings that are printed (it used to measure the row index
+   * before subtracting the internal base).
+   */
+  override def toString:String = {
+    val ioff = Mat.ioneBased
+    val maxRows = 8
+    val nshow = math.min(nnz, maxRows)
+    val rowStrs = new Array[String](nshow)
+    val colStrs = new Array[String](nshow)
+    val valStrs = new Array[String](nshow)
+    var fieldWidth = 4
+    var icol = 0
+    var innz = 0
+    // Pass 1: format every field once and track the widest one (plus two columns of padding).
+    while (innz < nshow) {
+      while (innz >= jc(icol+1)-ioff) icol += 1
+      val irow = if (ir != null) ir(innz)-ioff else innz-(jc(icol)-ioff)
+      rowStrs(innz) = printOne(irow)
+      colStrs(innz) = printOne(icol)
+      valStrs(innz) = printOne(data(innz))
+      fieldWidth = math.max(fieldWidth, 2+rowStrs(innz).length)
+      fieldWidth = math.max(fieldWidth, 2+colStrs(innz).length)
+      fieldWidth = math.max(fieldWidth, 2+valStrs(innz).length)
+      innz += 1
+    }
+    // Right-aligns s in a field of fieldWidth characters.
+    def pad(s:String):String = (" " * (fieldWidth-s.length)) + s
+    val sb:StringBuilder = new StringBuilder
+    // Pass 2: emit the aligned lines.
+    innz = 0
+    while (innz < nshow) {
+      sb.append("("+pad(rowStrs(innz)))
+      sb.append(","+pad(colStrs(innz)))
+      sb.append(")"+pad(valStrs(innz))+"\n")
+      innz += 1
+    }
+    if (nnz > maxRows) {
+      for (j <- 0 until 3) {
+        sb.append((" " * (fieldWidth-2))+"...")
+      }
+      sb.append("\n")
+    }
+    sb.toString()
+  }	
+  
+  /*
+   * Sparse * sparse product returning a dense nrows x a.ncols matrix. For every stored a(ind, i), the
+   * stored entries of this matrix's column ind are scaled by it and accumulated into column i of the
+   * result. Throws RuntimeException("dims mismatch") when ncols != a.nrows.
+   * The original accumulated into this.data (indexed with dense offsets) and never wrote to out; the
+   * accumulation now targets out.data. Assumes a freshly constructed DenseMat starts zero-filled -- confirm.
+   */
+  def gsMult(a:SparseMat[T]):DenseMat[T] = 
+    if (ncols != a.nrows) 
+      throw new RuntimeException("dims mismatch")
+    else {
+      explicitInds
+      a.explicitInds
+      val out = new DenseMat[T](nrows, a.ncols)
+      val ioff = Mat.ioneBased
+      var i = 0
+      while (i < a.ncols) {
+        val i0 = nrows*i                 // offset of output column i in column-major dense storage
+        var j = a.jc(i)-ioff
+        while (j < a.jc(i+1)-ioff) {
+          val ind = a.ir(j)-ioff         // row of a's entry, i.e. the column of this matrix to use
+          val tval = a.data(j)
+          var k = jc(ind)-ioff
+          while (k < jc(ind+1)-ioff) {
+            val indx = ir(k)-ioff + i0
+            out.data(indx) = numeric.plus(out.data(indx), numeric.times(tval, data(k)))
+            k += 1
+          }
+          j += 1
+        }
+        i += 1
+      }
+      out
+    }
+  
+  /*
+   * Element-wise binary operation on two equally sized sparse matrices, merging each column's entries
+   * by row index. A position stored in only one operand is combined with numeric.zero on the other side.
+   * Operands with implicit rows are handed to sgMatOpNR. The result is allocated or checked through
+   * SparseMat.newOrCheck with capacity nnz + b.nnz and passed through sparseTrim before returning.
+   * Throws RuntimeException("dimensions mismatch") when the shapes differ.
+   */
+  def sgMatOp(b:SparseMat[T], op2:(T,T) => T, omat:Mat):SparseMat[T] = {
+    if (nrows != b.nrows || ncols != b.ncols) {
+      throw new RuntimeException("dimensions mismatch")
+    }
+    if (ir != null) b.explicitInds
+    if (b.ir != null) explicitInds
+    if (ir == null) {
+      sgMatOpNR(b,op2,omat)
+    } else {
+      val out = SparseMat.newOrCheck(nrows, ncols, nnz+b.nnz, omat)
+      val ioff = Mat.ioneBased
+      var next = 0
+      out.jc(0) = ioff 
+      var col = 0
+      while (col < ncols) {
+        var pa = jc(col)-ioff
+        var pb = b.jc(col)-ioff
+        val endA = jc(col+1)-ioff
+        val endB = b.jc(col+1)-ioff
+        // Single merge loop: take from this matrix, from b, or from both when the row indices match.
+        while (pa < endA || pb < endB) {
+          if (pb >= endB || (pa < endA && ir(pa) < b.ir(pb))) {
+            out.ir(next) = ir(pa)
+            out.data(next) = op2(data(pa), numeric.zero)
+            pa += 1
+          } else if (pa >= endA || ir(pa) > b.ir(pb)) {
+            out.ir(next) = b.ir(pb)
+            out.data(next) = op2(numeric.zero, b.data(pb))
+            pb += 1
+          } else {
+            out.ir(next) = ir(pa)
+            out.data(next) = op2(data(pa), b.data(pb))
+            pa += 1
+            pb += 1
+          }
+          next += 1
+        }
+        out.jc(col+1) = next+ioff
+        col += 1
+      }
+      out.sparseTrim
+    }
+  }
+
+  
+  /**
+   * Variant of `sgMatOp` that never touches row indices: within each column
+   * the stored values of both operands are paired up position by position.
+   *
+   * Values left over in the longer column are combined with `numeric.zero`.
+   * The output is requested from `SparseMat.newOrCheck` with its `norows`
+   * flag set, and goes through `sparseTrim` before being returned.
+   *
+   * @param b    right-hand operand
+   * @param op2  binary operator, invoked as op2(valueFromThis, valueFromB)
+   * @param omat candidate output matrix handed to `SparseMat.newOrCheck`
+   */
+  def sgMatOpNR(b:SparseMat[T], op2:(T,T) => T, omat:Mat):SparseMat[T] = {
+    val result = SparseMat.newOrCheck(nrows, ncols, nnz+b.nnz, omat, true)
+    val base = Mat.ioneBased
+    var filled = 0
+    result.jc(0) = base
+    var col = 0
+    while (col < ncols) {
+      var pa = jc(col)-base
+      var pb = b.jc(col)-base
+      val endA = jc(col+1)-base
+      val endB = b.jc(col+1)-base
+      // Positions present in both columns.
+      while (pa < endA && pb < endB) {
+        result.data(filled) = op2(data(pa), b.data(pb))
+        pa += 1
+        pb += 1
+        filled += 1
+      }
+      // Tail of this matrix's column.
+      while (pa < endA) {
+        result.data(filled) = op2(data(pa), numeric.zero)
+        pa += 1
+        filled += 1
+      }
+      // Tail of b's column.
+      while (pb < endB) {
+        result.data(filled) = op2(numeric.zero, b.data(pb))
+        pb += 1
+        filled += 1
+      }
+      result.jc(col+1) = filled+base
+      col += 1
+    }
+    result.sparseTrim
+  }
+  
+  /**
+   * Folds the stored values of this matrix with `op2` into a dense result.
+   *
+   *  - dim 0: the matrix must be a vector; all stored values are folded into
+   *    a 1x1 result, starting from `op1(numeric.zero)`.
+   *  - dim 1: one result per column (1 x ncols), each starting from
+   *    `op1(numeric.zero)`.
+   *  - dim 2: one result per row (nrows x 1); the output is cleared first and
+   *    each stored value is folded into its row's slot.
+   *
+   * @param dim  reduction mode (0, 1 or 2)
+   * @param op1  applied to `numeric.zero` to seed the accumulator for dim 0 and 1
+   * @param op2  accumulation operator, invoked as op2(accumulator, value)
+   * @param omat candidate output matrix handed to `DenseMat.newOrCheck`
+   * @throws RuntimeException for dim 0 on a non-vector, or for any other dim value
+   */
+  def sgReduceOp(dim:Int, op1:(T) => T, op2:(T,T) => T, omat:Mat):DenseMat[T] = {
+    val base = Mat.ioneBased
+    dim match {
+      case 0 =>
+        if (nrows > 1 && ncols > 1) {
+          throw new RuntimeException("must be a vector")
+        }
+        val result = DenseMat.newOrCheck(1, 1, omat)
+        var total = op1(numeric.zero)
+        var p = 0
+        while (p < nnz) {
+          total = op2(total, data(p))
+          p += 1
+        }
+        result.data(0) = total
+        result
+      case 1 =>
+        val result = DenseMat.newOrCheck(1, ncols, omat)
+        var col = 0
+        while (col < ncols) {
+          var total = op1(numeric.zero)
+          var p = jc(col)-base
+          val stop = jc(col+1)-base
+          while (p < stop) {
+            total = op2(total, data(p))
+            p += 1
+          }
+          result.data(col) = total
+          col += 1
+        }
+        result
+      case 2 =>
+        val result = DenseMat.newOrCheck(nrows, 1, omat)
+        result.clear
+        if (ir == null) {
+          // Without row indices the k-th stored value of a column belongs to row k.
+          var col = 0
+          while (col < ncols) {
+            val first = jc(col)
+            val count = jc(col+1) - first
+            var r = 0
+            while (r < count) {
+              result.data(r) = op2(result.data(r), data(first-base+r))
+              r += 1
+            }
+            col += 1
+          }
+        } else {
+          var p = 0
+          while (p < nnz) {
+            val row = ir(p)-base
+            result.data(row) = op2(result.data(row), data(p))
+            p += 1
+          }
+        }
+        result
+      case _ =>
+        throw new RuntimeException("index must 1 or 2")
+    }
+  }
+  
+  /**
+   * Applies `op2` between every stored value of this matrix and the single
+   * element of the 1x1 dense matrix `b`, by delegating to `sgMatOpScalar`.
+   *
+   * @throws RuntimeException when `b` is not 1x1
+   */
+  def ssMatOpOne(b:DenseMat[T], op2:(T,T) => T, omat:Mat):SparseMat[T] = {
+    val isScalar = b.nrows == 1 && b.ncols == 1
+    if (!isScalar) throw new RuntimeException("dims incompatible")
+    sgMatOpScalar(b.data(0), op2, omat)
+  }
+  
+  /**
+   * Applies `op2(value, b)` to every stored value of this matrix.
+   *
+   * The output keeps this matrix's sparsity structure (row indices when
+   * present, and all column pointers) and is passed through `sparseTrim`
+   * before being returned.
+   *
+   * @param b      scalar right-hand operand
+   * @param op2    binary operator, invoked as op2(storedValue, b)
+   * @param outmat candidate output matrix handed to `SparseMat.newOrCheck`
+   */
+  def sgMatOpScalar(b:T, op2:(T,T) => T, outmat:Mat):SparseMat[T] = {
+    val out = SparseMat.newOrCheck(nrows, ncols, nnz, outmat, (ir == null))
+    var i = 0
+    while (i < nnz) {
+      out.data(i) = op2(data(i), b)
+      if (ir != null) out.ir(i) = ir(i)
+      i += 1
+    }
+    // jc holds ncols+1 column pointers; the final entry marks the end of the
+    // last column and must be copied too, otherwise sparseTrim sees the last
+    // column of a freshly allocated output as empty.
+    System.arraycopy(jc, 0, out.jc, 0, ncols+1)
+    out.sparseTrim
+  } 
+  
+  /**
+   * Removes stored entries whose value has signum 0, compacting this matrix
+   * in place.
+   *
+   * A first pass counts the surviving entries; if that count already equals
+   * `nnz` nothing is modified. Otherwise values (and row indices, when
+   * present) are shifted down, the column pointers are rewritten and `nnz0`
+   * is updated.
+   *
+   * @return this matrix
+   */
+  def sparseTrim:SparseMat[T] = {
+    val base = Mat.ioneBased
+    // Pass 1: count entries that will survive.
+    var kept = 0
+    var col = 0
+    while (col < ncols) {
+      var p = jc(col)-base
+      val stop = jc(col+1)-base
+      while (p < stop) {
+        if (numeric.signum(data(p)) != 0) kept += 1
+        p += 1
+      }
+      col += 1
+    }
+    if (kept != nnz) {
+      // Pass 2: compact in place. `from` remembers the old end of the previous
+      // column because jc(col+1) is overwritten as soon as the column is done.
+      var dst = 0
+      var from = 0
+      jc(0) = base
+      col = 0
+      while (col < ncols) {
+        val stop = jc(col+1)-base
+        var src = from
+        while (src < stop) {
+          if (numeric.signum(data(src)) != 0) {
+            data(dst) = data(src)
+            if (ir != null) ir(dst) = ir(src)
+            dst += 1
+          }
+          src += 1
+        }
+        from = stop
+        jc(col+1) = dst+base
+        col += 1
+      }
+      nnz0 = dst
+    }
+    this
+  }
+  
+  /**
+   * Validates the internal structure of this matrix and throws on the first
+   * inconsistency found.
+   *
+   * Checks performed:
+   *  - `jc(0)` equals the index base `Mat.ioneBased`;
+   *  - column pointers are non-decreasing;
+   *  - when row indices are present, every one of them lies in
+   *    [base, nrows+base) and they are strictly increasing within a column;
+   *  - `jc(ncols)` equals `nnz` plus the index base.
+   *
+   * @throws RuntimeException describing the first violated invariant
+   */
+  def check = {
+    val ioff = Mat.ioneBased
+    if (jc(0) != ioff) {
+      throw new RuntimeException("jc(0) should be "+ioff)
+    }
+    var i = 0
+    while (i < ncols) {
+      if (jc(i) > jc(i+1)) {
+        throw new RuntimeException("jc(i) out of order " + i + " " + jc(i) + " " + jc(i+1))
+      }
+      if (ir != null) {
+        val colEnd = jc(i+1)-ioff
+        var j = jc(i)-ioff
+        // Visit every entry of the column, so that columns holding a single
+        // entry get their row index range-checked as well.
+        while (j < colEnd) {
+          if (ir(j) < ioff) {
+            throw new RuntimeException("ir("+j+")="+ir(j)+" too small")
+          }
+          if (ir(j) >= nrows+ioff) {
+            throw new RuntimeException("ir("+j+")="+ir(j)+" out of range "+(nrows+ioff))
+          }
+          if (j+1 < colEnd && ir(j+1) <= ir(j)) {
+            throw new RuntimeException("ir(j) out of order "+j+" "+ir(j)+" "+ir(j+1))
+          }
+          j += 1
+        }
+      }
+      i += 1
+    }
+    if (jc(ncols) != nnz+ioff) {
+      // Report the value that is actually compared against.
+      throw new RuntimeException("jc(ncols) should be "+(nnz+ioff))
+    }
+  }
+
+  /**
+   * Expands this sparse matrix into a newly allocated `DenseMat` of the same
+   * dimensions, writing each stored value at offset row + nrows*column of the
+   * dense data array.
+   *
+   * When row indices are absent, the k-th stored value of a column is placed
+   * in row k of that column.
+   */
+  def full:DenseMat[T] = { 
+    val dense = new DenseMat[T](nrows, ncols)
+    val base = Mat.ioneBased
+    if (ir == null) {
+      var col = 0
+      while (col < ncols) {
+        val first = jc(col)-base
+        val count = jc(col+1)-base - first
+        var r = 0
+        while (r < count) {
+          dense.data(r + nrows*col) = data(first + r)
+          r += 1
+        }
+        col += 1
+      }
+    } else {
+      // Expand the column pointers into one column index per stored entry.
+      val colOf = SparseMat.uncompressInds(jc, ir)
+      var p = 0
+      while (p < nnz) {
+        dense.data(ir(p)-base + nrows*colOf(p)) = data(p)
+        p += 1
+      }
+    }
+    dense
+  }
+  
+   /**
+    * Builds a SparseMat with the requested shape that reuses this matrix's
+    * backing arrays whenever they are large enough, allocating new ones
+    * otherwise.
+    *
+    * A matrix without row indices (`ir == null`) stays without row indices;
+    * previously this case failed with a NullPointerException on `ir.size`.
+    *
+    * @param nr  number of rows of the result
+    * @param nc  number of columns of the result
+    * @param nnz number of stored entries the result must be able to hold
+    */
+   override def recycle(nr:Int, nc:Int, nnz:Int):SparseMat[T] = {
+    val jc0 = if (jc.size >= nc+1) jc else new Array[Int](nc+1)
+    val ir0:Array[Int] =
+      if (ir == null) null
+      else if (ir.size >= nnz) ir
+      else new Array[Int](nnz)
+    val data0 = if (data.size >= nnz) data else new Array[T](nnz)
+    new SparseMat[T](nr, nc, nnz, ir0, jc0, data0)
+  }
+
+}
+
+
+object SparseMat {
+  
+  /**
+   * Allocates a SparseMat with room for `nnz0` entries: `nnz0` row indices,
+   * `nc+1` column pointers and `nnz0` values.
+   */
+  def apply[T](nr:Int, nc:Int, nnz0:Int)
+  (implicit manifest:Manifest[T], numeric:Numeric[T]):SparseMat[T] = {
+    val rowInds = new Array[Int](nnz0)
+    val colPtrs = new Array[Int](nc+1)
+    val values = new Array[T](nnz0)
+    new SparseMat[T](nr, nc, nnz0, rowInds, colPtrs, values)
+  }
+    
+  /**
+   * Allocates a SparseMat with room for `nnz0` values and `nc+1` column
+   * pointers, but with no row-index array (`ir` is null).
+   */
+  def noRows[T](nr:Int, nc:Int, nnz0:Int)
+  (implicit manifest:Manifest[T], numeric:Numeric[T]):SparseMat[T] = {
+    val colPtrs = new Array[Int](nc+1)
+    val values = new Array[T](nnz0)
+    new SparseMat[T](nr, nc, nnz0, null, colPtrs, values)
+  }
+  
+  /**
+   * Builds a SparseMat from coordinate triples (rows(i), cols(i), vals(i)).
+   *
+   * Row indices are shifted by `Mat.ioneBased`, the triples are ordered with
+   * `Mat.ilexsort2`, entries sharing the same (row, column) are summed, the
+   * column indices are compressed into column pointers and the result is
+   * passed through `sparseTrim`.
+   *
+   * @param rows  row index of each entry, without the index-base offset
+   * @param cols  column index of each entry
+   * @param vals  value of each entry
+   * @param nrows number of rows of the result
+   * @param ncols number of columns of the result
+   */
+  def sparseImpl[@specialized(Double, Float) T](rows:Array[Int], cols:Array[Int], vals:Array[T], nrows:Int, ncols:Int)
+  (implicit manifest:Manifest[T], numeric:Numeric[T]):SparseMat[T] = {
+    val ioff = Mat.ioneBased
+    val out = SparseMat[T](nrows, ncols, rows.length)
+    val orows = out.ir
+    val ocols = new Array[Int](rows.length)
+    var i = 0
+    while (i < cols.length) {
+      ocols(i) = cols(i)
+      orows(i) = rows(i) + ioff
+      i += 1
+    }
+    // NOTE(review): presumably ilexsort2 sorts ocols/orows in place and returns
+    // the permutation applied; the loop below relies on that. Confirm in Mat.
+    val isort = BIDMat.Mat.ilexsort2(ocols, orows)
+    i = 0
+    var igood = 0
+    while (i < cols.length) {
+      if (i == 0 || orows(i) != orows(i-1) || ocols(i) != ocols(i-1)) {
+        // First occurrence of this (row, column): open a new output slot.
+        ocols(igood) = ocols(i)
+        orows(igood) = orows(i)
+        out.data(igood) = vals(isort(i))
+        igood += 1
+      } else {
+        // Duplicate of the entry just written: accumulate into that slot, which
+        // is igood-1 because igood already points at the next free position.
+        out.data(igood-1) = numeric.plus(out.data(igood-1), vals(isort(i)))
+      }
+      i += 1
+    }
+    SparseMat.compressInds(ocols, ncols, out.jc, igood)
+    out.sparseTrim
+  }
+  
+  /**
+   * Converts per-entry column indices into column pointers.
+   *
+   * Scans the first `nnz0` entries of `coli` once, left to right, and writes
+   * into `out(c+1)` the position (plus `Mat.ioneBased`) of the first entry
+   * whose column index exceeds `c`; `out(0)` is set to the index base.
+   *
+   * @param coli  column index of each entry, expected in non-decreasing order
+   * @param ncols number of columns
+   * @param out   array receiving the ncols+1 column pointers
+   * @param nnz0  number of valid entries in `coli`
+   * @return `out`
+   */
+  def compressInds(coli:Array[Int], ncols:Int, out:Array[Int], nnz0:Int):Array[Int] = {
+    val base = Mat.ioneBased
+    out(0) = base
+    var pos = 0
+    var col = 0
+    while (col < ncols) {
+      // Skip past every entry that belongs to this column (or an earlier one).
+      while (pos < nnz0 && coli(pos) <= col) {
+        pos += 1
+      }
+      col += 1
+      out(col) = pos+base
+    }
+    out
+  }
+  
+  /**
+   * Expands column pointers into one column index per stored entry.
+   *
+   * @param coli column pointers (offset by `Mat.ioneBased`), one more than the
+   *             number of columns
+   * @param rowi row-index array; only its length is used, to size the result
+   * @return a new array in which position p holds the column of entry p
+   */
+  def uncompressInds(coli:Array[Int], rowi:Array[Int]):Array[Int] = {
+    val base = Mat.ioneBased
+    val colOf = new Array[Int](rowi.length)
+    val lastCol = coli.length-1
+    var col = 0
+    while (col < lastCol) {
+      val stop = coli(col+1)-base
+      var p = coli(col)-base
+      while (p < stop) {
+        colOf(p) = col
+        p += 1
+      }
+      col += 1
+    }
+    colOf
+  }
+
+  /**
+   * Writes `inds(k) + 1` into `out(k)` for every position of `inds`.
+   *
+   * @return `out`
+   */
+  def incInds(inds:Array[Int], out:Array[Int]):Array[Int] = {
+    for (k <- 0 until inds.length) {
+      out(k) = inds(k) + 1
+    }
+    out
+  }
+  
+  /** Returns a new array holding every element of `inds` incremented by one. */
+  def incInds(inds:Array[Int]):Array[Int] =
+    incInds(inds, new Array[Int](inds.length))
+  
+  /** Returns a new array holding every element of `inds` decremented by one. */
+  def decInds(inds:Array[Int]):Array[Int] =
+    inds.map(_ - 1)
+ 
+  /**
+   * Returns a SparseMat of the requested shape, reusing `oldmat` if possible.
+   *
+   *  - When `oldmat` is null or 0x0, a fresh matrix is allocated (without row
+   *    indices if `norows` is set).
+   *  - When `oldmat` already has exactly the requested rows, columns and nnz,
+   *    it is returned as is.
+   *  - Otherwise `oldmat.recycle` is asked for a matrix of the new shape.
+   *
+   * @param nr     requested number of rows
+   * @param nc     requested number of columns
+   * @param nnz    requested number of stored entries
+   * @param oldmat candidate matrix to reuse; cast to SparseMat[T] when used
+   * @param norows allocate without a row-index array when a fresh matrix is needed
+   */
+  def newOrCheck[T](nr:Int, nc:Int, nnz:Int, oldmat:Mat, norows:Boolean = false)
+  (implicit manifest:Manifest[T], numeric:Numeric[T]):SparseMat[T] = {
+    val nothingToReuse =
+      oldmat.asInstanceOf[AnyRef] == null || (oldmat.nrows == 0 && oldmat.ncols == 0)
+    if (nothingToReuse) {
+      if (norows) SparseMat.noRows(nr, nc, nnz) else SparseMat(nr, nc, nnz)
+    } else {
+      val candidate = oldmat.asInstanceOf[SparseMat[T]]
+      val fitsExactly = candidate.nrows == nr && candidate.ncols == nc && candidate.nnz == nnz
+      if (fitsExactly) candidate else candidate.recycle(nr, nc, nnz)
+    }
+  }
+}
+
+
+
+
+
+
diff --git a/src/main/scala/BIDMat/Translators.scala b/src/main/scala/BIDMat/Translators.scala
new file mode 100755
index 00000000..9760b84a
--- /dev/null
+++ b/src/main/scala/BIDMat/Translators.scala
@@ -0,0 +1,147 @@
+package BIDMat
+import scala.util.parsing.combinator._
+
+/**
+ * Parses C function declarations (MKL / LAPACK style headers) and emits one
+ * fragment of JNI glue code per parse, selected by `otype`:
+ *
+ *  - 0: the JNIEXPORT function header;
+ *  - 1: statements obtaining native pointers for array and string arguments;
+ *  - 2: the call to the native function;
+ *  - 3: statements releasing those pointers, plus the closing of the function;
+ *  - 4: the Java `public static native` declaration.
+ *
+ * Every type parser yields a function from an argument name to the text for
+ * that argument under the current `otype`.
+ */
+class MKLdeclarationsParser extends JavaTokenParsers { 
+  var otype:Int = 3
+  var prefix:String = "LAPACK"
+
+  // Return type of a declaration, paired with a flag telling whether the
+  // function returns a value. A plain `void` is the only type that does not;
+  // `void *` is tried first so that it is not mistaken for plain `void`.
+  private def rtype: Parser[(String=>String, Boolean)] =
+    fvoidp ^^ { f => (f, true) } | fvoid ^^ { f => (f, false) } | ftype ^^ { f => (f, true) }
+
+  /** A complete declaration: return type, name, parenthesised arguments, optional semicolons. */
+  def fdecl: Parser[String] = rtype~ident~"("~decls~")"~""";*""".r ^^ { 
+    case rt~b~c~d~e~f => {
+      val (a, toreturn) = rt
+      // Only the last underscore-separated part of the C name is used for the Java name.
+      val b0 = b.split("_"); val b1 = b0(b0.length-1)
+      otype match {
+        case 0 => "JNIEXPORT "+a("")+"JNICALL Java_edu_berkeley_bid_"+prefix+"_"+b1+
+          "\n(JNIEnv * env, jobject calling_obj, "+d+") {\n"
+        case 1 => d
+        case 2 => "    "+(if (toreturn) a("")+"retval = " else "") +b+c+d+e+";\n"
+        case 3 => d+(if (toreturn) "    return retval;\n}\n" else "}\n")
+        case 4 => "  public static native "+a(b1)+c+d+e+";\n"
+      }
+    }
+  } 
+  
+  /**
+   * Comma-separated argument list. For otype 0, 2 and 4 the fragments are
+   * joined with ", "; otherwise they are simply concatenated.
+   */
+  def decls: Parser[String] = repsep(adecl, ",") ^^ { parts =>
+    if (otype == 0 || otype == 2 || otype == 4) parts.mkString(", ") else parts.mkString
+  }
+  
+  /** A single argument: a type followed by a name. */
+  def adecl: Parser[String] = ftype~ident ^^ { case a~b => a(b) }
+  
+  // Pointer forms come before their value forms because alternatives are tried in order.
+  def ftype: Parser[String=>String] = fvoidp | fvoid | flogicalp | flogical | fintp | fint | fuintp | fuint | fchar |
+    ffloatp | ffloat | ffcomplexp | ffcomplex | fdoublep | fdouble | fdcomplexp | fdcomplex 
+   
+  /**
+   * Renders one argument for the current `otype`.
+   *
+   * @param y    argument name (empty when rendering a return type)
+   * @param wrap 0 = passed by value, 1 = primitive array, 2 = string
+   * @param p1   JNI type used in the function header (otype 0)
+   * @param p2   C type of the local variable (otype 1; also otype 2 when `y` is empty)
+   * @param p3   text placed before the argument name in the call (otype 2)
+   * @param p4   Java type used in the native declaration (otype 4)
+   */
+  def sstring(y:String, wrap:Int, p1:String, p2:String, p3:String, p4:String) = {
+    otype match {
+      case 0 => p1 + " " + (if (wrap > 0) "j_"+y else y)
+      case 1 => if (wrap == 1) "    "+p2+" "+y+" = (*env)->GetPrimitiveArrayCritical(env, j_"+y+", JNI_FALSE);\n" else 
+        if (wrap == 2) "    char * "+y+" = (char *)(*env)->GetStringUTFChars(env, j_"+y+", JNI_FALSE);\n" else ""
+      case 2 => if (y.length == 0) p2+" " else  p3+y
+      case 3 => if (wrap == 1) "    (*env)->ReleasePrimitiveArrayCritical(env, j_"+y+", "+y+", 0);\n" else 
+        if (wrap == 2) "    (*env)->ReleaseStringUTFChars(env, j_"+y+", "+y+");\n" else ""
+      case 4 => p4+" "+y
+    }
+  }
+  
+  def fvoid: Parser[String=>String] = "void" ^^ (x => (y=>
+    sstring(y, 0, "void", "void", "", "void")))
+    
+  // Not referenced by `ftype`.
+  def fenum: Parser[String=>String] = ("const"~"enum"~ident | "enum"~ident)  ^^ {
+    case a~b~c => (y => sstring(y, 0, "jint", "jint", "("+c+")", "int"))
+    case b~c => (y => sstring(y, 0, "jint", "jint", "("+c+")", "int"))
+  }
+  
+  def flogical: Parser[String=>String] = ("const"~"lapack_logical" | "lapack_logical") ^^ (x => (y=>
+    sstring(y, 1, "int", "jint *", "(lapack_logical)", "int")))
+
+  def fchar: Parser[String=>String] = ("const"~"char" | "char") ^^ (x => (y=> 
+    sstring(y, 2, "jstring", "char *", "*", "String")))
+    
+  def fint: Parser[String=>String] = ("const"~"int" | "int" | "const"~"lapack_int" | "lapack_int" | "MKL_INT") ^^ (x => (y=> 
+    sstring(y, 0, "jint", "jint", "", "int")))
+  
+  def fuint: Parser[String=>String] = ("const"~"MKL_UINT" | "MKL_UINT"| "const"~"unsigned"~"int" | "unsigned"~"int")^^ (x => (y =>
+    sstring(y, 0, "jint", "jint", "(unsigned int)", "int")))
+  
+  def ffloat: Parser[String=>String] = ("const"~"float" | "float" | "lapack_float") ^^ (x => (y =>
+    sstring(y, 0, "jfloat", "jfloat", "", "float")))
+    
+  def fdouble: Parser[String=>String] = ("const"~"double" | "double" | "lapack_double") ^^ (x => (y =>
+    sstring(y, 0, "jdouble", "jdouble", "", "double")))
+    
+  def ffcomplex: Parser[String=>String] = ("const"~"lapack_complex_float" | "lapack_complex_float" | "MKL_complex8") ^^ (x => (y =>
+    sstring(y, 1, "jfloatArray", "jfloat *", "*", "float []")))
+    
+  def fdcomplex: Parser[String=>String] = ("const"~"lapack_complex_double" | "lapack_complex_double" | "MKL_complex16") ^^ (x => (y =>
+    sstring(y, 1, "jdoubleArray", "jdouble *", "*", "double []")))
+  
+  def fvoidp: Parser[String=>String] = ("const"~"void"~"*" | "void"~"*") ^^ (x => (y =>
+    sstring(y, 1, "jfloatArray", "jfloat *", "(void *)", "float []")))
+  
+  def flogicalp: Parser[String=>String] = ("const"~"lapack_logical"~"*" | "lapack_logical"~"*") ^^ (x => (y=>
+    sstring(y, 1, "jintArray", "jint *", "(lapack_logical *)", "int []")))
+
+  def fintp:  Parser[String=>String] = ("const"~"int"~"*" | "int"~"*" | "MKL_INT"~"*" | "lapack_int"~"*") ^^ (x => (y=> 
+    sstring(y, 1, "jintArray", "jint *", "", "int []")))
+     
+  def fuintp: Parser[String=>String] = ("const"~"MKL_UINT"~"*" | "MKL_UINT"~"*" | "const"~"unsigned"~"int"~"*" | "unsigned"~"int"~"*") ^^ (x => (y =>
+    sstring(y, 1, "jintArray ", "jint *", "", "int []")))
+
+  def ffloatp: Parser[String=>String] = ("const"~"float"~"*" | "float"~"*") ^^ (x => (y=> 
+    sstring(y, 1, "jfloatArray", "jfloat *", "", "float []")))
+  
+  def fdoublep: Parser[String=>String] = ("const"~"double"~"*" | "double"~"*") ^^ (x => (y=> 
+    sstring(y, 1, "jdoubleArray", "jdouble *", "", "double []")))
+  
+  // The matched type name (x1) is reused to build the cast placed before the argument.
+  def ffcomplexp: Parser[String=>String] = ("const"~"lapack_complex_float"~"*" | "lapack_complex_float"~"*" | "MKL_complex8"~"*") ^^ {
+    case x0~x1~x2 => (y => sstring(y, 1, "jfloatArray", "jfloat *", "("+x1+" *)", "float []"))
+    case x1~x2 => (y => sstring(y, 1, "jfloatArray", "jfloat *", "("+x1+" *)", "float []"))
+    }
+    
+  def fdcomplexp: Parser[String=>String] = ("const"~"lapack_complex_double"~"*" | "lapack_complex_double"~"*" | "MKL_complex16"~"*") ^^ {
+    case x0~x1~x2 => (y=> sstring(y, 1, "jdoubleArray", "jdouble *", "("+x1+" *)", "double []"))
+    case x1~x2 => (y=> sstring(y, 1, "jdoubleArray", "jdouble *", "("+x1+" *)", "double []"))
+    }
+
+}
+
+/**
+ * Command-line driver for `MKLdeclarationsParser`.
+ *
+ * Reads declarations line by line from the file named by the first argument.
+ * With exactly one argument each line is rendered with otype 0, 1, 2 and 3 in
+ * turn; with any other argument count each line is rendered with otype 4
+ * only. Lines of length 0 or 1 are skipped. Output goes to standard output.
+ */
+object translateTester { 
+  def main(args: Array[String]): Unit = {
+    val p = new MKLdeclarationsParser
+    val s = scala.io.Source.fromFile(args(0))
+    try {
+      val otypes = if (args.length == 1) List(0, 1, 2, 3) else List(4)
+      s.getLines.foreach((line) => {
+        if (line.length > 1) {
+          for (ot <- otypes) {
+            p.otype = ot
+            println(p.parseAll(p.fdecl, line).get)
+          }
+        }
+      })
+    } finally {
+      // Release the file handle even when a line fails to parse.
+      s.close()
+    }
+  }
+}
+
+
+
+// Empty object: it declares no members.
+object Translators { 
+
+
+}
diff --git a/src/test/scala/BIDMat/Copyright.txt b/src/test/scala/BIDMat/Copyright.txt
new file mode 100755
index 00000000..21326596
--- /dev/null
+++ b/src/test/scala/BIDMat/Copyright.txt
@@ -0,0 +1,25 @@
+Copyright (c) 2012, Regents of the University of California
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the <organization> nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/src/test/scala/BIDMat/DMatTest.scala b/src/test/scala/BIDMat/DMatTest.scala
new file mode 100755
index 00000000..8d013c15
--- /dev/null
+++ b/src/test/scala/BIDMat/DMatTest.scala
@@ -0,0 +1,130 @@
+package BIDMat
+
+import Mat._
+import MatFunctions._
+import org.scalatest._;
+import org.scalatest.junit._;
+import org.scalatest.prop._;
+import org.junit.runner.RunWith
+
+/**
+ * Unit tests for DMat: element access, vertical (`on`) and horizontal (`\`)
+ * concatenation, transpose and slicing.
+ *
+ * The fixtures are filled through their backing `data` arrays; the
+ * "DMat fill" test pins how those values map to (row, column) positions.
+ */
+@RunWith(classOf[JUnitRunner])
+class DMatTest extends FunSuite with Checkers {
+  // 2x3 fixture
+  val x = DMat(2,3)
+  val xvalues = Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)
+  System.arraycopy(xvalues, 0, x.data, 0, xvalues.length)
+  // 1x3 fixture, stacked under x in the "on" test
+  val y = DMat(1,3)
+  val yvalues = Array(7.0, 8.0, 9.0)
+  System.arraycopy(yvalues, 0, y.data, 0, yvalues.length)
+  // 2x1 fixture, appended to the right of x in the "\" test
+  val z = DMat(2,1)
+  val zvalues = Array(10.0, 11.0)
+  System.arraycopy(zvalues, 0, z.data, 0, zvalues.length)
+  // 3x4 fixture used by the slicing tests
+  val xx = DMat(3,4)
+  val xxvalues = Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0)
+  System.arraycopy(xxvalues, 0, xx.data, 0, xxvalues.length)
+
+  test("DMat fill") {
+    assert(x(0,0) == 1.0)
+    assert(x(1,0) == 2.0)
+    assert(x(0,1) == 3.0)
+    assert(x(1,1) == 4.0)
+    assert(x(0,2) == 5.0)
+    assert(x(1,2) == 6.0)
+  }
+
+  test("DMat on") {
+    val stacked = x on y
+    // rows coming from x
+    assert(stacked(0,0) == 1.0)
+    assert(stacked(1,0) == 2.0)
+    assert(stacked(0,1) == 3.0)
+    assert(stacked(1,1) == 4.0)
+    assert(stacked(0,2) == 5.0)
+    assert(stacked(1,2) == 6.0)
+
+    // row coming from y
+    assert(stacked(2,0) == 7.0)
+    assert(stacked(2,1) == 8.0)
+    assert(stacked(2,2) == 9.0)
+  }
+
+  test("DMat \\") {
+    val joined = x \ z
+    assert(joined(0,0) == 1.0)
+    assert(joined(1,0) == 2.0)
+    assert(joined(0,1) == 3.0)
+    assert(joined(1,1) == 4.0)
+    assert(joined(0,2) == 5.0)
+    assert(joined(1,2) == 6.0)
+    assert(joined(0,3) == 10.0)
+    assert(joined(1,3) == 11.0)
+  }
+
+  test("DMat t") {
+    val transposed = x.t
+    assert(transposed(0,0) == 1.0)
+    assert(transposed(0,1) == 2.0)
+    assert(transposed(1,0) == 3.0)
+    assert(transposed(1,1) == 4.0)
+    assert(transposed(2,0) == 5.0)
+    assert(transposed(2,1) == 6.0)
+  }
+
+  test("DMat slice 1") {
+    val column = xx(?,1)
+    assert(column(0,0) == 4.0)
+    assert(column(1,0) == 5.0)
+    assert(column(2,0) == 6.0)
+  }
+
+  test("DMat slice 2") {
+    val columns = xx(?,1 to 2)
+    assert(columns(0,0) == 4.0)
+    assert(columns(1,0) == 5.0)
+    assert(columns(2,0) == 6.0)
+    assert(columns(0,1) == 7.0)
+    assert(columns(1,1) == 8.0)
+    assert(columns(2,1) == 9.0)
+  }
+
+  test("DMat slice 3") {
+    val row = xx(1,?)
+    assert(row(0,0) == 2.0)
+    assert(row(0,1) == 5.0)
+    assert(row(0,2) == 8.0)
+    assert(row(0,3) == 11.0)
+  }
+
+  test("DMat slice 4") {
+    val rows = xx(0 to 1,?)
+    assert(rows(0,0) == 1.0)
+    assert(rows(0,1) == 4.0)
+    assert(rows(0,2) == 7.0)
+    assert(rows(0,3) == 10.0)
+    assert(rows(1,0) == 2.0)
+    assert(rows(1,1) == 5.0)
+    assert(rows(1,2) == 8.0)
+    assert(rows(1,3) == 11.0)
+  }
+
+  test("DMat slice 5") {
+    val everything = xx(?,?)
+    assert(everything(0,0) == 1.0)
+    assert(everything(0,1) == 4.0)
+    assert(everything(0,2) == 7.0)
+    assert(everything(0,3) == 10.0)
+    assert(everything(1,0) == 2.0)
+    assert(everything(1,1) == 5.0)
+    assert(everything(1,2) == 8.0)
+    assert(everything(1,3) == 11.0)
+    assert(everything(2,0) == 3.0)
+    assert(everything(2,1) == 6.0)
+    assert(everything(2,2) == 9.0)
+    assert(everything(2,3) == 12.0)
+  }
+
+  test("DMat slice 6") {
+    val corner = xx(0 to 1, 2 to 3)
+    assert(corner(0,0) == 7.0)
+    assert(corner(1,0) == 8.0)
+    assert(corner(0,1) == 10.0)
+    assert(corner(1,1) == 11.0)
+  }
+}
\ No newline at end of file
diff --git a/src/test/scala/BIDMat/TestDops.scala b/src/test/scala/BIDMat/TestDops.scala
new file mode 100755
index 00000000..b8e9b85a
--- /dev/null
+++ b/src/test/scala/BIDMat/TestDops.scala
@@ -0,0 +1,25 @@
+package BIDMat
+
+
+import DMat._
+import IMat._
+import FMat._
+import scala.compat.Platform._
+
+
+/**
+ * Timing loop: adds two n x n IMat matrices `m` times and prints the elapsed
+ * milliseconds together with a throughput figure derived from n*n*m.
+ */
+object TestDops {
+  def main(args: Array[String]): Unit = {
+    val dim = 2000
+    val a = IMat(dim,dim)
+    val b = IMat(dim,dim)
+    val started = currentTime
+    val reps = 1000
+    println("starting up")
+    var rep = 0
+    while (rep < reps) {
+      val c = a + b
+      rep += 1
+    }
+    val elapsed = currentTime - started
+    println("time="+elapsed+" msec, gflops="+(dim.doubleValue*dim*reps/elapsed/1e6))
+  }
+}
+
diff --git a/src/test/scala/BIDMat/TestDops2.scala b/src/test/scala/BIDMat/TestDops2.scala
new file mode 100755
index 00000000..6af1030d
--- /dev/null
+++ b/src/test/scala/BIDMat/TestDops2.scala
@@ -0,0 +1,29 @@
+package BIDMat
+
+import Mat._
+import DMat._
+import FMat._
+import scala.compat.Platform._
+
+
+/**
+ * Timing loop: multiplies an l x n FMat by an n x k FMat `m` times and prints
+ * the elapsed milliseconds together with a throughput figure derived from
+ * 2*k*n*l*m.
+ */
+object TestDops2 {
+  def main(args: Array[String]): Unit = {
+    val n = 50000
+    val k = 20
+    val l = 1
+    val a = FMat(l,n)
+    val b = FMat(n,k)
+    // d and e are allocated but only used by the alternative product noted below.
+    val d = FMat(k,n)
+    val e = FMat(n,l)
+    val reps = 30000
+    val started = currentTime
+    println("Starting up")
+    var rep = 0
+    while (rep < reps) {
+      val c = a * b
+      // alternative workload: val c = d * e
+      rep += 1
+    }
+    val elapsed = currentTime - started
+    println("time="+elapsed+"msec, gflops="+(2.0*k*n*l*reps/elapsed/1e6))
+  }
+}
+
diff --git a/src/test/scala/BIDMat/TestDops3.scala b/src/test/scala/BIDMat/TestDops3.scala
new file mode 100755
index 00000000..17616ac1
--- /dev/null
+++ b/src/test/scala/BIDMat/TestDops3.scala
@@ -0,0 +1,30 @@
+package BIDMat
+
+import Mat._
+import DMat._
+import FMat._
+import scala.compat.Platform._
+
+
+/**
+ * Timing loop: transposes an n x k FMat `m` times and prints the elapsed
+ * milliseconds together with a figure derived from 2*k*n*l*m.
+ */
+object TestDops3 {
+  def main(args: Array[String]): Unit = {
+    val n = 50000
+    val k = 10
+    val l = 1
+    val a = FMat(n,k)
+    // b and d are allocated but only used by the alternative products noted below.
+    val b = FMat(l,n)
+    val d = FMat(k,l)
+    val reps = 30000
+    val started = currentTime
+    println("Starting up")
+    var rep = 0
+    while (rep < reps) {
+      // alternative workloads: val c = b * a ; val e = a * d
+      val c = a.t
+      rep += 1
+    }
+    val elapsed = currentTime - started
+    println("time="+elapsed+"msec, gflops="+(2.0*k*n*l*reps/elapsed/1e6))
+  }
+}
+
diff --git a/src/test/scala/BIDMat/TestHDF5.scala b/src/test/scala/BIDMat/TestHDF5.scala
new file mode 100755
index 00000000..8abed633
--- /dev/null
+++ b/src/test/scala/BIDMat/TestHDF5.scala
@@ -0,0 +1,39 @@
+package BIDMat
+
+
+import MatFunctions._
+import SciFunctions._
+import CMat._
+
+
+
+/**
+ * Round-trip check for matrix file I/O: stores a 1x2 CSMat holding two
+ * strings under the name "c", loads it back and prints both elements.
+ */
+object TestHDF5 {
+  def main(args: Array[String]) : Unit = {
+    val n = 50000
+    val k = 10
+    val l = 1
+    // a, b and c are created but not written to the file.
+    val a = rand(n,k)
+    val b = IMat(l,n)
+    val c = sprand(10,10,0.1)
+    val strings = CSMat(1,2)
+    strings(0,0) = "test"
+    strings(0,1) = "try"
+    val fname = "d:\\sentiment\\tmp\\mtest.mat"
+    saveAs(fname, strings, "c")
+    /* Debugging aid kept from the original: dump the file byte by byte.
+    val fid = new java.io.FileInputStream(fname)
+    var next:Int = 0
+    var i = 0
+    while (next >= 0) {
+      var next = fid.read()
+      if (next > 0) println("buf("+i+")=("+(next + (if (next > 127) -256 else 0))+")")
+      i += 1
+    }
+    fid.close() */
+    val loaded:CMat = load(fname, "c").asInstanceOf[CMat]
+    println(loaded(0,0).asInstanceOf[String])
+    println(loaded(0,1).asInstanceOf[String])
+  }
+}
+