From 1fd4cbd0cfa34f3ca424cf89c1667de8930d8726 Mon Sep 17 00:00:00 2001
From: Mauro Werder <mauro_lc@runbox.com>
Date: Thu, 21 May 2015 20:27:51 +0200
Subject: [PATCH 01/15] starting on ROSW

---
 src/rosw.jl | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 src/rosw.jl

diff --git a/src/rosw.jl b/src/rosw.jl
new file mode 100644
index 000000000..70fe9dfaf
--- /dev/null
+++ b/src/rosw.jl
@@ -0,0 +1,95 @@
+# Rosenbrock-Wanner methods
+###########################
+
+# Rosenbrock-W methods are typically specified for autonomous DAE:
+# Mẋ = f(x)
+#
+# by the stage equations
+# M kᵢ = hf(x₀ + + Σⱼ aᵢⱼkⱼ) + h J Σⱼ γᵢⱼkⱼ
+#
+# and step completion formula
+# x₁ = x₀ + \Sigmaⱼ bⱼ kⱼ
+#
+# The method used here uses transformed equations as done in PETSc.
+
+immutable TableauRosW{Name, S, T} <: Tableau{Name, S, T}
+    order::(@compat(Tuple{Vararg{Int}})) # the order of the methods
+    a::Matrix{T}
+    γ::Matrix{T}
+    # one or several row vectors.  First row is used for the step,
+    # second for error calc.
+    b::Matrix{T}
+    function TableauRosW(order,a,γ,b)
+        @assert isa(S,Integer)
+        @assert isa(Name,Symbol)
+        @assert S==size(γ,1)==size(a,2)==size(γ,1)==size(a,2)==size(b,2)
+        @assert size(b,1)==length(order)
+        new(order,a,b,c)
+    end
+end
+function TableauRosW{T}(name::Symbol, order::(@compat(Tuple{Vararg{Int}})),
+                   a::Matrix{T}, γ::Matrix{T}, b::Matrix{T})
+    TableauRosW{name,size(b,2),T}(order, a, γ, c)
+end
+function TableauRosW(name::Symbol, order::(@compat(Tuple{Vararg{Int}})), T::Type,
+                   a::Matrix, γ::Matrix, b::Matrix)
+    TableauRosW{name,size(b,2),T}(order, convert(Matrix{T},a),
+                                  convert(Matrix{T},γ), convert(Matrix{T},c) )
+end
+conv_field{T,N}(D,a::Array{T,N}) = convert(Array{D,N}, a)
+function Base.convert{Tnew<:Real,Name,S,T}(::Type{Tnew}, tab::TableauRKExplicit{Name,S,T})
+    # Converts the tableau coefficients to the new type Tnew
+    newflds = ()
+    @compat for n in fieldnames(tab)
+        fld = getfield(tab,n)
+        if eltype(fld)==T
+            newflds = tuple(newflds..., conv_field(Tnew, fld))
+        else
+            newflds = tuple(newflds..., fld)
+        end
+    end
+    TableauRKExplicit{Name,S,Tnew}(newflds...) # TODO: could this be done more generically in a type-stable way?
+end
+
+
+
+# Transformed Tableau, used only internally
+immutable TableauRosW_T{Name, S, T} <: Tableau{Name, S, T}
+    order::(@compat(Tuple{Vararg{Int}})) # the order of the methods
+    a::Matrix{T}  # this is TableauRosW.a transformed
+    γii::Vector{T}
+    γinv::Matrix{T}
+    b::Matrix{T} # this is TableauRosW.b transformed
+end
+function tabletransform{Name,S,T}(rt::TableauRosW{Name,S,T})
+    γii = diag(rt.γ) γ
+    γinv = inv(rt.γ)
+    ahat = rt.A * γinv
+    bhat = similar(rt.b)
+    bhat[:,1] = squeeze(rt.b[:,1]'*γinv,1)
+    bhat[:,2] = squeeze(rt.b[:,2]'*γinv,1)
+    return RosWTableTrans{Name,S,T}(rt.order, ahat, γii, γinv, bhat)
+end
+
+
+## tableau for ros34pw2
+const bt_ros34pw2 = TableauRosW(:ros34pw2, (3,4),
+                                [0  0  0  0
+                                 8.7173304301691801e-01  0  0  0
+                                 8.4457060015369423e-01  -1.1299064236484185e-01  0  0
+                                 0  0  1.  0],
+                                [4.3586652150845900e-01  0  0  0
+                                 -8.7173304301691801e-01  4.3586652150845900e-01  0  0
+                                 -9.0338057013044082e-01  5.4180672388095326e-02  4.3586652150845900e-01  0
+                                 2.4212380706095346e-01  -1.2232505839045147e+00  5.4526025533510214e-01  4.3586652150845900e-01],
+                                 [2.4212380706095346e-01  -1.2232505839045147e+00  1.5452602553351020e+00  4.3586652150845900e-01
+                                  3.7810903145819369e-01  -9.6042292212423178e-02  5.0000000000000000e-01  2.1793326075422950e-01]'
+                                  )
+function oderosw_fixed()
+
+end
+
+function oderosw_adapt()
+
+end
+

From b4b3b7bc58493b3a5a5e69541a2e663267163784 Mon Sep 17 00:00:00 2001
From: Mauro Werder <mauro_lc@runbox.com>
Date: Fri, 22 May 2015 16:46:43 +0200
Subject: [PATCH 02/15] fixed step ROSW works

---
 examples/van_der_pol_rosw.jl |  75 +++++++++++++++++++++
 src/ODE.jl                   |   3 +
 src/rosw.jl                  | 123 +++++++++++++++++++++++++++++++----
 test/runtests.jl             |   1 +
 4 files changed, 189 insertions(+), 13 deletions(-)
 create mode 100644 examples/van_der_pol_rosw.jl

diff --git a/examples/van_der_pol_rosw.jl b/examples/van_der_pol_rosw.jl
new file mode 100644
index 000000000..746ec40cf
--- /dev/null
+++ b/examples/van_der_pol_rosw.jl
@@ -0,0 +1,75 @@
+# Van der Pol using DASSL.jl
+using ODE
+abstol = 1e-6
+reltol = 1e-6
+
+const mu = 100.
+
+# Explicit equation
+function vdp(t, y)
+    #  vdp(t, y)
+    #
+    # ydot of van der Pol equation.  (consider using rescaled eq. so
+    # period stays the same for different mu.  Cf. Wanner & Hairer 1991,
+    # p.5)
+    return [y[2], mu^2*((1-y[1]^2) * y[2] - y[1])]
+end
+function Jvdp(t, y)
+    #  Jvdp(t, y, mu)
+    #
+    # Jacobian of vdp
+    return [0 1;
+            -mu^2*(y[2]*2*y[1] + 1)  mu^2*(1-y[1]^2)]
+end
+
+# implicit equation
+function vdp_impl(y, ydot)
+    #  vdp_impl(t, y)
+    #
+    # Implicit van der Pol equation.  (consider using rescaled eq. so
+    # period stays the same for different mu.  Cf. Wanner & Hairer 1991,
+    # p.5)
+    return vdp(0,y) - ydot
+end
+function Jvdp_impl(y, ydot, α)
+    # d vdp_impl /dy + α d vdp/dy
+    #
+    # Jacobian
+    return Jvdp(0,y) - α * eye(2)
+end
+
+###
+# reference as t=2
+##
+refsol = [0.1706167732170483e1, -0.8928097010247975e0]
+
+###
+# Fixed step
+###
+
+nt = 50_000;
+tspan = linspace(0.,2.0,nt)
+y0 = [2.,0.];
+
+t,yout1 = ode4s(vdp, y0, tspan; jacobian=Jvdp)
+@time t,yout1 = ode4s(vdp, y0, tspan; jacobian=Jvdp)
+
+t,yout2 = ode_rosw(vdp_impl, Jvdp_impl, y0, tspan)
+@time t,yout2 = ode_rosw(vdp_impl, Jvdp_impl, y0, tspan)
+
+println("Fixed step: abs error of ode4s vs ref:")
+println(yout1[end]-refsol)
+println("Fixed step: abs error of ode_rosw vs ref:")
+println(yout2[end]-refsol)
+
+# #### adaptive rosw
+# @time yout3, ts, steps, dts, xerrs = rosw_runner_adaptive(
+#                      vdp_impl, Jvdp_impl, [0., 2.], y0;
+#                      reltol=reltol, abstol=abstol, dt0=1e-5, mindt=1e-8)
+
+# check
+
+# using Winston
+# plot(ts, yout3[1,:])
+
+# oplot(tspan, yout2[1,:],"r")
diff --git a/src/ODE.jl b/src/ODE.jl
index a364988d8..2c8287ad9 100644
--- a/src/ODE.jl
+++ b/src/ODE.jl
@@ -423,4 +423,7 @@ const ms_coefficients4 = [ 1      0      0     0
                           -9/24   37/24 -59/24 55/24]
 
 
+## Rosenbrock-Wanner methods
+include("rosw.jl")
+
 end # module ODE
diff --git a/src/rosw.jl b/src/rosw.jl
index 70fe9dfaf..33e2d1cac 100644
--- a/src/rosw.jl
+++ b/src/rosw.jl
@@ -1,6 +1,8 @@
 # Rosenbrock-Wanner methods
 ###########################
 
+export ode_rosw
+
 # Rosenbrock-W methods are typically specified for autonomous DAE:
 # Mẋ = f(x)
 #
@@ -12,6 +14,10 @@
 #
 # The method used here uses transformed equations as done in PETSc.
 
+
+# Tableaus
+##########
+
 immutable TableauRosW{Name, S, T} <: Tableau{Name, S, T}
     order::(@compat(Tuple{Vararg{Int}})) # the order of the methods
     a::Matrix{T}
@@ -24,20 +30,20 @@ immutable TableauRosW{Name, S, T} <: Tableau{Name, S, T}
         @assert isa(Name,Symbol)
         @assert S==size(γ,1)==size(a,2)==size(γ,1)==size(a,2)==size(b,2)
         @assert size(b,1)==length(order)
-        new(order,a,b,c)
+        new(order,a,γ,b)
     end
 end
 function TableauRosW{T}(name::Symbol, order::(@compat(Tuple{Vararg{Int}})),
                    a::Matrix{T}, γ::Matrix{T}, b::Matrix{T})
-    TableauRosW{name,size(b,2),T}(order, a, γ, c)
+    TableauRosW{name,size(b,2),T}(order, a, γ, b)
 end
 function TableauRosW(name::Symbol, order::(@compat(Tuple{Vararg{Int}})), T::Type,
                    a::Matrix, γ::Matrix, b::Matrix)
     TableauRosW{name,size(b,2),T}(order, convert(Matrix{T},a),
-                                  convert(Matrix{T},γ), convert(Matrix{T},c) )
+                                  convert(Matrix{T},γ), convert(Matrix{T},b) )
 end
 conv_field{T,N}(D,a::Array{T,N}) = convert(Array{D,N}, a)
-function Base.convert{Tnew<:Real,Name,S,T}(::Type{Tnew}, tab::TableauRKExplicit{Name,S,T})
+function Base.convert{Tnew<:Real,Name,S,T}(::Type{Tnew}, tab::TableauRosW{Name,S,T})
     # Converts the tableau coefficients to the new type Tnew
     newflds = ()
     @compat for n in fieldnames(tab)
@@ -48,7 +54,7 @@ function Base.convert{Tnew<:Real,Name,S,T}(::Type{Tnew}, tab::TableauRKExplicit{
             newflds = tuple(newflds..., fld)
         end
     end
-    TableauRKExplicit{Name,S,Tnew}(newflds...) # TODO: could this be done more generically in a type-stable way?
+    TableauRosW{Name,S,Tnew}(newflds...) # TODO: could this be done more generically in a type-stable way?
 end
 
 
@@ -62,18 +68,18 @@ immutable TableauRosW_T{Name, S, T} <: Tableau{Name, S, T}
     b::Matrix{T} # this is TableauRosW.b transformed
 end
 function tabletransform{Name,S,T}(rt::TableauRosW{Name,S,T})
-    γii = diag(rt.γ) γ
+    γii = diag(rt.γ)
     γinv = inv(rt.γ)
-    ahat = rt.A * γinv
+    ahat = rt.a * γinv
     bhat = similar(rt.b)
-    bhat[:,1] = squeeze(rt.b[:,1]'*γinv,1)
-    bhat[:,2] = squeeze(rt.b[:,2]'*γinv,1)
-    return RosWTableTrans{Name,S,T}(rt.order, ahat, γii, γinv, bhat)
+    bhat[1,:] = squeeze(rt.b[1,:]*γinv,1)
+    bhat[2,:] = squeeze(rt.b[2,:]*γinv,1)
+    return TableauRosW_T{Name,S,T}(rt.order, ahat, γii, γinv, bhat)
 end
 
 
 ## tableau for ros34pw2
-const bt_ros34pw2 = TableauRosW(:ros34pw2, (3,4),
+const bt_ros34pw2 = TableauRosW(:ros34pw2, (3,4), Float64,
                                 [0  0  0  0
                                  8.7173304301691801e-01  0  0  0
                                  8.4457060015369423e-01  -1.1299064236484185e-01  0  0
@@ -83,13 +89,104 @@ const bt_ros34pw2 = TableauRosW(:ros34pw2, (3,4),
                                  -9.0338057013044082e-01  5.4180672388095326e-02  4.3586652150845900e-01  0
                                  2.4212380706095346e-01  -1.2232505839045147e+00  5.4526025533510214e-01  4.3586652150845900e-01],
                                  [2.4212380706095346e-01  -1.2232505839045147e+00  1.5452602553351020e+00  4.3586652150845900e-01
-                                  3.7810903145819369e-01  -9.6042292212423178e-02  5.0000000000000000e-01  2.1793326075422950e-01]'
+                                  3.7810903145819369e-01  -9.6042292212423178e-02  5.0000000000000000e-01  2.1793326075422950e-01]
                                   )
-function oderosw_fixed()
 
+###################
+# Fixed step solver
+###################
+ode_rosw(fn, Jfn, y0, tspan) = oderosw_fixed(fn, Jfn, y0::AbstractVector, tspan, bt_ros34pw2)
+function oderosw_fixed{N,S}(fn, Jfn, y0::AbstractVector, tspan,
+                            btab::TableauRosW{N,S})
+    # TODO: refactor with oderk_fixed
+    Et, Eyf, Ty, btab = make_consistent_types(fn, y0, tspan, btab)
+    btab = tabletransform(btab)
+    dof = length(y0)
+
+    ys = Array(Ty, length(tspan))
+    allocate!(ys, y0, dof)
+    ys[1] = deepcopy(y0)
+
+    tspan = convert(Vector{Et}, tspan)
+    # work arrays:
+    ks = Array(Ty, S)
+    # allocate!(ks, y0, dof) # no need to allocate as fn is not in-place
+    ytmp = similar(y0, Eyf, dof)
+    for i=1:length(tspan)-1
+        dt = tspan[i+1]-tspan[i]
+        ys[i+1] = rosw_step(fn, Jfn, ys[i], dt, btab, 2)
+    end
+    return tspan, ys
+end
+
+function linsolve(h, hprime, y0)
+    # Does one Newton step, i.e. a linear solve.
+    return y0 - hprime(y0)\h(y0)
 end
 
+
 function oderosw_adapt()
 
 end
 
+function rosw_step(g, gprime, x0, dt,
+                   btab::TableauRosW_T, bt_ind=1)
+    # This takes one step for a ode/dae system defined by
+    # g(x,xdot)=0
+    # gprime(x, xdot, α) = dg/dx + α dg/dxdot
+
+    stages = size(btab.a,1)
+    dof = length(x0)
+    ys  = zeros(eltype(x0), stages, dof)
+
+    # stage solutions
+    jacobian_stale = true
+    hprime_store = zeros(dof,dof)
+
+    cij = (tril(btab.γinv)-diagm(diag(btab.γinv)))/dt
+    for i=1:stages
+        u = zeros(dof)
+        udot = zeros(dof)
+        function h(yi) # length(yi)==dof
+            # this function is Eq 5
+            u[:] = x0 + yi
+            for j=1:i-1
+                for d=1:dof
+                    u[d] += btab.a[i,j]*ys[j,d]
+                end
+            end
+            udot[:] = 1./(dt*btab.γii[i]).*yi  # jed: is this index with the stage?
+            for j=1:i
+                # this is Eq.5-3
+                for d=1:dof
+                    udot[d] += cij[i,j]*ys[j,d]
+                end
+            end
+            g(u, udot)
+        end
+        function hprime(yi)
+            # here we only update the jacobian once per time step
+            if jacobian_stale
+                hprime_store[:,:] = gprime(u, udot, 1./(dt*btab.γii[i]))  # jed: is this index with the stage?
+            end
+            hprime_store
+        end
+        # this is just a linear solve, usually
+#        ys[i,:] = nlsolve(h, zeros(dof), hprime; opts=one_step_only)
+        ys[i,:] = linsolve(h, hprime, zeros(dof))
+#        ys[i,:] = newtonsolve(h, hprime, zeros(dof), maxsteps=1, warn=false)
+        if i==1
+            # calculate jacobian
+            jacobian_stale = false
+        end
+    end
+
+    # completion:  (done twice for error control)
+    x1 = zeros(eltype(x0), length(x0))
+    for i=1:dof
+        for j=1:stages
+            x1[i] += btab.b[bt_ind,j]*ys[j,i]
+        end
+    end
+    return x0 + x1
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 64f57e0b7..86fc105f0 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -23,6 +23,7 @@ solvers = [
            # fixed-step
            ODE.ode4s_s,
            ODE.ode4s_kr,
+#           ODE.ode_rosw,
            # adaptive
            ODE.ode23s]
 

From 136f4d9ef08b4bdee298d483375324823c4248a5 Mon Sep 17 00:00:00 2001
From: Mauro Werder <mauro_lc@runbox.com>
Date: Fri, 22 May 2015 17:07:10 +0200
Subject: [PATCH 03/15] adaptive solver working

---
 examples/van_der_pol_rosw.jl |  41 +++++++----
 src/rosw.jl                  | 129 ++++++++++++++++++++++++++++++++++-
 2 files changed, 155 insertions(+), 15 deletions(-)

diff --git a/examples/van_der_pol_rosw.jl b/examples/van_der_pol_rosw.jl
index 746ec40cf..0a2751a01 100644
--- a/examples/van_der_pol_rosw.jl
+++ b/examples/van_der_pol_rosw.jl
@@ -42,27 +42,44 @@ end
 # reference as t=2
 ##
 refsol = [0.1706167732170483e1, -0.8928097010247975e0]
+y0 = [2.,0.];
+tstart = 0.
+tend = 2.
+
+
 
 ###
 # Fixed step
 ###
+# nt = 50_000;
+# tspan = linspace(tstart, tend, nt)
 
-nt = 50_000;
-tspan = linspace(0.,2.0,nt)
-y0 = [2.,0.];
+# t,yout1 = ode4s(vdp, y0, tspan; jacobian=Jvdp)
+# @time t,yout1 = ode4s(vdp, y0, tspan; jacobian=Jvdp)
+
+# t,yout2 = ode_rosw_fixed(vdp_impl, Jvdp_impl, y0, tspan)
+# @time t,yout2 = ode_rosw_fixed(vdp_impl, Jvdp_impl, y0, tspan)
+
+# println("Fixed step: abs error of ode4s vs ref:")
+# println(yout1[end]-refsol)
+# println("Fixed step: abs error of ode_rosw vs ref:")
+# println(yout2[end]-refsol)
+
+# #### adaptive
+tspan = linspace(tstart, tend, 2)
 
-t,yout1 = ode4s(vdp, y0, tspan; jacobian=Jvdp)
-@time t,yout1 = ode4s(vdp, y0, tspan; jacobian=Jvdp)
+t,yout3 = ode23s(vdp, y0, tspan; jacobian=Jvdp)
+@time t,yout3 = ode23s(vdp, y0, tspan; jacobian=Jvdp)
 
-t,yout2 = ode_rosw(vdp_impl, Jvdp_impl, y0, tspan)
-@time t,yout2 = ode_rosw(vdp_impl, Jvdp_impl, y0, tspan)
+t,yout4 = ode_rosw(vdp_impl, Jvdp_impl, y0, tspan)
+@time t,yout4 = ode_rosw(vdp_impl, Jvdp_impl, y0, tspan)
 
-println("Fixed step: abs error of ode4s vs ref:")
-println(yout1[end]-refsol)
-println("Fixed step: abs error of ode_rosw vs ref:")
-println(yout2[end]-refsol)
+println("Adaptive step: abs error of ode23s vs ref:")
+println(yout3[end]-refsol)
+println("Adaptive step: abs error of ode_rosw vs ref:")
+println(yout4[end]-refsol)
 
-# #### adaptive rosw
+# rosw
 # @time yout3, ts, steps, dts, xerrs = rosw_runner_adaptive(
 #                      vdp_impl, Jvdp_impl, [0., 2.], y0;
 #                      reltol=reltol, abstol=abstol, dt0=1e-5, mindt=1e-8)
diff --git a/src/rosw.jl b/src/rosw.jl
index 33e2d1cac..517262364 100644
--- a/src/rosw.jl
+++ b/src/rosw.jl
@@ -1,7 +1,7 @@
 # Rosenbrock-Wanner methods
 ###########################
 
-export ode_rosw
+export ode_rosw, ode_rosw_fixed
 
 # Rosenbrock-W methods are typically specified for autonomous DAE:
 # Mẋ = f(x)
@@ -95,7 +95,7 @@ const bt_ros34pw2 = TableauRosW(:ros34pw2, (3,4), Float64,
 ###################
 # Fixed step solver
 ###################
-ode_rosw(fn, Jfn, y0, tspan) = oderosw_fixed(fn, Jfn, y0::AbstractVector, tspan, bt_ros34pw2)
+ode_rosw_fixed(fn, Jfn, y0, tspan) = oderosw_fixed(fn, Jfn, y0, tspan, bt_ros34pw2)
 function oderosw_fixed{N,S}(fn, Jfn, y0::AbstractVector, tspan,
                             btab::TableauRosW{N,S})
     # TODO: refactor with oderk_fixed
@@ -124,8 +124,131 @@ function linsolve(h, hprime, y0)
     return y0 - hprime(y0)\h(y0)
 end
 
+ode_rosw(fn, Jfn, y0, tspan;kwargs...) = oderosw_adapt(fn, Jfn, y0, tspan, bt_ros34pw2; kwargs...)
+function oderosw_adapt{N,S}(fn, Jfn, y0::AbstractVector, tspan, btab::TableauRosW{N,S};
+                            reltol = 1.0e-5, abstol = 1.0e-8,
+                            norm=Base.norm,
+                            minstep=abs(tspan[end] - tspan[1])/1e9,
+                            maxstep=abs(tspan[end] - tspan[1])/2.5,
+                            initstep=0.,
+                            points=:all
+                            )
+    fn_expl = (t,y)->fn(y, y*0)
+    Et, Eyf, Ty, btab = make_consistent_types(fn, y0, tspan, btab)
+    btab = tabletransform(btab)
+    # parameters
+    order = minimum(btab.order)
+    timeout_const = 5 # after step reduction do not increase step for
+                      # timeout_const steps
+
+    ## Initialization
+    dof = length(y0)
+    tspan = convert(Vector{Et}, tspan)
+    t = tspan[1]
+    tstart = tspan[1]
+    tend = tspan[end]
+
+    # work arrays:
+    y      = similar(y0, Eyf, dof)      # y at time t
+    y[:]   = y0
+    ytrial = similar(y0, Eyf, dof) # trial solution at time t+dt
+    yerr   = similar(y0, Eyf, dof) # error of trial solution
+    ks = Array(Ty, S)
+    # allocate!(ks, y0, dof) # no need to allocate as fn is not in-place
+    ytmp   = similar(y0, Eyf, dof)
+
+    # output ys
+    nsteps_fixed = length(tspan) # these are always output
+    ys = Array(Ty, nsteps_fixed)
+    allocate!(ys, y0, dof)
+    ys[1] = y0
+
+    # Option points determines where solution is returned:
+    if points==:all
+        tspan_fixed = tspan
+        tspan = Et[tstart]
+        iter_fixed = 2 # index into tspan_fixed
+        sizehint!(tspan, nsteps_fixed)
+    elseif points!=:specified
+        error("Unrecognized option points==$points")
+    end
+    # Time
+    dt, tdir, ks[1] = hinit(fn_expl, y, tstart, tend, order, reltol, abstol) # sets ks[1]=f0
+    if initstep!=0
+        dt = sign(initstep)==tdir ? initstep : error("initstep has wrong sign.")
+    end
+    # Diagnostics
+    dts = Et[]
+    errs = Float64[]
+    steps = [0,0]  # [accepted, rejected]
 
-function oderosw_adapt()
+    ## Integration loop
+    laststep = false
+    timeout = 0 # for step-control
+    iter = 2 # the index into tspan and ys
+    while true
+        ytrial[:] = rosw_step(fn, Jfn, y, dt, btab, 1)
+        yerr[:] = ytrial - rosw_step(fn, Jfn, y, dt, btab, 2)
+        err, newdt, timeout = stepsize_hw92!(dt, tdir, y, ytrial, yerr, order, timeout,
+                                            dof, abstol, reltol, maxstep, norm)
+
+        if err<=1.0 # accept step
+            # diagnostics
+            steps[1] +=1
+            push!(dts, dt)
+            push!(errs, err)
+
+            # Output:
+            f0 = ks[1]
+            f1 = fn_expl(t+dt, ytrial)
+            if points==:specified
+                # interpolate onto given output points
+                while iter-1<nsteps_fixed && (tdir*tspan[iter]<tdir*(t+dt) || laststep) # output at all new times which are < t+dt
+                    hermite_interp!(ys[iter], tspan[iter], t, dt, y, ytrial, f0, f1) # TODO: 3rd order only!
+                    iter += 1
+                end
+            else
+                # first interpolate onto given output points
+                while iter_fixed-1<nsteps_fixed && tdir*t<tdir*tspan_fixed[iter_fixed]<tdir*(t+dt) # output at all new times which are < t+dt
+                    yout = hermite_interp(tspan_fixed[iter_fixed], t, dt, y, ytrial, f0, f1)
+                    index_or_push!(ys, iter, yout) # TODO: 3rd order only!
+                    push!(tspan, tspan_fixed[iter_fixed])
+                    iter_fixed += 1
+                    iter += 1
+                end
+                # but also output every step taken
+                index_or_push!(ys, iter, copy(ytrial))
+                push!(tspan, t+dt)
+                iter += 1
+            end
+            ks[1] = f1 # load ks[1]==f0 for next step
+
+            # Break if this was the last step:
+            laststep && break
+
+            # Swap bindings of y and ytrial, avoids one copy
+            y, ytrial = ytrial, y
+
+            # Update t to the time at the end of current step:
+            t += dt
+            dt = newdt
+
+            # Hit end point exactly if next step within 1% of end:
+            if tdir*(t+dt*1.01) >= tdir*tend
+                dt = tend-t
+                laststep = true # next step is the last, if it succeeds
+            end
+        elseif abs(newdt)<minstep  # minimum step size reached, break
+            println("Warning: dt < minstep.  Stopping.")
+            break
+        else # redo step with smaller dt
+            laststep = false
+            steps[2] +=1
+            dt = newdt
+            timeout = timeout_const
+        end
+    end
+    return tspan, ys
 
 end
 

From a23fa52f62c86d6a473b11bb549f265eda39e4f4 Mon Sep 17 00:00:00 2001
From: Mauro Werder <mauro_lc@runbox.com>
Date: Mon, 25 May 2015 17:23:39 +0200
Subject: [PATCH 04/15] first performance updates

---
 examples/van_der_pol_rosw.jl |  2 +
 src/rosw.jl                  | 98 ++++++++++++++++++++++++++++++++----
 2 files changed, 89 insertions(+), 11 deletions(-)

diff --git a/examples/van_der_pol_rosw.jl b/examples/van_der_pol_rosw.jl
index 0a2751a01..d24f4444a 100644
--- a/examples/van_der_pol_rosw.jl
+++ b/examples/van_der_pol_rosw.jl
@@ -69,9 +69,11 @@ tend = 2.
 tspan = linspace(tstart, tend, 2)
 
 t,yout3 = ode23s(vdp, y0, tspan; jacobian=Jvdp)
+gc()
 @time t,yout3 = ode23s(vdp, y0, tspan; jacobian=Jvdp)
 
 t,yout4 = ode_rosw(vdp_impl, Jvdp_impl, y0, tspan)
+gc()
 @time t,yout4 = ode_rosw(vdp_impl, Jvdp_impl, y0, tspan)
 
 println("Adaptive step: abs error of ode23s vs ref:")
diff --git a/src/rosw.jl b/src/rosw.jl
index 517262364..64252f3f9 100644
--- a/src/rosw.jl
+++ b/src/rosw.jl
@@ -63,9 +63,11 @@ end
 immutable TableauRosW_T{Name, S, T} <: Tableau{Name, S, T}
     order::(@compat(Tuple{Vararg{Int}})) # the order of the methods
     a::Matrix{T}  # this is TableauRosW.a transformed
-    γii::Vector{T}
     γinv::Matrix{T}
     b::Matrix{T} # this is TableauRosW.b transformed
+    # derived quantities:
+    γii::Vector{T}
+    c::Matrix{T} # = (tril(btab.γinv)-diagm(diag(btab.γinv)))
 end
 function tabletransform{Name,S,T}(rt::TableauRosW{Name,S,T})
     γii = diag(rt.γ)
@@ -74,7 +76,8 @@ function tabletransform{Name,S,T}(rt::TableauRosW{Name,S,T})
     bhat = similar(rt.b)
     bhat[1,:] = squeeze(rt.b[1,:]*γinv,1)
     bhat[2,:] = squeeze(rt.b[2,:]*γinv,1)
-    return TableauRosW_T{Name,S,T}(rt.order, ahat, γii, γinv, bhat)
+    c = (tril(γinv)-diagm(diag(γinv)))
+    return TableauRosW_T{Name,S,T}(rt.order, ahat, γinv, bhat, γii, c)
 end
 
 
@@ -114,7 +117,7 @@ function oderosw_fixed{N,S}(fn, Jfn, y0::AbstractVector, tspan,
     ytmp = similar(y0, Eyf, dof)
     for i=1:length(tspan)-1
         dt = tspan[i+1]-tspan[i]
-        ys[i+1] = rosw_step(fn, Jfn, ys[i], dt, btab, 2)
+        ys[i+1] = rosw_step!(fn, Jfn, ys[i], dt, btab, 2)
     end
     return tspan, ys
 end
@@ -133,6 +136,9 @@ function oderosw_adapt{N,S}(fn, Jfn, y0::AbstractVector, tspan, btab::TableauRos
                             initstep=0.,
                             points=:all
                             )
+    # TODO: refactor with oderk_adapt
+
+    ## Figure types
     fn_expl = (t,y)->fn(y, y*0)
     Et, Eyf, Ty, btab = make_consistent_types(fn, y0, tspan, btab)
     btab = tabletransform(btab)
@@ -141,7 +147,7 @@ function oderosw_adapt{N,S}(fn, Jfn, y0::AbstractVector, tspan, btab::TableauRos
     timeout_const = 5 # after step reduction do not increase step for
                       # timeout_const steps
 
-    ## Initialization
+    ## Setup
     dof = length(y0)
     tspan = convert(Vector{Et}, tspan)
     t = tspan[1]
@@ -154,8 +160,10 @@ function oderosw_adapt{N,S}(fn, Jfn, y0::AbstractVector, tspan, btab::TableauRos
     ytrial = similar(y0, Eyf, dof) # trial solution at time t+dt
     yerr   = similar(y0, Eyf, dof) # error of trial solution
     ks = Array(Ty, S)
-    # allocate!(ks, y0, dof) # no need to allocate as fn is not in-place
-    ytmp   = similar(y0, Eyf, dof)
+    allocate!(ks, y0, dof) # no need to allocate as fn is not in-place
+    hprime_store = zeros(Eyf,dof,dof)
+    u = zeros(Eyf,dof)
+    udot = zeros(Eyf,dof)
 
     # output ys
     nsteps_fixed = length(tspan) # these are always output
@@ -187,11 +195,21 @@ function oderosw_adapt{N,S}(fn, Jfn, y0::AbstractVector, tspan, btab::TableauRos
     timeout = 0 # for step-control
     iter = 2 # the index into tspan and ys
     while true
-        ytrial[:] = rosw_step(fn, Jfn, y, dt, btab, 1)
-        yerr[:] = ytrial - rosw_step(fn, Jfn, y, dt, btab, 2)
+        ytrial[:] = rosw_step!(fn, Jfn, y, dt, btab, ks, hprime_store, u, udot, 1)
+        # completion again, see line 927 of
+        # http://www.mcs.anl.gov/petsc/petsc-current/src/ts/impls/rosw/rosw.c.html#TSROSW
+        yerr[:] = 0.0
+        for j=1:S
+            for i=1:dof
+                yerr[i] += (btab.b[2,j]-btab.b[1,j])*ks[j][i]
+            end
+        end
+
+        # ytrial[:] = rosw_step(fn, Jfn, y, dt, btab, 1)
+        # yerr[:] = ytrial - rosw_step(fn, Jfn, y, dt, btab, 2)
+
         err, newdt, timeout = stepsize_hw92!(dt, tdir, y, ytrial, yerr, order, timeout,
                                             dof, abstol, reltol, maxstep, norm)
-
         if err<=1.0 # accept step
             # diagnostics
             steps[1] +=1
@@ -266,7 +284,6 @@ function rosw_step(g, gprime, x0, dt,
     jacobian_stale = true
     hprime_store = zeros(dof,dof)
 
-    cij = (tril(btab.γinv)-diagm(diag(btab.γinv)))/dt
     for i=1:stages
         u = zeros(dof)
         udot = zeros(dof)
@@ -282,7 +299,7 @@ function rosw_step(g, gprime, x0, dt,
             for j=1:i
                 # this is Eq.5-3
                 for d=1:dof
-                    udot[d] += cij[i,j]*ys[j,d]
+                    udot[d] += btab.c[i,j]/dt*ys[j,d]
                 end
             end
             g(u, udot)
@@ -313,3 +330,62 @@ function rosw_step(g, gprime, x0, dt,
     end
     return x0 + x1
 end
+
+function rosw_step!(g, gprime, x0, dt,
+                   btab::TableauRosW_T, ks, hprime_store, u, udot, bt_ind=1)
+    # This takes one step for a ode/dae system defined by
+    # g(x,xdot)=0
+    # gprime(x, xdot, α) = dg/dx + α dg/dxdot
+
+    stages = size(btab.a,1)
+    dof = length(x0)
+
+    # stage solutions
+    jacobian_stale = true
+    
+    # calculate kappa
+    for i=1:stages
+        function h(yi) # length(yi)==dof
+            # this function is Eq 5
+            u[:] = x0
+            u[:] += yi
+            for j=1:i-1
+                for d=1:dof
+                    u[d] += btab.a[i,j]*ks[j][d]
+                end
+            end
+            udot[:] = 1./(dt*btab.γii[i]).*yi  # jed: is this index with the stage?
+            for j=1:i
+                # this is Eq.5-3
+                for d=1:dof
+                    udot[d] += btab.c[i,j]/dt*ks[j][d]
+                end
+            end
+            g(u, udot)
+        end
+        function hprime(yi)
+            # here we only update the jacobian once per time step
+            if jacobian_stale
+                hprime_store[:,:] = gprime(u, udot, 1./(dt*btab.γii[i]))  # jed: is this index with the stage?
+            end
+            hprime_store
+        end
+        # this is just a linear solve, usually
+#        ks[i,:] = nlsolve(h, zeros(dof), hprime; opts=one_step_only)
+        ks[i][:] = linsolve(h, hprime, zeros(dof))
+#        ks[i,:] = newtonsolve(h, hprime, zeros(dof), maxsteps=1, warn=false)
+        if i==1
+            # calculate jacobian
+            jacobian_stale = false
+        end
+    end
+
+    # completion:  (done twice for error control)
+    x1 = zeros(eltype(x0), length(x0))
+    for j=1:stages
+        for i=1:dof
+            x1[i] += btab.b[bt_ind,j]*ks[j][i]
+        end
+    end
+    return x0 + x1
+end

From 79291063d60b95920b737a700b8c18d1f43d1ca2 Mon Sep 17 00:00:00 2001
From: Mauro Werder <mauro_lc@runbox.com>
Date: Tue, 26 May 2015 19:16:41 +0200
Subject: [PATCH 05/15] Tidier and running faster. Now onto in-place functions

---
 examples/van_der_pol_rosw.jl |  26 ++++
 src/rosw.jl                  | 294 +++++++++++++++--------------------
 2 files changed, 151 insertions(+), 169 deletions(-)

diff --git a/examples/van_der_pol_rosw.jl b/examples/van_der_pol_rosw.jl
index d24f4444a..3663805d0 100644
--- a/examples/van_der_pol_rosw.jl
+++ b/examples/van_der_pol_rosw.jl
@@ -38,6 +38,32 @@ function Jvdp_impl(y, ydot, α)
     return Jvdp(0,y) - α * eye(2)
 end
 
+# implicit inplace equation
+function vdp_impl!(res, y, ydot)
+    #  vdp_impl(t, y)
+    #
+    # Implicit van der Pol equation.  (consider using rescaled eq. so
+    # period stays the same for different mu.  Cf. Wanner & Hairer 1991,
+    # p.5)
+    #
+    # Note, that ydot can be used as res here.
+    res[1] = y[2] - ydot[1]
+    res[2] = mu^2*((1-y[1]^2) * y[2] - y[1]) - ydot[2]
+    return nothing
+end
+function Jvdp_impl!(res, y, ydot, α)
+    # d vdp_impl /dy + α d vdp/dy
+    #
+    # Jacobian
+
+    res[1,1] = 0 - α
+    res[1,2] = 1
+    res[2,1] = -mu^2*(y[2]*2*y[1] + 1)
+    res[2,2] = mu^2*(1-y[1]^2) - α
+    return nothing
+end
+
+
 ###
 # reference as t=2
 ##
diff --git a/src/rosw.jl b/src/rosw.jl
index 64252f3f9..20e983ea5 100644
--- a/src/rosw.jl
+++ b/src/rosw.jl
@@ -1,6 +1,8 @@
 # Rosenbrock-Wanner methods
 ###########################
-
+#
+# Main references: Wanner & Hairrere 1996, PETSc http://www.mcs.anl.gov/petsc/petsc-current/docs/manualpages/TS/TSROSW.html
+@show "NEW"
 export ode_rosw, ode_rosw_fixed
 
 # Rosenbrock-W methods are typically specified for autonomous DAE:
@@ -66,17 +68,21 @@ immutable TableauRosW_T{Name, S, T} <: Tableau{Name, S, T}
     γinv::Matrix{T}
     b::Matrix{T} # this is TableauRosW.b transformed
     # derived quantities:
-    γii::Vector{T}
+    γii::T
     c::Matrix{T} # = (tril(btab.γinv)-diagm(diag(btab.γinv)))
 end
 function tabletransform{Name,S,T}(rt::TableauRosW{Name,S,T})
-    γii = diag(rt.γ)
+    # the code only works if
+    if !all(x->x==rt.γ[1],diag(rt.γ))
+        error("This Rosenbrock implementation only works for tableaus with γ_ii==γ_jj for all i,j.")
+    end
+    γii = rt.γ[1,1]
     γinv = inv(rt.γ)
     ahat = rt.a * γinv
     bhat = similar(rt.b)
     bhat[1,:] = squeeze(rt.b[1,:]*γinv,1)
     bhat[2,:] = squeeze(rt.b[2,:]*γinv,1)
-    c = (tril(γinv)-diagm(diag(γinv)))
+    c = (tril(γinv)-diagm(diag(γinv))) # negative of W&H definition
     return TableauRosW_T{Name,S,T}(rt.order, ahat, γinv, bhat, γii, c)
 end
 
@@ -98,37 +104,33 @@ const bt_ros34pw2 = TableauRosW(:ros34pw2, (3,4), Float64,
 ###################
 # Fixed step solver
 ###################
-ode_rosw_fixed(fn, Jfn, y0, tspan) = oderosw_fixed(fn, Jfn, y0, tspan, bt_ros34pw2)
-function oderosw_fixed{N,S}(fn, Jfn, y0::AbstractVector, tspan,
+ode_rosw_fixed(fn, Jfn, x0, tspan) = oderosw_fixed(fn, Jfn, x0, tspan, bt_ros34pw2)
+function oderosw_fixed{N,S}(fn, Jfn, x0::AbstractVector, tspan,
                             btab::TableauRosW{N,S})
     # TODO: refactor with oderk_fixed
-    Et, Eyf, Ty, btab = make_consistent_types(fn, y0, tspan, btab)
+    Et, Exf, Tx, btab = make_consistent_types(fn, x0, tspan, btab)
     btab = tabletransform(btab)
-    dof = length(y0)
+    dof = length(x0)
 
-    ys = Array(Ty, length(tspan))
-    allocate!(ys, y0, dof)
-    ys[1] = deepcopy(y0)
+    xs = Array(Tx, length(tspan))
+    allocate!(xs, x0, dof)
+    xs[1] = deepcopy(x0)
 
     tspan = convert(Vector{Et}, tspan)
     # work arrays:
-    ks = Array(Ty, S)
-    # allocate!(ks, y0, dof) # no need to allocate as fn is not in-place
-    ytmp = similar(y0, Eyf, dof)
+    ks = Array(Tx, S)
+    # allocate!(ks, x0, dof) # no need to allocate as fn is not in-place
+    xtmp = similar(x0, Exf, dof)
     for i=1:length(tspan)-1
         dt = tspan[i+1]-tspan[i]
-        ys[i+1] = rosw_step!(fn, Jfn, ys[i], dt, btab, 2)
+        xs[i+1] = rosw_step!(fn, Jfn, xs[i], dt, btab, 2)
     end
-    return tspan, ys
+    return tspan, xs
 end
 
-function linsolve(h, hprime, y0)
-    # Does one Newton step, i.e. a linear solve.
-    return y0 - hprime(y0)\h(y0)
-end
 
-ode_rosw(fn, Jfn, y0, tspan;kwargs...) = oderosw_adapt(fn, Jfn, y0, tspan, bt_ros34pw2; kwargs...)
-function oderosw_adapt{N,S}(fn, Jfn, y0::AbstractVector, tspan, btab::TableauRosW{N,S};
+ode_rosw(fn, Jfn, x0, tspan;kwargs...) = oderosw_adapt(fn, Jfn, x0, tspan, bt_ros34pw2; kwargs...)
+function oderosw_adapt{N,S}(fn, Jfn, x0::AbstractVector, tspan, btab::TableauRosW{N,S};
                             reltol = 1.0e-5, abstol = 1.0e-8,
                             norm=Base.norm,
                             minstep=abs(tspan[end] - tspan[1])/1e9,
@@ -139,8 +141,8 @@ function oderosw_adapt{N,S}(fn, Jfn, y0::AbstractVector, tspan, btab::TableauRos
     # TODO: refactor with oderk_adapt
 
     ## Figure types
-    fn_expl = (t,y)->fn(y, y*0)
-    Et, Eyf, Ty, btab = make_consistent_types(fn, y0, tspan, btab)
+    fn_expl = (t,x)->fn(x, x*0)
+    Et, Exf, Tx, btab = make_consistent_types(fn, x0, tspan, btab)
     btab = tabletransform(btab)
     # parameters
     order = minimum(btab.order)
@@ -148,28 +150,30 @@ function oderosw_adapt{N,S}(fn, Jfn, y0::AbstractVector, tspan, btab::TableauRos
                       # timeout_const steps
 
     ## Setup
-    dof = length(y0)
+    dof = length(x0)
     tspan = convert(Vector{Et}, tspan)
     t = tspan[1]
     tstart = tspan[1]
     tend = tspan[end]
 
     # work arrays:
-    y      = similar(y0, Eyf, dof)      # y at time t
-    y[:]   = y0
-    ytrial = similar(y0, Eyf, dof) # trial solution at time t+dt
-    yerr   = similar(y0, Eyf, dof) # error of trial solution
-    ks = Array(Ty, S)
-    allocate!(ks, y0, dof) # no need to allocate as fn is not in-place
-    hprime_store = zeros(Eyf,dof,dof)
-    u = zeros(Eyf,dof)
-    udot = zeros(Eyf,dof)
-
-    # output ys
+    x      = similar(x0, Exf, dof) # x at time t (time at beginning of step)
+    x[:]   = x0                    # fill with IC
+    xtrial = similar(x0, Exf, dof) # trial solution at time t+dt
+    xerr   = similar(x0, Exf, dof) # error of trial solution
+    k = Array(Tx, S) # stage variables
+    allocate!(k, x0, dof)
+    ks = zeros(Exf,dof) # work vector for one k
+    h_store = zeros(Exf,dof) # work vector 
+    jac_store = zeros(Exf,dof,dof) # Jacobian storage
+    u = zeros(Exf,dof)    # work vector 
+    udot = zeros(Exf,dof) # work vector 
+
+    # output xs
     nsteps_fixed = length(tspan) # these are always output
-    ys = Array(Ty, nsteps_fixed)
-    allocate!(ys, y0, dof)
-    ys[1] = y0
+    xs = Array(Tx, nsteps_fixed)
+    allocate!(xs, x0, dof)
+    xs[1] = x0
 
     # Option points determines where solution is returned:
     if points==:all
@@ -181,7 +185,7 @@ function oderosw_adapt{N,S}(fn, Jfn, y0::AbstractVector, tspan, btab::TableauRos
         error("Unrecognized option points==$points")
     end
     # Time
-    dt, tdir, ks[1] = hinit(fn_expl, y, tstart, tend, order, reltol, abstol) # sets ks[1]=f0
+    dt, tdir, k[1] = hinit(fn_expl, x, tstart, tend, order, reltol, abstol) # sets k[1]=f0
     if initstep!=0
         dt = sign(initstep)==tdir ? initstep : error("initstep has wrong sign.")
     end
@@ -193,22 +197,19 @@ function oderosw_adapt{N,S}(fn, Jfn, y0::AbstractVector, tspan, btab::TableauRos
     ## Integration loop
     laststep = false
     timeout = 0 # for step-control
-    iter = 2 # the index into tspan and ys
+    iter = 2 # the index into tspan and xs
     while true
-        ytrial[:] = rosw_step!(fn, Jfn, y, dt, btab, ks, hprime_store, u, udot, 1)
-        # completion again, see line 927 of
+        rosw_step!(xtrial, fn, Jfn, x, dt, dof, btab,
+                   k, h_store, jac_store, ks, u, udot, 1)
+        # Completion again for embedded method, see line 927 of
         # http://www.mcs.anl.gov/petsc/petsc-current/src/ts/impls/rosw/rosw.c.html#TSROSW
-        yerr[:] = 0.0
-        for j=1:S
-            for i=1:dof
-                yerr[i] += (btab.b[2,j]-btab.b[1,j])*ks[j][i]
+        xerr[:] = 0.0
+        for s=1:S
+            for d=1:dof
+                xerr[d] += (btab.b[2,s]-btab.b[1,s])*k[s][d]
             end
         end
-
-        # ytrial[:] = rosw_step(fn, Jfn, y, dt, btab, 1)
-        # yerr[:] = ytrial - rosw_step(fn, Jfn, y, dt, btab, 2)
-
-        err, newdt, timeout = stepsize_hw92!(dt, tdir, y, ytrial, yerr, order, timeout,
+        err, newdt, timeout = stepsize_hw92!(dt, tdir, x, xtrial, xerr, order, timeout,
                                             dof, abstol, reltol, maxstep, norm)
         if err<=1.0 # accept step
             # diagnostics
@@ -217,35 +218,35 @@ function oderosw_adapt{N,S}(fn, Jfn, y0::AbstractVector, tspan, btab::TableauRos
             push!(errs, err)
 
             # Output:
-            f0 = ks[1]
-            f1 = fn_expl(t+dt, ytrial)
+            f0 = k[1]
+            f1 = fn_expl(t+dt, xtrial)
             if points==:specified
                 # interpolate onto given output points
                 while iter-1<nsteps_fixed && (tdir*tspan[iter]<tdir*(t+dt) || laststep) # output at all new times which are < t+dt
-                    hermite_interp!(ys[iter], tspan[iter], t, dt, y, ytrial, f0, f1) # TODO: 3rd order only!
+                    hermite_interp!(xs[iter], tspan[iter], t, dt, x, xtrial, f0, f1) # TODO: 3rd order only!
                     iter += 1
                 end
             else
                 # first interpolate onto given output points
                 while iter_fixed-1<nsteps_fixed && tdir*t<tdir*tspan_fixed[iter_fixed]<tdir*(t+dt) # output at all new times which are < t+dt
-                    yout = hermite_interp(tspan_fixed[iter_fixed], t, dt, y, ytrial, f0, f1)
-                    index_or_push!(ys, iter, yout) # TODO: 3rd order only!
+                    xout = hermite_interp(tspan_fixed[iter_fixed], t, dt, x, xtrial, f0, f1)
+                    index_or_push!(xs, iter, xout) # TODO: 3rd order only!
                     push!(tspan, tspan_fixed[iter_fixed])
                     iter_fixed += 1
                     iter += 1
                 end
                 # but also output every step taken
-                index_or_push!(ys, iter, copy(ytrial))
+                index_or_push!(xs, iter, copy(xtrial))
                 push!(tspan, t+dt)
                 iter += 1
             end
-            ks[1] = f1 # load ks[1]==f0 for next step
+            k[1] = f1 # load k[1]==f0 for next step
 
             # Break if this was the last step:
             laststep && break
 
-            # Swap bindings of y and ytrial, avoids one copy
-            y, ytrial = ytrial, y
+            # Swap bindings of x and xtrial, avoids one copy
+            x, xtrial = xtrial, x
 
             # Update t to the time at the end of current step:
             t += dt
@@ -266,126 +267,81 @@ function oderosw_adapt{N,S}(fn, Jfn, y0::AbstractVector, tspan, btab::TableauRos
             timeout = timeout_const
         end
     end
-    return tspan, ys
+    return tspan, xs
 
 end
-
-function rosw_step(g, gprime, x0, dt,
-                   btab::TableauRosW_T, bt_ind=1)
+function rosw_step!{N,S}(xtrial, g, gprime, x, dt, dof, btab::TableauRosW_T{N,S},
+                         k, h_store, jac_store, ks, u, udot, bt_ind=1)
     # This takes one step for a ode/dae system defined by
     # g(x,xdot)=0
     # gprime(x, xdot, α) = dg/dx + α dg/dxdot
 
-    stages = size(btab.a,1)
-    dof = length(x0)
-    ys  = zeros(eltype(x0), stages, dof)
-
-    # stage solutions
-    jacobian_stale = true
-    hprime_store = zeros(dof,dof)
+    # first step
+    # ks[:] = 0 assumes ks==0
+    s = 1
+    h!(h_store, ks, u, udot, g, x, btab, dt, k, s, dof)
+    # It's sufficient to only update the Jacobian once per time step:
+    # (Note that this uses u & udot calculated in h! above.)
+    jac, tmp = hprime!(jac_store, x, gprime, u, udot, btab, dt)
+    
+    # calculate k
+    for s=1:S-1
+        # first step of Newton iteration with guess ks==0
+        k[s][:] = ks - jac\h_store # TODO use A_ldiv_B!
 
-    for i=1:stages
-        u = zeros(dof)
-        udot = zeros(dof)
-        function h(yi) # length(yi)==dof
-            # this function is Eq 5
-            u[:] = x0 + yi
-            for j=1:i-1
-                for d=1:dof
-                    u[d] += btab.a[i,j]*ys[j,d]
-                end
-            end
-            udot[:] = 1./(dt*btab.γii[i]).*yi  # jed: is this index with the stage?
-            for j=1:i
-                # this is Eq.5-3
-                for d=1:dof
-                    udot[d] += btab.c[i,j]/dt*ys[j,d]
-                end
-            end
-            g(u, udot)
-        end
-        function hprime(yi)
-            # here we only update the jacobian once per time step
-            if jacobian_stale
-                hprime_store[:,:] = gprime(u, udot, 1./(dt*btab.γii[i]))  # jed: is this index with the stage?
-            end
-            hprime_store
-        end
-        # this is just a linear solve, usually
-#        ys[i,:] = nlsolve(h, zeros(dof), hprime; opts=one_step_only)
-        ys[i,:] = linsolve(h, hprime, zeros(dof))
-#        ys[i,:] = newtonsolve(h, hprime, zeros(dof), maxsteps=1, warn=false)
-        if i==1
-            # calculate jacobian
-            jacobian_stale = false
-        end
+        h!(h_store, ks, u, udot, g, x, btab, dt, k, s+1, dof)
     end
-
-    # completion:  (done twice for error control)
-    x1 = zeros(eltype(x0), length(x0))
-    for i=1:dof
-        for j=1:stages
-            x1[i] += btab.b[bt_ind,j]*ys[j,i]
+    # last step
+    k[S][:] = ks - jac\h_store
+
+    # completion:
+    xtrial[:] = x
+    for s=1:S
+        for d=1:dof
+            xtrial[d] += btab.b[bt_ind,s]*k[s][d]
         end
     end
-    return x0 + x1
+    return nothing
 end
-
-function rosw_step!(g, gprime, x0, dt,
-                   btab::TableauRosW_T, ks, hprime_store, u, udot, bt_ind=1)
-    # This takes one step for a ode/dae system defined by
-    # g(x,xdot)=0
-    # gprime(x, xdot, α) = dg/dx + α dg/dxdot
-
-    stages = size(btab.a,1)
-    dof = length(x0)
-
-    # stage solutions
-    jacobian_stale = true
-    
-    # calculate kappa
-    for i=1:stages
-        function h(yi) # length(yi)==dof
-            # this function is Eq 5
-            u[:] = x0
-            u[:] += yi
-            for j=1:i-1
-                for d=1:dof
-                    u[d] += btab.a[i,j]*ks[j][d]
-                end
-            end
-            udot[:] = 1./(dt*btab.γii[i]).*yi  # jed: is this index with the stage?
-            for j=1:i
-                # this is Eq.5-3
-                for d=1:dof
-                    udot[d] += btab.c[i,j]/dt*ks[j][d]
-                end
-            end
-            g(u, udot)
-        end
-        function hprime(yi)
-            # here we only update the jacobian once per time step
-            if jacobian_stale
-                hprime_store[:,:] = gprime(u, udot, 1./(dt*btab.γii[i]))  # jed: is this index with the stage?
-            end
-            hprime_store
-        end
-        # this is just a linear solve, usually
-#        ks[i,:] = nlsolve(h, zeros(dof), hprime; opts=one_step_only)
-        ks[i][:] = linsolve(h, hprime, zeros(dof))
-#        ks[i,:] = newtonsolve(h, hprime, zeros(dof), maxsteps=1, warn=false)
-        if i==1
-            # calculate jacobian
-            jacobian_stale = false
-        end
+function h!(res, ks, u, udot, g, x, btab, dt, k, s, dof)
+    # h(ks)=0 to be solved for ks.
+    #
+    # Calculates h(ks) and h'(ks) (if s==1).
+    # Modifies its first 3 arguments.
+    #
+    # ks -- guess for k[s] (usually==0) AND result g(u,udot)
+    # u, udot -- work arrays for first and second argument to g
+    # g -- obj function
+    # x -- current state
+    # btab
+    # k -- stage vars (jed's yi)
+    # s -- current stage to calculate
+    # dof
+    #
+    # TODO: could re-use ks for res
+
+    # stage independent and first stage:
+    ss = 1
+    for d=1:dof
+        u[d] = (ks[d] + x[d]
+                + btab.a[s,ss]*k[ss][d])
+        udot[d] = (1/(dt*btab.γii).*ks[d] # usually ==0 as ks==0
+                   + btab.c[s,ss]/dt*k[ss][d])
     end
-
-    # completion:  (done twice for error control)
-    x1 = zeros(eltype(x0), length(x0))
-    for j=1:stages
-        for i=1:dof
-            x1[i] += btab.b[bt_ind,j]*ks[j][i]
+    # later stages:
+    for ss=2:s-1
+        for d=1:dof
+            u[d] += btab.a[s,ss]*k[ss][d]
+            udot[d] += btab.c[s,ss]/dt*k[ss][d]
         end
     end
-    return x0 + x1
+    res[:] = g(u, udot)
+    return nothing
+end
+function hprime!(res, xi, gprime, u, udot, btab, dt)
+    # The Jacobian of h!  Note that u and udot should be calculated by
+    # running h! first.
+    res[:,:] = gprime(u, udot, 1./(dt*btab.γii))
+    tmp = copy(res)
+    return lufact!(res), tmp
 end

From ed19a7c889671da41e7079b834a77213891313fe Mon Sep 17 00:00:00 2001
From: Mauro Werder <mauro_lc@runbox.com>
Date: Tue, 26 May 2015 21:58:19 +0200
Subject: [PATCH 06/15] updated to work with inplace functions

---
 examples/van_der_pol_rosw.jl | 20 +++++++++++--
 src/rosw.jl                  | 55 ++++++++++++++++--------------------
 2 files changed, 43 insertions(+), 32 deletions(-)

diff --git a/examples/van_der_pol_rosw.jl b/examples/van_der_pol_rosw.jl
index 3663805d0..75bc32c1d 100644
--- a/examples/van_der_pol_rosw.jl
+++ b/examples/van_der_pol_rosw.jl
@@ -38,6 +38,15 @@ function Jvdp_impl(y, ydot, α)
     return Jvdp(0,y) - α * eye(2)
 end
 
+# Results 24dfdc9cacfd6
+# elapsed time: 0.025767095 seconds (25194952 bytes allocated)
+# elapsed time: 0.00997568 seconds   (6674864 bytes allocated)
+# Adaptive step: abs error of ode23s vs ref:
+# [0.012513592025336528,0.013225223452835055]
+# Adaptive step: abs error of ode_rosw vs ref:
+# [0.012416936355840624,0.013124961519338618]
+
+
 # implicit inplace equation
 function vdp_impl!(res, y, ydot)
     #  vdp_impl(t, y)
@@ -63,6 +72,13 @@ function Jvdp_impl!(res, y, ydot, α)
     return nothing
 end
 
+# elapsed time: 0.02611999 seconds (25194952 bytes allocated)
+# elapsed time: 0.006630583 seconds (4025352 bytes allocated)
+# Adaptive step: abs error of ode23s vs ref:
+# [0.012513592025336528,0.013225223452835055]
+# Adaptive step: abs error of ode_rosw vs ref:
+# [0.012416936355840624,0.013124961519338618]
+
 
 ###
 # reference as t=2
@@ -98,9 +114,9 @@ t,yout3 = ode23s(vdp, y0, tspan; jacobian=Jvdp)
 gc()
 @time t,yout3 = ode23s(vdp, y0, tspan; jacobian=Jvdp)
 
-t,yout4 = ode_rosw(vdp_impl, Jvdp_impl, y0, tspan)
+t,yout4 = ode_rosw(vdp_impl!, Jvdp_impl!, y0, tspan)
 gc()
-@time t,yout4 = ode_rosw(vdp_impl, Jvdp_impl, y0, tspan)
+@time t,yout4 = ode_rosw(vdp_impl!, Jvdp_impl!, y0, tspan)
 
 println("Adaptive step: abs error of ode23s vs ref:")
 println(yout3[end]-refsol)
diff --git a/src/rosw.jl b/src/rosw.jl
index 20e983ea5..03c3a1e0e 100644
--- a/src/rosw.jl
+++ b/src/rosw.jl
@@ -141,7 +141,7 @@ function oderosw_adapt{N,S}(fn, Jfn, x0::AbstractVector, tspan, btab::TableauRos
     # TODO: refactor with oderk_adapt
 
     ## Figure types
-    fn_expl = (t,x)->fn(x, x*0)
+    fn_expl = (t,x)->(out=similar(x); fn(out, x, x*0); out)
     Et, Exf, Tx, btab = make_consistent_types(fn, x0, tspan, btab)
     btab = tabletransform(btab)
     # parameters
@@ -164,7 +164,6 @@ function oderosw_adapt{N,S}(fn, Jfn, x0::AbstractVector, tspan, btab::TableauRos
     k = Array(Tx, S) # stage variables
     allocate!(k, x0, dof)
     ks = zeros(Exf,dof) # work vector for one k
-    h_store = zeros(Exf,dof) # work vector 
     jac_store = zeros(Exf,dof,dof) # Jacobian storage
     u = zeros(Exf,dof)    # work vector 
     udot = zeros(Exf,dof) # work vector 
@@ -200,7 +199,7 @@ function oderosw_adapt{N,S}(fn, Jfn, x0::AbstractVector, tspan, btab::TableauRos
     iter = 2 # the index into tspan and xs
     while true
         rosw_step!(xtrial, fn, Jfn, x, dt, dof, btab,
-                   k, h_store, jac_store, ks, u, udot, 1)
+                   k, jac_store, ks, u, udot, 1)
         # Completion again for embedded method, see line 927 of
         # http://www.mcs.anl.gov/petsc/petsc-current/src/ts/impls/rosw/rosw.c.html#TSROSW
         xerr[:] = 0.0
@@ -270,29 +269,27 @@ function oderosw_adapt{N,S}(fn, Jfn, x0::AbstractVector, tspan, btab::TableauRos
     return tspan, xs
 
 end
-function rosw_step!{N,S}(xtrial, g, gprime, x, dt, dof, btab::TableauRosW_T{N,S},
-                         k, h_store, jac_store, ks, u, udot, bt_ind=1)
+function rosw_step!{N,S}(xtrial, g!, gprime!, x, dt, dof, btab::TableauRosW_T{N,S},
+                         k, jac_store, ks, u, udot, bt_ind=1)
     # This takes one step for a ode/dae system defined by
     # g(x,xdot)=0
     # gprime(x, xdot, α) = dg/dx + α dg/dxdot
 
     # first step
-    # ks[:] = 0 assumes ks==0
     s = 1
-    h!(h_store, ks, u, udot, g, x, btab, dt, k, s, dof)
+    h!(k[s], ks, u, udot, g!, x, btab, dt, k, s, dof) # k[s] now holds residual
     # It's sufficient to only update the Jacobian once per time step:
     # (Note that this uses u & udot calculated in h! above.)
-    jac, tmp = hprime!(jac_store, x, gprime, u, udot, btab, dt)
+    jac = hprime!(jac_store, x, gprime!, u, udot, btab, dt)
     
     # calculate k
     for s=1:S-1
         # first step of Newton iteration with guess ks==0
-        k[s][:] = ks - jac\h_store # TODO use A_ldiv_B!
-
-        h!(h_store, ks, u, udot, g, x, btab, dt, k, s+1, dof)
+        k[s][:] = ks - jac\k[s] # in-place A_ldiv_B!(jac, k[s]) 
+        h!(k[s+1], ks, u, udot, g!, x, btab, dt, k, s+1, dof)
     end
     # last step
-    k[S][:] = ks - jac\h_store
+    k[S][:] = ks - jac\k[S]
 
     # completion:
     xtrial[:] = x
@@ -303,12 +300,13 @@ function rosw_step!{N,S}(xtrial, g, gprime, x, dt, dof, btab::TableauRosW_T{N,S}
     end
     return nothing
 end
-function h!(res, ks, u, udot, g, x, btab, dt, k, s, dof)
+function h!(res, ks, u, udot, g!, x, btab, dt, k, s, dof)
     # h(ks)=0 to be solved for ks.
     #
     # Calculates h(ks) and h'(ks) (if s==1).
     # Modifies its first 3 arguments.
     #
+    # res -- result: can use k[s] for this
     # ks -- guess for k[s] (usually==0) AND result g(u,udot)
     # u, udot -- work arrays for first and second argument to g
     # g -- obj function
@@ -317,31 +315,28 @@ function h!(res, ks, u, udot, g, x, btab, dt, k, s, dof)
     # k -- stage vars (jed's yi)
     # s -- current stage to calculate
     # dof
-    #
-    # TODO: could re-use ks for res
 
-    # stage independent and first stage:
-    ss = 1
+    # stage independent
     for d=1:dof
-        u[d] = (ks[d] + x[d]
-                + btab.a[s,ss]*k[ss][d])
-        udot[d] = (1/(dt*btab.γii).*ks[d] # usually ==0 as ks==0
-                   + btab.c[s,ss]/dt*k[ss][d])
+        u[d] = ks[d] + x[d]
+        udot[d] = 1/(dt*btab.γii).*ks[d] # usually ==0 as ks==0
     end
-    # later stages:
-    for ss=2:s-1
+    # stages
+    for ss=1:s-1
         for d=1:dof
             u[d] += btab.a[s,ss]*k[ss][d]
             udot[d] += btab.c[s,ss]/dt*k[ss][d]
         end
     end
-    res[:] = g(u, udot)
+    g!(res, u, udot)
     return nothing
 end
-function hprime!(res, xi, gprime, u, udot, btab, dt)
-    # The Jacobian of h!  Note that u and udot should be calculated by
-    # running h! first.
-    res[:,:] = gprime(u, udot, 1./(dt*btab.γii))
-    tmp = copy(res)
-    return lufact!(res), tmp
+function hprime!(res, xi, gprime!, u, udot, btab, dt)
+    # The Jacobian of h!  Note that u and udot need to be calculated
+    # by running h! first.
+    #
+    # The res will hold part of the LU factorization.  However use the
+    # returned LU-type.
+    gprime!(res, u, udot, 1./(dt*btab.γii))
+    return lufact!(res)
 end

From b3f4697b20dd19c1ac520e977383a0c6b72305a2 Mon Sep 17 00:00:00 2001
From: Mauro Werder <mauro_lc@runbox.com>
Date: Wed, 27 May 2015 14:55:22 +0200
Subject: [PATCH 07/15] DAE example working.  ROSW faster than DASSL

---
 examples/hb1dae.jl           | 62 +++++++++++++++++++++++++++++
 examples/van_der_pol_rosw.jl | 75 ++++++++++++++++++++++++++++++------
 src/rosw.jl                  | 69 +++++++++++++++++++++++++--------
 src/runge_kutta.jl           |  2 +-
 4 files changed, 179 insertions(+), 29 deletions(-)
 create mode 100644 examples/hb1dae.jl

diff --git a/examples/hb1dae.jl b/examples/hb1dae.jl
new file mode 100644
index 000000000..13b0799ae
--- /dev/null
+++ b/examples/hb1dae.jl
@@ -0,0 +1,62 @@
+# hb1dae reformulates the hb1ode example as a differential-algebraic equation (DAE) problem.
+# See Matlab help and http://radio.feld.cvut.cz/matlab/techdoc/math_anal/ch_8_od8.html#670396
+
+using ODE, DASSL
+M = diagm([1.,1,1])
+M[3,3] = 0.
+
+function hb1dae(t, y,ydot)
+    res = zeros(length(y))
+    hb1dae!(res, y,ydot)
+    return res
+end
+
+function hb1dae!(res, y,ydot)
+    res[1] = -0.04*y[1] + 1e4*y[2]*y[3] - ydot[1]
+    res[2] = 0.04*y[1]  - 1e4*y[2]*y[3] - 3e7*y[2]^2 - ydot[2]
+    res[3] = y[1] + y[2] + y[3] - 1
+    return nothing
+end
+function Jhb1dae!(res, y, ydot, a)
+    res[1,1] = -0.04 - a
+    res[1,2] = 1e4*y[3]
+    res[1,3] = 1e4*y[2]
+
+    res[2,1] = 0.04
+    res[2,2] = -1e4*y[3] - 3e7*2*y[2] -a;
+    res[2,3] = -1e4*y[2]
+
+    res[3,1] = 1.
+    res[3,2] = 1.
+    res[3,3] = 1.
+
+    return nothing
+end
+function Jhb1dae(t, y, ydot, a)
+    res = zeros(length(y),length(y))
+    Jhb1dae!(res, y, ydot, a)
+    return res
+end
+
+
+tspan = [0, 4e6]
+y0 = [1., 0, 0]
+
+t,ydassl = dasslSolve(hb1dae, y0, tspan, jacobian=Jhb1dae)
+@time t,ydassl = dasslSolve(hb1dae, y0, tspan, jacobian=Jhb1dae)
+y = hcat(ydassl...);
+
+## ROSW
+t,yrosw = ode_rosw(hb1dae!, Jhb1dae!, y0, tspan)
+@time t,yrosw = ode_rosw(hb1dae!, Jhb1dae!, y0, tspan)
+yr = hcat(yrosw...);
+
+println("Relative difference between DASSL vs ROSW:")
+println(abs(ydassl[end]-yrosw[end])./abs(ydassl[end]))
+
+
+# using Winston
+# plot(t, y[1,:], xlog=true)
+# oplot(t, y[2,:]*1e4, xlog=true)
+# oplot(t, y[3,:], xlog=true)
+# oplot(t, y[1,:], xlog=true)
diff --git a/examples/van_der_pol_rosw.jl b/examples/van_der_pol_rosw.jl
index 75bc32c1d..ca77e8fa1 100644
--- a/examples/van_der_pol_rosw.jl
+++ b/examples/van_der_pol_rosw.jl
@@ -1,5 +1,6 @@
 # Van der Pol using DASSL.jl
 using ODE
+using DASSL
 abstol = 1e-6
 reltol = 1e-6
 
@@ -32,7 +33,7 @@ function vdp_impl(y, ydot)
     return vdp(0,y) - ydot
 end
 function Jvdp_impl(y, ydot, α)
-    # d vdp_impl /dy + α d vdp/dy
+    # d vdp_impl /dy + α d vdp_impl/dydot
     #
     # Jacobian
     return Jvdp(0,y) - α * eye(2)
@@ -61,7 +62,7 @@ function vdp_impl!(res, y, ydot)
     return nothing
 end
 function Jvdp_impl!(res, y, ydot, α)
-    # d vdp_impl /dy + α d vdp/dy
+    # d vdp_impl /dy + α d vdp/dydot
     #
     # Jacobian
 
@@ -74,6 +75,7 @@ end
 
 # elapsed time: 0.02611999 seconds (25194952 bytes allocated)
 # elapsed time: 0.006630583 seconds (4025352 bytes allocated)
+# elapsed time: 0.005866466 seconds (2670408 bytes allocated) (newer version)
 # Adaptive step: abs error of ode23s vs ref:
 # [0.012513592025336528,0.013225223452835055]
 # Adaptive step: abs error of ode_rosw vs ref:
@@ -110,11 +112,11 @@ tend = 2.
 # #### adaptive
 tspan = linspace(tstart, tend, 2)
 
-t,yout3 = ode23s(vdp, y0, tspan; jacobian=Jvdp)
+t,yout3 = ode23s(vdp, y0, [0,0.1]; jacobian=Jvdp)
 gc()
 @time t,yout3 = ode23s(vdp, y0, tspan; jacobian=Jvdp)
 
-t,yout4 = ode_rosw(vdp_impl!, Jvdp_impl!, y0, tspan)
+t,yout4 = ode_rosw(vdp_impl!, Jvdp_impl!, y0, [0, 0.1])
 gc()
 @time t,yout4 = ode_rosw(vdp_impl!, Jvdp_impl!, y0, tspan)
 
@@ -123,14 +125,63 @@ println(yout3[end]-refsol)
 println("Adaptive step: abs error of ode_rosw vs ref:")
 println(yout4[end]-refsol)
 
-# rosw
-# @time yout3, ts, steps, dts, xerrs = rosw_runner_adaptive(
-#                      vdp_impl, Jvdp_impl, [0., 2.], y0;
-#                      reltol=reltol, abstol=abstol, dt0=1e-5, mindt=1e-8)
 
-# check
+####################
+# DAE: reduced Van der Pol
+
+
+
+# implicit inplace equation
+function dae_vdp_impl!(res, y, ydot)
+    #  dae_vdp_impl(t, y)
+    #
+    # Implicit, reduced van der Pol equation.  mu->\infinity
+
+    res[1] = y[2] - ydot[1]
+    res[2] = y[1] - (y[2]^3/3 - y[2])
+    return nothing
+end
+function Jdae_vdp_impl!(res, y, ydot, α)
+    # d dae_vdp_impl /dy + α d dae_vdp_impl/dydot
+    #
+    # Jacobian
+
+    res[1,1] = 0 - α
+    res[1,2] = 1
+    res[2,1] = 1
+    res[2,2] = -(y[2]^2 - 1)
+    return nothing
+end
+
+t,yout5 = ode_rosw(dae_vdp_impl!, Jdae_vdp_impl!, y0, [0, 0.1])
+gc()
+@time t,yout5 = ode_rosw(dae_vdp_impl!, Jdae_vdp_impl!, y0, tspan)
+
+
+
+# implicit inplace equation
+function dae_vdp(t, y, ydot)
+    #  dae_vdp_impl(t, y)
+    #
+    # Implicit, reduced van der Pol equation.  mu->\infinity
+
+    [y[2] - ydot[1], y[1] - (y[2]^3/3 - y[2])]
+end
+function Jdae_vdp(t, y, ydot, α)
+    # d dae_vdp_impl /dy + α d dae_vdp_impl/dydot
+    #
+    # Jacobian
+    res = zeros(2,2)
+    res[1,1] = 0 - α
+    res[1,2] = 1
+    res[2,1] = 1
+    res[2,2] = -(y[2]^2 - 1)
+    return res
+end
+tspan = [0, 4e6]
+y0 = [1., 0, 0]
+
+t,ydassl = dasslSolve(dae_vdp, y0, tspan)
+
 
-# using Winston
-# plot(ts, yout3[1,:])
 
-# oplot(tspan, yout2[1,:],"r")
diff --git a/src/rosw.jl b/src/rosw.jl
index 03c3a1e0e..7545d02c7 100644
--- a/src/rosw.jl
+++ b/src/rosw.jl
@@ -5,6 +5,10 @@
 @show "NEW"
 export ode_rosw, ode_rosw_fixed
 
+# TODO:
+# - finite diff Jacobian
+# - AD Jacobian
+
 # Rosenbrock-W methods are typically specified for autonomous DAE:
 # Mẋ = f(x)
 #
@@ -101,6 +105,19 @@ const bt_ros34pw2 = TableauRosW(:ros34pw2, (3,4), Float64,
                                   3.7810903145819369e-01  -9.6042292212423178e-02  5.0000000000000000e-01  2.1793326075422950e-01]
                                   )
 
+# binterpt[0][0]=1.0564298455794094;
+# binterpt[1][0]=2.296429974281067;
+# binterpt[2][0]=-1.307599564525376;
+# binterpt[3][0]=-1.045260255335102;
+# binterpt[0][1]=-1.3864882699759573;
+# binterpt[1][1]=-8.262611700275677;
+# binterpt[2][1]=7.250979895056055;
+# binterpt[3][1]=2.398120075195581;
+# binterpt[0][2]=0.5721822314575016;
+# binterpt[1][2]=4.742931142090097;
+# binterpt[2][2]=-4.398120075195578;
+# binterpt[3][2]=-0.9169932983520199;
+
 ###################
 # Fixed step solver
 ###################
@@ -133,13 +150,19 @@ ode_rosw(fn, Jfn, x0, tspan;kwargs...) = oderosw_adapt(fn, Jfn, x0, tspan, bt_ro
 function oderosw_adapt{N,S}(fn, Jfn, x0::AbstractVector, tspan, btab::TableauRosW{N,S};
                             reltol = 1.0e-5, abstol = 1.0e-8,
                             norm=Base.norm,
-                            minstep=abs(tspan[end] - tspan[1])/1e9,
+                            minstep=abs(tspan[end] - tspan[1])/1e18,
                             maxstep=abs(tspan[end] - tspan[1])/2.5,
                             initstep=0.,
-                            points=:all
+#                            points=:all
                             )
     # TODO: refactor with oderk_adapt
 
+    # FIXME: add interpolation
+    if length(tspan)>2
+        error("specified output times not supported yet")
+    else
+        points=:all
+    end
     ## Figure types
     fn_expl = (t,x)->(out=similar(x); fn(out, x, x*0); out)
     Et, Exf, Tx, btab = make_consistent_types(fn, x0, tspan, btab)
@@ -183,8 +206,9 @@ function oderosw_adapt{N,S}(fn, Jfn, x0::AbstractVector, tspan, btab::TableauRos
     elseif points!=:specified
         error("Unrecognized option points==$points")
     end
+
     # Time
-    dt, tdir, k[1] = hinit(fn_expl, x, tstart, tend, order, reltol, abstol) # sets k[1]=f0
+    dt, tdir, f0 = hinit(fn_expl, x, tstart, tend, order, reltol, abstol)
     if initstep!=0
         dt = sign(initstep)==tdir ? initstep : error("initstep has wrong sign.")
     end
@@ -199,7 +223,7 @@ function oderosw_adapt{N,S}(fn, Jfn, x0::AbstractVector, tspan, btab::TableauRos
     iter = 2 # the index into tspan and xs
     while true
         rosw_step!(xtrial, fn, Jfn, x, dt, dof, btab,
-                   k, jac_store, ks, u, udot, 1)
+                   k, jac_store, ks, u, udot)
         # Completion again for embedded method, see line 927 of
         # http://www.mcs.anl.gov/petsc/petsc-current/src/ts/impls/rosw/rosw.c.html#TSROSW
         xerr[:] = 0.0
@@ -217,17 +241,19 @@ function oderosw_adapt{N,S}(fn, Jfn, x0::AbstractVector, tspan, btab::TableauRos
             push!(errs, err)
 
             # Output:
-            f0 = k[1]
-            f1 = fn_expl(t+dt, xtrial)
+            # f0 = k[1]
+            # f1 = fn_expl(t+dt, xtrial)
             if points==:specified
                 # interpolate onto given output points
                 while iter-1<nsteps_fixed && (tdir*tspan[iter]<tdir*(t+dt) || laststep) # output at all new times which are < t+dt
+                    error("FIXME: not implemented yet")
                     hermite_interp!(xs[iter], tspan[iter], t, dt, x, xtrial, f0, f1) # TODO: 3rd order only!
                     iter += 1
                 end
             else
                 # first interpolate onto given output points
                 while iter_fixed-1<nsteps_fixed && tdir*t<tdir*tspan_fixed[iter_fixed]<tdir*(t+dt) # output at all new times which are < t+dt
+                    error("FIXME: not implemented yet")
                     xout = hermite_interp(tspan_fixed[iter_fixed], t, dt, x, xtrial, f0, f1)
                     index_or_push!(xs, iter, xout) # TODO: 3rd order only!
                     push!(tspan, tspan_fixed[iter_fixed])
@@ -239,7 +265,7 @@ function oderosw_adapt{N,S}(fn, Jfn, x0::AbstractVector, tspan, btab::TableauRos
                 push!(tspan, t+dt)
                 iter += 1
             end
-            k[1] = f1 # load k[1]==f0 for next step
+            # k[1] = f1 # load k[1]==f0 for next step
 
             # Break if this was the last step:
             laststep && break
@@ -275,23 +301,34 @@ function rosw_step!{N,S}(xtrial, g!, gprime!, x, dt, dof, btab::TableauRosW_T{N,
     # g(x,xdot)=0
     # gprime(x, xdot, α) = dg/dx + α dg/dxdot
 
-    # first step
+    ## Calculate k
+    # first step:
     s = 1
     h!(k[s], ks, u, udot, g!, x, btab, dt, k, s, dof) # k[s] now holds residual
     # It's sufficient to only update the Jacobian once per time step:
     # (Note that this uses u & udot calculated in h! above.)
     jac = hprime!(jac_store, x, gprime!, u, udot, btab, dt)
-    
-    # calculate k
+    # middle steps:
     for s=1:S-1
         # first step of Newton iteration with guess ks==0
-        k[s][:] = ks - jac\k[s] # in-place A_ldiv_B!(jac, k[s]) 
+        #        k[s][:] = ks - jac\k[s] # in-place A_ldiv_B!(jac, k[s])
+        # TODO: add option to do more Newton iterations
+        A_ldiv_B!(jac, k[s])
+        for d=1:dof
+            k[s][d] = ks[d]-k[s][d]
+        end
         h!(k[s+1], ks, u, udot, g!, x, btab, dt, k, s+1, dof)
+        # TODO: add option to evaluate Jakobian again
     end
-    # last step
-    k[S][:] = ks - jac\k[S]
-
-    # completion:
+    # last step:
+    #    k[S][:] = ks - jac\k[S]
+    # TODO: add option to do more Newton iterations
+    A_ldiv_B!(jac, k[S])
+    for d=1:dof
+        k[S][d] = ks[d]-k[S][d]
+    end
+    
+    ## Completion:
     xtrial[:] = x
     for s=1:S
         for d=1:dof
@@ -319,7 +356,7 @@ function h!(res, ks, u, udot, g!, x, btab, dt, k, s, dof)
     # stage independent
     for d=1:dof
         u[d] = ks[d] + x[d]
-        udot[d] = 1/(dt*btab.γii).*ks[d] # usually ==0 as ks==0
+        udot[d] = 1/(dt*btab.γii)*ks[d] # usually ==0 as ks==0
     end
     # stages
     for ss=1:s-1
diff --git a/src/runge_kutta.jl b/src/runge_kutta.jl
index dfb8919ee..018197939 100644
--- a/src/runge_kutta.jl
+++ b/src/runge_kutta.jl
@@ -464,7 +464,7 @@ function hermite_interp!(y, tquery,t,dt,y0,y1,f0,f1)
     #
     # f_0 = f(x_0 , y_0) , f_1 = f(x_0 + h, y_1 )
     # this is O(3). TODO for higher order.
-
+error("")
     theta = (tquery-t)/dt
     for i=1:length(y0)
         y[i] = ((1-theta)*y0[i] + theta*y1[i] + theta*(theta-1) *

From 7bad8dd2b1dec77be3ac049f8691719bd3f7194d Mon Sep 17 00:00:00 2001
From: Mauro Werder <mauro_lc@runbox.com>
Date: Wed, 27 May 2015 15:37:32 +0200
Subject: [PATCH 08/15] Numerical Jacobian working.

(Copied from DASSL)
---
 examples/hb1dae.jl           |  11 ++-
 examples/van_der_pol_rosw.jl | 126 ++++++++++++++++++-----------------
 src/rosw.jl                  |  52 +++++++++------
 3 files changed, 108 insertions(+), 81 deletions(-)

diff --git a/examples/hb1dae.jl b/examples/hb1dae.jl
index 13b0799ae..d0f99a525 100644
--- a/examples/hb1dae.jl
+++ b/examples/hb1dae.jl
@@ -47,13 +47,20 @@ t,ydassl = dasslSolve(hb1dae, y0, tspan, jacobian=Jhb1dae)
 y = hcat(ydassl...);
 
 ## ROSW
-t,yrosw = ode_rosw(hb1dae!, Jhb1dae!, y0, tspan)
-@time t,yrosw = ode_rosw(hb1dae!, Jhb1dae!, y0, tspan)
+t,yrosw = ode_rosw(hb1dae!, y0, tspan) #, jacobian=Jhb1dae!)
+@time t,yrosw = ode_rosw(hb1dae!, y0, tspan)#, jacobian=Jhb1dae!)
 yr = hcat(yrosw...);
 
 println("Relative difference between DASSL vs ROSW:")
 println(abs(ydassl[end]-yrosw[end])./abs(ydassl[end]))
 
+# with Jacobian
+t,yrosw = ode_rosw(hb1dae!, y0, tspan, jacobian=Jhb1dae!)
+@time t,yrosw = ode_rosw(hb1dae!, y0, tspan, jacobian=Jhb1dae!)
+yr = hcat(yrosw...);
+
+println("Relative difference between DASSL vs ROSW with Jac:")
+println(abs(ydassl[end]-yrosw[end])./abs(ydassl[end]))
 
 # using Winston
 # plot(t, y[1,:], xlog=true)
diff --git a/examples/van_der_pol_rosw.jl b/examples/van_der_pol_rosw.jl
index ca77e8fa1..2bdb32c28 100644
--- a/examples/van_der_pol_rosw.jl
+++ b/examples/van_der_pol_rosw.jl
@@ -116,72 +116,78 @@ t,yout3 = ode23s(vdp, y0, [0,0.1]; jacobian=Jvdp)
 gc()
 @time t,yout3 = ode23s(vdp, y0, tspan; jacobian=Jvdp)
 
-t,yout4 = ode_rosw(vdp_impl!, Jvdp_impl!, y0, [0, 0.1])
+t,yout4 = ode_rosw(vdp_impl!, y0, [0, 0.1], jacobian=Jvdp_impl!)
 gc()
-@time t,yout4 = ode_rosw(vdp_impl!, Jvdp_impl!, y0, tspan)
+@time t,yout4 = ode_rosw(vdp_impl!, y0, tspan, jacobian=Jvdp_impl!)
+
+# numerical jacobian
+t,yout5 = ode_rosw(vdp_impl!, y0, [0, 0.1])
+gc()
+@time t,yout5 = ode_rosw(vdp_impl!, y0, tspan)
+
 
 println("Adaptive step: abs error of ode23s vs ref:")
 println(yout3[end]-refsol)
 println("Adaptive step: abs error of ode_rosw vs ref:")
 println(yout4[end]-refsol)
-
-
-####################
-# DAE: reduced Van der Pol
-
-
-
-# implicit inplace equation
-function dae_vdp_impl!(res, y, ydot)
-    #  dae_vdp_impl(t, y)
-    #
-    # Implicit, reduced van der Pol equation.  mu->\infinity
-
-    res[1] = y[2] - ydot[1]
-    res[2] = y[1] - (y[2]^3/3 - y[2])
-    return nothing
-end
-function Jdae_vdp_impl!(res, y, ydot, α)
-    # d dae_vdp_impl /dy + α d dae_vdp_impl/dydot
-    #
-    # Jacobian
-
-    res[1,1] = 0 - α
-    res[1,2] = 1
-    res[2,1] = 1
-    res[2,2] = -(y[2]^2 - 1)
-    return nothing
-end
-
-t,yout5 = ode_rosw(dae_vdp_impl!, Jdae_vdp_impl!, y0, [0, 0.1])
-gc()
-@time t,yout5 = ode_rosw(dae_vdp_impl!, Jdae_vdp_impl!, y0, tspan)
-
-
-
-# implicit inplace equation
-function dae_vdp(t, y, ydot)
-    #  dae_vdp_impl(t, y)
-    #
-    # Implicit, reduced van der Pol equation.  mu->\infinity
-
-    [y[2] - ydot[1], y[1] - (y[2]^3/3 - y[2])]
-end
-function Jdae_vdp(t, y, ydot, α)
-    # d dae_vdp_impl /dy + α d dae_vdp_impl/dydot
-    #
-    # Jacobian
-    res = zeros(2,2)
-    res[1,1] = 0 - α
-    res[1,2] = 1
-    res[2,1] = 1
-    res[2,2] = -(y[2]^2 - 1)
-    return res
-end
-tspan = [0, 4e6]
-y0 = [1., 0, 0]
-
-t,ydassl = dasslSolve(dae_vdp, y0, tspan)
+println("Adaptive step: ROSW abs error with numerical Jacobian:")
+println(yout5[end]-refsol)
+
+
+# ####################
+# # DAE: reduced Van der Pol
+
+
+
+# # implicit inplace equation
+# function dae_vdp_impl!(res, y, ydot)
+#     #  dae_vdp_impl(t, y)
+#     #
+#     # Implicit, reduced van der Pol equation.  mu->\infinity
+
+#     res[1] = y[2] - ydot[1]
+#     res[2] = y[1] - (y[2]^3/3 - y[2])
+#     return nothing
+# end
+# function Jdae_vdp_impl!(res, y, ydot, α)
+#     # d dae_vdp_impl /dy + α d dae_vdp_impl/dydot
+#     #
+#     # Jacobian
+
+#     res[1,1] = 0 - α
+#     res[1,2] = 1
+#     res[2,1] = 1
+#     res[2,2] = -(y[2]^2 - 1)
+#     return nothing
+# end
+
+# t,yout5 = ode_rosw(dae_vdp_impl!, Jdae_vdp_impl!, y0, [0, 0.1])
+# gc()
+# @time t,yout5 = ode_rosw(dae_vdp_impl!, Jdae_vdp_impl!, y0, tspan)
+
+
+
+# # implicit inplace equation
+# function dae_vdp(t, y, ydot)
+#     #  dae_vdp_impl(t, y)
+#     #
+#     # Implicit, reduced van der Pol equation.  mu->\infinity
+
+#     [y[2] - ydot[1], y[1] - (y[2]^3/3 - y[2])]
+# end
+# function Jdae_vdp(t, y, ydot, α)
+#     # d dae_vdp_impl /dy + α d dae_vdp_impl/dydot
+#     #
+#     # Jacobian
+#     res = zeros(2,2)
+#     res[1,1] = 0 - α
+#     res[1,2] = 1
+#     res[2,1] = 1
+#     res[2,2] = -(y[2]^2 - 1)
+#     return res
+# end
+
+# t,ydassl = dasslSolve(dae_vdp, y0, tspan)
 
 
 
diff --git a/src/rosw.jl b/src/rosw.jl
index 7545d02c7..6d51c58f7 100644
--- a/src/rosw.jl
+++ b/src/rosw.jl
@@ -1,12 +1,12 @@
 # Rosenbrock-Wanner methods
 ###########################
 #
-# Main references: Wanner & Hairrere 1996, PETSc http://www.mcs.anl.gov/petsc/petsc-current/docs/manualpages/TS/TSROSW.html
-@show "NEW"
+# Main references:
+# - Wanner & Hairer 1996
+# - PETSc http://www.mcs.anl.gov/petsc/petsc-current/docs/manualpages/TS/TSROSW.html
 export ode_rosw, ode_rosw_fixed
 
 # TODO:
-# - finite diff Jacobian
 # - AD Jacobian
 
 # Rosenbrock-W methods are typically specified for autonomous DAE:
@@ -105,19 +105,6 @@ const bt_ros34pw2 = TableauRosW(:ros34pw2, (3,4), Float64,
                                   3.7810903145819369e-01  -9.6042292212423178e-02  5.0000000000000000e-01  2.1793326075422950e-01]
                                   )
 
-# binterpt[0][0]=1.0564298455794094;
-# binterpt[1][0]=2.296429974281067;
-# binterpt[2][0]=-1.307599564525376;
-# binterpt[3][0]=-1.045260255335102;
-# binterpt[0][1]=-1.3864882699759573;
-# binterpt[1][1]=-8.262611700275677;
-# binterpt[2][1]=7.250979895056055;
-# binterpt[3][1]=2.398120075195581;
-# binterpt[0][2]=0.5721822314575016;
-# binterpt[1][2]=4.742931142090097;
-# binterpt[2][2]=-4.398120075195578;
-# binterpt[3][2]=-0.9169932983520199;
-
 ###################
 # Fixed step solver
 ###################
@@ -146,13 +133,14 @@ function oderosw_fixed{N,S}(fn, Jfn, x0::AbstractVector, tspan,
 end
 
 
-ode_rosw(fn, Jfn, x0, tspan;kwargs...) = oderosw_adapt(fn, Jfn, x0, tspan, bt_ros34pw2; kwargs...)
-function oderosw_adapt{N,S}(fn, Jfn, x0::AbstractVector, tspan, btab::TableauRosW{N,S};
+ode_rosw(fn, x0, tspan;kwargs...) = oderosw_adapt(fn, x0, tspan, bt_ros34pw2; kwargs...)
+function oderosw_adapt{N,S}(fn, x0::AbstractVector, tspan, btab::TableauRosW{N,S};
                             reltol = 1.0e-5, abstol = 1.0e-8,
                             norm=Base.norm,
                             minstep=abs(tspan[end] - tspan[1])/1e18,
                             maxstep=abs(tspan[end] - tspan[1])/2.5,
                             initstep=0.,
+                            jacobian=numerical_jacobian(fn, reltol, abstol)
 #                            points=:all
                             )
     # TODO: refactor with oderk_adapt
@@ -222,7 +210,7 @@ function oderosw_adapt{N,S}(fn, Jfn, x0::AbstractVector, tspan, btab::TableauRos
     timeout = 0 # for step-control
     iter = 2 # the index into tspan and xs
     while true
-        rosw_step!(xtrial, fn, Jfn, x, dt, dof, btab,
+        rosw_step!(xtrial, fn, jacobian, x, dt, dof, btab,
                    k, jac_store, ks, u, udot)
         # Completion again for embedded method, see line 927 of
         # http://www.mcs.anl.gov/petsc/petsc-current/src/ts/impls/rosw/rosw.c.html#TSROSW
@@ -377,3 +365,29 @@ function hprime!(res, xi, gprime!, u, udot, btab, dt)
     gprime!(res, u, udot, 1./(dt*btab.γii))
     return lufact!(res)
 end
+
+# Copied & adapted from DASSL:
+# generate a function that computes approximate jacobian using forward
+# finite differences
+function numerical_jacobian(fn!, reltol, abstol)
+    function numjac!(jac, y, dy, a)
+        # TODO use coloring
+        
+        ep      = eps(1e6)   # this is the machine epsilon # TODO: better value?
+        h       = 1/a           # h ~ 1/a
+        wt      = reltol*abs(y).+abstol
+        # delta for approximation of jacobian.  I removed the
+        # sign(h_next*dy0) from the definition of delta because it was
+        # causing trouble when dy0==0 (which happens for ord==1)
+        edelta  = spdiagm(max(abs(y),abs(h*dy),wt)*sqrt(ep))
+
+        tmp = similar(y)
+        f0 = similar(y)
+        fn!(f0, y, dy)
+        for i=1:length(y)
+            fn!(tmp, y+edelta[:,i], a*edelta[:,i]+dy)
+            jac[:,i] = (tmp-f0)/edelta[i,i]
+        end
+        return nothing
+    end
+end

From e7196d260e26a3570a677037595014bad3bc163a Mon Sep 17 00:00:00 2001
From: Mauro Werder <mauro_lc@runbox.com>
Date: Wed, 27 May 2015 16:10:19 +0200
Subject: [PATCH 09/15] Fixed fixed step solver added rodas3 solver

---
 examples/hb1dae.jl | 20 +++++++++++++++++++-
 src/rosw.jl        | 37 +++++++++++++++++++++++++++++++------
 2 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/examples/hb1dae.jl b/examples/hb1dae.jl
index d0f99a525..0d1be29ae 100644
--- a/examples/hb1dae.jl
+++ b/examples/hb1dae.jl
@@ -51,10 +51,18 @@ t,yrosw = ode_rosw(hb1dae!, y0, tspan) #, jacobian=Jhb1dae!)
 @time t,yrosw = ode_rosw(hb1dae!, y0, tspan)#, jacobian=Jhb1dae!)
 yr = hcat(yrosw...);
 
-println("Relative difference between DASSL vs ROSW:")
+println("Relative difference between DASSL vs ROSW, no Jac:")
+println(abs(ydassl[end]-yrosw[end])./abs(ydassl[end]))
+
+println("Using fixed step ROSW")
+tspan = t
+t,yrosw = ode_rosw_fixed(hb1dae!, y0, tspan)
+@time t,yrosw = ode_rosw_fixed(hb1dae!, y0, tspan)
+println("Relative difference between DASSL vs fixed step, no Jac:")
 println(abs(ydassl[end]-yrosw[end])./abs(ydassl[end]))
 
 # with Jacobian
+tspan = [0, 4e6]
 t,yrosw = ode_rosw(hb1dae!, y0, tspan, jacobian=Jhb1dae!)
 @time t,yrosw = ode_rosw(hb1dae!, y0, tspan, jacobian=Jhb1dae!)
 yr = hcat(yrosw...);
@@ -62,6 +70,16 @@ yr = hcat(yrosw...);
 println("Relative difference between DASSL vs ROSW with Jac:")
 println(abs(ydassl[end]-yrosw[end])./abs(ydassl[end]))
 
+
+println("Using ros_rodas3")
+t,yrosw = ODE.oderosw_adapt(hb1dae!, y0, tspan, ODE.bt_ros_rodas3)
+@time t,yrosw = ODE.oderosw_adapt(hb1dae!, y0, tspan, ODE.bt_ros_rodas3)
+println("Relative difference between DASSL vs Robas2, no Jac:")
+println(abs(ydassl[end]-yrosw[end])./abs(ydassl[end]))
+
+
+
+
 # using Winston
 # plot(t, y[1,:], xlog=true)
 # oplot(t, y[2,:]*1e4, xlog=true)
diff --git a/src/rosw.jl b/src/rosw.jl
index 6d51c58f7..793186658 100644
--- a/src/rosw.jl
+++ b/src/rosw.jl
@@ -8,6 +8,7 @@ export ode_rosw, ode_rosw_fixed
 
 # TODO:
 # - AD Jacobian
+# - fix fixed step solver
 
 # Rosenbrock-W methods are typically specified for autonomous DAE:
 # Mẋ = f(x)
@@ -91,7 +92,7 @@ function tabletransform{Name,S,T}(rt::TableauRosW{Name,S,T})
 end
 
 
-## tableau for ros34pw2
+## tableau for ros34pw2 http://www.mcs.anl.gov/petsc/petsc-current/docs/manualpages/TS/TSROSWRA34PW2.html
 const bt_ros34pw2 = TableauRosW(:ros34pw2, (3,4), Float64,
                                 [0  0  0  0
                                  8.7173304301691801e-01  0  0  0
@@ -105,12 +106,28 @@ const bt_ros34pw2 = TableauRosW(:ros34pw2, (3,4), Float64,
                                   3.7810903145819369e-01  -9.6042292212423178e-02  5.0000000000000000e-01  2.1793326075422950e-01]
                                   )
 
+## tableau for RODAS3 http://www.mcs.anl.gov/petsc/petsc-current/docs/manualpages/TS/TSROSWRODAS3.html
+const bt_ros_rodas3 = TableauRosW(:ros_rodas3, (3,4), Rational{Int},
+                                  [0 0 0 0
+                                   0 0 0 0
+                                   1 0 0 0
+                                   3//4 -1//4 1//2 0],
+                                  [1//2 0 0 0
+                                   1  1//2 0 0
+                                   -1//4 -1//4 1//2 0
+                                   1//12 1//12 -2//3 1//2],
+                                   [5//6 -1//6 -1//6 1//2
+                                    3//4 -1//4 1//2 0]
+                                    )
+
 ###################
 # Fixed step solver
 ###################
-ode_rosw_fixed(fn, Jfn, x0, tspan) = oderosw_fixed(fn, Jfn, x0, tspan, bt_ros34pw2)
-function oderosw_fixed{N,S}(fn, Jfn, x0::AbstractVector, tspan,
-                            btab::TableauRosW{N,S})
+ode_rosw_fixed(fn, x0, tspan; kwargs...) = oderosw_fixed(fn, x0, tspan, bt_ros34pw2, kwargs...)
+function oderosw_fixed{N,S}(fn, x0::AbstractVector, tspan,
+                            btab::TableauRosW{N,S};
+                            jacobian=numerical_jacobian(fn, 1e-6, 1e-6)
+                            )
     # TODO: refactor with oderk_fixed
     Et, Exf, Tx, btab = make_consistent_types(fn, x0, tspan, btab)
     btab = tabletransform(btab)
@@ -122,18 +139,26 @@ function oderosw_fixed{N,S}(fn, Jfn, x0::AbstractVector, tspan,
 
     tspan = convert(Vector{Et}, tspan)
     # work arrays:
-    ks = Array(Tx, S)
+    k = Array(Tx, S) # stage variables
+    allocate!(k, x0, dof)
+    ks = zeros(Exf,dof) # work vector for one k
+    jac_store = zeros(Exf,dof,dof) # Jacobian storage
+    u = zeros(Exf,dof)    # work vector 
+    udot = zeros(Exf,dof) # work vector 
+    
     # allocate!(ks, x0, dof) # no need to allocate as fn is not in-place
     xtmp = similar(x0, Exf, dof)
     for i=1:length(tspan)-1
         dt = tspan[i+1]-tspan[i]
-        xs[i+1] = rosw_step!(fn, Jfn, xs[i], dt, btab, 2)
+        rosw_step!(xs[i+1], fn, jacobian, xs[i], dt, dof, btab,
+                   k, jac_store, ks, u, udot)
     end
     return tspan, xs
 end
 
 
 ode_rosw(fn, x0, tspan;kwargs...) = oderosw_adapt(fn, x0, tspan, bt_ros34pw2; kwargs...)
+ode_rodas3(fn, x0, tspan;kwargs...) = oderosw_adapt(fn, x0, tspan, bt_ros_rodas3; kwargs...)
 function oderosw_adapt{N,S}(fn, x0::AbstractVector, tspan, btab::TableauRosW{N,S};
                             reltol = 1.0e-5, abstol = 1.0e-8,
                             norm=Base.norm,

From 24346ed596d391dc90da6406fc3975854f871e8c Mon Sep 17 00:00:00 2001
From: Mauro Werder <mauro_lc@runbox.com>
Date: Wed, 27 May 2015 20:33:11 +0200
Subject: [PATCH 10/15] Added tests

---
 src/rosw.jl          | 11 +++++--
 src/runge_kutta.jl   |  1 -
 test/implicit-eqs.jl | 78 ++++++++++++++++++++++++++++++++++++++++++++
 test/runtests.jl     |  1 +
 4 files changed, 87 insertions(+), 4 deletions(-)
 create mode 100644 test/implicit-eqs.jl

diff --git a/src/rosw.jl b/src/rosw.jl
index 793186658..5df473709 100644
--- a/src/rosw.jl
+++ b/src/rosw.jl
@@ -7,7 +7,9 @@
 export ode_rosw, ode_rosw_fixed
 
 # TODO:
-# - AD Jacobian
+# - Jacobian coloring: https://github.com/mlubin/ReverseDiffSparse.jl
+#      http://wiki.cs.purdue.edu:9835/coloringpage/abstracts/acyclic-SISC.pdf
+# - AD Jacobian: https://github.com/mlubin/ReverseDiffSparse.jl
 # - fix fixed step solver
 
 # Rosenbrock-W methods are typically specified for autonomous DAE:
@@ -172,7 +174,9 @@ function oderosw_adapt{N,S}(fn, x0::AbstractVector, tspan, btab::TableauRosW{N,S
 
     # FIXME: add interpolation
     if length(tspan)>2
-        error("specified output times not supported yet")
+        warn("specified output times not supported yet")
+        tspan = [tspan[1], tspan[end]]
+        points=:all
     else
         points=:all
     end
@@ -326,7 +330,8 @@ function rosw_step!{N,S}(xtrial, g!, gprime!, x, dt, dof, btab::TableauRosW_T{N,
         # first step of Newton iteration with guess ks==0
         #        k[s][:] = ks - jac\k[s] # in-place A_ldiv_B!(jac, k[s])
         # TODO: add option to do more Newton iterations
-        A_ldiv_B!(jac, k[s])
+        A_ldiv_B!(jac, k[s])  # TODO: see whether this is impacted by https://github.com/JuliaLang/julia/issues/10787
+                              # and https://github.com/JuliaLang/julia/issues/11325
         for d=1:dof
             k[s][d] = ks[d]-k[s][d]
         end
diff --git a/src/runge_kutta.jl b/src/runge_kutta.jl
index 018197939..aa6d0d260 100644
--- a/src/runge_kutta.jl
+++ b/src/runge_kutta.jl
@@ -464,7 +464,6 @@ function hermite_interp!(y, tquery,t,dt,y0,y1,f0,f1)
     #
     # f_0 = f(x_0 , y_0) , f_1 = f(x_0 + h, y_1 )
     # this is O(3). TODO for higher order.
-error("")
     theta = (tquery-t)/dt
     for i=1:length(y0)
         y[i] = ((1-theta)*y0[i] + theta*y1[i] + theta*(theta-1) *
diff --git a/test/implicit-eqs.jl b/test/implicit-eqs.jl
new file mode 100644
index 000000000..ada0eb754
--- /dev/null
+++ b/test/implicit-eqs.jl
@@ -0,0 +1,78 @@
+using ODE
+using Base.Test
+
+tol = 1e-2
+
+solvers = [
+           ## Stiff
+           ODE.ode_rosw
+           ]
+
+for solver in solvers
+    println("using $solver")
+    # dy
+    # -- = 6 ==> y = 6t
+    # dt
+    f = (res,y,ydot)->(res[:]=6.0-ydot; nothing)
+    t,y=solver(f, [0.], [0.,1])
+    y = vcat(y...)
+    @test maximum(abs(y-6t)) < tol
+
+    # dy
+    # -- = 2t ==> y = t.^2
+    # dt
+    # t,y=solver((t,y)->2t, 0., [0:.001:1;])
+    # @test maximum(abs(y-t.^2)) < tol
+    
+
+    # dy
+    # -- = y ==> y = y0*e.^t
+    # dt
+    f = (res,y,ydot)->(res[:]=y-ydot;nothing)
+    t,y=solver(f, [1.], [0,1])
+    y = vcat(y...)
+    @test maximum(abs(y-e.^t)) < tol
+
+    t,y=solver((res,y,ydot)->(res[:]=y-ydot), [1.], [1, 0])
+    y = vcat(y...)
+    @test maximum(abs(y-e.^(t-1))) < tol
+ 
+    # dv       dw
+    # -- = -w, -- = v ==> v = v0*cos(t) - w0*sin(t), w = w0*cos(t) + v0*sin(t)
+    # dt       dt
+    #
+    # y = [v, w]
+
+    function ff(res,y,ydot)
+        res[1] = -y[2]-ydot[1]
+        res[2] = y[1] -ydot[2]
+    end
+    
+    t,y=solver(ff, [1., 2.], [0, 2*pi])
+    ys = hcat(y...).'   # convert Vector{Vector{Float}} to Matrix{Float}
+    @test maximum(abs(ys-[cos(t)-2*sin(t) 2*cos(t)+sin(t)])) < tol
+end
+
+# rober testcase from http://www.unige.ch/~hairer/testset/testset.html
+let
+    println("ROBER test case")
+    function f(t, y)
+        ydot = similar(y)
+        ydot[1] = -0.04*y[1] + 1.0e4*y[2]*y[3]
+        ydot[3] = 3.0e7*y[2]*y[2]
+        ydot[2] = -ydot[1] - ydot[3]
+        ydot
+    end
+    f!(res, y, ydot) = (res[:] = f(0, y) - ydot; nothing)
+
+    t = [0., 1e11]
+    t,y = ode_rosw(f!, [1.0, 0.0, 0.0], t; abstol=1e-8, reltol=1e-8,
+                                        maxstep=1e11/10, minstep=1e11/1e18)
+
+    refsol = [0.2083340149701255e-07,
+              0.8333360770334713e-13,
+              0.9999999791665050] # reference solution at tspan[2]
+    @test norm(refsol-y[end], Inf) < 3e-10
+end
+
+println("All looks OK")
diff --git a/test/runtests.jl b/test/runtests.jl
index 86fc105f0..9376f8d8d 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -81,5 +81,6 @@ let
     @test norm(refsol-y[end], Inf) < 2e-10
 end
 include("interface-tests.jl")
+include("implicit-eqs.jl")
 
 println("All looks OK")

From 3bf2ce0d5b62324ae769ecec218f8998a8c6d828 Mon Sep 17 00:00:00 2001
From: Mauro Werder <mauro_lc@runbox.com>
Date: Sun, 31 May 2015 11:17:51 +0200
Subject: [PATCH 11/15] Small fix in numeric jac

---
 src/rosw.jl | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/rosw.jl b/src/rosw.jl
index 5df473709..e95792cc1 100644
--- a/src/rosw.jl
+++ b/src/rosw.jl
@@ -4,7 +4,7 @@
 # Main references:
 # - Wanner & Hairer 1996
 # - PETSc http://www.mcs.anl.gov/petsc/petsc-current/docs/manualpages/TS/TSROSW.html
-export ode_rosw, ode_rosw_fixed
+export ode_rosw, ode_rosw_fixed, ode_rodas3
 
 # TODO:
 # - Jacobian coloring: https://github.com/mlubin/ReverseDiffSparse.jl
@@ -409,14 +409,20 @@ function numerical_jacobian(fn!, reltol, abstol)
         # delta for approximation of jacobian.  I removed the
         # sign(h_next*dy0) from the definition of delta because it was
         # causing trouble when dy0==0 (which happens for ord==1)
-        edelta  = spdiagm(max(abs(y),abs(h*dy),wt)*sqrt(ep))
+        edelta  = max(abs(y),abs(h*dy),wt)*sqrt(ep)
 
         tmp = similar(y)
+        tmp1 = similar(y)
+        tmp2 = similar(y)
         f0 = similar(y)
         fn!(f0, y, dy)
         for i=1:length(y)
-            fn!(tmp, y+edelta[:,i], a*edelta[:,i]+dy)
-            jac[:,i] = (tmp-f0)/edelta[i,i]
+            copy!(tmp1, y)
+            copy!(tmp2, dy)
+            tmp1[i] += edelta[i]
+            tmp2[i] += a*edelta[i]
+            fn!(tmp, tmp1, tmp2)
+            jac[:,i] = (tmp-f0)/edelta[i]
         end
         return nothing
     end

From 037d19cb6893e7bb20ad32968d8565e5ee571afe Mon Sep 17 00:00:00 2001
From: Mauro Werder <mauro_lc@runbox.com>
Date: Tue, 9 Jun 2015 17:47:48 +0200
Subject: [PATCH 12/15] ...

---
 REQUIRE            |  1 +
 examples/hb1dae.jl |  6 ++--
 src/rosw.jl        | 74 ++++++++++++++++++++++++++++++++++++++++++++--
 src/runge_kutta.jl |  3 +-
 4 files changed, 77 insertions(+), 7 deletions(-)

diff --git a/REQUIRE b/REQUIRE
index 8e8352dde..2f9450386 100644
--- a/REQUIRE
+++ b/REQUIRE
@@ -1,3 +1,4 @@
 julia 0.3
 Polynomials
 Compat 0.4.1
+MatrixColorings
diff --git a/examples/hb1dae.jl b/examples/hb1dae.jl
index 0d1be29ae..461b7e40d 100644
--- a/examples/hb1dae.jl
+++ b/examples/hb1dae.jl
@@ -33,9 +33,9 @@ function Jhb1dae!(res, y, ydot, a)
     return nothing
 end
 function Jhb1dae(t, y, ydot, a)
-    res = zeros(length(y),length(y))
-    Jhb1dae!(res, y, ydot, a)
-    return res
+    jac = zeros(length(y),length(y))
+    Jhb1dae!(jac, y, ydot, a)
+    return jac
 end
 
 
diff --git a/src/rosw.jl b/src/rosw.jl
index e95792cc1..04680c420 100644
--- a/src/rosw.jl
+++ b/src/rosw.jl
@@ -4,6 +4,8 @@
 # Main references:
 # - Wanner & Hairer 1996
 # - PETSc http://www.mcs.anl.gov/petsc/petsc-current/docs/manualpages/TS/TSROSW.html
+
+using MatrixColorings
 export ode_rosw, ode_rosw_fixed, ode_rodas3
 
 # TODO:
@@ -24,6 +26,11 @@ export ode_rosw, ode_rosw_fixed, ode_rodas3
 # The method used here uses transformed equations as done in PETSc.
 
 
+rosw_poststep(xtrial, err, t, dt, steps) = (showcompact(t), print(", ") ,
+                                            showcompact(dt), print(", "),
+                                            showcompact(err), print(", "),
+                                            println(steps))
+
 # Tableaus
 ##########
 
@@ -204,7 +211,12 @@ function oderosw_adapt{N,S}(fn, x0::AbstractVector, tspan, btab::TableauRosW{N,S
     k = Array(Tx, S) # stage variables
     allocate!(k, x0, dof)
     ks = zeros(Exf,dof) # work vector for one k
-    jac_store = zeros(Exf,dof,dof) # Jacobian storage
+    if isa(jacobian, Tuple)
+        jac_store = jacobian[2]
+        jacobian = jacobian[1]
+    else
+        jac_store = zeros(Exf,dof,dof) # Jacobian storage
+    end
     u = zeros(Exf,dof)    # work vector 
     udot = zeros(Exf,dof) # work vector 
 
@@ -250,7 +262,9 @@ function oderosw_adapt{N,S}(fn, x0::AbstractVector, tspan, btab::TableauRosW{N,S
             end
         end
         err, newdt, timeout = stepsize_hw92!(dt, tdir, x, xtrial, xerr, order, timeout,
-                                            dof, abstol, reltol, maxstep, norm)
+                                             dof, abstol, reltol, maxstep, norm)
+
+        rosw_poststep(xtrial, err, t, dt, steps)
         if err<=1.0 # accept step
             # diagnostics
             steps[1] +=1
@@ -300,6 +314,7 @@ function oderosw_adapt{N,S}(fn, x0::AbstractVector, tspan, btab::TableauRosW{N,S
                 laststep = true # next step is the last, if it succeeds
             end
         elseif abs(newdt)<minstep  # minimum step size reached, break
+            println(xerr)
             println("Warning: dt < minstep.  Stopping.")
             break
         else # redo step with smaller dt
@@ -422,7 +437,60 @@ function numerical_jacobian(fn!, reltol, abstol)
             tmp1[i] += edelta[i]
             tmp2[i] += a*edelta[i]
             fn!(tmp, tmp1, tmp2)
-            jac[:,i] = (tmp-f0)/edelta[i]
+            if isa(jac, SparseMatrixCSC)
+                for nz in nzrange(jac,i)
+                    nonzeros(jac)[nz] = tmpy[rowvals(jac)[nz]]
+                end
+            else
+                for j=1:length(y)
+                    jac[j,i] = (tmp[j]-f0[j])/edelta[i]
+                end
+            end
+        end
+        return nothing
+    end
+end
+
+function numerical_jacobian(fn!, reltol, abstol, JPattern1::SparseMatrixCSC, JPattern2::SparseMatrixCSC)
+    # JPattern1: sparsity pattern of dfn/dy
+    # JPattern2: sparsity pattern of dfn/dydot
+    
+    # S1 = color_jac(JPattern1) # seed matrix
+    # S2 = color_jac(JPattern2) # seed matrix
+    # nc1 = size(S1,2) # number of colors
+    # nc2 = size(S2,2) # number of colors
+
+    # TODO: there are probably more clever ways to do this:
+    S = color_jac(JPattern1+JPattern2)
+    nc = size(S,2)
+    function numjac_c!(jac, y, ydot, a)
+        # updates jac in-place
+        ep      = eps(1e6)   # this is the machine epsilon # TODO: better value?
+        h       = 1/a           # h ~ 1/a
+        wt      = reltol*abs(y).+abstol
+        # delta for approximation of jacobian.  I removed the
+        # sign(h_next*dy0) from the definition of delta because it was
+        # causing trouble when dy0==0 (which happens for ord==1)
+        edelta  = max(abs(y),abs(h*ydot),wt)*sqrt(ep)
+
+        dof = length(y)
+        res0 = similar(y)
+        fn!(res0, y, ydot)
+        dy = deepcopy(y) # y+dy
+        dydot = deepcopy(y) # y+dydot
+        res1 = similar(y)   # fn(y+dy, ydot)
+        res2 = similar(y)   # fn(y, ydot+dydot)
+        for c in 1:nc
+            add_delta!(dy, S, c, edelta)  # adds a delta to all indices of color c
+            add_delta!(dydot, S, c, edelta)  # adds a delta to all indices of color c
+            fn!(res1, dy, ydot)
+            fn!(res2, y, dydot)
+            for j=1:dof
+                res1[j] = ( res1[j]-res0[j] + a*(res2[j]-res0[j]) )/edelta[i]
+            end
+            recover_jac!(jac, res1, S, c) # recovers the columns of jac which correspond c
+            remove_delta!(dy, S, c, edelta) # remove delta again
+            remove_delta!(dydot, S, c, edelta) # remove delta again
         end
         return nothing
     end
diff --git a/src/runge_kutta.jl b/src/runge_kutta.jl
index aa6d0d260..043ee9064 100644
--- a/src/runge_kutta.jl
+++ b/src/runge_kutta.jl
@@ -393,7 +393,8 @@ function stepsize_hw92!(dt, tdir, x0, xtrial, xerr, order,
     # If timeout>0 no step size increase is allowed, timeout is
     # decremented in here.
     #
-    # Returns the error, newdt and the number of timeout-steps
+    # Returns the error, newdt and the number of timeout-steps.
+    # Updates xerr in-place with the relative error.
     #
     # TODO:
     # - allow component-wise reltol and abstol?

From 4c5683a71d7d6cb89917f9c42f2974313748aaba Mon Sep 17 00:00:00 2001
From: Mauro Werder <mauro_lc@runbox.com>
Date: Wed, 10 Jun 2015 13:32:01 +0200
Subject: [PATCH 13/15] Fixed bug in runge-kutta

When intital time step overstepped tspan[end] then it didn't work correctly.
---
 src/ODE.jl         |  3 ++-
 src/runge_kutta.jl | 10 +++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/ODE.jl b/src/ODE.jl
index 2c8287ad9..6738f12ed 100644
--- a/src/ODE.jl
+++ b/src/ODE.jl
@@ -62,6 +62,7 @@ Base.convert{Tnew<:Real}(::Type{Tnew}, tab::Tableau) = error("Define convert met
 # estimator for initial step based on book
 # "Solving Ordinary Differential Equations I" by Hairer et al., p.169
 function hinit(F, x0, t0, tend, p, reltol, abstol)
+    # Returns first step, direction of integration and F evaluated at t0
     tdir = sign(tend-t0)
     tdir==0 && error("Zero time span")
     tau = max(reltol*norm(x0, Inf), abstol)
@@ -84,7 +85,7 @@ function hinit(F, x0, t0, tend, p, reltol, abstol)
         pow = -(2. + log10(max(d1, d2)))/(p + 1.)
         h1 = 10.^pow
     end
-    return tdir*min(100*h0, h1), tdir, f0
+    return tdir*min(100*h0, h1, tdir*(tend-t0)), tdir, f0
 end
 
 # isoutofdomain takes the state and returns true if state is outside
diff --git a/src/runge_kutta.jl b/src/runge_kutta.jl
index 043ee9064..b0dbb7399 100644
--- a/src/runge_kutta.jl
+++ b/src/runge_kutta.jl
@@ -286,7 +286,7 @@ function oderk_adapt{N,S}(fn, y0::AbstractVector, tspan, btab_::TableauRKExplici
     steps = [0,0]  # [accepted, rejected]
 
     ## Integration loop
-    laststep = false
+    islaststep = abs(t+dt-tend)<=eps(tend) ? true : false
     timeout = 0 # for step-control
     iter = 2 # the index into tspan and ys
     while true
@@ -307,7 +307,7 @@ function oderk_adapt{N,S}(fn, y0::AbstractVector, tspan, btab_::TableauRKExplici
             f1 = isFSAL(btab) ? ks[S] : fn(t+dt, ytrial)
             if points==:specified
                 # interpolate onto given output points
-                while iter-1<nsteps_fixed && (tdir*tspan[iter]<tdir*(t+dt) || laststep) # output at all new times which are < t+dt
+                while iter-1<nsteps_fixed && (tdir*tspan[iter]<tdir*(t+dt) || islaststep) # output at all new times which are < t+dt
                     hermite_interp!(ys[iter], tspan[iter], t, dt, y, ytrial, f0, f1) # TODO: 3rd order only!
                     iter += 1
                 end
@@ -328,7 +328,7 @@ function oderk_adapt{N,S}(fn, y0::AbstractVector, tspan, btab_::TableauRKExplici
             ks[1] = f1 # load ks[1]==f0 for next step
 
             # Break if this was the last step:
-            laststep && break
+            islaststep && break
 
             # Swap bindings of y and ytrial, avoids one copy
             y, ytrial = ytrial, y
@@ -340,13 +340,13 @@ function oderk_adapt{N,S}(fn, y0::AbstractVector, tspan, btab_::TableauRKExplici
             # Hit end point exactly if next step within 1% of end:
             if tdir*(t+dt*1.01) >= tdir*tend
                 dt = tend-t
-                laststep = true # next step is the last, if it succeeds
+                islaststep = true # next step is the last, if it succeeds
             end
         elseif abs(newdt)<minstep  # minimum step size reached, break
             println("Warning: dt < minstep.  Stopping.")
             break
         else # redo step with smaller dt
-            laststep = false
+            islaststep = false
             steps[2] +=1
             dt = newdt
             timeout = timeout_const

From c1b35eb1f945898e12c35dad60e5a6b6328d7704 Mon Sep 17 00:00:00 2001
From: Mauro Werder <mauro_lc@runbox.com>
Date: Wed, 10 Jun 2015 13:34:26 +0200
Subject: [PATCH 14/15] Having a sparse Jacobian now works.

The problem was that A_ldiv_B! does not update B in case A is a sparse
lufact: https://github.com/JuliaLang/julia/issues/10787

Sparse coloring does not work yet.
---
 examples/bruss1d.jl | 202 ++++++++++++++++++++++++++++++++++++++++++++
 examples/hb1dae.jl  |  12 +--
 src/rosw.jl         |  84 ++++++++++--------
 3 files changed, 255 insertions(+), 43 deletions(-)
 create mode 100644 examples/bruss1d.jl

diff --git a/examples/bruss1d.jl b/examples/bruss1d.jl
new file mode 100644
index 000000000..4df7035eb
--- /dev/null
+++ b/examples/bruss1d.jl
@@ -0,0 +1,202 @@
+using ODE,DASSL
+
+const T = Float64 # the datatype used, probably Float64
+Tarr = SparseMatrixCSC
+const N = 500 # active gridpoints
+const dof = 2N # degrees of freedom (boundary points are not free)
+dx = 1/(N+1)
+x = dx:dx:1-dx
+dae = 0  # index of DAE, ==0 for ODE
+
+# parameters
+alpha = 1/50
+const gamma = alpha/dx^2
+
+# BC
+const one_ = one(T)
+const three = 3*one_
+ubc1() = one_
+ubc2() = one_
+vbc1() = three
+vbc2() = three
+#IC
+u0(x) = 1 + 0.5*sin(2π*x)
+v0(x) = 3 * ones(length(x))
+
+inds(i) = (2i-1, 2i)
+function fn!(res, y, dydt)
+    # the ode function res = f(y,dydt)
+    #
+    # Note the u and v are staggered:
+    # u = y[1:2:end], v=y[2:2:end]
+    
+    # setup for first step
+    i = 1
+    iu, iv = inds(i)
+    ui0 = ubc1(); vi0 = vbc1() # BC
+    ui1 = y[iu];   vi1 = y[iv]
+    ui2 = y[iu+2]; vi2 = y[iv+2]
+    while i<=N
+        res[iu] = 1 + ui1^2*vi1 - 4*ui1 + gamma*(ui0 - 2*ui1 + ui2) - dydt[iu]
+        res[iv] = 3*ui1 - ui1^2*vi1     + gamma*(vi0 - 2*vi1 + vi2) - dydt[iv]
+        # setup for next step:
+        i += 1
+        iu, iv = inds(i)
+        ui0 = ui1; vi0 = vi1
+        ui1 = ui2; vi1 = vi2
+        if i<N
+            ui2 = y[iu+2]
+            vi2 = y[iv+2]
+        else # on last step do BC
+            ui2 = ubc2()
+            vi2 = vbc2()
+        end
+    end
+    return nothing
+end
+fn!(;T_::Type=T, dof_=dof) = zeros(T_,dof_)
+function fn(t, y, ydot)
+    res = fn!()
+    fn!(res, y, ydot)
+    return res
+end
+function fn_ex(t, y)
+    res = fn!()
+    fn!(res, y, 0*y)
+    return res
+end
+
+
+
+# Jacobian is a banded matrix with upper and lower bandwidth 2,
+# c.f. W&H p.148, but Julia does not support banded matrices yet.
+# Thus use a sparse matrix.
+
+function jac!(dfdy, y, ydot, a)
+    # The Jacobian of fn: = df/dy + a* df/dydot
+    ii = 1 # direct index into dfdy.nzval
+    for i=1:N # iterate over double-columns
+        iu, iv = inds(i)
+        ui = y[iu]
+        vi = y[iv]
+        # iu: column belonging to d/du(i)
+        if iu-2>=1
+            # du'(i-1)/du(i)
+            dfdy.nzval[ii] = gamma
+            ii +=1
+        end
+        if iv-2>=1
+            # dv'(i-1)/du(i)==0
+            dfdy.nzval[ii] = 0
+            ii +=1
+        end
+        # du'(i)/du(i)
+        dfdy.nzval[ii]   = 2*ui*vi - 4 -2*gamma - a
+        ii += 1
+        # dv'(i)/du(i)
+        dfdy.nzval[ii] = 3 - 2*ui*vi
+        ii +=1
+        if  iu+2<=2N
+            # du'(i+1)/du(i)
+            dfdy.nzval[ii] = gamma
+            ii +=1
+        end
+        
+        # iv: column belonging to d/dv(i)
+        if iv-2>=1
+            # dv'(i-1)/dv(i)
+            dfdy.nzval[ii] = gamma
+            ii +=1
+        end
+        # du'(i)/dv(i)
+        dfdy.nzval[ii] = ui^2
+        ii +=1
+        # dv'(i)/dv(i)
+        dfdy.nzval[ii]   = -ui^2 -2*gamma - a
+        ii +=1
+        if  iu+2<=2N
+            # du'(i+1)/dv(i)==0
+            dfdy.nzval[ii] = 0
+            ii +=1
+        end
+        if iv+2<=2N
+            # dv'(i+1)/dv(i)
+            dfdy.nzval[ii] = gamma
+            ii +=1
+        end
+    end
+    return nothing
+end
+function jac!(;T_::Type=T, dof_=dof)
+    B = (ones(T_,dof_-2), ones(T_,dof_-1), ones(T_,dof_), ones(T_,dof_-1), ones(T_,dof_-2))        
+    spdiagm(B, (-2,-1,0,1,2))
+end
+function jac(t, y, ydot, a)
+    J = jac!()
+    jac!(J, y, ydot, a)
+    return J
+end
+function jac_ex(t, y)
+    return jac(t, y, y, 0)
+end
+
+refsol = T[0.9949197002317599, 3.0213845767604077, 0.9594350193986054, 3.0585989778165419, 0.9243010095428502, 3.0952478919989637, 0.8897959106772672,
+           3.1310118289054731, 0.8561653620284367, 3.1656101198770159, 0.8236197147449046, 3.1988043370624344, 0.7923328094811884, 3.2303999530641514,
+           0.7624421042573115, 3.2602463873623941, 0.7340499750795348, 3.2882356529108807, 0.7072259700779899, 3.3142998590079271, 0.6820097782458483,
+           3.3384078449410937, 0.6584146743834650, 3.3605612157873943, 0.6364312187752559, 3.3807900316323134, 0.6160310186921587, 3.3991483695914764,
+           0.5971703941198909, 3.4157099395342736, 0.5797938277687891, 3.4305638938070224, 0.5638371159206763, 3.4438109320334580, 0.5492301695479158,
+           3.4555597666485198, 0.5358994429426996, 3.4659239846027008, 0.5237699892215797, 3.4750193162238476, 0.5127671585747183, 3.4829613034792271,
+           0.5028179665048467, 3.4898633463634923, 0.4938521662914935, 3.4958350971335204, 0.4858030633656755, 3.5009811668111510, 0.4786081100251151,
+           3.5054001059792705, 0.4722093177200750, 3.5091836216744015, 0.4665535216425440, 3.5124159935026285, 0.4615925290790646, 3.5151736544621075,
+           0.4572831793403656, 3.5175249049438184, 0.4535873393501199, 3.5195297317024448, 0.4504718553589467, 3.5212397070273984, 0.4479084778719241,
+           3.5226979467564341, 0.4458737738041973, 3.5239391090719634, 0.4443490371324889, 3.5249894191569453, 0.4433202068820853, 3.5258667077466495,
+           0.4427777991494095, 3.5265804544017270, 0.4427168579654424, 3.5271318289682063, 0.4431369281018266, 3.5275137272135266, 0.4440420513508381,
+           3.5277107990730161, 0.4454407863109616, 3.5276994703501980, 0.4473462502188303, 3.5274479611304068, 0.4497761798232572, 3.5269163066324394,
+           0.4527530066369863, 3.5260563887768472, 0.4563039400688689, 3.5248119894251024, 0.4604610498812091, 3.5231188790654930, 0.4652613370907894,
+           3.5209049576992761, 0.4707467798082714, 3.5180904678044698, 0.4769643375804777, 3.5145883024867057, 0.4839658945842979, 3.5103044351908528,
+           0.4918081185812277, 3.5051385005173827, 0.5005522089940899, 3.4989845585737802, 0.5102635039989190, 3.4917320776245013, 0.5210109134090777,
+           3.4832671712209993, 0.5328661417420937, 3.4734741260299615, 0.5459026646938675, 3.4622372546582585, 0.5601944229089820, 3.4494431032230182,
+           0.5758142001453760, 3.4349830354873361, 0.5928316594749734, 3.4187562033108012, 0.6113110218368440, 3.4006728962523969, 0.6313083867734524,
+           3.3806582409098729, 0.6528687160104193, 3.3586561928427350, 0.6760225267555723, 3.3346337311179157, 0.7007823726569721, 3.3085851288057930,
+           0.7271392249346637, 3.2805361342349380, 0.7550589020044152, 3.2505478606008622, 0.7844787296769868, 3.2187201496972175, 0.8153046416214843,
+           3.1851941538893653, 0.8474089465959840, 3.1501538739882800, 0.8806289904192589, 3.1138264039027113, 0.9147669230929857, 3.0764806689389470,
+           0.9495907429372025, 3.0384245041548366, 0.9848367306701233] # reference sol from Hairer has 1.36 scd
+refsolinds = collect(1:7:dof) # refsol is only at components 1:7:2N
+
+
+ic = zeros(T,2N)
+ic[1:2:2N] = u0(x)
+ic[2:2:2N] = v0(x)
+tspan = T[0.0, 10.0] # integration interval
+tspan0 = T[0.0, 0.001] # integration interval
+
+#t,ydassl = dasslSolve(fn, ic, tspan, jacobian=jac)
+
+## ode23s
+# tout, yout = ode23s(fn_ex, ic, tspan0, jacobian=jac_ex)
+# @time tout, yout = ode23s(fn_ex, ic, tspan, jacobian=jac_ex)
+# @show norm(abs(yout[end][refsolinds]-refsol)./refsol, Inf)
+
+# ## DASSL
+# t,ydassl = dasslSolve(fn, ic, tspan0, jacobian=jac)
+# @time t,ydassl = dasslSolve(fn, ic, tspan, jacobian=jac)
+# @show norm(abs(ydassl[end][refsolinds]-refsol)./refsol, Inf)
+
+## ROSW
+
+# # No jac works but is slower and gives much higher precision than
+# # other methods.
+# tout, yout = ode_rosw(fn!, ic, tspan0)
+# @time tout, yout = ode_rosw(fn!, ic, tspan)
+# @show norm(abs(yout[end][refsolinds]-refsol)./refsol, Inf)
+
+# Analytic Jacobian
+tout, yout = ode_rosw(fn!, ic, tspan0, jacobian=(jac!, jac!()))
+@time tout, yout = ode_rosw(fn!, ic, tspan, jacobian=(jac!, jac!()))
+@show norm(abs(yout[end][refsolinds]-refsol)./refsol, Inf)
+
+# Numerical colored Jacobian
+tout, yout = ode_rosw(fn!, ic, tspan0, jacobian=jac!())
+@time tout, yout = ode_rosw(fn!, ic, tspan, jacobian=jac!())
+@show norm(abs(yout[end][refsolinds]-refsol)./refsol, Inf)
+
diff --git a/examples/hb1dae.jl b/examples/hb1dae.jl
index 461b7e40d..e6902685f 100644
--- a/examples/hb1dae.jl
+++ b/examples/hb1dae.jl
@@ -54,12 +54,12 @@ yr = hcat(yrosw...);
 println("Relative difference between DASSL vs ROSW, no Jac:")
 println(abs(ydassl[end]-yrosw[end])./abs(ydassl[end]))
 
-println("Using fixed step ROSW")
-tspan = t
-t,yrosw = ode_rosw_fixed(hb1dae!, y0, tspan)
-@time t,yrosw = ode_rosw_fixed(hb1dae!, y0, tspan)
-println("Relative difference between DASSL vs fixed step, no Jac:")
-println(abs(ydassl[end]-yrosw[end])./abs(ydassl[end]))
+# println("Using fixed step ROSW")
+# tspan = t
+# t,yrosw = ode_rosw_fixed(hb1dae!, y0, tspan)
+# @time t,yrosw = ode_rosw_fixed(hb1dae!, y0, tspan)
+# println("Relative difference between DASSL vs fixed step, no Jac:")
+# println(abs(ydassl[end]-yrosw[end])./abs(ydassl[end]))
 
 # with Jacobian
 tspan = [0, 4e6]
diff --git a/src/rosw.jl b/src/rosw.jl
index 04680c420..bd36d3c34 100644
--- a/src/rosw.jl
+++ b/src/rosw.jl
@@ -26,10 +26,12 @@ export ode_rosw, ode_rosw_fixed, ode_rodas3
 # The method used here uses transformed equations as done in PETSc.
 
 
-rosw_poststep(xtrial, err, t, dt, steps) = (showcompact(t), print(", ") ,
-                                            showcompact(dt), print(", "),
-                                            showcompact(err), print(", "),
-                                            println(steps))
+# rosw_poststep(xtrial, err, t, dt, steps) = (showcompact(t), print(", ") ,
+#                                             showcompact(dt), print(", "),
+#                                             showcompact(err), print(", "),
+#                                             println(steps))
+
+rosw_poststep(xtrial, err, t, dt, steps) = nothing
 
 # Tableaus
 ##########
@@ -132,13 +134,13 @@ const bt_ros_rodas3 = TableauRosW(:ros_rodas3, (3,4), Rational{Int},
 ###################
 # Fixed step solver
 ###################
-ode_rosw_fixed(fn, x0, tspan; kwargs...) = oderosw_fixed(fn, x0, tspan, bt_ros34pw2, kwargs...)
-function oderosw_fixed{N,S}(fn, x0::AbstractVector, tspan,
+ode_rosw_fixed(fn!, x0, tspan; kwargs...) = oderosw_fixed(fn!, x0, tspan, bt_ros34pw2, kwargs...)
+function oderosw_fixed{N,S}(fn!, x0::AbstractVector, tspan,
                             btab::TableauRosW{N,S};
-                            jacobian=numerical_jacobian(fn, 1e-6, 1e-6)
+                            jacobian=numerical_jacobian(fn!, 1e-6, 1e-6)
                             )
     # TODO: refactor with oderk_fixed
-    Et, Exf, Tx, btab = make_consistent_types(fn, x0, tspan, btab)
+    Et, Exf, Tx, btab = make_consistent_types(fn!, x0, tspan, btab)
     btab = tabletransform(btab)
     dof = length(x0)
 
@@ -155,26 +157,26 @@ function oderosw_fixed{N,S}(fn, x0::AbstractVector, tspan,
     u = zeros(Exf,dof)    # work vector 
     udot = zeros(Exf,dof) # work vector 
     
-    # allocate!(ks, x0, dof) # no need to allocate as fn is not in-place
+    # allocate!(ks, x0, dof) # no need to allocate as fn! is not in-place
     xtmp = similar(x0, Exf, dof)
     for i=1:length(tspan)-1
         dt = tspan[i+1]-tspan[i]
-        rosw_step!(xs[i+1], fn, jacobian, xs[i], dt, dof, btab,
+        rosw_step!(xs[i+1], fn!, jacobian, xs[i], dt, dof, btab,
                    k, jac_store, ks, u, udot)
     end
     return tspan, xs
 end
 
 
-ode_rosw(fn, x0, tspan;kwargs...) = oderosw_adapt(fn, x0, tspan, bt_ros34pw2; kwargs...)
-ode_rodas3(fn, x0, tspan;kwargs...) = oderosw_adapt(fn, x0, tspan, bt_ros_rodas3; kwargs...)
-function oderosw_adapt{N,S}(fn, x0::AbstractVector, tspan, btab::TableauRosW{N,S};
+ode_rosw(fn!, x0, tspan;kwargs...) = oderosw_adapt(fn!, x0, tspan, bt_ros34pw2; kwargs...)
+ode_rodas3(fn!, x0, tspan;kwargs...) = oderosw_adapt(fn!, x0, tspan, bt_ros_rodas3; kwargs...)
+function oderosw_adapt{N,S}(fn!, x0::AbstractVector, tspan, btab::TableauRosW{N,S};
                             reltol = 1.0e-5, abstol = 1.0e-8,
                             norm=Base.norm,
                             minstep=abs(tspan[end] - tspan[1])/1e18,
                             maxstep=abs(tspan[end] - tspan[1])/2.5,
                             initstep=0.,
-                            jacobian=numerical_jacobian(fn, reltol, abstol)
+                            jacobian=numerical_jacobian(fn!, reltol, abstol)
 #                            points=:all
                             )
     # TODO: refactor with oderk_adapt
@@ -188,8 +190,8 @@ function oderosw_adapt{N,S}(fn, x0::AbstractVector, tspan, btab::TableauRosW{N,S
         points=:all
     end
     ## Figure types
-    fn_expl = (t,x)->(out=similar(x); fn(out, x, x*0); out)
-    Et, Exf, Tx, btab = make_consistent_types(fn, x0, tspan, btab)
+    fn_expl = (t,x)->(out=similar(x); fn!(out, x, x*0); out)
+    Et, Exf, Tx, btab = make_consistent_types(fn!, x0, tspan, btab)
     btab = tabletransform(btab)
     # parameters
     order = minimum(btab.order)
@@ -211,10 +213,13 @@ function oderosw_adapt{N,S}(fn, x0::AbstractVector, tspan, btab::TableauRosW{N,S
     k = Array(Tx, S) # stage variables
     allocate!(k, x0, dof)
     ks = zeros(Exf,dof) # work vector for one k
-    if isa(jacobian, Tuple)
+    if isa(jacobian, Tuple) # Jacobian function and sparse storage provided
         jac_store = jacobian[2]
         jacobian = jacobian[1]
-    else
+    elseif isa(jacobian, SparseMatrixCSC) # Sparse storage provided, make colored Jacobian (TODO)
+        jac_store = jacobian
+        jacobian = numerical_jacobian(fn!, reltol, abstol) #, jac_store, jac_store)
+    else # nothing provided, make dense numerical jacobian
         jac_store = zeros(Exf,dof,dof) # Jacobian storage
     end
     u = zeros(Exf,dof)    # work vector 
@@ -247,11 +252,11 @@ function oderosw_adapt{N,S}(fn, x0::AbstractVector, tspan, btab::TableauRosW{N,S
     steps = [0,0]  # [accepted, rejected]
 
     ## Integration loop
-    laststep = false
+    islaststep = abs(t+dt-tend)<=eps(tend) ? true : false
     timeout = 0 # for step-control
     iter = 2 # the index into tspan and xs
     while true
-        rosw_step!(xtrial, fn, jacobian, x, dt, dof, btab,
+        rosw_step!(xtrial, fn!, jacobian, x, dt, dof, btab,
                    k, jac_store, ks, u, udot)
         # Completion again for embedded method, see line 927 of
         # http://www.mcs.anl.gov/petsc/petsc-current/src/ts/impls/rosw/rosw.c.html#TSROSW
@@ -276,13 +281,13 @@ function oderosw_adapt{N,S}(fn, x0::AbstractVector, tspan, btab::TableauRosW{N,S
             # f1 = fn_expl(t+dt, xtrial)
             if points==:specified
                 # interpolate onto given output points
-                while iter-1<nsteps_fixed && (tdir*tspan[iter]<tdir*(t+dt) || laststep) # output at all new times which are < t+dt
+                while iter-1<nsteps_fixed && (tdir*tspan[iter]<tdir*(t+dt) || islaststep) # output at all new times which are < t+dt
                     error("FIXME: not implemented yet")
                     hermite_interp!(xs[iter], tspan[iter], t, dt, x, xtrial, f0, f1) # TODO: 3rd order only!
                     iter += 1
                 end
             else
-                # first interpolate onto given output points
+                # First interpolate onto given output points
                 while iter_fixed-1<nsteps_fixed && tdir*t<tdir*tspan_fixed[iter_fixed]<tdir*(t+dt) # output at all new times which are < t+dt
                     error("FIXME: not implemented yet")
                     xout = hermite_interp(tspan_fixed[iter_fixed], t, dt, x, xtrial, f0, f1)
@@ -299,7 +304,7 @@ function oderosw_adapt{N,S}(fn, x0::AbstractVector, tspan, btab::TableauRosW{N,S
             # k[1] = f1 # load k[1]==f0 for next step
 
             # Break if this was the last step:
-            laststep && break
+            islaststep && break
 
             # Swap bindings of x and xtrial, avoids one copy
             x, xtrial = xtrial, x
@@ -311,14 +316,14 @@ function oderosw_adapt{N,S}(fn, x0::AbstractVector, tspan, btab::TableauRosW{N,S
             # Hit end point exactly if next step within 1% of end:
             if tdir*(t+dt*1.01) >= tdir*tend
                 dt = tend-t
-                laststep = true # next step is the last, if it succeeds
+                islaststep = true # next step is the last, if it succeeds
             end
         elseif abs(newdt)<minstep  # minimum step size reached, break
-            println(xerr)
+            println("max err=$(maximum(xerr)), mean err=$(mean(xerr))")
             println("Warning: dt < minstep.  Stopping.")
             break
         else # redo step with smaller dt
-            laststep = false
+            islaststep = false
             steps[2] +=1
             dt = newdt
             timeout = timeout_const
@@ -345,8 +350,9 @@ function rosw_step!{N,S}(xtrial, g!, gprime!, x, dt, dof, btab::TableauRosW_T{N,
         # first step of Newton iteration with guess ks==0
         #        k[s][:] = ks - jac\k[s] # in-place A_ldiv_B!(jac, k[s])
         # TODO: add option to do more Newton iterations
-        A_ldiv_B!(jac, k[s])  # TODO: see whether this is impacted by https://github.com/JuliaLang/julia/issues/10787
+        k[s] = A_ldiv_B!(jac, k[s])  # TODO: see whether this is impacted by https://github.com/JuliaLang/julia/issues/10787
                               # and https://github.com/JuliaLang/julia/issues/11325
+#        k[s][:]  = jac_store\k[s]
         for d=1:dof
             k[s][d] = ks[d]-k[s][d]
         end
@@ -356,7 +362,8 @@ function rosw_step!{N,S}(xtrial, g!, gprime!, x, dt, dof, btab::TableauRosW_T{N,
     # last step:
     #    k[S][:] = ks - jac\k[S]
     # TODO: add option to do more Newton iterations
-    A_ldiv_B!(jac, k[S])
+    k[S] = A_ldiv_B!(jac, k[S])
+#    k[S][:]  = jac_store\k[S]    
     for d=1:dof
         k[S][d] = ks[d]-k[S][d]
     end
@@ -408,7 +415,13 @@ function hprime!(res, xi, gprime!, u, udot, btab, dt)
     # The res will hold part of the LU factorization.  However use the
     # returned LU-type.
     gprime!(res, u, udot, 1./(dt*btab.γii))
-    return lufact!(res)
+    if true # issparse(res)
+        # For issues with sparse https://github.com/JuliaLang/julia/issues/7488
+        # Basically lufact! destroys res and it cannot be used anymore later.
+        return lufact(res)
+    else
+        return lufact!(res)
+    end
 end
 
 # Copied & adapted from DASSL:
@@ -416,12 +429,11 @@ end
 # finite differences
 function numerical_jacobian(fn!, reltol, abstol)
     function numjac!(jac, y, dy, a)
-        # TODO use coloring
-        
+        # Numerical Jacobian, finite differences, no coloring.
         ep      = eps(1e6)   # this is the machine epsilon # TODO: better value?
         h       = 1/a           # h ~ 1/a
         wt      = reltol*abs(y).+abstol
-        # delta for approximation of jacobian.  I removed the
+        # Delta for approximation of Jacobian.  I removed the
         # sign(h_next*dy0) from the definition of delta because it was
         # causing trouble when dy0==0 (which happens for ord==1)
         edelta  = max(abs(y),abs(h*dy),wt)*sqrt(ep)
@@ -437,7 +449,7 @@ function numerical_jacobian(fn!, reltol, abstol)
             tmp1[i] += edelta[i]
             tmp2[i] += a*edelta[i]
             fn!(tmp, tmp1, tmp2)
-            if isa(jac, SparseMatrixCSC)
+            if false #isa(jac, SparseMatrixCSC)
                 for nz in nzrange(jac,i)
                     nonzeros(jac)[nz] = tmpy[rowvals(jac)[nz]]
                 end
@@ -468,9 +480,7 @@ function numerical_jacobian(fn!, reltol, abstol, JPattern1::SparseMatrixCSC, JPa
         ep      = eps(1e6)   # this is the machine epsilon # TODO: better value?
         h       = 1/a           # h ~ 1/a
         wt      = reltol*abs(y).+abstol
-        # delta for approximation of jacobian.  I removed the
-        # sign(h_next*dy0) from the definition of delta because it was
-        # causing trouble when dy0==0 (which happens for ord==1)
+
         edelta  = max(abs(y),abs(h*ydot),wt)*sqrt(ep)
 
         dof = length(y)
@@ -486,7 +496,7 @@ function numerical_jacobian(fn!, reltol, abstol, JPattern1::SparseMatrixCSC, JPa
             fn!(res1, dy, ydot)
             fn!(res2, y, dydot)
             for j=1:dof
-                res1[j] = ( res1[j]-res0[j] + a*(res2[j]-res0[j]) )/edelta[i]
+                res1[j] = ( res1[j]-res0[j] + a*(res2[j]-res0[j]) )/edelta[j]
             end
             recover_jac!(jac, res1, S, c) # recovers the columns of jac which correspond c
             remove_delta!(dy, S, c, edelta) # remove delta again

From 64b63dc3d896ca575c3ace0c4964c33a7c1bc5af Mon Sep 17 00:00:00 2001
From: Mauro Werder <mauro_lc@runbox.com>
Date: Wed, 10 Jun 2015 17:10:16 +0200
Subject: [PATCH 15/15] finally got it working

[ci skip]
---
 examples/bruss1d.jl |  33 +++++++-------
 src/rosw.jl         | 104 ++++++++++++++++++++++----------------------
 2 files changed, 71 insertions(+), 66 deletions(-)

diff --git a/examples/bruss1d.jl b/examples/bruss1d.jl
index 4df7035eb..582005496 100644
--- a/examples/bruss1d.jl
+++ b/examples/bruss1d.jl
@@ -169,34 +169,37 @@ ic[1:2:2N] = u0(x)
 ic[2:2:2N] = v0(x)
 tspan = T[0.0, 10.0] # integration interval
 tspan0 = T[0.0, 0.001] # integration interval
+tspan1 = T[0.0, 0.3] # integration interval
 
 #t,ydassl = dasslSolve(fn, ic, tspan, jacobian=jac)
 
 ## ode23s
-# tout, yout = ode23s(fn_ex, ic, tspan0, jacobian=jac_ex)
-# @time tout, yout = ode23s(fn_ex, ic, tspan, jacobian=jac_ex)
-# @show norm(abs(yout[end][refsolinds]-refsol)./refsol, Inf)
+tout, yout = ode23s(fn_ex, ic, tspan0, jacobian=jac_ex)
+@time tout, yout = ode23s(fn_ex, ic, tspan, jacobian=jac_ex)
+@show norm(abs(yout[end][refsolinds]-refsol)./refsol, Inf)
 
 # ## DASSL
-# t,ydassl = dasslSolve(fn, ic, tspan0, jacobian=jac)
-# @time t,ydassl = dasslSolve(fn, ic, tspan, jacobian=jac)
-# @show norm(abs(ydassl[end][refsolinds]-refsol)./refsol, Inf)
+t,ydassl = dasslSolve(fn, ic, tspan0, jacobian=jac)
+@time t,ydassl = dasslSolve(fn, ic, tspan, jacobian=jac)
+@show norm(abs(ydassl[end][refsolinds]-refsol)./refsol, Inf)
 
 ## ROSW
+# for some reason the abstol and reltol need to be way lower to reach
+# the same precision.  Presumably an error in the step control.
+abstol, reltol = 1e-2, 1e-3
+# No jac
+tout, yout = ode_rosw(fn!, ic, tspan0)
+@time tout, yout = ode_rosw(fn!, ic, tspan, abstol=abstol, reltol=reltol);
+@show norm(abs(yout[end][refsolinds]-refsol)./refsol, Inf)
 
-# # No jac works but is slower and gives much higher precision than
-# # other methods.
-# tout, yout = ode_rosw(fn!, ic, tspan0)
-# @time tout, yout = ode_rosw(fn!, ic, tspan)
-# @show norm(abs(yout[end][refsolinds]-refsol)./refsol, Inf)
-
-# Analytic Jacobian
+# Analytic Jacobian, same as previous
 tout, yout = ode_rosw(fn!, ic, tspan0, jacobian=(jac!, jac!()))
-@time tout, yout = ode_rosw(fn!, ic, tspan, jacobian=(jac!, jac!()))
+@time tout, yout = ode_rosw(fn!, ic, tspan, jacobian=(jac!, jac!()), abstol=abstol, reltol=reltol);
 @show norm(abs(yout[end][refsolinds]-refsol)./refsol, Inf)
 
 # Numerical colored Jacobian
 tout, yout = ode_rosw(fn!, ic, tspan0, jacobian=jac!())
-@time tout, yout = ode_rosw(fn!, ic, tspan, jacobian=jac!())
+@time tout, yout = ode_rosw(fn!, ic, tspan, jacobian=jac!(), abstol=abstol, reltol=reltol);
 @show norm(abs(yout[end][refsolinds]-refsol)./refsol, Inf)
 
+nothing
diff --git a/src/rosw.jl b/src/rosw.jl
index bd36d3c34..fa4bd1b7b 100644
--- a/src/rosw.jl
+++ b/src/rosw.jl
@@ -213,12 +213,12 @@ function oderosw_adapt{N,S}(fn!, x0::AbstractVector, tspan, btab::TableauRosW{N,
     k = Array(Tx, S) # stage variables
     allocate!(k, x0, dof)
     ks = zeros(Exf,dof) # work vector for one k
-    if isa(jacobian, Tuple) # Jacobian function and sparse storage provided
+    if isa(jacobian, Tuple) # Jacobian function and sparse pattern/storage provided
         jac_store = jacobian[2]
         jacobian = jacobian[1]
     elseif isa(jacobian, SparseMatrixCSC) # Sparse storage provided, make colored Jacobian (TODO)
         jac_store = jacobian
-        jacobian = numerical_jacobian(fn!, reltol, abstol) #, jac_store, jac_store)
+        jacobian = numerical_jacobian(fn!, reltol, abstol, jac_store)
     else # nothing provided, make dense numerical jacobian
         jac_store = zeros(Exf,dof,dof) # Jacobian storage
     end
@@ -352,7 +352,6 @@ function rosw_step!{N,S}(xtrial, g!, gprime!, x, dt, dof, btab::TableauRosW_T{N,
         # TODO: add option to do more Newton iterations
         k[s] = A_ldiv_B!(jac, k[s])  # TODO: see whether this is impacted by https://github.com/JuliaLang/julia/issues/10787
                               # and https://github.com/JuliaLang/julia/issues/11325
-#        k[s][:]  = jac_store\k[s]
         for d=1:dof
             k[s][d] = ks[d]-k[s][d]
         end
@@ -363,7 +362,6 @@ function rosw_step!{N,S}(xtrial, g!, gprime!, x, dt, dof, btab::TableauRosW_T{N,
     #    k[S][:] = ks - jac\k[S]
     # TODO: add option to do more Newton iterations
     k[S] = A_ldiv_B!(jac, k[S])
-#    k[S][:]  = jac_store\k[S]    
     for d=1:dof
         k[S][d] = ks[d]-k[S][d]
     end
@@ -415,7 +413,7 @@ function hprime!(res, xi, gprime!, u, udot, btab, dt)
     # The res will hold part of the LU factorization.  However use the
     # returned LU-type.
     gprime!(res, u, udot, 1./(dt*btab.γii))
-    if true # issparse(res)
+    if issparse(res)
         # For issues with sparse https://github.com/JuliaLang/julia/issues/7488
         # Basically lufact! destroys res and it cannot be used anymore later.
         return lufact(res)
@@ -428,79 +426,83 @@ end
 # generate a function that computes approximate jacobian using forward
 # finite differences
 function numerical_jacobian(fn!, reltol, abstol)
-    function numjac!(jac, y, dy, a)
+    function numjac!(jac, y, ydot, a)
         # Numerical Jacobian, finite differences, no coloring.
-        ep      = eps(1e6)   # this is the machine epsilon # TODO: better value?
+        ep      = eps(1.0)   # this is the machine epsilon # TODO: better value?
         h       = 1/a           # h ~ 1/a
         wt      = reltol*abs(y).+abstol
         # Delta for approximation of Jacobian.  I removed the
-        # sign(h_next*dy0) from the definition of delta because it was
-        # causing trouble when dy0==0 (which happens for ord==1)
-        edelta  = max(abs(y),abs(h*dy),wt)*sqrt(ep)
-
-        tmp = similar(y)
-        tmp1 = similar(y)
-        tmp2 = similar(y)
+        # sign(h_next*ydot0) from the definition of delta because it was
+        # causing trouble when ydot0==0 (which happens for ord==1)
+        edelta  = max(abs(y),abs(h*ydot),wt)*sqrt(ep)
+        
         f0 = similar(y)
-        fn!(f0, y, dy)
+        fn!(f0, y, ydot)
+
+        dy = deepcopy(y)
+        dydot = deepcopy(ydot)
+        f1 = similar(y)
         for i=1:length(y)
-            copy!(tmp1, y)
-            copy!(tmp2, dy)
-            tmp1[i] += edelta[i]
-            tmp2[i] += a*edelta[i]
-            fn!(tmp, tmp1, tmp2)
-            if false #isa(jac, SparseMatrixCSC)
-                for nz in nzrange(jac,i)
-                    nonzeros(jac)[nz] = tmpy[rowvals(jac)[nz]]
-                end
-            else
-                for j=1:length(y)
-                    jac[j,i] = (tmp[j]-f0[j])/edelta[i]
-                end
+            dy[i] += edelta[i]
+            dydot[i] += a*edelta[i]
+            fn!(f1, dy, dydot)
+            for j=1:length(y)
+                jac[j,i] = (f1[j]-f0[j])/edelta[i]
             end
+            # remove delta again
+            dy[i] -= edelta[i]
+            dydot[i] -= a*edelta[i]
         end
         return nothing
     end
 end
 
-function numerical_jacobian(fn!, reltol, abstol, JPattern1::SparseMatrixCSC, JPattern2::SparseMatrixCSC)
-    # JPattern1: sparsity pattern of dfn/dy
-    # JPattern2: sparsity pattern of dfn/dydot
-    
-    # S1 = color_jac(JPattern1) # seed matrix
-    # S2 = color_jac(JPattern2) # seed matrix
-    # nc1 = size(S1,2) # number of colors
-    # nc2 = size(S2,2) # number of colors
+if VERSION < v"0.4.0-dev"
+    # TODO: remove these two for 0.4
+    rowvals(S::SparseMatrixCSC) = S.rowval
+    nzrange(S::SparseMatrixCSC, col::Integer) = S.colptr[col]:(S.colptr[col+1]-1)
+end
+function numerical_jacobian(fn!, reltol, abstol, JPattern::SparseMatrixCSC)
+    # JPattern: sparsity pattern of dfn/dy + dfn/dydot
+    #           Typically this matrix will be used for storage of the
+    #           Jacobian as well.
 
     # TODO: there are probably more clever ways to do this:
-    S = color_jac(JPattern1+JPattern2)
+    S = color_jac(JPattern)
     nc = size(S,2)
     function numjac_c!(jac, y, ydot, a)
         # updates jac in-place
-        ep      = eps(1e6)   # this is the machine epsilon # TODO: better value?
+        ep      = eps(1.0)   # this is the machine epsilon # TODO: better value?
         h       = 1/a           # h ~ 1/a
         wt      = reltol*abs(y).+abstol
 
         edelta  = max(abs(y),abs(h*ydot),wt)*sqrt(ep)
+        a_edelta = a*edelta
+
+        f0 = similar(y)
+        fn!(f0, y, ydot)
 
-        dof = length(y)
-        res0 = similar(y)
-        fn!(res0, y, ydot)
         dy = deepcopy(y) # y+dy
-        dydot = deepcopy(y) # y+dydot
-        res1 = similar(y)   # fn(y+dy, ydot)
-        res2 = similar(y)   # fn(y, ydot+dydot)
+        dydot = deepcopy(ydot) # y+dydot
+        f1 = similar(y)   # fn(y+dy, ydot)
         for c in 1:nc
             add_delta!(dy, S, c, edelta)  # adds a delta to all indices of color c
-            add_delta!(dydot, S, c, edelta)  # adds a delta to all indices of color c
-            fn!(res1, dy, ydot)
-            fn!(res2, y, dydot)
-            for j=1:dof
-                res1[j] = ( res1[j]-res0[j] + a*(res2[j]-res0[j]) )/edelta[j]
+            add_delta!(dydot, S, c, a_edelta)  # adds a delta to all indices of color c
+            fn!(f1, dy, dydot)
+            for j=1:length(y)
+                f1[j] = f1[j]-f0[j]
             end
-            recover_jac!(jac, res1, S, c) # recovers the columns of jac which correspond c
+            recover_jac!(jac, f1, S, c) # recovers the columns of jac which correspond c
             remove_delta!(dy, S, c, edelta) # remove delta again
-            remove_delta!(dydot, S, c, edelta) # remove delta again
+            remove_delta!(dydot, S, c, a_edelta) # remove delta again
+        end
+        # do the division by edelta
+        nz = nonzeros(jac)
+        for j =1:size(jac,2)
+            ed = edelta[j]
+            for i in nzrange(jac,j)
+                nz[i] /= ed
+            end
         end
         return nothing
     end