diff --git a/doc/docs/NLopt_Algorithms.md b/doc/docs/NLopt_Algorithms.md index 786c8097..975dd7a5 100644 --- a/doc/docs/NLopt_Algorithms.md +++ b/doc/docs/NLopt_Algorithms.md @@ -337,6 +337,8 @@ specified using the [`nlopt_set_param` API](NLopt_Reference.md#algorithm-specifi * `dual_algorithm` (defaults to `NLOPT_LD_MMA`), `dual_ftol_rel` (defaults to `1e-14`), `dual_ftol_abs` (defaults to `0`), `dual_xtol_rel` (defaults to `0`), `dual_xtol_abs` (defaults to `0`), `dual_maxeval` (defaults to `100000`): These specify how the algorithm internally solves the "dual" optimization problem for its approximate objective. Because this subsidiary solve requires no evaluations of the user's objective function, it is typically fast enough that we can solve it to high precision without worrying too much about the details. However, in high-dimensional problems you may notice that MMA/CCSA is taking a long time between optimization steps, in which case you may want to increase `dual_ftol_rel` or make other changes. If these parameters are not specified, NLopt takes them from the [subsidiary-optimizer algorithm](NLopt_Reference.md#localsubsidiary-optimization-algorithm) if that has been specified, and otherwise uses the defaults indicated here. * `verbosity`: If > 0, causes the algorithm to print internal status information on each iteration. * `rho_init`: if specified, should be a rough upper bound for the second derivative (the biggest eigenvalue of the Hessian of the objective or constraints); defaults to `1.0`. CCSA/MMA will adaptively adjust this as the optimization progresses, so even if `rho_init` is completely wrong the algorithm will still converge. A `rho_init` that is too large will cause the algorithm to take overly small steps at the beginning, while a `rho_init` that is too small will cause it to take overly large steps (and have to backtrack) at the beginning. 
Similarly, you can also use the "initial stepsize" option ([NLopt reference](NLopt_Reference.md#initial-step-size)) to control the maximum size of the initial steps (half the diameter of the trust region). +* `inner_gradients` (defaults to `1`): if `0`, the gradient is *not* computed on "inner" iterations of the CCSA algorithm. This can be useful in order to save computations on inner iterations, and also for the caller to distinguish inner and outer iterations. However, that means that your functions are called twice on outer iterations: once for the last inner iteration (without gradients), and again with the gradient; to save wasted computation, you may want to cache the last function value and not recompute it (only compute the gradient) if `x` does not change between subsequent calls. +* `always_improve` (defaults to `1`): if `0`, only update the current point (whence the gradients and model) on outer iterations, as in the original Svanberg paper; if `1`, we are slightly more aggressive about updating the current point, even doing so on inner iterations if the new point is strictly better (e.g. feasible and better objective) than the previous point. 
### SLSQP diff --git a/src/algs/mma/ccsa_quadratic.c b/src/algs/mma/ccsa_quadratic.c index f89fc78e..77a3a89b 100644 --- a/src/algs/mma/ccsa_quadratic.c +++ b/src/algs/mma/ccsa_quadratic.c @@ -219,6 +219,7 @@ nlopt_result ccsa_quadratic_minimize( double *minf, nlopt_stopping *stop, nlopt_opt dual_opt, int inner_maxeval, unsigned verbose, double rho_init, + int inner_gradients, int always_improve, double sigma_min, const double *sigma_init) { nlopt_result ret = NLOPT_SUCCESS; @@ -329,6 +330,7 @@ nlopt_result ccsa_quadratic_minimize( sigma[j] = 1.0; /* arbitrary default */ else sigma[j] = 0.5 * (ub[j] - lb[j]); + sigma[j] = MAX(sigma[j], sigma_min); } rho = rho_init; for (i = 0; i < m; ++i) { @@ -446,7 +448,7 @@ nlopt_result ccsa_quadratic_minimize( i, y[i], i, dd.gcval[i]); } - fcur = f(n, xcur, dfdx_cur, f_data); + fcur = f(n, xcur, inner_gradients ? dfdx_cur : NULL, f_data); ++ *(stop->nevals_p); ++inner_nevals; if (nlopt_stop_forced(stop)) { @@ -454,7 +456,8 @@ nlopt_result ccsa_quadratic_minimize( feasible_cur = 1; infeasibility_cur = 0; inner_done = dd.gval >= fcur; for (i = ifc = 0; ifc < mfc; ++ifc) { - nlopt_eval_constraint(fcval_cur + i, dfcdx_cur + i*n, + nlopt_eval_constraint(fcval_cur + i, + inner_gradients ? dfcdx_cur + i*n : NULL, fc + ifc, n, xcur); i += fc[ifc].m; if (nlopt_stop_forced(stop)) { @@ -474,10 +477,45 @@ nlopt_result ccsa_quadratic_minimize( inner_done = inner_done || (inner_maxeval > 0 && inner_nevals == inner_maxeval); - if ((fcur < *minf && (inner_done || feasible_cur || !feasible)) - || (!feasible && infeasibility_cur < infeasibility)) { + /* update the current point. If always_improve (the default), use an aggressive + strategy that always accepts an improvement even if !inner_done, + otherwise only update the current point if inner_done (= outer iteration), + more like the original Svanberg paper */ + if (always_improve ? 
+ (fcur < *minf && (inner_done || feasible_cur || !feasible)) + || (!feasible && infeasibility_cur < infeasibility) : inner_done) { if (verbose && !feasible_cur) printf("CCSA - using infeasible point?\n"); + + if (!inner_gradients) { /* evaluate again, this time with gradients */ + fcur = f(n, xcur, dfdx_cur, f_data); + /* don't update *(stop->nevals_p) -- hope user caches xcur + so that they don't actually recompute f */ + if (nlopt_stop_forced(stop)) { + ret = NLOPT_FORCED_STOP; goto done; } + feasible_cur = 1; infeasibility_cur = 0; + for (i = ifc = 0; ifc < mfc; ++ifc) { + nlopt_eval_constraint(fcval_cur + i, + dfcdx_cur + i*n, + fc + ifc, n, xcur); + i += fc[ifc].m; + if (nlopt_stop_forced(stop)) { + ret = NLOPT_FORCED_STOP; goto done; } + } + /* recompute feasible_cur etc in case the caller + has changed the objective function for an outer iteration, + but don't change inner_done in that case */ + for (i = ifc = 0; ifc < mfc; ++ifc) { + unsigned i0 = i, inext = i + fc[ifc].m; + for (; i < inext; ++i) { + feasible_cur = feasible_cur + && fcval_cur[i] <= fc[ifc].tol[i-i0]; + if (fcval_cur[i] > infeasibility_cur) + infeasibility_cur = fcval_cur[i]; + } + } + } + dd.fval = *minf = fcur; infeasibility = infeasibility_cur; memcpy(fcval, fcval_cur, sizeof(double)*m); @@ -548,6 +586,7 @@ nlopt_result ccsa_quadratic_minimize( 0.01*(ub-lb), which seems unnecessarily large */ sigma[j] = MAX(sigma[j], 1e-8*(ub[j]-lb[j])); } + sigma[j] = MAX(sigma[j], sigma_min); } for (j = 0; j < MIN(verbose, n); ++j) printf(" CCSA sigma[%u] -> %g\n", diff --git a/src/algs/mma/mma.c b/src/algs/mma/mma.c index 3236099c..505cedd4 100644 --- a/src/algs/mma/mma.c +++ b/src/algs/mma/mma.c @@ -149,6 +149,7 @@ nlopt_result mma_minimize(unsigned n, nlopt_func f, void *f_data, double *minf, nlopt_stopping *stop, nlopt_opt dual_opt, int inner_maxeval, unsigned verbose, double rho_init, + int inner_gradients, int always_improve, double sigma_min, const double *sigma_init) { nlopt_result ret = 
NLOPT_SUCCESS; @@ -205,6 +206,7 @@ nlopt_result mma_minimize(unsigned n, nlopt_func f, void *f_data, sigma[j] = 1.0; /* arbitrary default */ else sigma[j] = 0.5 * (ub[j] - lb[j]); + sigma[j] = MAX(sigma[j], sigma_min); } rho = rho_init; for (i = 0; i < m; ++i) { @@ -292,7 +294,7 @@ nlopt_result mma_minimize(unsigned n, nlopt_func f, void *f_data, i, y[i], i, dd.gcval[i]); } - fcur = f(n, xcur, dfdx_cur, f_data); + fcur = f(n, xcur, inner_gradients ? dfdx_cur : NULL, f_data); ++ *(stop->nevals_p); ++inner_nevals; if (nlopt_stop_forced(stop)) { @@ -301,7 +303,7 @@ nlopt_result mma_minimize(unsigned n, nlopt_func f, void *f_data, new_infeasible_constraint = 0; inner_done = dd.gval >= fcur; for (i = ifc = 0; ifc < mfc; ++ifc) { - nlopt_eval_constraint(fcval_cur + i, dfcdx_cur + i*n, + nlopt_eval_constraint(fcval_cur + i, inner_gradients ? dfcdx_cur + i*n : NULL, fc + ifc, n, xcur); i += fc[ifc].m; if (nlopt_stop_forced(stop)) { @@ -325,10 +327,48 @@ nlopt_result mma_minimize(unsigned n, nlopt_func f, void *f_data, inner_done = inner_done || (inner_maxeval > 0 && inner_nevals == inner_maxeval); - if ((fcur < *minf && (inner_done || feasible_cur || !feasible)) - || (!feasible && infeasibility_cur < infeasibility)) { + /* update the current point. If always_improve (the default), use an aggressive + strategy that always accepts an improvement even if !inner_done, + otherwise only update the current point if inner_done (= outer iteration), + more like the original Svanberg paper */ + if (always_improve ? 
(fcur < *minf && (inner_done || feasible_cur || !feasible)) + || (!feasible && infeasibility_cur < infeasibility) : inner_done) { if (verbose && !feasible_cur) printf("MMA - using infeasible point?\n"); + + if (!inner_gradients) { /* evaluate again, this time with gradients */ + fcur = f(n, xcur, dfdx_cur, f_data); + /* don't update *(stop->nevals_p) -- hope user caches xcur + so that they don't actually recompute f */ + if (nlopt_stop_forced(stop)) { + ret = NLOPT_FORCED_STOP; goto done; } + feasible_cur = 1; infeasibility_cur = 0; + new_infeasible_constraint = 0; + inner_done = dd.gval >= fcur; + for (i = ifc = 0; ifc < mfc; ++ifc) { + nlopt_eval_constraint(fcval_cur + i, dfcdx_cur + i*n, + fc + ifc, n, xcur); + i += fc[ifc].m; + if (nlopt_stop_forced(stop)) { + ret = NLOPT_FORCED_STOP; goto done; } + } + /* recompute feasible_cur etc in case the caller + has changed the objective function for an outer iteration, + but don't change inner_done in that case */ + for (i = ifc = 0; ifc < mfc; ++ifc) { + unsigned i0 = i, inext = i + fc[ifc].m; + for (; i < inext; ++i) + if (!nlopt_isnan(fcval_cur[i])) { + feasible_cur = feasible_cur + && (fcval_cur[i] <= fc[ifc].tol[i-i0]); + if (nlopt_isnan(fcval[i]) && fcval_cur[i] > 0) + new_infeasible_constraint = 1; + if (fcval_cur[i] > infeasibility_cur) + infeasibility_cur = fcval_cur[i]; + } + } + } + dd.fval = *minf = fcur; infeasibility = infeasibility_cur; memcpy(fcval, fcval_cur, sizeof(double)*m); @@ -398,6 +438,7 @@ nlopt_result mma_minimize(unsigned n, nlopt_func f, void *f_data, sigma[j] = MIN(sigma[j], 10*(ub[j]-lb[j])); sigma[j] = MAX(sigma[j], 0.01*(ub[j]-lb[j])); } + sigma[j] = MAX(sigma[j], sigma_min); } for (j = 0; j < MIN(verbose, n); ++j) printf(" MMA sigma[%u] -> %g\n", diff --git a/src/algs/mma/mma.h b/src/algs/mma/mma.h index ca2131c4..72a813c6 100644 --- a/src/algs/mma/mma.h +++ b/src/algs/mma/mma.h @@ -41,6 +41,7 @@ nlopt_result mma_minimize(unsigned n, nlopt_func f, void *f_data, double *minf, 
nlopt_stopping *stop, nlopt_opt dual_opt, int inner_maxeval, unsigned verbose, double rho_init, + int inner_gradients, int always_improve, double sigma_min, const double *sigma_init); nlopt_result ccsa_quadratic_minimize( @@ -54,6 +55,7 @@ nlopt_result ccsa_quadratic_minimize( double *minf, nlopt_stopping *stop, nlopt_opt dual_opt, int inner_maxeval, unsigned verbose, double rho_init, + int inner_gradients, int always_improve, double sigma_min, const double *sigma_init); #ifdef __cplusplus diff --git a/src/api/optimize.c b/src/api/optimize.c index 4cc6583b..20268f9a 100644 --- a/src/api/optimize.c +++ b/src/api/optimize.c @@ -798,11 +798,20 @@ static nlopt_result nlopt_optimize_(nlopt_opt opt, double *x, double *minf) int inner_maxeval = (int)nlopt_get_param(opt, "inner_maxeval",0); int verbosity = (int)nlopt_get_param(opt, "verbosity",0); double rho_init = nlopt_get_param(opt, "rho_init",1.0); + int inner_gradients = (int)nlopt_get_param(opt, "inner_gradients",1); + int always_improve = (int)nlopt_get_param(opt, "always_improve",1); + double sigma_min = nlopt_get_param(opt, "sigma_min",0.0); nlopt_opt dual_opt; nlopt_result ret; if (!(rho_init > 0) && !nlopt_isinf(rho_init)) RETURN_ERR(NLOPT_INVALID_ARGS, opt, "rho_init must be positive and finite"); + if (inner_gradients != 0 && inner_gradients != 1) + RETURN_ERR(NLOPT_INVALID_ARGS, opt, "inner_gradients must be 0 or 1"); + if (always_improve != 0 && always_improve != 1) + RETURN_ERR(NLOPT_INVALID_ARGS, opt, "always_improve must be 0 or 1"); + if (sigma_min < 0.0) + RETURN_ERR(NLOPT_INVALID_ARGS, opt, "sigma_min must be non-negative"); verbosity = verbosity < 0 ? 0 : verbosity; #define LO(param, def) (opt->local_opt ? 
opt->local_opt->param : (def)) @@ -817,9 +826,9 @@ static nlopt_result nlopt_optimize_(nlopt_opt opt, double *x, double *minf) nlopt_set_maxeval(dual_opt, (int)nlopt_get_param(opt, "dual_maxeval", LO(maxeval, 100000))); #undef LO if (algorithm == NLOPT_LD_MMA) - ret = mma_minimize(n, f, f_data, opt->m, opt->fc, lb, ub, x, minf, &stop, dual_opt, inner_maxeval, (unsigned)verbosity, rho_init, opt->dx); + ret = mma_minimize(n, f, f_data, opt->m, opt->fc, lb, ub, x, minf, &stop, dual_opt, inner_maxeval, (unsigned)verbosity, rho_init, inner_gradients, always_improve, sigma_min, opt->dx); else - ret = ccsa_quadratic_minimize(n, f, f_data, opt->m, opt->fc, opt->pre, lb, ub, x, minf, &stop, dual_opt, inner_maxeval, (unsigned)verbosity, rho_init, opt->dx); + ret = ccsa_quadratic_minimize(n, f, f_data, opt->m, opt->fc, opt->pre, lb, ub, x, minf, &stop, dual_opt, inner_maxeval, (unsigned)verbosity, rho_init, inner_gradients, always_improve, sigma_min, opt->dx); nlopt_destroy(dual_opt); return ret; }