/*
** $Id: lmnlfit.c,v 1.3 1999/11/19 00:01:51 frank Exp $
** lmnlfit.c 1.1   BSD 4.3 UNIX   940406   SIO/ODF   fmd
**
** LMnlFit -- non-linear least squares fitting using a modified
** Levenberg-Marquardt algorithm.
**
** This routine has been translated from the slatec FORTRAN routine snls1
** by K. L. Hiebert.
**
** LMnlFit minimizes the sum of the squares of m nonlinear
** functions in n variables by a modification of the
** Levenberg-Marquardt algorithm.  This code is a combination
** of the Argonne Laboratory MINPACK subroutines lmder, lmdif, and lmstr.
**
** The user must provide a subroutine which calculates the functions,
** and may choose how the Jacobian is supplied: the full Jacobian, the
** rows of the Jacobian (to avoid storing the full Jacobian), or a
** forward-difference approximation computed by this code.
**
** Reference:
**   More, Jorge J.,
**   The Levenberg-Marquardt algorithm, implementation and theory.
**   Numerical Analysis, G. A. Watson, Editor.
**   Lecture Notes in Mathematics 630, Springer-Verlag, 1977.
*/

/*
** NOTE: the header names below are reconstructed from what the code uses
** (fprintf, malloc/free, sqrt/fabs/log10/ldexp, the DBL_* constants, and
** quiet_nan); <nan.h> in particular is system-specific and may need to be
** adjusted.
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <nan.h>        /* for quiet_nan() */

#ifndef FALSE
#define FALSE 0
#define TRUE  1
#endif

#define min(a,b) (((a)<(b))?(a):(b))
#define max(a,b) (((a)>(b))?(a):(b))

static double LMParam();
double EuclidianNorm(), FPMachineConst();
int LMPrintMsg();
static int ApproxJacob(), Erroneously();
static void ChkFunGrad(), QRFactor(), QRSolve(), RWUpdt();
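/*
** The two callbacks below are a minimal, hypothetical illustration of the
** calling protocol that LMnlFit() expects from pInitFunc and pCalcFunc;
** they are not part of the library and are compiled only when
** LMNLFIT_EXAMPLE is defined.  The model (y = a*exp(-b*t)), the sample
** data, and the names ExampleInit, ExampleCalc, exT and exY are invented
** for illustration only.  A matching driver appears at the end of the file.
*/
#ifdef LMNLFIT_EXAMPLE

#define EX_M 10     /* number of observations (functions) */
#define EX_N 2      /* number of parameters: x[0] = a, x[1] = b */

/* made-up sample data, roughly y = 2*exp(-0.5*t) */
static double exT[EX_M] = { 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
static double exY[EX_M] = { 2.00, 1.21, 0.74, 0.45, 0.27, 0.16, 0.10, 0.06,
                            0.04, 0.02 };

/*
** pInitFunc: return the optional settings (see the option comments in
** LMnlFit() below).
*/
static int
ExampleInit(jacOpt, scaleOpt, fTol, xTol, gTol, epsFcn, stepBound)
int *jacOpt, *scaleOpt;
double *fTol, *xTol, *gTol, *epsFcn, *stepBound;
{
    *jacOpt = 2;                        /* we supply the full Jacobian */
    *scaleOpt = 0;                      /* let LMnlFit() compute the scaling */
    *fTol = *xTol = sqrt(FPMachineConst(3));
    *gTol = 0.0;
    *epsFcn = 0.0;                      /* only used when jacOpt == 1 */
    *stepBound = 100.0;
    return (0);
}

/*
** pCalcFunc: control code 0 = print the current iterate, 1 = evaluate the
** residuals into fvec, 2 = store the full Jacobian into fjac (column j
** starts at fjac[j*ldfjac]).  A negative return value would abort the fit.
*/
static int
ExampleCalc(code, m, n, x, fvec, fjac, ldfjac)
int code, m, n;
double *x, *fvec, *fjac;
int ldfjac;
{
    double e;
    int i;

    switch (code) {
    case 0:
        fprintf(stderr, "a = %g  b = %g\n", x[0], x[1]);
        break;
    case 1:                             /* f_i = a*exp(-b*t_i) - y_i */
        for (i = 0; i < m; i++)
            fvec[i] = x[0]*exp(-x[1]*exT[i]) - exY[i];
        break;
    case 2:                             /* Jacobian of the residuals */
        for (i = 0; i < m; i++) {
            e = exp(-x[1]*exT[i]);
            fjac[i]          = e;                   /* df_i/da */
            fjac[ldfjac + i] = -x[0]*exT[i]*e;      /* df_i/db */
        }
        break;
    }
    return (0);
}
#endif /* LMNLFIT_EXAMPLE */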
int                             /* <- return code: 1-4 = ok */
LMnlFit(pInitFunc, pCalcFunc, m, n, x, fvec, maxfev, printInterval, nfev)
int (*pInitFunc)();     /* -> A user-supplied function to return additional
                        **    optional parameters.  May be supplied as NULL. */
int (*pCalcFunc)();     /* -> A user-supplied function to evaluate the
                        **    functions to solve.  May also optionally print
                        **    intermediate results, calculate the Jacobian
                        **    matrix for the solution and supply scaling
                        **    information.  Called as (*pCalcFunc)(code, m, n,
                        **    x, fvec, fjac, ldfjac); code 0 = print the
                        **    current iterate, 1 = evaluate the functions into
                        **    fvec, 2 = full Jacobian, 3 = one Jacobian row,
                        **    4 = scaling vector.  A negative return value
                        **    aborts the fit. */
int m;                  /* -> number of functions to solve. */
int n;                  /* -> number of variables to solve for (n <= m). */
double *x;              /* <> vector of length n containing an initial
                        **    solution estimate.  Returns the final estimate. */
double *fvec;           /* <- vector of length m returning the function
                        **    evaluations at x. */
int maxfev;             /* -> Limits the number of user-supplied function
                        **    evaluations. */
int printInterval;      /* -> If > 0, pCalcFunc will be called every
                        **    printInterval iterations through the
                        **    Levenberg-Marquardt algorithm with a control
                        **    code of 0 to print intermediate values. */
int *nfev;              /* <- Total number of times pCalcFunc was called to
                        **    evaluate the functions. */
{
    /*
    ** Variables:
    ** =========
    */
    double actred, checkLimit = 0.1, delta, *diag, dirder, epsFcn, epsMach,
           err, *fjac, fnorm, fnorm1, fTol, gnorm, gTol, par, *pFjacj, pnorm,
           prered, *qtf, ratio, stepBound, sum, temp, temp1, temp2,
           *wa1, *wa2, *wa3, *wa4, xnorm, xTol;
    unsigned allocSize;
    int funRtn, i, *ipvt, info, iter, j, jacOpt, l, ldfjac, nrow,
        scaleOpt, sing;

    /*
    ** Code:
    ** ====
    */
    epsMach = FPMachineConst(3);
    info = *nfev = 0;

    /*
    ** Get initialization options:
    **
    ** jacOpt    -- Jacobian matrix calculation:
    **              1 = approximate by forward differencing;
    **              2 = pCalcFunc calculates the entire Jacobian;
    **              3 = pCalcFunc calculates the Jacobian a row at a time.
    ** scaleOpt  -- Scaling option:
    **              0 = let LMnlFit() do it;
    **              1 = pCalcFunc calculates the scaling vector.
    ** fTol      -- Function-convergence tolerance.
    ** xTol      -- Relative-error tolerance.
    ** gTol      -- Orthogonality (gradient) convergence tolerance.
    ** epsFcn    -- If jacOpt==1, epsilon for the forward-difference
    **              approximation.  Otherwise ignored.
    ** stepBound -- A factor used to determine the initial Levenberg-
    **              Marquardt step bound.  100 is a good value.
    */
    if (n <= 0 || m < n || pCalcFunc == NULL)
        return (Erroneously(0));
    if (pInitFunc == NULL) {
        jacOpt = 1;
        scaleOpt = 0;
        fTol = sqrt(FPMachineConst(3));
        xTol = sqrt(FPMachineConst(3));
        gTol = 0.0;
        epsFcn = 0.0;
        stepBound = 100.0;
    } else
        (*pInitFunc)(&jacOpt, &scaleOpt, &fTol, &xTol, &gTol, &epsFcn,
                     &stepBound);

    /*
    ** check the input parameters for errors.
    */
    if (jacOpt < 1 || jacOpt > 3 || fTol < 0.0 || xTol < 0.0 || gTol < 0.0 ||
        maxfev <= 0 || stepBound <= 0.0)
        return (Erroneously(0));

    /*
    ** Allocate work areas.
    */
    ldfjac = (jacOpt != 3) ? m : n;
    allocSize = (     n*sizeof (int))    +      /* ipvt  */
                (     n*sizeof (double)) +      /* qtf   */
                (     n*sizeof (double)) +      /* diag  */
                (   3*n*sizeof (double)) +      /* wa1-3 */
                (     m*sizeof (double)) +      /* wa4   */
                (ldfjac*n*sizeof (double));     /* fjac  */
    if ((qtf = (double *)malloc(allocSize)) == NULL)
        return (Erroneously(9));
    diag = qtf+n;
    wa1 = diag+n;
    wa2 = wa1+n;
    wa3 = wa2+n;
    wa4 = wa3+n;
    fjac = wa4+m;
    ipvt = (int *)(fjac+ldfjac*n);

    /*
    ** if user-specified scaling, get it.
    */
    if (scaleOpt != 0) {
        funRtn = (*pCalcFunc)(4, m, n, x, fvec, diag, n);
        if (funRtn < 0) {
            (void)free((char *)qtf);
            return (Erroneously(-1));
        }
        for (j = 0; j < n; j++)
            if (diag[j] <= 0.0) {
                (void)free((char *)qtf);
                return (Erroneously(0));
            }
    }

    /*
    ** evaluate the function at the starting point and calculate its norm.
    */
    *nfev = 1;
    funRtn = (*pCalcFunc)(1, m, n, x, fvec, fjac, ldfjac);
    if (funRtn < 0) {
        (void)free((char *)qtf);
        return (Erroneously(funRtn));
    }
    fnorm = EuclidianNorm(m, fvec);

    /*
    ** Initialize the Levenberg-Marquardt parameter and the iteration counter.
    */
    par = 0.0;
    iter = 1;

    /*
    ** beginning of the outer loop.
    */
    for (;;) {
        /*
        ** if requested, call pCalcFunc to enable printing of iterates.
        */
        if (printInterval > 0)
            if ((iter - 1) % printInterval == 0) {
                funRtn = (*pCalcFunc)(0, m, n, x, fvec, fjac, ldfjac);
                if (funRtn < 0)
                    break;
            }

        /*
        ** Calculate the Jacobian matrix.
        ** Store the full Jacobian using m x n storage.
        */
        switch (jacOpt) {

        /*
        ** Approximate the Jacobian using the supplied function.
        */
        case 1:
            funRtn = ApproxJacob(pCalcFunc, m, n, x, fvec, fjac, ldfjac,
                                 epsFcn, wa4);
            *nfev += n;
            break;

        /*
        ** The user supplies the full Jacobian.
        */
        case 2:
            funRtn = (*pCalcFunc)(2, m, n, x, fvec, fjac, ldfjac);
            /*
            ** on the first iteration, check the user-supplied Jacobian.
            */
            if (iter == 1) {
                if (funRtn < 0)
                    break;
                /*
                ** get the incremented x-values into wa1[].
                */
                ChkFunGrad(m, n, x, fvec, fjac, ldfjac, wa1, wa4, 1, &err);
                /*
                ** evaluate function at incremented value and put in wa4[].
                */
                funRtn = (*pCalcFunc)(1, m, n, wa1, wa4, fjac, ldfjac);
                ++(*nfev);
                if (funRtn < 0)
                    break;
                for (i = 0; i < m; i++) {
                    ChkFunGrad(1, n, x, fvec, &fjac[i], ldfjac, wa1, &wa4[i],
                               2, &err);
                    if (err < checkLimit)
                        fprintf(stderr, "%s%d%s\n",
                                "LMnlFit -- derivative of function ", i,
                                " may be wrong, close to 0.");
                }
            }
            break;

        /*
        ** pCalcFunc supplies the Jacobian one row at a time.
        */
        case 3:
            /*
            ** accumulate the Jacobian by rows in order to save storage.
            ** compute the QR factorization of the Jacobian matrix
            ** calculated one row at a time, while simultaneously
            ** forming (q transpose)*fvec and storing the first n
            ** components in qtf.
            */
            for (j = 0, pFjacj = fjac; j < n; j++, pFjacj += ldfjac) {
                qtf[j] = 0.0;
                for (i = 0; i < n; i++)
                    pFjacj[i] = 0.0;
            }
            for (i = 0; i < m; i++) {
                nrow = i;
                funRtn = (*pCalcFunc)(3, m, n, x, fvec, wa3, nrow);
                if (funRtn < 0)
                    break;
                /*
                ** on the first iteration, check the user-supplied Jacobian.
                */
                if (iter == 1) {
                    /*
                    ** get the incremented x-values into wa1[].
                    */
                    ChkFunGrad(m, n, x, fvec, fjac, ldfjac, wa1, wa4, 1, &err);
                    /*
                    ** evaluate at incremented values, if not already evaluated.
                    */
                    if (i == 0) {
                        /*
                        ** evaluate function at incremented value and put
                        ** into wa4[].
                        */
                        funRtn = (*pCalcFunc)(1, m, n, wa1, wa4, fjac, nrow);
                        ++(*nfev);
                        if (funRtn < 0)
                            break;
                    }
                    ChkFunGrad(1, n, x, fvec, wa3, 1, wa1, wa4, 2, &err);
                    if (err < checkLimit && printInterval)
                        fprintf(stderr, "%s%d%s\n",
                                "LMnlFit -- derivative of function ", i,
                                " may be wrong, too close to 0.");
                }
                temp = fvec[i];
                RWUpdt(n, fjac, ldfjac, wa3, qtf, &temp, wa1, wa2);
            }
            if (funRtn < 0)
                break;

            /*
            ** if the Jacobian is rank deficient, call QRFactor to
            ** reorder its columns and update the components of qtf.
            */
            sing = FALSE;
            for (j = 0, pFjacj = fjac; j < n; j++, pFjacj += ldfjac) {
                if (pFjacj[j] == 0.0)
                    sing = TRUE;
                ipvt[j] = j;
                wa2[j] = EuclidianNorm(j+1, pFjacj);
            }
            if (sing) {
                QRFactor(n, n, fjac, ldfjac, TRUE, ipvt, wa1, wa2, wa3);
                for (j = 0, pFjacj = fjac; j < n; j++, pFjacj += ldfjac) {
                    if (pFjacj[j] != 0.0) {
                        sum = 0.0;
                        for (i = j; i < n; i++)
                            sum += pFjacj[i] * qtf[i];
                        temp = -sum / pFjacj[j];
                        for (i = j; i < n; i++)
                            qtf[i] += pFjacj[i] * temp;
                    }
                    pFjacj[j] = wa1[j];
                }
            }
            break;
        }

        if (funRtn < 0)
            break;

        if (jacOpt != 3) {
            /*
            ** Compute the QR factorization of the Jacobian.
            */
            QRFactor(m, n, fjac, ldfjac, TRUE, ipvt, wa1, wa2, wa3);

            /*
            ** form (q transpose)*fvec and store the first n components
            ** in qtf.
            */
            for (i = 0; i < m; i++)
                wa4[i] = fvec[i];
            for (j = 0, pFjacj = fjac; j < n; j++, pFjacj += ldfjac) {
                if (pFjacj[j] != 0.0) {
                    sum = 0.0;
                    for (i = j; i < m; i++)
                        sum += pFjacj[i] * wa4[i];
                    temp = -sum / pFjacj[j];
                    for (i = j; i < m; i++)
                        wa4[i] += pFjacj[i] * temp;
                }
                pFjacj[j] = wa1[j];
                qtf[j] = wa4[j];
            }
        }

        /*
        ** On the first iteration and if scaleOpt is 0, scale according
        ** to the norms of the columns of the initial Jacobian.
        */
        if (iter == 1) {
            if (scaleOpt == 0)
                for (j = 0; j < n; j++)
                    diag[j] = ((wa2[j] == 0.0) ? 1.0 : wa2[j]);
            /*
            ** On the first iteration, calculate the norm of the scaled x
            ** and initialize the step bound delta.
            */
            for (j = 0; j < n; j++)
                wa3[j] = diag[j] * x[j];
            xnorm = EuclidianNorm(n, wa3);
            delta = stepBound * xnorm;
            if (delta == 0.0)
                delta = stepBound;
        }

        /*
        ** Compute the norm of the scaled gradient.
        */
        gnorm = 0.0;
        if (fnorm != 0.0) {
            for (j = 0, pFjacj = fjac; j < n; j++, pFjacj += ldfjac) {
                l = ipvt[j];
                if (wa2[l] == 0.0)
                    continue;
                sum = 0.0;
                for (i = 0; i <= j; i++)
                    sum += pFjacj[i] * (qtf[i] / fnorm);
                gnorm = max(gnorm, fabs(sum/wa2[l]));
            }
        }

        /*
        ** Test for convergence of the gradient norm.
        */
        if (gnorm <= gTol)
            info = 4;
        if (info != 0)
            break;

        /*
        ** Rescale if necessary.
        */
        if (scaleOpt == 0)
            for (j = 0; j < n; j++)
                diag[j] = max(diag[j], wa2[j]);

        /*
        ** Beginning of the inner loop.
        */
        for (;;) {
            /*
            ** Determine the Levenberg-Marquardt parameter.
            */
            par = LMParam(n, par, fjac, ldfjac, ipvt, diag, qtf, delta,
                          wa1, wa2, wa3, wa4);

            /*
            ** Store the direction p and x + p.  Calculate the norm of p.
            */
            for (j = 0; j < n; j++) {
                wa1[j] = -wa1[j];
                wa2[j] = x[j] + wa1[j];
                wa3[j] = diag[j] * wa1[j];
            }
            pnorm = EuclidianNorm(n, wa3);

            /*
            ** On the first iteration, adjust the initial step bound.
            */
            if (iter == 1)
                delta = min(delta, pnorm);

            /*
            ** Evaluate the function at x + p and calculate its norm.
            */
            funRtn = (*pCalcFunc)(1, m, n, wa2, wa4, fjac, ldfjac);
            ++(*nfev);
            if (funRtn < 0)
                break;
            fnorm1 = EuclidianNorm(m, wa4);

            /*
            ** Compute the scaled actual reduction.
            */
            actred = -1.0;
            if ((0.1*fnorm1) < fnorm)
                actred = 1.0 - (fnorm1*fnorm1)/(fnorm*fnorm);

            /*
            ** Compute the scaled predicted reduction and
            ** the scaled directional derivative.
            */
            for (j = 0, pFjacj = fjac; j < n; j++, pFjacj += ldfjac) {
                wa3[j] = 0.0;
                l = ipvt[j];
                temp = wa1[l];
                for (i = 0; i <= j; i++)
                    wa3[i] += pFjacj[i] * temp;
            }
            temp1 = EuclidianNorm(n, wa3) / fnorm;
            temp2 = (sqrt(par)*pnorm)/fnorm;
            prered = temp1*temp1 + temp2*temp2*2.0;
            dirder = -(temp1*temp1 + temp2*temp2);

            /*
            ** Compute the ratio of the actual to the predicted reduction.
            */
            ratio = 0.0;
            if (prered != 0.0)
                ratio = actred/prered;

            /*
            ** Update the step bound.
            */
            if (ratio > 0.25) {
                if (par == 0.0 || ratio >= 0.75) {
                    delta = pnorm * 2.0;
                    par = 0.5 * par;
                }
            } else {
                if (actred >= 0.0)
                    temp = 2.0;
                else
                    temp = (dirder + 0.5*actred)/(0.5*dirder);
                if ((0.1*fnorm1) >= fnorm || temp > 10.0)
                    temp = 10.0;
                delta = min(delta, pnorm*10.0) / temp;
                par *= temp;
            }

            /*
            ** Test for successful iteration.
            */
            if (ratio >= 1.0e-4) {
                /*
                ** Successful iteration.  Update x, fvec, and their norms.
                */
                for (j = 0; j < n; j++) {
                    x[j] = wa2[j];
                    wa2[j] = diag[j] * x[j];
                }
                for (i = 0; i < m; i++)
                    fvec[i] = wa4[i];
                xnorm = EuclidianNorm(n, wa2);
                fnorm = fnorm1;
                iter++;
            }

            /*
            ** Tests for convergence.
            */
            if (fabs(actred) <= fTol && prered <= fTol && (0.5*ratio) <= 1.0)
                info = 1;
            if (delta <= xTol*xnorm) {
                if (info == 1)
                    info = 3;
                else
                    info = 2;
            }
            if (info != 0)
                break;

            /*
            ** Tests for termination and stringent tolerances.
            */
            if (*nfev >= maxfev)
                info = 5;
            if (fabs(actred) <= epsMach && prered <= epsMach &&
                (0.5*ratio) <= 1.0)
                info = 6;
            if (delta <= (epsMach*xnorm))
                info = 7;
            if (gnorm <= epsMach)
                info = 8;
            if (info != 0)
                break;
            if (ratio >= 1.0e-4)
                break;
        } /* end of the inner loop.  repeat if iteration unsuccessful. */

        /*
        ** A termination condition (or a user abort) raised inside the
        ** inner loop must also end the outer loop.
        */
        if (info != 0 || funRtn < 0)
            break;
    } /* end of the outer loop. */

    /*
    ** Termination, either normal or user imposed.
    */
    if (funRtn < 0)
        info = funRtn;
    if (printInterval > 0)
        funRtn = (*pCalcFunc)(0, m, n, x, fvec, fjac, ldfjac);
    if (printInterval || info > 3 || info < 0)
        (void)LMPrintMsg(info);
    if (qtf)
        (void)free((char *)(qtf));
    return (info);
} /* LMnlFit */


static int
Erroneously(num)
int num;
{
    return (LMPrintMsg(num));
}

/*
** Calculate the Levenberg-Marquardt parameter.
*/
static double
LMParam(n, prvPar, r, ldr, ipvt, diag, qtb, delta, x, sigma, wa1, wa2)
int n;
double prvPar, *r;
int ldr, *ipvt;
double *diag, *qtb, delta, *x, *sigma, *wa1, *wa2;
{
    double par, parc, parl, temp, paru, dwarf, gnorm, fp, dxnorm, sum, *pRj;
    int iter, i, j, k, l, nsing;

    dwarf = FPMachineConst(0);

    /*
    ** Compute and store in x the Gauss-Newton direction.  If the
    ** Jacobian is rank-deficient, obtain a least squares solution.
    */
    par = prvPar;
    nsing = n-1;
    for (j = 0, pRj = r; j < n; j++, pRj += ldr) {
        if (pRj[j] == 0.0 && nsing == (n-1))
            nsing = j-1;
        if (nsing < (n-1))
            wa1[j] = 0.0;
        else
            wa1[j] = qtb[j];
    }
    for (j = nsing, pRj = &r[j*ldr]; j >= 0; --j, pRj -= ldr) {
        temp = wa1[j] / pRj[j];
        wa1[j] = temp;
        for (i = 0; i < j; i++)
            wa1[i] -= pRj[i]*temp;
    }
    for (j = 0; j < n; j++) {
        l = ipvt[j];
        x[l] = wa1[j];
    }

    /*
    ** Initialize the iteration counter.
    ** Evaluate the function at the origin, and test
    ** for acceptance of the Gauss-Newton direction.
    */
    iter = 0;
    for (j = 0; j < n; j++)
        wa2[j] = diag[j] * x[j];
    dxnorm = EuclidianNorm(n, wa2);
    fp = dxnorm - delta;
    if (fp <= (0.1*delta))
        return (0.0);

    /*
    ** If the Jacobian is not rank deficient, the Newton
    ** step provides a lower bound, parl, for the zero of
    ** the function.  Otherwise set this bound to 0.0.
    */
    parl = 0.0;
    if (nsing >= (n-1)) {
        for (j = 0; j < n; j++) {
            l = ipvt[j];
            wa1[j] = diag[l] * (wa2[l]/dxnorm);
        }
        for (j = 0, pRj = r; j < n; j++, pRj += ldr) {
            sum = 0.0;
            for (i = 0; i < j; i++)
                sum += pRj[i] * wa1[i];
            wa1[j] = (wa1[j] - sum) / pRj[j];
        }
        temp = EuclidianNorm(n, wa1);
        parl = fp/(delta*temp*temp);
    }

    /*
    ** Calculate an upper bound, paru, for the zero of the function.
    */
    for (j = 0, pRj = r; j < n; j++, pRj += ldr) {
        sum = 0.0;
        for (i = 0; i <= j; i++)
            sum += pRj[i] * qtb[i];
        l = ipvt[j];
        wa1[j] = sum / diag[l];
    }
    gnorm = EuclidianNorm(n, wa1);
    paru = gnorm / delta;
    if (paru == 0.0)
        paru = dwarf / min(delta, 0.1);

    /*
    ** If the input par lies outside of the interval (parl,paru),
    ** set par to the closer endpoint.
    */
    par = min(max(par, parl), paru);
    if (par == 0.0)
        par = gnorm / dxnorm;

    /*
    ** Beginning of an iteration.
    */
    for (;;) {
        iter++;

        /*
        ** Evaluate the function at the current value of par.
        */
        if (par == 0.0)
            par = max(dwarf, 0.001*paru);
        temp = sqrt(par);
        for (j = 0; j < n; j++)
            wa1[j] = temp * diag[j];
        QRSolve(n, r, ldr, ipvt, wa1, qtb, x, sigma, wa2);
        for (j = 0; j < n; j++)
            wa2[j] = diag[j] * x[j];
        dxnorm = EuclidianNorm(n, wa2);
        temp = fp;
        fp = dxnorm - delta;

        /*
        ** If the function is small enough, accept the current value
        ** of par.  Also test for the exceptional cases where parl
        ** is 0.0 or the number of iterations has reached 10.
        */
        if ((fabs(fp) <= (0.1*delta)) ||
            (parl == 0.0 && fp <= temp && temp < 0.0) || (iter == 10))
            break;

        /*
        ** Compute the Newton correction.
        */
        for (j = 0; j < n; j++) {
            l = ipvt[j];
            wa1[j] = diag[l]*(wa2[l]/dxnorm);
        }
        for (j = 0, pRj = r; j < n; j++, pRj += ldr) {
            temp = wa1[j] / sigma[j];
            wa1[j] = temp;
            for (i = j+1; i < n; i++)
                wa1[i] -= pRj[i] * temp;
        }
        temp = EuclidianNorm(n, wa1);
        parc = fp/(delta*temp*temp);

        /*
        ** Depending on the sign of the function, update parl or paru.
        */
        if (fp > 0.0)
            parl = max(parl, par);
        else if (fp < 0.0)
            paru = min(paru, par);

        /*
        ** compute an improved estimate for par.
        */
        par = max(parl, par+parc);
    }

    /*
    ** Termination.
    */
    if (iter == 0)
        par = 0.0;
    return (par);
} /* LMParam */

/*
** Check the gradients of m nonlinear functions in n
** variables, evaluated at a point x, for consistency
** with the functions themselves.
*/
static void
ChkFunGrad(m, n, x, fvec, fjac, ldfjac, xp, fvecp, mode, err)
int m, n;
double *x, *fvec, *fjac;
int ldfjac;
double *xp, *fvecp;
int mode;
double *err;
{
    double factor = 100.;
    double epsf, temp;
    int i, j;
    double epsMach, epslog, eps;

    /*
    ** epsMach is the machine precision.
    */
    epsMach = FPMachineConst(3);
    eps = sqrt(epsMach);

    if (mode == 1) {
        for (j = 0; j < n; j++) {
            temp = eps * fabs(x[j]);
            if (temp == 0.0)
                temp = eps;
            xp[j] = x[j] + temp;
        }
    } else {
        double *pFjacj;

        epsf = factor * epsMach;
        epslog = log10(eps);
        for (i = 0; i < m; i++)
            err[i] = 0.0;
        for (j = 0, pFjacj = fjac; j < n; j++, pFjacj += ldfjac) {
            temp = fabs(x[j]);
            if (temp == 0.0)
                temp = 1.0;
            for (i = 0; i < m; i++)
                err[i] += temp * pFjacj[i];
        }
        for (i = 0; i < m; i++) {
            temp = 1.0;
            if (fvec[i] != 0.0 && fvecp[i] != 0.0 &&
                fabs(fvecp[i]-fvec[i]) >= (epsf*fabs(fvec[i])))
                temp = eps * fabs((fvecp[i]-fvec[i])/eps - err[i]) /
                       (fabs(fvec[i]) + fabs(fvecp[i]));
            err[i] = 1.0;
            if (temp > epsMach && temp < eps)
                err[i] = (log10(temp) - epslog) / epslog;
            if (temp >= eps)
                err[i] = 0.0;
        }
    }
} /* ChkFunGrad */

/*
** Approximate the Jacobian by forward-differencing.
*/
static int
ApproxJacob(pCalcFunc, m, n, x, fvec, fjac, ldfjac, epsFcn, wa)
int (*pCalcFunc)();
int m, n;
double *x, *fvec, *fjac;
int ldfjac;
double epsFcn, *wa;
{
    double *pFjacj, temp, h, eps;
    int funRtn, i, j;

    eps = sqrt(max(epsFcn, FPMachineConst(3)));

    /*
    ** Call (*pCalcFunc)() with option=1 to indicate that function values
    ** are to be returned.
    */
    for (j = 0, pFjacj = fjac; j < n; j++, pFjacj += ldfjac) {
        temp = x[j];
        if ((h = eps * fabs(temp)) == 0.0)
            h = eps;
        x[j] += h;
        funRtn = (*pCalcFunc)(1, m, n, x, wa, fjac, ldfjac);
        if (funRtn >= 0) {
            x[j] = temp;
            for (i = 0; i < m; i++)
                pFjacj[i] = (wa[i] - fvec[i])/h;
        } else
            break;
    }
    return (funRtn);
} /* ApproxJacob */

/*
** Using Householder transformations with column
** pivoting (optional), compute a QR factorization of the
** m by n matrix a.  QRFactor determines an orthogonal
** matrix q, a permutation matrix p, and an upper trapezoidal
** matrix r with diagonal elements of nonincreasing magnitude,
** such that a*p = q*r.  The Householder transformation for
** column k, k = 1,2,...,min(m,n), is of the form
**
**                        t
**        i - (1/u(k))*u*u
**
** where u has zeros in the first k-1 positions.
*/
static void
QRFactor(m, n, a, lda, calcPivot, ipvt, rdiag, acnorm, wa)
int m, n;
double *a;
int lda;
int calcPivot;
int *ipvt;
double *rdiag, *acnorm, *wa;
{
    double *pAj, *pAk, *pAkmax, temp, epsMach, ajnorm, d_1, sum;
    int kmax, i, j, k, minmn;

    /*
    ** epsMach is the machine precision.
    */
    epsMach = FPMachineConst(3);

    /*
    ** Compute the initial column norms and initialize several arrays.
    */
    for (j = 0, pAj = a; j < n; j++, pAj += lda) {
        acnorm[j] = rdiag[j] = wa[j] = EuclidianNorm(m, pAj);
        if (calcPivot)
            ipvt[j] = j;
    }

    /*
    ** Reduce a to r with Householder transformations.
    */
    minmn = min(m, n);
    for (j = 0, pAj = a; j < minmn; j++, pAj += lda) {
        if (calcPivot) {
            /*
            ** Bring the column of largest norm into the pivot position.
            */
            for (k = kmax = j; k < n; k++)
                if (rdiag[k] > rdiag[kmax])
                    kmax = k;
            if (kmax != j) {
                pAkmax = &a[kmax*lda];
                for (i = 0; i < m; i++) {
                    temp = pAj[i];
                    pAj[i] = pAkmax[i];
                    pAkmax[i] = temp;
                }
                rdiag[kmax] = rdiag[j];
                wa[kmax] = wa[j];
                k = ipvt[j];
                ipvt[j] = ipvt[kmax];
                ipvt[kmax] = k;
            }
        }

        /*
        ** Compute the Householder transformation to reduce the
        ** j-th column of a to a multiple of the j-th unit vector.
        */
        ajnorm = EuclidianNorm(m-j, &pAj[j]);
        if (ajnorm != 0.0) {
            if (pAj[j] < 0.0)
                ajnorm = -ajnorm;
            for (i = j; i < m; i++)
                pAj[i] /= ajnorm;
            pAj[j] += 1.0;

            /*
            ** Apply the transformation to the remaining columns
            ** and update the norms.
            */
            for (k = j+1, pAk = &a[k*lda]; k < n; k++, pAk += lda) {
                sum = 0.0;
                for (i = j; i < m; i++)
                    sum += pAj[i] * pAk[i];
                temp = sum / pAj[j];
                for (i = j; i < m; i++)
                    pAk[i] -= temp * pAj[i];
                if (calcPivot && (rdiag[k] != 0.0)) {
                    temp = pAk[j] / rdiag[k];
                    rdiag[k] *= sqrt(max(0.0, 1.0-temp*temp));
                    d_1 = rdiag[k] / wa[k];
                    if ((0.05*(d_1*d_1)) <= epsMach) {
                        rdiag[k] = EuclidianNorm(m-(j+1), &pAk[j+1]);
                        wa[k] = rdiag[k];
                    }
                }
            }
        }
        rdiag[j] = -ajnorm;
    }
} /* QRFactor */

/*
** Calculate the Euclidian norm of an n element vector x.
*/
double
EuclidianNorm(n, x)
int n;
double *x;
{
    double rdwarf = 3.834e-20;
    double rgiant = 1.304e19;
    double t, xabs, x1max, x3max, s1, s2, s3, agiant, floatn;
    int i;

    s1 = s2 = s3 = x1max = x3max = 0.0;
    floatn = (double) n;
    agiant = rgiant / floatn;

    for (i = 0; i < n; i++, x++) {
        xabs = fabs(*x);
        if (xabs > rdwarf && xabs < agiant) {
            /*
            ** Sum for intermediate components.
            */
            s2 += xabs*xabs;
            continue;
        }
        if (xabs > rdwarf) {
            /*
            ** Sum for large components.
            */
            if (xabs > x1max) {
                t = x1max/xabs;
                s1 = 1.0 + s1*t*t;
                x1max = xabs;
                continue;
            }
            t = xabs/x1max;
            s1 += t*t;
            continue;
        }
        /*
        ** Sum for small components.
        */
        if (xabs > x3max) {
            t = x3max/xabs;
            s3 = 1.0 + s3*t*t;
            x3max = xabs;
            continue;
        }
        if (xabs != 0.0) {
            t = xabs/x3max;
            s3 += t*t;
        }
    }

    /*
    ** Calculation of norm.
    */
    if (s1 != 0.0)
        return (x1max * sqrt(s1 + (s2/x1max)/x1max));
    if (s2 == 0.0)
        return (x3max * sqrt(s3));
    if (s2 >= x3max)
        return (sqrt(s2 * (1.0 + (x3max/s2) * (x3max*s3))));
    else
        return (sqrt(x3max * (s2/x3max + x3max*s3)));
} /* EuclidianNorm */

/*
** QRSolve (qrsolv from MINPACK)
**
** Given an m by n matrix a, an n by n diagonal matrix d,
** and an m-vector b, the problem is to determine an x which
** solves the system
**
**        a*x = b ,     d*x = 0 ,
**
** in the least squares sense.
*/
static void
QRSolve(n, r, ldr, ipvt, diag, qtb, x, sigma, wa)
int n;
double *r;
int ldr, *ipvt;
double *diag, *qtb, *x, *sigma, *wa;
{
    double temp, cotan, qtbpj, tangent, cosine, sine, sum, *pRi, *pRj, *pRk;
    int i, j, k, l, nsing;

    for (j = 0, pRj = r; j < n; j++, pRj += ldr) {
        for (i = j, pRi = pRj; i < n; i++, pRi += ldr)
            pRj[i] = pRi[j];
        x[j] = pRj[j];
        wa[j] = qtb[j];
    }

    /*
    ** Eliminate the diagonal matrix d using a Givens rotation.
    */
    for (j = 0, pRj = r; j < n; j++, pRj += ldr) {
        /*
        ** Prepare the row of d to be eliminated, locating the
        ** diagonal element using p from the QR factorization.
        */
        l = ipvt[j];
        if (diag[l] != 0.0) {
            for (k = j; k < n; k++)
                sigma[k] = 0.0;
            sigma[j] = diag[l];

            /*
            ** The transformations to eliminate the row of d
            ** modify only a single element of (q transpose)*b
            ** beyond the first n, which is initially 0.0.
            */
            qtbpj = 0.0;
            for (k = j, pRk = pRj; k < n; k++, pRk += ldr) {
                /*
                ** Determine a Givens rotation which eliminates the
                ** appropriate element in the current row of d.
                */
                if (sigma[k] == 0.0)
                    continue;
                if (fabs(pRk[k]) < fabs(sigma[k])) {
                    cotan = pRk[k]/sigma[k];
                    sine = 0.5/sqrt(0.25 + 0.25*cotan*cotan);
                    cosine = sine*cotan;
                } else {
                    tangent = sigma[k]/pRk[k];
                    cosine = 0.5/sqrt(0.25 + 0.25*tangent*tangent);
                    sine = cosine*tangent;
                }

                /*
                ** Compute the modified diagonal element of r and
                ** the modified element of ((q transpose)*b,0).
                */
                pRk[k] = cosine * pRk[k] + sine * sigma[k];
                temp = cosine * wa[k] + sine * qtbpj;
                qtbpj = -sine * wa[k] + cosine * qtbpj;
                wa[k] = temp;

                /*
                ** Accumulate the transformation in the row of s.
                */
                for (i = k+1; i < n; i++) {
                    temp = cosine * pRk[i] + sine * sigma[i];
                    sigma[i] = -sine * pRk[i] + cosine * sigma[i];
                    pRk[i] = temp;
                }
            }
        }

        /*
        ** Store the diagonal element of s and restore
        ** the corresponding diagonal element of r.
        */
        sigma[j] = pRj[j];
        pRj[j] = x[j];
    }

    /*
    ** Solve the triangular system for z.  If the system is
    ** singular, then obtain a least squares solution.
    */
    nsing = n-1;
    for (j = 0; j < n; j++) {
        if (sigma[j] == 0.0 && nsing == (n-1))
            nsing = j-1;
        if (nsing < (n-1))
            wa[j] = 0.0;
    }
    for (k = 0, j = nsing, pRj = &r[j*ldr]; k <= nsing;
         k++, --j, pRj -= ldr) {
        sum = 0.0;
        for (i = j+1; i <= nsing; i++)
            sum += pRj[i] * wa[i];
        wa[j] = (wa[j] - sum) / sigma[j];
    }

    /*
    ** Permute the components of z back to components of x.
    */
    for (j = 0; j < n; j++) {
        l = ipvt[j];
        x[l] = wa[j];
    }
} /* QRSolve */

/*
** RWUpdt (subroutine RWUPDT from MINPACK)
**
** Given an n by n upper triangular matrix r, this subroutine
** computes the QR decomposition of the matrix formed when a row
** is added to r.  If the row is specified by the vector w, then
** RWUpdt determines an orthogonal matrix q such that when the
** n+1 by n matrix composed of r augmented by w is premultiplied
** by (q transpose), the resulting matrix is upper trapezoidal.
** The matrix (q transpose) is the product of n transformations
**
**        g(n)*g(n-1)* ... *g(1)
**
** where g(i) is a Givens rotation in the (i,n+1) plane which
** eliminates elements in the (n+1)-st plane.  RWUpdt also
** computes the product (q transpose)*c where c is the
** (n+1)-vector (b,alpha).  q itself is not accumulated; rather
** the information to recover the g rotations is supplied.
*/
static void
RWUpdt(n, r, ldr, w, b, alpha, cosine, sine)
int n;
double *r;
int ldr;
double *w, *b, *alpha, *cosine, *sine;
{
    double temp, rowj, cotan, tangent, *pb, *pcos, *psin, *pr, *pw;
    int i, j, jm1;

    for (j = 0, pb = b, pcos = cosine, psin = sine, pr = r, pw = w; j < n;
         j++, pb++, pcos++, psin++, pw++, pr += ldr) {
        double *pcost, *psint, *prt;

        rowj = *pw;

        /*
        ** Apply the previous transformations to
        ** r(i,j), i=1,2,...,j-1, and to w(j).
        */
        for (i = 0, pcost = cosine, psint = sine, prt = pr; i < j;
             i++, pcost++, psint++, prt++) {
            temp = (*pcost) * (*prt) + (*psint) * rowj;
            rowj = -(*psint) * (*prt) + (*pcost) * rowj;
            *prt = temp;
        }

        /*
        ** Determine a Givens rotation which eliminates w(j).
        */
        *pcos = 1.0;
        *psin = 0.0;
        if (rowj != 0.0) {
            if (fabs(pr[j]) < fabs(rowj)) {
                cotan = pr[j] / rowj;
                *psin = 0.5 / sqrt(0.25 + 0.25*cotan*cotan);
                *pcos = *psin * cotan;
            } else {
                tangent = rowj / pr[j];
                *pcos = 0.5 / sqrt(0.25 + 0.25*tangent*tangent);
                *psin = *pcos * tangent;
            }

            /*
            ** Apply the current transformation to r(j,j), b(j), and alpha.
            */
            pr[j] = *pcos * pr[j] + *psin * rowj;
            temp = *pcos * *pb + *psin * *alpha;
            *alpha = -(*psin) * *pb + *pcos * *alpha;
            *pb = temp;
        }
    }
} /* RWUpdt */

/*
** Return machine-dependent floating-point constants.
** NOTE: Choose these carefully.  They may be both hardware
** and compiler-dependent.  Here we assume that they are
** contained in <float.h>.
*/
double
FPMachineConst(what)
int what;
{
    /*
    ** double-precision floating-point machine constants:
    **
    ** b = mantissa base (usually 2), t = number of mantissa bits.
    **
    ** FPMachineConst(0) = b**(emin-1), the smallest positive magnitude.
    ** FPMachineConst(1) = b**emax*(1 - b**(-t)), the largest magnitude.
    ** FPMachineConst(2) = b**(-t), the smallest relative spacing.
    ** FPMachineConst(3) = b**(1-t), the largest relative spacing.
    ** FPMachineConst(4) = log10(b).
    **
    ** References: Fox P.A., Hall A.D., Schryer N.L., Framework for a
    **             portable library, ACM Transactions on Mathematical
    **             Software, vol. 4, no. 2, June 1978, pp. 177-188.
    */
    double value;

    switch (what) {
    case 0:
        value = DBL_MIN;
        break;
    case 1:
        value = DBL_MAX;
        break;
    case 2:
        value = ldexp(1.0, -DBL_MANT_DIG);
        break;
    case 3:
        value = DBL_EPSILON;
        break;
    case 4:
        value = 0.30102999566398;
        break;
    default:
        fprintf(stderr, "%s\n", "FPMachineConst -- argument out of bounds");
        value = quiet_nan();
        break;
    }
    return (value);
} /* FPMachineConst */

static char *LMfitMsgs[] = {
    "Improper input parameter(s).",
    "Both actual and predicted relative reductions in the sum of squares are at\nmost fTol.",
    "Relative error between two consecutive iterations is at most xTol.",
    "Conditions for both fTol and xTol have been met.",
    "The cosine of the angle between fvec and any column of the Jacobian is at\nmost gTol in absolute value.",
    "The maximum number of function evaluations has been exceeded.",
    "fTol is too small. No further reduction in the sum of squares is possible.",
    "xTol is too small. No further improvement in the solution is possible.",
    "gTol is too small. Fvec is orthogonal to the columns of the Jacobian\nto machine precision.",
    "Out of memory.",
    0
};

int
LMPrintMsg(num)
int num;
{
    char *msg;

    if (num < 0)
        msg = "user function returned negative value.";
    else if (num >= 0 && num <= 9)
        msg = LMfitMsgs[num];
    else
        return (num);
    fprintf(stderr, "%s\n%s\n", "LMnlFit --", msg);
    return (num);
}
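/*
** A sketch of a driver for the hypothetical example callbacks defined near
** the top of this file (compiled only when LMNLFIT_EXAMPLE is defined):
** fit the two-parameter exponential model to the made-up sample data and
** report the outcome.  The name ExampleFit and the iteration limits chosen
** here are illustrative, not part of the library.
*/
#ifdef LMNLFIT_EXAMPLE
int
ExampleFit()
{
    double x[EX_N], fvec[EX_M];
    int info, nfev;

    x[0] = 1.0;                         /* initial guess for a */
    x[1] = 0.1;                         /* initial guess for b */
    info = LMnlFit(ExampleInit, ExampleCalc, EX_M, EX_N, x, fvec,
                   200, 10, &nfev);
    fprintf(stderr, "info = %d  nfev = %d  a = %g  b = %g\n",
            info, nfev, x[0], x[1]);
    return (info);                      /* 1-4 indicates convergence */
}
#endif /* LMNLFIT_EXAMPLE */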