1"""Module containing useful scientific and engineering functions.
3- Linear regression
4- Nonlinear regression.
5- Differential equation solvers.
7See http://kitchingroup.cheme.cmu.edu/pycse
9Copyright 2025, John Kitchin
10(see accompanying license files for details).
11"""

# pylint: disable=invalid-name

import warnings
import numpy as np
from scipy.stats.distributions import t
from scipy.optimize import curve_fit
from scipy.integrate import solve_ivp

import numdifftools as nd

# * Linear regression


def polyfit(x, y, deg, alpha=0.05, *args, **kwargs):
    """Least squares polynomial fit with parameter confidence intervals.

    Parameters
    ----------
    x : array_like, shape (M,)
        x-coordinates of the M sample points ``(x[i], y[i])``.
    y : array_like, shape (M,) or (M, K)
        y-coordinates of the sample points. Several data sets of sample
        points sharing the same x-coordinates can be fitted at once by
        passing in a 2D-array that contains one dataset per column.
    deg : int
        Degree of the fitting polynomial.

    *args and **kwargs are passed to regress.
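
    Example
    -------
    A minimal sketch of typical usage (illustrative data; the exact output
    is not shown, so the doctest is skipped):

    >>> import numpy as np
    >>> x = np.array([0.0, 1.0, 2.0, 3.0])
    >>> y = np.array([0.1, 2.9, 9.2, 19.1])
    >>> b, bint, se = polyfit(x, y, 2)  # doctest: +SKIP

    The coefficients in b are ordered from the highest power down to the
    constant term, as in np.vander.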

    Returns
    -------
    [b, bint, se]
        b is a vector of the fitted parameters
        bint is a 2D array of confidence intervals
        se is an array of standard error for each parameter.
    """
    # In vander, the second arg is the number of columns, so we have to add
    # one to the degree, since there are N + 1 columns for a polynomial of
    # order N.
    X = np.vander(x, deg + 1)
    return regress(X, y, alpha, *args, **kwargs)


def polyval(p, x, X, y, alpha=0.05, ub=1e-5, ef=1.05):
    """Evaluate the polynomial p at x with prediction intervals.

    Parameters
    ----------
    p : parameters from pycse.polyfit
    x : array_like, shape (M,)
        x-coordinates to evaluate the polynomial at.
    X : array_like, shape (N,)
        the original x-data that p was fitted from.
    y : array_like, shape (N,)
        the original y-data that p was fitted from.

    alpha : confidence level; alpha=0.05 corresponds to 95% confidence
    ub : upper bound for smallest allowed Hessian eigenvalue
    ef : eigenvalue factor for scaling Hessian
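
    Example
    -------
    A sketch of how this pairs with polyfit (illustrative data; the
    doctest is skipped):

    >>> import numpy as np
    >>> X = np.array([0.0, 1.0, 2.0, 3.0])
    >>> y = np.array([0.2, 1.1, 3.9, 9.3])
    >>> p, pint, se = polyfit(X, y, 2)  # doctest: +SKIP
    >>> xnew = np.linspace(0, 3)
    >>> ynew, yint, pred_se = polyval(p, xnew, X, y)  # doctest: +SKIP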

    Returns
    -------
    y, yint, pred_se
        y : the predicted values
        yint : confidence interval
        pred_se : standard error of the predictions.
    """
    deg = len(p) - 1
    _x = np.vander(x, deg + 1)  # to predict at

    _X = np.vander(X, deg + 1)  # original data

    return predict(_X, y, p, _x, alpha, ub, ef)


def regress(A, y, alpha=0.05, *args, **kwargs):
    r"""Linear least squares regression with confidence intervals.

    Solve the matrix equation \(A p = y\) for p.

    The confidence intervals account for sample size using a student T
    multiplier.

    This code is derived from the descriptions at
    http://www.weibull.com/DOEWeb/confidence_intervals_in_multiple_linear_regression.htm
    and
    http://www.weibull.com/DOEWeb/estimating_regression_models_using_least_squares.htm

    Parameters
    ----------
    A : a matrix of function values in columns, e.g.
        A = np.column_stack([T**0, T**1, T**2, T**3, T**4])

    y : a vector of values you want to fit

    alpha : 100*(1 - alpha) confidence level

    args and kwargs are passed to np.linalg.lstsq

    Example
    -------
    >>> import numpy as np
    >>> x = np.array([0, 1, 2])
    >>> y = np.array([0, 2, 4])
    >>> X = np.column_stack([x**0, x])
    >>> regress(X, y, alpha=None)  # doctest: +SKIP
    (array([ -5.12790050e-16,   2.00000000e+00]), None, None)
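
    With the default alpha, confidence intervals and standard errors are
    also returned (a sketch; exact numbers depend on the data):

    >>> b, bint, se = regress(X, y)  # doctest: +SKIP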

    Returns
    -------
    [b, bint, se]
        b is a vector of the fitted parameters
        bint is an array of confidence intervals. The ith row is for the
        ith parameter.
        se is an array of standard error for each parameter.
    """
    # This is to silence an annoying FutureWarning.
    if "rcond" not in kwargs:
        kwargs["rcond"] = None

    b, _, _, _ = np.linalg.lstsq(A, y, *args, **kwargs)

    bint, se = None, None

    if alpha is not None:
        # compute the confidence intervals
        n = len(y)
        k = len(b)

        errors = y - np.dot(A, b)  # this may have many columns
        sigma2 = np.sum(errors**2, axis=0) / (n - k)  # MSE

        covariance = np.linalg.inv(np.dot(A.T, A))

        # sigma2 is either a number, or (1, noutputs)
        # covariance is npars x npars
        # I need to scale C for each column
        try:
            C = [covariance * s for s in sigma2]
            dC = np.array([np.diag(c) for c in C]).T
        except TypeError:
            C = covariance * sigma2
            dC = np.diag(C)

        # The diagonal on each column is related to the standard error
        if (dC < 0.0).any():
            warnings.warn(
                "\n{0}\nDetected a negative number in your"
                " covariance matrix. Taking the absolute value"
                " of the diagonal. Something is probably wrong"
                " with your data or model.".format(dC)
            )
            dC = np.abs(dC)

        se = np.sqrt(dC)  # standard error

        # Use n - k degrees of freedom: for linear regression with n
        # observations and k parameters, the residual has exactly n - k
        # degrees of freedom.
        sT = t.ppf(1.0 - alpha / 2.0, n - k)  # student T multiplier
        CI = sT * se

        # bint is a little tricky, and depends on the shape of the output.
        bint = np.array([(b - CI, b + CI)]).T.squeeze()

    return (b, bint, se)


def predict(X, y, pars, XX, alpha=0.05, ub=1e-5, ef=1.05):
    """Prediction interval for linear regression.

    Based on the delta method.

    Parameters
    ----------
    X : known x-value array, one row for each y-point
    y : known y-value array
    pars : fitted parameters
    XX : x-value array to make predictions for
    alpha : confidence level; alpha=0.05 corresponds to 95% confidence
    ub : upper bound for smallest allowed Hessian eigenvalue
    ef : eigenvalue factor for scaling Hessian

    See https://en.wikipedia.org/wiki/Prediction_interval#Unknown_mean,_unknown_variance
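
    Example
    -------
    A sketch of how this pairs with regress (illustrative data; the
    doctest is skipped):

    >>> import numpy as np
    >>> x = np.array([0.0, 1.0, 2.0, 3.0])
    >>> X = np.column_stack([x**0, x])
    >>> y = np.array([0.1, 1.9, 4.2, 5.8])
    >>> b, bint, se = regress(X, y)  # doctest: +SKIP
    >>> xnew = np.array([0.5, 1.5, 2.5])
    >>> XX = np.column_stack([xnew**0, xnew])
    >>> yy, yint, pred_se = predict(X, y, b, XX)  # doctest: +SKIP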

    Returns
    -------
    y, yint, pred_se
        y : the predicted values
        yint : confidence interval
        pred_se : std error on predictions.
    """
    n = len(X)
    npars = len(pars)
    dof = n - npars

    errs = y - X @ pars

    sse = np.sum(errs**2, axis=0)

    # Use the unbiased variance estimator with dof = n - npars degrees of
    # freedom; mse estimates the noise variance sigma^2.
    mse = sse / dof

    gprime = XX

    # Even though np.linalg.lstsq minimizes the SSE (with Hessian H = 2X'X),
    # the Fisher information is I = H / (2 sigma^2) = X'X / sigma^2, so the
    # factor of 2 cancels and Cov(b) = sigma^2 (X'X)^-1. This matches the
    # covariance used in regress().
    hat = X.T @ X
    eps = max(ub, ef * np.linalg.eigvals(hat).min())

    # Parameter covariance matrix
    I_fisher = np.linalg.pinv(hat + np.eye(npars) * eps)

    # Compute the parameter uncertainty and the total prediction uncertainty
    # separately:
    # parameter uncertainty: SE(x'b) = sqrt(sigma^2 * x'(X'X)^-1 x)
    # total prediction uncertainty: SE(yhat - y_new) = sqrt(sigma^2 + SE(x'b)^2)

    try:
        # This happens if mse is iterable
        param_se = np.sqrt([_mse * np.diag(gprime @ I_fisher @ gprime.T) for _mse in mse]).T
        # Total prediction SE: sqrt(noise_variance + parameter_variance)
        total_se = np.sqrt([_mse + _param_se**2 for _mse, _param_se in zip(mse, param_se.T)]).T
    except TypeError:
        # This happens if mse is a single number.
        # You need at least 1d to get a diagonal. This line is needed because
        # there is a case with one prediction where this product is a scalar
        # quantity, and we need to upgrade it to 1d to avoid an error.
        gig = np.atleast_1d(gprime @ I_fisher @ gprime.T)
        param_se = np.sqrt(mse * np.diag(gig)).T
        # Total prediction SE includes both noise and parameter uncertainty
        total_se = np.sqrt(mse + param_se**2)

    tval = t.ppf(1.0 - alpha / 2.0, dof)

    yy = XX @ pars

    # Prediction intervals using the total uncertainty
    yint = np.array(
        [
            yy - tval * total_se,
            yy + tval * total_se,
        ]
    )

    return (yy, yint, total_se)


# * Nonlinear regression


def nlinfit(model, x, y, p0, alpha=0.05, **kwargs):
    r"""Nonlinear regression with confidence intervals.

    Parameters
    ----------
    model : function f(x, p0, p1, ...) = y
    x : array of the independent data
    y : array of the dependent data
    p0 : array of the initial guess of the parameters
    alpha : 100*(1 - alpha) is the confidence interval
        i.e. alpha = 0.05 is 95% confidence

    kwargs are passed to curve_fit.

    Example
    -------
    Fit a line \(y = mx + b\) to some data.

    >>> import numpy as np
    >>> def f(x, m, b):
    ...     return m * x + b
    ...
    >>> X = np.array([0, 1, 2])
    >>> y = np.array([0, 2, 4])
    >>> nlinfit(f, X, y, [0, 1])  # doctest: +SKIP
    (array([  2.00000000e+00,  -2.18062024e-12]),
     array([[  2.00000000e+00,   2.00000000e+00],
            [ -2.18315458e-12,  -2.17808591e-12]]),
     array([  1.21903752e-12,   1.99456367e-16]))

    Returns
    -------
    [p, pint, SE]
        p is an array of the fitted parameters
        pint is an array of confidence intervals
        SE is an array of standard errors for the parameters.
    """
    pars, pcov = curve_fit(model, x, y, p0=p0, **kwargs)
    n = len(y)  # number of data points
    k = len(pars)  # number of parameters

    dof = max(0, n - k)  # number of degrees of freedom

    # student-t value for the dof and confidence level
    tval = t.ppf(1.0 - alpha / 2.0, dof)

    SE = []
    pint = []
    for par, var in zip(pars, np.diag(pcov)):
        sigma = var**0.5
        SE.append(sigma)
        pint.append([par - sigma * tval, par + sigma * tval])

    return (pars, np.array(pint), np.array(SE))


def nlpredict(X, y, model, popt, xnew, loss=None, alpha=0.05, ub=1e-5, ef=1.05):
    """Prediction error for a nonlinear fit.

    Parameters
    ----------
    X : array-like
        Independent variable data used for fitting
    y : array-like
        Dependent variable data used for fitting
    model : callable
        Model function with signature model(x, ...)
    popt : array-like
        Optimized parameters from fitting (e.g., from curve_fit or nlinfit)
    xnew : array-like
        x-values to predict at
    loss : callable, optional
        Loss function that was minimized during fitting, with signature
        loss(*params). If None (default), assumes scipy.optimize.curve_fit
        was used and constructs the corresponding loss function:
        loss = 0.5 * sum((y - model(X, *p))**2). This is the ½SSE convention
        used by scipy's least_squares optimizer. If you used a different
        optimizer or loss function, provide it explicitly.
    alpha : float, optional
        Confidence level (default: 0.05 for 95% confidence intervals)
    ub : float, optional
        Upper bound for smallest allowed Hessian eigenvalue (default: 1e-5)
    ef : float, optional
        Eigenvalue factor for scaling Hessian (default: 1.05)

    This function uses numdifftools for the Hessian and Jacobian.

    Returns
    -------
    y : array
        Predicted values at xnew
    yint : array
        Prediction intervals at alpha confidence level, shape (n, 2)
    se : array
        Standard error of predictions

    Notes
    -----
    The default loss function (½SSE) matches the convention used by
    scipy.optimize.curve_fit, which internally minimizes
    0.5 * sum(residuals**2). If you provide a custom loss function, ensure
    it uses the same convention as your fitting procedure.
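
    Example
    -------
    A sketch with an exponential model (illustrative values; the doctest
    is skipped):

    >>> import numpy as np
    >>> def model(x, a, b):
    ...     return a * np.exp(b * x)
    ...
    >>> X = np.linspace(0, 1, 10)
    >>> y = model(X, 2.0, -1.0)
    >>> popt, pint, se = nlinfit(model, X, y, [1.0, 1.0])  # doctest: +SKIP
    >>> xnew = np.linspace(0, 1.2, 5)
    >>> ypred, yint, pred_se = nlpredict(X, y, model, popt, xnew)  # doctest: +SKIP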
367 """
368 # If no loss function provided, assume curve_fit was used (½SSE convention)
369 if loss is None:
371 def loss(*p):
372 return 0.5 * np.sum((y - model(X, *p)) ** 2)
374 ypred = model(xnew, *popt)
376 hessp = nd.Hessian(lambda p: loss(*p))(popt)
377 # for making the Hessian better conditioned.
378 eps = max(ub, ef * np.linalg.eigvals(hessp).min())
380 sse = loss(*popt)
381 n = len(y)
382 p = len(popt)
383 mse = sse / (n - p) # Use unbiased estimator
384 I_fisher = np.linalg.pinv(hessp + np.eye(len(popt)) * eps)
386 gprime = nd.Jacobian(lambda p: model(xnew, *p))(popt)
388 sigmas = np.sqrt(mse * np.diag(gprime @ I_fisher @ gprime.T))
389 tval = t.ppf(1 - alpha / 2, len(y) - len(popt))
391 return [
392 ypred,
393 np.array(
394 [
395 # https://online.stat.psu.edu/stat501/lesson/7/7.2
396 ypred - tval * (sigmas**2 + mse) ** 0.5, # lower bound
397 ypred + tval * (sigmas**2 + mse) ** 0.5, # upper bound
398 ]
399 ).T,
400 sigmas,
401 ]


def Rsquared(y, Y):
    """Return R^2, or coefficient of determination.

    Parameters
    ----------
    y : a 1d array of observations.
    Y : a 1d array of predictions from a model.

    Returns
    -------
    The R^2 value for the fit.
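
    Example
    -------
    A worked example: here the residual sum of squares is 1 and the total
    sum of squares is 8, so R^2 = 1 - 1/8 = 0.875.

    >>> import numpy as np
    >>> y = np.array([0.0, 2.0, 4.0])
    >>> Y = np.array([0.0, 1.0, 4.0])
    >>> print(Rsquared(y, Y))
    0.875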
413 """
414 errs = y - Y
415 SS_res = np.sum(errs**2)
416 SS_tot = np.sum((y - np.mean(y)) ** 2)
417 return 1 - SS_res / SS_tot


def bic(x, y, model, popt):
    """Compute the Bayesian information criterion (BIC).

    Parameters
    ----------
    x : array, known x-values
    y : array, known y-values
    model : function(x, ...) returns prediction for y
    popt : optimal parameters

    Returns
    -------
    BIC : float

    See https://en.wikipedia.org/wiki/Bayesian_information_criterion#Gaussian_special_case
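
    Example
    -------
    A sketch of scoring a fitted model (illustrative data; the doctest is
    skipped). When comparing models fitted to the same data, the one with
    the lower BIC is preferred.

    >>> import numpy as np
    >>> x = np.linspace(0, 1, 20)
    >>> y = 3 * x**2 + 0.1 * np.random.default_rng(42).normal(size=20)
    >>> def quadratic(x, a, b, c):
    ...     return a * x**2 + b * x + c
    ...
    >>> p, pint, se = nlinfit(quadratic, x, y, [1, 1, 1])  # doctest: +SKIP
    >>> bic(x, y, quadratic, p)  # doctest: +SKIP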
434 """
435 n = len(y)
436 k = len(popt)
437 rss = np.sum((model(x, *popt) - y) ** 2)
438 bic = n * np.log(rss / n) + k * np.log(n)
439 return bic


def lbic(X, y, popt):
    """Compute the Bayesian information criterion for a linear model.

    Parameters
    ----------
    X : array of input variables in column form
    y : known y-values
    popt : fitted parameters

    Returns
    -------
    BIC : float
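
    Example
    -------
    A sketch pairing lbic with regress (illustrative data; the doctest is
    skipped):

    >>> import numpy as np
    >>> x = np.array([0.0, 1.0, 2.0, 3.0])
    >>> X = np.column_stack([x**0, x])
    >>> y = np.array([0.1, 2.1, 3.9, 6.2])
    >>> b, bint, se = regress(X, y)  # doctest: +SKIP
    >>> lbic(X, y, b)  # doctest: +SKIP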
454 """
455 n = len(y)
456 k = len(popt)
457 rss = np.sum((X @ popt - y) ** 2)
458 bic = n * np.log(rss / n) + k * np.log(n)
459 return bic


# * ivp
def ivp(f, tspan, y0, *args, **kwargs):
    """Solve an ODE initial value problem.

    This provides some convenience defaults that I think are better than
    those of solve_ivp.

    Parameters
    ----------
    f : function
        callable y'(x, y) = f(x, y)

    tspan : array
        The x points you want the solution at. The first and last points
        are used as the integration span in solve_ivp.

    y0 : array
        Initial conditions

    *args :
        arbitrary positional arguments to pass to solve_ivp

    **kwargs :
        arbitrary keyword arguments to pass to solve_ivp.
        max_step defaults to the minimum spacing of tspan, dense_output
        defaults to True, and t_eval defaults to the array in tspan.

    Returns
    -------
    solution from solve_ivp
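
    Example
    -------
    A sketch for exponential decay, y' = -y with y(0) = 1 (the doctest is
    skipped because the exact floats vary):

    >>> import numpy as np
    >>> tspan = np.linspace(0, 2, 10)
    >>> sol = ivp(lambda t, y: -y, tspan, [1.0])  # doctest: +SKIP
    >>> sol.y[0]  # solution values at the points in tspan  # doctest: +SKIP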
492 """
493 t0, tf = tspan[0], tspan[-1]
495 # make the max_step the smallest step in tspan, or what is in kwargs.
496 if "max_step" not in kwargs:
497 kwargs["max_step"] = min(np.diff(tspan))
499 if "dense_output" not in kwargs:
500 kwargs["dense_output"] = True
502 if "t_eval" not in kwargs:
503 kwargs["t_eval"] = tspan
505 sol = solve_ivp(f, (t0, tf), y0, *args, **kwargs)
507 if sol.status != 0:
508 print(sol.message)
510 return sol


# * End


if __name__ == "__main__":
    import doctest

    doctest.testmod()