1"""Module containing useful scientific and engineering functions. 

2 

3- Linear regression 

4- Nonlinear regression. 

5- Differential equation solvers. 

6 

7See http://kitchingroup.cheme.cmu.edu/pycse 

8 

9Copyright 2025, John Kitchin 

10(see accompanying license files for details). 

11""" 

# pylint: disable=invalid-name

import warnings
import numpy as np
from scipy.stats.distributions import t
from scipy.optimize import curve_fit
from scipy.integrate import solve_ivp

import numdifftools as nd

# * Linear regression


def polyfit(x, y, deg, alpha=0.05, *args, **kwargs):
    """Least squares polynomial fit with parameter confidence intervals.

    Parameters
    ----------
    x : array_like, shape (M,)
        x-coordinates of the M sample points ``(x[i], y[i])``.
    y : array_like, shape (M,) or (M, K)
        y-coordinates of the sample points. Several data sets of sample
        points sharing the same x-coordinates can be fitted at once by
        passing in a 2D-array that contains one dataset per column.
    deg : int
        Degree of the fitting polynomial.

    *args and **kwargs are passed to regress.
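
    Example
    -------
    A minimal sketch with assumed, exactly linear data; np.allclose guards
    against platform-dependent rounding. Coefficients are returned highest
    power first, as in np.polyfit.

    >>> import numpy as np
    >>> x = np.array([0.0, 1.0, 2.0, 3.0])
    >>> y = 2 * x + 1
    >>> b, bint, se = polyfit(x, y, 1)
    >>> np.allclose(b, [2.0, 1.0])
    True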

    Returns
    -------
    [b, bint, se]
        b is a vector of the fitted parameters
        bint is a 2D array of confidence intervals
        se is an array of standard errors for each parameter.
    """
    # In np.vander the second argument is the number of columns, so we pass
    # deg + 1: a polynomial of degree N has N + 1 coefficients.
    X = np.vander(x, deg + 1)
    return regress(X, y, alpha, *args, **kwargs)


def polyval(p, x, X, y, alpha=0.05, ub=1e-5, ef=1.05):
    """Evaluate the polynomial p at x with prediction intervals.

    Parameters
    ----------
    p : array_like
        Parameters from pycse.polyfit.
    x : array_like, shape (M,)
        x-coordinates to evaluate the polynomial at.
    X : array_like, shape (N,)
        The original x-data that p was fitted from.
    y : array_like, shape (N,)
        The original y-data that p was fitted from.

    alpha : significance level; alpha = 0.05 gives 95% intervals
    ub : floor on the regularization added to the Hessian diagonal; see predict
    ef : multiplier on the smallest Hessian eigenvalue used to set the
        regularization; see predict
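
    Example
    -------
    A minimal sketch with assumed, exactly linear data, so the prediction
    should recover the true line:

    >>> import numpy as np
    >>> X = np.array([0.0, 1.0, 2.0, 3.0])
    >>> Y = 2 * X + 1
    >>> p, pint, se = polyfit(X, Y, 1)
    >>> y, yint, pred_se = polyval(p, np.array([1.5]), X, Y)
    >>> np.allclose(y, [4.0])
    True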

    Returns
    -------
    y, yint, pred_se
        y : the predicted values
        yint : the prediction interval
        pred_se : standard error of the predictions.
    """
    deg = len(p) - 1
    _x = np.vander(x, deg + 1)  # points to predict at

    _X = np.vander(X, deg + 1)  # the original data

    return predict(_X, y, p, _x, alpha, ub, ef)


def regress(A, y, alpha=0.05, *args, **kwargs):
    r"""Linear least squares regression with confidence intervals.

    Solve the matrix equation \(A p = y\) for p.

    The confidence intervals account for sample size using a Student's t
    multiplier.

    This code is derived from the descriptions at
    http://www.weibull.com/DOEWeb/confidence_intervals_in_multiple_linear_regression.htm
    and
    http://www.weibull.com/DOEWeb/estimating_regression_models_using_least_squares.htm

    Parameters
    ----------
    A : a matrix of function values in columns, e.g.
        A = np.column_stack([T**0, T**1, T**2, T**3, T**4])

    y : a vector of values you want to fit

    alpha : significance level; the confidence level is 100*(1 - alpha)%

    args and kwargs are passed to np.linalg.lstsq

    Example
    -------
    With alpha=None the confidence intervals are skipped and bint and se are
    returned as None. The exact coefficients are platform-dependent, so the
    doctest is skipped.

    >>> import numpy as np
    >>> x = np.array([0, 1, 2])
    >>> y = np.array([0, 2, 4])
    >>> X = np.column_stack([x**0, x])
    >>> regress(X, y, alpha=None)  # doctest: +SKIP
    (array([ -5.12790050e-16, 2.00000000e+00]), None, None)

    Returns
    -------
    [b, bint, se]
        b is a vector of the fitted parameters
        bint is an array of confidence intervals. The ith row is for the ith
        parameter.
        se is an array of standard errors for each parameter.

    """
    # This is to silence an annoying FutureWarning.
    if "rcond" not in kwargs:
        kwargs["rcond"] = None

    b, _, _, _ = np.linalg.lstsq(A, y, *args, **kwargs)

    bint, se = None, None

    if alpha is not None:
        # compute the confidence intervals
        n = len(y)
        k = len(b)

        errors = y - np.dot(A, b)  # this may have many columns
        sigma2 = np.sum(errors**2, axis=0) / (n - k)  # residual variance (MSE)

        covariance = np.linalg.inv(np.dot(A.T, A))

        # sigma2 is either a number, or shape (1, noutputs);
        # covariance is npars x npars, and we need to scale it for each column.
        try:
            C = [covariance * s for s in sigma2]
            dC = np.array([np.diag(c) for c in C]).T
        except TypeError:
            C = covariance * sigma2
            dC = np.diag(C)

        # The diagonal of each column is related to the standard error

        if (dC < 0.0).any():
            warnings.warn(
                "\n{0}\nDetected a negative number in your"
                " covariance matrix. Taking the absolute value"
                " of the diagonal. Something is probably wrong"
                " with your data or model.".format(dC)
            )
            dC = np.abs(dC)

        se = np.sqrt(dC)  # standard error

        # A linear model with k parameters fitted to n observations leaves
        # exactly n - k residual degrees of freedom.
        sT = t.ppf(1.0 - alpha / 2.0, n - k)  # Student's t multiplier
        CI = sT * se

        # bint is a little tricky, and depends on the shape of the output.
        # Squeeze here so that alpha=None can return bint=None unchanged.
        bint = np.array([(b - CI, b + CI)]).T.squeeze()

    return (b, bint, se)


def predict(X, y, pars, XX, alpha=0.05, ub=1e-5, ef=1.05):
    """Prediction interval for linear regression.

    Based on the delta method.

    Parameters
    ----------
    X : known x-value array, one row for each y-point
    y : known y-value array
    pars : fitted parameters
    XX : x-value array to make predictions for
    alpha : significance level; alpha = 0.05 gives 95% intervals
    ub : floor on the regularization added to the Hessian diagonal,
        eps = max(ub, ef * smallest eigenvalue)
    ef : multiplier on the smallest Hessian eigenvalue used to set the
        regularization

    See https://en.wikipedia.org/wiki/Prediction_interval#Unknown_mean,_unknown_variance
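
    Example
    -------
    A minimal sketch with assumed, slightly noisy line data; the prediction
    interval should bracket the prediction:

    >>> import numpy as np
    >>> x = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
    >>> X = np.column_stack([x**0, x])
    >>> y = np.array([0.1, 0.9, 2.1, 2.9, 4.1])
    >>> b, bint, se = regress(X, y)
    >>> XX = np.column_stack([np.ones(2), np.array([1.0, 2.0])])
    >>> yy, yint, pred_se = predict(X, y, b, XX)
    >>> bool((yint[0] < yy).all() and (yy < yint[1]).all())
    True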

    Returns
    -------
    y, yint, pred_se
        y : the predicted values
        yint : the prediction interval
        pred_se : standard error of the predictions.
    """
    n = len(X)
    npars = len(pars)
    dof = n - npars

    errs = y - X @ pars

    sse = np.sum(errs**2, axis=0)

    # Unbiased variance estimator: a model with npars parameters fitted to n
    # observations leaves dof = n - npars residual degrees of freedom.
    # mse estimates σ², the noise variance.
    mse = sse / dof

    gprime = XX

    # Although np.linalg.lstsq minimizes SSE (Hessian H = 2X'X), the Fisher
    # information is I = H / (2σ²) = X'X / σ², so the factor of 2 cancels and
    # Cov(β) = I⁻¹ = σ² × (X'X)⁻¹. This matches the covariance computed in
    # regress().
    hat = X.T @ X
    eps = max(ub, ef * np.linalg.eigvals(hat).min())

    # Parameter covariance matrix (regularized for conditioning)
    I_fisher = np.linalg.pinv(hat + np.eye(npars) * eps)

    # Parameter uncertainty: SE(x'β) = sqrt(σ² × x'(X'X)⁻¹x).
    # Total prediction uncertainty adds the noise variance:
    # SE(ŷ - y_new) = sqrt(σ² + SE(x'β)²).
    try:
        # This happens if mse is iterable (one entry per column of y)
        param_se = np.sqrt([_mse * np.diag(gprime @ I_fisher @ gprime.T) for _mse in mse]).T
        # Total prediction SE: sqrt(noise variance + parameter variance)
        total_se = np.sqrt([_mse + _param_se**2 for _mse, _param_se in zip(mse, param_se.T)]).T
    except TypeError:
        # This happens if mse is a single number. You need at least 1d to get
        # a diagonal: with a single prediction the product below is a scalar,
        # so we upgrade it to 1d to avoid an error.
        gig = np.atleast_1d(gprime @ I_fisher @ gprime.T)
        param_se = np.sqrt(mse * np.diag(gig)).T
        # Total prediction SE includes both noise and parameter uncertainty
        total_se = np.sqrt(mse + param_se**2)

    tval = t.ppf(1.0 - alpha / 2.0, dof)

    yy = XX @ pars

    # Prediction intervals using the total uncertainty
    yint = np.array(
        [
            yy - tval * total_se,
            yy + tval * total_se,
        ]
    )

    return (yy, yint, total_se)


# * Nonlinear regression


def nlinfit(model, x, y, p0, alpha=0.05, **kwargs):
    r"""Nonlinear regression with confidence intervals.

    Parameters
    ----------
    model : function f(x, p0, p1, ...) = y
    x : array of the independent data
    y : array of the dependent data
    p0 : array of the initial guess of the parameters
    alpha : significance level; 100*(1 - alpha)% is the confidence level,
        i.e. alpha = 0.05 gives 95% confidence

    kwargs are passed to curve_fit.

    Example
    -------
    Fit a line \(y = mx + b\) to some data. The tiny intercept and interval
    values are platform-dependent rounding artifacts, so the doctest is
    skipped.

    >>> import numpy as np
    >>> def f(x, m, b):
    ...     return m * x + b
    ...
    >>> X = np.array([0, 1, 2])
    >>> y = np.array([0, 2, 4])
    >>> nlinfit(f, X, y, [0, 1])  # doctest: +SKIP
    (array([ 2.00000000e+00, -2.18062024e-12]),
     array([[ 2.00000000e+00, 2.00000000e+00],
            [ -2.18315458e-12, -2.17808591e-12]]),
     array([ 1.21903752e-12, 1.99456367e-16]))

    Returns
    -------
    [p, pint, SE]
        p is an array of the fitted parameters
        pint is an array of confidence intervals
        SE is an array of standard errors for the parameters.

    """
    pars, pcov = curve_fit(model, x, y, p0=p0, **kwargs)
    n = len(y)  # number of data points
    k = len(pars)  # number of parameters

    dof = max(0, n - k)  # number of degrees of freedom

    # Student's t value for the dof and confidence level
    tval = t.ppf(1.0 - alpha / 2.0, dof)

    SE = []
    pint = []
    for par, var in zip(pars, np.diag(pcov)):
        sigma = var**0.5
        SE.append(sigma)
        pint.append([par - sigma * tval, par + sigma * tval])

    return (pars, np.array(pint), np.array(SE))


def nlpredict(X, y, model, popt, xnew, loss=None, alpha=0.05, ub=1e-5, ef=1.05):
    """Prediction error for a nonlinear fit.

    Parameters
    ----------
    X : array-like
        Independent variable data used for fitting
    y : array-like
        Dependent variable data used for fitting
    model : callable
        Model function with signature model(x, ...)
    popt : array-like
        Optimized parameters from fitting (e.g., from curve_fit or nlinfit)
    xnew : array-like
        x-values to predict at
    loss : callable, optional
        Loss function that was minimized during fitting, with signature
        loss(*params). If None (default), assumes scipy.optimize.curve_fit
        was used and automatically constructs the corresponding loss
        function: loss = 0.5 * sum((y - model(X, *p))**2). This is the ½SSE
        convention used by scipy's least_squares optimizer. If you used a
        different optimizer or loss function, provide it explicitly.
    alpha : float, optional
        Significance level (default: 0.05 for 95% confidence intervals)
    ub : float, optional
        Floor on the regularization added to the Hessian diagonal
        (default: 1e-5)
    ef : float, optional
        Multiplier on the smallest Hessian eigenvalue used to set the
        regularization (default: 1.05)

    This function uses numdifftools for the Hessian and Jacobian.
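
    Example
    -------
    A minimal sketch with an exponential model and assumed, deterministic
    "noise" (so the doctest is reproducible); only shapes are checked:

    >>> import numpy as np
    >>> def f(x, a, b):
    ...     return a * np.exp(-b * x)
    ...
    >>> x = np.linspace(0, 2, 8)
    >>> y = f(x, 2.0, 1.0) + 0.01 * np.cos(7 * x)
    >>> p, pint, se = nlinfit(f, x, y, [1.0, 1.0])
    >>> ypred, yint, s = nlpredict(x, y, f, p, np.array([0.5, 1.5]))
    >>> ypred.shape, yint.shape
    ((2,), (2, 2))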

    Returns
    -------
    y : array
        Predicted values at xnew
    yint : array
        Prediction intervals at alpha confidence level, shape (n, 2)
    se : array
        Standard error of predictions

    Notes
    -----
    The default loss function (½SSE) matches the convention used by
    scipy.optimize.curve_fit, which internally minimizes
    0.5 * sum(residuals**2). If you provide a custom loss function, ensure
    it uses the same convention as your fitting procedure.
    """

    # If no loss function was provided, assume curve_fit was used (½SSE
    # convention).
    if loss is None:

        def loss(*p):
            return 0.5 * np.sum((y - model(X, *p)) ** 2)

    ypred = model(xnew, *popt)

    hessp = nd.Hessian(lambda p: loss(*p))(popt)
    # for making the Hessian better conditioned.
    eps = max(ub, ef * np.linalg.eigvals(hessp).min())

    # Under the ½SSE convention, SSE = 2 * loss(*popt), so the unbiased
    # noise-variance estimate is σ² = 2 * loss / (n - p). The same factor of
    # 2 makes mse * H⁻¹ the parameter covariance: for loss = c * SSE the
    # Hessian is H ≈ 2c J'J (J the model Jacobian), and
    # 2 * loss / (n - p) * H⁻¹ = σ² (J'J)⁻¹ for any c, matching predict().
    sse = 2.0 * loss(*popt)
    n = len(y)
    p = len(popt)
    mse = sse / (n - p)  # unbiased estimate of the noise variance σ²
    I_fisher = np.linalg.pinv(hessp + np.eye(p) * eps)

    gprime = nd.Jacobian(lambda p: model(xnew, *p))(popt)

    sigmas = np.sqrt(mse * np.diag(gprime @ I_fisher @ gprime.T))
    tval = t.ppf(1 - alpha / 2, n - p)

    return [
        ypred,
        np.array(
            [
                # https://online.stat.psu.edu/stat501/lesson/7/7.2
                ypred - tval * (sigmas**2 + mse) ** 0.5,  # lower bound
                ypred + tval * (sigmas**2 + mse) ** 0.5,  # upper bound
            ]
        ).T,
        sigmas,
    ]


def Rsquared(y, Y):
    """Return R^2, or coefficient of determination.

    y is a 1d array of observations.
    Y is a 1d array of predictions from a model.
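
    Example
    -------
    A minimal sketch: a perfect fit has R^2 = 1 (float() avoids
    numpy-version-dependent scalar reprs in the doctest):

    >>> import numpy as np
    >>> y = np.array([1.0, 2.0, 3.0])
    >>> float(Rsquared(y, y))
    1.0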

    Returns
    -------
    The R^2 value for the fit.
    """
    errs = y - Y
    SS_res = np.sum(errs**2)
    SS_tot = np.sum((y - np.mean(y)) ** 2)
    return 1 - SS_res / SS_tot


def bic(x, y, model, popt):
    """Compute the Bayesian information criterion (BIC).

    Parameters
    ----------
    x : array of the independent data
    y : array, known y-values
    model : function(x, ...) that returns the prediction for y
    popt : optimal parameters
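
    Example
    -------
    A minimal sketch comparing two parameter sets for the same model with
    assumed data; the better fit has the lower BIC:

    >>> import numpy as np
    >>> def line(x, m, b):
    ...     return m * x + b
    ...
    >>> x = np.array([0.0, 1.0, 2.0, 3.0])
    >>> y = np.array([0.0, 1.1, 1.9, 3.0])
    >>> bool(bic(x, y, line, [1.0, 0.0]) < bic(x, y, line, [5.0, 0.0]))
    True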

    Returns
    -------
    BIC : float

    https://en.wikipedia.org/wiki/Bayesian_information_criterion#Gaussian_special_case
    """
    n = len(y)
    k = len(popt)
    rss = np.sum((model(x, *popt) - y) ** 2)
    return n * np.log(rss / n) + k * np.log(n)


def lbic(X, y, popt):
    """Compute the Bayesian information criterion for a linear model.

    Parameters
    ----------
    X : array of input variables in column form
    y : known y values
    popt : fitted parameters
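
    Example
    -------
    A minimal sketch with assumed data; the fitted linear model yields a
    finite BIC:

    >>> import numpy as np
    >>> x = np.array([0.0, 1.0, 2.0, 3.0])
    >>> X = np.column_stack([x**0, x])
    >>> y = np.array([0.0, 1.1, 1.9, 3.0])
    >>> b, bint, se = regress(X, y)
    >>> bool(np.isfinite(lbic(X, y, b)))
    True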

    Returns
    -------
    BIC : float
    """
    n = len(y)
    k = len(popt)
    rss = np.sum((X @ popt - y) ** 2)
    return n * np.log(rss / n) + k * np.log(n)


# * ivp
def ivp(f, tspan, y0, *args, **kwargs):
    """Solve an ODE initial value problem.

    This provides some convenience defaults that I think are better than
    those of solve_ivp.

    Parameters
    ----------
    f : function
        callable y'(t, y) = f(t, y)

    tspan : array
        The t points you want the solution at. The first and last points
        define the integration interval passed to solve_ivp.

    y0 : array
        Initial conditions

    *args : arbitrary positional arguments to pass to solve_ivp

    **kwargs : arbitrary keyword arguments to pass to solve_ivp.
        max_step defaults to the smallest spacing in tspan, dense_output
        defaults to True, and t_eval defaults to tspan.
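
    Example
    -------
    A minimal sketch integrating y' = -y, y(0) = 1, whose exact solution is
    exp(-t); a loose tolerance absorbs solver error:

    >>> import numpy as np
    >>> tspan = np.linspace(0, 1, 5)
    >>> sol = ivp(lambda t, y: -y, tspan, [1.0])
    >>> np.allclose(sol.y[0], np.exp(-tspan), atol=1e-2)
    True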

    Returns
    -------
    solution from solve_ivp

    """
    t0, tf = tspan[0], tspan[-1]

    # make max_step the smallest step in tspan, unless one was provided in kwargs
    if "max_step" not in kwargs:
        kwargs["max_step"] = min(np.diff(tspan))

    if "dense_output" not in kwargs:
        kwargs["dense_output"] = True

    if "t_eval" not in kwargs:
        kwargs["t_eval"] = tspan

    sol = solve_ivp(f, (t0, tf), y0, *args, **kwargs)

    if sol.status != 0:
        print(sol.message)

    return sol


# * End


if __name__ == "__main__":
    import doctest

    doctest.testmod()