1"""Leaf Models in sklearn.
3This model is based on a DecisionTreeRegressor. When you train the model, it
4first uses a DecisionTreeRegressor to divide the data set into leaves, then fits
5your model to each leaf. You can request uncertainty that is computed from the
6model fitted on each leaf.
8This is not as rigorous as linear decision tree models, but it is conceptually
9simple. I consider it a proof of concept model.
11Example:
13import numpy as np
14from sklearn.pipeline import Pipeline
15from sklearn.preprocessing import PolynomialFeatures
16from sklearn.preprocessing import StandardScaler
17from sklearn.linear_model import LinearRegression, BayesianRidge
18from pycse.sklearn.leaf_model import LeafModelRegressor
20R = 8.314
21k0, Ea = 6.79049544e+06, 4.02891385e+04
23T = np.linspace(300, 600, 40)
24k = k0 * np.exp(-Ea / R / T)
26pipe = Pipeline([('scaler', StandardScaler()),
27 ('poly', PolynomialFeatures(degree=2)),
28 ('Br', BayesianRidge())])
30lt = LeafModelRegressor(leaf_model=pipe, min_samples_leaf=5)
31lt.fit(T[:, None], k);
33f = np.linspace(200, 700)
34pf, se = lt.predict(f[:, None], return_std=True)
36import matplotlib.pyplot as plt
37plt.plot(T, k, '.')
38plt.plot(f, pf.squeeze());
39plt.plot(f, pf.squeeze() + se, f, pf.squeeze() - se);
41"""
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn import clone
class LeafModelRegressor(DecisionTreeRegressor):
    """An sklearn Leaf Model class."""

    def __init__(self, leaf_model, **kwargs):
        """Initialize a LeafModelRegressor.

        LEAF_MODEL is an sklearn estimator that is fitted to the data in each
        leaf. Remaining KWARGS are passed to DecisionTreeRegressor.
        """
        self.leaf_model = leaf_model
        super().__init__(**kwargs)
    def fit(self, X, y):
        """Fit the model.

        First we fit the decision tree. Then we fit the leaf_model to the
        points in each leaf of the tree, so each leaf gets its own model.

        This is not an optimal fit; I assume the decision tree would make
        different splits if it used leaf-model errors instead of average
        errors. It works pretty well though.
        """
        # Store the training data; it is needed to fit the leaf models and
        # may be useful later for UQ.
        self.xtrain = X
        self.ytrain = y
        super().fit(X, y)
        # Now train the leaf models
        leaves = self.apply(self.xtrain)
        self.leaf_models = {}

        for leaf in set(leaves):
            # Get the x, y points that fall in this leaf
            _X = self.xtrain[leaves == leaf]
            _y = self.ytrain[leaves == leaf]

            # Fit a fresh clone of leaf_model on this leaf's data
            self.leaf_models[leaf] = clone(self.leaf_model)
            self.leaf_models[leaf].fit(_X, _y)

        return self
    def predict(self, X, return_std=False):
        """Predict values for X.

        If RETURN_STD is truthy, try to get a standard error from the leaf
        models.
        """
        # Find the leaf that each row of X belongs to
        pleaves = self.apply(X)

        predictions = np.zeros(X.shape[0])
        errors = np.zeros(X.shape[0])

        for leaf in set(pleaves):
            model = self.leaf_models[leaf]

            ind = pleaves == leaf
            try:
                py, pse = model.predict(X[ind], return_std=return_std)
            # If the leaf model does not accept return_std we get a
            # TypeError; if return_std is falsy there is no tuple to unpack
            # and we get a ValueError. Either way, fall back to plain
            # predictions with NaN standard errors.
            except (ValueError, TypeError):
                py = model.predict(X[ind])
                pse = np.full(py.shape, np.nan)
            predictions[ind] = py
            errors[ind] = pse

        if return_std:
            return predictions, errors
        else:
            return predictions
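

if __name__ == "__main__":
    # A minimal smoke-test sketch (an illustrative addition, not part of the
    # original module). It reuses the Arrhenius example from the module
    # docstring, and assumes numpy and scikit-learn are installed.
    from sklearn.linear_model import BayesianRidge
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import PolynomialFeatures, StandardScaler

    R = 8.314
    k0, Ea = 6.79049544e+06, 4.02891385e+04

    T = np.linspace(300, 600, 40)
    k = k0 * np.exp(-Ea / R / T)

    pipe = Pipeline([('scaler', StandardScaler()),
                     ('poly', PolynomialFeatures(degree=2)),
                     ('Br', BayesianRidge())])

    lt = LeafModelRegressor(leaf_model=pipe, min_samples_leaf=5)
    lt.fit(T[:, None], k)

    # Predict beyond the training range; BayesianRidge supports return_std,
    # so the per-leaf standard errors should be finite.
    f = np.linspace(200, 700)
    pf, se = lt.predict(f[:, None], return_std=True)
    print(pf.shape, se.shape)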