Coverage for src/pycse/sklearn/leaf_model.py: 0.00%

36 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-23 16:23 -0400

1"""Leaf Models in sklearn. 

2 

3This model is based on a DecisionTreeRegressor. When you train the model, it 

4first uses a DecisionTreeRegressor to divide the data set into leaves, then fits 

5your model to each leaf. You can request uncertainty that is computed from the 

6model fitted on each leaf. 

7 

8This is not as rigorous as linear decision tree models, but it is conceptually 

9simple. I consider it a proof of concept model. 

10 

11Example: 

12 

13import numpy as np 

14from sklearn.pipeline import Pipeline 

15from sklearn.preprocessing import PolynomialFeatures 

16from sklearn.preprocessing import StandardScaler 

17from sklearn.linear_model import LinearRegression, BayesianRidge 

18from pycse.sklearn.leaf_model import LeafModelRegressor 

19 

20R = 8.314 

21k0, Ea = 6.79049544e+06, 4.02891385e+04 

22 

23T = np.linspace(300, 600, 40) 

24k = k0 * np.exp(-Ea / R / T) 

25 

26pipe = Pipeline([('scaler', StandardScaler()), 

27 ('poly', PolynomialFeatures(degree=2)), 

28 ('Br', BayesianRidge())]) 

29 

30lt = LeafModelRegressor(leaf_model=pipe, min_samples_leaf=5) 

31lt.fit(T[:, None], k); 

32 

33f = np.linspace(200, 700) 

34pf, se = lt.predict(f[:, None], return_std=True) 

35 

36import matplotlib.pyplot as plt 

37plt.plot(T, k, '.') 

38plt.plot(f, pf.squeeze()); 

39plt.plot(f, pf.squeeze() + se, f, pf.squeeze() - se); 

40 

41""" 

42 

43import numpy as np 

44from sklearn.tree import DecisionTreeRegressor 

45from sklearn import clone 

46 

47 

class LeafModelRegressor(DecisionTreeRegressor):
    """A DecisionTreeRegressor with an independent model fitted on each leaf.

    The tree first partitions the training data into leaves; a clone of
    ``leaf_model`` is then fitted on each leaf's samples. Predictions are
    delegated to the per-leaf models, which can optionally provide a
    standard-error estimate via ``return_std``.
    """

    def __init__(self, leaf_model, **kwargs):
        """Initialize a LeafModelRegressor.

        Parameters
        ----------
        leaf_model : sklearn estimator
            Unfitted estimator; one clone is fitted per leaf.
        **kwargs :
            Passed through to DecisionTreeRegressor (e.g. min_samples_leaf).
        """
        self.leaf_model = leaf_model
        super().__init__(**kwargs)

    def fit(self, X, y):
        """Fit the decision tree, then fit a leaf_model clone on each leaf.

        This is not an optimal fit: the tree chooses splits from average
        errors, not from the leaf models' errors. It works pretty well
        in practice, though.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self
        """
        # Keep the training data around; it may be useful for UQ later.
        self.xtrain = X
        self.ytrain = y
        super().fit(X, y)

        # Assign each training point to its leaf, then fit one cloned
        # model per leaf on that leaf's subset of the data.
        leaves = self.apply(self.xtrain)
        self.leaf_models = {}
        for leaf in np.unique(leaves):
            mask = leaves == leaf
            model = clone(self.leaf_model)
            model.fit(self.xtrain[mask], self.ytrain[mask])
            self.leaf_models[leaf] = model

        return self

    def predict(self, X, return_std=False):
        """Predict values for X using the per-leaf models.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        return_std : bool, default False
            If truthy, also return a standard-error estimate from each
            leaf model (NaN for models that do not support ``return_std``).

        Returns
        -------
        predictions : ndarray of shape (n_samples,)
        errors : ndarray of shape (n_samples,), only if return_std is truthy.
        """
        # Route each query point to its leaf.
        pleaves = self.apply(X)

        predictions = np.zeros(X.shape[0])
        # NaN marks "no uncertainty available" for a leaf's model.
        errors = np.full(X.shape[0], np.nan)

        for leaf in np.unique(pleaves):
            model = self.leaf_models[leaf]
            mask = pleaves == leaf

            # BUGFIX: the previous code always did
            #   py, pse = model.predict(X[mask], return_std=return_std)
            # With return_std=False most estimators return a single array,
            # so the unpack raised ValueError (caught, wasteful) — except
            # when a leaf held exactly two query points, in which case the
            # length-2 array unpacked "successfully" into two scalars and
            # silently wrote wrong values. Only pass return_std when it is
            # actually requested.
            if return_std:
                try:
                    py, pse = model.predict(X[mask], return_std=True)
                    errors[mask] = pse
                except TypeError:
                    # Model's predict() does not accept return_std;
                    # leave NaN in errors for these points.
                    py = model.predict(X[mask])
            else:
                py = model.predict(X[mask])

            predictions[mask] = py

        if return_std:
            return predictions, errors
        return predictions