22. Implementing a Decision Tree with Information Entropy
2025-09-20
### The Essence of a Decision Tree

- Loss function -- the total information entropy.
  Since every sample should end up in a definite class, the tree tries to drive the total entropy as low as possible.
- "Gradient" -- the information gain, i.e. how much the entropy drops after a split (see the sketch after this list).
- The decision tree itself -- the path traced by this "gradient descent".
- A non-parametric model.
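To make the analogy concrete, here is a minimal sketch of how information gain measures the drop in total entropy caused by a split. The helper names `class_entropy` and `information_gain` and the toy labels are my own additions, not part of the original post:

```python
import numpy as np

def class_entropy(y):
    """Entropy of a label array, summed over all classes."""
    _, counts = np.unique(y, return_counts=True)
    p = counts / len(y)
    return float(-(p * np.log2(p)).sum())

def information_gain(y_parent, y_left, y_right):
    """How much the (weighted) entropy drops after splitting y_parent."""
    n = len(y_parent)
    weighted_children = (len(y_left) / n) * class_entropy(y_left) \
                      + (len(y_right) / n) * class_entropy(y_right)
    return class_entropy(y_parent) - weighted_children

# toy example: a perfectly clean split gains a full bit of information
y_parent = np.array([0, 0, 0, 1, 1, 1])
print(information_gain(y_parent, y_parent[:3], y_parent[3:]))  # 1.0
print(information_gain(y_parent, y_parent[:2], y_parent[2:]))  # smaller, one child is still mixed
```

The split with the largest information gain (equivalently, the smallest weighted child entropy) plays the role of the steepest "descent direction" at each node.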
### Binary Classification Entropy
```python
import numpy as np
from matplotlib import pyplot as plt

# Information entropy for a binary classification problem.
# The entropy defined here is over the whole sample set.
def entropy(p):
    return -p * np.log2(p) - (1 - p) * np.log2(1 - p)

plot_x = np.linspace(0.001, 0.999, 100)
plt.plot(plot_x, entropy(plot_x))
plt.show()
```

As the plot shows, the uncertainty is highest at p = 0.5.
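A quick numeric check of the curve (my own addition, reusing the `entropy` function defined above):

```python
# entropy is symmetric around p = 0.5 and reaches its maximum of 1 bit there
for p in (0.1, 0.3, 0.5, 0.7, 0.9):
    print(f"p = {p:.1f} -> entropy = {entropy(p):.3f}")
```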
```python
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data[:, :2]   # keep only the first two features so the data can be plotted
y = iris.target
X[:5]
```

Output:

```
array([[5.1, 3.5],
       [4.9, 3. ],
       [4.7, 3.2],
       [4.6, 3.1],
       [5. , 3.6]])
```

```python
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.show()
```
```python
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=3, criterion='entropy')
clf.fit(X, y)
```

Output:

```
DecisionTreeClassifier(criterion='entropy', max_depth=3)
```

```python
clf.score(X, y)
```

Output:

```
0.8066666666666666
```
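Besides plotting, the fitted tree can also be inspected as indented text with sklearn's `export_text`; this call is my own addition and is not part of the original notebook:

```python
from sklearn.tree import export_text

# print the learned splits in a readable, indented form
print(export_text(clf, feature_names=iris.feature_names[:2]))
```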
```python
from sklearn.tree import plot_tree

plot_tree(clf)
```

Output:

```
[Text(0.5, 0.875, 'x[0] <= 5.55\nentropy = 1.585\nsamples = 150\nvalue = [50, 50, 50]'),
 Text(0.25, 0.625, 'x[1] <= 2.8\nentropy = 0.813\nsamples = 59\nvalue = [47, 11, 1]'),
 Text(0.125, 0.375, 'x[0] <= 4.95\nentropy = 0.817\nsamples = 12\nvalue = [1, 10, 1]'),
 Text(0.0625, 0.125, 'entropy = 1.585\nsamples = 3\nvalue = [1, 1, 1]'),
 Text(0.1875, 0.125, 'entropy = 0.0\nsamples = 9\nvalue = [0, 9, 0]'),
 Text(0.375, 0.375, 'x[1] <= 3.05\nentropy = 0.149\nsamples = 47\nvalue = [46, 1, 0]'),
 Text(0.3125, 0.125, 'entropy = 0.544\nsamples = 8\nvalue = [7, 1, 0]'),
 Text(0.4375, 0.125, 'entropy = 0.0\nsamples = 39\nvalue = [39, 0, 0]'),
 Text(0.75, 0.625, 'x[1] <= 3.7\nentropy = 1.167\nsamples = 91\nvalue = [3, 39, 49]'),
 Text(0.625, 0.375, 'x[0] <= 6.25\nentropy = 0.994\nsamples = 86\nvalue = [0, 39, 47]'),
 Text(0.5625, 0.125, 'entropy = 0.909\nsamples = 37\nvalue = [0, 25, 12]'),
 Text(0.6875, 0.125, 'entropy = 0.863\nsamples = 49\nvalue = [0, 14, 35]'),
 Text(0.875, 0.375, 'x[0] <= 6.75\nentropy = 0.971\nsamples = 5\nvalue = [3, 0, 2]'),
 Text(0.8125, 0.125, 'entropy = 0.0\nsamples = 3\nvalue = [3, 0, 0]'),
 Text(0.9375, 0.125, 'entropy = 0.0\nsamples = 2\nvalue = [0, 0, 2]')]
```
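The returned list of `Text` objects above is just matplotlib's annotation data; the figure itself is easier to read when the nodes are labeled and colored. A possible variant (the extra arguments are my own choice, all standard `plot_tree` parameters):

```python
fig, ax = plt.subplots(figsize=(12, 6))
plot_tree(clf,
          feature_names=iris.feature_names[:2],  # label splits with the real feature names
          class_names=iris.target_names,         # label leaves with the species names
          filled=True,                           # color nodes by their majority class
          ax=ax)
plt.show()
```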
### Finding the Best Split
```python
from collections import Counter

Counter(y)
```

Output:

```
Counter({0: 50, 1: 50, 2: 50})
```

```python
# Total entropy of a label array: sum of -p*log2(p) over all classes.
def calc_entropy(y):
    counter = Counter(y)
    sum_entropy = 0
    for num in counter:
        p = counter[num] / len(y)
        # sum_entropy += entropy(p)  # not used: the entropy(p) above only covers the binary case
        sum_entropy += (-p * np.log2(p))
    return sum_entropy

calc_entropy(y)
```

Output:

```
1.584962500721156
```
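Two sanity checks on `calc_entropy` (my own additions): three balanced classes give log2(3) ≈ 1.585, matching the value above, and for binary labels the result agrees with the `entropy(p)` curve plotted earlier.

```python
print(np.isclose(calc_entropy(y), np.log2(3)))        # True: 50/50/50 classes -> log2(3)
y_bin = np.array([0] * 30 + [1] * 70)
print(np.isclose(calc_entropy(y_bin), entropy(0.3)))  # True: two classes reduce to entropy(p)
```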
```python
# Split the samples on feature `dim` at threshold `value`.
def split_dataset(x, y, dim, value):
    index_left = x[:, dim] <= value
    index_right = x[:, dim] > value
    return x[index_left], y[index_left], x[index_right], y[index_right]
```
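A quick usage check of `split_dataset` (the threshold 5.5 is picked by hand here, purely for illustration):

```python
x_l, y_l, x_r, y_r = split_dataset(X, y, dim=0, value=5.5)
print(len(y_l), len(y_r))           # sizes of the two subsets, summing to 150
print(Counter(y_l), Counter(y_r))   # class composition on each side of the split
```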
```python
def find_best_split(X, y):
    best_dim = -1
    best_val = -1
    best_entropy = np.inf
    best_entropy_left, best_entropy_right = -1, -1
    # iterate over the feature columns
    for dim in range(X.shape[1]):
        sorted_idx = np.argsort(X[:, dim])
        # iterate over the rows, i.e. over all candidate split points;
        # the range stops one short because index i+1 is used below
        for i in range(X.shape[0] - 1):
            # take the current value and the next value along this feature
            value_left, value_right = X[sorted_idx[i], dim], X[sorted_idx[i + 1], dim]
            val = (value_left + value_right) / 2
            x_left, y_left, x_right, y_right = split_dataset(X, y, dim, val)
            # weighted entropy of the two children
            entropy_left, entropy_right = calc_entropy(y_left), calc_entropy(y_right)
            entropy = entropy_left * len(y_left) / len(y) + entropy_right * len(y_right) / len(y)
            # keep the best split found so far
            if entropy < best_entropy:
                best_dim = dim
                best_val = val
                best_entropy = entropy
                best_entropy_left, best_entropy_right = entropy_left, entropy_right
    return best_dim, best_val, best_entropy, best_entropy_left, best_entropy_right
```

```python
find_best_split(X, y)
# the left and right subsets can then be split again by calling find_best_split on each of them
```

Output:

```
(0, 5.5, 1.0277298129142294, 0.8128223064150747, 1.167065448996099)
```
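Following that closing remark, one possible way to grow a whole tree is to call `find_best_split` recursively on each side of the split; this `build_tree` sketch is my own addition, built only on the functions defined above:

```python
def build_tree(X, y, depth=0, max_depth=3):
    """Recursively grow a tree: internal nodes are dicts, leaves are Counters."""
    # stop when the node is pure or the depth limit is reached
    if calc_entropy(y) == 0 or depth == max_depth:
        return Counter(y)
    dim, val, ent, ent_left, ent_right = find_best_split(X, y)
    x_left, y_left, x_right, y_right = split_dataset(X, y, dim, val)
    return {
        "dim": dim,                                                  # feature index used for the split
        "val": val,                                                  # threshold value
        "left": build_tree(x_left, y_left, depth + 1, max_depth),    # samples with X[:, dim] <= val
        "right": build_tree(x_right, y_right, depth + 1, max_depth), # samples with X[:, dim] > val
    }

tree = build_tree(X, y, max_depth=3)
```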
