A walk through the family of decision-tree models.
note: What does the `feature_importance` implemented in virtually every library actually represent? It represents how much information gain was obtained by splitting on that feature.
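For a binary split, the information gain is the parent's impurity minus the sample-weighted impurities of the two children; this is exactly what the `IG` function in the next cell computes:

$$IG(D_p) = I(D_p) - \frac{N_{left}}{N_p} I(D_{left}) - \frac{N_{right}}{N_p} I(D_{right})$$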
#collapse-show
import numpy as np

# Information gain for a standard binary split: parent impurity minus
# the sample-weighted impurities of the two children
def IG(impurity_func, target_p, target_left, target_right):
    N_left = len(target_left)
    N_right = len(target_right)
    N_p = len(target_p)
    I_p = impurity_func(target_p)
    I_left = impurity_func(target_left)
    I_right = impurity_func(target_right)
    return I_p - N_left / N_p * I_left - N_right / N_p * I_right

# Twoing criterion: rewards splits that send each class
# predominantly to one side
def IG_twoing(target_left, target_right):
    N_left = len(target_left)
    N_right = len(target_right)
    N_p = N_left + N_right
    classes = np.unique(np.hstack([target_right, target_left]))
    I = 0
    for i in classes:
        I += np.abs(len(target_left[target_left == i]) / N_left
                    - len(target_right[target_right == i]) / N_right)
    I = N_left * N_right / N_p**2 * I**2
    return I
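As implemented, the twoing criterion scores a split as

$$I_{twoing} = \frac{N_{left}\,N_{right}}{N_p^2} \left( \sum_i \bigl| p(i \mid left) - p(i \mid right) \bigr| \right)^2$$

which is largest when each class ends up mostly on one side. (Breiman et al.'s original definition carries an extra constant factor of $1/4$; it rescales every score equally and does not change which split is chosen.)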
#collapse-show
# Gini impurity
def gini(target):
    N = len(target)
    _, counts = np.unique(target, return_counts=True)
    p = counts / N
    return 1 - np.sum(p**2)
#collapse-show
# Entropy
def entropy(target):
    N = len(target)
    _, counts = np.unique(target, return_counts=True)
    p = counts / N
    return -np.sum(p * np.log2(p))
#collapse-show
# Classification error
def error(target):
    N = len(target)
    _, counts = np.unique(target, return_counts=True)
    p = counts / N
    return 1 - np.max(p)
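With $p_i$ denoting the proportion of class $i$ within a node, the three criteria implemented above are

$$I_{gini} = 1 - \sum_i p_i^2, \qquad I_{entropy} = -\sum_i p_i \log_2 p_i, \qquad I_{error} = 1 - \max_i p_i$$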
#collapse-show
# Parent node: 40 samples of class 0 and 40 of class 1
target_p = np.hstack([np.zeros(40), np.ones(40)])
# Candidate split A: (class 0, class 1) counts of (30, 10) on the left, (10, 30) on the right
target_la = np.hstack([np.zeros(30), np.ones(10)])
target_ra = np.hstack([np.zeros(10), np.ones(30)])
# Candidate split B: (20, 40) on the left, (20, 0) on the right
target_lb = np.hstack([np.zeros(20), np.ones(40)])
target_rb = np.hstack([np.zeros(20), np.ones(0)])
#collapse-show
IG_a = IG(gini,target_p,target_la,target_ra)
IG_b = IG(gini,target_p,target_lb,target_rb)
print("A: {}, B: {}".format(IG_a,IG_b))
#collapse-show
IG_a = IG(entropy,target_p,target_la,target_ra)
IG_b = IG(entropy,target_p,target_lb,target_rb)
print("A: {}, B: {}".format(IG_a,IG_b))
#collapse-show
IG_a = IG(error,target_p,target_la,target_ra)
IG_b = IG(error,target_p,target_lb,target_rb)
print("A: {}, B: {}".format(IG_a,IG_b))
#collapse-show
IG_a = IG_twoing(target_la,target_ra)
IG_b = IG_twoing(target_lb,target_rb)
print("A: {}, B: {}".format(IG_a,IG_b))
#collapse-show
# Mean squared error: the impurity used for regression trees
# (the variance of the target values within a node)
def mse(target):
    return np.mean((target - np.mean(target))**2)
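The rest of this section deals with classification, but `mse` is the impurity you would pass to `IG` to grow a regression tree: a good split is one that reduces the variance of the target within each child. A minimal sanity check on made-up numbers:

#collapse-show
# Splitting a toy regression target between its two clusters removes almost all variance
y = np.array([1.0, 1.1, 0.9, 5.0, 5.2, 4.8])
print(IG(mse, y, y[:3], y[3:]))  # 4.0: nearly all of the parent variance (~4.02)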
#collapse-show
# Exhaustively search every (feature, threshold) pair for the split
# that maximizes information gain
def search_best_split(impurity_func, data, target):
    features = data.shape[1]
    best_thrs = None
    best_f = None
    IG_max = 0
    for feat_idx in range(features):
        values = data[:, feat_idx]
        for val in values:  # try every observed value as a candidate threshold
            target_left = target[values < val]
            target_right = target[values >= val]
            ig = IG(impurity_func, target, target_left, target_right)
            if IG_max < ig:  # keep the best information gain seen so far
                IG_max = ig
                best_thrs = val
                best_f = feat_idx
    return IG_max, best_thrs, best_f
#collapse-show
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
iris = load_iris()
features_name = iris.feature_names
data = iris.data
target = iris.target
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=0)
#collapse-show
IG_max, best_thrs, best_f = search_best_split(gini, X_train, y_train)
print('Information gain: {}, Best threshold: {}, Best feature: {}'.format(IG_max, best_thrs, features_name[best_f]))
`left.method_name` calls `method_name` on the Node instance stored in `left`; inside that method, the child Nodes produced by the split are in turn stored in that node's own `left` and `right`, their `method_name` is called next, and so on recursively. Without a limit this recursion never terminates, so as stopping criteria we use: the tree reaches `max_depth`, or the left and right child nodes have impurity 0 (each contains only a single class).
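Schematically, the recursion looks like this (an illustrative toy only; the real `Node` class with the actual splitting logic follows in the next cell):

#collapse-show
# Toy skeleton of the recursive splitting pattern described above (illustrative only)
class SketchNode:
    def __init__(self, depth, max_depth):
        self.left = None
        self.right = None
        self.depth = depth
        self.max_depth = max_depth

    def method_name(self):
        if self.depth == self.max_depth:  # stopping rule: depth limit reached
            return
        # store the two child nodes, then recurse into each of them
        self.left = SketchNode(self.depth + 1, self.max_depth)
        self.right = SketchNode(self.depth + 1, self.max_depth)
        self.left.method_name()
        self.right.method_name()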
#collapse-show
# Global record of the information gain accumulated by each feature
feature_importances = {}

class Node:
    def __init__(self, data, target, max_depth, impurity):
        self.left = None
        self.right = None
        self.max_depth = max_depth
        self.depth = None
        self.data = data
        self.target = target
        self.threshold = None
        self.feature = None
        self.IG_max = None
        # majority label within the node (i.e. the node's prediction)
        self.label = np.argmax(np.bincount(target))
        self.impurity = impurity
        if self.impurity == 'gini':
            self.impurity_func = gini
        elif self.impurity == 'entropy':
            self.impurity_func = entropy
        elif self.impurity == 'error':
            self.impurity_func = error

    # Recursive splitting
    def split(self, depth):
        self.depth = depth
        self.IG_max, self.threshold, self.feature = search_best_split(self.impurity_func, self.data, self.target)
        print('Depth: {}, Sep at Feature: {}, Threshold: {}, Label: {}'.format(self.depth, self.feature, self.threshold, self.label))
        # Stop growing (pre-pruning) to avoid infinite recursion: the depth limit
        # is reached, no split yields any gain, or the gain equals the node's own
        # impurity (meaning both children would already be pure)
        if self.depth == self.max_depth or self.IG_max == 0 or self.IG_max == self.impurity_func(self.target):
            return
        # Record each feature's importance in the global dict,
        # counting only splits that are actually performed
        global feature_importances
        if self.feature not in feature_importances:
            feature_importances[self.feature] = self.IG_max
        else:
            feature_importances[self.feature] += self.IG_max
        # Partition the data on the chosen feature and threshold
        # (samples with feature value >= threshold go to the left child)
        idx_left = self.data[:, self.feature] >= self.threshold
        idx_right = self.data[:, self.feature] < self.threshold
        # Store the two child nodes and split them recursively
        self.left = Node(self.data[idx_left], self.target[idx_left], self.max_depth, self.impurity)
        self.right = Node(self.data[idx_right], self.target[idx_right], self.max_depth, self.impurity)
        self.left.split(self.depth + 1)
        self.right.split(self.depth + 1)

    # Route a sample down the branches until it reaches a leaf and
    # return that leaf's majority label as the prediction
    def predict(self, data):
        if self.left is None:  # leaf: splitting stopped at this node
            return self.label
        # mirror the training-time rule: feature value >= threshold goes left
        if data[self.feature] >= self.threshold:
            return self.left.predict(data)
        else:
            return self.right.predict(data)
#collapse-show
class DecisionTreeClassifier:
    def __init__(self, max_depth, impurity):
        self.max_depth = max_depth
        self.impurity = impurity
        self.tree = None

    # Grow the tree by splitting repeatedly from the root node
    def fit(self, data, target):
        initial_depth = 0
        self.tree = Node(data, target, self.max_depth, self.impurity)
        self.tree.split(initial_depth)

    # Predict labels for the data with the grown tree
    def predict(self, data):
        pred = []
        for s in data:
            pred.append(self.tree.predict(s))
        return np.array(pred)
#collapse-show
clf = DecisionTreeClassifier(3, 'gini')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score = sum(y_pred == y_test)/float(len(y_test))
print('Classification accuracy: {}'.format(score))
#collapse-show
import matplotlib.pyplot as plt
%matplotlib inline
keys = [features_name[i] for i in feature_importances.keys()]
values = list(feature_importances.values())
plt.figure(figsize=(10,7))
plt.title('Feature importances', fontsize=15)
plt.bar(keys, values)
plt.show()
#collapse-show
from sklearn.tree import DecisionTreeClassifier as DecisionTreeClassifier2
clf2 = DecisionTreeClassifier2(max_depth=3)
clf2.fit(X_train,y_train)
y_pred = clf2.predict(X_test)
score = sum(y_pred == y_test)/float(len(y_test))
print('Classification accuracy: {}'.format(score))
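For comparison, sklearn exposes the same idea through the fitted tree's `feature_importances_` attribute; note that sklearn additionally weights each split's impurity decrease by the fraction of samples reaching that node and normalizes the result to sum to 1, so the scale differs from the hand-rolled `feature_importances` dictionary above:

#collapse-show
# sklearn's impurity-based importances, normalized to sum to 1
for name, imp in zip(features_name, clf2.feature_importances_):
    print('{}: {:.3f}'.format(name, imp))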
#collapse-show
import graphviz
from sklearn.tree import export_graphviz
dot_data = export_graphviz(
    clf2,
    class_names=iris.target_names,
    feature_names=features_name,
    filled=True,
    rounded=True,
    out_file=None
)
graph = graphviz.Source(dot_data)
graph.render("iris-tree", format="png")
#collapse-show
from IPython.display import Image, display_png
display_png(Image('iris-tree.png'))