Bagging, Boosting, Stacking: Ensemble Learning Code
2022-08-08 20:54:00 【yddcs】
Bootstrap aggregating, usually called simply bagging, is a very popular technique for building ensembles of predictors. It increases prediction accuracy while also reducing variance and helping to avoid overfitting. It works by generating multiple versions of a predictor and aggregating their outputs into a single prediction for unseen input data.
The bagging algorithm works as follows:
Using the bootstrap method, randomly draw n training samples from the original dataset; repeat for k rounds to obtain k training sets (the k training sets are independent of each other, and elements may repeat within a set).
Train one model on each of the k training sets, k models in total (the model type can be chosen for the task at hand: decision trees, kNN, and so on).
For classification, the final result is produced by majority vote; for regression, the mean of the k models' predictions is used as the final prediction (all models carry equal weight).
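A minimal sketch of these three steps on a toy dataset (the decision-tree base learner and all names here are illustrative, not from the original):
import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
X, y = make_classification(n_samples=200, n_features=4, random_state=0)
k = 10  # number of bootstrap rounds
rng = np.random.default_rng(0)
preds = []
for _ in range(k):
    idx = rng.integers(0, len(X), size=len(X))  # step 1: draw n samples with replacement
    model = DecisionTreeClassifier().fit(X[idx], y[idx])  # step 2: one model per training set
    preds.append(model.predict(X))
# step 3: majority vote over the k models (classification case)
y_vote = np.apply_along_axis(lambda c: np.bincount(c).argmax(), 0, np.vstack(preds))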
The boosting algorithm works as follows:
Assign a weight w_i to each sample in the training set. The key point is that samples misclassified in one round receive larger weights in the next round (misclassified samples get their weights increased).
At the same time, weak classifiers with a small classification error get larger vote weights so they play a bigger role in the final decision, while weak classifiers with a larger error rate get smaller vote weights and play a smaller role. Each iteration yields one weak classifier, and some strategy is needed to combine them into the final model (AdaBoost assigns each weak classifier a weight after its iteration and takes their linear combination as the final classifier; the smaller a classifier's error, the larger its weight).
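Concretely, the weight update that the boosting kernel code at the end of this post implements looks like this (a self-contained sketch with toy values; D is the sample-weight vector, y the true labels in {-1, +1}, h one weak learner's predictions):
import numpy as np
y = np.array([1, 1, -1, -1, 1])  # true labels in {-1, +1}
h = np.array([1, -1, -1, -1, 1])  # weak learner's predictions (one mistake)
D = np.full(5, 1 / 5)  # uniform initial sample weights
error = D[y != h].sum()  # weighted error of the weak learner
alpha = 0.5 * np.log((1.0 - error) / max(error, 1e-16))  # vote weight; small error -> large alpha
D = D * np.exp(-alpha * y * h)  # grow weights where h was wrong, shrink where it was right
D = D / D.sum()  # renormalize so the weights sum to 1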
A simple ensemble-learning implementation with sklearn:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import f1_score, classification_report, recall_score, precision_score, accuracy_score, confusion_matrix
from sklearn.datasets import make_classification
# X, y = make_classification(n_samples=100, n_features=4,
#                            n_informative=2, n_redundant=0,
#                            random_state=0, shuffle=False)
data_train = np.loadtxt(open('traindata1.csv', encoding='gb18030', errors="ignore"), delimiter=",", skiprows=0)
data_test = np.loadtxt(open('testdata1.csv', encoding='gb18030', errors="ignore"), delimiter=",", skiprows=0)
X_train, y_train = data_train[:, :-1], data_train[:, -1]
X_test, y_test = data_test[:, :-1], data_test[:, -1]
# Alternative ensembles (uncomment one to try it):
# clf = BaggingClassifier(base_estimator=SVC(), n_estimators=200, random_state=0)
# clf = RandomForestClassifier(n_estimators=50, n_jobs=2)  # n_estimators defaults to 100
tree = DecisionTreeClassifier(criterion='entropy', max_depth=None)
# n_estimators=500: bag 500 decision trees
clf = BaggingClassifier(base_estimator=tree, n_estimators=500, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, n_jobs=1, random_state=1)
# clf = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
# clf = AdaBoostClassifier(n_estimators=100)
# clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Search for the best tree count by 10-fold cross-validation:
# best_ntree = []
# for i in range(1, 200):
#     rf = RandomForestClassifier(n_estimators=i + 1, n_jobs=-1)
#     rf_cv = cross_val_score(rf, X_train, y_train, cv=10).mean()
#     best_ntree.append(rf_cv)
# print(max(best_ntree), np.argmax(best_ntree) + 1)
# 0.9782692307692308 106
# Plot CV accuracy against tree count (pairs with the commented-out search above):
# plt.figure(figsize=[20, 5])
# plt.plot(range(1, 200), best_ntree)
# plt.show()
print("acc:", accuracy_score(y_test, y_pred))
print("precision", precision_score(y_test, y_pred, average='macro'))
print("recall", recall_score(y_test, y_pred, average='micro'))
print("F1", f1_score(y_test, y_pred, average='macro'))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
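Because each bootstrap draw leaves out roughly a third of the training samples, BaggingClassifier can also report a free validation estimate on those out-of-bag samples; a small sketch reusing the tree base learner above:
clf_oob = BaggingClassifier(base_estimator=tree, n_estimators=500, bootstrap=True, oob_score=True, random_state=1)
clf_oob.fit(X_train, y_train)
print("OOB accuracy:", clf_oob.oob_score_)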
For more details and the Snapshot_Ensemble implementation, see GitHub.
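Before the hand-rolled stacking below: sklearn also ships a ready-made stacker. A minimal sketch on the same data (the base-learner choices here are mine):
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
estimators = [('rf', RandomForestClassifier(n_estimators=50)), ('svc', SVC())]
stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=5)
stack.fit(X_train, y_train)
print("stacking acc:", stack.score(X_test, y_test))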
Stacking kernel code:
# DeepGP, test, test2, members and args come from the author's repo (see the GitHub link above)
stack_train = np.zeros((X_train.shape[0], num_classes * len(members)), dtype=np.float32)  # n_train x (num_classes * n_members)
stack_test = np.zeros((X_test.shape[0], num_classes * len(members)), dtype=np.float32)    # n_test x (num_classes * n_members)
n_folds = 5
skf = KFold(n_splits=n_folds, shuffle=True, random_state=1)
# Each base learner is trained once per fold, i.e. n_folds times
for j, clf in enumerate(members):
    print('Training classifier [%s]' % j)
    for i, (train_index, cv_index) in enumerate(skf.split(X_train, y_train)):
        print('Fold [%s], member [%s]' % (i, j))
        tr_x = X_train[train_index]
        tr_y = y_train[train_index]
        deepgp = DeepGP(tr_x, tr_y, num_classes=10)
        deepgp.train(args.num_epochs, args.num_iters, args.batch_size, args.learning_rate)
        # (for a plain sklearn member you would call clf.fit(tr_x, tr_y) instead)
        # Out-of-fold predictions fill this member's column block of the training meta-features
        stack_train[cv_index, j * num_classes:(j + 1) * num_classes] = test(X_train[cv_index], y_train[cv_index], deepgp)
        # Test predictions are accumulated across folds and averaged below
        stack_test[:, j * num_classes:(j + 1) * num_classes] += test2(X_test, y_test, deepgp)
stack_test = stack_test / float(n_folds)
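The snippet stops at building the meta-features; the usual next step is to fit a second-level learner on them. A sketch, assuming a LogisticRegression meta-learner (my choice, not part of the original code):
from sklearn.linear_model import LogisticRegression
meta = LogisticRegression(max_iter=1000)  # second-level (meta) model
meta.fit(stack_train, y_train)  # trained on the out-of-fold features
print("stacking acc:", meta.score(stack_test, y_test))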
Bagging kernel code:
# Bagging with DeepGP base learners; loadDataSet and rand_train (bootstrap resampling)
# come from the author's repo (see the GitHub link above)
def bagging_by_DGP(dataMat, labelMat, fileName, t=30):  # train t base learners (default 30)
    test_data, test_label = loadDataSet(fileName)  # load the test samples and labels
    predict_list = []
    accuracy, precisi, recal, f1 = [], [], [], []
    for i in range(t):
        train_data, train_label = rand_train(dataMat, labelMat)  # bootstrap sample of the training set
        tr_x = torch.from_numpy(train_data)
        tr_y = torch.from_numpy(train_label)
        deepgp = DeepGP(tr_x, tr_y, num_classes=13)
        # arguments: epochs, iterations, batch size, learning rate
        deepgp.train(60, 128, 512, 0.0001)
        y_pred, a, p, r, f = test(test_data, test_label, deepgp, t)
        # print('pred', y_pred)
        predict_list.append(y_pred)
        accuracy.append(a)
        precisi.append(p)
        recal.append(r)
        f1.append(f)
    print('mean:', np.mean(accuracy), np.mean(precisi), np.mean(recal), np.mean(f1))
    print('meta-learning : t', t)
    return predict_list, test_label
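bagging_by_DGP returns the raw per-learner predictions; the majority vote still has to be taken. A hedged sketch (bagging_vote is my helper, not from the original):
import numpy as np
def bagging_vote(predict_list):
    # stack the t per-learner prediction vectors and vote per sample
    stacked = np.vstack([np.asarray(p).ravel().astype(int) for p in predict_list])
    return np.apply_along_axis(lambda c: np.bincount(c).argmax(), 0, stacked)
y_final = bagging_vote(predict_list)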
Boosting kernel code:
# Train the AdaBoost classifier. Assumes `from numpy import *` and the buildStump
# decision-stump helper (as in the Machine Learning in Action code this follows).
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    weakClassArr = []
    m = shape(dataArr)[0]
    D = mat(ones((m, 1)) / m)  # start with uniform sample weights
    aggClassEst = mat(zeros((m, 1)))
    for i in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)  # best single-level decision tree
        print("D:", D.T)
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))  # this weak learner's vote weight
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)  # store the weak classifier
        print("classEst: ", classEst.T)
        expon = multiply(-1 * alpha * mat(classLabels).T, classEst)
        D = multiply(D, exp(expon))  # re-weight the samples: up where misclassified, down where correct
        D = D / D.sum()  # renormalize so the weights sum to 1
        aggClassEst += alpha * classEst
        print("aggClassEst: ", aggClassEst.T)
        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T, ones((m, 1)))  # flag misclassified samples
        errorRate = aggErrors.sum() / m
        print("total error: ", errorRate)
        if errorRate == 0.0:
            break
    return weakClassArr, aggClassEst
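To classify new data with the returned weakClassArr, the usual companion routine looks like the sketch below. It assumes the stumpClassify helper and the 'dim'/'thresh'/'ineq' keys that buildStump stores in the classic Machine Learning in Action code this function follows; neither is shown above:
def adaClassify(datToClass, classifierArr):
    dataMatrix = mat(datToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m, 1)))
    for stump in classifierArr:
        # each stored stump votes, weighted by its alpha
        classEst = stumpClassify(dataMatrix, stump['dim'], stump['thresh'], stump['ineq'])
        aggClassEst += stump['alpha'] * classEst
    return sign(aggClassEst)  # final label in {-1, +1}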