|
目的:判断学习算法是否过拟合(高方差)或者欠拟合(高偏差);
过拟合:模型中有太多自由度或参数,对未知数据泛化能力差;解决办法:增加训练样本或降低自由度或参数;
欠拟合:训练和预测精度都很差;解决办法:增加模型参数;
learning curve:描述样本大小与测试精度和训练精度之间的关系;
training accuracy和validation accuracy 都随着training size而变化,但是在training accuracy 和 validation accuracy之间存在差值,所以模型具有轻微的过拟合;
# Learning curve: plot training and cross-validation accuracy against the
# number of training samples to diagnose over- or under-fitting.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
# NOTE: sklearn.learning_curve and sklearn.cross_validation were removed in
# scikit-learn 0.20; learning_curve and train_test_split now live in
# sklearn.model_selection.
from sklearn.model_selection import learning_curve, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

###############################################################################
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None)

# Columns 2 onward are used as features; column 1 is used as the target and
# is encoded to integer class labels.
X = df.loc[:, 2:].values
y = df.loc[:, 1].values
le = LabelEncoder()
y = le.fit_transform(y)
# print(le.transform(['M', 'B']))  # uncomment to inspect the label encoding

# Split the data into a training set and a test set (20% held out).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)

# Standardize the features, then fit an L2-regularized logistic regression.
pipe_lr = Pipeline([
    ('scl', StandardScaler()),
    ('clf', LogisticRegression(penalty='l2', random_state=0)),
])

# Evaluate the pipeline on 10 training-set fractions (10% .. 100%) using
# 10-fold cross-validation.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,
    n_jobs=1)

# Aggregate over axis 1 (one column per CV fold) to get one mean/std per
# training-set size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Mean accuracy curves with a +/- one standard deviation band.
plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5,
         label='training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std,
                 alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s',
         markersize=5, label='validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std,
                 alpha=0.15, color='green')

plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.9, 1.0])
plt.show()
validation curve:描述准确率与模型参数之间的关系;
随着LogisticRegression正则化参数C的变化,training accuracy和validation accuracy都发生变化;但最佳C值在0.1左右;
# Validation curve: plot training and cross-validation accuracy against the
# logistic-regression regularization parameter C.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
# NOTE: sklearn.learning_curve and sklearn.cross_validation were removed in
# scikit-learn 0.20; learning_curve, validation_curve and train_test_split
# now live in sklearn.model_selection.
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

###############################################################################
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None)

# Columns 2 onward are used as features; column 1 is used as the target and
# is encoded to integer class labels.
X = df.loc[:, 2:].values
y = df.loc[:, 1].values
le = LabelEncoder()
y = le.fit_transform(y)
# print(le.transform(['M', 'B']))  # uncomment to inspect the label encoding

# Split the data into a training set and a test set (20% held out).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)

###############################################################################
# Candidate values of C, spaced by powers of ten.
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

# Standardize the features, then fit an L2-regularized logistic regression.
pipe_lr = Pipeline([
    ('scl', StandardScaler()),
    ('clf', LogisticRegression(penalty='l2', random_state=0)),
])

# 'clf__C' addresses the C parameter of the pipeline step named 'clf'.
train_scores, test_scores = validation_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    param_name='clf__C',
    param_range=param_range,
    cv=10)

# Aggregate over axis 1 (one column per CV fold) to get one mean/std per
# value of C.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Mean accuracy curves with a +/- one standard deviation band.
plt.plot(param_range, train_mean, color='blue', marker='o', markersize=5,
         label='training accuracy')
plt.fill_between(param_range, train_mean + train_std, train_mean - train_std,
                 alpha=0.15, color='blue')
plt.plot(param_range, test_mean, color='green', linestyle='--', marker='s',
         markersize=5, label='validation accuracy')
plt.fill_between(param_range, test_mean + test_std, test_mean - test_std,
                 alpha=0.15, color='green')

plt.grid()
plt.xscale('log')  # param_range spans several orders of magnitude
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.92, 1.0])
plt.show()
#参考《Python 机器学习》,作者:Sebastian Raschka, 机械工业出版社;
Archiver|手机版|科学网 ( 京ICP备07017567号-12 )
GMT+8, 2024-11-24 13:35
Powered by ScienceNet.cn
Copyright © 2007- 中国科学报社