|
confusion matrix:混淆矩阵
预测误差ERR: $ERR = \frac{FP + FN}{FP + FN + TP + TN}$
预测准确率ACC: $ACC = \frac{TP + TN}{FP + FN + TP + TN} = 1 - ERR$
当类别数量不均衡时:
真正率TPR: $TPR = \frac{TP}{P} = \frac{TP}{FN + TP}$
假正率FPR: $FPR = \frac{FP}{N} = \frac{FP}{FP + TN}$
精确率PRE, precision: $PRE = \frac{TP}{TP + FP}$
召回率REC, recall: $REC = TPR = \frac{TP}{FN + TP}$
ROC(Receiver Operating Characteristic,受试者工作特征)曲线
利用假正率、真正率等指标选择分类模型;
理想分类器,假正率为0、真正率为1;
基于ROC曲线,可以计算Area Under the Curve(AUC),即曲线下面积;
# Plot per-fold and mean ROC curves for a logistic-regression pipeline
# evaluated with 3-fold stratified cross-validation on the WDBC data set.
#
# NOTE: the cross-validation helpers are imported from
# ``sklearn.model_selection``; the former ``sklearn.cross_validation``,
# ``sklearn.grid_search`` and ``sklearn.learning_curve`` modules were removed
# from scikit-learn. Imports that this section does not use are kept because
# other parts of the file may rely on them.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    learning_curve,
    train_test_split,
    validation_curve,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC

###############################################################################
# Load the data: column 1 holds the class label, columns 2 onward the features.
df = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/'
    'breast-cancer-wisconsin/wdbc.data',
    header=None)

X = df.loc[:, 2:].values
y = df.loc[:, 1].values

# Encode the class labels as integers.
# NOTE(review): the labels are presumably 'M' and 'B'; LabelEncoder numbers
# classes in sorted order, so that would give B -> 0 and M -> 1 -- confirm.
le = LabelEncoder()
y = le.fit_transform(y)

# Split the data into a training set (80%) and a test set (20%).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)

###############################################################################
# Standardize the features, then fit an L2-penalized logistic regression.
pipe_lr = Pipeline([
    ('scl', StandardScaler()),
    ('clf', LogisticRegression(penalty='l2', random_state=0)),
])

# Only two feature columns (indices 4 and 14) are used for the ROC analysis.
X_train2 = X_train[:, [4, 14]]

# Unshuffled stratified 3-fold split. ``random_state`` is intentionally not
# passed: it has no effect without shuffling and current scikit-learn rejects
# that combination.
cv = StratifiedKFold(n_splits=3)
folds = list(cv.split(X_train2, y_train))

fig = plt.figure(figsize=(7, 5))

# Common FPR grid on which every fold's ROC curve is interpolated so that the
# curves can be averaged point by point.
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)

for i, (train, test) in enumerate(folds):
    probas = pipe_lr.fit(
        X_train2[train], y_train[train]).predict_proba(X_train2[test])
    # Column 1 of ``probas`` is the predicted probability of class 1.
    fpr, tpr, _ = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr,
             tpr,
             lw=1,
             label='ROC fold %d (area = %0.2f)' % (i + 1, roc_auc))

# Reference diagonal, drawn once so the legend gets a single entry.
plt.plot([0, 1],
         [0, 1],
         linestyle='--',
         color=(0.6, 0.6, 0.6),
         label='random guessing')

# Average over the folds and pin the end points of the mean curve to
# (0, 0) and (1, 1).
mean_tpr /= len(folds)
mean_tpr[0] = 0.0
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr,
         mean_tpr,
         'k--',
         label='mean ROC (area =%0.2f)' % mean_auc,
         lw=2)

# Ideal classifier: FPR of 0 with TPR of 1.
plt.plot([0, 0, 1],
         [0, 1, 1],
         lw=2,
         linestyle=':',
         color='black',
         label='perfect performance')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend(loc='lower right')
plt.show()
#参考《Python 机器学习》,作者:Sebastian Raschka, 机械工业出版社;
Archiver|手机版|科学网 ( 京ICP备07017567号-12 )
GMT+8, 2024-11-14 11:25
Powered by ScienceNet.cn
Copyright © 2007- 中国科学报社