|
有监督数据降维技术:线性判别法(Linear Discriminant Analysis, LDA)
LDA是找出可以最优化分类的特征子空间;PCA试图在数据集中找到方差最大、正交的主成分分量的轴;
LDA是有监督学习算法、PCA是无监督学习算法;
LDA假设:数据呈现正态分布、各类别数据具有相同的协方差矩阵、样本特征从统计上讲是相互独立的;说明:即使某个假设未满足,LDA仍可以很好地对数据进行降维;
LDA算法流程如下:
对d维数据集进行标准化处理(d为特征数量);
对于每一类别,计算d维的均值向量;
构造类间散布矩阵S_B以及类内散布矩阵S_W;
计算矩阵 $S_W^{-1}S_B$(类内散布矩阵的逆乘以类间散布矩阵)的特征值以及对应的特征向量;
选取前k个最大特征值所对应的特征向量,构造一个d×k的转换矩阵W,其中特征向量以列的形式排列;
利用转换矩阵W将样本映射到新空间中;
试用scikit-learn进行LDA分析
结果如下:
训练数据集 测试数据集
# LDA with scikit-learn: project the standardized data onto two linear
# discriminants, fit a logistic-regression classifier in that 2-D space and
# plot its decision regions for the training and the test samples.
#
# NOTE: X_train_std, y_train, X_test_std and y_test are not defined in this
# snippet; they must already exist (standardized feature matrices and their
# class labels) before it is run.
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression

from plot_decision_regions import plot_decision_regions


def show_lda_regions(X_lda, y, classifier):
    """Plot the classifier's decision regions over LDA-projected samples.

    Args:
        X_lda: Samples already projected onto the two linear discriminants.
        y: Class labels of the samples in ``X_lda``.
        classifier: Fitted classifier forwarded to ``plot_decision_regions``.
    """
    plot_decision_regions(X_lda, y, classifier=classifier)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.show()


lda = LDA(n_components=2)
# LDA is supervised: the projection is fitted with the training labels.
X_train_lda = lda.fit_transform(X_train_std, y_train)

lr = LogisticRegression()
lr.fit(X_train_lda, y_train)
show_lda_regions(X_train_lda, y_train, lr)

# Test samples: reuse the projection fitted on the training data.
X_test_lda = lda.transform(X_test_std)
show_lda_regions(X_test_lda, y_test, lr)
单独编程代码如下:
"""Step-by-step Linear Discriminant Analysis (LDA) on the UCI Wine data set.

The numeric steps (class means, within-/between-class scatter matrices,
eigen-decomposition, projection matrix) are plain NumPy functions. Running the
file as a script downloads the Wine data, standardizes it, performs LDA by
hand and shows the discriminability plot and the 2-D projection of the
training samples.
"""
import numpy as np


def class_mean_vectors(X, y, labels):
    """Return the per-class mean vector of ``X`` for each label in ``labels``.

    Args:
        X: Sample matrix of shape (n_samples, n_features).
        y: Class label of every row of ``X``.
        labels: Iterable of class labels, defining the output order.

    Returns:
        A list with one mean vector (length n_features) per label.
    """
    return [np.mean(X[y == label], axis=0) for label in labels]


def within_class_scatter(X, y, labels):
    """Return the within-class scatter matrix S_W.

    Each class contributes its covariance matrix (``np.cov``), i.e. the class
    scatter normalized by the class size, so that classes with different
    sample counts are weighted equally.

    Args:
        X: Sample matrix of shape (n_samples, n_features).
        y: Class label of every row of ``X``.
        labels: Iterable of class labels to accumulate over.

    Returns:
        Array of shape (n_features, n_features).
    """
    d = X.shape[1]
    S_W = np.zeros((d, d))
    for label in labels:
        S_W += np.cov(X[y == label].T)
    return S_W


def between_class_scatter(X, y, labels, mean_vecs):
    """Return the between-class scatter matrix S_B.

    S_B is the sum over classes of ``n_c * (m_c - m)(m_c - m)^T`` where
    ``n_c`` is the class size, ``m_c`` the class mean and ``m`` the overall
    mean of ``X``.

    Args:
        X: Sample matrix of shape (n_samples, n_features).
        y: Class label of every row of ``X``.
        labels: Iterable of class labels, aligned with ``mean_vecs``.
        mean_vecs: Per-class mean vectors, in the same order as ``labels``.

    Returns:
        Array of shape (n_features, n_features).
    """
    d = X.shape[1]
    # The overall mean does not depend on the class, so compute it once.
    mean_overall = np.mean(X, axis=0).reshape(d, 1)
    S_B = np.zeros((d, d))
    for label, mean_vec in zip(labels, mean_vecs):
        n = X[y == label].shape[0]
        diff = mean_vec.reshape(d, 1) - mean_overall
        S_B += n * diff.dot(diff.T)
    return S_B


def sorted_eigen_pairs(S_W, S_B):
    """Eigen-decompose ``inv(S_W) @ S_B`` and sort by eigenvalue magnitude.

    Args:
        S_W: Within-class scatter matrix.
        S_B: Between-class scatter matrix.

    Returns:
        A tuple ``(eigen_vals, eigen_pairs)``: the raw eigenvalues as returned
        by ``np.linalg.eig`` and a list of ``(abs(eigenvalue), eigenvector)``
        tuples sorted by decreasing magnitude.
    """
    eigen_vals, eigen_vecs = np.linalg.eig(np.linalg.inv(S_W).dot(S_B))
    eigen_pairs = [(np.abs(val), eigen_vecs[:, i])
                   for i, val in enumerate(eigen_vals)]
    eigen_pairs.sort(key=lambda pair: pair[0], reverse=True)
    return eigen_vals, eigen_pairs


def projection_matrix(eigen_pairs, k=2):
    """Stack the ``k`` leading eigenvectors column-wise into the matrix W.

    Args:
        eigen_pairs: ``(magnitude, eigenvector)`` tuples sorted by decreasing
            magnitude, as returned by ``sorted_eigen_pairs``.
        k: Number of linear discriminants to keep.

    Returns:
        Real-valued array of shape (n_features, k).

    Raises:
        ValueError: If ``k`` is not between 1 and ``len(eigen_pairs)``.
    """
    if not 1 <= k <= len(eigen_pairs):
        raise ValueError('k must be between 1 and %d' % len(eigen_pairs))
    # np.linalg.eig may return complex values; only the real part is used.
    return np.column_stack([vec.real for _, vec in eigen_pairs[:k]])


if __name__ == "__main__":
    # Script-only dependencies are imported here so that the numeric helpers
    # above stay importable with NumPy alone.
    import matplotlib.pyplot as plt
    import pandas as pd
    # train_test_split lives in sklearn.model_selection; the old
    # sklearn.cross_validation module was removed in scikit-learn 0.20.
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    df_wine = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)
    df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                       'Alcalinity of ash', 'Magnesium', 'Total phenols',
                       'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                       'Color intensity', 'Hue',
                       'OD280/OD315 of diluted wines', 'Proline']

    # Split the data into a training set and a test set.
    X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    # Standardize using statistics of the training set only.
    stdsc = StandardScaler()
    X_train_std = stdsc.fit_transform(X_train)
    X_test_std = stdsc.transform(X_test)

    # Per-class mean vectors.
    np.set_printoptions(precision=4)
    labels = np.unique(y_train)
    mean_vecs = class_mean_vectors(X_train_std, y_train, labels)
    for label, mean_vec in zip(labels, mean_vecs):
        print('Mv %s: %s \n' % (label, mean_vec))

    # Within-class scatter matrix.
    S_W = within_class_scatter(X_train_std, y_train, labels)
    print('Within-class scatter matrix:%sx%s' % (S_W.shape[0], S_W.shape[1]))

    # Between-class scatter matrix.
    S_B = between_class_scatter(X_train_std, y_train, labels, mean_vecs)
    print('Between-class scatter matrix: %s x %s' % (S_B.shape[0], S_B.shape[1]))

    # Eigenvalues and eigenvectors of inv(S_W) . S_B.
    eigen_vals, eigen_pairs = sorted_eigen_pairs(S_W, S_B)
    print('Eigenvalues in decreasing order : \n')
    for eigen_val, _ in eigen_pairs:
        print(eigen_val)

    # Share of the class-discriminatory information per linear discriminant.
    tot = sum(eigen_vals.real)
    discr = [val / tot for val in sorted(eigen_vals.real, reverse=True)]
    cum_discr = np.cumsum(discr)
    component_ids = range(1, len(discr) + 1)
    plt.bar(component_ids, discr, alpha=0.5, align='center',
            label='individual "discriminability"')
    plt.step(component_ids, cum_discr, where='mid',
             label='cumulative "discriminability"')
    plt.ylabel('"discriminability" ratio')
    plt.xlabel('Linear Discriminants')
    plt.legend(loc='best')
    plt.show()

    # Transformation matrix built from the two leading eigenvectors.
    w = projection_matrix(eigen_pairs, 2)
    print('Matrix W: \n', w)
    X_train_lda = X_train_std.dot(w)

    # Show the training samples in the new feature subspace.
    colors = ['r', 'b', 'g']
    markers = ['s', 'x', 'o']
    for label, color, marker in zip(np.unique(y_train), colors, markers):
        plt.scatter(X_train_lda[y_train == label, 0],
                    X_train_lda[y_train == label, 1],
                    c=color, label=label, marker=marker)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.show()
#参考《Python 机器学习》,作者:Sebastian Raschka,机械工业出版社;
Archiver|手机版|科学网 ( 京ICP备07017567号-12 )
GMT+8, 2024-4-25 19:42
Powered by ScienceNet.cn
Copyright © 2007- 中国科学报社