特征工程（七）—特征学习RBM

article/2025/11/6 23:51:44

1、MNIST数据集

"""
MNIST数据集，包括6000个0-9手写数字图像，以及学习的真实值此处使用很低级的特征，而不是解释性很好的特征。每一个数据点包括784个特征（灰度图像的像素值）
"""import numpy as np
import matplotlib.pyplot as plt
%matplotlib inlinefrom sklearn import linear_model, datasets, metrics# RBM 这是sklearn唯一的RBM实现
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import  Pipeline# 把数据集导入到Numpy数组
images = np.genfromtxt('../data/mnist_train.csv', delimiter=',')# 6000个图像，785列（28像素 * 28像素  + 1个响应变量）
images.shape
#%%
# 785由784像素 + 一个响应变量（第1列）组成，除了响应的变量，每一列的范围都是0到255，表示像素的强度
# 0代表白色背景 255代表全黑的像素images_X, images_y = images[:, 1:], images[:, 0]# 值很大，但是scikit-learn的RBM会进行0~1的缩放
np.min(images_X),np.max(images_X)

在这里插入图片描述

# 查看第一个图像
plt.imshow(images_X[0].reshape(28, 28), cmap=plt.cm.gray_r)images_y[0]

在这里插入图片描述

'''
scikit-learn的唯一的RBM实现是伯努利RBM，要求数据的数值为0~1。修改原始数据集，只考虑硬编码的黑白像素强度，这样每个像素的值会变成0或1（白或黑）
'''# 缩放到0~1
images_X = images_X / 255
# 二分像素（白或黑）
images_X = (images_X > 0.5).astype(float)
np.min(images_X),np.max(images_X)

# 看一下修改后的数字5
plt.imshow(images_X[0].reshape(28, 28), cmap=plt.cm.gray_r)images_y[0]
# 可以看出，图像的模糊消失了，要分类的数字变得很清晰

在这里插入图片描述
应用PCA会如何

# 可以看出，图像的模糊消失了，要分类的数字变得很清晰
#%%
'''
引入RBM之前，先看看应用PCA会如何？使用特征（784个黑或白的像素），并对矩阵进行特征值的分解，从数据集中提取特征数字。从784个主成分中提取100个并绘制出来，查看一下外观。
'''
from sklearn.decomposition import PCApca = PCA(n_components=100)pca.fit(images_X)# 绘制100个主成分
plt.figure(figsize=(10, 10))
for i, comp in enumerate(pca.components_):plt.subplot(10, 10, i + 1)plt.imshow(comp.reshape((28, 28)), cmap=plt.cm.gray_r)plt.xticks()plt.yticks()plt.suptitle('100 components by pca')plt.show()# 协方差矩阵被缩放程原始图像尺寸时候，特征值的样子。
# 每个主成分都试图理解图像的某个方面，例如，第1个（也是最重要的）特征图像有可能捕捉数字0
# 前10个主成分似乎保留了一点数字的形状，之后图像好像没什么意义了。这可能是因为，PCA(LDA)是参数变换，从图像等复杂数据集提取的能力有限。

在这里插入图片描述

full_pca = PCA(n_components=784)
full_pca.fit(images_X)import matplotlib as mpl
# 解决中文乱码
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = Falseplt.plot(np.cumsum(full_pca.explained_variance_ratio_)
)
plt.xlabel('主成分')
plt.ylabel('累积方差')# 100个主成分捕获90%的方差

在这里插入图片描述

从MNIST中提取RBM的特征

'''
从MNIST中提取RBM的特征'''
rbm = BernoulliRBM(random_state=0, # 复现训练verbose=True, # 查看训练过程n_iter=20, # 前后向传导的次数n_components=100 # 希望创建的特征数，任意整数
)rbm.fit(images_X)

在这里插入图片描述

# 对rbm进行可视化，查看和特征数字的区别
# 绘制100个主成分
plt.figure(figsize=(10, 10))
for i, comp in enumerate(rbm.components_):plt.subplot(10, 10, i + 1)plt.imshow(comp.reshape((28, 28)), cmap=plt.cm.gray_r)plt.xticks()plt.yticks()plt.suptitle('100 components by rbm')plt.show()# rbm特征在提取不同的形状和笔划

在这里插入图片描述

plt.figure(figsize=(25, 25))
for i, comp in enumerate(top_features):plt.subplot(5, 4, i + 1)plt.imshow(rbm.components_[comp].reshape((28, 28)), cmap=plt.cm.gray_r)plt.title("Components {},feature value:{}".format(comp,round(image_new_features[comp], 2)), fontsize=20)plt.suptitle('20 components by top_features', fontsize=30)plt.show()# 里面特征十分有意义，例如，34好像直接识别出了5，21隔离了数字5底部的圆圈

在这里插入图片描述

2、在流水线上应用rbm

1、对原始像素值应用线性模型

'''
对原始像素值应用线性模型
'''
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import numpy as np# 把数据集导入到Numpy数组
images = np.genfromtxt('../data/mnist_train.csv', delimiter=',')
# 785由784像素 + 一个响应变量（第1列）组成，除了响应的变量，每一列的范围都是0到255，表示像素的强度
# 0代表白色背景 255代表全黑的像素images_X, images_y = images[:, 1:], images[:, 0]# 缩放到0~1
images_X = images_X / 255
# 二分像素（白或黑）
images_X = (images_X > 0.5).astype(float)# 创建逻辑回归
lr = LogisticRegression()
params = {'C':[1e-2, 1e-1, 1e0, 1e1, 1e2]
}grid = GridSearchCV(lr, params)# 拟合数据
grid.fit(images_X, images_y)'''
最佳参数：{'C': 0.1}
交叉验证的最佳准确率：0.8983333333333332
'''
print('最佳参数：{}'.format(grid.best_params_))
print('交叉验证的最佳准确率：{}'.format(grid.best_score_))

2、对提取的PCA主成分应用线性模型

'''
对提取的pca主成分应用线性模型
'''
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
pca = PCA()params = {'clf__C':[1e-2, 1e-1, 1e0, 1e1, 1e2],'pca__n_components': [10, 100, 200]
}# 创建流水线
pipeline = Pipeline([('pca', pca),('clf', lr)]
)grid = GridSearchCV(pipeline, params)# 拟合数据
grid.fit(images_X, images_y)'''
最佳参数：{'clf__C': 0.1, 'pca__n_components': 100}
交叉验证的最佳准确率：0.8981666666666666准确率稍差一些。
'''
print('最佳参数：{}'.format(grid.best_params_))
print('交叉验证的最佳准确率：{}'.format(grid.best_score_))

3、对提取的RBM特征应用线性模型

'''
对提取的rbm特征应用线性模型
'''
from sklearn.neural_network import BernoulliRBMrbm = BernoulliRBM(random_state=0)params = {'clf__C':[1e-2, 1e-1, 1e0, 1e1, 1e2],'rbm__n_components': [100, 200]
}
# 创建流水线
pipeline = Pipeline([('rbm', rbm),('clf', lr)]
)grid = GridSearchCV(pipeline, params)# 拟合数据
grid.fit(images_X, images_y)'''
最佳参数：{'clf__C': 1.0, 'rbm__n_components': 200}
交叉验证的最佳准确率：0.9206666666666667
rbm模块的交叉验证率是，能从数字中提取200个新特征。以上说明，面对复杂的任务（例如图像识别、音频处理、和自然语言处理），特征学习算法很有效。
'''
print('最佳参数：{}'.format(grid.best_params_))
print('交叉验证的最佳准确率：{}'.format(grid.best_score_))