《Python数据分析和应用》实验四-使用scikit-learn-构建模型【精选推荐】
下面是小编为大家整理的《Python数据分析和应用》实验四-使用scikit-learn-构建模型【精选推荐】,供大家参考。
实验四
使用 scikit-learn 构建模型
教材 P196
# Textbook p.196, exercises 1-4.
# Exercise 1: load the wine and winequality datasets, split each into
# train/test parts, standardise them and project them onto 5 PCA components.
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the raw data; winequality.csv is parsed with ";" as the separator.
wine = pd.read_csv("D:\\桌面\\实验四\\data\\wine.csv")
winequality = pd.read_csv("D:\\桌面\\实验四\\data\\winequality.csv", sep=";")

# Separate features from labels.
# wine: label is the "Class" column, features are every column but the first.
wine_data = wine.iloc[:, 1:]
wine_target = wine["Class"]
print("wine 数据集的数据为:\n", wine_data)
print("wine 数据集的标签为:\n", wine_target)

# winequality: label is the "quality" column, features are every column but
# the last.
winequality_data = winequality.iloc[:, :-1]
winequality_target = winequality["quality"]
print("winequality 数据集的数据为:\n", winequality_data)
print("winequality 数据集的标签为:\n", winequality_target)

# Hold out 10% of each dataset as a test set (fixed seed for repeatability).
(
    wine_data_train,
    wine_data_test,
    wine_target_train,
    wine_target_test,
) = train_test_split(wine_data, wine_target, test_size=0.1, random_state=6)
(
    winequality_data_train,
    winequality_data_test,
    winequality_target_train,
    winequality_target_test,
) = train_test_split(
    winequality_data, winequality_target, test_size=0.1, random_state=6
)

# Standardise: each scaler is fitted on the training split only and then
# applied to both splits.
stdScale = StandardScaler().fit(wine_data_train)
wine_trainScaler = stdScale.transform(wine_data_train)
wine_testScaler = stdScale.transform(wine_data_test)

stdScale = StandardScaler().fit(winequality_data_train)
winequality_trainScaler = stdScale.transform(winequality_data_train)
winequality_testScaler = stdScale.transform(winequality_data_test)

# PCA dimensionality reduction to 5 components, again fitted on the training
# split only.
pca = PCA(n_components=5).fit(wine_trainScaler)
wine_trainPca = pca.transform(wine_trainScaler)
wine_testPca = pca.transform(wine_testScaler)

# `pca` is rebound to the winequality model here; the statement that follows
# this block uses it to transform the winequality test split.
pca = PCA(n_components=5).fit(winequality_trainScaler)
winequality_trainPca = pca.transform(winequality_trainScaler)
# Last step of exercise 1: project the winequality test split with the PCA
# model fitted on the winequality training split.
winequality_testPca = pca.transform(winequality_testScaler)

# Exercise 2: K-Means clustering on the standardised wine training data
# (wine_trainScaler / wine_target_train) produced in exercise 1.
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans  # was misspelled "Kmeans" (ImportError)
from sklearn.metrics import fowlkes_mallows_score
from sklearn.metrics import silhouette_score

try:
    from sklearn.metrics import calinski_harabasz_score
except ImportError:
    # Older scikit-learn releases only provide the misspelled name.
    from sklearn.metrics import (
        calinski_harabaz_score as calinski_harabasz_score,
    )

# Build a K-Means model with 3 clusters.
kmeans = KMeans(n_clusters=3, random_state=1).fit(wine_trainScaler)
print("构建的 KMeans 模型为:\n", kmeans)

# Compare the true labels with the cluster labels via the FMI score.
score = fowlkes_mallows_score(wine_target_train, kmeans.labels_)
print("wine 数据集的 FMI:%f" % (score))

# FMI for 2..10 clusters, to pick the best cluster count.
for i in range(2, 11):
    kmeans = KMeans(n_clusters=i, random_state=123).fit(wine_trainScaler)
    score = fowlkes_mallows_score(wine_target_train, kmeans.labels_)
    # Message previously said "iris"; the data being clustered is wine.
    print("wine 数据聚%d 类 FMI 评价分值为:%f" % (i, score))

# Silhouette coefficient for 2..10 clusters, drawn as a line chart.
# The model is fitted on wine_trainScaler rather than the raw `wine` frame:
# `wine` still contains the "Class" label column and is unscaled, so
# clustering it would leak the labels into the features.
silhouette_scores = []
for i in range(2, 11):
    kmeans = KMeans(n_clusters=i, random_state=1).fit(wine_trainScaler)
    score = silhouette_score(wine_trainScaler, kmeans.labels_)
    silhouette_scores.append(score)
plt.figure(figsize=(10, 6))
plt.plot(range(2, 11), silhouette_scores, linewidth=1.5, linestyle="-")
plt.show()

# Calinski-Harabasz index for 2..10 clusters, on the same standardised data.
for i in range(2, 11):
    kmeans = KMeans(n_clusters=i, random_state=1).fit(wine_trainScaler)
    score = calinski_harabasz_score(wine_trainScaler, kmeans.labels_)
    # Message previously said "seeds"; the data being clustered is wine.
    print("wine 数据聚%d 类 calinski_harabaz 指数为:%f" % (i, score))
运行时出现错误,但代码逻辑本身没有问题;换了一台电脑后得到了结果(推测是两台电脑的 scikit-learn 版本不同所致:新版本已将 calinski_harabaz_score 更名为 calinski_harabasz_score)。
# Exercise 3: SVM classification of the wine dataset.
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

# Read the wine dataset again and separate labels ("Class" column) from
# features (every column but the first). This rebinds the wine_* names.
wine = pd.read_csv("D:\\桌面\\实验四\\data\\wine.csv")
wine_data = wine.iloc[:, 1:]
wine_target = wine["Class"]

# Split into training and test sets (10% test, fixed seed).
(
    wine_data_train,
    wine_data_test,
    wine_target_train,
    wine_target_test,
) = train_test_split(wine_data, wine_target, test_size=0.1, random_state=6)

# Min-max scaling, fitted on the training split only and applied to both.
stdScale = MinMaxScaler().fit(wine_data_train)
wine_trainScaler = stdScale.transform(wine_data_train)
wine_testScaler = stdScale.transform(wine_data_test)

# Build an SVM model with default parameters and predict the test set.
svm = SVC().fit(wine_trainScaler, wine_target_train)
print("建立的 SVM 模型为:\n", svm)
wine_target_pred = svm.predict(wine_testScaler)
print("预测前 10 个结果为:\n", wine_target_pred[:10])

# Print the classification report to evaluate the classifier.
# Message previously said "iris"; the data being classified is wine.
print(
    "使用 SVM 预测 wine 数据的分类报告为:",
    "\n",
    classification_report(wine_target_test, wine_target_pred),
)
# Exercise 4: regression on the PCA-reduced winequality data prepared in
# exercise 1 (winequality_trainPca / winequality_testPca and their targets).
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score


def _print_regression_metrics(model_name, y_true, y_hat):
    """Print five regression metrics for one model's predictions.

    Reports mean absolute error, mean squared error, median absolute error,
    explained variance and R^2, in that order.

    Args:
        model_name: Model label inserted into each printed message.
        y_true: True target values.
        y_hat: Predicted target values, aligned with ``y_true``.
    """
    print("winequality 数据%s的平均绝对误差为:" % model_name,
          mean_absolute_error(y_true, y_hat))
    print("winequality 数据%s的均方误差为:" % model_name,
          mean_squared_error(y_true, y_hat))
    print("winequality 数据%s的中值绝对误差为:" % model_name,
          median_absolute_error(y_true, y_hat))
    print("winequality 数据%s的可解释方差值为:" % model_name,
          explained_variance_score(y_true, y_hat))
    print("winequality 数据%s的 R 方值为:" % model_name,
          r2_score(y_true, y_hat))


# Build a linear regression model and predict the test split.
clf = LinearRegression().fit(winequality_trainPca, winequality_target_train)
y_pred = clf.predict(winequality_testPca)
print("线性回归模型预测前 10 个结果为:", "\n", y_pred[:10])

# Build a gradient boosting regression model and predict the test split.
# NOTE: despite its name, wine_target_pred holds winequality predictions here.
GBR_wine = GradientBoostingRegressor().fit(
    winequality_trainPca, winequality_target_train
)
wine_target_pred = GBR_wine.predict(winequality_testPca)
print("梯度提升回归模型预测前 10 个结果为:", "\n", wine_target_pred[:10])
print("真实标签前十个预测结果为:", "\n", list(winequality_target_test[:10]))

# Compare both models against the true quality scores to judge which
# performs better.
print("线性回归模型评价结果:")
_print_regression_metrics("线性回归模型", winequality_target_test, y_pred)

print("梯度提升回归模型评价结果:")
_print_regression_metrics(
    "梯度提升回归树模型", winequality_target_test, wine_target_pred
)
通过对比两者的回归评价指标发现,线性回归模型的平均绝对误差、均方误差、中值绝对误差都要大于梯度提升回归模型,而可解释方差和 R² 值都要小于梯度提升回归模型。对于一个回归模型来说,平均绝对误差、均方误差、中值绝对误差越接近 0 越好,可解释方差和 R² 值越接近 1 越好,因此可知,梯度提升回归模型在五个指标上都要优于线性回归模型,故在本题中,梯度提升回归模型性能更优。
推荐访问:python数据分析实验心得 《Python数据分析和应用》实验四-使用scikit-learn-构建模型 构建 模型 实验
热门文章:
- 前台收银员年度工作总结11篇(范例推荐)2024-02-01
- 项目质检员个人工作总结4篇2024-02-01
- 2024年度审计年度工作总结参考6篇2024-02-01
- 2024年面试简短自我介绍模板4篇2024-02-01
- 妇女节演讲稿最新3篇(完整)2024-02-01
- 在小学实习报告9篇【完整版】2024-02-01
- 2024年有关护理实习报告模板3篇(全文完整)2024-02-01
- 2024年司机辞职报告10篇(完整)2024-02-01
- 模具专业求职信模板6篇【完整版】2024-02-01
- 关于作业的检讨书12篇(范例推荐)2024-02-01
相关文章:
- 化学实验室总结报告3篇2022-05-19
- 化学实验室个人工作总结15篇2022-05-19
- 实验室计划和总结11篇2022-05-27
- 化学实验室的安全管理制度应该(,)13篇2022-05-31
- 实验报告怎么写格式11篇2022-06-08
- 医学检验实验室生物安全【精选推荐】2022-10-28
- 实验室员工保密协议9篇2022-10-29
- 实验室员工保密协议3篇2022-11-23
- 文科实验班班主任工作总结10篇(全文完整)2023-12-24
- 实验室人员年终总结3篇(范文推荐)2023-12-26