# -*- coding: utf-8 -*-
"""Fit and evaluate a random-forest regressor on the wine CSV.

The script reads a semicolon-separated CSV, uses the ``quality`` column as
the regression target, holds out 10% of the rows as a test set, tunes a
``StandardScaler`` + ``RandomForestRegressor`` pipeline with a 10-fold
cross-validated grid search, and prints the R^2 score and the mean squared
error measured on the held-out rows.
"""
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# Load the data (the file uses ';' as the field separator).
dataset_url = r'/Users/light/workspace/courses/numerical_analysis/8/wine.csv'
data = pd.read_csv(dataset_url, sep=';')
print(data)
# Uncomment for per-column summary statistics:
# print(data.describe())

# Split the data into a training set and a test set.
# `stratify=y` keeps the distribution of quality values the same in both
# parts; the fixed `random_state` makes the split reproducible.
y = data.quality
X = data.drop('quality', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=123,
    stratify=y,
)

# Declare the preprocessing + model pipeline: standardize every feature,
# then fit a 100-tree random forest. Wrapping both steps in one pipeline
# means the scaler is re-fitted on the training part of every CV fold.
pipeline = make_pipeline(
    preprocessing.StandardScaler(),
    RandomForestRegressor(n_estimators=100),
)

# Declare the hyperparameter grid. Keys are '<pipeline step>__<parameter>'.
# NOTE: `max_features='auto'` was deprecated in scikit-learn 1.1 and removed
# in 1.3. For regressors it meant "consider all features", which is spelled
# 1.0 (a fraction of n_features) and is accepted by old and new releases.
hyperparameters = {
    'randomforestregressor__max_features': [1.0, 'sqrt', 'log2'],
    'randomforestregressor__max_depth': [None, 5, 3, 1],
}

# Tune the model with an exhaustive grid search under 10-fold CV.
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
clf.fit(X_train, y_train)

# Evaluate on the held-out test set. With GridSearchCV's default
# `refit=True`, `predict` uses the pipeline refitted with the best
# parameter combination.
pred = clf.predict(X_test)
print(r2_score(y_test, pred))
print(mean_squared_error(y_test, pred))