36 lines
1.4 KiB
Python
36 lines
1.4 KiB
Python
# -*- coding: utf-8 -*-
|
|
import numpy as np
|
|
import pandas as pd
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn import preprocessing
|
|
from sklearn.ensemble import RandomForestRegressor
|
|
from sklearn.pipeline import make_pipeline
|
|
from sklearn.model_selection import GridSearchCV
|
|
from sklearn.metrics import mean_squared_error, r2_score
|
|
|
|
# Load the data.
# NOTE(review): this is an absolute, machine-specific path; the file is parsed
# with ';' as the field separator.
dataset_url = r'/Users/light/workspace/courses/numerical_analysis/8/wine.csv'
data = pd.read_csv(dataset_url, sep=';')
print(data)
# print(data.describe())

# Split the data into training and test sets.
# The 'quality' column is the target; every other column is a feature.
# 10% of the rows are held out, stratified on the target, with a fixed seed
# so the split is the same on every run.
y = data.quality
X = data.drop('quality', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.1,
    random_state=123,
    stratify=y,
)

# Declare the preprocessing + model pipeline: standardize the features, then
# fit a random forest regressor with 100 trees.
pipeline = make_pipeline(
    preprocessing.StandardScaler(),
    RandomForestRegressor(n_estimators=100),
)

# Declare the hyperparameter grid. Keys use the 'randomforestregressor__'
# prefix so GridSearchCV routes them to the forest step of the pipeline.
# 1.0 (consider all features at each split) replaces the former 'auto'
# option: for RandomForestRegressor 'auto' meant the same thing, but it was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it makes every
# candidate that uses it fail to fit.
hyperparameters = {
    'randomforestregressor__max_features': [1.0, 'sqrt', 'log2'],
    'randomforestregressor__max_depth': [None, 5, 3, 1],
}

# Tune the model: exhaustive grid search with 10-fold cross-validation on the
# training set.
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
clf.fit(X_train, y_train)

# Predict on the held-out test set and report R^2 and mean squared error.
pred = clf.predict(X_test)
print(r2_score(y_test, pred))
print(mean_squared_error(y_test, pred))