[머신러닝 가이드] 5-3 다양한 회귀
# --- Load the breast-cancer data, standardize, and split ----------------
import pandas as pd
import matplotlib.pyplot as plt
# NOTE: '%matplotlib inline' is an IPython magic, not valid Python syntax;
# it breaks this file when run as a plain .py script. Kept as a comment —
# re-enable it only inside a Jupyter notebook.
# %matplotlib inline
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()

# Standardize every feature to zero mean / unit variance — linear models
# such as logistic regression are sensitive to feature scale.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(cancer.data)

# 70/30 train-test split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    data_scaled, cancer.target, test_size=0.3, random_state=0)
평균이 0, 분산이 1이 되도록 데이터를 표준화(스케일 변환)했습니다.
로지스틱 회귀는 선형 회귀 방식을 응용한 기법으로, 데이터의 분포(스케일)에 영향을 많이 받습니다.
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np

# Fit a logistic-regression classifier on the scaled training split.
lr_clf = LogisticRegression()
lr_clf.fit(x_train, y_train)
lr_preds = lr_clf.predict(x_test)

# BUG FIX: ROC-AUC must be computed from the positive-class probability,
# not from the hard 0/1 predictions — passing predict() output collapses
# the ROC curve to a single threshold and misstates the score.
lr_pred_proba = lr_clf.predict_proba(x_test)[:, 1]

print('정확도 :', np.round(accuracy_score(y_test, lr_preds), 4))
print('roc 커브 :', np.round(roc_auc_score(y_test, lr_pred_proba), 4))
from sklearn.model_selection import GridSearchCV

# Grid-search over the regularization type and inverse strength C.
# BUG FIX: LogisticRegression's default 'lbfgs' solver supports only the
# 'l2' penalty, so the 'l1' candidates raise ValueError at fit time in
# modern scikit-learn; 'liblinear' supports both penalties.
params = {'penalty': ['l2', 'l1'],
          'C': [0.01, 0.1, 1, 5, 10]}
grid_clf = GridSearchCV(LogisticRegression(solver='liblinear'),
                        param_grid=params, scoring='accuracy', cv=3)
grid_clf.fit(data_scaled, cancer.target)
print('최적 파라미터 : ', grid_clf.best_params_, '최적 평균 정확도', grid_clf.best_score_)
최적 파라미터는 l2 규제(릿지 회귀)로, C(알파의 역수)가 1일 때입니다.
# --- Random-forest regression on the Boston housing data ----------------
# BUG FIX: sklearn.datasets.load_boston was deprecated in scikit-learn 1.0
# and removed in 1.2; fetch the same dataset from OpenML instead.
from sklearn.datasets import fetch_openml
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np

# 506 rows, 13 features; the target is the median house price (MEDV).
boston = fetch_openml(name='boston', version=1, as_frame=True)
# OpenML marks CHAS/RAD as categorical; coerce every column to numeric so
# the regressor sees the same all-numeric matrix load_boston used to return.
bostonDF = boston.data.apply(pd.to_numeric)
bostonDF['PRICE'] = pd.to_numeric(boston.target)

y_target = bostonDF['PRICE']
x_data = bostonDF.drop(['PRICE'], axis=1, inplace=False)

# 5-fold CV RMSE for a 1000-tree random forest. scikit-learn only exposes
# MSE as a "greater is better" *negative* scorer, hence the sign flip
# before taking the square root.
rf = RandomForestRegressor(random_state=0, n_estimators=1000)
neg_mse_scores = cross_val_score(rf, x_data, y_target,
                                 scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-1 * neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

print('mse score : ', np.round(neg_mse_scores, 4))
print('rmse score : ', np.round(rmse_scores, 4))
print('평균 rmse score : ', np.round(avg_rmse, 4))
랜덤 포레스트 회귀입니다. 평균 RMSE 값은 4.42로 꽤 좋은 수치입니다.
def get_model_cv_prediction(model, x_data, y_target, cv=5):
    """Print the model's class name and its cross-validated average RMSE.

    Parameters
    ----------
    model : scikit-learn compatible regressor (implements fit/predict).
    x_data : feature matrix.
    y_target : regression target.
    cv : int, number of cross-validation folds (default 5; was hard-coded).

    Returns
    -------
    float
        The average RMSE across the folds (also printed).
    """
    # scikit-learn scorers follow a "greater is better" convention, so MSE
    # is exposed negated; flip the sign back before taking the square root.
    neg_mse_scores = cross_val_score(model, x_data, y_target,
                                     scoring='neg_mean_squared_error', cv=cv)
    rmse_scores = np.sqrt(-1 * neg_mse_scores)
    avg_rmse = np.mean(rmse_scores)
    print(model.__class__.__name__)
    print('평균 rmse : ', np.round(avg_rmse, 4))
    return avg_rmse
# Benchmark several tree-based regressors with the shared CV-RMSE helper.
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Same seed everywhere; 1000 estimators for every ensemble model, while the
# single decision tree is capped at depth 4 to limit overfitting.
dt_reg = DecisionTreeRegressor(random_state=0, max_depth=4)
rf_reg = RandomForestRegressor(random_state=0, n_estimators=1000)
gb_reg = GradientBoostingRegressor(random_state=0, n_estimators=1000)
xgb_reg = XGBRegressor(random_state=0, n_estimators=1000)
lgb_reg = LGBMRegressor(random_state=0, n_estimators=1000)

models = [dt_reg, rf_reg, gb_reg, xgb_reg, lgb_reg]

# Each call prints the model's class name and its average CV RMSE.
for candidate in models:
    get_model_cv_prediction(candidate, x_data, y_target)
여러 모델을 테스트해 보았습니다.
XGBoost 부스팅 모델의 성능이 가장 우수하게 나왔습니다.