[머신러닝 가이드] 5-3 다양한 회귀
# --- Load the breast-cancer data, standardize, and split ----------------
import pandas as pd
import matplotlib.pyplot as plt
# NOTE: '%matplotlib inline' is an IPython magic, not valid Python syntax;
# it breaks this file when run as a plain .py script. Kept as a comment —
# re-enable it only inside a Jupyter notebook.
# %matplotlib inline
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()

# Standardize every feature to zero mean / unit variance — linear models
# such as logistic regression are sensitive to feature scale.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(cancer.data)

# 70/30 train-test split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    data_scaled, cancer.target, test_size=0.3, random_state=0)
평균이 0, 분산이 1이 되도록 데이터를 표준화(스케일 변환)했습니다.
로지스틱 회귀는 선형 회귀 방식을 응용한 기법으로, 데이터의 분포(스케일)에 영향을 많이 받습니다.
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np

# Fit a logistic-regression classifier on the scaled training split.
lr_clf = LogisticRegression()
lr_clf.fit(x_train, y_train)
lr_preds = lr_clf.predict(x_test)

# BUG FIX: ROC-AUC must be computed from the positive-class probability,
# not from the hard 0/1 predictions — passing predict() output collapses
# the ROC curve to a single threshold and misstates the score.
lr_pred_proba = lr_clf.predict_proba(x_test)[:, 1]

print('정확도 :', np.round(accuracy_score(y_test, lr_preds), 4))
print('roc 커브 :', np.round(roc_auc_score(y_test, lr_pred_proba), 4))
from sklearn.model_selection import GridSearchCV

# Grid-search over the regularization type and inverse strength C.
# BUG FIX: LogisticRegression's default 'lbfgs' solver supports only the
# 'l2' penalty, so the 'l1' candidates raise ValueError at fit time in
# modern scikit-learn; 'liblinear' supports both penalties.
params = {'penalty': ['l2', 'l1'],
          'C': [0.01, 0.1, 1, 5, 10]}
grid_clf = GridSearchCV(LogisticRegression(solver='liblinear'),
                        param_grid=params, scoring='accuracy', cv=3)
grid_clf.fit(data_scaled, cancer.target)
print('최적 파라미터 : ', grid_clf.best_params_, '최적 평균 정확도', grid_clf.best_score_)
최적 파라미터는 l2 규제(릿지 회귀)로, C(알파의 역수)가 1일 때입니다.
# --- Random-forest regression on the Boston housing data ----------------
# BUG FIX: sklearn.datasets.load_boston was deprecated in scikit-learn 1.0
# and removed in 1.2; fetch the same dataset from OpenML instead.
from sklearn.datasets import fetch_openml
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np

# 506 rows, 13 features; the target is the median house price (MEDV).
boston = fetch_openml(name='boston', version=1, as_frame=True)
# OpenML marks CHAS/RAD as categorical; coerce every column to numeric so
# the regressor sees the same all-numeric matrix load_boston used to return.
bostonDF = boston.data.apply(pd.to_numeric)
bostonDF['PRICE'] = pd.to_numeric(boston.target)

y_target = bostonDF['PRICE']
x_data = bostonDF.drop(['PRICE'], axis=1, inplace=False)

# 5-fold CV RMSE for a 1000-tree random forest. scikit-learn only exposes
# MSE as a "greater is better" *negative* scorer, hence the sign flip
# before taking the square root.
rf = RandomForestRegressor(random_state=0, n_estimators=1000)
neg_mse_scores = cross_val_score(rf, x_data, y_target,
                                 scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-1 * neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

print('mse score : ', np.round(neg_mse_scores, 4))
print('rmse score : ', np.round(rmse_scores, 4))
print('평균 rmse score : ', np.round(avg_rmse, 4))
랜덤 포레스트 회귀입니다. 평균 RMSE 값은 4.42로 꽤 좋은 수치입니다.
def get_model_cv_prediction(model, x_data, y_target, cv=5):
    """Print the model's class name and its cross-validated average RMSE.

    Parameters
    ----------
    model : scikit-learn compatible regressor (implements fit/predict).
    x_data : feature matrix.
    y_target : regression target.
    cv : int, number of cross-validation folds (default 5; was hard-coded).

    Returns
    -------
    float
        The average RMSE across the folds (also printed).
    """
    # scikit-learn scorers follow a "greater is better" convention, so MSE
    # is exposed negated; flip the sign back before taking the square root.
    neg_mse_scores = cross_val_score(model, x_data, y_target,
                                     scoring='neg_mean_squared_error', cv=cv)
    rmse_scores = np.sqrt(-1 * neg_mse_scores)
    avg_rmse = np.mean(rmse_scores)
    print(model.__class__.__name__)
    print('평균 rmse : ', np.round(avg_rmse, 4))
    return avg_rmse
# Benchmark several tree-based regressors with the shared CV-RMSE helper.
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Same seed everywhere; 1000 estimators for every ensemble model, while the
# single decision tree is capped at depth 4 to limit overfitting.
dt_reg = DecisionTreeRegressor(random_state=0, max_depth=4)
rf_reg = RandomForestRegressor(random_state=0, n_estimators=1000)
gb_reg = GradientBoostingRegressor(random_state=0, n_estimators=1000)
xgb_reg = XGBRegressor(random_state=0, n_estimators=1000)
lgb_reg = LGBMRegressor(random_state=0, n_estimators=1000)

models = [dt_reg, rf_reg, gb_reg, xgb_reg, lgb_reg]

# Each call prints the model's class name and its average CV RMSE.
for candidate in models:
    get_model_cv_prediction(candidate, x_data, y_target)
여러 모델을 테스트해 보았습니다.
XGBoost 부스팅 모델의 성능이 가장 우수하게 나왔습니다.