경사하강법

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Build 100 noisy samples scattered around the line y = 4x + 6.
# The seed is fixed so the same points are drawn on every run.
np.random.seed(8)
x = np.random.randn(100, 1) * 2
y = (4 * x + 6) + np.random.randn(100, 1)
plt.scatter(x, y)

<matplotlib.collections.PathCollection at 0x7f6bcf5bb850>

y = 4x + 6 근사

np.random.randn => 표준정규분포에서 난수를 생성합니다. 인자 (100, 1)은 생성할 배열의 형태(100행 1열)를 지정합니다.

def get_cost(y, y_pred):
    """Return the sum of squared differences divided by ``len(y)``.

    Every element of ``y - y_pred`` is squared and summed; the total is then
    divided by the length of ``y`` (its number of rows for a 2-D array).
    """
    n_samples = len(y)
    residuals = y - y_pred
    return np.square(residuals).sum() / n_samples

실제값(y)과 예측값(y_pred)의 차이를 제곱해 평균한 값, 즉 평균제곱오차(MSE)를 계산해주는 함수입니다.

np.square => 제곱 해주는 함수

def get_weight_updates(w1, w0, x, y, learning_rate=0.01):
    """Compute one gradient-descent step for the model ``x . w1.T + w0``.

    The cost is the mean squared error between ``y`` and the prediction.
    The returned values are the gradients already multiplied by
    ``learning_rate``; the caller subtracts them from the current weights.

    Parameters
    ----------
    w1 : ndarray, shape (1, n_features)
        Current slope coefficients.
    w0 : ndarray, shape (1, 1)
        Current intercept.
    x : ndarray, shape (n_samples, n_features)
        Input features.
    y : ndarray, shape (n_samples, 1)
        Target values.
    learning_rate : float, default 0.01
        Step size applied to the gradients.

    Returns
    -------
    tuple of ndarray
        ``(w1_update, w0_update)``; ``w1_update`` has the same shape as
        ``w1`` and ``w0_update`` has shape (1, 1).
    """
    N = len(y)

    # Prediction with the current weights, and the residuals against y.
    y_pred = np.dot(x, w1.T) + w0
    diff = y - y_pred

    # Column of ones so the intercept gradient can be written as a dot product.
    w0_factors = np.ones((N, 1))

    # d(MSE)/d(w1) = -(2/N) * x.T . diff. The dot product has shape
    # (n_features, 1); it is transposed so the update lines up with w1
    # instead of broadcasting to (n_features, n_features) when subtracted.
    w1_update = -(2 / N) * learning_rate * np.dot(x.T, diff).T
    # d(MSE)/d(w0) = -(2/N) * sum(diff)
    w0_update = -(2 / N) * learning_rate * np.dot(w0_factors.T, diff)

    return w1_update, w0_update

비용 함수를 w1, w0로 각각 편미분한 값에 학습률(learning_rate)을 곱해 w1, w0의 업데이트 값을 계산합니다.

np.zeros_like(w1) => w1값과 같은 형태에 값은 0인 행렬 생성

np.dot(a, b) => 두 배열의 행렬 곱(내적)을 계산

def gradient_descent_steps(x, y, iters=10000, learning_rate=0.01):
    """Fit ``y ~ w1 * x + w0`` with full-batch gradient descent.

    Both weights start at zero. Each iteration computes the update from all
    of ``x`` and ``y`` via ``get_weight_updates`` and subtracts it from the
    current weights.

    Parameters
    ----------
    x, y : ndarray
        Training inputs and targets, passed through to
        ``get_weight_updates`` unchanged.
    iters : int, default 10000
        Number of update steps to run.
    learning_rate : float, default 0.01
        Step size forwarded to ``get_weight_updates``.

    Returns
    -------
    tuple of ndarray
        ``(w1, w0)``, each of shape (1, 1).
    """
    w0 = np.zeros((1, 1))
    w1 = np.zeros((1, 1))

    for _ in range(iters):
        w1_update, w0_update = get_weight_updates(
            w1, w0, x, y, learning_rate=learning_rate
        )
        w1 = w1 - w1_update
        w0 = w0 - w0_update

    return w1, w0

위 두 함수를 통해 w1, w0 값을 지속적으로 업데이트하여 최적의 값에 도달하게 합니다.

# Fit the line with 1000 full-batch gradient-descent iterations and report
# the learned slope (w1) and intercept (w0).
w1, w0 = gradient_descent_steps(x,y, 1000)
print('w1 :', np.round(w1[0,0],4), 'w0 :', np.round(w0[0,0],4))
# w0 is a (1, 1) array, so it broadcasts against x when added here.
y_pred = w1[0,0] * x + w0
print('편차제곱평균:', np.round(get_cost(y, y_pred),4))

w1 : 3.9974 w0 : 5.9649
편차제곱평균: 1.1967

# Draw the data points and overlay the fitted regression line.
plt.scatter(x,y)
plt.plot(x,y_pred)

[<matplotlib.lines.Line2D at 0x7f6bcf10f110>]

경사하강법을 이용해 회귀선이 잘 만들어졌습니다.

다만 데이터의 개수가 100개보다 훨씬 많아지면 전체 데이터로 계수를 업데이트하는 데 시간이 매우 오래 걸립니다.

그 때문에 실전에서는 대부분 (미니배치)확률적 경사 하강법을 이용합니다.

이 방식은 전체 데이터가 아닌 일부 데이터로 계수를 업데이트 하기 때문에 속도가 상대적으로 빠릅니다.

이를 구현해보겠습니다.

def stochastic_gradient_descent_steps(x, y, batch_size=10, iters=1000,
                                      learning_rate=0.01):
    """Fit ``y ~ w1 * x + w0`` with mini-batch stochastic gradient descent.

    Both weights start at zero. Each iteration draws ``batch_size`` rows at
    random, computes the update from that sample via ``get_weight_updates``,
    and subtracts it from the current weights.

    Parameters
    ----------
    x, y : ndarray
        Training inputs and targets; rows are sampled along axis 0.
    batch_size : int, default 10
        Number of rows used per update.
    iters : int, default 1000
        Number of update steps to run.
    learning_rate : float, default 0.01
        Step size forwarded to ``get_weight_updates``.

    Returns
    -------
    tuple of ndarray
        ``(w1, w0)``, each of shape (1, 1).
    """
    w0 = np.zeros((1, 1))
    w1 = np.zeros((1, 1))
    n_samples = x.shape[0]

    for ind in range(iters):
        # A private RandomState seeded with the iteration index draws the
        # same permutation as np.random.seed(ind) followed by
        # np.random.permutation, but leaves the global RNG state untouched.
        rng = np.random.RandomState(ind)
        batch_index = rng.permutation(n_samples)[:batch_size]
        sample_x = x[batch_index]
        sample_y = y[batch_index]

        w1_update, w0_update = get_weight_updates(
            w1, w0, sample_x, sample_y, learning_rate=learning_rate
        )
        w1 = w1 - w1_update
        w0 = w0 - w0_update

    return w1, w0

np.random.permutation(x.shape[0]) => 0부터 x.shape[0]-1 까지의 정수를 무작위로 섞은 인덱스 배열을 반환함. 이 중 앞의 batch_size 개를 뽑아 샘플링에 사용합니다.

앞 함수와 바뀐 부분은 x, y를 샘플링해서 넣는다는 점 입니다.

# Fit the line with 1000 mini-batch SGD iterations (default batch_size).
w1, w0 = stochastic_gradient_descent_steps(x,y,iters=1000)
print('w1:', np.round(w1[0,0],3), 'w0:', np.round(w0[0,0],4))

# Predict on the full data set so the cost is comparable to the full-batch run.
y_pred = w1[0,0] * x + w0

print('편차제곱 평균:', np.round(get_cost(y,y_pred),4))

w1: 4.006 w0: 5.9135
편차제곱 평균: 1.1996

편차제곱 평균 값이 전체 x,y를 투입했을때와 큰 차이가 없습니다.

그러므로 계산 속도가 훨씬 빠른 미니배치 경사하강법을 많이 사용합니다.

단순 선형 회귀(보스턴 주택 가격)

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.datasets import load_boston
%matplotlib inline

# NOTE(review): load_boston was removed in scikit-learn 1.2, so this cell
# presumably needs an older scikit-learn — confirm the installed version.
boston = load_boston()

# Put the feature matrix into a DataFrame with the data set's column names.
bostonDF = pd.DataFrame(boston.data, columns = boston.feature_names)

# Append the regression target as the PRICE column.
bostonDF['PRICE'] = boston.target
print('보스턴 데이터 세트 크기:', bostonDF.shape)
bostonDF.head()

보스턴 데이터 세트 크기: (506, 14)

사이킷런에 내장되어있는 보스턴 주택 데이터를 불러왔습니다.

# Print the column dtypes and non-null counts of the DataFrame.
bostonDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  PRICE    506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB

결측값은 없으며 모든 피처가 float 형 입니다.

# Draw a 2 x 4 grid of panels, one per selected feature, each showing the
# feature against PRICE as a scatter plot with a fitted regression line.
lm_features = ['RM','ZN', 'INDUS','NOX','AGE','PTRATIO','LSTAT','RAD']
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(16, 8))
for i, feature in enumerate(lm_features):
    # Fill the grid row by row: panel i lands at (i // 4, i % 4).
    row, col = divmod(i, 4)
    sns.regplot(x=feature, y='PRICE', data=bostonDF, ax=axs[row, col])

sns.regplot(x,y) => x,y 산점도와 함께 회귀직선을 그려줌.

plt.subplots(ncols = , nrows= ) 여러개의 그림을 그릴 수 있게 해줌.

RM과 LSTAT 변수가 가장 PRICE 변수와 연관성이 있어보입니다.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Separate the PRICE target from the remaining feature columns.
y_target = bostonDF['PRICE']
x_data = bostonDF.drop(columns=['PRICE'])

# Hold out 30% of the rows for testing; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_target, test_size=0.3, random_state=156
)

# Fit ordinary least squares on the training split, then predict the test split.
lr = LinearRegression().fit(x_train, y_train)
y_preds = lr.predict(x_test)

# Evaluate the test-set predictions with MSE, RMSE and R^2.
mse = mean_squared_error(y_test, y_preds)
rmse = np.sqrt(mse)

print('mse :', np.round(mse,4), ', rmse :', np.round(rmse, 4))
print('결정계수:', np.round(r2_score(y_test, y_preds), 4))

mse : 17.2969 , rmse : 4.159
결정계수: 0.7572

결정계수가 약 0.76으로, 모델이 주택 가격의 변동을 어느 정도 설명해 주는 모습입니다.

# Show the fitted intercept and the coefficients rounded to one decimal place.
print('절편 값:',lr.intercept_)
print('회귀 계수값:', np.round(lr.coef_,1))

절편 값: 40.995595172164755
회귀 계수값: [ -0.1   0.1   0.    3.  -19.8   3.4   0.   -1.7   0.4  -0.   -0.9   0.
  -0.6]

# Label each rounded coefficient with its feature name and list them from
# largest to smallest.
coeff = pd.Series(data=np.round(lr.coef_, 1), index = x_data.columns)
coeff.sort_values(ascending=False)

RM          3.4
CHAS        3.0
RAD         0.4
ZN          0.1
B           0.0
TAX        -0.0
AGE         0.0
INDUS       0.0
CRIM       -0.1
LSTAT      -0.6
PTRATIO    -0.9
DIS        -1.7
NOX       -19.8
dtype: float64

변수 이름과 추정 회귀 계수를 맵핑 시킨 모습입니다.

NOX 변수의 계수가 -19.8로, 다른 변수에 비해 절댓값이 매우 큰 음수로 나타납니다.

from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the full data set. The scorer returns negated
# MSE values, so they are sign-flipped before taking the square root.
neg_mse_scores = cross_val_score(
    lr, x_data, y_target, cv=5, scoring='neg_mean_squared_error'
)
rmse_scores = np.sqrt(-neg_mse_scores)
avg_rmse = rmse_scores.mean()

# The first line prints the raw (negative) scores returned by the scorer.
print('mse scores', np.round(neg_mse_scores,2))
print('rmse scores', np.round(rmse_scores, 2))
print('평균 rmse score:', np.round(avg_rmse,2))

mse scores [-12.46 -26.05 -33.07 -80.76 -33.31]
rmse scores [3.53 5.1  5.75 8.99 5.77]
평균 rmse score: 5.83

5개의 폴드 세트를 이용한 교차검증 입니다.

scoring = 'neg_mean_squared_error' 를 사용하는 이유: 사이킷런의 scoring 은 값이 클수록 좋은 모델로 평가하는데, mse 는 값이 작을수록 좋은 지표입니다.

그러므로 음수를 붙여서 보정해준다고 생각하면 좋습니다.

다음에는 다항회귀, 릿지/라쏘 회귀 부분을 공부하겠습니다.

	CRIM	ZN	INDUS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT	PRICE
0	0.00632	18.0	2.31	0.538	6.575	65.2	4.0900	1.0	296.0	15.3	396.90	4.98	24.0
1	0.02731	0.0	7.07	0.469	6.421	78.9	4.9671	2.0	242.0	17.8	396.90	9.14	21.6
2	0.02729	0.0	7.07	0.469	7.185	61.1	4.9671	2.0	242.0	17.8	392.83	4.03	34.7
3	0.03237	0.0	2.18	0.458	6.998	45.8	6.0622	3.0	222.0	18.7	394.63	2.94	33.4
4	0.06905	0.0	2.18	0.458	7.147	54.2	6.0622	3.0	222.0	18.7	396.90	5.33	36.2