붓꽃 데이터

from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Load the iris dataset and wrap its four measurement columns in a DataFrame.
iris = load_iris()
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
# Append the class label as a 'target' column next to the features.
irisDF = pd.DataFrame(data=iris.data, columns=columns).assign(target=iris.target)
irisDF.head(3)

# Plot sepal length against sepal width, drawing each iris class with its own marker.
markers = ['^', 's', 'o']

for i, marker in enumerate(markers):
    # Rows that belong to class i only.
    class_rows = irisDF[irisDF['target'] == i]
    plt.scatter(
        class_rows['sepal_length'],
        class_rows['sepal_width'],
        marker=marker,
        label=iris.target_names[i],
    )

plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend()
plt.show()

꽃받침 길이(sepal length)를 x축, 꽃받침 너비(sepal width)를 y축으로 두고, 마커 모양으로 붓꽃 품종을 구분했습니다.

파란색 데이터는 y축값 3이상, x축값 6이하인 곳에 일정하게 분포돼 있습니다.

노란색과 초록색 데이터는 이 두 특성으로 구분하기 힘듭니다.

from sklearn.preprocessing import StandardScaler

# Standardize every column except the last one (the 'target' label).
iris_features = irisDF.iloc[:, :-1]
iris_scaled = StandardScaler().fit_transform(iris_features)

타겟 값을 제외한 모든 특성을 평균이 0, 분산이 1이 되도록 표준화했습니다.

PCA방법은 특성의 스케일에 영향을 받기 때문에 동일한 스케일로 변환하는 것이 필수입니다.

from sklearn.decomposition import PCA

# Fit a 2-component PCA on the standardized features, then project them onto it.
pca = PCA(n_components=2).fit(iris_scaled)
iris_pca = pca.transform(iris_scaled)
print(iris_pca.shape)

(150, 2)

4차원 데이터를 2차원 PCA 데이터로 변환하였습니다.

# Wrap the two PCA components in a DataFrame and attach the class label.
pca_columns = ['pca_component_1', 'pca_component_2']
irisDF_pca = pd.DataFrame(data=iris_pca, columns=pca_columns).assign(target=iris.target)
irisDF_pca.head(3)

만들어진 PCA 특성 값으로 데이터 프레임을 만들었습니다.

# Scatter the two PCA components, drawing each iris class with its own marker.
markers = ['^', 's', 'o']

for i, marker in enumerate(markers):
    # Build the class mask from irisDF_pca itself rather than from irisDF, so
    # the selection does not rely on the two frames sharing an identical index.
    class_rows = irisDF_pca[irisDF_pca['target'] == i]
    x_axis_data = class_rows['pca_component_1']
    y_axis_data = class_rows['pca_component_2']
    plt.scatter(x_axis_data, y_axis_data, marker=marker, label=iris.target_names[i])

plt.legend()
plt.xlabel('pca_component_1')
plt.ylabel('pca_component_2')
plt.show()

두 개의 pca 특성 값으로 노란색과 초록색 데이터까지 분류가 가능해집니다.

사실 두 개의 pca 특성 값에 네 개의 특성값이 섞여있다고 볼 수 있는데요.

삼차원 이상의 데이터는 시각화하기 힘들기 때문에, 이렇게 2차원으로 시각화할 수 있는 것이 pca 분석의 장점이라고 할 수 있습니다.

# Print the fraction of total variance explained by each fitted PCA component.
print(pca.explained_variance_ratio_)

[0.72962445 0.22850762]

explained_variance_ratio_ 값은 변환된 각 특성이 전체 변동을 얼마나 설명하는가를 보여줍니다.

두 개의 pca 특성이 약 96%(0.7296 + 0.2285 ≈ 0.958)의 변동을 설명하고 있습니다.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# 3-fold cross-validated accuracy of a random forest on the original iris features.
rcf = RandomForestClassifier(random_state=156)
scores = cross_val_score(
    estimator=rcf, X=iris.data, y=iris.target, scoring='accuracy', cv=3
)
print('개별 정확도 :', scores)
print('평균 정확도 :', scores.mean())

개별 정확도 : [0.98 0.94 0.96]
평균 정확도 : 0.96

기존 4차원 데이터를 랜덤포레스트 기법을 이용해서 교차 검증했습니다.

평균 정확도는 약 96%가 나옵니다.

# Repeat the same 3-fold evaluation using only the two PCA components as features.
pca_x = irisDF_pca.loc[:, ['pca_component_1', 'pca_component_2']]
scores_pca = cross_val_score(
    estimator=rcf, X=pca_x, y=iris.target, scoring='accuracy', cv=3
)
print('개별 정확도 :', scores_pca)
print('평균 정확도 :', scores_pca.mean())

개별 정확도 : [0.88 0.88 0.88]
평균 정확도 : 0.88

PCA기법으로 변환한 데이터를 통해 분석한 결과, 평균 정확도는 약 88%가 나옵니다.

성능이 다소 감소했다고도 볼 수 있습니다.

하지만 특성 수가 절반이 된 걸 생각해보면 원본 데이터의 특성을 상당부분 잘 유지하고 있다고도 볼 수 있습니다.

신용카드 고객 데이터 세트

# Mount Google Drive at /content/drive in the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

import pandas as pd

# header=1: column names are taken from the second row of the 'Data' sheet.
df = pd.read_excel('/content/drive/MyDrive/credit_card.xls', sheet_name='Data', header=1)
# Drop the first column (presumably a row ID -- confirm against the sheet), keeping all rows.
df = df.iloc[:, 1:]
print(df.shape)
df.head(3)

(30000, 24)

24개의 특성과 3만개의 데이터가 있습니다.

# Rename PAY_0 so the PAY_* columns are numbered consistently, and shorten the label name.
df.rename(
    columns={
        'PAY_0': 'PAY_1',
        'default payment next month': 'default',
    },
    inplace=True,
)
# Split into the feature matrix and the label column.
x_features = df.drop(columns='default')
y_target = df['default']

pay_0 다음 pay_2 칼럼이 있어서 pay_1로 이름 변경했습니다.

default.. 칼럼도 길어서 짧게 바꿨습니다.

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Draw an annotated heatmap of the pairwise correlations between the features.
plt.figure(figsize=(14, 14))
corr = x_features.corr()
sns.heatmap(data=corr, annot=True, fmt='.1g')

<matplotlib.axes._subplots.AxesSubplot at 0x7f76d75da490>

상관계수 행렬을 관찰해본 결과 PAY 변수끼리, 또 BILL 변수 끼리 상관계수가 매우 높은 것을 알 수 있습니다.

변수 간 상관이 높아 다중공선성 등의 문제가 생길 수 있기 때문에 PCA 방법으로 조정해보겠습니다.

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Check how much of the six BILL_AMT columns' variance two PCA components capture.
cols_bill = [f'BILL_AMT{i}' for i in range(1, 7)]
print('대상 속성명:', cols_bill)

# Standardize the selected columns, then fit a 2-component PCA on them.
scaler = StandardScaler()
df_cols_scaled = scaler.fit_transform(x_features[cols_bill])
pca = PCA(n_components=2).fit(df_cols_scaled)
print('변동성:', pca.explained_variance_ratio_)

대상 속성명: ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']
변동성: [0.90555253 0.0509867 ]

단 두 개의 pca 특성으로 변동성을 95프로이상 설명할 수 있습니다.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# 3-fold cross-validated accuracy of a 300-tree random forest on the untransformed features.
rcf = RandomForestClassifier(n_estimators=300, random_state=156)
scores = cross_val_score(
    estimator=rcf, X=x_features, y=y_target, scoring='accuracy', cv=3
)

print('개별 정확도:', scores)
print('평균 정확도:', scores.mean())

개별 정확도: [0.8083 0.8196 0.8232]
평균 정확도: 0.8170333333333333

원본 데이터를 그대로 적용했을 때 정확도 입니다.

# Standardize every feature, reduce to 6 PCA components, and score the
# classifier bound to rcf with 3-fold cross-validation.
# NOTE(review): the scaler and PCA are fit on all rows before the folds are
# split, so each fold is scored on data that influenced the transform.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(x_features)

pca = PCA(n_components=6)
df_pca = pca.fit_transform(df_scaled)
scores_pca = cross_val_score(
    estimator=rcf, X=df_pca, y=y_target, scoring='accuracy', cv=3
)

print('개별 정확도:', scores_pca)
print('평균 정확도:', scores_pca.mean())

개별 정확도: [0.7924 0.7969 0.8012]
평균 정확도: 0.7968333333333334

전체 23개의 속성을 6개의 PCA 주성분으로 줄였음에도 정확도가 원본 데이터 대비 크게 떨어지지 않습니다.

이 기법은 최근 컴퓨터 비전 분야에 많이 쓰입니다.

	sepal_length	sepal_width	petal_length	petal_width
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2

	pca_component_1	pca_component_2
0	-2.264703	0.480027
1	-2.080961	-0.674134
2	-2.364229	-0.341908

	LIMIT_BAL	SEX	EDUCATION	MARRIAGE	AGE	PAY_0	PAY_2	PAY_3	PAY_4	PAY_5	PAY_6	BILL_AMT1	BILL_AMT2	BILL_AMT3	BILL_AMT4	BILL_AMT5	BILL_AMT6	PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6	default payment next month
0	20000	2	2	1	24	2	2	-1	-1	-2	-2	3913	3102	689	0	0	0	0	689	0	0	0	0	1
1	120000	2	2	2	26	-1	2	0	0	0	2	2682	1725	2682	3272	3455	3261	0	1000	1000	1000	0	2000	1
2	90000	2	2	2	34	0	0	0	0	0	0	29239	14027	13559	14331	14948	15549	1518	1500	1000	1000	1000	5000	0