구글 드라이브 연동 / 기초 패키지 설치

# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Directory holding the data files; later cells append a CSV file name to it,
# so the trailing slash is required.
path = '/content/drive/MyDrive/boostcamp/' # data directory

Mounted at /content/drive

import numpy as np 
import pandas as pd
import matplotlib as mpl

import matplotlib.pyplot as plt

가장 기본적인 그래프 실습

# Canvas for the whole picture; figsize is (width, height) in inches.
fig = plt.figure(figsize=(12, 7))
fig.set_facecolor('lightgray')  # figure background colour

# add_subplot(nrows, ncols, index): a grid of one row and three columns.
# `aspect` changes the axis scaling ratio of an individual Axes.
ax1 = fig.add_subplot(1, 3, 1, aspect=3)    # 1st cell
ax2 = fig.add_subplot(1, 3, 2, aspect=1.2)  # 2nd cell
ax3 = fig.add_subplot(1, 3, 3)              # 3rd cell

levels = (1, 2, 3)

# Left panel: three ways to name a colour (single letter, colour name, hex code).
for level, shade in zip(levels, ('r', 'forestgreen', '#000000')):
    ax1.plot([level] * 3, color=shade)

# Middle panel: each line carries a label so legend() has entries to show.
for level in levels:
    ax2.plot([level] * 3, label=str(level))

# Right panel: default styling.
for level in levels:
    ax3.plot([level] * 3)

ax2.set_title('Basic Plot')      # title
ax2.set_xticks(list(range(5)))   # x tick positions 0..4
ax2.legend()                     # draw the legend from the labels above
ax2.grid()                       # grid lines

ax3.set_xticks([0, 1, 2])
ax3.set_xticklabels(['zero', 'one', 'two'])  # text shown at each x tick
ax3.text(x=1, y=2, s='This is Text')         # free text at data coordinates

plt.show()

# fig.savefig('file_names', dpi = 150) # save as png; a higher dpi gives a higher resolution

Figure는 큰 틀(언제나 1개), Ax는 각 플롯이 들어가는 공간(N개) 입니다.

# Fix the seed so the 20 random points are reproducible.
np.random.seed(970725)
x, y = np.random.rand(20), np.random.rand(20)

# Square figure with a single Axes; aspect=1 keeps x and y on the same scale.
fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111, aspect=1)

ax.scatter(x, y)
ax.set(xlim=(0, 1.05), ylim=(0, 1.05))

plt.show()

aspect 옵션을 사용하면 x, y의 스케일을 맞출 수 있습니다.

student 자료를 이용한 bar 시각화

# Load the student performance table from the data directory.
student = pd.read_csv(path + 'StudentsPerformance.csv')
# Five randomly chosen rows; as the last expression this is the cell's output.
student.sample(5)

head 보다 sample을 보면 데이터를 더 잘 파악할 수 있습니다.

# include='all' summarises every column, not only the numeric ones.
student.describe(include='all')

describe 함수를 사용할 때 include='all' 옵션을 사용하면 범주형 변수도 나타나고, unique 등 특징도 알 수 있어 좋습니다.

# Count of each race/ethnicity group per gender, ordered by index.
group = (
    student.groupby('gender')['race/ethnicity']
    .value_counts()
    .sort_index()
)

# Two panels side by side; sharey=True makes both use the same y axis.
fig, axes = plt.subplots(1, 2, figsize=(15, 7), sharey=True)
left_ax, right_ax = axes

male_counts = group['male']
female_counts = group['female']

left_ax.bar(male_counts.index, male_counts, color='royalblue')
right_ax.bar(female_counts.index, female_counts,
             color='tomato', alpha=0.9)  # alpha: opacity
plt.show()

sharey = True 옵션을 사용하면 두 그래프의 Y 축을 맞춰줄 수 있어 좋은 시각화를 할 수 있습니다.

# Reverse the index order and turn the counts into per-group shares.
group = group.sort_index(ascending=False)
total = group['male'] + group['female']  # head count of each group
male_share = group['male'] / total
female_share = group['female'] / total

fig, ax = plt.subplots(1, 1, figsize=(12, 7))

# Horizontal stacked bars: the female segment starts where the male one ends.
ax.barh(male_share.index, male_share, color='royalblue')
ax.barh(female_share.index, female_share, left=male_share, color='tomato')

ax.set_xlim(0, 1)

# Hide all four border lines of the Axes.
for side in ('top', 'bottom', 'left', 'right'):
    ax.spines[side].set_visible(False)

plt.show()

퍼센트를 반영한 가로방향 그래프입니다. ax.spines['top'].set_visible(False) 으로 외곽선을 없앨 수도 있습니다.

# Per-gender mean and standard deviation of the numeric score columns.
# The numeric columns are selected explicitly: aggregating the string columns
# together with them raises in recent pandas versions instead of silently
# dropping them.
numeric_cols = list(student.select_dtypes(include='number').columns)
by_gender = student.groupby('gender')[numeric_cols]
score = by_gender.mean().T
score_var = by_gender.std().T  # NOTE: standard deviations, despite the name

fig, ax = plt.subplots(1, 1, figsize=(10, 10))

idx = np.arange(len(score.index))  # one slot per subject
width=0.3

# Male bars sit half a bar-width to the left of each slot.
ax.bar(idx-width/2, score['male'], 
       color='royalblue',
       width=width, # bar width
       label='Male',
       yerr=score_var['male'], # error bars from the standard deviation
       capsize=10
      )

# Female bars sit half a bar-width to the right of each slot.
ax.bar(idx+width/2, score['female'], 
       color='tomato',
       width=width, # bar width
       label='Female',
       yerr=score_var['female'], # error bars from the standard deviation
       capsize=10,  # caps at the ends of the error bars
       edgecolor='black', # bar outline colour
       linewidth=2, # bar outline thickness
      )

ax.set_xticks(idx)
ax.set_xticklabels(score.index)
ax.set_ylim(0, 100)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

ax.legend()
ax.set_title('Gender / Score', fontsize=20)
ax.set_xlabel('Subject', fontweight='bold')
ax.set_ylabel('Score', fontweight='bold')

plt.show()

오차막대, 박스 겉 테두리선 등 신경써서 꾸민 그래프입니다. width는 폭을 조정하고, yerr에 표준편차 값을 넣어주면 오차막대가 생깁니다.

capsize는 오차막대 위, 아래에 선을 추가해줍니다. edgecolor는 박스 테두리의 색을 정합니다. linewidth는 그 테두리의 두께를 정합니다.

bar 그래프 관련 기타 참고 사항으로 잉크양과 실제 값은 비례하는 것이 좋습니다. 값의 시작은 0부터 진행되면 좋겠군요.

또 사람의 눈은 2차원을 가장 잘 인식하기 때문에 필요성이 떨어지는 복잡함은 지양하는 것이 좋겠습니다.

주식 자료를 이용한 line 시각화

line 시각화는 주로 시계열 자료에서 추세를 나타내고자 할 때 쓰입니다.

추세가 중요한 경우 bar 그래프와 달리 꼭 0을 시작점으로 잡을 필요는 없습니다.

# Seven reproducible random values plotted against 0..6.
np.random.seed(97)
x = np.arange(7)
y = np.random.rand(7)

# The three basic properties of a line: colour, marker and line style.
# linestyle accepts `solid`, `dashed`, `dashdot`, `dotted` or `None`.
line_look = dict(color='black', marker='*', linestyle='solid')

fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.plot(x, y, **line_look)

plt.show()

선 그래프에는 크게 색, 마커, 선의 종류 3가지 요소가 존재합니다.

# Load the price table and index it by its parsed `date` column.
stock = pd.read_csv(path + 'prices.csv')
stock['date'] = pd.to_datetime(stock['date'], format='%Y-%m-%d', errors='raise')
stock = stock.set_index("date")

# One frame per ticker symbol.
ticker = stock['symbol']
apple = stock[ticker == 'AAPL']
google = stock[ticker == 'GOOGL']

# Five random GOOGL rows; as the last expression this is the cell's output.
google.sample(5)

미국 주식 자료입니다. 추세가 있는 시계열 자료이기 때문에 line 시각화가 어울립니다.

# 20-row moving average of the numeric columns. The string `symbol` column is
# left out explicitly: recent pandas versions raise on non-numeric columns in a
# rolling aggregation instead of silently dropping them.
google_rolling = google.select_dtypes(include='number').rolling(window=20).mean()

# Two stacked panels. dpi sets the figure resolution (default 100) and
# sharex=True makes both panels use the same x axis.
fig, axes = plt.subplots(2, 1, figsize=(12, 7), dpi=300, sharex=True)

axes[0].plot(google.index, google['close'])                  # raw closing price
axes[1].plot(google_rolling.index, google_rolling['close'])  # moving average

plt.show()

우선 dpi는 해상도를 조절하는 옵션입니다. 저는 잘 모르겠는데 값을 높이면 확실히 또렷하다고 합니다.

이런 노이즈가 많이 껴있는 시계열 자료는 추세를 잘 보여주기 위해 이동평균 기법을 사용합니다.

fig = plt.figure(figsize=(12, 5))

# One full period sampled at 1000 points.
x = np.linspace(0, 2*np.pi, 1000)
y1 = np.sin(x)
y2 = np.cos(x)

# (label, values, colour) for each curve.
curves = (
    ('sin', y1, '#1ABDE9'),
    ('cos', y2, '#F36E8E'),
)

ax = fig.add_subplot(111, aspect=1)

for _name, values, shade in curves:
    ax.plot(x, values, color=shade, linewidth=2)

# Label each curve just right of its last point instead of using a legend.
for name, values, shade in curves:
    ax.text(x[-1]+0.1, values[-1], s=name, fontweight='bold',
            va='center', ha='left',
            bbox=dict(boxstyle='round,pad=0.3', fc=shade, ec='black', alpha=0.3))

# Hide the top and right border lines.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.show()

line 그래프는 위와 같이 범례를 따로 만드는 것 보다 선 끝에 무슨 항목인지 표시해 주는 것이 가독성이 좋습니다.

텍스트

text

fig, ax = plt.subplots()
ax.set(xlim=(0, 1), ylim=(0, 1))

# Font properties: size, weight, family, colour, style and line spacing.
font_look = dict(
    fontsize=20,
    fontweight='bold',
    fontfamily='serif',
    color='royalblue',
    fontstyle='italic',
    linespacing=2,
)

# Placement relative to the anchor point.
placement = dict(
    va='center',            # vertical alignment: top, bottom, center
    ha='center',            # horizontal alignment: left, right, center
    rotation='horizontal',  # text direction: horizontal or vertical
)

# bbox takes a dict; it supports more keys than the ones used here.
box_look = dict(boxstyle='round', facecolor='wheat', alpha=0.4)

ax.text(0.5, 0.5, 'Text\nis Important', **font_look, **placement, bbox=box_look)

# Text attached to the figure rather than to the Axes.
fig.text(x=0.2, y=0.9, s='Figure Text')

plt.show()

기본적으로 text는 크게 4개 요소가 존재합니다.

크기(fontsize)
굵기(fontweight)
글꼴(fontfamily)
스타일(fontstyle)

이 외에는 bbox나 정렬, 그 외 세부사항으로 나뉩니다.

텍스트 작성에는 fig, ax 두 곳에서 모두 가능하며 fig는 비율로, ax는 좌표로 인지합니다.

Color

데이터를 구분 짓는데 시각적으로 가장 강력하고 효과적인 것이 색 입니다. 기존 정보에서 사용하는 색을 사용하면(네이버 - 초록색, 카카오 - 노란색) 설명력이 크게 높아집니다.

색은 RGB 보다는 HSL로 이해하는 것이 좋습니다.

H : Hue(색조) : 빨강부터 보라까지를 0~360으로 표현
S : Saturation(채도) : 선명도(파스텔톤 설명 가능)
L : Lightness(광도) : 밝기(연하다 진하다)

하지만 너무 화려한 색을 쓰는 것 보단 하고 싶은 말을 강조하는 쪽으로 쓰는 것이 중요합니다.

student = pd.read_csv(path + 'StudentsPerformance.csv')

# Emphasis colour for 'group A', muted colour for every other group.
a_color, nota_color = 'black', 'lightgray'

# One colour per student row, used by the scatter plots.
colors = student['race/ethnicity'].apply(lambda x : a_color if x =='group A' else nota_color)

fig = plt.figure(figsize=(18, 15))
groups = student['race/ethnicity'].value_counts().sort_index()

# One colour per bar, derived from the actual group names so the highlight
# stays on 'group A' whatever the number or order of groups.
color_bars = [a_color if name == 'group A' else nota_color for name in groups.index]

# Top half of the figure: head count per group.
ax_bar = fig.add_subplot(2, 1, 1)
ax_bar.bar(groups.index, groups, color=color_bars, width=0.5)

# Bottom half: three pairwise score scatter plots.
ax_s1 = fig.add_subplot(2, 3, 4)
ax_s2 = fig.add_subplot(2, 3, 5)
ax_s3 = fig.add_subplot(2, 3, 6)

ax_s1.scatter(student['math score'], student['reading score'], color=colors, alpha=0.5)
ax_s2.scatter(student['math score'], student['writing score'], color=colors, alpha=0.5)
ax_s3.scatter(student['writing score'], student['reading score'], color=colors, alpha=0.5)

# Same axis limits on all three scatter plots.
for ax in [ax_s1, ax_s2, ax_s3]:
    ax.set_xlim(-2, 105)
    ax.set_ylim(-2, 105)

plt.show()

위와 같이 색을 말하고 싶은 정보를 강조하는 방법으로 사용하면 효과가 좋습니다.

기타

math_scores = student['math score']
reading_scores = student['reading score']
math_mean = math_scores.mean()
reading_mean = reading_scores.mean()

fig, ax = plt.subplots(figsize=(10, 10))
ax.set_aspect(1)

# Dashed guide lines at the two means (axvspan would shade a band instead).
guide_look = dict(color='gray', linestyle='--')
ax.axvline(math_mean, **guide_look)
ax.axhline(reading_mean, **guide_look)

# Highlight the students who are above the mean in both subjects.
above_both = (math_scores > math_mean) & (reading_scores > reading_mean)
point_colors = ['royalblue' if flag else 'gray' for flag in above_both]

ax.scatter(x=math_scores, y=reading_scores,
           alpha=0.5,
           color=point_colors,
           zorder=10,
          )

ax.set_xlabel('Math')
ax.set_ylabel('Reading')

ax.set_xlim(-3, 103)
ax.set_ylim(-3, 103)
plt.show()

axvline 라는 선을 그리는 함수를 추가해서 내가 원하는 영역을 강조할 수 있습니다.

fig = plt.figure(figsize=(12, 6))

# Left cell keeps the default look for comparison; the right cell is customised.
_ = fig.add_subplot(1,2,1)
ax = fig.add_subplot(1,2,2)

# Hide the top and right border lines.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

# Thicken the left line, then move the two remaining lines to the centre.
ax.spines['left'].set_linewidth(1.5)
for side in ('left', 'bottom'):
    ax.spines[side].set_position('center')

plt.show()

각 선도 보이지 않게 할 수도(set_visible), 굵기를 두껍게 할 수도(set_linewidth), 아예 이동 할 수도(set_position) 있습니다.

# Change the global defaults: lines become thicker and dotted, and figures are
# rendered at 150 dpi.
plt.rcParams.update({
    'lines.linewidth': 2,
    'lines.linestyle': ':',
    'figure.dpi': 150,
})
# plt.rcParams.update(plt.rcParamsDefault) # restores the default settings
plt.plot([1,2,3])

[<matplotlib.lines.Line2D at 0x7f791d531c10>]

plt 자체에서 디폴트 설정을 바꿀 수도 있습니다.

# Put every rc setting back to its default value.
mpl.rcParams.update(mpl.rcParamsDefault)

# List the style sheets that can be used, then switch to ggplot.
print(plt.style.available)
plt.style.use('ggplot')

plt.plot([1, 2, 3])

['Solarize_Light2', '_classic_test_patch', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']

[<matplotlib.lines.Line2D at 0x7f791cf18210>]

아예 스타일을 바꾸는 방법도 있습니다. 강의 내용을 빌리면 ggplot 이쁘다고 하네요.

느낀점

빨간날이 하나 껴있지만 하나도 체감을 못했습니다. 월요일에 늦잠잔거 정도?

열심히 달리긴 했지만 시각화 강의 실습을 제대로 체험해본 느낌은 아니어서 정말 아쉽습니다.

강의가 좋고 실습 코드도 다시 한번 리마인드 하면 좋을 것 같습니다.

다음주에도 열심히 화이팅 해보겠습니다.

** 위 수식과 그림은 부스트캠프 AI Tech 교육 자료를 참고하였습니다.

	gender	race/ethnicity	parental level of education	lunch	test preparation course	math score	reading score	writing score
884	female	group E	associate's degree	standard	none	51	51	54
332	male	group E	associate's degree	standard	completed	62	56	53
936	male	group A	associate's degree	standard	none	67	57	53
814	female	group C	high school	standard	none	72	80	83
616	female	group E	bachelor's degree	standard	none	37	45	38

	gender	race/ethnicity	parental level of education	lunch	test preparation course	math score	reading score	writing score
count	1000	1000	1000	1000	1000	1000.00000	1000.000000	1000.000000
unique	2	5	6	2	2	NaN	NaN	NaN
top	female	group C	some college	standard	none	NaN	NaN	NaN
freq	518	319	226	645	642	NaN	NaN	NaN
mean	NaN	NaN	NaN	NaN	NaN	66.08900	69.169000	68.054000
std	NaN	NaN	NaN	NaN	NaN	15.16308	14.600192	15.195657
min	NaN	NaN	NaN	NaN	NaN	0.00000	17.000000	10.000000
25%	NaN	NaN	NaN	NaN	NaN	57.00000	59.000000	57.750000
50%	NaN	NaN	NaN	NaN	NaN	66.00000	70.000000	69.000000
75%	NaN	NaN	NaN	NaN	NaN	77.00000	79.000000	79.000000
max	NaN	NaN	NaN	NaN	NaN	100.00000	100.000000	100.000000

	symbol	open	close	low	high	volume
date
2011-06-23	GOOGL	482.130003	480.220020	473.729995	482.860016	9593700.0
2010-12-08	GOOGL	591.970006	590.539980	583.690025	592.519993	3510200.0
2015-02-06	GOOGL	531.010010	533.880005	528.650024	540.219971	2146900.0
2011-05-16	GOOGL	526.310023	518.420033	516.400021	527.270000	5910400.0
2014-11-05	GOOGL	566.789978	555.950012	554.150024	566.900024	1645300.0