
In this post I walk through a complete deep learning workflow in PyTorch, pulling together everything covered so far.

I fine-tune a resnet18 model on the FashionMNIST dataset from torchvision, then run a hyperparameter search with Ray Tune.

The code draws heavily on the BoostCamp advanced assignment.

Installing packages and loading the pretrained model and data

Installing and importing packages

!pip uninstall -y -q pyarrow
!pip install -q -U "ray[tune]"
!pip install -q hyperopt  # HyperOptSearch needs the hyperopt package, if not already present
!pip install -q torchsummary

import torchvision
import torch
from torchsummary import summary

import numpy as np

from ray import tune
from ray.tune.search.hyperopt import HyperOptSearch  # ray.tune.suggest was renamed to ray.tune.search in Ray 2.0
from ray.tune import CLIReporter
import ray

from tqdm.notebook import tqdm
import math

Loading the FashionMNIST dataset

common_transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
fashion_train_transformed = torchvision.datasets.FashionMNIST(root='./fashion', train=True, download=True, transform=common_transform)
fashion_test_transformed = torchvision.datasets.FashionMNIST(root='./fashion', train=False, download=True, transform=common_transform)
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./fashion/FashionMNIST/raw/train-images-idx3-ubyte.gz
Extracting ./fashion/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./fashion/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./fashion/FashionMNIST/raw/train-labels-idx1-ubyte.gz
Extracting ./fashion/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./fashion/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./fashion/FashionMNIST/raw/t10k-images-idx3-ubyte.gz
Extracting ./fashion/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./fashion/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./fashion/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz
Extracting ./fashion/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./fashion/FashionMNIST/raw

fashion_train_transformed[0][0].size()
torch.Size([1, 28, 28])

Each image is single-channel (grayscale), with shape 1×28×28.

fashion_train_transformed.classes
['T-shirt/top',
 'Trouser',
 'Pullover',
 'Dress',
 'Coat',
 'Sandal',
 'Shirt',
 'Sneaker',
 'Bag',
 'Ankle boot']

There are 10 target classes in total.
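
As a quick sanity check, here is a minimal sketch (assuming matplotlib, which ships with Colab) that displays one sample together with its class name:

import matplotlib.pyplot as plt

image, label = fashion_train_transformed[0]  # image is a 1x28x28 tensor
plt.imshow(image.squeeze(0), cmap="gray")    # drop the channel dim for plotting
plt.title(fashion_train_transformed.classes[label])
plt.axis("off")
plt.show()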

Loading the pretrained model

imagenet_resnet18 = torchvision.models.resnet18(pretrained=True)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# torchsummary's summary() prints a clean per-layer overview of the model
summary(imagenet_resnet18.to(device), (3, 224, 224))
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]          36,864
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
       BasicBlock-11           [-1, 64, 56, 56]               0
           Conv2d-12           [-1, 64, 56, 56]          36,864
      BatchNorm2d-13           [-1, 64, 56, 56]             128
             ReLU-14           [-1, 64, 56, 56]               0
           Conv2d-15           [-1, 64, 56, 56]          36,864
      BatchNorm2d-16           [-1, 64, 56, 56]             128
             ReLU-17           [-1, 64, 56, 56]               0
       BasicBlock-18           [-1, 64, 56, 56]               0
           Conv2d-19          [-1, 128, 28, 28]          73,728
      BatchNorm2d-20          [-1, 128, 28, 28]             256
             ReLU-21          [-1, 128, 28, 28]               0
           Conv2d-22          [-1, 128, 28, 28]         147,456
      BatchNorm2d-23          [-1, 128, 28, 28]             256
           Conv2d-24          [-1, 128, 28, 28]           8,192
      BatchNorm2d-25          [-1, 128, 28, 28]             256
             ReLU-26          [-1, 128, 28, 28]               0
       BasicBlock-27          [-1, 128, 28, 28]               0
           Conv2d-28          [-1, 128, 28, 28]         147,456
      BatchNorm2d-29          [-1, 128, 28, 28]             256
             ReLU-30          [-1, 128, 28, 28]               0
           Conv2d-31          [-1, 128, 28, 28]         147,456
      BatchNorm2d-32          [-1, 128, 28, 28]             256
             ReLU-33          [-1, 128, 28, 28]               0
       BasicBlock-34          [-1, 128, 28, 28]               0
           Conv2d-35          [-1, 256, 14, 14]         294,912
      BatchNorm2d-36          [-1, 256, 14, 14]             512
             ReLU-37          [-1, 256, 14, 14]               0
           Conv2d-38          [-1, 256, 14, 14]         589,824
      BatchNorm2d-39          [-1, 256, 14, 14]             512
           Conv2d-40          [-1, 256, 14, 14]          32,768
      BatchNorm2d-41          [-1, 256, 14, 14]             512
             ReLU-42          [-1, 256, 14, 14]               0
       BasicBlock-43          [-1, 256, 14, 14]               0
           Conv2d-44          [-1, 256, 14, 14]         589,824
      BatchNorm2d-45          [-1, 256, 14, 14]             512
             ReLU-46          [-1, 256, 14, 14]               0
           Conv2d-47          [-1, 256, 14, 14]         589,824
      BatchNorm2d-48          [-1, 256, 14, 14]             512
             ReLU-49          [-1, 256, 14, 14]               0
       BasicBlock-50          [-1, 256, 14, 14]               0
           Conv2d-51            [-1, 512, 7, 7]       1,179,648
      BatchNorm2d-52            [-1, 512, 7, 7]           1,024
             ReLU-53            [-1, 512, 7, 7]               0
           Conv2d-54            [-1, 512, 7, 7]       2,359,296
      BatchNorm2d-55            [-1, 512, 7, 7]           1,024
           Conv2d-56            [-1, 512, 7, 7]         131,072
      BatchNorm2d-57            [-1, 512, 7, 7]           1,024
             ReLU-58            [-1, 512, 7, 7]               0
       BasicBlock-59            [-1, 512, 7, 7]               0
           Conv2d-60            [-1, 512, 7, 7]       2,359,296
      BatchNorm2d-61            [-1, 512, 7, 7]           1,024
             ReLU-62            [-1, 512, 7, 7]               0
           Conv2d-63            [-1, 512, 7, 7]       2,359,296
      BatchNorm2d-64            [-1, 512, 7, 7]           1,024
             ReLU-65            [-1, 512, 7, 7]               0
       BasicBlock-66            [-1, 512, 7, 7]               0
AdaptiveAvgPool2d-67            [-1, 512, 1, 1]               0
           Linear-68                 [-1, 1000]         513,000
================================================================
Total params: 11,689,512
Trainable params: 11,689,512
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 62.79
Params size (MB): 44.59
Estimated Total Size (MB): 107.96
----------------------------------------------------------------

Looking at this output, the model's input and output layers both need to be adapted.

For the input: our images have a single channel rather than three, so the first Conv2d layer must be replaced with one that accepts 1 input channel.

For the output: the dataset has 10 labels, so the final layer must shrink from 1000 outputs to 10.

imagenet_resnet18.fc.parameters()
<generator object Module.parameters at 0x7f0721d2c650>
FASHION_INPUT_NUM = 1
FASHION_CLASS_NUM = 10
# Swap out the first and last layers, as described above
imagenet_resnet18.conv1 = torch.nn.Conv2d(FASHION_INPUT_NUM, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
imagenet_resnet18.fc = torch.nn.Linear(in_features=512, out_features=FASHION_CLASS_NUM, bias=True)

# Initialize the new fc layer: Xavier uniform for the weights, a small uniform range for the bias
torch.nn.init.xavier_uniform_(imagenet_resnet18.fc.weight)
stdv = 1. / math.sqrt(imagenet_resnet18.fc.weight.size(1))
imagenet_resnet18.fc.bias.data.uniform_(-stdv, stdv)

# Freeze every parameter, then unfreeze only the first conv layer and the final fc layer,
# so that backprop updates just the two replaced layers. The rest of the model stays "frozen".
for param in imagenet_resnet18.parameters(): 
    param.requires_grad = False

for param in imagenet_resnet18.conv1.parameters():
    param.requires_grad = True

for param in imagenet_resnet18.fc.parameters():
    param.requires_grad = True
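
As a sanity check that the freeze worked, the sketch below counts trainable parameters; only conv1 (1*64*7*7 = 3,136) and fc (512*10 + 10 = 5,130) should remain trainable:

trainable = sum(p.numel() for p in imagenet_resnet18.parameters() if p.requires_grad)
total = sum(p.numel() for p in imagenet_resnet18.parameters())
print(f"trainable params: {trainable:,} / {total:,}")  # expected: 8,266 / 11,175,370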

imagenet_resnet18
ResNet(
  (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (layer2): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): BasicBlock(
      (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (layer3): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv2d(128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): BasicBlock(
      (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (layer4): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): BasicBlock(
      (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(1, 1))
  (fc): Linear(in_features=512, out_features=10, bias=True)
)

The printed model confirms that both layers were replaced correctly.

Hyperparameter tuning with Ray Tune

Components controlled by the hyperparameters

# Returns an Adam optimizer over the model's parameters with the given learning rate
def get_adam_by_learningrate(model, learning_rate:float):
    return torch.optim.Adam(model.parameters(), lr=learning_rate)

# Returns the number of epochs (an identity helper, kept for symmetry with the other getters)
def get_epoch_by_epoch(epoch:int):
    return epoch

# Builds train/test DataLoaders for the given batch size and returns them as a dict
def get_dataloaders_by_batchsize(batch_size:int): 
    BATCH_SIZE = batch_size
    fashion_train_dataloader = torch.utils.data.DataLoader(fashion_train_transformed, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
    fashion_test_dataloader = torch.utils.data.DataLoader(fashion_test_transformed, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

    dataloaders = {
        "train" : fashion_train_dataloader,
        "test" : fashion_test_dataloader
    }

    return dataloaders
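
A quick usage sketch: one batch from the train loader should have shape [batch_size, 1, 28, 28]:

loaders = get_dataloaders_by_batchsize(64)
images, labels = next(iter(loaders["train"]))
print(images.shape, labels.shape)  # torch.Size([64, 1, 28, 28]) torch.Size([64])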

# Search space for the hyperparameters, defined with tune's samplers
config_space = {
    "NUM_EPOCH" : tune.choice([4,5,6,7,8,9]),
    "LearningRate" : tune.uniform(0.0001, 0.001),
    "BatchSize" : tune.choice([32,64,128]),
}
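
To see what a single draw from this space looks like, each tune sampler object exposes a sample() method (as of Ray 2.0); a small illustrative sketch:

sampled = {name: dist.sample() for name, dist in config_space.items()}
print(sampled)  # e.g. {'NUM_EPOCH': 7, 'LearningRate': 0.00042, 'BatchSize': 64}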

Defining the device and loss function

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
loss_fn = torch.nn.CrossEntropyLoss()
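
CrossEntropyLoss applies log-softmax internally, so it expects raw logits and integer class labels; a tiny self-contained example:

dummy_logits = torch.randn(4, 10)           # batch of 4, one raw logit per class
dummy_labels = torch.tensor([0, 3, 9, 1])   # integer class indices
print(loss_fn(dummy_logits, dummy_labels))  # scalar loss tensor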

The training function

def training(config): 
    imagenet_resnet18.to(device)

    # Read the hyperparameters from the config dict:
    # number of epochs, batch size, and learning rate
    NUM_EPOCH = get_epoch_by_epoch(config["NUM_EPOCH"])
    dataloaders = get_dataloaders_by_batchsize(config["BatchSize"])
    optimizer = get_adam_by_learningrate(imagenet_resnet18, config["LearningRate"])

    # Training loop
    best_test_accuracy = 0.
    best_test_loss = 9999.

    for epoch in range(NUM_EPOCH):
        for phase in ["train", "test"]:
            running_loss = 0.
            running_acc = 0.
            if phase == "train":
                imagenet_resnet18.train() # put the model in train mode
            elif phase == "test":
                imagenet_resnet18.eval() # put the model in eval mode

            pbar = tqdm(dataloaders[phase]) # progress bar with a live description
            for ind, (images, labels) in enumerate(pbar):
                # Move the batch to the device (GPU if available)
                images = images.to(device)
                labels = labels.to(device)

                # 1) Reset the accumulated gradients
                optimizer.zero_grad() 

                # Compute gradients only in the train phase; disabling them in test saves computation
                with torch.set_grad_enabled(phase == "train"):
                    # 2) Forward pass: feed the inputs through the model to get logits
                    logits = imagenet_resnet18(images)
                    _, preds = torch.max(logits, 1)

                    # 3) Feed the logits and labels to the loss function
                    loss = loss_fn(logits, labels)

                    if phase == "train":
                        # 4) Backpropagate the loss to compute gradients
                        loss.backward() 

                        # 5) Update the weights with the optimizer using the computed gradients
                        optimizer.step()

                # Accumulate the batch loss
                running_loss += loss.item() * images.size(0)
                # Accumulate the number of correct predictions
                running_acc += torch.sum(preds == labels.data)

                # Use the actual batch size here: the last batch may be smaller than config["BatchSize"]
                accs = torch.sum(preds == labels.data) / images.size(0)

                pbar.set_description(f"epoch : {epoch}, running_loss : {loss.item() * images.size(0)}, running_acc : {accs}")

            # End of epoch: compute epoch-level loss and accuracy
            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_acc / len(dataloaders[phase].dataset)

            # In the test phase, track the best accuracy and loss seen so far
            if phase == "test" and best_test_accuracy < epoch_acc: 
                best_test_accuracy = epoch_acc
            if phase == "test" and best_test_loss > epoch_loss:
                best_test_loss = epoch_loss
    # Report the best metrics back to Tune
    tune.report(accuracy=best_test_accuracy.item(), loss=best_test_loss)

Setting the objective (accuracy) with HyperOptSearch

from ray.tune.search.hyperopt import HyperOptSearch

optim = HyperOptSearch( # search with HyperOptSearch; for other search algorithms see https://docs.ray.io/en/master/tune/api_docs/suggestion.html#bayesopt
    metric='accuracy', # the metric to optimize; this experiment targets test accuracy
    mode="max", # maximize the target objective
)
from ray.tune import CLIReporter
import ray

# Maximum number of trials to run
NUM_TRIAL = 10 

# Reports intermediate results on the command line
reporter = CLIReporter( 
    parameter_columns=["NUM_EPOCH", "LearningRate", "BatchSize"],
    metric_columns=["accuracy", "loss"])

# Shut down any existing Ray session so tune.run starts fresh
ray.shutdown() 

analysis = tune.run(
    training, # the trainable that runs one full experiment
    config=config_space, # search space, passed to the trainable as a dict
    search_alg=optim, # the HyperOptSearch object defined above
    #verbose=1,
    progress_reporter=reporter, # prints intermediate results
    num_samples=NUM_TRIAL,
    resources_per_trial={'gpu': 1}
)

Printing the best-performing parameters

best_trial = analysis.get_best_trial('accuracy', 'max')
print(f"Best config: {best_trial.config}")
print(f"Best test accuracy: {best_trial.last_result['accuracy']}")
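
To inspect all trials rather than just the best one, the analysis object returned by tune.run also exposes a results dataframe (a sketch; the exact column names can vary across Ray versions):

df = analysis.results_df
cols = ["accuracy", "loss", "config/NUM_EPOCH", "config/LearningRate", "config/BatchSize"]
print(df[cols].sort_values("accuracy", ascending=False))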

Reflections

It was great to consolidate everything covered this week into one end-to-end exercise.

The advanced assignment's clear structure and friendly explanations made the material much easier to learn.