Introduction
- Dataset
- Data loader
- Model
- Train
- Data Parallelism
 
1. Dataset
- Define a custom dataset by subclassing torch.utils.data.Dataset: __len__ returns the number of samples and __getitem__ returns one (feature, label) pair.
 
import torch.utils.data as data_utl

class Dataset(data_utl.Dataset):
    def __init__(self):
        # Read data and preprocess input features with labels
        self.features = []
        self.labels = []

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]
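As a minimal runnable sketch (not from the original), the dataset can be populated with random 256-dimensional features to match the classifier defined below; ToyDataset, the sample count, and the class count are illustrative assumptions:
 
import torch
import torch.utils.data as data_utl

class ToyDataset(data_utl.Dataset):
    def __init__(self, num_samples=1000, feature_dim=256, num_classes=4):
        # Random features and integer class labels as stand-ins for real data
        self.features = torch.randn(num_samples, feature_dim)
        self.labels = torch.randint(0, num_classes, (num_samples,))

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]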
2. DataLoader
- Split the Dataset into a train and a validation set with random_split, then wrap each split in a DataLoader.
 
import torch
from torch.utils.data import DataLoader, random_split

# Create Dataset using torch.utils.data.Dataset
dataset = Dataset()
# Split Dataset into train and validation sets
num_train = int(len(dataset) * 0.9)
train_set, val_set = random_split(dataset, [num_train, len(dataset) - num_train])
# train_loader, val_loader
batch_size = 192 * torch.cuda.device_count()  # the batch will be divided across the GPUs (assumes at least one GPU is visible)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)  # no need to shuffle validation data
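To sanity-check the loaders, you can pull a single batch and inspect its shape (a quick illustrative check, not part of the original):
 
inputs, labels = next(iter(train_loader))
# with one GPU: inputs is [192, 256] and labels is [192]
print(inputs.shape, labels.shape)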
3. Model
- Simple Linear Classifier
 
import torch.nn as nn
import torch.nn.functional as F

class Classifier(nn.Module):
    def __init__(self, num_classes):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(256, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 32)
        self.fc5 = nn.Linear(32, num_classes)

    def forward(self, x):
        # Four hidden layers with ReLU, then a linear output layer (logits)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = self.fc5(x)
        return x
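A quick forward-pass shape check (illustrative; the 4-class, batch-of-8 numbers are assumptions, not from the original):
 
import torch

model = Classifier(num_classes=4)
x = torch.randn(8, 256)   # a batch of 8 feature vectors
print(model(x).shape)     # torch.Size([8, 4])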
4. Train
- Wrap the model in nn.DataParallel when more than one GPU is available, then train with CrossEntropyLoss and the Adam optimizer.
 
import time

device = "cuda"
# Setup Model
model = Classifier(num_classes=len(config.CATEGORY_PRED1))
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)
model.to(device)
model.train()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.LEARNING_RATE)
for epoch in range(config.TRAIN_EPOCHS):
    # train model
    start = time.time()
    for step, batch in enumerate(train_loader):
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()  # clear gradients accumulated in the previous step
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
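The val_loader built earlier is never consumed above; a minimal validation pass, run once per epoch inside the loop, might look like the following sketch (an assumption, not from the original):
 
model.eval()
correct, total = 0, 0
with torch.no_grad():  # no gradients needed for evaluation
    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        preds = model(inputs).argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
print(f"val accuracy: {correct / total:.4f}")
model.train()  # back to training mode for the next epoch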
5. Data Parallelism
- nn.DataParallel: you can simply use multiple GPUs with nn.DataParallel, which performs the following steps (a sketch of these internals follows the snippet below):
- nn.parallel.replicate(module, device_ids): your model is replicated on each device
- nn.parallel.scatter(input, device_ids): the batch from your DataLoader is divided across the GPUs, so each replica receives batch_size / num_gpus samples
- nn.parallel.gather(outputs, output_device): the outputs from the replicated models on each device are gathered on the output device
 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = myModel()  # any nn.Module
# with multiple GPUs, wrap the model in DataParallel so each batch is split across them
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)
model.to(device)
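To make the replicate/scatter/gather steps concrete, here is a rough sketch of what DataParallel does on each forward pass; it assumes at least two visible GPUs, and uses nn.parallel.parallel_apply to run each replica on its own chunk:
 
import torch
import torch.nn as nn

device_ids = list(range(torch.cuda.device_count()))
module = Classifier(num_classes=4).to(device_ids[0])   # illustrative model
inputs = torch.randn(8, 256).to(device_ids[0])         # one mini-batch

# 1. replicate: copy the model onto every device
replicas = nn.parallel.replicate(module, device_ids)
# 2. scatter: split the batch into batch_size / num_gpus chunks, one per device
chunks = nn.parallel.scatter(inputs, device_ids)
# 3. run each replica on its chunk in parallel
outputs = nn.parallel.parallel_apply(replicas, chunks)
# 4. gather: collect the per-device outputs back onto one device
result = nn.parallel.gather(outputs, device_ids[0])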