Introduction
- Dataset
- Data loader
- Model
- Train
- Data Parallelism
 
1. Dataset
- Define a custom dataset by subclassing torch.utils.data.Dataset: __len__ returns the number of samples and __getitem__ returns one (feature, label) pair.
 
import torch.utils.data as data_utl

class Dataset(data_utl.Dataset):
    def __init__(self):
        # Read data and preprocess input features with labels
        self.features = []
        self.labels = []

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]
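As a minimal runnable sketch (not from the original), the dataset can be populated with random 256-dimensional features to match the classifier defined below; ToyDataset, the sample count, and the class count are illustrative assumptions:
 
import torch
import torch.utils.data as data_utl

class ToyDataset(data_utl.Dataset):
    def __init__(self, num_samples=1000, feature_dim=256, num_classes=4):
        # Random features and integer class labels as stand-ins for real data
        self.features = torch.randn(num_samples, feature_dim)
        self.labels = torch.randint(0, num_classes, (num_samples,))

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]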
2. DataLoader
- Split the Dataset into a train and a validation set with random_split, then wrap each split in a DataLoader.
 
import torch
from torch.utils.data import DataLoader, random_split

# Create Dataset using torch.utils.data.Dataset
dataset = Dataset()
# Split Dataset into train and validation sets
num_train = int(len(dataset) * 0.9)
train_set, val_set = random_split(dataset, [num_train, len(dataset) - num_train])
# train_loader, val_loader
batch_size = 192 * torch.cuda.device_count()  # the batch will be divided across the GPUs (assumes at least one GPU is visible)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)  # no need to shuffle validation data
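To sanity-check the loaders, you can pull a single batch and inspect its shape (a quick illustrative check, not part of the original):
 
inputs, labels = next(iter(train_loader))
# with one GPU: inputs is [192, 256] and labels is [192]
print(inputs.shape, labels.shape)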
3. Model
- Simple Linear Classifier
 
import torch.nn as nn
import torch.nn.functional as F

class Classifier(nn.Module):
    def __init__(self, num_classes):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(256, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 32)
        self.fc5 = nn.Linear(32, num_classes)

    def forward(self, x):
        # Four hidden layers with ReLU, then a linear output layer (logits)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = self.fc5(x)
        return x
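A quick forward-pass shape check (illustrative; the 4-class, batch-of-8 numbers are assumptions, not from the original):
 
import torch

model = Classifier(num_classes=4)
x = torch.randn(8, 256)   # a batch of 8 feature vectors
print(model(x).shape)     # torch.Size([8, 4])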
4. Train
- Wrap the model in nn.DataParallel when more than one GPU is available, then train with CrossEntropyLoss and the Adam optimizer.
 
import time

device = "cuda"
# Setup Model
model = Classifier(num_classes=len(config.CATEGORY_PRED1))
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)
model.to(device)
model.train()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.LEARNING_RATE)
for epoch in range(config.TRAIN_EPOCHS):
    # train model
    start = time.time()
    for step, batch in enumerate(train_loader):
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()  # clear gradients accumulated in the previous step
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
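The val_loader built earlier is never consumed above; a minimal validation pass, run once per epoch inside the loop, might look like the following sketch (an assumption, not from the original):
 
model.eval()
correct, total = 0, 0
with torch.no_grad():  # no gradients needed for evaluation
    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        preds = model(inputs).argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
print(f"val accuracy: {correct / total:.4f}")
model.train()  # back to training mode for the next epoch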
5. Data Parallelism
- nn.DataParallel: you can simply use multiple GPUs with nn.DataParallel, which performs the following steps (a sketch of these internals follows the snippet below):
- nn.parallel.replicate(module, device_ids): your model is replicated on each device
- nn.parallel.scatter(input, device_ids): the batch from your DataLoader is divided across the GPUs, so each replica receives batch_size / num_gpus samples
- nn.parallel.gather(outputs, output_device): the outputs from the replicated models on each device are gathered on the output device
 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = myModel()  # any nn.Module
# with multiple GPUs, wrap the model in DataParallel so each batch is split across them
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)
model.to(device)
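To make the replicate/scatter/gather steps concrete, here is a rough sketch of what DataParallel does on each forward pass; it assumes at least two visible GPUs, and uses nn.parallel.parallel_apply to run each replica on its own chunk:
 
import torch
import torch.nn as nn

device_ids = list(range(torch.cuda.device_count()))
module = Classifier(num_classes=4).to(device_ids[0])   # illustrative model
inputs = torch.randn(8, 256).to(device_ids[0])         # one mini-batch

# 1. replicate: copy the model onto every device
replicas = nn.parallel.replicate(module, device_ids)
# 2. scatter: split the batch into batch_size / num_gpus chunks, one per device
chunks = nn.parallel.scatter(inputs, device_ids)
# 3. run each replica on its chunk in parallel
outputs = nn.parallel.parallel_apply(replicas, chunks)
# 4. gather: collect the per-device outputs back onto one device
result = nn.parallel.gather(outputs, device_ids[0])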