Merge pull request #27 from chrishkchris/singa3.1
Update of documentation in distributed training and graph
diff --git a/docs-site/docs/dist-train.md b/docs-site/docs/dist-train.md
index c456440..24d956f 100644
--- a/docs-site/docs/dist-train.md
+++ b/docs-site/docs/dist-train.md
@@ -42,37 +42,63 @@
1. Define the neural network model:
```python
-class CNN:
- def __init__(self):
- self.conv1 = autograd.Conv2d(1, 20, 5, padding=0)
- self.conv2 = autograd.Conv2d(20, 50, 5, padding=0)
- self.linear1 = autograd.Linear(4 * 4 * 50, 500)
- self.linear2 = autograd.Linear(500, 10)
- self.pooling1 = autograd.MaxPool2d(2, 2, padding=0)
- self.pooling2 = autograd.MaxPool2d(2, 2, padding=0)
+class CNN(model.Model):
+
+ def __init__(self, num_classes=10, num_channels=1):
+ super(CNN, self).__init__()
+ self.conv1 = layer.Conv2d(num_channels, 20, 5, padding=0, activation="RELU")
+ self.conv2 = layer.Conv2d(20, 50, 5, padding=0, activation="RELU")
+ self.linear1 = layer.Linear(500)
+ self.linear2 = layer.Linear(num_classes)
+ self.pooling1 = layer.MaxPool2d(2, 2, padding=0)
+ self.pooling2 = layer.MaxPool2d(2, 2, padding=0)
+ self.relu = layer.ReLU()
+ self.flatten = layer.Flatten()
+ self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
def forward(self, x):
y = self.conv1(x)
- y = autograd.relu(y)
y = self.pooling1(y)
y = self.conv2(y)
- y = autograd.relu(y)
y = self.pooling2(y)
- y = autograd.flatten(y)
+ y = self.flatten(y)
y = self.linear1(y)
- y = autograd.relu(y)
+ y = self.relu(y)
y = self.linear2(y)
return y
+ def train_one_batch(self, x, y, dist_option='fp32', spars=0):
+ out = self.forward(x)
+ loss = self.softmax_cross_entropy(out, y)
+
+ # Allow different options for distributed training
+ # See the section "Optimizations for Distributed Training"
+ if dist_option == 'fp32':
+ self.optimizer(loss)
+ elif dist_option == 'fp16':
+ self.optimizer.backward_and_update_half(loss)
+ elif dist_option == 'partialUpdate':
+ self.optimizer.backward_and_partial_update(loss)
+ elif dist_option == 'sparseTopK':
+ self.optimizer.backward_and_sparse_update(loss,
+ topK=True,
+ spars=spars)
+ elif dist_option == 'sparseThreshold':
+ self.optimizer.backward_and_sparse_update(loss,
+ topK=False,
+ spars=spars)
+ return out, loss
+
# create model
model = CNN()
```
-2. Create the `DistOpt` instance:
+2. Create the `DistOpt` instance and attach it to the created model:
```python
sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
sgd = opt.DistOpt(sgd)
+model.set_optimizer(sgd)
dev = device.create_cuda_gpu_on(sgd.local_rank)
```
@@ -116,26 +142,23 @@
A partition of the dataset is returned for this `dev`.
+Here, `world_size` represents the total number of processes in all the nodes you
+are using for distributed training.
+
4. Initialize and synchronize the model parameters among all workers:
```python
-def synchronize(tensor, dist_opt):
- dist_opt.all_reduce(tensor.data)
- tensor /= dist_opt.world_size
-
#Synchronize the initial parameter
tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
+model.compile([tx], is_train=True, use_graph=graph, sequential=True)
...
-out = model.forward(tx)
-loss = autograd.softmax_cross_entropy(out, ty)
-for p, g in autograd.backward(loss):
- synchronize(p, sgd)
+#Use the same random seed for different ranks
+seed = 0
+dev.SetRandSeed(seed)
+np.random.seed(seed)
```
-Here, `world_size` represents the total number of processes in all the nodes you
-are using for distributed training.
-
5. Run BackPropagation and distributed SGD
```python
@@ -149,6 +172,8 @@
loss = autograd.softmax_cross_entropy(out, ty)
# do backpropagation and all-reduce
sgd.backward_and_update(loss)
+ # Train the model
+ out, loss = model(tx, ty)
```
### Execution Instruction
@@ -321,10 +346,18 @@
reduce the communication cost. Refer to the API for `DistOpt` for the
configuration of each strategy.
+When we use `model.Model` to build a model, we need to put the options for
+distributed training in the `train_one_batch` method. Please refer to the
+example code at the top of this page. The code for the options can simply be
+copied and reused in other models.
+
+With the defined options, we can pass the arguments `dist_option` and `spars` when we
+start the training with `model(tx, ty, dist_option, spars)`.
+
### No Optimizations
```python
-sgd.backward_and_update(loss)
+out, loss = model(tx, ty)
```
`loss` is the output tensor from the loss function, e.g., cross-entropy for
@@ -333,7 +366,7 @@
### Half-precision Gradients
```python
-sgd.backward_and_update_half(loss)
+out, loss = model(tx, ty, dist_option = 'fp16')
```
It converts each gradient value to 16-bit representation (i.e., half-precision)
@@ -342,7 +375,7 @@
### Partial Synchronization
```python
-sgd.backward_and_partial_update(loss)
+out, loss = model(tx, ty, dist_option = 'partialUpdate')
```
In each iteration, every rank does the local SGD update. Then, only a chunk of
@@ -351,10 +384,6 @@
### Gradient Sparsification
-```python
-sgd.backward_and_sparse_update(loss)
-```
-
It applies sparsification schemes to select a subset of gradients for
all-reduce. There are two schemes:
@@ -362,14 +391,14 @@
elements selected.
```python
-sgd.backward_and_sparse_update(loss = loss, spars = spars, topK = True)
+out, loss = model(tx, ty, dist_option = 'sparseTopK', spars = spars)
```
- All gradients whose absolute values are larger than the predefined threshold `spars`
are selected.
```python
-sgd.backward_and_sparse_update(loss = loss, spars = spars, topK = False)
+out, loss = model(tx, ty, dist_option = 'sparseThreshold', spars = spars)
```
The hyper-parameters are configured when creating the `DistOpt` instance.
diff --git a/docs-site/docs/graph.md b/docs-site/docs/graph.md
index fb4ba69..abce638 100644
--- a/docs-site/docs/graph.md
+++ b/docs-site/docs/graph.md
@@ -18,59 +18,60 @@
## Example
-The following code illustrates the usage of the `Module` API.
+The following code illustrates the usage of the `Model` API.
-1. Implement the new model as a subclass the Module class.
+1. Implement the new model as a subclass of the `Model` class.
```Python
-class CNN(module.Module):
+class CNN(model.Model):
- def __init__(self, optimizer):
+ def __init__(self, num_classes=10, num_channels=1):
super(CNN, self).__init__()
- # define layers
- self.conv1 = autograd.Conv2d(1, 20, 5, padding=0)
- self.conv2 = autograd.Conv2d(20, 50, 5, padding=0)
- self.linear1 = autograd.Linear(4 * 4 * 50, 500)
- self.linear2 = autograd.Linear(500, 10)
- self.pooling1 = autograd.MaxPool2d(2, 2, padding=0)
- self.pooling2 = autograd.MaxPool2d(2, 2, padding=0)
-
- self.optimizer = optimizer
+ self.conv1 = layer.Conv2d(num_channels, 20, 5, padding=0, activation="RELU")
+ self.conv2 = layer.Conv2d(20, 50, 5, padding=0, activation="RELU")
+ self.linear1 = layer.Linear(500)
+ self.linear2 = layer.Linear(num_classes)
+ self.pooling1 = layer.MaxPool2d(2, 2, padding=0)
+ self.pooling2 = layer.MaxPool2d(2, 2, padding=0)
+ self.relu = layer.ReLU()
+ self.flatten = layer.Flatten()
+ self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
def forward(self, x):
- # define the forward operations
y = self.conv1(x)
- y = autograd.relu(y)
y = self.pooling1(y)
y = self.conv2(y)
- y = autograd.relu(y)
y = self.pooling2(y)
- y = autograd.flatten(y)
+ y = self.flatten(y)
y = self.linear1(y)
- y = autograd.relu(y)
+ y = self.relu(y)
y = self.linear2(y)
return y
- def loss(self, x, ty):
- # define the training loss
- return autograd.softmax_cross_entropy(x, ty)
-
- def optim(self, loss):
- # update the parameters using SGD algorithms
- self.optimizer.backward_and_update(loss)
+ def train_one_batch(self, x, y):
+ out = self.forward(x)
+ loss = self.softmax_cross_entropy(out, y)
+ self.optimizer(loss)
+ return out, loss
```
-2. Create an instance of the model, and do some configurations
+2. Create instances of the model, optimizer, device, etc., and then compile the model
```python
-model = CNN(sgd)
-# set the mode of running the operations:
-# True for training; False for evaluation
-model.train(mode=True)
-# set the device for running the operations
-model.on_device(dev)
-# whether to create the graph or run the operations imperatively
-model.graph(mode=True)
+model = CNN()
+
+# initialize optimizer and attach it to the model
+sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+
+# initialize device
+dev = device.create_cuda_gpu()
+
+# input and target placeholders for the model
+tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
+ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
+
+# compile the model before training
+model.compile([tx], is_train=True, use_graph=True, sequential=False)
```
3. Train the model iteratively
@@ -84,11 +85,8 @@
tx.copy_from_numpy(x)
ty.copy_from_numpy(y)
- # run forward propagation
- out = model(tx)
- loss = model.loss(out, ty)
- # run backward propagation
- model.optim(loss)
+ # Training with one batch
+ out, loss = model(tx, ty)
```
A Google Colab notebook of this example is available
@@ -116,13 +114,19 @@
class
```python
-class MLP(module.Module):
+class MLP(model.Model):
+
+ def __init__(self, data_size=10, perceptron_size=100, num_classes=10):
+ super(MLP, self).__init__()
+ self.linear1 = layer.Linear(perceptron_size)
+ ...
def forward(self, inputs):
- x = autograd.matmul(inputs, self.w0)
+ y = self.linear1(inputs)
...
```
+The `Linear` layer is composed of the `matmul` operator.
`autograd` implements the `matmul` operator by calling the function `Mult`
exposed from CPP via SWIG.
@@ -148,7 +152,7 @@
addition, it also has the information about the blocks (a block is a chunk of
memory for a tensor) to be read and written by this function.
-Once `Module.forward()` has been executed once, all operations are buffered by
+Once `Model.forward()` has been executed once, all operations are buffered by
`Device`. Next, the read/write information of all operations are analyzed to
create the computational graph. For example, if a block `b` is written by one
operation O1 and is later read by another operation O2, we would know O2 depends