package singa;
// To start a training job, all we need is a JobProto object.
// It should contain the following fields:
//  - Job Name (name)
//      the name used to identify the job
//  - NeuralNet (neuralnet)
//      the neural network structure, which contains a set of layers
//  - Train One Batch (train_one_batch)
//      the training algorithm
//  - Updater (updater)
//      the protocol for updating parameters on the server side
//  - Cluster Topology (cluster)
//      the distributed topology of workers/servers
//  - Training Steps (train_steps)
//      the number of training iterations
// All other fields/functions are optional, e.g., test, checkpoint.
// Top-level configuration of a training job: the net, the training
// algorithm, the updater, the cluster topology and the run schedule.
message JobProto {
  // job name, e.g., "cifar10-dcnn", "mnist-mlp"
  optional string name = 1;
  // neural net consists of a set of connected layers
  optional NetProto neuralnet = 3;
  // algorithm for computing gradients over one mini-batch
  optional AlgProto train_one_batch = 5;
  // configuration of SGD updater, including learning rate, etc.
  optional UpdaterProto updater = 7;
  // cluster topology conf
  optional ClusterProto cluster = 9;
  // total num of steps for training
  optional int32 train_steps = 16;
  // frequency of displaying training info
  optional int32 disp_freq = 17 [default = 0];
  // GPU device IDs for use; if fewer than workers per procs, some workers run
  // on GPU and the rest run on CPU.
  repeated int32 gpu = 18;

  // frequency of test, e.g., do test every 100 training steps
  optional int32 test_freq = 20 [default = 0];
  // total num of steps for testing all test data;
  // TODO(wangwei): set -1 for test forever
  optional int32 test_steps = 21 [default = 0];
  // frequency of validation, e.g., do validation every 100 training steps
  optional int32 validate_freq = 25 [default = 0];
  // total num of steps for validating all validation data
  optional int32 validate_steps = 26 [default = 0];
  // frequency of checkpoint
  optional int32 checkpoint_freq = 30 [default = 0];

  // for loading checkpoint files to init parameters
  repeated string checkpoint_path = 60;
  // send parameters to servers after training for this num of steps
  optional int32 warmup_steps = 61 [default = 0];
  // display debug info
  optional bool debug = 62 [default = false];
  // reset the version of params loaded from checkpoint file to step
  optional bool reset_param_version = 63 [default = true];
  // set num of threads used by openblas
  optional int32 num_openblas_threads = 64 [default = 1];

  // start checkpoint after this num steps
  optional int32 checkpoint_after = 80 [default = 0];
  // start display after this num steps
  optional int32 disp_after = 81 [default = 0];
  // start test after this num steps
  optional int32 test_after = 82 [default = 0];
  // start validation after this num steps
  optional int32 validate_after = 83 [default = 0];

  // for internal use
  // users typically do not touch following fields

  // resume flag
  optional bool resume = 90 [default = false];
  // last snapshot step
  optional int32 step = 91 [default = 0];
  // job id allocated by zookeeper
  optional int32 id = 92 [default = -1];

  extensions 101 to 200;
}
// Protos used by JobProto
// -----------------------
// Selects and configures the algorithm that computes gradients for one
// mini-batch.
message AlgProto {
  // algorithms calculating gradients for one mini-batch/iteration
  optional AlgType alg = 1 [default = kUserAlg];
  // user defined algorithm
  optional string user_alg = 2;
  // for setting CD fields
  optional CDProto cd_conf = 10;

  extensions 101 to 200;
}
// Structure of the neural net: its layers plus partitioning and unrolling
// options.
message NetProto {
  // layers of the net
  repeated LayerProto layer = 1;
  // partitioning type for parallelism
  optional int32 partition_dim = 20 [default = 0];
  // Each layer corresponds to a group of unrolled layers, used in RNN models
  repeated LayerGroupProto layer_group = 21;
  // unroll length; defaults to 1
  // NOTE(review): presumably the number of unrolled steps for RNN models --
  // confirm against the net construction code.
  optional int32 unroll_len = 22 [default = 1];
}
// A group of layers, referenced by name.
message LayerGroupProto {
  // names of the layers that belong to the same group
  repeated string layer = 1;
}
// Configuration of the parameter updater: which update rule to use, its
// rule-specific settings, and the shared hyper-parameters.
message UpdaterProto {
  // built-in updater type
  optional UpdaterType type = 1 [default = kUserUpdater];
  // user-defined updater type
  optional string user_type = 2;

  // configuration for RMSProp algorithm
  optional RMSPropProto rmsprop_conf = 3;
  // configuration for AdaDelta algorithm
  optional AdaDeltaProto adadelta_conf = 4;
  // configuration for Adam algorithm
  optional AdamProto adam_conf = 5;
  // configuration for AdamMax algorithm
  optional AdamMaxProto adammax_conf = 6;

  // learning rate generator
  optional LRGenProto learning_rate = 11;
  // momentum; defaults to 0
  optional float momentum = 31 [default = 0];
  // weight decay; defaults to 0
  optional float weight_decay = 32 [default = 0];

  // used to avoid divide by 0, i.e. x/(y+delta)
  optional float delta = 35 [default = 0.00000001];

  // lower and upper clipping bounds; both default to 0
  // NOTE(review): presumably applied to gradients -- confirm in the updater.
  optional float clip_low = 36 [default = 0];
  optional float clip_high = 37 [default = 0];

  extensions 101 to 200;
}
// Distributed topology of workers and servers, plus related runtime settings.
message ClusterProto {
  // number of worker groups and server groups
  optional int32 nworker_groups = 1 [default = 1];
  optional int32 nserver_groups = 2 [default = 1];
  // number of workers / servers in each group
  optional int32 nworkers_per_group = 3 [default = 1];
  optional int32 nservers_per_group = 4 [default = 1];
  // number of workers / servers in each process
  optional int32 nworkers_per_procs = 5 [default = 1];
  optional int32 nservers_per_procs = 6 [default = 1];
  // local workspace for checkpoint files and vis files
  //required string workspace = 10;
  optional string workspace = 10;

  // servers and workers in different processes?
  optional bool server_worker_separate = 20 [default = false];

  // sync frequency between server groups
  optional int32 sync_freq = 21 [default = 1];

  // port number used by ZeroMQ
  optional int32 start_port = 60 [default = 6723];
  // share memory space between worker groups in one procs
  optional bool share_memory = 62 [default = true];

  // poll time in milliseconds
  optional int32 poll_time = 81 [default = 100];
}
// Settings for the CD training algorithm (referenced by AlgProto.cd_conf).
message CDProto {
  // number of steps for gibbs sampling
  optional int32 cd_k = 1 [default = 1];
}
// Configuration of a single layer: identity, connections, parameters,
// phase filters, partitioning and the layer-type specific sub-configuration.
message LayerProto {
  // the layer name used for identification
  required string name = 1;
  // source layer names
  repeated string srclayers = 3;
  // parameters, e.g., weight matrix or bias vector
  repeated ParamProto param = 12;
  // Deprecated: use the include field instead.
  // All layers are included in the net structure for the training phase by
  // default. Layers that are not used by the training phase (e.g., a data
  // layer for loading test data) should be removed by setting this field.
  repeated Phase exclude = 15;
  // Phases in which this layer is included. E.g., a data layer for loading
  // test data is not used by training; in that case only the test phase
  // should be listed here.
  repeated Phase include = 14;
  // type of built-in layer
  optional LayerType type = 20 [default = kUserLayer];
  // type of user layer
  optional string user_type = 21;
  // share data and grad blob with the single src layer, e.g., relu layer can
  // share blobs from conv layer. It is useful for saving memory space.
  optional bool share_src_blobs = 22 [default = false];
  // for unrolling layers in RNN model
  optional int32 unroll_len = 23 [default = 1];
  optional int32 unroll_index = 24 [default = 0];
  repeated UnrollConnType unroll_conn_type = 25;
  repeated int32 shift = 26;

  // overrides the partition dimension for neural net
  optional int32 partition_dim = 60 [default = -1];
  // partition id of this layer
  // NOTE(review): presumably an index in [0, num_partitions) -- confirm.
  optional int32 partition_id = 90 [default = 0];
  // num of partitions for this layer
  optional int32 num_partitions = 91 [default = 1];

  // layer specific configuration
  // configuration for input layers, id range [100, 200)
  optional StoreProto store_conf = 100;
  optional DataProto lmdbdata_conf = 190;
  optional MnistProto mnist_conf = 192;
  optional RGBImageProto rgbimage_conf = 193;
  optional DataProto sharddata_conf = 194;
  optional CharRNNProto char_rnn_conf = 195;
  optional OnehotProto onehot_conf = 196;

  // configuration for neuron layers id range [200, 300)
  optional ActivationProto activation_conf = 200;
  optional ConvolutionProto convolution_conf = 201;
  optional DropoutProto dropout_conf = 203;
  optional DummyProto dummy_conf = 204;
  optional InnerProductProto innerproduct_conf = 205;
  optional LRNProto lrn_conf = 206;
  optional PoolingProto pooling_conf = 207;
  optional RBMProto rbm_conf = 209;
  optional ReLUProto relu_conf = 211;
  optional SoftmaxProto softmax_conf = 214;
  optional GRUProto gru_conf = 215;
  optional EmbeddingProto embedding_conf = 216;
  optional BMProto bm_conf = 217;

  // configuration for loss layers, id range [300, 400)
  optional SoftmaxLossProto softmaxloss_conf = 301;

  // configuration for output layers id range [400, 500)
  optional ArgSortProto argsort_conf = 401;

  // configuration for connection layers, id range [501, )
  optional ConcateProto concate_conf = 502;
  optional SliceProto slice_conf = 503;
  optional SplitProto split_conf = 504;
  optional RNNDummyProto rnn_dummy_conf = 505;

  extensions 1001 to 1100;
}
// Configuration of one parameter (e.g., a weight matrix or bias vector).
// weight matrix should be defined before bias vector
// TODO(wangwei): separate conf for diff init method
message ParamProto {
  // used for identifying the same params from diff models and display debug
  // info
  optional string name = 1 [default = ""];
  // for built-in Param
  optional ParamType type = 3 [default = kParam];
  // for user-defined Param
  optional string user_type = 4;

  // how the parameter values are initialized
  optional ParamGenProto init = 5;
  // multiplied on the global learning rate.
  optional float lr_scale = 15 [default = 1];
  // multiplied on the global weight decay.
  optional float wd_scale = 16 [default = 1];

  // name of the owner param from which this param shares the values
  optional string share_from = 60;

  // used internally
  optional int32 id = 90;
  // used internally
  optional int32 owner = 91 [default = -1];
  // partition dimension, -1 for no partition
  optional int32 partition_dim = 92;
  // usually, the program will infer the param shape
  repeated int32 shape = 93;

  extensions 101 to 200;
}
// ---------------------------
// protos for different layers
// ---------------------------
// learning rate generator proto: selects how the learning rate changes over
// training steps and carries the settings of the selected method.
message LRGenProto {
  // change method; the default kUserChange selects a user-defined method
  optional ChangeMethod type = 1 [default = kUserChange];
  // user-defined change method
  optional string user_type = 2;

  // base learning rate; defaults to 0.01
  optional float base_lr = 3 [default = 0.01];

  // settings for each change method
  optional FixedStepProto fixedstep_conf = 40;
  optional StepProto step_conf = 41;
  optional LinearProto linear_conf = 42;
  optional ExponentialProto exponential_conf = 43;
  optional InverseProto inverse_conf = 44;
  optional InverseTProto inverset_conf = 45;

  extensions 101 to 200;
}
// Parameter initialization settings: the init method plus the values used by
// the constant, gaussian and uniform methods.
message ParamGenProto {
  // init method; the default kUserInit selects a user-defined method
  optional InitMethod type = 1 [default = kUserInit];
  // user-defined init method
  optional string user_type = 2;
  // constant init
  optional float value = 3 [default = 1];
  // for gaussian sampling
  optional float mean = 4 [default = 0];
  optional float std = 5 [default = 1];
  // for uniform sampling
  optional float low = 8 [default = -1];
  optional float high = 9 [default = 1];

  extensions 101 to 200;
}
// Activation functions selectable through ActivationProto.type.
enum ActivationType {
  RELU = 1;
  TANH = 3;
  STANH = 4;
}
// Configuration for activation layers (LayerProto.activation_conf).
message ActivationProto {
  // activation function to apply; defaults to RELU
  optional ActivationType type = 1 [default = RELU];
}
// Configuration for one-hot layers (LayerProto.onehot_conf).
message OnehotProto {
  // vocabulary size; defaults to 0
  optional int32 vocab_size = 1 [default = 0];
}
// Configuration for RGB image layers (LayerProto.rgbimage_conf).
message RGBImageProto {
  // scale factor for each pixel
  optional float scale = 1 [default = 1.0];
  // size after cropping
  optional int32 cropsize = 2 [default = 0];
  // mirror the image
  optional bool mirror = 3 [default = false];
  // meanfile path
  optional string meanfile = 4 [default = ""];
}
// Configuration for split layers (LayerProto.split_conf).
message SplitProto {
  // number of splits; defaults to 1
  optional int32 num_splits = 1 [default = 1];
}
// Configuration for store-based input layers (LayerProto.store_conf).
message StoreProto {
  // storage backend and the path to read from
  optional string backend = 1;
  optional string path = 2;
  // field separator; defaults to ","
  optional string separator = 3 [default = ","];
  // mean / std given as files
  optional string mean_file = 4;
  optional string std_file = 5;
  // mean / std given as single values
  optional float mean_value = 6;
  optional float std_value = 7;
  // batch size(s) and shape of the loaded data
  repeated int32 batchsize = 8;
  repeated int32 shape = 9;
  optional bool encoded = 10 [default = false];
  // NOTE(review): presumably the same meaning as DataProto.random_skip
  // (skip [0,random_skip] records) -- confirm.
  optional int32 random_skip = 11 [default = 0];
  optional bool has_label = 12 [default = true];
  optional bool prefetching = 13 [default = false];
}
// Configuration for char-RNN input layers (LayerProto.char_rnn_conf).
message CharRNNProto {
  // path of the input data
  optional string path = 1;
  // path of the vocabulary
  optional string vocab_path = 2;
  // num of chars to read per instance, should = NetProto::unroll_len
  optional int32 unroll_len = 3 [default = 50];
  // batch size; defaults to 1
  optional int32 batchsize = 4 [default = 1];
}
// Configuration for embedding layers (LayerProto.embedding_conf).
message EmbeddingProto {
  // vocabulary size; defaults to 0
  optional int32 vocab_size = 1 [default = 0];
  // feature dimension; defaults to 100
  optional int32 feature_dim = 2 [default = 100];
}
// Configuration for BM layers (LayerProto.bm_conf); currently has no fields.
message BMProto {
}
// Configuration for softmax loss layers (LayerProto.softmaxloss_conf).
message SoftmaxLossProto {
  // computing accuracy against topk results
  optional int32 topk = 1 [default = 1];
  // loss scale factor
  optional float scale = 30 [default = 1];
}
// Configuration for arg-sort output layers (LayerProto.argsort_conf).
message ArgSortProto {
  // keep labels with topk scores
  optional int32 topk = 1 [default = 1];
}
// Configuration for concatenation layers (LayerProto.concate_conf).
message ConcateProto {
  // dimension to concatenate along; defaults to 0
  optional int32 concate_dim = 1 [default = 0];
  // number of concatenations; defaults to 1
  optional int32 num_concates = 2 [default = 1];
}
// Configuration for convolution layers (LayerProto.convolution_conf).
message ConvolutionProto {
  // The number of outputs for the layer
  optional int32 num_filters = 1;
  // the kernel height/width
  optional int32 kernel = 2 [default = 3];
  // The padding height/width
  optional int32 pad = 30 [default = 0];
  // the stride
  optional int32 stride = 31 [default = 1];

  // per-axis kernel, padding and stride
  // NOTE(review): how these interact with kernel/pad/stride above is not
  // visible in this file -- confirm in the layer implementation.
  optional int32 kernel_x = 41 [default = 3];
  optional int32 kernel_y = 42 [default = 3];

  optional int32 pad_x = 44 [default = 0];
  optional int32 pad_y = 45 [default = 0];

  optional int32 stride_x = 47 [default = 1];
  optional int32 stride_y = 48 [default = 1];

  // cudnn workspace size in MB
  // NOTE(review): the field name says "byte" while this comment says MB --
  // confirm the unit against the cudnn layer code.
  optional int32 workspace_byte_limit = 50 [default = 512];
}
// Configuration shared by the lmdb and shard data layers
// (LayerProto.lmdbdata_conf, LayerProto.sharddata_conf).
message DataProto {
  // path to the data file/folder, absolute or relative to the workspace
  required string path = 2;
  // batch size.
  required int32 batchsize = 4;
  // skip [0,random_skip] records
  optional int32 random_skip = 30 [default = 0];
}
// Configuration for MNIST layers (LayerProto.mnist_conf): normalization and
// image distortion settings.
message MnistProto {
  // NOTE(review): norm_a and norm_b are `required`, so they must always be
  // set explicitly and their defaults are not used by fully-initialized
  // messages; consider `optional`.
  // normalization x/norm_a
  required float norm_a = 1 [default = 1];
  // normalization x-norm_b
  required float norm_b = 2 [default = 0];

  // elastic distortion
  optional int32 kernel = 30 [default = 0];
  optional float sigma = 31 [default = 0];
  optional float alpha = 32 [default = 0];
  // rotation or horizontal shearing
  optional float beta = 33 [default = 0];
  // scaling
  optional float gamma = 34 [default = 0];
  // scale to this size as input for deformation
  optional int32 resize = 35 [default = 0];
  optional int32 elastic_freq = 36 [default = 0];
}
// Configuration for dummy layers (LayerProto.dummy_conf).
message DummyProto {
  // NOTE(review): presumably mark the layer as an input / output layer --
  // confirm in the dummy layer implementation.
  optional bool input = 1 [default = false];
  optional bool output = 2 [default = false];
  // shape of data and grad blobs
  repeated int32 shape = 3;
}
// Configuration for RNN dummy layers (LayerProto.rnn_dummy_conf).
message RNNDummyProto {
  // name of the dynamic source layer
  optional string dynamic_srclayer = 1;
  // if shape set, random generate the data blob
  repeated int32 shape = 2;
  // if integer is true, generate integer data
  optional bool integer = 3 [default = false];
  // range of the random generation
  optional float low = 4 [default = 0];
  optional float high = 5 [default = 0];
}
// Message that stores parameters used by DropoutLayer
message DropoutProto {
  // dropout ratio
  optional float dropout_ratio = 30 [default = 0.5];
}
// Configuration for RBM layers (LayerProto.rbm_conf).
message RBMProto {
  // The number of outputs for the layer
  required int32 hdim = 1;
  // whether to have bias terms
  optional bool bias_term = 2 [default = true];
  // use gaussian sampling or not
  optional bool gaussian = 3 [default = false];
}
// Message that stores parameters used by GRULayer
message GRUProto {
  // dimension of hidden state for the layer
  required int32 dim_hidden = 1;
  // use bias vector or not
  optional bool bias_term = 2 [default = true];
}
// Message that stores parameters used by InnerProductLayer
message InnerProductProto {
  // number of outputs for the layer
  required int32 num_output = 1;
  // use bias vector or not
  optional bool bias_term = 30 [default = true];
  // transpose or not
  optional bool transpose = 31 [default = false];
}
// Configuration for LRN layers (LayerProto.lrn_conf).
message LRNProto {
  // local response size
  // NOTE(review): `required`, so it must always be set explicitly and the
  // default is not used by fully-initialized messages; consider `optional`.
  required int32 local_size = 1 [default = 5];
  // scale factor
  optional float alpha = 31 [default = 1.0];
  // exponential number
  optional float beta = 32 [default = 0.75];
  // offset
  optional float knorm = 34 [default = 1.0];
}
// Configuration for pooling layers (LayerProto.pooling_conf).
message PoolingProto {
  // The kernel size (square)
  optional int32 kernel = 1 [default = 3];

  // Available pooling methods.
  enum PoolMethod {
    MAX = 0;
    AVG = 1;
  }

  // The pooling method
  optional PoolMethod pool = 30 [default = MAX];
  // The padding size
  optional uint32 pad = 31 [default = 0];
  // The stride
  optional uint32 stride = 32 [default = 2];

  // per-axis kernel, padding and stride
  // NOTE(review): how these interact with kernel/pad/stride above is not
  // visible in this file -- confirm in the layer implementation.
  optional int32 kernel_x = 41 [default = 3];
  optional int32 kernel_y = 42 [default = 3];

  optional int32 pad_x = 44 [default = 0];
  optional int32 pad_y = 45 [default = 0];

  optional int32 stride_x = 47 [default = 2];
  optional int32 stride_y = 48 [default = 2];
}
// Configuration for ReLU layers (LayerProto.relu_conf).
message ReLUProto {
  // Slope used for negative inputs; defaults to 0.
  // Ref. Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013).
  // Rectifier nonlinearities improve neural network acoustic models.
  // In ICML Workshop on Deep Learning for Audio, Speech, and Language
  // Processing.
  optional float negative_slope = 1 [default = 0];
}
// Configuration for slice layers (LayerProto.slice_conf).
message SliceProto {
  // dimension to slice along; defaults to 0
  optional int32 slice_dim = 1 [default = 0];
  // number of slices; defaults to 1
  optional int32 num_slices = 2 [default = 1];
}
// Configuration for softmax layers (LayerProto.softmax_conf).
message SoftmaxProto {
  // Can be used to do softmax over each channel of one image by setting it to
  // be the size of the second dimension (the first dimension is batchsize).
  optional int32 num_softmax_per_instance = 1 [default = 1];
}
// Settings for the RMSProp updater (UpdaterProto.rmsprop_conf).
message RMSPropProto {
  // history=history*rho_+(1-rho_)*(grad*grad_scale);
  required float rho = 1;
}
// Settings for the AdaDelta updater (UpdaterProto.adadelta_conf).
message AdaDeltaProto {
  // NOTE(review): `required`, so it must always be set explicitly and the
  // default is not used by fully-initialized messages; consider `optional`.
  required float rho = 1 [default = 0.9];
}
// Settings for the Adam updater (UpdaterProto.adam_conf).
message AdamProto {
  // NOTE(review): both fields are `required`, so they must always be set
  // explicitly and their defaults are not used by fully-initialized
  // messages; consider `optional`.
  required float beta1 = 1 [default = 0.9];
  required float beta2 = 2 [default = 0.999];
}
// Settings for the AdamMax updater (UpdaterProto.adammax_conf).
message AdamMaxProto {
  // NOTE(review): both fields are `required`, so they must always be set
  // explicitly and their defaults are not used by fully-initialized
  // messages; consider `optional`.
  required float beta1 = 1 [default = 0.9];
  required float beta2 = 2 [default = 0.999];
}
// Settings for the fixed-step learning rate schedule
// (LRGenProto.fixedstep_conf).
message FixedStepProto {
  // step thresholds, paired by index with step_lr
  repeated int32 step = 28;
  // lr = step_lr[i] if current step >= step[i]
  repeated float step_lr = 29;
}
// Settings for the step learning rate schedule (LRGenProto.step_conf):
// lr = base_lr * gamma^(step/change_freq)
message StepProto {
  // decay factor in the formula above
  // NOTE(review): `required`, so it must always be set explicitly and the
  // default is not used by fully-initialized messages; consider `optional`.
  required float gamma = 35 [default = 1];
  // number of steps per change in the formula above
  required int32 change_freq = 40;
}
// Settings for the linear learning rate schedule (LRGenProto.linear_conf):
// lr = (1 - step / freq) * base_lr + (step / freq) * final_lr
message LinearProto {
  // `freq` in the formula above
  required int32 change_freq = 40;
  // `final_lr` in the formula above
  required float final_lr = 39;
}
// Settings for the exponential learning rate schedule
// (LRGenProto.exponential_conf).
message ExponentialProto {
  // lr = base / 2^(step/change_freq)
  required int32 change_freq = 40;
}
// Settings for the inverse-t learning rate schedule
// (LRGenProto.inverset_conf).
message InverseTProto {
  // lr = base_lr / (1+step/final_lr)
  required float final_lr = 39;
}
// Settings for the inverse learning rate schedule (LRGenProto.inverse_conf):
// lr = base_lr*(1+gamma*step)^(-pow)
// NOTE(review): both fields are `required`, so they must always be set
// explicitly and their defaults are not used by fully-initialized messages;
// consider `optional`.
message InverseProto {
  // `gamma` in the formula above
  required float gamma = 1 [default = 1];
  // `pow` in the formula above
  required float pow = 2 [default = 0];
}
// Bounds of a uniform distribution.
message UniformProto {
  // lower bound; defaults to -1
  optional float low = 1 [default = -1];
  // upper bound; defaults to 1
  optional float high = 2 [default = 1];
}
// Mean and standard deviation of a gaussian distribution.
message GaussianProto {
  // mean; defaults to 0
  optional float mean = 1 [default = 0];
  // standard deviation; defaults to 1
  optional float std = 2 [default = 1];
}
// --------------
// All Enum Types
// --------------
// Algorithms for computing gradients over one mini-batch (AlgProto.alg).
enum AlgType {
  // Back-propagation algorithm for feed-forward models, e.g., CNN and RNN
  kBP = 1;
  // Contrastive Divergence algorithm for RBM, DBM, etc.
  kCD = 2;
  // BPTT for training RNN models
  kBPTT = 3;
  // For user defined algorithm.
  kUserAlg = 104;
}
// Built-in layer types (LayerProto.type), grouped by role.
enum LayerType {
  // Input layers
  //  - Load records from file, database
  kCSVInput = 100;
  kImagePreprocess = 101;
  kRecordInput = 103;
  kLMDBData = 190;  // deprecated
  kLabel = 191;  // deprecated
  kMnist = 192;  // deprecated
  kRGBImage = 193;  // deprecated
  kShardData = 194;  // deprecated
  kCharRNN = 195;
  kRNNLabel = 196;
  kOneHot = 197;

  // Neuron layers
  //  - Feature transformation
  kConvolution = 201;
  kCConvolution = 202;
  kDropout = 203;
  kDummy = 204;
  kInnerProduct = 205;
  kLRN = 206;
  kPooling = 207;
  kCPooling = 208;
  kRBMHid = 209;
  kRBMVis = 210;
  kReLU = 211;
  kSTanh = 212;
  kSigmoid = 213;
  kSoftmax = 214;
  kGRU = 215;
  kEmbedding = 216;
  kActivation = 217;
  kBM = 218;

  kCudnnConv = 250;
  kCudnnPool = 251;
  kCudnnLRN = 252;
  kCudnnSoftmax = 253;
  kCudnnActivation = 254;
  kCudnnBM = 255;

  // Loss layers
  //  - Compute objective loss
  kEuclideanLoss = 300;
  kSoftmaxLoss = 301;
  // cudnn v3
  kCudnnSoftmaxLoss = 350;

  // Output layers
  //  - Write results to file, database
  kAccuracy = 400;
  kArgSort = 401;
  kCSVOutput = 402;
  kRecordOutput = 403;
  kCharRNNOutput = 404;

  // Connection layers
  //  - Connect layers when neural net is partitioned
  kBridgeDst = 500;
  kBridgeSrc = 501;
  kConcate = 502;
  kSlice = 503;
  kSplit = 504;
  kRNNDummy = 505;

  // User defined layer
  //  - users should configure user_type
  kUserLayer = 600;
}
// Built-in parameter update rules (UpdaterProto.type).
enum UpdaterType {
  // normal SGD with momentum and weight decay
  kSGD = 1;
  // adaptive subgradient
  kAdaGrad = 2;
  // RMSProp
  kRMSProp = 3;
  // Nesterov first optimal gradient method
  kNesterov = 4;
  // AdaDelta
  kAdaDelta = 5;
  // Adam
  kAdam = 6;
  // AdamMax
  kAdamMax = 7;
  // For user defined updater
  kUserUpdater = 105;
}
// Execution phases. Apart from kUnknown, every value is a distinct power of
// two.
// NOTE(review): presumably so that phases can be combined as bit flags --
// confirm against the code that consumes Phase values.
enum Phase {
  kUnknown = 0;
  kTrain = 1;
  kVal = 2;
  kTest = 4;
  // positive phase for contrastive divergence algorithm
  kPositive = 8;
  // negative phase for contrastive divergence algorithm
  kNegative = 16;
  kForward = 32;
  kBackward = 64;
  kLoss = 128;
  kDeploy = 256;

  // used for aggregate parameter gradients when Param is shared
  kAggGrad = 512;
}
// Kinds of parameter implementation (ParamProto.type).
enum ParamType {
  // built-in Param
  kParam = 0;
  // user-defined Param
  kUser = 103;
}
// Learning rate change methods (LRGenProto.type).
enum ChangeMethod {
  kFixed = 0;
  kInverseT = 1;
  kInverse = 2;
  kExponential = 3;
  kLinear = 4;
  kStep = 5;
  kFixedStep = 6;
  // For user defined change method
  kUserChange = 100;
}
// Parameter initialization methods (ParamGenProto.type).
enum InitMethod {
  // fix the values of all parameters to the constant in the value field
  kConstant = 0;
  // sample gaussian with std and mean
  kGaussian = 1;
  // uniform sampling between low and high
  kUniform = 2;
  // from Toronto Convnet, let a=1/sqrt(fan_in), w*=a after generating from
  // Gaussian distribution
  kGaussianSqrtFanIn = 4;
  // from Toronto Convnet, rectified linear activation, let
  // a=sqrt(3)/sqrt(fan_in), range is [-a, +a]; no need to set value=sqrt(3),
  // the program will multiply it.
  kUniformSqrtFanIn = 5;
  // from Theano MLP tutorial, let a=sqrt(6/(fan_in+fan_out)). for tanh
  // activation, range is [-a, +a], for sigmoid activation, range is
  // [-4a, +4a], put the scale factor to value field.
  // See the Theano MLP tutorial.
  kUniformSqrtFanInOut = 6;

  // For user defined init method
  kUserInit = 101;
}
// Connection patterns between the unrolled copies of a layer and the unrolled
// copies of its source layer (LayerProto.unroll_conn_type).
enum UnrollConnType {
// i-th unrolled layer <- (i - shift)-th src unrolled layer
kUnrollOneToOne = 1;
// i-th unrolled layer <- all src unrolled layers
kUnrollOneToAll = 2;
// i-th unrolled layer <- last unrolled src layer
kUnrollFirstToLast = 3;
// customized connection type defined by src_conn
// NOTE(review): no field named src_conn is visible in this file -- confirm
// where the customized connection is configured.
kUnrollCustomized = 4;