| /************************************************************ |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| *************************************************************/ |
| |
| package singa; |
| |
| /* |
| * To start a training job, all we need is a JobProto object. |
| * It should contain following fields |
| * - Job Name (name) |
| * the name to identify the job |
| * - NeuralNet (neuralnet) |
| * the neural network structure contains a set of layers |
| * - Train One Batch (alg) |
| * the training algorithm |
| * - Updater (updater) |
| * the protocol for updating parameters at server side |
| * - Cluster Topology (cluster) |
| * the distributed topology of workers/servers |
| * - Training Steps (train_steps) |
| * the number of training iteration |
| * All other fields/functions are optional, e.g., test, checkpoint |
| */ |
| |
// Top-level configuration of one training job.  The required pieces are
// name, neuralnet, train_one_batch, updater, cluster and train_steps;
// everything else (test, validation, checkpoint, ...) is optional.
message JobProto {
  // job name, e.g., "cifar10-dcnn", "mnist-mlp"
  optional string name = 1;
  // neural net consists of a set of connected layers
  optional NetProto neuralnet = 3;
  // algorithm for computing gradients over one mini-batch
  optional AlgProto train_one_batch = 5;
  // configuration of SGD updater, including learning rate, etc.
  optional UpdaterProto updater = 7;
  // cluster topology conf
  optional ClusterProto cluster = 9;
  // total num of steps for training
  optional int32 train_steps = 16;
  // frequency of displaying training info
  optional int32 disp_freq = 17 [default = 0];
  // GPU device IDs for use, if fewer than workers per procs, some workers run
  // on GPU and the rest run on CPU.
  repeated int32 gpu = 18;

  // frequency of test, e.g., do test every 100 training steps
  optional int32 test_freq = 20 [default = 0];
  // total num of steps for testing all test data;
  // TODO(wangwei): set -1 for test forever
  optional int32 test_steps = 21 [default = 0];
  // frequency of validation, e.g., do validation every 100 training steps
  optional int32 validate_freq = 25 [default = 0];
  // total num of steps for validating all validation data
  optional int32 validate_steps = 26 [default = 0];
  // frequency of checkpoint
  optional int32 checkpoint_freq = 30 [default = 0];

  // for loading checkpoint files to init parameters
  repeated string checkpoint_path = 60;
  // send parameters to servers after training for this num of steps
  optional int32 warmup_steps = 61 [default = 0];
  // display debug info
  optional bool debug = 62 [default = false];
  // reset the version of params loaded from checkpoint file to step
  optional bool reset_param_version = 63 [default = true];
  // set num of threads used by openblas
  optional int32 num_openblas_threads = 64 [default = 1];

  // start checkpoint after this num steps
  optional int32 checkpoint_after = 80 [default = 0];
  // start display after this num steps
  optional int32 disp_after = 81 [default = 0];
  // start test after this num steps
  optional int32 test_after = 82 [default = 0];
  // start validation after this num steps
  optional int32 validate_after = 83 [default = 0];

  // for internal use
  // users typically do not touch following fields

  // resume flag
  optional bool resume = 90 [default = false];
  // last snapshot step
  optional int32 step = 91 [default = 0];
  // job id allocated by zookeeper
  optional int32 id = 92 [default = -1];

  // field numbers 101-200 are reserved for user extensions
  extensions 101 to 200;
}
| |
| // Protos used by JobProto |
| // ----------------------- |
| |
// Configuration of the algorithm that computes gradients over one mini-batch.
message AlgProto {
  // algorithms calculating gradients for one mini-batch/iteration
  optional AlgType alg = 1 [default = kUserAlg];
  // user defined algorithm (see AlgType.kUserAlg)
  optional string user_alg = 2;
  // for setting CD (contrastive divergence) fields
  optional CDProto cd_conf = 10;

  // field numbers 101-200 are reserved for user extensions
  extensions 101 to 200;
}
// Neural net structure: a set of layers, connected via LayerProto.srclayers.
message NetProto {
  // all layers of the net
  repeated LayerProto layer = 1;
  // partitioning type for parallelism
  optional int32 partition_dim = 20 [default = 0];
  // Each layer corresponds to a group of unrolled layers, used in RNN models
  repeated LayerGroupProto layer_group = 21;
  // unroll length for RNN models
  optional int32 unroll_len = 22 [default = 1];
}
| |
// A group of unrolled layers (see NetProto.layer_group), used in RNN models.
message LayerGroupProto {
  // names of the layers belonging to the same group
  repeated string layer = 1;
}
| |
// Configuration of the parameter updater (SGD variant, learning rate, etc.).
message UpdaterProto {
  // built-in updater type
  optional UpdaterType type = 1 [default = kUserUpdater];
  // user-defined updater type
  optional string user_type = 2;

  // configuration for RMSProp algorithm
  optional RMSPropProto rmsprop_conf = 3;
  // configuration for AdaDelta algorithm
  optional AdaDeltaProto adadelta_conf = 4;
  // configuration for Adam algorithm
  optional AdamProto adam_conf = 5;
  // configuration for AdamMax algorithm
  optional AdamMaxProto adammax_conf = 6;

  // learning rate generator
  optional LRGenProto learning_rate = 11;
  // momentum coefficient
  optional float momentum = 31 [default = 0];
  // weight decay (L2 regularization) coefficient
  optional float weight_decay = 32 [default = 0];

  // used to avoid divide by 0, i.e. x/(y+delta)
  optional float delta = 35 [default = 0.00000001];

  // gradient clipping range; presumably inactive when both are 0 -- confirm
  optional float clip_low = 36 [default = 0];
  optional float clip_high = 37 [default = 0];

  // field numbers 101-200 are reserved for user extensions
  extensions 101 to 200;
}
| |
// Distributed topology of workers and servers.
message ClusterProto {
  // num of worker groups
  optional int32 nworker_groups = 1 [default = 1];
  // num of server groups
  optional int32 nserver_groups = 2 [default = 1];
  // num of workers within one worker group
  optional int32 nworkers_per_group = 3 [default = 1];
  // num of servers within one server group
  optional int32 nservers_per_group = 4 [default = 1];
  // num of workers within one process
  optional int32 nworkers_per_procs = 5 [default = 1];
  // num of servers within one process
  optional int32 nservers_per_procs = 6 [default = 1];
  // local workspace for checkpoint files and vis files
  //required string workspace = 10;
  optional string workspace = 10;

  // servers and workers in different processes?
  optional bool server_worker_separate = 20 [default = false];

  // sync frequency between server groups
  optional int32 sync_freq = 21 [default = 1];

  // port number used by ZeroMQ
  optional int32 start_port = 60 [default = 6723];
  // share memory space between worker groups in one procs
  optional bool share_memory = 62 [default = true];

  // poll time in milliseconds
  optional int32 poll_time = 81 [default = 100];
}
| |
// Configuration of the Contrastive Divergence algorithm (AlgType.kCD).
message CDProto {
  // number of steps for gibbs sampling
  optional int32 cd_k = 1 [default = 1];
}
| |
// Configuration of one layer: identification, connectivity, parameters,
// partitioning info and the type-specific *_conf sub-message.
message LayerProto {
  // the layer name used for identification
  required string name = 1;
  // source layer names
  repeated string srclayers = 3;
  // parameters, e.g., weight matrix or bias vector
  repeated ParamProto param = 12;
  // all layers are included in the net structure for training phase by
  // default; layers that are not used by the training phase (e.g., a data
  // layer that loads test data) should be removed by setting the exclude field
  repeated Phase exclude = 15;
  // exclude field is deprecated, please use include field instead!!!
  // e.g., for a data layer that loads test data, only the test phase should
  // be listed in the include field
  repeated Phase include = 14;
  // type of built-in layer
  optional LayerType type = 20 [default = kUserLayer];
  // type of user layer
  optional string user_type = 21;
  // share data and grad blob with the single src layer, e.g., relu layer can
  // share blobs from conv layer. It is useful for saving memory space.
  optional bool share_src_blobs = 22 [default = false];
  // for unrolling layers in RNN model
  optional int32 unroll_len = 23 [default = 1];
  optional int32 unroll_index = 24 [default = 0];
  // connection type per unrolled src layer, see UnrollConnType
  repeated UnrollConnType unroll_conn_type = 25;
  // shift offsets paired with unroll_conn_type (see kUnrollOneToOne)
  repeated int32 shift = 26;

  // overrides the partition dimension for neural net
  optional int32 partition_dim = 60 [default = -1];
  // id of this partition, used internally
  optional int32 partition_id = 90 [default = 0];
  // num of partitions for this layer
  optional int32 num_partitions = 91 [default = 1];

  // layer specific configuration
  // configuration for input layers, id range [100, 200)
  optional StoreProto store_conf = 100;
  optional DataProto lmdbdata_conf = 190;
  optional MnistProto mnist_conf = 192;
  optional RGBImageProto rgbimage_conf = 193;
  optional DataProto sharddata_conf = 194;
  optional CharRNNProto char_rnn_conf = 195;
  optional OnehotProto onehot_conf = 196;

  // configuration for neuron layers id range [200, 300)
  optional ActivationProto activation_conf = 200;
  optional ConvolutionProto convolution_conf = 201;
  optional DropoutProto dropout_conf = 203;
  optional DummyProto dummy_conf = 204;
  optional InnerProductProto innerproduct_conf = 205;
  optional LRNProto lrn_conf = 206;
  optional PoolingProto pooling_conf = 207;
  optional RBMProto rbm_conf = 209;
  optional ReLUProto relu_conf = 211;
  optional SoftmaxProto softmax_conf = 214;
  optional GRUProto gru_conf = 215;
  optional EmbeddingProto embedding_conf = 216;
  optional BMProto bm_conf = 217;

  // configuration for loss layers, id range [300, 400)
  optional SoftmaxLossProto softmaxloss_conf = 301;

  // configuration for output layers id range [400, 500)
  optional ArgSortProto argsort_conf = 401;

  // configuration for connection layers, id range [501, )
  optional ConcateProto concate_conf = 502;
  optional SliceProto slice_conf = 503;
  optional SplitProto split_conf = 504;
  optional RNNDummyProto rnn_dummy_conf = 505;

  // field numbers 1001-1100 are reserved for user extensions
  extensions 1001 to 1100;
}
| |
| // weight matrix should be defined before bias vector |
| // TODO(wangwei): separate conf for diff init method |
// Configuration of one parameter (e.g., a weight matrix or bias vector).
message ParamProto {
  // used for identifying the same params from diff models and display debug
  // info
  optional string name = 1 [default = ""];
  // for built-in Param
  optional ParamType type = 3 [default = kParam];
  // for user-defined Param
  optional string user_type = 4;

  // initialization method (constant/gaussian/uniform, etc.)
  optional ParamGenProto init = 5;
  // multiplied on the global learning rate.
  optional float lr_scale = 15 [default = 1];
  // multiplied on the global weight decay.
  optional float wd_scale = 16 [default = 1];

  // name of the owner param from which this param shares the values
  optional string share_from = 60;

  // used internally
  optional int32 id = 90;
  // used internally
  optional int32 owner = 91 [default = -1];
  // partition dimension, -1 for no partition
  optional int32 partition_dim = 92;
  // usually, the program will infer the param shape
  repeated int32 shape = 93;

  // field numbers 101-200 are reserved for user extensions
  extensions 101 to 200;
}
| |
| // --------------------------- |
| // protos for different layers |
| // --------------------------- |
| // learning rate generator proto |
// Learning rate generator: how the learning rate changes over training steps.
message LRGenProto {
  // built-in or user-defined change method
  optional ChangeMethod type = 1 [default = kUserChange];
  // user-defined change method, used with kUserChange
  optional string user_type = 2;

  // initial learning rate
  optional float base_lr = 3 [default = 0.01];

  // configuration of the corresponding ChangeMethod
  optional FixedStepProto fixedstep_conf = 40;
  optional StepProto step_conf = 41;
  optional LinearProto linear_conf = 42;
  optional ExponentialProto exponential_conf = 43;
  optional InverseProto inverse_conf = 44;
  optional InverseTProto inverset_conf = 45;

  // field numbers 101-200 are reserved for user extensions
  extensions 101 to 200;
}
| |
// Parameter initialization: method plus method-specific settings.
message ParamGenProto {
  // built-in or user-defined init method
  optional InitMethod type = 1 [default = kUserInit];
  // user-defined init method, used with kUserInit
  optional string user_type = 2;
  // constant init
  optional float value = 3 [default = 1];
  // for gaussian sampling
  optional float mean = 4 [default = 0];
  optional float std = 5 [default = 1];
  // for uniform sampling
  optional float low = 8 [default = -1];
  optional float high = 9 [default = 1];

  // field numbers 101-200 are reserved for user extensions
  extensions 101 to 200;
}
| |
// Built-in activation functions for ActivationProto.
enum ActivationType {
  RELU = 1;
  SIGMOID = 2;
  TANH = 3;
  // scaled tanh
  STANH = 4;
}
| |
// Configuration for activation layers (kActivation/kCudnnActivation).
message ActivationProto {
  // which activation function to apply
  optional ActivationType type = 1 [default = RELU];
}
| |
// Configuration for the one-hot encoding layer (kOneHot).
message OnehotProto {
  // size of the vocabulary, i.e., dimension of the one-hot vector
  optional int32 vocab_size = 1 [default = 0];
}
| |
// Configuration for the (deprecated) RGB image input layer (kRGBImage).
message RGBImageProto {
  // scale factor for each pixel
  optional float scale = 1 [default = 1.0];
  // size after cropping
  optional int32 cropsize = 2 [default = 0];
  // mirror the image
  optional bool mirror = 3 [default = false];
  // meanfile path
  optional string meanfile = 4 [default = ""];
}
| |
// Configuration for the split connection layer (kSplit).
message SplitProto {
  // num of replicas of the src blob to produce
  optional int32 num_splits = 1 [default = 1];
}
| |
// Configuration for input layers that read records from a data store.
message StoreProto {
  // backend name of the data store -- TODO confirm valid values against the
  // store implementations
  optional string backend = 1;
  // path to the data file/folder
  optional string path = 2;
  // field separator for text records
  optional string separator = 3 [default = ","];
  // per-feature mean/std loaded from file, for normalization
  optional string mean_file = 4;
  optional string std_file = 5;
  // scalar mean/std, for normalization
  optional float mean_value = 6;
  optional float std_value = 7;
  // mini-batch size
  repeated int32 batchsize = 8;
  // shape of one instance
  repeated int32 shape = 9;
  // whether records are stored encoded (e.g., compressed images)
  optional bool encoded = 10 [default = false];
  // skip [0, random_skip] records at start
  optional int32 random_skip = 11 [default = 0];
  // whether each record carries a label
  optional bool has_label = 12 [default = true];
  // prefetch the next batch while computing
  optional bool prefetching = 13 [default = false];
}
| |
// Configuration for the character-level RNN input layer (kCharRNN).
message CharRNNProto {
  // path to the text corpus
  optional string path = 1;
  // path to the vocabulary file
  optional string vocab_path = 2;
  // num of chars to read per instance, should = NetProto::unroll_len
  optional int32 unroll_len = 3 [default = 50];
  // mini-batch size
  optional int32 batchsize = 4 [default = 1];
}
| |
// Configuration for the embedding lookup layer (kEmbedding).
message EmbeddingProto {
  // size of the vocabulary
  optional int32 vocab_size = 1 [default = 0];
  // dimension of each embedding vector
  optional int32 feature_dim = 2 [default = 100];

}
| |
// Configuration for the batch normalization layer (kBM/kCudnnBM);
// currently has no tunable fields.
message BMProto {
}
| |
// Configuration for the softmax loss layer (kSoftmaxLoss).
message SoftmaxLossProto {
  // computing accuracy against topk results
  optional int32 topk = 1 [default = 1];
  // loss scale factor
  optional float scale = 30 [default = 1];
}
| |
// Configuration for the arg-sort output layer (kArgSort).
message ArgSortProto {
  // keep labels with topk scores
  optional int32 topk = 1 [default = 1];
}
| |
// Configuration for the concatenation connection layer (kConcate).
message ConcateProto {
  // dimension along which src blobs are concatenated
  optional int32 concate_dim = 1 [default = 0];
  // num of src blobs to concatenate
  optional int32 num_concates = 2 [default = 1];
}
| |
// Configuration for convolution layers (kConvolution/kCConvolution/kCudnnConv).
// Use kernel/pad/stride for square settings, or the *_x/*_y variants for
// rectangular ones.
message ConvolutionProto {
  // The number of outputs for the layer
  optional int32 num_filters = 1;
  // the kernel height/width
  optional int32 kernel = 2 [default = 3];
  // The padding height/width
  optional int32 pad = 30 [default = 0];
  // the stride
  optional int32 stride = 31 [default = 1];

  // per-axis kernel size
  optional int32 kernel_x = 41 [default = 3];
  optional int32 kernel_y = 42 [default = 3];

  // per-axis padding
  optional int32 pad_x = 44 [default = 0];
  optional int32 pad_y = 45 [default = 0];

  // per-axis stride
  optional int32 stride_x = 47 [default = 1];
  optional int32 stride_y = 48 [default = 1];

  // cudnn workspace size in MB
  optional int32 workspace_byte_limit = 50 [default = 512];
}
| |
// Configuration for the (deprecated) LMDB/shard data input layers.
message DataProto {
  // path to the data file/folder, absolute or relative to the workspace
  required string path = 2;
  // batch size.
  required int32 batchsize = 4;
  // skip [0,random_skip] records
  optional int32 random_skip = 30 [default = 0];
}
| |
// Configuration for the (deprecated) MNIST input layer (kMnist):
// normalization plus optional elastic-distortion augmentation.
message MnistProto {
  // normalization x/norm_a
  required float norm_a = 1 [default = 1];
  // normalization x-norm_b
  required float norm_b = 2 [default = 0];

  // elastic distortion
  optional int32 kernel = 30 [default = 0];
  optional float sigma = 31 [default = 0];
  optional float alpha = 32 [default = 0];
  // rotation or horizontal shearing
  optional float beta = 33 [default = 0];
  // scaling
  optional float gamma = 34 [default = 0];
  // scale to this size as input for deformation
  optional int32 resize = 35 [default = 0];
  // apply the elastic distortion every this many steps
  optional int32 elastic_freq = 36 [default = 0];
}
| |
// Configuration for the dummy layer (kDummy), used e.g. for testing.
message DummyProto {
  // act as input (generate data) and/or output layer
  optional bool input = 1 [default = false];
  optional bool output = 2 [default = false];
  // shape of data and grad blobs
  repeated int32 shape = 3;
}
| |
// Configuration for the RNN dummy layer (kRNNDummy).
message RNNDummyProto {
  // name of the src layer resolved dynamically at runtime
  optional string dynamic_srclayer = 1;
  // if shape set, random generate the data blob
  repeated int32 shape = 2;
  // if integer is true, generate integer data
  optional bool integer = 3 [default = false];
  // range of the random generation
  optional float low = 4 [default = 0];
  optional float high = 5 [default = 0];
}
| |
| // Message that stores parameters used by DropoutLayer |
// Message that stores parameters used by DropoutLayer
message DropoutProto {
  // dropout ratio, i.e., the probability of dropping a unit
  optional float dropout_ratio = 30 [default = 0.5];
}
| |
// Configuration for RBM layers (kRBMVis/kRBMHid).
message RBMProto {
  required int32 hdim = 1; // The number of outputs for the layer
  optional bool bias_term = 2 [default = true]; // whether to have bias terms
  optional bool gaussian = 3 [default = false]; // use gaussian sampling or not
}
| |
| // Message that stores parameters used by GRULayer |
// Message that stores parameters used by GRULayer
message GRUProto {
  // dimension of hidden state for the layer
  required int32 dim_hidden = 1;
  // use bias vector or not
  optional bool bias_term = 2 [default = true];
}
| |
| |
| // Message that stores parameters used by InnerProductLayer |
// Message that stores parameters used by InnerProductLayer
message InnerProductProto {
  // number of outputs for the layer
  required int32 num_output = 1;
  // use bias vector or not
  optional bool bias_term = 30 [default = true];
  // whether the weight matrix is stored transposed
  optional bool transpose = 31 [default = false];
}
| |
// Configuration for local response normalization layers (kLRN/kCudnnLRN).
message LRNProto {
  // local response size
  required int32 local_size = 1 [default = 5];
  // scale factor
  optional float alpha = 31 [default = 1.0];
  // exponential number
  optional float beta = 32 [default = 0.75];
  // offset
  optional float knorm = 34 [default = 1.0];
}
| |
// Configuration for pooling layers (kPooling/kCPooling/kCudnnPool).
// Use kernel/pad/stride for square settings, or the *_x/*_y variants for
// rectangular ones.
message PoolingProto {
  // The kernel size (square)
  optional int32 kernel = 1 [default = 3];
  enum PoolMethod {
    MAX = 0;
    AVG = 1;
  }
  // The pooling method
  optional PoolMethod pool = 30 [default = MAX];
  // The padding size
  optional uint32 pad = 31 [default = 0];
  // The stride
  optional uint32 stride = 32 [default = 2];

  // per-axis kernel size
  optional int32 kernel_x = 41 [default = 3];
  optional int32 kernel_y = 42 [default = 3];

  // per-axis padding
  optional int32 pad_x = 44 [default = 0];
  optional int32 pad_y = 45 [default = 0];

  // per-axis stride
  optional int32 stride_x = 47 [default = 2];
  optional int32 stride_y = 48 [default = 2];
}
| |
// Configuration for the (leaky) ReLU layer (kReLU).
message ReLUProto {
  // Ref. Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013).
  // Rectifier nonlinearities improve neural network acoustic models.
  // In ICML Workshop on Deep Learning for Audio, Speech, and Language Processing.
  // slope applied to negative inputs; 0 gives the standard ReLU
  optional float negative_slope = 1 [default = 0];
}
| |
// Configuration for the slice connection layer (kSlice).
message SliceProto {
  // dimension along which the src blob is sliced
  optional int32 slice_dim = 1 [default = 0];
  // num of slices to produce
  optional int32 num_slices = 2 [default = 1];
}
| |
// Configuration for the softmax layer (kSoftmax/kCudnnSoftmax).
message SoftmaxProto {
  // Can be used to do softmax over each channel of one image by setting it to
  // be the size of the second dimension (the first dimension is batchsize).
  optional int32 num_softmax_per_instance = 1 [default = 1];
}
| |
// Configuration for the RMSProp updater (UpdaterType.kRMSProp).
message RMSPropProto {
  // decay rate of the squared-gradient history:
  // history=history*rho_+(1-rho_)*(grad*grad_scale);
  required float rho = 1;
}
// Configuration for the AdaDelta updater (UpdaterType.kAdaDelta).
message AdaDeltaProto {
  // decay rate of the accumulated history
  required float rho = 1 [default = 0.9];
}
// Configuration for the Adam updater (UpdaterType.kAdam).
message AdamProto {
  // decay rates for the first and second moment estimates
  required float beta1 = 1 [default = 0.9];
  required float beta2 = 2 [default = 0.999];
}
// Configuration for the AdamMax updater (UpdaterType.kAdamMax).
message AdamMaxProto {
  // decay rates for the first and second moment estimates
  required float beta1 = 1 [default = 0.9];
  required float beta2 = 2 [default = 0.999];
}
| |
// Learning rate schedule with explicit (step, lr) pairs (kFixedStep).
// step and step_lr are parallel lists of the same length.
message FixedStepProto {
  // step thresholds, in increasing order
  repeated int32 step = 28;
  // lr = step_lr[i] if current step >= step[i]
  repeated float step_lr = 29;
}
| |
// Step-decay learning rate schedule (kStep).
message StepProto {
  // lr = base_lr * gamma^(step/change_freq)
  required float gamma = 35 [default = 1];
  // num of steps between consecutive decays
  required int32 change_freq = 40;
}
| |
// Linear learning rate schedule (kLinear):
// lr = (1 - step / freq) * base_lr + (step / freq) * final_lr
message LinearProto {
  // num of steps over which lr is interpolated from base_lr to final_lr
  required int32 change_freq = 40;
  // target learning rate
  required float final_lr = 39;
}
| |
// Exponential (halving) learning rate schedule (kExponential).
message ExponentialProto {
  // lr = base / 2^(step/change_freq)
  required int32 change_freq = 40;
}
| |
// Inverse-time learning rate schedule (kInverseT).
message InverseTProto {
  // lr = base_lr / (1+step/final_lr)
  required float final_lr = 39;
}
// Inverse learning rate schedule (kInverse):
// lr = base_lr*(1+gamma*step)^(-pow)
message InverseProto {
  // multiplier on the step count
  required float gamma = 1 [default = 1];
  // exponent of the decay
  required float pow = 2 [default = 0];
}
// Range of a uniform distribution for sampling.
message UniformProto {
  optional float low = 1 [default = -1];
  optional float high = 2 [default = 1];
}
// Mean and standard deviation of a gaussian distribution for sampling.
message GaussianProto {
  optional float mean = 1 [default = 0];
  optional float std = 2 [default = 1];
}
| |
| // -------------- |
| // All Enum Types |
| // -------------- |
| |
// Training algorithms for computing gradients over one mini-batch.
enum AlgType {
  // Back-propagation algorithm for feed-forward models, e.g., CNN and RNN
  kBP = 1;
  // Contrastive Divergence algorithm for RBM, DBM, etc.
  kCD = 2;
  // BPTT (back-propagation through time) for training RNN models
  kBPTT = 3;
  // For user defined algorithm; configure AlgProto.user_alg.
  kUserAlg = 104;
}
| |
// Built-in layer types, grouped by id range (input [100,200), neuron
// [200,300), loss [300,400), output [400,500), connection [500,600)).
enum LayerType {
  /*
   * Input layers
   *  - Load records from file, database
   */
  kCSVInput = 100;
  kImagePreprocess = 101;
  kRecordInput = 103;
  kLMDBData = 190;  // deprecated
  kLabel = 191;  // deprecated
  kMnist = 192;  // deprecated
  kRGBImage = 193;  // deprecated
  kShardData = 194;  // deprecated
  kCharRNN = 195;
  kRNNLabel = 196;
  kOneHot = 197;

  /*
   * Neuron layers
   *  - Feature transformation
   */
  kConvolution = 201;
  kCConvolution = 202;
  kDropout = 203;
  kDummy = 204;
  kInnerProduct = 205;
  kLRN = 206;
  kPooling = 207;
  kCPooling = 208;
  kRBMHid = 209;
  kRBMVis = 210;
  kReLU = 211;
  kSTanh = 212;
  kSigmoid = 213;
  kSoftmax = 214;
  kGRU = 215;
  kEmbedding = 216;
  kActivation = 217;
  kBM = 218;

  // cudnn-backed variants of the neuron layers above
  kCudnnConv = 250;
  kCudnnPool = 251;
  kCudnnLRN = 252;
  kCudnnSoftmax = 253;
  kCudnnActivation = 254;
  kCudnnBM = 255;

  /*
   * Loss layers
   *  - Compute objective loss
   */
  kEuclideanLoss = 300;
  kSoftmaxLoss = 301;
  // cudnn v3
  kCudnnSoftmaxLoss = 350;

  /*
   * Output layers
   *  - Write results to file, database
   */
  kAccuracy = 400;
  kArgSort = 401;
  kCSVOutput = 402;
  kRecordOutput = 403;
  kCharRNNOutput = 404;

  /*
   * Connection layers
   *  - Connect layers when neural net is partitioned
   */
  kBridgeDst = 500;
  kBridgeSrc = 501;
  kConcate = 502;
  kSlice = 503;
  kSplit = 504;
  kRNNDummy = 505;

  /*
   * User defined layer
   *  - users should configure user_type
   */
  kUserLayer = 600;
}
| |
// Built-in parameter updaters (SGD variants).
enum UpdaterType {
  // normal SGD with momentum and weight decay
  kSGD = 1;
  // adaptive subgradient, http://www.magicbroom.info/Papers/DuchiHaSi10.pdf
  kAdaGrad = 2;
  // http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
  kRMSProp = 3;
  // Nesterov first optimal gradient method
  kNesterov = 4;
  // AdaDelta
  kAdaDelta = 5;
  // Adam
  kAdam = 6;
  // AdamMax
  kAdamMax = 7;
  // For user defined updater; configure UpdaterProto.user_type.
  kUserUpdater = 105;
}
| |
// Execution phases; values are powers of two so they can be combined.
enum Phase {
  kUnknown = 0;
  kTrain = 1;
  kVal = 2;
  kTest = 4;
  // positive phase for contrastive divergence algorithm
  kPositive = 8;
  // negative phase for contrastive divergence algorithm
  kNegative = 16;
  kForward = 32;
  kBackward = 64;
  kLoss = 128;
  kDeploy = 256;

  // used for aggregate parameter gradients when Param is shared
  kAggGrad = 512;
}
| |
// Parameter implementation types.
enum ParamType {
  // built-in Param
  kParam = 0;
  // user-defined Param; configure ParamProto.user_type
  kUser = 103;
}
| |
// Learning rate change methods; see the corresponding *_conf messages in
// LRGenProto for their settings.
enum ChangeMethod {
  kFixed = 0;
  kInverseT = 1;
  kInverse = 2;
  kExponential = 3;
  kLinear = 4;
  kStep = 5;
  kFixedStep = 6;
  // For user defined change method; configure LRGenProto.user_type.
  kUserChange = 100;
}
| |
// Parameter initialization methods; settings come from ParamGenProto.
enum InitMethod {
  // fix the values of all parameters a constant in the value field
  kConstant = 0;
  // sample gaussian with std and mean
  kGaussian = 1;
  // uniform sampling between low and high
  kUniform = 2;
  // from Toronto Convnet, let a=1/sqrt(fan_in), w*=a after generating from
  // Gaussian distribution
  kGaussianSqrtFanIn = 4;
  // from Toronto Convnet, rectified linear activation, let
  // a=sqrt(3)/sqrt(fan_in), range is [-a, +a]; no need to set value=sqrt(3),
  // the program will multiply it.
  kUniformSqrtFanIn = 5;
  // from Theano MLP tutorial, let a=sqrt(6/(fan_in+fan_out)). for tanh
  // activation, range is [-a, +a], for sigmoid activation, range is
  // [-4a, +4a], put the scale factor to value field.
  // <a href="http://deeplearning.net/tutorial/mlp.html"> Theano MLP</a>
  kUniformSqrtFanInOut = 6;

  // For user defined init method; configure ParamGenProto.user_type.
  kUserInit = 101;
}
| |
// How unrolled layers connect to unrolled src layers in RNN models
// (see LayerProto.unroll_conn_type and LayerProto.shift).
enum UnrollConnType {
  // i-th unrolled layer <- (i - shift)-th src unrolled layer
  kUnrollOneToOne = 1;
  // i-th unrolled layer <- all src unrolled layers
  kUnrollOneToAll = 2;
  // i-th unrolled layer <- last unrolled src layer
  kUnrollFirstToLast = 3;
  // customized connection type defined by src_conn
  kUnrollCustomized = 4;
}