# src/test/scripts/gpu/nn/ResNet18GPU.dml - systemds - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 conv2d_forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
   int C, int Hin, int Win, int Hf, int Wf, int strideh, int stridew,
   int padh, int padw) return (matrix[double] out, int Hout, int Wout)
 {
   # Forward pass of a 2D convolution followed by a per-filter bias.
   #
   # Inputs:
   #  - X: input matrix; nrow(X) is used as the batch dimension of input_shape.
   #  - W: filter matrix; nrow(W) is used as the number of filters.
   #  - b: bias passed to bias_add.
   #  - C, Hin, Win: channels, height and width handed to conv2d as input_shape.
   #  - Hf, Wf: filter height and width.
   #  - strideh, stridew, padh, padw: stride and padding per dimension.
   #
   # Outputs:
   #  - out: result of conv2d with the bias added.
   #  - Hout, Wout: output height and width derived from the size formula below.
   num_images = nrow(X)
   num_filters = nrow(W)

   # Built-in convolution, then add the bias term to each output filter
   conv = conv2d(X, W, input_shape=[num_images,C,Hin,Win],
                 filter_shape=[num_filters,C,Hf,Wf],
                 stride=[strideh,stridew], padding=[padh,padw])
   out = bias_add(conv, b)

   # Spatial output size: floor((in + 2*pad - filter)/stride + 1)
   Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
   Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
 }

 conv2d_backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
   matrix[double] W, matrix[double] b, int C, int Hin, int Win, int Hf, int Wf,
   int strideh, int stridew, int padh, int padw)
   return (matrix[double] dX, matrix[double] dW, matrix[double] db)
 {
   # Backward pass of the 2D convolution layer.
   #
   # Inputs:
   #  - dout: upstream gradient w.r.t. the layer output.
   #  - Hout, Wout: output height and width (used to reshape the bias gradient).
   #  - X, W: the forward-pass input and filters.
   #  - b: bias; not read here, kept for a signature parallel to the forward pass.
   #  - C, Hin, Win, Hf, Wf, strideh, stridew, padh, padw: same as in the forward pass.
   #
   # Outputs:
   #  - dX, dW: gradients from the built-in backward operators.
   #  - db: per-filter sum of dout, shape (F, 1).
   num_images = nrow(X)
   num_filters = nrow(W)

   # Gradient w.r.t. the input data
   dX = conv2d_backward_data(W, dout, stride=[strideh,stridew], padding=[padh,padw],
                             input_shape=[num_images,C,Hin,Win],
                             filter_shape=[num_filters,C,Hf,Wf])
   # Gradient w.r.t. the filters
   dW = conv2d_backward_filter(X, dout, stride=[strideh,stridew], padding=[padh,padw],
                               input_shape=[num_images,C,Hin,Win],
                               filter_shape=[num_filters,C,Hf,Wf])

   # Bias gradient: sum over rows, fold the column totals into
   # (F, Hout*Wout) and sum each row to get one value per filter.
   col_totals = colSums(dout)
   per_filter = matrix(col_totals, rows=num_filters, cols=Hout*Wout)
   db = rowSums(per_filter)  # shape (F, 1)
 }

 conv2d_init = function(int F, int C, int Hf, int Wf, int seed = -1)
   return (matrix[double] W, matrix[double] b) {
   # Create convolution parameters: W is an (F, C*Hf*Wf) matrix of normal
   # random values scaled by sqrt(2/(C*Hf*Wf)); b is an (F, 1) zero vector.
   # `seed` is forwarded to rand (default -1).
   fan_in = C*Hf*Wf
   scale = sqrt(2.0/fan_in)
   W = scale * rand(rows=F, cols=fan_in, pdf="normal", seed=seed)
   b = matrix(0, rows=F, cols=1)
 }

 bn2d_forward = function(matrix[double] X, int C, int Hin, int Win,
     double mu, double epsilon) return (matrix[double] out)
 {
     # Apply the built-in batch_norm2d in 'train' mode with freshly created
     # parameters: scale of ones, shift of zeros, running mean of zeros and
     # running variance of ones (one entry per channel, C in total).
     # Only the normalized output is returned; the updated statistics and
     # cached values are discarded. Hin and Win are not read here.
     scale = matrix(1, rows=C, cols=1)
     shift = matrix(0, rows=C, cols=1)
     running_mean = matrix(0, rows=C, cols=1)
     running_var = matrix(1, rows=C, cols=1)

     # Bind the secondary output names before the multi-return call
     new_mean = running_mean
     new_var = running_var
     batch_mean = running_mean
     batch_inv_var = running_var

     mode = 'train'
     [out, new_mean, new_var, batch_mean, batch_inv_var] = batch_norm2d(X, scale, shift, running_mean, running_var, mode, epsilon, mu)
 }

 affine_forward = function(matrix[double] X, matrix[double] W, matrix[double] b) return (matrix[double] out) {
   # Fully connected layer: matrix product of X and W, plus the bias b.
   projected = X %*% W
   out = projected + b
 }

 affine_init = function(int D, int M, int seed = -1 ) return (matrix[double] W, matrix[double] b) {
   # Create fully connected parameters: W is (D, M) normal random values
   # scaled by sqrt(2/D); b is a (1, M) zero row vector.
   scale = sqrt(2.0/D)
   W = scale * rand(rows=D, cols=M, pdf="normal", seed=seed)
   b = matrix(0, rows=1, cols=M)
 }

 relu_forward = function(matrix[double] X) return (matrix[double] out) {
   # Rectified linear unit: element-wise maximum of X and zero.
   out = max(X, 0)
 }

 max_pool2d_forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
   int strideh, int stridew, int padh, int padw) return(matrix[double] out, int Hout, int Wout)
 {
   # 2D max pooling via the built-in max_pool operator.
   #
   # Inputs: X (nrow(X) is used as the batch dimension), channel count C,
   # input size Hin x Win, pool size Hf x Wf, strides and paddings.
   # Outputs: pooled matrix `out` and the output size Hout x Wout.
   num_images = nrow(X)
   out = max_pool(X, input_shape=[num_images,C,Hin,Win], pool_size=[Hf,Wf],
                  stride=[strideh,stridew], padding=[padh,padw])

   # Spatial output size: floor((in + 2*pad - pool)/stride + 1)
   Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
   Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
 }

 avg_pool2d_forward = function(matrix[double] X, int C, int Hin, int Win)
   return (matrix[double] out, int Hout, int Wout) {
   # Global average pooling: the pool window covers the full Hin x Win
   # input (stride 1, no padding), so the reported output size is 1 x 1.
   num_images = nrow(X)
   out = avg_pool(X, input_shape=[num_images,C,Hin,Win], pool_size=[Hin,Win],
                  stride=[1,1], padding=[0, 0])
   Hout = 1
   Wout = 1
 }

 softmax_forward = function(matrix[double] scores) return (matrix[double] probs) {
   # Row-wise softmax. Each row is shifted by its maximum before
   # exponentiation for numerical stability, then normalized by its row sum.
   shifted = scores - rowMaxs(scores)
   exp_scores = exp(shifted)
   row_totals = rowSums(exp_scores)
   probs = exp_scores / row_totals
 }

 basic_block = function(matrix[double] X, int C, int C_base, int Hin, int Win, int strideh,
     int stridew, matrix[double] WC1, matrix[double] bC1, matrix[double] WC2, matrix[double] bC2)
   return (matrix[double] out, int Hout, int Wout)
 {
   # ResNet basic block: two 3x3 convolutions with batch norm on the
   # residual path, added to a shortcut of the input, followed by ReLU.
   #
   # Inputs:
   #  - X: block input.
   #  - C: number of input channels; C_base: number of output channels.
   #  - Hin, Win: input height and width.
   #  - strideh, stridew: stride of the first convolution (and of the shortcut).
   #  - WC1, bC1, WC2, bC2: weights and biases of the two 3x3 convolutions.
   #
   # Outputs:
   #  - out: block output; Hout, Wout: its spatial size.
   mu_bn = 0.1;
   ep_bn = 1e-05;
   # A projection shortcut is needed whenever the residual path changes
   # the spatial size or the channel count.
   downsample = strideh > 1 | stridew > 1 | C != C_base;

   # Residual path
   # conv1 -> bn1 -> relu1
   [out, Hout, Wout] = conv2d_forward(X,WC1,bC1,C,Hin,Win,3,3,strideh,stridew,1,1);
   out = bn2d_forward(out,C_base,Hout,Wout,mu_bn,ep_bn);
   out = relu_forward(out);
   # conv2 -> bn2
   [out, Hout, Wout] = conv2d_forward(out,WC2,bC2,C_base,Hout,Wout,3,3,1,1,1,1);
   out = bn2d_forward(out,C_base,Hout,Wout,mu_bn,ep_bn);

   # Identity (shortcut) path
   identity = X;
   if (downsample) {
     # 1x1 strided convolution + batch norm on the shortcut. The result is
     # stored in `identity`, not `out`, so the residual path computed above
     # is kept and summed with the normalized shortcut below.
     [WC3, bC3] = conv2d_init(C_base, C, 1, 1, 42);
     [identity, Hsc, Wsc] = conv2d_forward(X,WC3,bC3,C,Hin,Win,1,1,strideh,stridew,0,0);
     identity = bn2d_forward(identity,C_base,Hsc,Wsc,mu_bn,ep_bn);
   }
   out = relu_forward(out + identity);
 }

 getWeights = function(int fel, int lid,
     matrix[double] W_pt, matrix[double] b_pt,
     matrix[double] W_init, matrix[double] b_init)
   return (matrix[double] Wl, matrix[double] bl)
 {
   # Pick the parameters for layer `lid`: layers at or beyond `fel` get the
   # initialized weights (W_init, b_init); layers below it get the
   # pretrained ones (W_pt, b_pt).
   if (lid >= fel) {  # use initialized weights
     Wl = W_init;
     bl = b_init;
   }
   else {  # extract pretrained features
     Wl = W_pt;
     bl = b_pt;
   }
 }

 rwRowIndexMax = function(matrix[double] X, matrix[double] oneVec, matrix[double] idxSeq)
     return (matrix[double] index) {
   # Row-wise index of the maximum built from basic operators: expand each
   # row maximum across the columns (via oneVec), mark the cells equal to
   # it, multiply by idxSeq and take the largest marked value per row.
   row_max_expanded = rowMaxs(X) %*% oneVec;
   is_max = (X == row_max_expanded);
   masked_idx = is_max * idxSeq;
   index = rowMaxs(masked_idx);
 }

 resnet18_forward = function(matrix[double] X, int C, int Hin, int Win, int K)
   return (matrix[double] Y_pred)
 {
   # Run a ResNet-18 forward pass over X in mini-batches for several
   # weight configurations. In configuration j the layers with
   # lid < fel use the "_pt" weights and the remaining layers use the
   # "_init" weights; fel starts at 10 and decreases by one per configuration.
   #
   # Inputs:
   #  - X: input matrix, one row per image (rows are sliced into batches).
   #  - C, Hin, Win: channels, height and width of each image.
   #  - K: number of classes (width of the final affine layer).
   #
   # Outputs:
   #  - Y_pred: (nrow(X), 3) matrix; column j holds the 1-based predicted
   #    class index of every image under configuration j.
   mu_bn = 0.1;
   ep_bn = 1e-05;

   # Get the transferred layers. FIXME: use pretrained weights
   [W1_pt, b1_pt] = conv2d_init(64, C, 7, 7, 42);
   [W2_pt, b2_pt] = conv2d_init(64, 64, 3, 3, 42);
   [W3_pt, b3_pt] = conv2d_init(64, 64, 3, 3, 42);
   [W4_pt, b4_pt] = conv2d_init(64, 64, 3, 3, 42);
   [W5_pt, b5_pt] = conv2d_init(64, 64, 3, 3, 42);
   [W6_pt, b6_pt] = conv2d_init(128, 64, 3, 3, 42);
   [W7_pt, b7_pt] = conv2d_init(128, 128, 3, 3, 42);
   [W8_pt, b8_pt] = conv2d_init(128, 128, 3, 3, 42);
   [W9_pt, b9_pt] = conv2d_init(128, 128, 3, 3, 42);
   [W10_pt, b10_pt] = conv2d_init(256, 128, 3, 3, 42);
   [W11_pt, b11_pt] = conv2d_init(256, 256, 3, 3, 42);
   [W12_pt, b12_pt] = conv2d_init(256, 256, 3, 3, 42);
   [W13_pt, b13_pt] = conv2d_init(256, 256, 3, 3, 42);
   [W14_pt, b14_pt] = conv2d_init(512, 256, 3, 3, 42);
   [W15_pt, b15_pt] = conv2d_init(512, 512, 3, 3, 42);
   [W16_pt, b16_pt] = conv2d_init(512, 512, 3, 3, 42);
   [W17_pt, b17_pt] = conv2d_init(512, 512, 3, 3, 42);
   [W18_pt, b18_pt] = affine_init(512, K, 42);
   W18_pt = W18_pt/sqrt(2);

   # Initialize the weights for the non-transferred layers
   # NOTE(review): W10_init..W18_init use seed 42, the same seed and shapes
   # as their "_pt" counterparts, while W1_init..W9_init use 43. If rand is
   # deterministic per seed, the "_init" and "_pt" weights of those layers
   # are identical and every configuration below yields the same
   # predictions. Confirm whether seed 43 was intended.
   [W1_init, b1_init] = conv2d_init(64, C, 7, 7, 43);
   [W2_init, b2_init] = conv2d_init(64, 64, 3, 3, 43);
   [W3_init, b3_init] = conv2d_init(64, 64, 3, 3, 43);
   [W4_init, b4_init] = conv2d_init(64, 64, 3, 3, 43);
   [W5_init, b5_init] = conv2d_init(64, 64, 3, 3, 43);
   [W6_init, b6_init] = conv2d_init(128, 64, 3, 3, 43);
   [W7_init, b7_init] = conv2d_init(128, 128, 3, 3, 43);
   [W8_init, b8_init] = conv2d_init(128, 128, 3, 3, 43);
   [W9_init, b9_init] = conv2d_init(128, 128, 3, 3, 43);
   [W10_init, b10_init] = conv2d_init(256, 128, 3, 3, 42);
   [W11_init, b11_init] = conv2d_init(256, 256, 3, 3, 42);
   [W12_init, b12_init] = conv2d_init(256, 256, 3, 3, 42);
   [W13_init, b13_init] = conv2d_init(256, 256, 3, 3, 42);
   [W14_init, b14_init] = conv2d_init(512, 256, 3, 3, 42);
   [W15_init, b15_init] = conv2d_init(512, 512, 3, 3, 42);
   [W16_init, b16_init] = conv2d_init(512, 512, 3, 3, 42);
   [W17_init, b17_init] = conv2d_init(512, 512, 3, 3, 42);
   [W18_init, b18_init] = affine_init(512, K, 42);
   W18_init = W18_init/sqrt(2);

   # Compute prediction over mini-batches
   N = nrow(X);
   num_configs = 3;  # number of weight configurations = columns of Y_pred
   Y_pred = matrix(0, rows=N, cols=num_configs);
   batch_size = 64;
   oneVec = matrix(1, rows=1, cols=K);
   idxSeq = matrix(1, rows=batch_size, cols=1) %*% t(seq(1, K));
   iters = ceil (N / batch_size);

   for (i in 1:iters) {
     # Get next batch
     beg = ((i-1) * batch_size) %% N + 1;
     end = min(N, beg+batch_size-1);
     X_batch = X[beg:end,];
     # The last batch may hold fewer than batch_size rows; only that many
     # rows of idxSeq are used so the shapes match in rwRowIndexMax.
     bs = end - beg + 1;
     idxSeq_batch = idxSeq[1:bs,];

     # One forward pass per configuration; fel takes the values 10, 9, 8
     j = 1;
     fel = 10; # layers with lid < fel use the "_pt" weights
     while (j <= num_configs) {
       # Compute forward pass
       # Layer1: conv2d 7x7 -> bn -> relu -> maxpool 3x3
       lid = 1;
       [Wl1, bl1] = getWeights(fel, lid, W1_pt, b1_pt, W1_init, b1_init);
       [outc1, Houtc1, Woutc1] = conv2d_forward(X_batch,Wl1,bl1,C,Hin,Win,7,7,2,2,3,3);
       outb1 = bn2d_forward(outc1,64,Houtc1,Woutc1,mu_bn,ep_bn);
       outr1 = relu_forward(outb1);
       [outp1, Houtp1, Woutp1] = max_pool2d_forward(outr1,64,Houtc1, Woutc1,3,3,2,2,1,1);

       # Layer2: residual block1
       lid = 2;
       [Wc1, bc1] = getWeights(fel, lid, W2_pt, b2_pt, W2_init, b2_init);
       [Wc2, bc2] = getWeights(fel, lid, W3_pt, b3_pt, W3_init, b3_init);
       [outrb1, Houtrb1, Woutrb1] = basic_block(outp1,64,64,Houtp1,Woutp1,1,1,Wc1,bc1,Wc2,bc2);
       print(nrow(outrb1)+" "+ncol(outrb1));

       # Layer3: residual block2
       lid = 3;
       [Wc1, bc1] = getWeights(fel, lid, W4_pt, b4_pt, W4_init, b4_init);
       [Wc2, bc2] = getWeights(fel, lid, W5_pt, b5_pt, W5_init, b5_init);
       [outrb2, Houtrb2, Woutrb2] = basic_block(outrb1,64,64,Houtrb1,Woutrb1,1,1,Wc1,bc1,Wc2,bc2);
       print(nrow(outrb2)+" "+ncol(outrb2));

       # Layer4: residual block3
       lid = 4;
       [Wc1, bc1] = getWeights(fel, lid, W6_pt, b6_pt, W6_init, b6_init);
       [Wc2, bc2] = getWeights(fel, lid, W7_pt, b7_pt, W7_init, b7_init);
       [outrb3, Houtrb3, Woutrb3] = basic_block(outrb2,64,128,Houtrb2,Woutrb2,2,2,Wc1,bc1,Wc2,bc2);
       print(nrow(outrb3)+" "+ncol(outrb3));

       # Layer5: residual block4
       lid = 5;
       [Wc1, bc1] = getWeights(fel, lid, W8_pt, b8_pt, W8_init, b8_init);
       [Wc2, bc2] = getWeights(fel, lid, W9_pt, b9_pt, W9_init, b9_init);
       [outrb4, Houtrb4, Woutrb4] = basic_block(outrb3,128,128,Houtrb3,Woutrb3,1,1,Wc1,bc1,Wc2,bc2);
       print(nrow(outrb4)+" "+ncol(outrb4));

       # Layer6: residual block5
       lid = 6;
       [Wc1, bc1] = getWeights(fel, lid, W10_pt, b10_pt, W10_init, b10_init);
       [Wc2, bc2] = getWeights(fel, lid, W11_pt, b11_pt, W11_init, b11_init);
       [outrb5, Houtrb5, Woutrb5] = basic_block(outrb4,128,256,Houtrb4,Woutrb4,2,2,Wc1,bc1,Wc2,bc2);
       print(nrow(outrb5)+" "+ncol(outrb5));

       # Layer7: residual block6
       lid = 7;
       [Wc1, bc1] = getWeights(fel, lid, W12_pt, b12_pt, W12_init, b12_init);
       [Wc2, bc2] = getWeights(fel, lid, W13_pt, b13_pt, W13_init, b13_init);
       [outrb6, Houtrb6, Woutrb6] = basic_block(outrb5,256,256,Houtrb5,Woutrb5,1,1,Wc1,bc1,Wc2,bc2);
       print(nrow(outrb6)+" "+ncol(outrb6));

       # Layer8: residual block7
       lid = 8;
       [Wc1, bc1] = getWeights(fel, lid, W14_pt, b14_pt, W14_init, b14_init);
       [Wc2, bc2] = getWeights(fel, lid, W15_pt, b15_pt, W15_init, b15_init);
       [outrb7, Houtrb7, Woutrb7] = basic_block(outrb6,256,512,Houtrb6,Woutrb6,2,2,Wc1,bc1,Wc2,bc2);
       print(nrow(outrb7)+" "+ncol(outrb7));

       # Layer9: residual block8
       lid = 9;
       [Wc1, bc1] = getWeights(fel, lid, W16_pt, b16_pt, W16_init, b16_init);
       [Wc2, bc2] = getWeights(fel, lid, W17_pt, b17_pt, W17_init, b17_init);
       [outrb8, Houtrb8, Woutrb8] = basic_block(outrb7,512,512,Houtrb7,Woutrb7,1,1,Wc1,bc1,Wc2,bc2);
       print(nrow(outrb8)+" "+ncol(outrb8));

       # Global average pooling
       [outap1, Houtap1, Woutap1] = avg_pool2d_forward(outrb8, 512, Houtrb8, Woutrb8);

       # layer10 : Fully connected layer
       lid = 10;
       [Wl10, bl10] = getWeights(fel, lid, W18_pt, b18_pt, W18_init, b18_init);
       outa1 = affine_forward(outap1, Wl10, bl10);
       probs_batch = softmax_forward(outa1);

       # Store the predictions
       Y_pred[beg:end,j] = rwRowIndexMax(probs_batch, oneVec, idxSeq_batch);
       j = j + 1;
       fel = fel - 1;
     }
   }

 }

 generate_dummy_data = function(int N, int C, int Hin, int Win, int K)
   return (matrix[double] X, matrix[double] Y) {
   # Build a synthetic dataset with fixed seeds: X holds N linearized images
   # of C*Hin*Win normal random values; Y is an (N, K) one-hot encoding of
   # class labels drawn uniformly from [1, K] and rounded.
   num_features = C*Hin*Win
   X = rand(rows=N, cols=num_features, pdf="normal", seed=45)
   labels = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform", seed=46))
   row_ids = seq(1, N)
   Y = table(row_ids, labels, N, K)
 }

 # Problem dimensions of the synthetic target dataset
 N = 64;      # number of images
 C = 3;       # number of color channels
 Hin = 224;   # input image height
 Win = 224;   # input image width
 K = 10;      # number of classes

 # Synthetic inputs and one-hot labels (the labels are not used below)
 [X, Y] = generate_dummy_data(N, C, Hin, Win, K);

 # Warm-up: a single convolution on the first 8 images so the cuDNN
 # libraries are loaded before the timed section starts
 print("Eagerly loading cuDNN library");
 [W_warm, b_warm] = conv2d_init(96, C, 11, 11, 42);
 X_warm = X[1:8,];
 [out_warm, Hout_warm, Wout_warm] = conv2d_forward(X_warm, W_warm, b_warm, C, Hin, Win, 11, 11, 1, 1, 2, 2);
 print(sum(out_warm));

 # Timed section: forward passes plus the column sums of the predictions
 print("Starting exploratory feature transfers");
 t_start = time();
 Y_pred = resnet18_forward(X, C, Hin, Win, K);
 R = colSums(Y_pred)
 print(R);

 t_end = time();
 # presumably time() reports nanoseconds, hence the division by 1e6
 elapsed = floor((t_end-t_start)/1000000);
 print("Elapsed time for feature transfers = "+elapsed+" millsec");

 # Persist the per-configuration column sums to the first script argument
 write(R, $1)
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	conv2d_forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
	  int C, int Hin, int Win, int Hf, int Wf, int strideh, int stridew,
	  int padh, int padw) return (matrix[double] out, int Hout, int Wout)
	{
	  # Forward pass of a 2D convolution followed by a per-filter bias.
	  #
	  # Inputs:
	  #  - X: input matrix; nrow(X) is used as the batch dimension of input_shape.
	  #  - W: filter matrix; nrow(W) is used as the number of filters.
	  #  - b: bias passed to bias_add.
	  #  - C, Hin, Win: channels, height and width handed to conv2d as input_shape.
	  #  - Hf, Wf: filter height and width.
	  #  - strideh, stridew, padh, padw: stride and padding per dimension.
	  #
	  # Outputs:
	  #  - out: result of conv2d with the bias added.
	  #  - Hout, Wout: output height and width derived from the size formula below.
	  num_images = nrow(X)
	  num_filters = nrow(W)

	  # Built-in convolution, then add the bias term to each output filter
	  conv = conv2d(X, W, input_shape=[num_images,C,Hin,Win],
	                filter_shape=[num_filters,C,Hf,Wf],
	                stride=[strideh,stridew], padding=[padh,padw])
	  out = bias_add(conv, b)

	  # Spatial output size: floor((in + 2*pad - filter)/stride + 1)
	  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
	  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
	}

	conv2d_backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
	  matrix[double] W, matrix[double] b, int C, int Hin, int Win, int Hf, int Wf,
	  int strideh, int stridew, int padh, int padw)
	  return (matrix[double] dX, matrix[double] dW, matrix[double] db)
	{
	  # Backward pass of the 2D convolution layer.
	  #
	  # Inputs:
	  #  - dout: upstream gradient w.r.t. the layer output.
	  #  - Hout, Wout: output height and width (used to reshape the bias gradient).
	  #  - X, W: the forward-pass input and filters.
	  #  - b: bias; not read here, kept for a signature parallel to the forward pass.
	  #  - C, Hin, Win, Hf, Wf, strideh, stridew, padh, padw: same as in the forward pass.
	  #
	  # Outputs:
	  #  - dX, dW: gradients from the built-in backward operators.
	  #  - db: per-filter sum of dout, shape (F, 1).
	  num_images = nrow(X)
	  num_filters = nrow(W)

	  # Gradient w.r.t. the input data
	  dX = conv2d_backward_data(W, dout, stride=[strideh,stridew], padding=[padh,padw],
	                            input_shape=[num_images,C,Hin,Win],
	                            filter_shape=[num_filters,C,Hf,Wf])
	  # Gradient w.r.t. the filters
	  dW = conv2d_backward_filter(X, dout, stride=[strideh,stridew], padding=[padh,padw],
	                              input_shape=[num_images,C,Hin,Win],
	                              filter_shape=[num_filters,C,Hf,Wf])

	  # Bias gradient: sum over rows, fold the column totals into
	  # (F, Hout*Wout) and sum each row to get one value per filter.
	  col_totals = colSums(dout)
	  per_filter = matrix(col_totals, rows=num_filters, cols=Hout*Wout)
	  db = rowSums(per_filter) # shape (F, 1)
	}

	conv2d_init = function(int F, int C, int Hf, int Wf, int seed = -1)
	  return (matrix[double] W, matrix[double] b) {
	  # Create convolution parameters: W is an (F, C*Hf*Wf) matrix of normal
	  # random values scaled by sqrt(2/(C*Hf*Wf)); b is an (F, 1) zero vector.
	  # `seed` is forwarded to rand (default -1).
	  # The column count is the product C*Hf*Wf; the fused name `CHfWf`
	  # used previously is not a defined variable.
	  fan_in = C*Hf*Wf
	  W = rand(rows=F, cols=fan_in, pdf="normal", seed=seed) * sqrt(2.0/fan_in)
	  b = matrix(0, rows=F, cols=1)
	}

	bn2d_forward = function(matrix[double] X, int C, int Hin, int Win,
	  double mu, double epsilon) return (matrix[double] out)
	{
	  # Apply the built-in batch_norm2d in 'train' mode with freshly created
	  # parameters: scale of ones, shift of zeros, running mean of zeros and
	  # running variance of ones (one entry per channel, C in total).
	  # Only the normalized output is returned; the updated statistics and
	  # cached values are discarded. Hin and Win are not read here.
	  scale = matrix(1, rows=C, cols=1)
	  shift = matrix(0, rows=C, cols=1)
	  running_mean = matrix(0, rows=C, cols=1)
	  running_var = matrix(1, rows=C, cols=1)

	  # Bind the secondary output names before the multi-return call
	  new_mean = running_mean
	  new_var = running_var
	  batch_mean = running_mean
	  batch_inv_var = running_var

	  mode = 'train'
	  [out, new_mean, new_var, batch_mean, batch_inv_var] = batch_norm2d(X, scale, shift, running_mean, running_var, mode, epsilon, mu)
	}

	affine_forward = function(matrix[double] X, matrix[double] W, matrix[double] b) return (matrix[double] out) {
	  # Fully connected layer: matrix product of X and W, plus the bias b.
	  projected = X %*% W
	  out = projected + b
	}

	affine_init = function(int D, int M, int seed = -1 ) return (matrix[double] W, matrix[double] b) {
	  # Create fully connected parameters: W is (D, M) normal random values
	  # scaled by sqrt(2/D); b is a (1, M) zero row vector.
	  scale = sqrt(2.0/D)
	  W = scale * rand(rows=D, cols=M, pdf="normal", seed=seed)
	  b = matrix(0, rows=1, cols=M)
	}

	relu_forward = function(matrix[double] X) return (matrix[double] out) {
	  # Rectified linear unit: element-wise maximum of X and zero.
	  out = max(X, 0)
	}

	max_pool2d_forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
	  int strideh, int stridew, int padh, int padw) return(matrix[double] out, int Hout, int Wout)
	{
	  # 2D max pooling via the built-in max_pool operator.
	  #
	  # Inputs: X (nrow(X) is used as the batch dimension), channel count C,
	  # input size Hin x Win, pool size Hf x Wf, strides and paddings.
	  # Outputs: pooled matrix `out` and the output size Hout x Wout.
	  num_images = nrow(X)
	  out = max_pool(X, input_shape=[num_images,C,Hin,Win], pool_size=[Hf,Wf],
	                 stride=[strideh,stridew], padding=[padh,padw])

	  # Spatial output size: floor((in + 2*pad - pool)/stride + 1)
	  Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1))
	  Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1))
	}

	avg_pool2d_forward = function(matrix[double] X, int C, int Hin, int Win)
	  return (matrix[double] out, int Hout, int Wout) {
	  # Global average pooling: the pool window covers the full Hin x Win
	  # input (stride 1, no padding), so the reported output size is 1 x 1.
	  num_images = nrow(X)
	  out = avg_pool(X, input_shape=[num_images,C,Hin,Win], pool_size=[Hin,Win],
	                 stride=[1,1], padding=[0, 0])
	  Hout = 1
	  Wout = 1
	}

	softmax_forward = function(matrix[double] scores) return (matrix[double] probs) {
	  # Row-wise softmax. Each row is shifted by its maximum before
	  # exponentiation for numerical stability, then normalized by its row sum.
	  shifted = scores - rowMaxs(scores)
	  exp_scores = exp(shifted)
	  row_totals = rowSums(exp_scores)
	  probs = exp_scores / row_totals
	}

	basic_block = function(matrix[double] X, int C, int C_base, int Hin, int Win, int strideh,
	  int stridew, matrix[double] WC1, matrix[double] bC1, matrix[double] WC2, matrix[double] bC2)
	  return (matrix[double] out, int Hout, int Wout)
	{
	  # ResNet basic block: two 3x3 convolutions with batch norm on the
	  # residual path, added to a shortcut of the input, followed by ReLU.
	  #
	  # Inputs:
	  #  - X: block input.
	  #  - C: number of input channels; C_base: number of output channels.
	  #  - Hin, Win: input height and width.
	  #  - strideh, stridew: stride of the first convolution (and of the shortcut).
	  #  - WC1, bC1, WC2, bC2: weights and biases of the two 3x3 convolutions.
	  #
	  # Outputs:
	  #  - out: block output; Hout, Wout: its spatial size.
	  mu_bn = 0.1;
	  ep_bn = 1e-05;
	  # A projection shortcut is needed whenever the residual path changes
	  # the spatial size or the channel count.
	  downsample = strideh > 1 | stridew > 1 | C != C_base;

	  # Residual path
	  # conv1 -> bn1 -> relu1
	  [out, Hout, Wout] = conv2d_forward(X,WC1,bC1,C,Hin,Win,3,3,strideh,stridew,1,1);
	  out = bn2d_forward(out,C_base,Hout,Wout,mu_bn,ep_bn);
	  out = relu_forward(out);
	  # conv2 -> bn2
	  [out, Hout, Wout] = conv2d_forward(out,WC2,bC2,C_base,Hout,Wout,3,3,1,1,1,1);
	  out = bn2d_forward(out,C_base,Hout,Wout,mu_bn,ep_bn);

	  # Identity (shortcut) path
	  identity = X;
	  if (downsample) {
	    # 1x1 strided convolution + batch norm on the shortcut. The result is
	    # stored in `identity`, not `out`, so the residual path computed above
	    # is kept and summed with the normalized shortcut below.
	    [WC3, bC3] = conv2d_init(C_base, C, 1, 1, 42);
	    [identity, Hsc, Wsc] = conv2d_forward(X,WC3,bC3,C,Hin,Win,1,1,strideh,stridew,0,0);
	    identity = bn2d_forward(identity,C_base,Hsc,Wsc,mu_bn,ep_bn);
	  }
	  out = relu_forward(out + identity);
	}

	getWeights = function(int fel, int lid,
	  matrix[double] W_pt, matrix[double] b_pt,
	  matrix[double] W_init, matrix[double] b_init)
	  return (matrix[double] Wl, matrix[double] bl)
	{
	  # Pick the parameters for layer `lid`: layers at or beyond `fel` get the
	  # initialized weights (W_init, b_init); layers below it get the
	  # pretrained ones (W_pt, b_pt).
	  if (lid >= fel) { # use initialized weights
	    Wl = W_init;
	    bl = b_init;
	  }
	  else { # extract pretrained features
	    Wl = W_pt;
	    bl = b_pt;
	  }
	}

	rwRowIndexMax = function(matrix[double] X, matrix[double] oneVec, matrix[double] idxSeq)
	  return (matrix[double] index) {
	  # Row-wise index of the maximum built from basic operators: expand each
	  # row maximum across the columns (via oneVec), mark the cells equal to
	  # it, multiply by idxSeq and take the largest marked value per row.
	  row_max_expanded = rowMaxs(X) %*% oneVec;
	  is_max = (X == row_max_expanded);
	  masked_idx = is_max * idxSeq;
	  index = rowMaxs(masked_idx);
	}

	resnet18_forward = function(matrix[double] X, int C, int Hin, int Win, int K)
	  return (matrix[double] Y_pred)
	{
	  # Run a ResNet-18 forward pass over X in mini-batches for several
	  # weight configurations. In configuration j the layers with
	  # lid < fel use the "_pt" weights and the remaining layers use the
	  # "_init" weights; fel starts at 10 and decreases by one per configuration.
	  #
	  # Inputs:
	  #  - X: input matrix, one row per image (rows are sliced into batches).
	  #  - C, Hin, Win: channels, height and width of each image.
	  #  - K: number of classes (width of the final affine layer).
	  #
	  # Outputs:
	  #  - Y_pred: (nrow(X), 3) matrix; column j holds the 1-based predicted
	  #    class index of every image under configuration j.
	  mu_bn = 0.1;
	  ep_bn = 1e-05;

	  # Get the transferred layers. FIXME: use pretrained weights
	  [W1_pt, b1_pt] = conv2d_init(64, C, 7, 7, 42);
	  [W2_pt, b2_pt] = conv2d_init(64, 64, 3, 3, 42);
	  [W3_pt, b3_pt] = conv2d_init(64, 64, 3, 3, 42);
	  [W4_pt, b4_pt] = conv2d_init(64, 64, 3, 3, 42);
	  [W5_pt, b5_pt] = conv2d_init(64, 64, 3, 3, 42);
	  [W6_pt, b6_pt] = conv2d_init(128, 64, 3, 3, 42);
	  [W7_pt, b7_pt] = conv2d_init(128, 128, 3, 3, 42);
	  [W8_pt, b8_pt] = conv2d_init(128, 128, 3, 3, 42);
	  [W9_pt, b9_pt] = conv2d_init(128, 128, 3, 3, 42);
	  [W10_pt, b10_pt] = conv2d_init(256, 128, 3, 3, 42);
	  [W11_pt, b11_pt] = conv2d_init(256, 256, 3, 3, 42);
	  [W12_pt, b12_pt] = conv2d_init(256, 256, 3, 3, 42);
	  [W13_pt, b13_pt] = conv2d_init(256, 256, 3, 3, 42);
	  [W14_pt, b14_pt] = conv2d_init(512, 256, 3, 3, 42);
	  [W15_pt, b15_pt] = conv2d_init(512, 512, 3, 3, 42);
	  [W16_pt, b16_pt] = conv2d_init(512, 512, 3, 3, 42);
	  [W17_pt, b17_pt] = conv2d_init(512, 512, 3, 3, 42);
	  [W18_pt, b18_pt] = affine_init(512, K, 42);
	  W18_pt = W18_pt/sqrt(2);

	  # Initialize the weights for the non-transferred layers
	  # NOTE(review): W10_init..W18_init use seed 42, the same seed and shapes
	  # as their "_pt" counterparts, while W1_init..W9_init use 43. If rand is
	  # deterministic per seed, the "_init" and "_pt" weights of those layers
	  # are identical and every configuration below yields the same
	  # predictions. Confirm whether seed 43 was intended.
	  [W1_init, b1_init] = conv2d_init(64, C, 7, 7, 43);
	  [W2_init, b2_init] = conv2d_init(64, 64, 3, 3, 43);
	  [W3_init, b3_init] = conv2d_init(64, 64, 3, 3, 43);
	  [W4_init, b4_init] = conv2d_init(64, 64, 3, 3, 43);
	  [W5_init, b5_init] = conv2d_init(64, 64, 3, 3, 43);
	  [W6_init, b6_init] = conv2d_init(128, 64, 3, 3, 43);
	  [W7_init, b7_init] = conv2d_init(128, 128, 3, 3, 43);
	  [W8_init, b8_init] = conv2d_init(128, 128, 3, 3, 43);
	  [W9_init, b9_init] = conv2d_init(128, 128, 3, 3, 43);
	  [W10_init, b10_init] = conv2d_init(256, 128, 3, 3, 42);
	  [W11_init, b11_init] = conv2d_init(256, 256, 3, 3, 42);
	  [W12_init, b12_init] = conv2d_init(256, 256, 3, 3, 42);
	  [W13_init, b13_init] = conv2d_init(256, 256, 3, 3, 42);
	  [W14_init, b14_init] = conv2d_init(512, 256, 3, 3, 42);
	  [W15_init, b15_init] = conv2d_init(512, 512, 3, 3, 42);
	  [W16_init, b16_init] = conv2d_init(512, 512, 3, 3, 42);
	  [W17_init, b17_init] = conv2d_init(512, 512, 3, 3, 42);
	  [W18_init, b18_init] = affine_init(512, K, 42);
	  W18_init = W18_init/sqrt(2);

	  # Compute prediction over mini-batches
	  N = nrow(X);
	  num_configs = 3; # number of weight configurations = columns of Y_pred
	  Y_pred = matrix(0, rows=N, cols=num_configs);
	  batch_size = 64;
	  oneVec = matrix(1, rows=1, cols=K);
	  idxSeq = matrix(1, rows=batch_size, cols=1) %*% t(seq(1, K));
	  iters = ceil (N / batch_size);

	  for (i in 1:iters) {
	    # Get next batch
	    beg = ((i-1) * batch_size) %% N + 1;
	    end = min(N, beg+batch_size-1);
	    X_batch = X[beg:end,];
	    # The last batch may hold fewer than batch_size rows; only that many
	    # rows of idxSeq are used so the shapes match in rwRowIndexMax.
	    bs = end - beg + 1;
	    idxSeq_batch = idxSeq[1:bs,];

	    # One forward pass per configuration; fel takes the values 10, 9, 8
	    j = 1;
	    fel = 10; # layers with lid < fel use the "_pt" weights
	    while (j <= num_configs) {
	      # Compute forward pass
	      # Layer1: conv2d 7x7 -> bn -> relu -> maxpool 3x3
	      lid = 1;
	      [Wl1, bl1] = getWeights(fel, lid, W1_pt, b1_pt, W1_init, b1_init);
	      [outc1, Houtc1, Woutc1] = conv2d_forward(X_batch,Wl1,bl1,C,Hin,Win,7,7,2,2,3,3);
	      outb1 = bn2d_forward(outc1,64,Houtc1,Woutc1,mu_bn,ep_bn);
	      outr1 = relu_forward(outb1);
	      [outp1, Houtp1, Woutp1] = max_pool2d_forward(outr1,64,Houtc1, Woutc1,3,3,2,2,1,1);

	      # Layer2: residual block1
	      lid = 2;
	      [Wc1, bc1] = getWeights(fel, lid, W2_pt, b2_pt, W2_init, b2_init);
	      [Wc2, bc2] = getWeights(fel, lid, W3_pt, b3_pt, W3_init, b3_init);
	      [outrb1, Houtrb1, Woutrb1] = basic_block(outp1,64,64,Houtp1,Woutp1,1,1,Wc1,bc1,Wc2,bc2);
	      print(nrow(outrb1)+" "+ncol(outrb1));

	      # Layer3: residual block2
	      lid = 3;
	      [Wc1, bc1] = getWeights(fel, lid, W4_pt, b4_pt, W4_init, b4_init);
	      [Wc2, bc2] = getWeights(fel, lid, W5_pt, b5_pt, W5_init, b5_init);
	      [outrb2, Houtrb2, Woutrb2] = basic_block(outrb1,64,64,Houtrb1,Woutrb1,1,1,Wc1,bc1,Wc2,bc2);
	      print(nrow(outrb2)+" "+ncol(outrb2));

	      # Layer4: residual block3
	      lid = 4;
	      [Wc1, bc1] = getWeights(fel, lid, W6_pt, b6_pt, W6_init, b6_init);
	      [Wc2, bc2] = getWeights(fel, lid, W7_pt, b7_pt, W7_init, b7_init);
	      [outrb3, Houtrb3, Woutrb3] = basic_block(outrb2,64,128,Houtrb2,Woutrb2,2,2,Wc1,bc1,Wc2,bc2);
	      print(nrow(outrb3)+" "+ncol(outrb3));

	      # Layer5: residual block4
	      lid = 5;
	      [Wc1, bc1] = getWeights(fel, lid, W8_pt, b8_pt, W8_init, b8_init);
	      [Wc2, bc2] = getWeights(fel, lid, W9_pt, b9_pt, W9_init, b9_init);
	      [outrb4, Houtrb4, Woutrb4] = basic_block(outrb3,128,128,Houtrb3,Woutrb3,1,1,Wc1,bc1,Wc2,bc2);
	      print(nrow(outrb4)+" "+ncol(outrb4));

	      # Layer6: residual block5
	      lid = 6;
	      [Wc1, bc1] = getWeights(fel, lid, W10_pt, b10_pt, W10_init, b10_init);
	      [Wc2, bc2] = getWeights(fel, lid, W11_pt, b11_pt, W11_init, b11_init);
	      [outrb5, Houtrb5, Woutrb5] = basic_block(outrb4,128,256,Houtrb4,Woutrb4,2,2,Wc1,bc1,Wc2,bc2);
	      print(nrow(outrb5)+" "+ncol(outrb5));

	      # Layer7: residual block6
	      lid = 7;
	      [Wc1, bc1] = getWeights(fel, lid, W12_pt, b12_pt, W12_init, b12_init);
	      [Wc2, bc2] = getWeights(fel, lid, W13_pt, b13_pt, W13_init, b13_init);
	      [outrb6, Houtrb6, Woutrb6] = basic_block(outrb5,256,256,Houtrb5,Woutrb5,1,1,Wc1,bc1,Wc2,bc2);
	      print(nrow(outrb6)+" "+ncol(outrb6));

	      # Layer8: residual block7
	      lid = 8;
	      [Wc1, bc1] = getWeights(fel, lid, W14_pt, b14_pt, W14_init, b14_init);
	      [Wc2, bc2] = getWeights(fel, lid, W15_pt, b15_pt, W15_init, b15_init);
	      [outrb7, Houtrb7, Woutrb7] = basic_block(outrb6,256,512,Houtrb6,Woutrb6,2,2,Wc1,bc1,Wc2,bc2);
	      print(nrow(outrb7)+" "+ncol(outrb7));

	      # Layer9: residual block8
	      lid = 9;
	      [Wc1, bc1] = getWeights(fel, lid, W16_pt, b16_pt, W16_init, b16_init);
	      [Wc2, bc2] = getWeights(fel, lid, W17_pt, b17_pt, W17_init, b17_init);
	      [outrb8, Houtrb8, Woutrb8] = basic_block(outrb7,512,512,Houtrb7,Woutrb7,1,1,Wc1,bc1,Wc2,bc2);
	      print(nrow(outrb8)+" "+ncol(outrb8));

	      # Global average pooling
	      [outap1, Houtap1, Woutap1] = avg_pool2d_forward(outrb8, 512, Houtrb8, Woutrb8);

	      # layer10 : Fully connected layer
	      lid = 10;
	      [Wl10, bl10] = getWeights(fel, lid, W18_pt, b18_pt, W18_init, b18_init);
	      outa1 = affine_forward(outap1, Wl10, bl10);
	      probs_batch = softmax_forward(outa1);

	      # Store the predictions
	      Y_pred[beg:end,j] = rwRowIndexMax(probs_batch, oneVec, idxSeq_batch);
	      j = j + 1;
	      fel = fel - 1;
	    }
	  }

	}

	generate_dummy_data = function(int N, int C, int Hin, int Win, int K)
	  return (matrix[double] X, matrix[double] Y) {
	  # Build a synthetic dataset with fixed seeds: X holds N linearized images
	  # of C*Hin*Win normal random values; Y is an (N, K) one-hot encoding of
	  # class labels drawn uniformly from [1, K] and rounded.
	  # The column count is the product C*Hin*Win; the fused name `CHinWin`
	  # used previously is not a defined variable.
	  num_features = C*Hin*Win
	  X = rand(rows=N, cols=num_features, pdf="normal", seed=45) #linearized images
	  classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform", seed=46))
	  Y = table(seq(1, N), classes, N, K) #one-hot encoding
	}

	# Problem dimensions of the synthetic target dataset
	N = 64; # number of images
	C = 3; # number of color channels
	Hin = 224; # input image height
	Win = 224; # input image width
	K = 10; # number of classes

	# Synthetic inputs and one-hot labels (the labels are not used below)
	[X, Y] = generate_dummy_data(N, C, Hin, Win, K);

	# Warm-up: a single convolution on the first 8 images so the cuDNN
	# libraries are loaded before the timed section starts
	print("Eagerly loading cuDNN library");
	[W_warm, b_warm] = conv2d_init(96, C, 11, 11, 42);
	X_warm = X[1:8,];
	[out_warm, Hout_warm, Wout_warm] = conv2d_forward(X_warm, W_warm, b_warm, C, Hin, Win, 11, 11, 1, 1, 2, 2);
	print(sum(out_warm));

	# Timed section: forward passes plus the column sums of the predictions
	print("Starting exploratory feature transfers");
	t_start = time();
	Y_pred = resnet18_forward(X, C, Hin, Win, K);
	R = colSums(Y_pred)
	print(R);

	t_end = time();
	# presumably time() reports nanoseconds, hence the division by 1e6
	elapsed = floor((t_end-t_start)/1000000);
	print("Elapsed time for feature transfers = "+elapsed+" millsec");

	# Persist the per-configuration column sums to the first script argument
	write(R, $1)