| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| conv2d_forward = function(matrix[double] X, matrix[double] W, matrix[double] b, |
| int C, int Hin, int Win, int Hf, int Wf, int strideh, int stridew, |
| int padh, int padw) return (matrix[double] out, int Hout, int Wout) |
| { |
| N = nrow(X) |
| F = nrow(W) |
| Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1)) |
| Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1)) |
| # Convolution - built-in implementation |
| out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf], |
| stride=[strideh,stridew], padding=[padh,padw]) |
| # Add bias term to each output filter |
| out = bias_add(out, b) |
| } |
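| 
| # A minimal usage sketch (hypothetical shapes, not part of the pipeline below): |
| # one 3-channel 8x8 image and four 3x3 filters at stride 1 with padding 1 |
| # preserve the spatial size, since Hout = floor((8 + 2*1 - 3)/1 + 1) = 8: |
| # |
| # Xs = rand(rows=1, cols=3*8*8, pdf="normal", seed=7) |
| # [Ws, bs] = conv2d_init(4, 3, 3, 3, 7) |
| # [outs, Ho, Wo] = conv2d_forward(Xs, Ws, bs, 3, 8, 8, 3, 3, 1, 1, 1, 1) |
| # print(nrow(outs) + " x " + ncol(outs)) # 1 x 256, i.e. (N, F*Hout*Wout) |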
| |
| conv2d_backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X, |
| matrix[double] W, matrix[double] b, int C, int Hin, int Win, int Hf, int Wf, |
| int strideh, int stridew, int padh, int padw) |
| return (matrix[double] dX, matrix[double] dW, matrix[double] db) |
| { |
| N = nrow(X) |
| F = nrow(W) |
| # Partial derivatives for convolution - built-in implementation |
| dW = conv2d_backward_filter(X, dout, stride=[strideh,stridew], padding=[padh,padw], |
| input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf]) |
| dX = conv2d_backward_data(W, dout, stride=[strideh,stridew], padding=[padh,padw], |
| input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf]) |
| # Partial derivatives for the bias vector: sum each column of dout, |
| # reshape to (F, Hout*Wout), and sum each row to obtain the |
| # gradient sum for each filter. |
| db = rowSums(matrix(colSums(dout), rows=F, cols=Hout*Wout)) # shape (F, 1) |
| } |
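| 
| # Shape walk-through for the bias gradient (hypothetical numbers): with N=1, |
| # F=2 filters and Hout*Wout=3 positions, colSums(dout) is (1, 6); reshaping |
| # to (2, 3) groups each filter's spatial sums into one row, so rowSums |
| # yields the (2, 1) per-filter gradient: |
| # |
| # douts = matrix(seq(1, 6), rows=1, cols=6) |
| # dbs = rowSums(matrix(colSums(douts), rows=2, cols=3)) # -> [6; 15] |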
| |
| conv2d_init = function(int F, int C, int Hf, int Wf, int seed = -1) |
| return (matrix[double] W, matrix[double] b) { |
| # He et al. initialization: scale a standard normal by sqrt(2/fan_in), |
| # with fan_in = C*Hf*Wf, to keep activation variance stable under ReLU |
| W = rand(rows=F, cols=C*Hf*Wf, pdf="normal", seed=seed) * sqrt(2.0/(C*Hf*Wf)) |
| b = matrix(0, rows=F, cols=1) |
| } |
| |
| affine_forward = function(matrix[double] X, matrix[double] W, matrix[double] b) return (matrix[double] out) { |
| # b is a (1, M) row vector, broadcast across the rows of X %*% W |
| out = X %*% W + b; |
| } |
| |
| affine_init = function(int D, int M, int seed = -1) return (matrix[double] W, matrix[double] b) { |
| # He et al. initialization with fan_in = D |
| W = rand(rows=D, cols=M, pdf="normal", seed=seed) * sqrt(2.0/D); |
| b = matrix(0, rows=1, cols=M); |
| } |
| |
| relu_forward = function(matrix[double] X) return (matrix[double] out) { |
| out = max(0, X); |
| } |
| |
| max_pool2d_forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf, |
| int strideh, int stridew, int padh, int padw) return(matrix[double] out, int Hout, int Wout) |
| { |
| N = nrow(X) |
| Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1)) |
| Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1)) |
| out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf], |
| stride=[strideh,stridew], padding=[padh,padw]) |
| } |
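| 
| # For the 227x227 AlexNet-style pipeline in predict() below, these formulas |
| # give the following spatial sizes (derived from the layer parameters there): |
| # conv1 11x11/s4/p0: (227-11)/4+1 = 55 pool1 3x3/s2: (55-3)/2+1 = 27 |
| # conv2 5x5/s1/p2: 27 pool2 3x3/s2: (27-3)/2+1 = 13 |
| # conv3-5 3x3/s1/p1: 13 pool3 3x3/s2: (13-3)/2+1 = 6 |
| # hence the first affine layer expects F5*6*6 = 256*36 = 9216 inputs. |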
| |
| softmax_forward = function(matrix[double] scores) return (matrix[double] probs) { |
| scores = scores - rowMaxs(scores); # numerical stability |
| unnorm_probs = exp(scores); # unnormalized probabilities |
| probs = unnorm_probs / rowSums(unnorm_probs); # normalized probabilities |
| } |
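| 
| # Subtracting the row max leaves the result unchanged, since |
| # exp(s - c) / sum(exp(s - c)) = exp(s) / sum(exp(s)) for any constant c; |
| # it only keeps exp() from overflowing. Quick check (hypothetical values): |
| # |
| # s = matrix("1000 1001", rows=1, cols=2) |
| # p = softmax_forward(s) |
| # print(toString(p)) # ~[0.269, 0.731], no overflow |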
| |
| # Select the weights for layer lid: layers below fel (the first experimental |
| # layer) keep their pretrained weights; layers from fel onward use the |
| # freshly initialized ones. |
| getWeights = function(int fel, int lid, |
| matrix[double] W_pt, matrix[double] b_pt, |
| matrix[double] W_init, matrix[double] b_init) |
| return (matrix[double] Wl, matrix[double] bl) |
| { |
| if (lid < fel) { #use pretrained weights |
| Wl = W_pt; |
| bl = b_pt; |
| print("Using pretrained weights for layer "+lid); |
| } |
| else { #use initialized weights |
| Wl = W_init; |
| bl = b_init; |
| print("Initialize weights for layer "+lid); |
| } |
| } |
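| 
| # Example (hypothetical call): with fel = 6, layers 1-5 keep their pretrained |
| # weights and layers 6-8 run freshly initialized, i.e. the pretrained |
| # convolutional stack acts as a fixed feature extractor for a new classifier: |
| # |
| # [Wl, bl] = getWeights(6, 3, W3, b3, W3_init, b3_init) # -> pretrained W3 |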
| |
| # Row-wise argmax built from elementwise ops: oneVec must be a (1, ncol(X)) |
| # vector of ones, and idxSeq an (nrow(X), ncol(X)) matrix whose every row |
| # is the sequence 1..ncol(X); ties resolve to the largest column index. |
| rwRowIndexMax = function(matrix[double] X, matrix[double] oneVec, matrix[double] idxSeq) |
| return (matrix[double] index) { |
| rm = rowMaxs(X) %*% oneVec; # replicate each row's max across the row |
| I = X == rm; # 0/1 indicator of the max positions |
| index = rowMaxs(I * idxSeq); # largest matching column index |
| } |
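| 
| # Usage sketch (hypothetical values): for a single row, the helper returns |
| # the 1-based column index of the maximum, matching the built-in |
| # rowIndexMax up to tie-breaking: |
| # |
| # Xs = matrix("0.1 0.7 0.2", rows=1, cols=3) |
| # ones = matrix(1, rows=1, cols=3) |
| # iseq = t(seq(1, 3)) |
| # idx = rwRowIndexMax(Xs, ones, iseq) |
| # print(as.scalar(idx)) # 2 |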
| |
| predict = function(matrix[double] X, int C, int Hin, int Win, int K, |
| matrix[double] W1, matrix[double] b1, matrix[double] W2, matrix[double] b2, |
| matrix[double] W3, matrix[double] b3, matrix[double] W4, matrix[double] b4, |
| matrix[double] W5, matrix[double] b5, matrix[double] W6, matrix[double] b6, |
| matrix[double] W7, matrix[double] b7, matrix[double] W8, matrix[double] b8) |
| return (matrix[double] Y_pred) |
| { |
| N = nrow(X); |
| # Filter sizes are passed explicitly per layer below (11x11, 5x5, 3x3) |
| # Define filters |
| F1 = 96; |
| F2 = 256; |
| F3 = 384; |
| F4 = 384; |
| F5 = 256; |
| N3 = 4096; |
| # Define strides |
| s4 = 4; |
| s2 = 2; |
| s1 = 1; |
| # Define pads |
| pad2 = 2; |
| pad1 = 1; |
| pad0 = 0; |
| |
| # Initialize the weights for the non-transferred layers |
| [W1_init, b1_init] = conv2d_init(F1, C, Hf=11, Wf=11, 43); |
| [W2_init, b2_init] = conv2d_init(F2, F1, Hf=5, Wf=5, 43); |
| [W3_init, b3_init] = conv2d_init(F3, F2, Hf=3, Wf=3, 43); |
| [W4_init, b4_init] = conv2d_init(F4, F3, Hf=3, Wf=3, 43); |
| [W5_init, b5_init] = conv2d_init(F5, F4, Hf=3, Wf=3, 43); |
| [W6_init, b6_init] = affine_init(9216, N3, 43); # 9216 = F5*6*6 (flattened pool3) |
| [W7_init, b7_init] = affine_init(N3, N3, 43); |
| [W8_init, b8_init] = affine_init(N3, K, 43); |
| W8_init = W8_init/sqrt(2); # scale down the final layer (no ReLU follows layer 8) |
| |
| # Compute predictions over mini-batches |
| verbose = FALSE; |
| Y_pred = matrix(0, rows=N, cols=4); # one column per transfer trial (fel = 8..5) |
| batch_size = 64; |
| oneVec = matrix(1, rows=1, cols=K); |
| # Note: idxSeq assumes full batches, i.e. batch_size divides N (true here) |
| idxSeq = matrix(1, rows=batch_size, cols=1) %*% t(seq(1, K)); |
| iters = ceil(N / batch_size); |
| for (i in 1:iters) { |
| # Get next batch |
| beg = ((i-1) * batch_size) %% N + 1; |
| end = min(N, beg+batch_size-1); |
| X_batch = X[beg:end,]; |
| |
| j = 1; |
| fel = 8; # first experimental layer; decreases from 8 to 5 across trials |
| while (j < 5) { # trial j keeps layers 1..fel-1 pretrained, re-initializes the rest |
| # Compute forward pass |
| # layer 1: conv1 -> relu1 -> pool1 |
| lid = 1; |
| [Wl1, bl1] = getWeights(fel, lid, W1, b1, W1_init, b1_init); |
| [outc1, Houtc1, Woutc1] = conv2d_forward(X_batch, Wl1, bl1, C, Hin, Win, Hf=11, Wf=11, |
| s4, s4, pad0, pad0); |
| if(verbose) print("sum(conv1) = "+sum(outc1)); |
| if(verbose) print(nrow(outc1)+", "+ncol(outc1)); |
| outr1 = relu_forward(outc1); |
| [outp1, Houtp1, Woutp1] = max_pool2d_forward(outr1, F1, Houtc1, Woutc1, Hf=3, Wf=3, |
| strideh=s2, stridew=s2, padh=0, padw=0); |
| if(verbose) print("sum(pool1) = "+sum(outp1)); |
| if(verbose) print(nrow(outp1)+", "+ncol(outp1)); |
| |
| # layer 2: conv2 -> relu2 -> pool2 |
| lid = 2; |
| [Wl2, bl2] = getWeights(fel, lid, W2, b2, W2_init, b2_init); |
| [outc2, Houtc2, Woutc2] = conv2d_forward(outp1, Wl2, bl2, F1, Houtp1, Woutp1, Hf=5, |
| Wf=5, s1, s1, pad2, pad2); |
| if(verbose) print("sum(conv2) = "+sum(outc2)); |
| if(verbose) print(nrow(outc2)+", "+ncol(outc2)); |
| outr2 = relu_forward(outc2); |
| [outp2, Houtp2, Woutp2] = max_pool2d_forward(outr2, F2, Houtc2, Woutc2, Hf=3, Wf=3, |
| strideh=s2, stridew=s2, padh=0, padw=0); |
| if(verbose) print("sum(pool2) = "+sum(outp2)); |
| if(verbose) print(nrow(outp2)+", "+ncol(outp2)); |
| |
| # layer 3: conv3 -> relu3 |
| lid = 3; |
| [Wl3, bl3] = getWeights(fel, lid, W3, b3, W3_init, b3_init); |
| [outc3, Houtc3, Woutc3] = conv2d_forward(outp2, Wl3, bl3, F2, Houtp2, Woutp2, Hf=3, |
| Wf=3, s1, s1, pad1, pad1); |
| if(verbose) print("sum(conv3) = "+sum(outc3)); |
| if(verbose) print(nrow(outc3)+", "+ncol(outc3)); |
| outr3 = relu_forward(outc3); |
| |
| # layer 4: conv4 -> relu4 |
| lid = 4; |
| [Wl4, bl4] = getWeights(fel, lid, W4, b4, W4_init, b4_init); |
| [outc4, Houtc4, Woutc4] = conv2d_forward(outr3, Wl4, bl4, F3, Houtc3, Woutc3, Hf=3, |
| Wf=3, s1, s1, pad1, pad1); |
| if(verbose) print("sum(conv4) = "+sum(outc4)); |
| if(verbose) print(nrow(outc4)+", "+ncol(outc4)); |
| outr4 = relu_forward(outc4); |
| |
| # layer 5: conv5 -> relu5 -> pool3 |
| lid = 5; |
| [Wl5, bl5] = getWeights(fel, lid, W5, b5, W5_init, b5_init); |
| [outc5, Houtc5, Woutc5] = conv2d_forward(outr4, Wl5, bl5, F4, Houtc4, Woutc4, Hf=3, |
| Wf=3, s1, s1, pad1, pad1); |
| if(verbose) print("sum(conv5) = "+sum(outc5)); |
| if(verbose) print(nrow(outc5)+", "+ncol(outc5)); |
| outr5 = relu_forward(outc5); |
| [outp5, Houtp5, Woutp5] = max_pool2d_forward(outr5, F5, Houtc5, Woutc5, Hf=3, Wf=3, |
| strideh=s2, stridew=s2, padh=0, padw=0); |
| if(verbose) print("sum(pool3) = "+sum(outp5)); |
| if(verbose) print(nrow(outp5)+", "+ncol(outp5)); |
| |
| # layer 6: affine1 -> relu6 |
| lid = 6; |
| [Wl6, bl6] = getWeights(fel, lid, W6, b6, W6_init, b6_init); |
| outa6 = affine_forward(outp5, Wl6, bl6); |
| if(verbose) print(nrow(outa6)+", "+ncol(outa6)); |
| outr6 = relu_forward(outa6); |
| |
| # layer 7: affine2 -> relu7 |
| lid = 7; |
| [Wl7, bl7] = getWeights(fel, lid, W7, b7, W7_init, b7_init); |
| outa7 = affine_forward(outr6, Wl7, bl7); |
| if(verbose) print(nrow(outa7)+", "+ncol(outa7)); |
| outr7 = relu_forward(outa7); |
| |
| # layer 8: affine3 -> softmax |
| lid = 8; |
| [Wl8, bl8] = getWeights(fel, lid, W8, b8, W8_init, b8_init); |
| outa8 = affine_forward(outr7, Wl8, bl8); |
| if(verbose) print(nrow(outa8)+", "+ncol(outa8)); |
| probs_batch = softmax_forward(outa8); |
| |
| # Store the predictions |
| Y_pred[beg:end,j] = rwRowIndexMax(probs_batch, oneVec, idxSeq); |
| j = j + 1; |
| fel = fel - 1; |
| } |
| } |
| } |
| |
| generate_dummy_data = function(int N, int C, int Hin, int Win, int K) |
| return (matrix[double] X, matrix[double] Y) { |
| X = rand(rows=N, cols=C*Hin*Win, pdf="normal", seed=45) #linearized images |
| classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform", seed=46)) |
| Y = table(seq(1, N), classes, N, K) #one-hot encoding |
| } |
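| 
| # The table() call builds the one-hot labels directly: row i gets a 1 in |
| # column classes[i] and zeros elsewhere. Tiny check (hypothetical labels): |
| # |
| # cls = matrix("2 1 3", rows=3, cols=1) |
| # Yt = table(seq(1, 3), cls, 3, 3) # rows: [0 1 0], [1 0 0], [0 0 1] |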
| |
| # Data dimensions and settings (dummy data is generated below) |
| N = 64; #num of training images |
| Nval = 512; #num of validation images |
| Ntest = 64; #num of test images |
| C = 3; #num of color channels |
| Hin = 227; #input image height |
| Win = 227; #input image width |
| K = 10; #num of classes |
| epochs = 1; |
| |
| # Generate dummy data |
| [X, Y] = generate_dummy_data(N, C, Hin, Win, K); |
| [X_val, Y_val] = generate_dummy_data(Nval, C, Hin, Win, K); |
| [X_test, Y_test] = generate_dummy_data(Ntest, C, Hin, Win, K); |
| |
| # Train |
| #[W1, b1, W2, b2] = train(X, Y, X_val, Y_val, C, Hin, Win, epochs); |
| |
| # Predict |
| # Initialize random weights. FIXME: use pretrained weights |
| [W1, b1] = conv2d_init(96, C, Hf=11, Wf=11, 42); |
| [W2, b2] = conv2d_init(256, 96, Hf=5, Wf=5, 42); |
| [W3, b3] = conv2d_init(384, 256, Hf=3, Wf=3, 42); |
| [W4, b4] = conv2d_init(384, 384, Hf=3, Wf=3, 42); |
| [W5, b5] = conv2d_init(256, 384, Hf=3, Wf=3, 42); |
| [W6, b6] = affine_init(9216, 4096, 42); |
| [W7, b7] = affine_init(4096, 4096, 42); |
| [W8, b8] = affine_init(4096, K, 42); |
| W8 = W8/sqrt(2); |
| |
| # Eagerly load the cuDNN library by running one small conv2d call |
| print("Eagerly loading cuDNN library"); |
| [outc1, Houtc1, Woutc1] = conv2d_forward(X[1:8,], W1, b1, C, Hin, Win, 11, 11, 1, 1, 2, 2); |
| print(sum(outc1)); |
| |
| print("Starting exploratory feature transfers"); |
| Y_pred = predict(X_test, C, Hin, Win, K, W1, b1, W2, b2, W3, b3, W4, b4, |
| W5, b5, W6, b6, W7, b7, W8, b8); |
| # $1: output file path (first command-line script argument) |
| write(Y_pred, $1, format="text"); |
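| 
| # Invocation sketch (assuming the standard SystemML command line; the single |
| # positional argument becomes $1, the output path): |
| # |
| # spark-submit SystemML.jar -f <this_script>.dml -stats -args /tmp/Y_pred |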
| |