/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2017 by Contributors
* \file krprod.h
* \brief Core functions for the Khatri-Rao product
* \author Jencir Lee, Chris Swierczewski
*/
#ifndef MXNET_OPERATOR_CONTRIB_KRPROD_H_
#define MXNET_OPERATOR_CONTRIB_KRPROD_H_
#include <algorithm>
#include <utility>
#include <vector>
#include "mshadow/tensor.h"
#include "../operator_common.h"
#include "../c_lapack_api.h"
namespace mxnet {
namespace op {
using namespace mshadow;
using namespace mshadow::expr;
/*!
* \brief Computes row-wise Kronecker product
*
* Given input matrices, this function computes the Kronecker product
* row-wise. E.g. if the input matrices are of shape (3, 2), (3, 4),
* (3, 5), the result matrix will be of shape (3, 2 * 4 * 5), which is
* (3, 40).
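*
* For example, taking the corresponding row from each of two input
* matrices, say x = [a, b] and y = [c, d, e], the output row is the
* Kronecker product x kron y = [a*c, a*d, a*e, b*c, b*d, b*e].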
*
* \param out result matrix
* \param ts_arr vector of input matrices
*/
template <typename DType>
inline void row_wise_kronecker
(Tensor<cpu, 2, DType> out,
const std::vector<Tensor<cpu, 2, DType> > &ts_arr) {
CHECK_GE(ts_arr.size(), 1) << "There must be at least one input matrix.";
// Check all input and output matrices have the same number of rows
// and the output matrix has the right number of columns
int nrows = static_cast<int>(out.size(0));
int ncols = 1;
for (auto & ts : ts_arr) {
CHECK_EQ(nrows, static_cast<int>(ts.size(0)))
<< "All input and output matrices must have the same number of rows.";
ncols *= ts.size(1);
}
CHECK_EQ(ncols, static_cast<int>(out.size(1)));
// Create an intermediate workspace of the same shape as out
//
// Suppose storage holds the result of step i-1; we then compute
// the result of step i into out, the result of step i+1 back into
// storage, and so on, alternating between storage and out as the
// input (given) and output (result) buffers
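//
// For example, with two input matrices the first partial product
// lands in out and the second in storage, so the final copy below
// moves storage back into out; with three inputs the last product
// is already written to out and no copy is needed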
Tensor<cpu, 2, DType> storage(out.shape_);
AllocSpace(&storage);
// Pointers to the given variable and result variable
// We exchange what given and result point to at every step
Tensor<cpu, 2, DType> *given = &storage,
*result = &out, *tmp;
// Compute each intermediate row-wise Kronecker product
storage = 1;
ncols = 1;
for (auto & ts : ts_arr) {
expr::BLASEngine<cpu, DType>::SetStream
(result->stream_);
// Compute the current row-wise Kronecker product
*result = 0;
for (int i = 0; i < nrows; ++i) {
// BLAS signature
//
// dger(
// m : ts.size(1), length of each row of current matrix
// n : ncols, length of each row of previous result
// alpha : 1, scaling applied to the outer product of x and y
// x : ts[i].dptr_, current row of current matrix
// incx : 1, as each element in the row is contiguous
// y : (*given)[i].dptr_, current row of the given variable
// incy : 1, as each element in the row is contiguous
// a : (*result)[i].dptr_, current row of the result variable
// lda : ts.size(1), as the outer product is stored as one row
// which occupies contiguous memory, and as BLASEngine::ger()
// assumes column-major matrix, lda has to be precisely
// the length of x, i.e. ts.size(1)
// )
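//
// In effect, row i of the result becomes the row-major flattening
// of the outer product (*given)[i] * ts[i]^T, i.e. the Kronecker
// product (*given)[i] kron ts[i]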
expr::BLASEngine<cpu, DType>::ger
(result->stream_,
ts.size(1), ncols, 1,
ts[i].dptr_, 1,
(*given)[i].dptr_, 1,
(*result)[i].dptr_, ts.size(1));
}
ncols *= ts.size(1);
tmp = given;
given = result;
result = tmp;
}
// If the final result is stored in storage,
// copy its value to out
if (given != &out)
Copy(out, storage);
FreeSpace(&storage);
}
/*!
* \brief Khatri-Rao product (column-wise Kronecker product)
*
* Given input matrices of shapes (l_1, k), ..., (l_n, k), this function
* computes their column-wise Kronecker product, producing a matrix of
* shape (l_1 * ... * l_n, k).
*
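* For example, for single-column matrices u = [a, b]^T and
* v = [c, d, e]^T, the Khatri-Rao product is the column vector
* [a*c, a*d, a*e, b*c, b*d, b*e]^T.
*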
* \param out result matrix
* \param ts_arr vector of input matrices
*/
template <typename DType>
inline void khatri_rao
(Tensor<cpu, 2, DType> out,
const std::vector<Tensor<cpu, 2, DType> > &ts_arr) {
CHECK_GE(ts_arr.size(), 1) << "There must be at least one input matrix.";
// Check all input and output matrices have the same number
// of columns and the output matrix has the right number of rows
int ncols = static_cast<int>(out.size(1));
int nrows = 1;
for (auto & ts : ts_arr) {
CHECK_EQ(ncols, static_cast<int>(ts.size(1)))
<< "All input and output matrices must have the same number of columns.";
nrows *= ts.size(0);
}
CHECK_EQ(nrows, static_cast<int>(out.size(0)));
// Change the layout of matrices to column-major
Tensor<cpu, 2, DType> out_t(Shape2(out.size(1), out.size(0)));
AllocSpace(&out_t);
flip<cpu, DType>(out.size(0), out.size(1), out_t.dptr_, out_t.stride_,
out.dptr_, out.stride_);
std::vector<Tensor<cpu, 2, DType> > ts_t_arr;
for (int i = 0; i < static_cast<int>(ts_arr.size()); ++i) {
ts_t_arr.emplace_back(Shape2(ts_arr[i].size(1), ts_arr[i].size(0)));
AllocSpace(&ts_t_arr[i]);
flip<cpu, DType>(ts_arr[i].size(0), ts_arr[i].size(1), ts_t_arr[i].dptr_,
ts_t_arr[i].stride_, ts_arr[i].dptr_, ts_arr[i].stride_);
}
// Perform row-wise Kronecker product
row_wise_kronecker(out_t, ts_t_arr);
// Change the layout of result matrix back to row-major
flip<cpu, DType>(out.size(1), out.size(0), out.dptr_, out.stride_,
out_t.dptr_, out_t.stride_);
FreeSpace(&out_t);
for (auto &ts_t : ts_t_arr)
FreeSpace(&ts_t);
}
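// A minimal usage sketch for khatri_rao() (the shapes, values and
// CPU double tensors below are illustrative assumptions, not taken
// from the original tests):
//
//   Tensor<cpu, 2, double> A(Shape2(2, 2)), B(Shape2(3, 2)),
//     out(Shape2(6, 2));
//   AllocSpace(&A); AllocSpace(&B); AllocSpace(&out);
//   // ... fill A and B ...
//   khatri_rao(out, {A, B});  // out(r, c) == A(r / 3, c) * B(r % 3, c)
//   FreeSpace(&A); FreeSpace(&B); FreeSpace(&out);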
/*!
* \brief Moore-Penrose pseudoinverse of the Khatri-Rao product
*
* Given input matrices A_1, ..., A_n of shapes (l_1, k), ..., (l_n, k)
* respectively, the pseudoinverse of their Khatri-Rao product is
*
*   pinv(A_1 khatri-rao A_2 khatri-rao ... khatri-rao A_n) =
*     ((A_1^T A_1) hadamard-dot ... hadamard-dot (A_n^T A_n))^-1
*     (A_1 khatri-rao ... khatri-rao A_n)^T
*
* As the first factor on the right-hand side is a k-by-k square matrix,
* the result always has the same shape as the transpose of the
* Khatri-Rao product of the input matrices. The input argument ts_arr
* may contain either the original input matrices or their transposes.
*
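* For example, with A_1 of shape (2, k) and A_2 of shape (3, k), the
* Khatri-Rao product has shape (6, k), so out must have shape (k, 6).
*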
* \param out result matrix
* \param ts_arr vector of input matrices
* \param input_transposed whether every input matrix is transposed
*/
template <typename DType>
inline void inv_khatri_rao
(Tensor<cpu, 2, DType> out,
const std::vector<Tensor<cpu, 2, DType> > &ts_arr,
bool input_transposed = false) {
CHECK_GE(ts_arr.size(), 1) << "Input tensor array must be non-empty";
// Initialise the Hadamard product to the all-ones matrix of shape
// (k, k), the identity element of element-wise multiplication,
// where k is the common number of columns of the factor matrices
int k = out.size(0);
Tensor<cpu, 2, DType> hadamard_prod(Shape2(k, k));
AllocSpace(&hadamard_prod);
hadamard_prod = 1;
// Note that out has the same shape as the transpose of
// the Khatri-Rao product
//
// When the input is transposed, we can form the transpose of
// the Khatri-Rao product directly in out, then call the linear
// solver, which overwrites out with the final result;
//
// when the input is not transposed, we create an intermediate
// tensor for the Khatri-Rao product, call the linear solver with
// MXNET_LAPACK_COL_MAJOR as the matrix layout, and transpose
// the final result into out
int info;
if (input_transposed) {
row_wise_kronecker(out, ts_arr);
for (auto &ts : ts_arr)
hadamard_prod *= implicit_dot(ts, ts.T());
info = MXNET_LAPACK_posv<DType>(MXNET_LAPACK_ROW_MAJOR, 'U',
k, out.size(1), hadamard_prod.dptr_, hadamard_prod.stride_,
out.dptr_, out.stride_);
} else {
Tensor<cpu, 2, DType> kr(Shape2(out.size(1), out.size(0)));
AllocSpace(&kr);
khatri_rao(kr, ts_arr);
for (auto &ts : ts_arr)
hadamard_prod *= implicit_dot(ts.T(), ts);
info = MXNET_LAPACK_posv<DType>(MXNET_LAPACK_COL_MAJOR, 'U',
k, out.size(1), hadamard_prod.dptr_, hadamard_prod.stride_,
kr.dptr_, kr.stride_);
flip<cpu, DType>(out.size(1), out.size(0), out.dptr_, out.stride_,
kr.dptr_, kr.stride_);
FreeSpace(&kr);
}
FreeSpace(&hadamard_prod);
if (info != 0)
LOG(FATAL) << "The linear solver in inv_khatri_rao() returned " << info;
}
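// A minimal usage sketch for inv_khatri_rao() (hypothetical shapes,
// following the convention above that out holds the pseudoinverse of
// shape (k, l_1 * ... * l_n)):
//
//   Tensor<cpu, 2, double> A(Shape2(2, 2)), B(Shape2(3, 2)),
//     pinv(Shape2(2, 6));
//   AllocSpace(&A); AllocSpace(&B); AllocSpace(&pinv);
//   // ... fill A and B ...
//   inv_khatri_rao(pinv, {A, B});
//   // Now pinv * khatri_rao(A, B) is approximately eye(2)
//   FreeSpace(&A); FreeSpace(&B); FreeSpace(&pinv);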
template<typename xpu, typename DType>
inline void KhatriRaoCompute_(const nnvm::NodeAttrs &attrs,
const OpContext &ctx,
const std::vector<TBlob> &in_data,
const std::vector<OpReqType> &req,
const std::vector<TBlob> &out_data) {
using namespace mxnet_op;
if (req[0] == kNullOp) return;
Stream<xpu> *stream = ctx.get_stream<xpu>();
Tensor<xpu, 2, DType> out = out_data[0].get<xpu, 2, DType>(stream);
std::vector<Tensor<xpu, 2, DType> > ts_arr(in_data.size());
std::transform(in_data.begin(), in_data.end(), ts_arr.begin(),
[&stream](TBlob blob) -> Tensor<xpu, 2, DType> {
return blob.get<xpu, 2, DType>(stream);
});
khatri_rao(out, ts_arr);
}
template<typename xpu>
inline void KhatriRaoCompute(const nnvm::NodeAttrs &attrs,
const OpContext &ctx,
const std::vector<TBlob> &inputs,
const std::vector<OpReqType> &req,
const std::vector<TBlob> &outputs) {
using namespace mxnet_op;
CHECK_EQ(outputs.size(), 1U);
MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
KhatriRaoCompute_<xpu, DType>(attrs, ctx, inputs, req, outputs);
});
}
} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_CONTRIB_KRPROD_H_