src/modules/linalg/metric.cpp - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * @file metric.cpp
  *
  * @brief Metric operations
  *
  *//* ----------------------------------------------------------------------- */

 #include <dbconnector/dbconnector.hpp>
 #include <limits>
 #include <string>
 #include <vector>
 #include <set>
 #include <algorithm>
 #include <sstream>
 #include <cmath>
 #include <boost/algorithm/string.hpp>

 #include "metric.hpp"

 using std::string;

 namespace madlib {

 using namespace dbconnector::postgres;

 namespace modules {

 namespace linalg {

 namespace {

 template <class TupleType>
 struct ReverseLexicographicComparator {
     /**
      * @brief Return true if the first argument is less than the second
      */
     bool operator()(const TupleType& inTuple1, const TupleType& inTuple2) {
         // This could be a real reverse lexicographic comparator in C++11, but
         // lacking variadic template arguments, we simply pretend here that
         // all tuples contain only 2 elements.
         return std::get<1>(inTuple1) < std::get<1>(inTuple2) ||
             (std::get<1>(inTuple1) == std::get<1>(inTuple2) &&
                 std::get<0>(inTuple1) < std::get<0>(inTuple2));
     }
 };

 } // anonymous namespace

 /**
  * @brief Compute the k columns of a matrix that are closest to a vector
  *
  * @tparam DistanceFunction Type of the distance function. This can be a
  *     function pointer, or a class with method <tt>operator()</tt> (like
  *     \c FunctionHandle)
  * @tparam NumClosestColumns The number of closest columns to compute. We assume
  *     that \c outIndicesAndDistances is of size \c NumClosestColumns.
  *
  * @param inMatrix Matrix \f$ M \f$
  * @param inVector Vector \f$ \vec x \f$
  * @param[out] outClosestColumns A list of \c NumClosestColumns pairs
  *     \f$ (i, d) \f$ sorted in ascending order of \f$ d \f$, where $i$ is a
  *     0-based column index in \f$ M \f$ and
  *     \f$ d \f$ is the distance (using \c inMetric) between \f$ M_i \f$ and
  *     \f$ x \f$.
  */
 template <class DistanceFunction, class RandomAccessIterator>
 void
 closestColumnsAndDistances(
     const MappedMatrix& inMatrix,
     const MappedColumnVector& inVector,
     DistanceFunction& inMetric,
     RandomAccessIterator ioFirst,
     RandomAccessIterator ioLast)
 {

     ReverseLexicographicComparator<
         typename std::iterator_traits<RandomAccessIterator>::value_type>
             comparator;

     std::fill(ioFirst, ioLast,
         std::make_tuple(0, std::numeric_limits<double>::infinity()));
     for (Index i = 0; i < inMatrix.cols(); ++i) {
         double currentDist;
         currentDist
             = AnyType_cast<double>(
                     inMetric(MappedColumnVector(inMatrix.col(i)), inVector)
                     );

         // outIndicesAndDistances is a heap, so the first element is maximal
         if (currentDist < std::get<1>(*ioFirst)) {
             // Unfortunately, the STL does not have a decrease-key function,
             // so we are wasting a bit of performance here
             std::pop_heap(ioFirst, ioLast, comparator);
             *(ioLast - 1) = std::make_tuple(i, currentDist);
             std::push_heap(ioFirst, ioLast, comparator);
         }
     }
     std::sort_heap(ioFirst, ioLast, comparator);
 }


 template <class RandomAccessIterator>
 void
 closestColumnsAndDistancesUDF(
     const MappedMatrix& inMatrix,
     const MappedColumnVector& inVector,
     RandomAccessIterator ioFirst,
     RandomAccessIterator ioLast,
     Oid oid)
 {

     ReverseLexicographicComparator<
         typename std::iterator_traits<RandomAccessIterator>::value_type>
             comparator;

     std::fill(ioFirst, ioLast,
         std::make_tuple(0, std::numeric_limits<double>::infinity()));
     for (Index i = 0; i < inMatrix.cols(); ++i) {
         double currentDist;
         currentDist = static_cast<double>(DatumGetFloat8(OidFunctionCall2(
                         oid,
                         PointerGetDatum(VectorToNativeArray(inMatrix.col(i))),
                         PointerGetDatum(VectorToNativeArray(inVector))
                         )));

         // outIndicesAndDistances is a heap, so the first element is maximal
         if (currentDist < std::get<1>(*ioFirst)) {
             // Unfortunately, the STL does not have a decrease-key function,
             // so we are wasting a bit of performance here
             std::pop_heap(ioFirst, ioLast, comparator);
             *(ioLast - 1) = std::make_tuple(i, currentDist);
             std::push_heap(ioFirst, ioLast, comparator);
         }
     }
     std::sort_heap(ioFirst, ioLast, comparator);
 }

 double
 distPNorm(const MappedColumnVector& inX, const MappedColumnVector& inY, double p) {
     if (inX.size() != inY.size()) {
         throw std::runtime_error("Found input arrays of "
                 "different lengths unexpectedly.");
     }
     if (p <= 0 || std::isnan(p)) {
         throw std::runtime_error("Expect input p to be positive.");
     }

     if (!std::isfinite(p)) {
         return (inX - inY).lpNorm<Eigen::Infinity>();
     } else {
         double res = 0.0;
         for (int i = 0; i < inX.size(); i++) {
             res += std::pow(std::abs(inX(i) - inY(i)), p);
         }
         return std::pow(res, 1./p);
     }
 }

 double
 distNorm1(const MappedColumnVector& inX, const MappedColumnVector& inY) {
     if (inX.size() != inY.size()) {
         throw std::runtime_error("Found input arrays of "
                 "different lengths unexpectedly.");
     }

     return (inX - inY).lpNorm<1>();
 }

 double
 distNorm2(const MappedColumnVector& inX, const MappedColumnVector& inY) {
     if (inX.size() != inY.size()) {
         throw std::runtime_error("Found input arrays of "
                 "different lengths unexpectedly.");
     }

     return (inX - inY).norm();
 }

 double
 cosineSimilarity(const MappedColumnVector& inX, const MappedColumnVector& inY) {
     if (inX.size() != inY.size()) {
         throw std::runtime_error("Found input arrays of "
                 "different lengths unexpectedly.");
     }

 	double xnorm = inX.norm(), ynorm = inY.norm();
 	if (xnorm < std::numeric_limits<double>::denorm_min()
 		|| ynorm < std::numeric_limits<double>::denorm_min()) {
         return -1;
     }

     return inX.dot(inY) / (xnorm * ynorm);
 }

 double
 squaredDistNorm2(const MappedColumnVector& inX, const MappedColumnVector& inY) {
     if (inX.size() != inY.size()) {
         throw std::runtime_error("Found input arrays of "
                 "different lengths unexpectedly.");
     }

     return (inX - inY).squaredNorm();
 }

 double
 distAngle(const MappedColumnVector& inX, const MappedColumnVector& inY) {
     if (inX.size() != inY.size()) {
         throw std::runtime_error("Found input arrays of "
                 "different lengths unexpectedly.");
     }

 	// Deal with the undefined case where one of the norm is zero
 	// Angle is not defined. Just return \pi.
 	double xnorm = inX.norm(), ynorm = inY.norm();
 	if (xnorm < std::numeric_limits<double>::denorm_min()
 		|| ynorm < std::numeric_limits<double>::denorm_min())
 		return std::acos(-1);

     double cosine = dot(inX, inY) / (xnorm * ynorm);
     if (cosine > 1)
         cosine = 1;
     else if (cosine < -1)
         cosine = -1;
     return std::acos(cosine);
 }

 double
 distTanimoto(const MappedColumnVector& inX, const MappedColumnVector& inY) {
     if (inX.size() != inY.size()) {
         throw std::runtime_error("Found input arrays of "
                 "different lengths unexpectedly.");
     }

     // Note that this is not a metric in general!
     double dotProduct = dot(inX, inY);
     double tanimoto = inX.squaredNorm() + inY.squaredNorm();
     return (tanimoto - 2 * dotProduct) / (tanimoto - dotProduct);
 }

 double
 distJaccard(const ArrayHandle<text*>& inX, const ArrayHandle<text*>& inY) {
     if (inX.size() == 0 && inY.size() == 0)
         return 0.0;  // both empty are treated as zero distance
     if (inX.size() == 0 || inY.size() == 0)
         return 1.0;   // one of the sets being empty is treated as max distance

     std::set<string> x_set;
     for (size_t i = 0; i < inX.size(); i++){
         x_set.insert(std::string(VARDATA_ANY(inX[i]),
                                  VARSIZE_ANY(inX[i]) - VARHDRSZ));
     }

     size_t n_intersection = 0;
     size_t n_union = x_set.size();
     std::set<string> y_set;
     for (size_t i = 0; i < inY.size(); i++){
         string y_elem = std::string(VARDATA_ANY(inY[i]),
                                     VARSIZE_ANY(inY[i]) - VARHDRSZ);
         if (y_set.count(y_elem) == 0){
             // for sets, count returns 1 if an element with that value
             // exists in the container, and zero otherwise.

             if (x_set.count(y_elem) == 1){
                 // element present in both sets (already counted for union)
                 n_intersection++;
             }
             else{
                 // element only in inY, not yet counted for union
                 n_union++;
             }
         }
         y_set.insert(y_elem);
     }
     return 1.0 - static_cast<double>(n_intersection) / static_cast<double>(n_union);
 }

 /**
  * @brief Compute the k columns of a matrix that are closest to a vector
  *
  * For performance, we cheat here: For the following four distance functions, we
  * take a special shortcut.
  * FIXME: FunctionHandle should be tuned so that this shortcut no longer
  * impacts performance by more than, say, ~10%.
  */

 std::string dist_fn_name(string s)
 {
     std::istringstream ss(s);
     std::string token, fname;
     if (std::getline(ss, token, '.')) fname = token; // suppose there is no schema name
     if (std::getline(ss, token, '.')) fname = token; // previous part is schema name
     return fname;
 }


 template <class RandomAccessIterator>
 inline
 void
 closestColumnsAndDistancesShortcut(
     const MappedMatrix& inMatrix,
     const MappedColumnVector& inVector,
     FunctionHandle &inDist,
     std::string fname,
     RandomAccessIterator ioFirst,
     RandomAccessIterator ioLast) {

     // Sorted in the order of expected use
     if (fname.compare("squared_dist_norm2") == 0)
         closestColumnsAndDistances(inMatrix, inVector, squaredDistNorm2,
             ioFirst, ioLast);
     else if (fname.compare("dist_norm2") == 0)
         closestColumnsAndDistances(inMatrix, inVector, distNorm2,
             ioFirst, ioLast);
     else if (fname.compare("dist_norm1") == 0)
         closestColumnsAndDistances(inMatrix, inVector, distNorm1,
             ioFirst, ioLast);
     else if (fname.compare("dist_angle") == 0)
         closestColumnsAndDistances(inMatrix, inVector, distAngle,
             ioFirst, ioLast);
     else if (fname.compare("dist_tanimoto") == 0) {
         closestColumnsAndDistances(inMatrix, inVector, distTanimoto,
             ioFirst, ioLast);
     } else {
         closestColumnsAndDistancesUDF(inMatrix, inVector, ioFirst,
                 ioLast, inDist.funcID());
     }
 }


 /**
  * @brief Compute the minimum distance between a vector and any column of a
  *     matrix
  *
  * This function calls a user-supplied function, for which it does not do
  * garbage collection. It is therefore meant to be called only constantly many
  * times before control is returned to the backend.
  */
 AnyType
 closest_column::run(AnyType& args) {
     //if (true) throw std::runtime_error("Begin cc run\n");
     try{
         MappedMatrix M = args[0].getAs<MappedMatrix>();
         MappedColumnVector x = args[1].getAs<MappedColumnVector>();
         FunctionHandle dist = args[2].getAs<FunctionHandle>()
             .unsetFunctionCallOptions(FunctionHandle::GarbageCollectionAfterCall);
         string dist_fname = args[3].getAs<char *>();
         std::string fname = dist_fn_name(dist_fname);
         std::tuple<Index, double> result;
         closestColumnsAndDistancesShortcut(M, x, dist, fname, &result, &result + 1);

         AnyType tuple;
         return tuple
             << static_cast<int32_t>(std::get<0>(result))
             << std::get<1>(result);
     }catch (const ArrayWithNullException &e) {
         return Null();
     }
 }

 /**
  * @brief Compute the minimum distance between a vector and any column of a
  *     matrix
  *
  * This function calls a user-supplied function, for which it does not do
  * garbage collection. It is therefore meant to be called only constantly many
  * times before control is returned to the backend.
   */
 AnyType
 closest_columns::run(AnyType& args) {

     /* If the input has a null value, we want to return nothing for that
     *  particular data point (because we cannot calculate the distance)
     *  instead of failing.
     */
     try{
         MappedMatrix M = args[0].getAs<MappedMatrix>();
         MappedColumnVector x = args[1].getAs<MappedColumnVector>();
         uint32_t num = args[2].getAs<uint32_t>();
         FunctionHandle dist = args[3].getAs<FunctionHandle>()
             .unsetFunctionCallOptions(FunctionHandle::GarbageCollectionAfterCall);
         string dist_fname = args[4].getAs<char *>();

         std::string fname = dist_fn_name(dist_fname);

         std::vector<std::tuple<Index, double> > result(num);
         closestColumnsAndDistancesShortcut(M, x, dist, fname, result.begin(),
             result.end());

         MutableArrayHandle<int32_t> indices = allocateArray<int32_t,
             dbal::FunctionContext, dbal::DoNotZero, dbal::ThrowBadAlloc>(num);
         MutableArrayHandle<double> distances = allocateArray<double,
             dbal::FunctionContext, dbal::DoNotZero, dbal::ThrowBadAlloc>(num);
         for (uint32_t i = 0; i < num; ++i)
             std::tie(indices[i], distances[i]) = result[i];

         AnyType tuple;
         return tuple << indices << distances;
     }catch (const ArrayWithNullException &e) {
         return Null();
     }
 }

 AnyType
 norm1::run(AnyType& args) {
     return static_cast<double>(args[0].getAs<MappedColumnVector>().lpNorm<1>());
 }

 AnyType
 norm2::run(AnyType& args) {
     return static_cast<double>(args[0].getAs<MappedColumnVector>().norm());
 }

 AnyType
 dist_inf_norm::run(AnyType& args) {
     return distPNorm(
         args[0].getAs<MappedColumnVector>(),
         args[1].getAs<MappedColumnVector>(),
         std::numeric_limits<double>::infinity()
     );
 }

 AnyType
 dist_pnorm::run(AnyType& args) {
     return distPNorm(
         args[0].getAs<MappedColumnVector>(),
         args[1].getAs<MappedColumnVector>(),
         args[2].getAs<double>()
     );
 }

 AnyType
 dist_norm1::run(AnyType& args) {
     return distNorm1(
         args[0].getAs<MappedColumnVector>(),
         args[1].getAs<MappedColumnVector>()
     );
 }

 AnyType
 dist_norm2::run(AnyType& args) {
     // FIXME: it would be nice to declare this as a template function (so it
     // works for dense and sparse vectors), and the C++ AL takes care of the
     // rest...
     return distNorm2(
         args[0].getAs<MappedColumnVector>(),
         args[1].getAs<MappedColumnVector>()
     );
 }

 AnyType
 cosine_similarity::run(AnyType& args) {
     return cosineSimilarity(
         args[0].getAs<MappedColumnVector>(),
         args[1].getAs<MappedColumnVector>()
     );
 }

 AnyType
 squared_dist_norm2::run(AnyType& args) {
     return squaredDistNorm2(
         args[0].getAs<MappedColumnVector>(),
         args[1].getAs<MappedColumnVector>()
     );
 }

 AnyType
 dist_angle::run(AnyType& args) {
     return distAngle(
         args[0].getAs<MappedColumnVector>(),
         args[1].getAs<MappedColumnVector>()
     );
 }

 AnyType
 dist_tanimoto::run(AnyType& args) {
     return distTanimoto(
         args[0].getAs<MappedColumnVector>(),
         args[1].getAs<MappedColumnVector>()
     );
 }

 AnyType
 dist_jaccard::run(AnyType& args) {
     return distJaccard(
         args[0].getAs<ArrayHandle<text*> >(),
         args[1].getAs<ArrayHandle<text*> >()
     );
 }

 } // namespace linalg

 } // namespace modules

 } // namespace regress
	/* ----------------------------------------------------------------------- //*
	*
	* @file metric.cpp
	*
	* @brief Metric operations
	*
	// ----------------------------------------------------------------------- */

	#include <dbconnector/dbconnector.hpp>
	#include <limits>
	#include <string>
	#include <vector>
	#include <set>
	#include <algorithm>
	#include <sstream>
	#include <cmath>
	#include <boost/algorithm/string.hpp>

	#include "metric.hpp"

	using std::string;

	namespace madlib {

	using namespace dbconnector::postgres;

	namespace modules {

	namespace linalg {

	namespace {

	template <class TupleType>
	struct ReverseLexicographicComparator {
	/**
	* @brief Return true if the first argument is less than the second
	*/
	bool operator()(const TupleType& inTuple1, const TupleType& inTuple2) {
	// This could be a real reverse lexicographic comparator in C++11, but
	// lacking variadic template arguments, we simply pretend here that
	// all tuples contain only 2 elements.
	return std::get<1>(inTuple1) < std::get<1>(inTuple2) \|\|
	(std::get<1>(inTuple1) == std::get<1>(inTuple2) &&
	std::get<0>(inTuple1) < std::get<0>(inTuple2));
	}
	};

	} // anonymous namespace

	/**
	* @brief Compute the k columns of a matrix that are closest to a vector
	*
	* @tparam DistanceFunction Type of the distance function. This can be a
	* function pointer, or a class with method <tt>operator()</tt> (like
	* \c FunctionHandle)
	* @tparam NumClosestColumns The number of closest columns to compute. We assume
	* that \c outIndicesAndDistances is of size \c NumClosestColumns.
	*
	* @param inMatrix Matrix \f$ M \f$
	* @param inVector Vector \f$ \vec x \f$
	* @param[out] outClosestColumns A list of \c NumClosestColumns pairs
	* \f$ (i, d) \f$ sorted in ascending order of \f$ d \f$, where $i$ is a
	* 0-based column index in \f$ M \f$ and
	* \f$ d \f$ is the distance (using \c inMetric) between \f$ M_i \f$ and
	* \f$ x \f$.
	*/
	template <class DistanceFunction, class RandomAccessIterator>
	void
	closestColumnsAndDistances(
	const MappedMatrix& inMatrix,
	const MappedColumnVector& inVector,
	DistanceFunction& inMetric,
	RandomAccessIterator ioFirst,
	RandomAccessIterator ioLast)
	{

	ReverseLexicographicComparator<
	typename std::iterator_traits<RandomAccessIterator>::value_type>
	comparator;

	std::fill(ioFirst, ioLast,
	std::make_tuple(0, std::numeric_limits<double>::infinity()));
	for (Index i = 0; i < inMatrix.cols(); ++i) {
	double currentDist;
	currentDist
	= AnyType_cast<double>(
	inMetric(MappedColumnVector(inMatrix.col(i)), inVector)
	);

	// outIndicesAndDistances is a heap, so the first element is maximal
	if (currentDist < std::get<1>(*ioFirst)) {
	// Unfortunately, the STL does not have a decrease-key function,
	// so we are wasting a bit of performance here
	std::pop_heap(ioFirst, ioLast, comparator);
	*(ioLast - 1) = std::make_tuple(i, currentDist);
	std::push_heap(ioFirst, ioLast, comparator);
	}
	}
	std::sort_heap(ioFirst, ioLast, comparator);
	}


	template <class RandomAccessIterator>
	void
	closestColumnsAndDistancesUDF(
	const MappedMatrix& inMatrix,
	const MappedColumnVector& inVector,
	RandomAccessIterator ioFirst,
	RandomAccessIterator ioLast,
	Oid oid)
	{

	ReverseLexicographicComparator<
	typename std::iterator_traits<RandomAccessIterator>::value_type>
	comparator;

	std::fill(ioFirst, ioLast,
	std::make_tuple(0, std::numeric_limits<double>::infinity()));
	for (Index i = 0; i < inMatrix.cols(); ++i) {
	double currentDist;
	currentDist = static_cast<double>(DatumGetFloat8(OidFunctionCall2(
	oid,
	PointerGetDatum(VectorToNativeArray(inMatrix.col(i))),
	PointerGetDatum(VectorToNativeArray(inVector))
	)));

	// outIndicesAndDistances is a heap, so the first element is maximal
	if (currentDist < std::get<1>(*ioFirst)) {
	// Unfortunately, the STL does not have a decrease-key function,
	// so we are wasting a bit of performance here
	std::pop_heap(ioFirst, ioLast, comparator);
	*(ioLast - 1) = std::make_tuple(i, currentDist);
	std::push_heap(ioFirst, ioLast, comparator);
	}
	}
	std::sort_heap(ioFirst, ioLast, comparator);
	}

	double
	distPNorm(const MappedColumnVector& inX, const MappedColumnVector& inY, double p) {
	if (inX.size() != inY.size()) {
	throw std::runtime_error("Found input arrays of "
	"different lengths unexpectedly.");
	}
	if (p <= 0 \|\| std::isnan(p)) {
	throw std::runtime_error("Expect input p to be positive.");
	}

	if (!std::isfinite(p)) {
	return (inX - inY).lpNorm<Eigen::Infinity>();
	} else {
	double res = 0.0;
	for (int i = 0; i < inX.size(); i++) {
	res += std::pow(std::abs(inX(i) - inY(i)), p);
	}
	return std::pow(res, 1./p);
	}
	}

	double
	distNorm1(const MappedColumnVector& inX, const MappedColumnVector& inY) {
	if (inX.size() != inY.size()) {
	throw std::runtime_error("Found input arrays of "
	"different lengths unexpectedly.");
	}

	return (inX - inY).lpNorm<1>();
	}

	double
	distNorm2(const MappedColumnVector& inX, const MappedColumnVector& inY) {
	if (inX.size() != inY.size()) {
	throw std::runtime_error("Found input arrays of "
	"different lengths unexpectedly.");
	}

	return (inX - inY).norm();
	}

	double
	cosineSimilarity(const MappedColumnVector& inX, const MappedColumnVector& inY) {
	if (inX.size() != inY.size()) {
	throw std::runtime_error("Found input arrays of "
	"different lengths unexpectedly.");
	}

	double xnorm = inX.norm(), ynorm = inY.norm();
	if (xnorm < std::numeric_limits<double>::denorm_min()
	\|\| ynorm < std::numeric_limits<double>::denorm_min()) {
	return -1;
	}

	return inX.dot(inY) / (xnorm * ynorm);
	}

	double
	squaredDistNorm2(const MappedColumnVector& inX, const MappedColumnVector& inY) {
	if (inX.size() != inY.size()) {
	throw std::runtime_error("Found input arrays of "
	"different lengths unexpectedly.");
	}

	return (inX - inY).squaredNorm();
	}

	double
	distAngle(const MappedColumnVector& inX, const MappedColumnVector& inY) {
	if (inX.size() != inY.size()) {
	throw std::runtime_error("Found input arrays of "
	"different lengths unexpectedly.");
	}

	// Deal with the undefined case where one of the norm is zero
	// Angle is not defined. Just return \pi.
	double xnorm = inX.norm(), ynorm = inY.norm();
	if (xnorm < std::numeric_limits<double>::denorm_min()
	\|\| ynorm < std::numeric_limits<double>::denorm_min())
	return std::acos(-1);

	double cosine = dot(inX, inY) / (xnorm * ynorm);
	if (cosine > 1)
	cosine = 1;
	else if (cosine < -1)
	cosine = -1;
	return std::acos(cosine);
	}

	double
	distTanimoto(const MappedColumnVector& inX, const MappedColumnVector& inY) {
	if (inX.size() != inY.size()) {
	throw std::runtime_error("Found input arrays of "
	"different lengths unexpectedly.");
	}

	// Note that this is not a metric in general!
	double dotProduct = dot(inX, inY);
	double tanimoto = inX.squaredNorm() + inY.squaredNorm();
	return (tanimoto - 2 * dotProduct) / (tanimoto - dotProduct);
	}

	double
	distJaccard(const ArrayHandle<text>& inX, const ArrayHandle<text>& inY) {
	if (inX.size() == 0 && inY.size() == 0)
	return 0.0; // both empty are treated as zero distance
	if (inX.size() == 0 \|\| inY.size() == 0)
	return 1.0; // one of the sets being empty is treated as max distance

	std::set<string> x_set;
	for (size_t i = 0; i < inX.size(); i++){
	x_set.insert(std::string(VARDATA_ANY(inX[i]),
	VARSIZE_ANY(inX[i]) - VARHDRSZ));
	}

	size_t n_intersection = 0;
	size_t n_union = x_set.size();
	std::set<string> y_set;
	for (size_t i = 0; i < inY.size(); i++){
	string y_elem = std::string(VARDATA_ANY(inY[i]),
	VARSIZE_ANY(inY[i]) - VARHDRSZ);
	if (y_set.count(y_elem) == 0){
	// for sets, count returns 1 if an element with that value
	// exists in the container, and zero otherwise.

	if (x_set.count(y_elem) == 1){
	// element present in both sets (already counted for union)
	n_intersection++;
	}
	else{
	// element only in inY, not yet counted for union
	n_union++;
	}
	}
	y_set.insert(y_elem);
	}
	return 1.0 - static_cast<double>(n_intersection) / static_cast<double>(n_union);
	}

	/**
	* @brief Compute the k columns of a matrix that are closest to a vector
	*
	* For performance, we cheat here: For the following four distance functions, we
	* take a special shortcut.
	* FIXME: FunctionHandle should be tuned so that this shortcut no longer
	* impacts performance by more than, say, ~10%.
	*/

	std::string dist_fn_name(string s)
	{
	std::istringstream ss(s);
	std::string token, fname;
	if (std::getline(ss, token, '.')) fname = token; // suppose there is no schema name
	if (std::getline(ss, token, '.')) fname = token; // previous part is schema name
	return fname;
	}


	template <class RandomAccessIterator>
	inline
	void
	closestColumnsAndDistancesShortcut(
	const MappedMatrix& inMatrix,
	const MappedColumnVector& inVector,
	FunctionHandle &inDist,
	std::string fname,
	RandomAccessIterator ioFirst,
	RandomAccessIterator ioLast) {

	// Sorted in the order of expected use
	if (fname.compare("squared_dist_norm2") == 0)
	closestColumnsAndDistances(inMatrix, inVector, squaredDistNorm2,
	ioFirst, ioLast);
	else if (fname.compare("dist_norm2") == 0)
	closestColumnsAndDistances(inMatrix, inVector, distNorm2,
	ioFirst, ioLast);
	else if (fname.compare("dist_norm1") == 0)
	closestColumnsAndDistances(inMatrix, inVector, distNorm1,
	ioFirst, ioLast);
	else if (fname.compare("dist_angle") == 0)
	closestColumnsAndDistances(inMatrix, inVector, distAngle,
	ioFirst, ioLast);
	else if (fname.compare("dist_tanimoto") == 0) {
	closestColumnsAndDistances(inMatrix, inVector, distTanimoto,
	ioFirst, ioLast);
	} else {
	closestColumnsAndDistancesUDF(inMatrix, inVector, ioFirst,
	ioLast, inDist.funcID());
	}
	}


	/**
	* @brief Compute the minimum distance between a vector and any column of a
	* matrix
	*
	* This function calls a user-supplied function, for which it does not do
	* garbage collection. It is therefore meant to be called only constantly many
	* times before control is returned to the backend.
	*/
	AnyType
	closest_column::run(AnyType& args) {
	//if (true) throw std::runtime_error("Begin cc run\n");
	try{
	MappedMatrix M = args[0].getAs<MappedMatrix>();
	MappedColumnVector x = args[1].getAs<MappedColumnVector>();
	FunctionHandle dist = args[2].getAs<FunctionHandle>()
	.unsetFunctionCallOptions(FunctionHandle::GarbageCollectionAfterCall);
	string dist_fname = args[3].getAs<char *>();
	std::string fname = dist_fn_name(dist_fname);
	std::tuple<Index, double> result;
	closestColumnsAndDistancesShortcut(M, x, dist, fname, &result, &result + 1);

	AnyType tuple;
	return tuple
	<< static_cast<int32_t>(std::get<0>(result))
	<< std::get<1>(result);
	}catch (const ArrayWithNullException &e) {
	return Null();
	}
	}

	/**
	* @brief Compute the minimum distance between a vector and any column of a
	* matrix
	*
	* This function calls a user-supplied function, for which it does not do
	* garbage collection. It is therefore meant to be called only constantly many
	* times before control is returned to the backend.
	*/
	AnyType
	closest_columns::run(AnyType& args) {

	/* If the input has a null value, we want to return nothing for that
	* particular data point (because we cannot calculate the distance)
	* instead of failing.
	*/
	try{
	MappedMatrix M = args[0].getAs<MappedMatrix>();
	MappedColumnVector x = args[1].getAs<MappedColumnVector>();
	uint32_t num = args[2].getAs<uint32_t>();
	FunctionHandle dist = args[3].getAs<FunctionHandle>()
	.unsetFunctionCallOptions(FunctionHandle::GarbageCollectionAfterCall);
	string dist_fname = args[4].getAs<char *>();

	std::string fname = dist_fn_name(dist_fname);

	std::vector<std::tuple<Index, double> > result(num);
	closestColumnsAndDistancesShortcut(M, x, dist, fname, result.begin(),
	result.end());

	MutableArrayHandle<int32_t> indices = allocateArray<int32_t,
	dbal::FunctionContext, dbal::DoNotZero, dbal::ThrowBadAlloc>(num);
	MutableArrayHandle<double> distances = allocateArray<double,
	dbal::FunctionContext, dbal::DoNotZero, dbal::ThrowBadAlloc>(num);
	for (uint32_t i = 0; i < num; ++i)
	std::tie(indices[i], distances[i]) = result[i];

	AnyType tuple;
	return tuple << indices << distances;
	}catch (const ArrayWithNullException &e) {
	return Null();
	}
	}

	AnyType
	norm1::run(AnyType& args) {
	return static_cast<double>(args[0].getAs<MappedColumnVector>().lpNorm<1>());
	}

	AnyType
	norm2::run(AnyType& args) {
	return static_cast<double>(args[0].getAs<MappedColumnVector>().norm());
	}

	AnyType
	dist_inf_norm::run(AnyType& args) {
	return distPNorm(
	args[0].getAs<MappedColumnVector>(),
	args[1].getAs<MappedColumnVector>(),
	std::numeric_limits<double>::infinity()
	);
	}

	AnyType
	dist_pnorm::run(AnyType& args) {
	return distPNorm(
	args[0].getAs<MappedColumnVector>(),
	args[1].getAs<MappedColumnVector>(),
	args[2].getAs<double>()
	);
	}

	AnyType
	dist_norm1::run(AnyType& args) {
	return distNorm1(
	args[0].getAs<MappedColumnVector>(),
	args[1].getAs<MappedColumnVector>()
	);
	}

	AnyType
	dist_norm2::run(AnyType& args) {
	// FIXME: it would be nice to declare this as a template function (so it
	// works for dense and sparse vectors), and the C++ AL takes care of the
	// rest...
	return distNorm2(
	args[0].getAs<MappedColumnVector>(),
	args[1].getAs<MappedColumnVector>()
	);
	}

	AnyType
	cosine_similarity::run(AnyType& args) {
	return cosineSimilarity(
	args[0].getAs<MappedColumnVector>(),
	args[1].getAs<MappedColumnVector>()
	);
	}

	AnyType
	squared_dist_norm2::run(AnyType& args) {
	return squaredDistNorm2(
	args[0].getAs<MappedColumnVector>(),
	args[1].getAs<MappedColumnVector>()
	);
	}

	AnyType
	dist_angle::run(AnyType& args) {
	return distAngle(
	args[0].getAs<MappedColumnVector>(),
	args[1].getAs<MappedColumnVector>()
	);
	}

	AnyType
	dist_tanimoto::run(AnyType& args) {
	return distTanimoto(
	args[0].getAs<MappedColumnVector>(),
	args[1].getAs<MappedColumnVector>()
	);
	}

	AnyType
	dist_jaccard::run(AnyType& args) {
	return distJaccard(
	args[0].getAs<ArrayHandle<text*> >(),
	args[1].getAs<ArrayHandle<text*> >()
	);
	}

	} // namespace linalg

	} // namespace modules

	} // namespace regress