| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file metric.cpp |
| * |
| * @brief Metric operations |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| #include <dbconnector/dbconnector.hpp> |
| #include <limits> |
| #include <string> |
| #include <vector> |
| #include <set> |
| #include <algorithm> |
| #include <sstream> |
| #include <cmath> |
| #include <boost/algorithm/string.hpp> |
| |
| #include "metric.hpp" |
| |
| using std::string; |
| |
| namespace madlib { |
| |
| using namespace dbconnector::postgres; |
| |
| namespace modules { |
| |
| namespace linalg { |
| |
| namespace { |
| |
| template <class TupleType> |
| struct ReverseLexicographicComparator { |
| /** |
| * @brief Return true if the first argument is less than the second |
| */ |
| bool operator()(const TupleType& inTuple1, const TupleType& inTuple2) { |
| // This could be a real reverse lexicographic comparator in C++11, but |
| // lacking variadic template arguments, we simply pretend here that |
| // all tuples contain only 2 elements. |
| return std::get<1>(inTuple1) < std::get<1>(inTuple2) || |
| (std::get<1>(inTuple1) == std::get<1>(inTuple2) && |
| std::get<0>(inTuple1) < std::get<0>(inTuple2)); |
| } |
| }; |
| |
| } // anonymous namespace |
| |
| /** |
| * @brief Compute the k columns of a matrix that are closest to a vector |
| * |
| * @tparam DistanceFunction Type of the distance function. This can be a |
| * function pointer, or a class with method <tt>operator()</tt> (like |
| * \c FunctionHandle) |
| * @tparam NumClosestColumns The number of closest columns to compute. We assume |
| * that \c outIndicesAndDistances is of size \c NumClosestColumns. |
| * |
| * @param inMatrix Matrix \f$ M \f$ |
| * @param inVector Vector \f$ \vec x \f$ |
| * @param[out] outClosestColumns A list of \c NumClosestColumns pairs |
| * \f$ (i, d) \f$ sorted in ascending order of \f$ d \f$, where $i$ is a |
| * 0-based column index in \f$ M \f$ and |
| * \f$ d \f$ is the distance (using \c inMetric) between \f$ M_i \f$ and |
| * \f$ x \f$. |
| */ |
| template <class DistanceFunction, class RandomAccessIterator> |
| void |
| closestColumnsAndDistances( |
| const MappedMatrix& inMatrix, |
| const MappedColumnVector& inVector, |
| DistanceFunction& inMetric, |
| RandomAccessIterator ioFirst, |
| RandomAccessIterator ioLast) |
| { |
| |
| ReverseLexicographicComparator< |
| typename std::iterator_traits<RandomAccessIterator>::value_type> |
| comparator; |
| |
| std::fill(ioFirst, ioLast, |
| std::make_tuple(0, std::numeric_limits<double>::infinity())); |
| for (Index i = 0; i < inMatrix.cols(); ++i) { |
| double currentDist; |
| currentDist |
| = AnyType_cast<double>( |
| inMetric(MappedColumnVector(inMatrix.col(i)), inVector) |
| ); |
| |
| // outIndicesAndDistances is a heap, so the first element is maximal |
| if (currentDist < std::get<1>(*ioFirst)) { |
| // Unfortunately, the STL does not have a decrease-key function, |
| // so we are wasting a bit of performance here |
| std::pop_heap(ioFirst, ioLast, comparator); |
| *(ioLast - 1) = std::make_tuple(i, currentDist); |
| std::push_heap(ioFirst, ioLast, comparator); |
| } |
| } |
| std::sort_heap(ioFirst, ioLast, comparator); |
| } |
| |
| |
| template <class RandomAccessIterator> |
| void |
| closestColumnsAndDistancesUDF( |
| const MappedMatrix& inMatrix, |
| const MappedColumnVector& inVector, |
| RandomAccessIterator ioFirst, |
| RandomAccessIterator ioLast, |
| Oid oid) |
| { |
| |
| ReverseLexicographicComparator< |
| typename std::iterator_traits<RandomAccessIterator>::value_type> |
| comparator; |
| |
| std::fill(ioFirst, ioLast, |
| std::make_tuple(0, std::numeric_limits<double>::infinity())); |
| for (Index i = 0; i < inMatrix.cols(); ++i) { |
| double currentDist; |
| currentDist = static_cast<double>(DatumGetFloat8(OidFunctionCall2( |
| oid, |
| PointerGetDatum(VectorToNativeArray(inMatrix.col(i))), |
| PointerGetDatum(VectorToNativeArray(inVector)) |
| ))); |
| |
| // outIndicesAndDistances is a heap, so the first element is maximal |
| if (currentDist < std::get<1>(*ioFirst)) { |
| // Unfortunately, the STL does not have a decrease-key function, |
| // so we are wasting a bit of performance here |
| std::pop_heap(ioFirst, ioLast, comparator); |
| *(ioLast - 1) = std::make_tuple(i, currentDist); |
| std::push_heap(ioFirst, ioLast, comparator); |
| } |
| } |
| std::sort_heap(ioFirst, ioLast, comparator); |
| } |
| |
| double |
| distPNorm(const MappedColumnVector& inX, const MappedColumnVector& inY, double p) { |
| if (inX.size() != inY.size()) { |
| throw std::runtime_error("Found input arrays of " |
| "different lengths unexpectedly."); |
| } |
| if (p <= 0 || std::isnan(p)) { |
| throw std::runtime_error("Expect input p to be positive."); |
| } |
| |
| if (!std::isfinite(p)) { |
| return (inX - inY).lpNorm<Eigen::Infinity>(); |
| } else { |
| double res = 0.0; |
| for (int i = 0; i < inX.size(); i++) { |
| res += std::pow(std::abs(inX(i) - inY(i)), p); |
| } |
| return std::pow(res, 1./p); |
| } |
| } |
| |
| double |
| distNorm1(const MappedColumnVector& inX, const MappedColumnVector& inY) { |
| if (inX.size() != inY.size()) { |
| throw std::runtime_error("Found input arrays of " |
| "different lengths unexpectedly."); |
| } |
| |
| return (inX - inY).lpNorm<1>(); |
| } |
| |
| double |
| distNorm2(const MappedColumnVector& inX, const MappedColumnVector& inY) { |
| if (inX.size() != inY.size()) { |
| throw std::runtime_error("Found input arrays of " |
| "different lengths unexpectedly."); |
| } |
| |
| return (inX - inY).norm(); |
| } |
| |
| double |
| cosineSimilarity(const MappedColumnVector& inX, const MappedColumnVector& inY) { |
| if (inX.size() != inY.size()) { |
| throw std::runtime_error("Found input arrays of " |
| "different lengths unexpectedly."); |
| } |
| |
| double xnorm = inX.norm(), ynorm = inY.norm(); |
| if (xnorm < std::numeric_limits<double>::denorm_min() |
| || ynorm < std::numeric_limits<double>::denorm_min()) { |
| return -1; |
| } |
| |
| return inX.dot(inY) / (xnorm * ynorm); |
| } |
| |
| double |
| squaredDistNorm2(const MappedColumnVector& inX, const MappedColumnVector& inY) { |
| if (inX.size() != inY.size()) { |
| throw std::runtime_error("Found input arrays of " |
| "different lengths unexpectedly."); |
| } |
| |
| return (inX - inY).squaredNorm(); |
| } |
| |
| double |
| distAngle(const MappedColumnVector& inX, const MappedColumnVector& inY) { |
| if (inX.size() != inY.size()) { |
| throw std::runtime_error("Found input arrays of " |
| "different lengths unexpectedly."); |
| } |
| |
| // Deal with the undefined case where one of the norm is zero |
| // Angle is not defined. Just return \pi. |
| double xnorm = inX.norm(), ynorm = inY.norm(); |
| if (xnorm < std::numeric_limits<double>::denorm_min() |
| || ynorm < std::numeric_limits<double>::denorm_min()) |
| return std::acos(-1); |
| |
| double cosine = dot(inX, inY) / (xnorm * ynorm); |
| if (cosine > 1) |
| cosine = 1; |
| else if (cosine < -1) |
| cosine = -1; |
| return std::acos(cosine); |
| } |
| |
| double |
| distTanimoto(const MappedColumnVector& inX, const MappedColumnVector& inY) { |
| if (inX.size() != inY.size()) { |
| throw std::runtime_error("Found input arrays of " |
| "different lengths unexpectedly."); |
| } |
| |
| // Note that this is not a metric in general! |
| double dotProduct = dot(inX, inY); |
| double tanimoto = inX.squaredNorm() + inY.squaredNorm(); |
| return (tanimoto - 2 * dotProduct) / (tanimoto - dotProduct); |
| } |
| |
| double |
| distJaccard(const ArrayHandle<text*>& inX, const ArrayHandle<text*>& inY) { |
| if (inX.size() == 0 && inY.size() == 0) |
| return 0.0; // both empty are treated as zero distance |
| if (inX.size() == 0 || inY.size() == 0) |
| return 1.0; // one of the sets being empty is treated as max distance |
| |
| std::set<string> x_set; |
| for (size_t i = 0; i < inX.size(); i++){ |
| x_set.insert(std::string(VARDATA_ANY(inX[i]), |
| VARSIZE_ANY(inX[i]) - VARHDRSZ)); |
| } |
| |
| size_t n_intersection = 0; |
| size_t n_union = x_set.size(); |
| std::set<string> y_set; |
| for (size_t i = 0; i < inY.size(); i++){ |
| string y_elem = std::string(VARDATA_ANY(inY[i]), |
| VARSIZE_ANY(inY[i]) - VARHDRSZ); |
| if (y_set.count(y_elem) == 0){ |
| // for sets, count returns 1 if an element with that value |
| // exists in the container, and zero otherwise. |
| |
| if (x_set.count(y_elem) == 1){ |
| // element present in both sets (already counted for union) |
| n_intersection++; |
| } |
| else{ |
| // element only in inY, not yet counted for union |
| n_union++; |
| } |
| } |
| y_set.insert(y_elem); |
| } |
| return 1.0 - static_cast<double>(n_intersection) / static_cast<double>(n_union); |
| } |
| |
| /** |
| * @brief Compute the k columns of a matrix that are closest to a vector |
| * |
| * For performance, we cheat here: For the following four distance functions, we |
| * take a special shortcut. |
| * FIXME: FunctionHandle should be tuned so that this shortcut no longer |
| * impacts performance by more than, say, ~10%. |
| */ |
| |
| std::string dist_fn_name(string s) |
| { |
| std::istringstream ss(s); |
| std::string token, fname; |
| if (std::getline(ss, token, '.')) fname = token; // suppose there is no schema name |
| if (std::getline(ss, token, '.')) fname = token; // previous part is schema name |
| return fname; |
| } |
| |
| |
| template <class RandomAccessIterator> |
| inline |
| void |
| closestColumnsAndDistancesShortcut( |
| const MappedMatrix& inMatrix, |
| const MappedColumnVector& inVector, |
| FunctionHandle &inDist, |
| std::string fname, |
| RandomAccessIterator ioFirst, |
| RandomAccessIterator ioLast) { |
| |
| // Sorted in the order of expected use |
| if (fname.compare("squared_dist_norm2") == 0) |
| closestColumnsAndDistances(inMatrix, inVector, squaredDistNorm2, |
| ioFirst, ioLast); |
| else if (fname.compare("dist_norm2") == 0) |
| closestColumnsAndDistances(inMatrix, inVector, distNorm2, |
| ioFirst, ioLast); |
| else if (fname.compare("dist_norm1") == 0) |
| closestColumnsAndDistances(inMatrix, inVector, distNorm1, |
| ioFirst, ioLast); |
| else if (fname.compare("dist_angle") == 0) |
| closestColumnsAndDistances(inMatrix, inVector, distAngle, |
| ioFirst, ioLast); |
| else if (fname.compare("dist_tanimoto") == 0) { |
| closestColumnsAndDistances(inMatrix, inVector, distTanimoto, |
| ioFirst, ioLast); |
| } else { |
| closestColumnsAndDistancesUDF(inMatrix, inVector, ioFirst, |
| ioLast, inDist.funcID()); |
| } |
| } |
| |
| |
| /** |
| * @brief Compute the minimum distance between a vector and any column of a |
| * matrix |
| * |
| * This function calls a user-supplied function, for which it does not do |
| * garbage collection. It is therefore meant to be called only constantly many |
| * times before control is returned to the backend. |
| */ |
| AnyType |
| closest_column::run(AnyType& args) { |
| //if (true) throw std::runtime_error("Begin cc run\n"); |
| try{ |
| MappedMatrix M = args[0].getAs<MappedMatrix>(); |
| MappedColumnVector x = args[1].getAs<MappedColumnVector>(); |
| FunctionHandle dist = args[2].getAs<FunctionHandle>() |
| .unsetFunctionCallOptions(FunctionHandle::GarbageCollectionAfterCall); |
| string dist_fname = args[3].getAs<char *>(); |
| std::string fname = dist_fn_name(dist_fname); |
| std::tuple<Index, double> result; |
| closestColumnsAndDistancesShortcut(M, x, dist, fname, &result, &result + 1); |
| |
| AnyType tuple; |
| return tuple |
| << static_cast<int32_t>(std::get<0>(result)) |
| << std::get<1>(result); |
| }catch (const ArrayWithNullException &e) { |
| return Null(); |
| } |
| } |
| |
| /** |
| * @brief Compute the minimum distance between a vector and any column of a |
| * matrix |
| * |
| * This function calls a user-supplied function, for which it does not do |
| * garbage collection. It is therefore meant to be called only constantly many |
| * times before control is returned to the backend. |
| */ |
| AnyType |
| closest_columns::run(AnyType& args) { |
| |
| /* If the input has a null value, we want to return nothing for that |
| * particular data point (because we cannot calculate the distance) |
| * instead of failing. |
| */ |
| try{ |
| MappedMatrix M = args[0].getAs<MappedMatrix>(); |
| MappedColumnVector x = args[1].getAs<MappedColumnVector>(); |
| uint32_t num = args[2].getAs<uint32_t>(); |
| FunctionHandle dist = args[3].getAs<FunctionHandle>() |
| .unsetFunctionCallOptions(FunctionHandle::GarbageCollectionAfterCall); |
| string dist_fname = args[4].getAs<char *>(); |
| |
| std::string fname = dist_fn_name(dist_fname); |
| |
| std::vector<std::tuple<Index, double> > result(num); |
| closestColumnsAndDistancesShortcut(M, x, dist, fname, result.begin(), |
| result.end()); |
| |
| MutableArrayHandle<int32_t> indices = allocateArray<int32_t, |
| dbal::FunctionContext, dbal::DoNotZero, dbal::ThrowBadAlloc>(num); |
| MutableArrayHandle<double> distances = allocateArray<double, |
| dbal::FunctionContext, dbal::DoNotZero, dbal::ThrowBadAlloc>(num); |
| for (uint32_t i = 0; i < num; ++i) |
| std::tie(indices[i], distances[i]) = result[i]; |
| |
| AnyType tuple; |
| return tuple << indices << distances; |
| }catch (const ArrayWithNullException &e) { |
| return Null(); |
| } |
| } |
| |
| AnyType |
| norm1::run(AnyType& args) { |
| return static_cast<double>(args[0].getAs<MappedColumnVector>().lpNorm<1>()); |
| } |
| |
| AnyType |
| norm2::run(AnyType& args) { |
| return static_cast<double>(args[0].getAs<MappedColumnVector>().norm()); |
| } |
| |
| AnyType |
| dist_inf_norm::run(AnyType& args) { |
| return distPNorm( |
| args[0].getAs<MappedColumnVector>(), |
| args[1].getAs<MappedColumnVector>(), |
| std::numeric_limits<double>::infinity() |
| ); |
| } |
| |
| AnyType |
| dist_pnorm::run(AnyType& args) { |
| return distPNorm( |
| args[0].getAs<MappedColumnVector>(), |
| args[1].getAs<MappedColumnVector>(), |
| args[2].getAs<double>() |
| ); |
| } |
| |
| AnyType |
| dist_norm1::run(AnyType& args) { |
| return distNorm1( |
| args[0].getAs<MappedColumnVector>(), |
| args[1].getAs<MappedColumnVector>() |
| ); |
| } |
| |
| AnyType |
| dist_norm2::run(AnyType& args) { |
| // FIXME: it would be nice to declare this as a template function (so it |
| // works for dense and sparse vectors), and the C++ AL takes care of the |
| // rest... |
| return distNorm2( |
| args[0].getAs<MappedColumnVector>(), |
| args[1].getAs<MappedColumnVector>() |
| ); |
| } |
| |
| AnyType |
| cosine_similarity::run(AnyType& args) { |
| return cosineSimilarity( |
| args[0].getAs<MappedColumnVector>(), |
| args[1].getAs<MappedColumnVector>() |
| ); |
| } |
| |
| AnyType |
| squared_dist_norm2::run(AnyType& args) { |
| return squaredDistNorm2( |
| args[0].getAs<MappedColumnVector>(), |
| args[1].getAs<MappedColumnVector>() |
| ); |
| } |
| |
| AnyType |
| dist_angle::run(AnyType& args) { |
| return distAngle( |
| args[0].getAs<MappedColumnVector>(), |
| args[1].getAs<MappedColumnVector>() |
| ); |
| } |
| |
| AnyType |
| dist_tanimoto::run(AnyType& args) { |
| return distTanimoto( |
| args[0].getAs<MappedColumnVector>(), |
| args[1].getAs<MappedColumnVector>() |
| ); |
| } |
| |
| AnyType |
| dist_jaccard::run(AnyType& args) { |
| return distJaccard( |
| args[0].getAs<ArrayHandle<text*> >(), |
| args[1].getAs<ArrayHandle<text*> >() |
| ); |
| } |
| |
| } // namespace linalg |
| |
| } // namespace modules |
| |
| } // namespace regress |