blob: 0001ea4e84bf39f01ae18fced07af94dace6988b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.wayang.apps.simwords
import scala.util.Random
/**
* A [[SparseVector]] is a vector of arbitrary dimension. It does not store `0` components.
*/
case class SparseVector(indices: Array[Int], values: Array[Double]) {
/**
* Applies scalar to this vector.
*/
def *=(scalar: Double): Unit = {
for (i <- this.values.indices) {
values(i) *= scalar
}
}
/**
* @return the length of this vector
*/
def length: Double = math.sqrt(this.values.map(v => v * v).sum)
/**
* Keep the direction but bring to length 1.
*/
def normalize(): Unit = {
val scalar = 1d / length
this *= scalar
}
/**
* Add two vectors.
*/
def +(that: SparseVector): SparseVector = {
val newIndices: Array[Int] = new Array[Int](this.indices.length + that.indices.length)
val newValues: Array[Double] = new Array[Double](this.values.length + that.values.length)
var newI = 0
// Co-iterate the lists.
var (thisI, thatI) = (0, 0)
while (thisI < this.indices.length && thatI < that.indices.length) {
val (thisIndex, thatIndex) = (this.indices(thisI), that.indices(thatI))
if (thisIndex < thatIndex) {
newIndices(newI) = thisIndex
newValues(newI) = this.values(thisI)
thisI += 1
} else if (thisIndex > thatIndex) {
newIndices(newI) = thatIndex
newValues(newI) = that.values(thatI)
thatI += 1
} else {
newIndices(newI) = thisIndex
newValues(newI) = this.values(thisI) + that.values(thatI)
thisI += 1
thatI += 1
}
newI += 1
}
// Append trailing list tail.
while (thisI < this.indices.length) {
newIndices(newI) = this.indices(thisI)
newValues(newI) = this.values(thisI)
thisI += 1
newI += 1
}
while (thatI < that.indices.length) {
newIndices(newI) = that.indices(thatI)
newValues(newI) = that.values(thatI)
thatI += 1
newI += 1
}
// Build the new instance.
if (newI < newIndices.length) {
SparseVector(java.util.Arrays.copyOfRange(newIndices, 0, newI),
java.util.Arrays.copyOfRange(newValues, 0, newI))
} else {
SparseVector(newIndices, newValues)
}
}
/**
* Dot-product of two sparse vectors.
*/
def *(that: SparseVector): Double = {
var prodSum = 0d;
// Co-iterate the lists.
var (thisI, thatI) = (0, 0)
while (thisI < this.indices.length && thatI < that.indices.length) {
val (thisIndex, thatIndex) = (this.indices(thisI), that.indices(thatI))
if (thisIndex < thatIndex) {
thisI += 1
} else if (thisIndex > thatIndex) {
thatI += 1
} else {
prodSum += this.values(thisI) * that.values(thatI)
thisI += 1
thatI += 1
}
}
prodSum
}
/**
* Cosine similarity of two sparse vectors.
*/
def ~(that: SparseVector): Double = {
math.abs(this * that) / (this.length * that.length)
}
override def toString = s"(${
this.indices
.zip(this.values)
.map(component => s"${component._1}->${component._2}")
.mkString(", ")
})"
def toDictionaryString = s"{${
this.indices
.zip(this.values)
.map(component => s"${component._1}:${component._2}")
.mkString(",")
}}"
override def hashCode = java.util.Arrays.hashCode(this.values) ^ java.util.Arrays.hashCode(this.indices)
override def equals(o: Any): Boolean = {
if (o == null || !o.isInstanceOf[SparseVector]) return false;
val that = o.asInstanceOf[SparseVector]
java.util.Arrays.equals(this.indices, that.indices) && java.util.Arrays.equals(this.values, that.values)
}
}
/**
* Companion object for [[SparseVector]].
*/
object SparseVector {
private lazy val random = new Random
/**
* Create a random instance.
*
* @param indices possible [[SparseVector#indices]]
* @param probCompleteness probability of using all components in `numCreations`
* @param numCreations number of [[SparseVector]]s that will be created
*/
def createRandom(indices: Array[Int], probCompleteness: Double, numCreations: Int, withNegative: Boolean = false) = {
// Find out the probability to include each component into the vector.
val probPickElement = calculatePickElementProb(probCompleteness, numCreations)
// Generate the components.
val builder = new Builder
for (i <- indices.indices) {
if (probPickElement >= this.random.nextDouble()) {
builder.add(indices(i), this.random.nextDouble() * (if (withNegative && this.random.nextBoolean()) -1 else 1))
}
}
// Build the vector.
builder.build
}
/**
* Estimate the required probability to pick an element in a single try, given a the probability of picking all
* elements in a number of repetitions.
*/
private def calculatePickElementProb(probCompleteness: Double, numCreations: Int) =
1 - math.pow(1 - probCompleteness, 1d / numCreations)
class Builder {
private val components = scala.collection.mutable.Map[Int, Double]()
/**
* Add a delta to a component.
*/
def add(dimension: Int, delta: Double): Builder = {
val newValue = components.getOrElse(dimension, 0d) + delta
if (newValue == 0) this.components.remove(dimension)
else this.components.update(dimension, newValue)
this
}
/**
* Create the [[SparseVector]].
*/
def build: SparseVector = {
val indices = new Array[Int](this.components.size)
val values = new Array[Double](this.components.size)
var i = 0
for (component <- this.components.toArray.sortBy(_._1)) {
indices(i) += component._1
values(i) += component._2
i += 1
}
SparseVector(indices, values)
}
/**
* Tells whether this instance contains components.
*/
def isEmpty = components.isEmpty
}
}