| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #---------------------------------------------------------------------------------------- |
| |
| # The quantizeByCluster-function implements product quantization. Initially, it |
| # divides the original vector space into M subspaces. The resulting lower dimensional |
| # subvectors are then quantized. If the column count is not divisible by the number of |
| # subspaces M, the data is padded with zeros. Optimal space decomposition can be |
| # computed, when the data follows a Gaussian distribution. The function uses kmeans for |
| # quantizing and svd to compute the space decomposition. |
| # |
| # INPUT: |
| # --------------------------------------------------------------------------------------- |
| # X The input matrix to perform product quantization on |
| # M Number of subspaces |
| # k Number of vectors in the subcodebooks |
| # runs Number of runs (with different initial centroids) |
| # max_iter Maximum number of iterations per run |
| # eps Tolerance (epsilon) for WCSS change ratio |
| # avg_sample_size_per_centroid Average number of records per centroid in data samples |
| # separate Cluster subspaces separately. If value is set to true, |
| # kmeans is run M times, once for each subspace. Otherwise |
| # kmeans is run only once. |
| # space_decomp Decompose the vector space by multiplying the input |
| # matrix X with an orthogonal matrix R. Assumes the data |
| # follows a parametric Gaussian distribution. |
| # Time complexity in O(nrow(X)^2 * min(nrow(X), ncol(X))). |
| # seed The seed used for initial sampling. If set to -1 random |
| # seeds are selected. |
| # --------------------------------------------------------------------------------------- |
| # |
| # OUTPUT: |
| # ------------------------------------------------------------------------------------------ |
| # codebook The matrix containing the centroids. If clustered separately, the ith |
| # subcodebook is the ith chunk of size k. The codebook matrix has the dimensions |
| # [k*M x ncol(X)/M]. |
| # codes The mapping of vectors to centroids. Each vector of the input matrix X is mapped |
| # onto a vector of codes. The entries in the codes matrix are the indices of |
| # the vectors in the codebook. The codes matrix has the dimensions [nrow(X) x M]. |
| # R The orthogonal matrix R which is applied to the input matrix X before performing |
| # the product quantization. Only relevant when space_decomp = TRUE. |
| # ------------------------------------------------------------------------------------------ |
| |
| m_quantizeByCluster = function(Matrix[Double] X, Integer M = 4, Integer k = 10, Integer runs = 10, |
| Integer max_iter = 1000, Double eps = 1e-6, Integer avg_sample_size_per_centroid = 50, Boolean separate=TRUE, Boolean space_decomp=FALSE, Integer seed = -1) |
| return(Matrix[Double] codebook, Matrix[Double] codes, Matrix[Double] R) |
| { |
| #Pad the data with zeros if the number of columns of the input matrix X is not divisible by M |
| if(ncol(X) %% M != 0) { |
| zeros = matrix(0, rows=nrow(X), cols=((ncol(X) %/% M) +1) * M - ncol(X)) |
| X = cbind(X, zeros) |
| } |
| subvector_size = ncol(X) / M |
| #Transform the vector space by an orthogonal matrix R. |
| #R is computed by reordering the principal directions of the input matrix X such that the variance of each subspace is balanced. |
| if(space_decomp) { |
| #Perform PCA using SVD |
| X2 = X - colMeans(X) |
| [U, S, V] = svd(X2) |
| eigen_v = diag(S)^2 / (nrow(X)-1) |
| #Balance the variance of the subspaces following a greedy approach. |
| R = matrix(0, rows=nrow(V), cols=ncol(V)) |
| subspace_variance = matrix(1, rows=M, cols=1) |
| subspace_idx = seq(0,M-1) * subvector_size + 1 |
| for(i in 1:nrow(R)) { |
| #free_buckets[j] == 0 iff subspace j is full |
| free_buckets = subspace_idx - seq(1,M) * subvector_size < 1 |
| b = as.scalar(rowIndexMin(t(subspace_variance) %*% (diag(1 / free_buckets)))) |
| subspace_variance[b] = subspace_variance[b] * eigen_v[i] |
| R[,as.scalar(subspace_idx[b])] = V[,i] |
| subspace_idx[b] = subspace_idx[b] + 1 |
| } |
| #Apply space decomposition |
| X = X %*% t(R) |
| } |
| else { |
| R = matrix(0, rows=1, cols=1) |
| } |
| #Kmeans is run just once for all subspaces together. Subvectors are mapped to vectors of the codebook of size k*M. |
| #The ith entry of a code vector has a value in [1, k*M]. |
| if(!separate) { |
| A = matrix(X, rows= nrow(X) * M, cols=subvector_size) |
| [codebook, B] = kmeans(A, k * M, runs, max_iter, eps, FALSE, avg_sample_size_per_centroid, seed) |
| codes = matrix(B, rows = nrow(B) / M, cols = ncol(B) * M) |
| } |
| #Kmeans is run for every subspace separately. Subvectors are mapped to a subset of k vectors of the codebook. |
| #The ith entry of a code vector has a value in ((i-1)*k, i*k]. |
| else { |
| codebook = matrix(1, rows=k*M, cols=subvector_size) |
| codes = matrix(1, rows=nrow(X), cols=M) |
| for(i in 1:M, check=0) { |
| [tmp_cbook, tmp_c] = kmeans(X[,(i-1)*subvector_size+1:i*subvector_size], k, runs, max_iter, eps, FALSE, avg_sample_size_per_centroid, seed) |
| #If no output is produced, use a single centroid |
| if(as.scalar(tmp_c[1,1]) < 1) { |
| tmp_cbook = matrix(0, rows=k, cols=subvector_size) |
| tmp_cbook[1,] = colSums(X[,(i-1)*subvector_size+1:i*subvector_size]) / nrow(X) |
| tmp_c = matrix(1, rows=nrow(X), cols=1) |
| } |
| codebook[(i-1)*k+1:i*k,] = tmp_cbook |
| offset = matrix((i-1)*k, rows=nrow(codes), cols=1) |
| codes[,i] = tmp_c + offset |
| } |
| } |
| } |