blob: 5750c37612263d00d13b86dfff3efbb5c59464c2 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Generator of random records with boolean features
# Average record (row) and feature (column) densities follow
# power laws: E(#1s in line k) = const / (k + add)^pow
# Cell[1, 1] has the highest probability to be 1; that probability (max_cell_prob) is also an input.
# By setting num_features >> num_records we allow lots of rare
# features while keeping most records nonempty.
# The power ("pow") in the power law determines the tail behavior;
# the additive shift ("add") determines how steeply the density changes
# in the first few records or features.
# ---- Generator settings ------------------------------------------------
num_records   = 1000;    # how many records (rows) to generate
num_features  = 50000;   # how many boolean features (columns) per record
pow_records   = 2.0;     # exponent of the power law for record density
pow_features  = 1.0;     # exponent of the power law for feature density
add_records   = 100.0;   # additive shift inside the record power law
add_features  = 20.0;    # additive shift inside the feature power law
max_cell_prob = 1.0;     # probability that Cell[1, 1] is 1

# ---- Per-cell probabilities --------------------------------------------
# Normalizing constant: cancels the power-law terms at index (1, 1), so the
# probability of Cell[1, 1] comes out as max_cell_prob.
peak_records = (1.0 + add_records) ^ pow_records;
peak_features = (1.0 + add_features) ^ pow_features;
norm_const = max_cell_prob * peak_records * peak_features;

# Running sum of a column of ones gives the row indices 1, 2, ..., num_records;
# each index k is then mapped to 1 / (k + add)^pow.
row_index = sumup (matrix (1.0, rows = num_records, cols = 1));
row_weight = 1.0 / ((row_index + add_records) ^ pow_records);

# Same for the column indices, transposed into a row vector so that the
# outer product below has shape num_records x num_features.
col_index = sumup (matrix (1.0, rows = num_features, cols = 1));
col_weight = 1.0 / ((t(col_index) + add_features) ^ pow_features);

cell_prob = norm_const * (row_weight %*% col_weight);

# ---- Expected densities ------------------------------------------------
# Sum of the cell probabilities along each row / each column, i.e. the
# expected number of 1s in that row / column.
expected_ones_per_row = rowSums (cell_prob);
write (expected_ones_per_row, "Zipf.AvgDensity.Rows", format="text");

expected_ones_per_col = colSums (cell_prob);
write (expected_ones_per_col, "Zipf.AvgDensity.Cols", format="text");

# ---- Sampling ----------------------------------------------------------
# A cell is set to 1 when its uniform draw from [0, 1] does not exceed the
# cell's probability.
uniform_draws = Rand (rows = num_records, cols = num_features, min = 0.0, max = 1.0);
sampled_cells = (uniform_draws <= cell_prob);
write (sampled_cells, "Zipf.Data", format="text");
# Column-wise running (prefix) sum down the rows of a matrix.
#
#   A     : input matrix.
#   sum_A : matrix with the same dimensions as A; row i holds the sum of
#           rows 1..i of A, computed independently for every column.
#
# Delegates to the built-in cumsum(), which computes exactly this column-wise
# cumulative sum, instead of a hand-written loop of shifted left-indexed
# additions. For non-integer inputs the result may differ from that loop in
# the last floating-point bits, because the order of additions differs.
sumup = function (Matrix[double] A) return (Matrix[double] sum_A)
{
    sum_A = cumsum (A);
}