blob: 12222c64656ee8f7e7e5cf96d980c59913315373 [file] [log] [blame]
//! Implementation of the core random number generators.
use crate::{distribution::Distribution, text::TextPool};
use std::fmt::Display;
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
pub struct RowRandomInt {
seed: i64,
usage: i32,
seeds_per_row: i32,
}
impl RowRandomInt {
/// The default multiplier value is a TPC-H constant.
const MULTIPLIER: i64 = 16807;
/// Modulus value is a TPC-H constant 2^31 - 1.
const MODULUS: i64 = 2147483647;
/// Default seed value as specified in CMU's benchbase.
const DEFAULT_SEED: i64 = 19620718;
/// Creates a new random number generator with the given seed and potential
/// number of random values per row.
pub fn new(seed: i64, seeds_per_row: i32) -> Self {
Self {
seed,
usage: 0,
seeds_per_row,
}
}
/// Creates a new random number generator with a specified column number and number of
/// random values per row. Uses the default seed value `19620718`.
pub fn new_with_default_seed_and_column_number(column_number: i32, seeds_per_row: i32) -> Self {
Self::new_with_column_number(column_number, Self::DEFAULT_SEED, seeds_per_row)
}
/// Creates a new random number generator with a specified column number and number of
/// random values per row. The seed can be specified.
pub fn new_with_column_number(column_number: i32, seed: i64, seeds_per_row: i32) -> Self {
Self {
seed: seed + column_number as i64 * (Self::MODULUS / 799),
seeds_per_row,
usage: 0,
}
}
/// Returns a random value between lower and upper bounds (both inclusive).
pub fn next_int(&mut self, lower_bound: i32, upper_bound: i32) -> i32 {
let _ = self.next_rand();
// This is buggy because it can overflow e.g in `RandomAlphaNumeric` when
// `upper_bound` is `i32::MAX` and `lower_bound` is 0 so to replicate
// the overflow behaviour we need to wrap around to `i32::MIN`.
let range = (upper_bound - lower_bound).wrapping_add(1) as f64;
let value = ((1.0 * (self.seed as f64) / Self::MODULUS as f64) * range) as i32;
lower_bound + value
}
/// Instantiates a new seed for the next random value.
pub fn next_rand(&mut self) -> i64 {
self.seed = (self.seed * Self::MULTIPLIER) % Self::MODULUS;
self.usage += 1;
self.seed
}
/// Signals to the random number generator that we consumed the seeds for the current row
/// and we need to start generating new ones.
pub fn row_finished(&mut self) {
self.advance_seed((self.seeds_per_row - self.usage) as i64);
self.usage = 0;
}
/// Advance the specified number of rows which is required for partitionned datasets.
pub fn advance_rows(&mut self, row_count: i64) {
// Signals the we consumed all the seeds for the current row.
if self.usage != 0 {
self.row_finished();
}
self.advance_seed(self.seeds_per_row as i64 * row_count);
}
/// Advances the seed value by the number of uses.
fn advance_seed(&mut self, count: i64) {
let mut multiplier = Self::MULTIPLIER;
let mut count = count;
while count > 0 {
if count % 2 != 0 {
self.seed = (multiplier * self.seed) % Self::MODULUS;
}
count /= 2;
multiplier = (multiplier * multiplier) % Self::MODULUS;
}
}
}
/// Random 64-bit integer generator for large scale factors
#[derive(Default, Debug, Clone, Copy)]
pub struct RowRandomLong {
seed: i64,
seeds_per_row: i32,
usage: i32,
}
impl RowRandomLong {
/// The default multiplier value is a TPC-H constant.
const MULTIPLIER: i64 = 6364136223846793005;
/// The default increment used.
const INCREMENT: i64 = 1;
/// The default multiplier for 32-bit seeds (a TPC-H oddity).
const MULTIPLIER_32: i64 = 16807;
/// The default modulus for 32-bit seeds (a TPC-H oddity).
const MODULUS_32: i64 = 2147483647;
/// Instantiates a new random number generator with the specified seed and number of random
/// values per row.
pub fn new(seed: i64, seeds_per_row: i32) -> Self {
Self {
seed,
seeds_per_row,
usage: 0,
}
}
/// Returnsa random value between `lower_bound` and `upper_bound` (both inclusive).
pub fn next_long(&mut self, lower_bound: i64, upper_bound: i64) -> i64 {
self.next_rand();
let value_in_range = (self.seed.abs()) % (upper_bound - lower_bound + 1);
lower_bound + value_in_range
}
/// Instantiates a new seed for the next random value.
fn next_rand(&mut self) -> i64 {
self.seed = (self.seed.wrapping_mul(Self::MULTIPLIER)) + Self::INCREMENT;
self.usage += 1;
self.seed
}
/// Signals that all seeds for the current row have been consumed and we need to start
/// generating new ones.
pub fn row_finished(&mut self) {
// For the 64-bit case, TPC-H actually uses the 32-bit advance method
self.advance_seed_32((self.seeds_per_row - self.usage) as i64);
self.usage = 0;
}
/// Advances the seed by the specified number of rows
pub fn advance_rows(&mut self, row_count: i64) {
// Finish current row if needed
if self.usage != 0 {
self.row_finished();
}
// Advance the seed
self.advance_seed_32((self.seeds_per_row as i64) * row_count);
}
// TPC-H uses this 32-bit method even for 64-bit numbers
fn advance_seed_32(&mut self, mut count: i64) {
let mut multiplier: i64 = Self::MULTIPLIER_32;
while count > 0 {
if count % 2 != 0 {
self.seed = (multiplier * self.seed) % Self::MODULUS_32;
}
// Integer division, truncates
count /= 2;
multiplier = (multiplier * multiplier) % Self::MODULUS_32;
}
}
}
/// Random number generator for bounded values.
#[derive(Default, Debug, Clone, Copy)]
pub struct RandomBoundedInt {
lower_bound: i32,
upper_bound: i32,
random_int: RowRandomInt,
}
impl RandomBoundedInt {
/// Creates a new random number generator with the given seed and lower and upper bounds.
pub fn new(seed: i64, lower_bound: i32, upper_bound: i32) -> Self {
Self {
lower_bound,
upper_bound,
random_int: RowRandomInt::new(seed, 1),
}
}
/// Creates a new random number generator with the given seed and lower and upper bounds
/// and the number of random values per row.
pub fn new_with_seeds_per_row(
seed: i64,
lower_bound: i32,
upper_bound: i32,
seeds_per_row: i32,
) -> Self {
Self {
lower_bound,
upper_bound,
random_int: RowRandomInt::new(seed, seeds_per_row),
}
}
/// Returns a random value between the lower and upper bounds (both inclusive).
pub fn next_value(&mut self) -> i32 {
self.random_int.next_int(self.lower_bound, self.upper_bound)
}
/// Advance the inner random number generator by the specified number of rows.
pub fn advance_rows(&mut self, row_count: i64) {
self.random_int.advance_rows(row_count);
}
pub fn row_finished(&mut self) {
self.random_int.row_finished();
}
}
/// Random number generator for bounded 64-bit values.
#[derive(Default, Debug, Clone, Copy)]
pub struct RandomBoundedLong {
use_64bits: bool,
lower_bound: i64,
upper_bound: i64,
// 64-bit values.
random_long: RowRandomLong,
// 32-bit values.
random_int: RowRandomInt,
}
impl RandomBoundedLong {
/// Creates a new random number generator with the given seed and lower and upper bounds.
pub fn new(seed: i64, use_64bits: bool, lower_bound: i64, upper_bound: i64) -> Self {
Self {
lower_bound,
use_64bits,
upper_bound,
random_long: RowRandomLong::new(seed, 1),
random_int: RowRandomInt::new(seed, 1),
}
}
/// Creates a new random number generator with the given seed and lower and upper bounds
/// and the number of random values per row.
pub fn new_with_seeds_per_row(
seed: i64,
use_64bits: bool,
lower_bound: i64,
upper_bound: i64,
seeds_per_row: i32,
) -> Self {
Self {
use_64bits,
lower_bound,
upper_bound,
random_long: RowRandomLong::new(seed, seeds_per_row),
random_int: RowRandomInt::new(seed, seeds_per_row),
}
}
/// Returns a random value between the lower and upper bounds (both inclusive).
pub fn next_value(&mut self) -> i64 {
if self.use_64bits {
self.random_long
.next_long(self.lower_bound, self.upper_bound)
} else {
self.random_int
.next_int(self.lower_bound as i32, self.upper_bound as i32) as i64
}
}
/// Advance the inner random number generator by the specified number of rows.
pub fn advance_rows(&mut self, row_count: i64) {
if self.use_64bits {
self.random_long.advance_rows(row_count);
} else {
self.random_int.advance_rows(row_count);
}
}
pub fn row_finished(&mut self) {
if self.use_64bits {
self.random_long.row_finished();
} else {
self.random_int.row_finished();
}
}
}
/// Generates random alphanumeric strings
#[derive(Debug)]
pub struct RandomAlphaNumeric {
inner: RowRandomInt,
min_length: i32,
max_length: i32,
}
impl RandomAlphaNumeric {
// Characters allowed in alphanumeric strings
const ALPHA_NUMERIC: &'static [u8] =
b"0123456789abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ,";
// Length multipliers from TPC-H spec
const LOW_LENGTH_MULTIPLIER: f64 = 0.4;
const HIGH_LENGTH_MULTIPLIER: f64 = 1.6;
// Usage count per row
const USAGE_PER_ROW: i32 = 9;
pub fn new(seed: i64, average_length: i32) -> Self {
Self::new_with_expected_row_count(seed, average_length, 1)
}
pub fn new_with_expected_row_count(seed: i64, average_length: i32, seeds_per_row: i32) -> Self {
let min_length = (average_length as f64 * Self::LOW_LENGTH_MULTIPLIER) as i32;
let max_length = (average_length as f64 * Self::HIGH_LENGTH_MULTIPLIER) as i32;
Self {
inner: RowRandomInt::new(seed, Self::USAGE_PER_ROW * seeds_per_row),
min_length,
max_length,
}
}
/// Returns the next string as a [`RandomAlphaNumericInstance`], which can
/// generate the string on demand.
pub fn next_value(&mut self) -> RandomAlphaNumericInstance {
let length = self.inner.next_int(self.min_length, self.max_length) as usize;
RandomAlphaNumericInstance {
length,
snapshot: self.inner,
}
}
/// Advance the inner random number generator by the specified number of rows.
pub fn advance_rows(&mut self, row_count: i64) {
self.inner.advance_rows(row_count);
}
pub fn row_finished(&mut self) {
self.inner.row_finished();
}
}
/// A random alphanumeric string. To avoid allocations
/// the string is created on demand with the Display implementation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RandomAlphaNumericInstance {
length: usize,
/// snapshot of the random number generator
snapshot: RowRandomInt,
}
impl Display for RandomAlphaNumericInstance {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// Use up to 64 bytes of a stack buffer for small strings to avoid
// allocation, and heap allocation for larger ones.
let mut stack_buffer = [0u8; 64];
let mut heap_buffer = Vec::new();
let buffer = if self.length <= stack_buffer.len() {
&mut stack_buffer[0..self.length]
} else {
heap_buffer.resize(self.length, 0);
&mut heap_buffer
};
let mut generator = self.snapshot; // copy to for mutation
let mut char_index = 0;
// todo remove
#[allow(clippy::needless_range_loop)]
for i in 0..self.length {
if i % 5 == 0 {
char_index = generator.next_int(0, i32::MAX) as i64;
}
let char_pos = (char_index & 0x3f) as usize;
buffer[i] = RandomAlphaNumeric::ALPHA_NUMERIC[char_pos];
char_index >>= 6;
}
// Safety: only pushed ascii characters into the buffer
let s = unsafe { std::str::from_utf8_unchecked(buffer) };
f.write_str(s)?;
Ok(())
}
}
/// Generates phone numbers according to TPC-H spec
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct RandomPhoneNumber {
inner: RowRandomInt,
}
impl RandomPhoneNumber {
// Maximum number of nations in TPC-H
const NATIONS_MAX: i32 = 90;
pub fn new(seed: i64) -> Self {
Self::new_with_expected_row_count(seed, 1)
}
pub fn new_with_expected_row_count(seed: i64, seeds_per_row: i32) -> Self {
Self {
inner: RowRandomInt::new(seed, 3 * seeds_per_row),
}
}
pub fn next_value(&mut self, nation_key: i64) -> PhoneNumberInstance {
PhoneNumberInstance {
country_code: 10 + (nation_key % Self::NATIONS_MAX as i64) as i32,
local1: self.inner.next_int(100, 999),
local2: self.inner.next_int(100, 999),
local3: self.inner.next_int(1000, 9999),
}
}
/// Advance the inner random number generator by the specified number of rows.
pub fn advance_rows(&mut self, row_count: i64) {
self.inner.advance_rows(row_count);
}
pub fn row_finished(&mut self) {
self.inner.row_finished();
}
}
/// A displayable phone number
///
/// Example display:
/// ```text
/// 27-918-335-1736
/// ```
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct PhoneNumberInstance {
country_code: i32,
local1: i32,
local2: i32,
local3: i32,
}
impl Display for PhoneNumberInstance {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{:02}-{:03}-{:03}-{:04}",
self.country_code, self.local1, self.local2, self.local3
)
}
}
/// Fetches random strings from a distribution.
#[derive(Debug, Clone)]
pub struct RandomString<'a> {
inner: RowRandomInt,
distribution: &'a Distribution,
}
impl<'a> RandomString<'a> {
pub fn new(seed: i64, distribution: &'a Distribution) -> Self {
Self::new_with_expected_row_count(seed, distribution, 1)
}
pub fn new_with_expected_row_count(
seed: i64,
distribution: &'a Distribution,
seeds_per_row: i32,
) -> Self {
Self {
inner: RowRandomInt::new(seed, seeds_per_row),
distribution,
}
}
pub fn next_value(&mut self) -> &'a str {
self.distribution.random_value(&mut self.inner)
}
/// Advance the inner random number generator by the given number of rows.
pub fn advance_rows(&mut self, row_count: i64) {
self.inner.advance_rows(row_count);
}
pub fn row_finished(&mut self) {
self.inner.row_finished();
}
}
/// Generates sequences of random sequence of strings from a distribution
#[derive(Debug)]
pub struct RandomStringSequence<'a> {
inner: RowRandomInt,
count: i32,
distribution: &'a Distribution,
}
impl<'a> RandomStringSequence<'a> {
pub fn new(seed: i64, count: i32, distribution: &'a Distribution) -> Self {
Self::new_with_expected_row_count(seed, count, distribution, 1)
}
pub fn new_with_expected_row_count(
seed: i64,
count: i32,
distribution: &'a Distribution,
seeds_per_row: i32,
) -> Self {
Self {
inner: RowRandomInt::new(seed, distribution.size() as i32 * seeds_per_row),
count,
distribution,
}
}
pub fn next_value(&mut self) -> StringSequenceInstance<'a> {
// Get all values from the distribution
let mut values: Vec<&str> = self.distribution.get_values().to_vec();
// Randomize first 'count' elements
for current_position in 0..self.count {
// Pick a random position to swap with
let swap_position =
self.inner
.next_int(current_position, values.len() as i32 - 1) as usize;
// Swap the elements
values.swap(current_position as usize, swap_position);
}
// Keep only the first 'count' values, and join them with spaces
values.truncate(self.count as usize);
StringSequenceInstance { values }
}
/// Advance the inner random number generator by the given number of rows.
pub fn advance_rows(&mut self, row_count: i64) {
self.inner.advance_rows(row_count);
}
pub fn row_finished(&mut self) {
self.inner.row_finished();
}
}
/// Displayable string sequence instance
///
/// Prints the sequence of strings as a single string with spaces between them.
///
/// Example display:
/// ```text
/// "value1 value2 value3"
/// ```
#[derive(Default, Debug, Clone, PartialEq)]
pub struct StringSequenceInstance<'a> {
values: Vec<&'a str>,
}
impl Display for StringSequenceInstance<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let mut iter = self.values.iter();
if let Some(first) = iter.next() {
write!(f, "{}", first)?;
}
for value in iter {
write!(f, " {}", value)?;
}
Ok(())
}
}
/// Generates random text according to TPC-H spec
#[derive(Debug, Clone)]
pub struct RandomText<'a> {
inner: RowRandomInt,
text_pool: &'a TextPool,
min_length: i32,
max_length: i32,
}
impl<'a> RandomText<'a> {
const LOW_LENGTH_MULTIPLIER: f64 = 0.4;
const HIGH_LENGTH_MULTIPLIER: f64 = 1.6;
pub fn new(seed: i64, text_pool: &'a TextPool, average_text_length: f64) -> Self {
Self::new_with_expected_row_count(seed, text_pool, average_text_length, 1)
}
pub fn new_with_expected_row_count(
seed: i64,
text_pool: &'a TextPool,
average_text_length: f64,
expected_row_count: i32,
) -> Self {
Self {
inner: RowRandomInt::new(seed, expected_row_count * 2),
text_pool,
min_length: (average_text_length * Self::LOW_LENGTH_MULTIPLIER) as i32,
max_length: (average_text_length * Self::HIGH_LENGTH_MULTIPLIER) as i32,
}
}
pub fn next_value(&mut self) -> &'a str {
let offset = self
.inner
.next_int(0, self.text_pool.size() - self.max_length);
let length = self.inner.next_int(self.min_length, self.max_length);
self.text_pool.text(offset, offset + length)
}
pub fn advance_rows(&mut self, row_count: i64) {
self.inner.advance_rows(row_count);
}
pub fn row_finished(&mut self) {
self.inner.row_finished();
}
}
#[cfg(test)]
mod test {
use super::*;
use std::collections::HashSet;
#[test]
fn test_small_random_alpha_numeric() {
RandomAlphaNumericTest {
average_length: 10,
num_rows: 100,
expected_average_length: 10,
}
.assert()
}
#[test]
fn test_large_random_alpha_numeric() {
RandomAlphaNumericTest {
average_length: 100,
num_rows: 100,
expected_average_length: 102,
}
.assert()
}
struct RandomAlphaNumericTest {
average_length: i32,
num_rows: usize,
expected_average_length: usize,
}
impl RandomAlphaNumericTest {
fn assert(self) {
let Self {
average_length,
num_rows,
expected_average_length,
} = self;
let mut generator = RandomAlphaNumeric::new(1, average_length);
let mut values = HashSet::new();
// check that the values are within the expected length and not repeated
let mut total_len = 0;
for _ in 0..num_rows {
let value = generator.next_value().to_string();
total_len += value.len();
assert!(values.insert(value)); // no dupes
}
assert_eq!(total_len / num_rows, expected_average_length);
}
}
}