blob: d8ccaebc07555157876125073e895635e07f89a7 [file] [log] [blame]
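//! [`TPCHDate`] and date handling, plus row-count partitioning helpers
//! ([`GenerateUtils`]) and random time-of-day generation ([`RandomTimeOfDay`])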
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use std::{
fmt::{Display, Formatter},
sync::LazyLock,
};
/// Day number that `1970-01-01` (the Unix epoch) has in the date generator's
/// numbering.
///
/// This is the generator's first date, `92001` (`1992-01-01`), minus the
/// `8035` days that separate `1970-01-01` from `1992-01-01`.
pub const GENERATED_DATE_EPOCH_OFFSET: i32 = 83_966;
/// The earliest date the data generator can produce, corresponding to
/// `1992-01-01`.
///
/// Generated dates are consecutive day numbers starting at this value. The
/// lookup tables in this module cover [`TOTAL_DATE_RANGE`] days, so the latest
/// date that can be generated is this value plus `TOTAL_DATE_RANGE - 1`
/// (`1998-12-31`).
pub const MIN_GENERATE_DATE: i32 = 92_001;
/// The generator's notion of "today", in julian `yyddd` form: day 168 of
/// year 95, i.e. `1995-06-17`.
///
/// Dates whose julian value is less than or equal to this one count as being
/// in the past.
const CURRENT_DATE: i32 = 95_168;
/// The total number of distinct days the data generator can produce.
///
/// The range spans the seven calendar years 1992 through 1998, two of which
/// (1992 and 1996) are leap years.
pub const TOTAL_DATE_RANGE: i32 = 7 * 365 + 2;
/// Cumulative month boundaries for a standard (non-leap) year.
///
/// Entry `m` is the number of days that have elapsed by the end of month `m`
/// (entry `0` is the start of the year), so entry `m - 1` is the day-of-year
/// offset at which month `m` begins.
const MONTH_YEAR_DAY_START: [i32; 13] = {
    const DAYS_IN_MONTH: [i32; 12] = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31];
    let mut cumulative = [0; 13];
    let mut month = 0;
    while month < 12 {
        cumulative[month + 1] = cumulative[month] + DAYS_IN_MONTH[month];
        month += 1;
    }
    cumulative
};
/// Lookup table for converting dates from the data generator to strings
///
/// index: date index, i.e. days since [`MIN_GENERATE_DATE`]
///        (`0..TOTAL_DATE_RANGE`)
/// value: strings like 1992-01-01
///
/// The table is built by [`make_date_string_index`] the first time it is
/// accessed.
static DATE_TO_STRING: LazyLock<Vec<String>> = LazyLock::new(make_date_string_index);
/// Lookup table for julian date format used to check if a given date is in
/// the past as part of the lineitem generation.
static JULIAN_DATE: LazyLock<Vec<i32>> = LazyLock::new(|| {
(0..TOTAL_DATE_RANGE)
.map(|date| julian(date + MIN_GENERATE_DATE))
.collect()
});
/// Helpers for splitting the rows of a generated table across parts.
///
/// Parts are numbered starting at `1`; part `part_count` is the last one.
pub struct GenerateUtils;

impl GenerateUtils {
    /// Calculates the row count of one part with linear scaling (original
    /// behavior).
    ///
    /// The table holds `scale_base * scale_factor` rows in total (truncated to
    /// an integer, never negative). Every part receives
    /// `total / part_count` rows and the last part additionally receives the
    /// remainder.
    ///
    /// # Panics
    /// Panics if `part_count` is `0` (integer division by zero).
    pub fn calculate_row_count(
        scale_base: i32,
        scale_factor: f64,
        part: i32,
        part_count: i32,
    ) -> i64 {
        Self::calculate_scaled_row_count(scale_base, scale_factor, part, part_count, false)
    }

    /// Calculates the row count of one part with logarithmic scaling (for
    /// buildings).
    ///
    /// The table holds `scale_base * (1 + log2(scale_factor))` rows in total
    /// (truncated to an integer). For scale factors below `0.5` that formula
    /// is negative, so the total is clamped to `0` rows.
    ///
    /// # Panics
    /// Panics if `part_count` is `0` (integer division by zero).
    pub fn calculate_logarithmic_row_count(
        scale_base: i32,
        scale_factor: f64,
        part: i32,
        part_count: i32,
    ) -> i64 {
        Self::calculate_scaled_row_count(scale_base, scale_factor, part, part_count, true)
    }

    /// Total number of rows of a table before it is split into parts.
    ///
    /// A row count can never be negative, so the scaled value is clamped at
    /// zero before it is truncated. `f64::max` also maps a NaN result (for
    /// example from a negative `scale_factor` in logarithmic mode) to `0`,
    /// which is what the plain `as i64` cast produced for NaN as well.
    fn total_row_count(scale_base: i32, scale_factor: f64, log_scale: bool) -> i64 {
        let base = f64::from(scale_base);
        let scaled = if log_scale {
            base * (1.0 + scale_factor.log2())
        } else {
            base * scale_factor
        };
        scaled.max(0.0) as i64
    }

    /// Internal implementation for row count calculation with scaling option
    fn calculate_scaled_row_count(
        scale_base: i32,
        scale_factor: f64,
        part: i32,
        part_count: i32,
        log_scale: bool,
    ) -> i64 {
        let total_row_count = Self::total_row_count(scale_base, scale_factor, log_scale);
        let part_count = i64::from(part_count);
        let rows_per_part = total_row_count / part_count;
        if i64::from(part) == part_count {
            // for the last part, add the remainder rows
            rows_per_part + total_row_count % part_count
        } else {
            rows_per_part
        }
    }

    /// Calculates the zero-based index of the first row of a specific part.
    ///
    /// The total is always computed with linear scaling, matching
    /// [`GenerateUtils::calculate_row_count`].
    // NOTE(review): there is no logarithmic counterpart; confirm that tables
    // sized with `calculate_logarithmic_row_count` do not rely on this.
    ///
    /// # Panics
    /// Panics if `part_count` is `0` (integer division by zero).
    pub fn calculate_start_index(
        scale_base: i32,
        scale_factor: f64,
        part: i32,
        part_count: i32,
    ) -> i64 {
        let total_row_count = Self::total_row_count(scale_base, scale_factor, false);
        let rows_per_part = total_row_count / i64::from(part_count);
        rows_per_part * (i64::from(part) - 1)
    }
}
/// Seeded random generator that produces times of day as
/// (hour, minute, second) triples
#[derive(Debug, Clone)]
pub struct RandomTimeOfDay {
    rng: StdRng,
}

impl RandomTimeOfDay {
    /// Builds a generator whose random stream is seeded with `seed`
    pub fn new(seed: u64) -> Self {
        let rng = StdRng::seed_from_u64(seed);
        Self { rng }
    }

    /// Draws the next random time of day as (hour, minute, second)
    ///
    /// The hour is in `0..24`; the minute and the second are in `0..60`.
    pub fn next_value(&mut self) -> (u8, u8, u8) {
        // Tuple fields are evaluated left to right, so the values are drawn
        // in the order hour, minute, second.
        (
            self.rng.gen_range(0..24),
            self.rng.gen_range(0..60),
            self.rng.gen_range(0..60),
        )
    }

    /// Skips ahead by `rows` rows, drawing and discarding one time of day per
    /// row
    ///
    /// Does nothing when `rows` is zero or negative.
    pub fn advance_rows(&mut self, rows: i64) {
        let mut remaining = rows;
        while remaining > 0 {
            self.next_value();
            remaining -= 1;
        }
    }

    /// Mark this row as finished
    ///
    /// Intentionally does nothing: this generator keeps no per-row state. The
    /// method exists so the type matches the interface pattern used by other
    /// random generators.
    pub fn row_finished(&mut self) {}
}
/// Represents a generated date together with a time of day
///
/// Example display: 1992-01-01 00:00:00
///
/// The date is stored as an index (number of days) from the
/// [`MIN_GENERATE_DATE`]
///
/// The derived ordering compares the date first, then hour, minute and second.
///
/// # Example
/// ```
/// # use spatialbench::dates::{TPCHDate, MIN_GENERATE_DATE};
/// let date = TPCHDate::new(MIN_GENERATE_DATE + 41, 0, 0, 0);
/// // Convert the date to y/m/d fields
/// assert_eq!((92,2,11), date.to_ymd());
/// // format as a string using the Display impl
/// assert_eq!("1992-02-11 00:00:00", date.to_string());
/// ```
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub struct TPCHDate {
    /// Days since [`MIN_GENERATE_DATE`]
    date_index: i32,
    /// Hour of the day, expected to be 0-23 (not validated on construction)
    hour: u8,
    /// Minute of the hour, expected to be 0-59 (not validated on construction)
    minute: u8,
    /// Second of the minute, presumably 0-59 like the minute (not validated on
    /// construction)
    second: u8,
}
impl Display for TPCHDate {
    /// Writes the value as `yyyy-mm-dd hh:mm:ss`, for example
    /// `1992-01-01 00:00:00`.
    ///
    /// The date part comes from the precomputed string table, so this panics
    /// if the stored date index lies outside that table.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let Self {
            date_index,
            hour,
            minute,
            second,
        } = *self;
        let day = &DATE_TO_STRING[date_index as usize];
        write!(f, "{} {:02}:{:02}:{:02}", day, hour, minute, second)
    }
}
impl TPCHDate {
    /// Offset, in days, between a TPCH date index and the matching Unix epoch
    /// relative date.
    ///
    /// * Unix epoch relative dates count days since the epoch (1970-01-01)
    /// * [`TPCHDate`]s count days since [`MIN_GENERATE_DATE`] (1992-01-01)
    ///
    /// The value is `8035` because `1992-01-01` falls `8035` days after
    /// `1970-01-01`
    pub const UNIX_EPOCH_OFFSET: i32 = 8035;

    /// Builds a TPCHDate from a generated date and a time of day
    ///
    /// `generated_date` uses the generator's numbering, in which
    /// [`MIN_GENERATE_DATE`] is the first day. The time components are stored
    /// as given, without validation.
    pub fn new(generated_date: i32, hour: u8, minute: u8, second: u8) -> Self {
        let date_index = generated_date - MIN_GENERATE_DATE;
        Self {
            date_index,
            hour,
            minute,
            second,
        }
    }

    /// Builds a TPCHDate from a generated day value and an
    /// `(hour, minute, second)` tuple
    pub fn new_with_time(day_value: i32, (hour, minute, second): (u8, u8, u8)) -> Self {
        Self::new(day_value, hour, minute, second)
    }

    /// Splits this date into (year, month, day)
    ///
    /// The year is relative to 1900. For a date that represents `1992-02-03`,
    /// this function will return
    ///
    /// * year: `92`
    /// * month: `2`
    /// * day: `3`
    pub fn to_ymd(&self) -> (i32, i32, i32) {
        // The free function expects a 1-based day index.
        let one_based_index = self.date_index + 1;
        to_ymd(one_based_index)
    }

    /// Consumes the date and returns its index (days since
    /// [`MIN_GENERATE_DATE`])
    pub fn into_inner(self) -> i32 {
        self.date_index
    }

    /// Checks whether a generated date is in the past, i.e. not later than the
    /// generator's current date
    ///
    /// `date` uses the generator's numbering (like [`MIN_GENERATE_DATE`]), not
    /// a date index. Panics if it lies outside the generated date range.
    pub fn is_in_past(date: i32) -> bool {
        let julian_date = Self::to_julian(date);
        julian_date <= CURRENT_DATE
    }

    /// Looks up the julian (`yyddd`) form of a generated date
    ///
    /// Panics if `date` lies outside the generated date range.
    fn to_julian(date: i32) -> i32 {
        let offset = (date - MIN_GENERATE_DATE) as usize;
        JULIAN_DATE[offset]
    }

    /// Number of whole days between the Unix epoch and this date
    /// (the time of day is ignored).
    #[inline(always)]
    pub fn to_unix_epoch(&self) -> i32 {
        Self::UNIX_EPOCH_OFFSET + self.date_index
    }

    /// Number of seconds between the Unix epoch and this date, with the time
    /// components (hour, minute, second) included
    #[inline(always)]
    pub fn to_unix_epoch_seconds(&self) -> i64 {
        const SECONDS_PER_MINUTE: i64 = 60;
        const SECONDS_PER_HOUR: i64 = 60 * SECONDS_PER_MINUTE;
        const SECONDS_PER_DAY: i64 = 24 * SECONDS_PER_HOUR;

        // Whole days since the epoch, widened before multiplying.
        let days = i64::from(self.to_unix_epoch());
        let time_of_day = i64::from(self.hour) * SECONDS_PER_HOUR
            + i64::from(self.minute) * SECONDS_PER_MINUTE
            + i64::from(self.second);
        days * SECONDS_PER_DAY + time_of_day
    }
}
/// Builds the table of formatted date strings, one per date that can be
/// generated
///
/// index: days since [`MIN_GENERATE_DATE`]
/// value: strings like 1992-01-01
fn make_date_string_index() -> Vec<String> {
    let mut table = Vec::with_capacity(TOTAL_DATE_RANGE as usize);
    // `to_ymd` takes a 1-based day index, hence the inclusive range from 1.
    for day in 1..=TOTAL_DATE_RANGE {
        let (year, month, day_of_month) = to_ymd(day);
        table.push(format_ymd(year, month, day_of_month));
    }
    table
}
/// Converts a 1-based day index (1 is [`MIN_GENERATE_DATE`]) into
/// (year, month, day), with the year relative to 1900
///
/// See [`TPCHDate::to_ymd`] for more information
fn to_ymd(index: i32) -> (i32, i32, i32) {
    // `julian` yields yyddd: split it into the year and the 1-based day of
    // that year.
    let yyddd = julian(index + MIN_GENERATE_DATE - 1);
    let (y, d) = (yyddd / 1000, yyddd % 1000);

    // Walk forward to the first month whose last day-of-year is at or after
    // `d`; in leap years the month ends from February on are one day later.
    let mut m = 0;
    loop {
        let month_end = MONTH_YEAR_DAY_START[m as usize] + leap_year_adjustment(y, m);
        if d <= month_end {
            break;
        }
        m += 1;
    }

    // Remove the days before this month, including the leap day for months
    // after February.
    let leap_day = i32::from(is_leap_year(y) && m > 2);
    let dy = d - MONTH_YEAR_DAY_START[(m - 1) as usize] - leap_day;
    (y, m, dy)
}
/// Formats the specified y, m, d as `yyyy-mm-dd`
///
/// `y` is the year relative to 1900, so `92` is rendered as `1992`.
fn format_ymd(y: i32, m: i32, dy: i32) -> String {
    let full_year = 1900 + y;
    format!("{:04}-{:02}-{:02}", full_year, m, dy)
}
/// Converts a generated date (a consecutive day number starting at
/// [`MIN_GENERATE_DATE`]) into julian `yyddd` form: the year relative to 1900
/// times 1000, plus the 1-based day of that year.
///
/// Helpers duplicated to avoid circular references
const fn julian(date: i32) -> i32 {
    // Days still to be added on top of `year_cursor`.
    let mut remaining = date - MIN_GENERATE_DATE;
    // Julian value being advanced one year at a time.
    let mut year_cursor = MIN_GENERATE_DATE;
    loop {
        let year = year_cursor / 1000;
        let days_in_year = if is_leap_year(year) { 366 } else { 365 };
        let last_day_of_year = year * 1000 + days_in_year;
        if year_cursor + remaining <= last_day_of_year {
            return year_cursor + remaining;
        }
        // Consume the rest of this year and move on to the next one.
        remaining -= last_day_of_year - year_cursor + 1;
        year_cursor += 1000;
    }
}
/// Returns whether `year` is a leap year: divisible by 4 but not by 100.
///
/// The 400-year exception is not applied. The generator passes years relative
/// to 1900 and only covers 1992 through 1998, where that rule plays no role.
const fn is_leap_year(year: i32) -> bool {
    let divisible_by_four = year % 4 == 0;
    let divisible_by_hundred = year % 100 == 0;
    divisible_by_four && !divisible_by_hundred
}
/// Returns the number of days (`0` or `1`) to add to the non-leap cumulative
/// end of `month`: `1` when `year` is a leap year and `month` is February or
/// later, `0` otherwise.
const fn leap_year_adjustment(year: i32, month: i32) -> i32 {
    (is_leap_year(year) && month >= 2) as i32
}
#[cfg(test)]
mod test {
    use super::*;

    /// Builds the midnight timestamp `days` days after the first generated date
    fn midnight(days: i32) -> TPCHDate {
        TPCHDate::new(MIN_GENERATE_DATE + days, 0, 0, 0)
    }

    #[test]
    fn test_date_strings() {
        let cases = [
            (1, "1992-01-02 00:00:00"),
            (1234, "1995-05-19 00:00:00"),
            // last date that can be generated
            (TOTAL_DATE_RANGE - 1, "1998-12-31 00:00:00"),
        ];
        for (days, expected) in cases {
            assert_eq!(midnight(days).to_string(), expected);
        }
    }

    #[test]
    fn test_display_dates() {
        // The Display output must agree with the y/m/d decomposition.
        for days in [1, 23, 321, 623, 1234, 2345, 2556] {
            let date = TPCHDate::new(MIN_GENERATE_DATE + days, 10, 30, 45);
            let (y, m, dy) = date.to_ymd();
            let expected = format!("{} 10:30:45", format_ymd(y, m, dy));
            assert_eq!(expected, date.to_string());
        }
    }

    #[test]
    fn test_date_epoch_consistency() {
        // Check that dates actually map to the expected Unix epoch day numbers.
        let second_day = midnight(1);
        assert_eq!(second_day.to_unix_epoch(), 8036);

        // 1995-05-19 at midnight (12:00:00 AM)
        let later = midnight(1234);
        assert_eq!(later.to_string(), "1995-05-19 00:00:00");
        assert_eq!(later.to_unix_epoch(), 9269);
    }
}