spatialbench/src/text.rs - sedona-spatialbench - Git at Google

 //! Implementation of text pool and text generation.
 //!
 //! Most of this code has been ported from the Apache Trino TPC-H generator
 //! implementation. The original code can be found in the following link:
 //!
 //! <https://github.com/trinodb/tpch/blob/master/src/main/java/io/trino/tpch/TextPool.java>

 use crate::{distribution::Distributions, random::RowRandomInt};
 use std::sync::OnceLock;

 /// Pool of random text that follows TPC-H grammar.
 #[derive(Debug, Clone)]
 pub struct TextPool {
     /// Bytes making up the text pool, exact size.
     text: Vec<u8>,
 }

 /// The default global text pool is lazily initialized once and shared across
 /// all the table generators.
 static DEFAULT_TEXT_POOL: OnceLock<TextPool> = OnceLock::new();

 impl TextPool {
     /// Default text pool size.
     const DEFAULT_TEXT_POOL_SIZE: i32 = 300 * 1024 * 1024;
     /// Maximum length of a sentence in the text.
     const MAX_SENTENCE_LENGTH: i32 = 256;

     /// Returns the default text pool or initializes for the first time if
     /// that's not already the case.
     pub fn get_or_init_default() -> &'static Self {
         DEFAULT_TEXT_POOL.get_or_init(|| {
             Self::new(
                 Self::DEFAULT_TEXT_POOL_SIZE,
                 Distributions::static_default(),
             )
         })
     }

     /// Returns a new text pool with a predefined size and set of distributions.
     pub fn new(size: i32, distributions: &Distributions) -> Self {
         let mut rng = RowRandomInt::new(933588178, i32::MAX);
         let mut text_bytes = Vec::with_capacity(size as usize + Self::MAX_SENTENCE_LENGTH as usize);

         while text_bytes.len() < size as usize {
             Self::generate_sentence(distributions, &mut text_bytes, &mut rng);
         }
         text_bytes.truncate(size as usize);

         Self { text: text_bytes }
     }

     /// Returns the text pool size.
     pub fn size(&self) -> i32 {
         // Cast is fine since we truncated the bytes to `size` in `new`, which
         // is an i32.
         self.text.len() as i32
     }

     /// Returns a chunk of text from the pool
     ///
     /// Returns the text from the pool between the given begin and end indices.
     pub fn text(&self, begin: i32, end: i32) -> &str {
         // get slice of bytes (note this also does bounds checks)
         let result: &[u8] = &self.text[begin as usize..end as usize];
         // Safety: text pool contains only ASCII
         unsafe { std::str::from_utf8_unchecked(result) }
     }

     fn generate_sentence(
         distributions: &Distributions,
         output: &mut Vec<u8>,
         random: &mut RowRandomInt,
     ) {
         let syntax = distributions.grammar().random_value(random);
         let max_length = syntax.len();

         for c in syntax.chars().take(max_length).step_by(2) {
             match c {
                 'V' => Self::generate_verb_phrase(distributions, output, random),
                 'N' => Self::generate_noun_phrase(distributions, output, random),
                 'P' => {
                     let preposition = distributions.prepositions().random_value(random);
                     output.extend_from_slice(preposition.as_bytes());
                     output.extend_from_slice(b" the ");
                     Self::generate_noun_phrase(distributions, output, random);
                 }
                 'T' => {
                     output.pop().expect("at least one byte");
                     let terminator = distributions.terminators().random_value(random);
                     output.extend_from_slice(terminator.as_bytes());
                 }
                 c => panic!("Unknown token '{}'", c),
             };

             let last = output.last().copied().expect("at least one byte");
             if last != b' ' {
                 output.push(b' ');
             }
         }
     }

     fn generate_verb_phrase(
         distributions: &Distributions,
         output: &mut Vec<u8>,
         random: &mut RowRandomInt,
     ) {
         let syntax = distributions.verb_phrase().random_value(random);
         let max_length = syntax.len();

         for c in syntax.chars().take(max_length).step_by(2) {
             let source = match c {
                 'D' => distributions.adverbs(),
                 'V' => distributions.verbs(),
                 'X' => distributions.auxiliaries(),
                 c => panic!("Unknown token '{}'", c),
             };

             // pick a random word
             let word = source.random_value(random);
             output.extend_from_slice(word.as_bytes());

             // add a space
             output.push(b' ');
         }
     }

     fn generate_noun_phrase(
         distributions: &Distributions,
         output: &mut Vec<u8>,
         random: &mut RowRandomInt,
     ) {
         let syntax = distributions.noun_phrase().random_value(random);
         let max_length = syntax.len();

         for c in syntax.chars().take(max_length) {
             let source = match c {
                 'A' => distributions.articles(),
                 'J' => distributions.adjectives(),
                 'D' => distributions.adverbs(),
                 'N' => distributions.nouns(),
                 ',' => {
                     output.pop().expect("at least one byte");
                     output.extend_from_slice(b", ");
                     continue;
                 }
                 ' ' => continue,
                 c => panic!("Unknown token '{}'", c),
             };

             // pick a random word
             let word = source.random_value(random);
             output.extend_from_slice(word.as_bytes());
             output.push(b' ');
         }
     }
 }
	//! Implementation of text pool and text generation.
	//!
	//! Most of this code has been ported from the Apache Trino TPC-H generator
	//! implementation. The original code can be found in the following link:
	//!
	//! <https://github.com/trinodb/tpch/blob/master/src/main/java/io/trino/tpch/TextPool.java>

	use crate::{distribution::Distributions, random::RowRandomInt};
	use std::sync::OnceLock;

	/// Pool of random text that follows TPC-H grammar.
	#[derive(Debug, Clone)]
	pub struct TextPool {
	/// Bytes making up the text pool, exact size.
	text: Vec<u8>,
	}

	/// The default global text pool is lazily initialized once and shared across
	/// all the table generators.
	static DEFAULT_TEXT_POOL: OnceLock<TextPool> = OnceLock::new();

	impl TextPool {
	/// Default text pool size.
	const DEFAULT_TEXT_POOL_SIZE: i32 = 300 * 1024 * 1024;
	/// Maximum length of a sentence in the text.
	const MAX_SENTENCE_LENGTH: i32 = 256;

	/// Returns the default text pool or initializes for the first time if
	/// that's not already the case.
	pub fn get_or_init_default() -> &'static Self {
	DEFAULT_TEXT_POOL.get_or_init(\|\| {
	Self::new(
	Self::DEFAULT_TEXT_POOL_SIZE,
	Distributions::static_default(),
	)
	})
	}

	/// Returns a new text pool with a predefined size and set of distributions.
	pub fn new(size: i32, distributions: &Distributions) -> Self {
	let mut rng = RowRandomInt::new(933588178, i32::MAX);
	let mut text_bytes = Vec::with_capacity(size as usize + Self::MAX_SENTENCE_LENGTH as usize);

	while text_bytes.len() < size as usize {
	Self::generate_sentence(distributions, &mut text_bytes, &mut rng);
	}
	text_bytes.truncate(size as usize);

	Self { text: text_bytes }
	}

	/// Returns the text pool size.
	pub fn size(&self) -> i32 {
	// Cast is fine since we truncated the bytes to `size` in `new`, which
	// is an i32.
	self.text.len() as i32
	}

	/// Returns a chunk of text from the pool
	///
	/// Returns the text from the pool between the given begin and end indices.
	pub fn text(&self, begin: i32, end: i32) -> &str {
	// get slice of bytes (note this also does bounds checks)
	let result: &[u8] = &self.text[begin as usize..end as usize];
	// Safety: text pool contains only ASCII
	unsafe { std::str::from_utf8_unchecked(result) }
	}

	fn generate_sentence(
	distributions: &Distributions,
	output: &mut Vec<u8>,
	random: &mut RowRandomInt,
	) {
	let syntax = distributions.grammar().random_value(random);
	let max_length = syntax.len();

	for c in syntax.chars().take(max_length).step_by(2) {
	match c {
	'V' => Self::generate_verb_phrase(distributions, output, random),
	'N' => Self::generate_noun_phrase(distributions, output, random),
	'P' => {
	let preposition = distributions.prepositions().random_value(random);
	output.extend_from_slice(preposition.as_bytes());
	output.extend_from_slice(b" the ");
	Self::generate_noun_phrase(distributions, output, random);
	}
	'T' => {
	output.pop().expect("at least one byte");
	let terminator = distributions.terminators().random_value(random);
	output.extend_from_slice(terminator.as_bytes());
	}
	c => panic!("Unknown token '{}'", c),
	};

	let last = output.last().copied().expect("at least one byte");
	if last != b' ' {
	output.push(b' ');
	}
	}
	}

	fn generate_verb_phrase(
	distributions: &Distributions,
	output: &mut Vec<u8>,
	random: &mut RowRandomInt,
	) {
	let syntax = distributions.verb_phrase().random_value(random);
	let max_length = syntax.len();

	for c in syntax.chars().take(max_length).step_by(2) {
	let source = match c {
	'D' => distributions.adverbs(),
	'V' => distributions.verbs(),
	'X' => distributions.auxiliaries(),
	c => panic!("Unknown token '{}'", c),
	};

	// pick a random word
	let word = source.random_value(random);
	output.extend_from_slice(word.as_bytes());

	// add a space
	output.push(b' ');
	}
	}

	fn generate_noun_phrase(
	distributions: &Distributions,
	output: &mut Vec<u8>,
	random: &mut RowRandomInt,
	) {
	let syntax = distributions.noun_phrase().random_value(random);
	let max_length = syntax.len();

	for c in syntax.chars().take(max_length) {
	let source = match c {
	'A' => distributions.articles(),
	'J' => distributions.adjectives(),
	'D' => distributions.adverbs(),
	'N' => distributions.nouns(),
	',' => {
	output.pop().expect("at least one byte");
	output.extend_from_slice(b", ");
	continue;
	}
	' ' => continue,
	c => panic!("Unknown token '{}'", c),
	};

	// pick a random word
	let word = source.random_value(random);
	output.extend_from_slice(word.as_bytes());
	output.push(b' ');
	}
	}
	}