blob: 255d2458de5828a8316f24e39c99c83101b23a61 [file] [log] [blame]
use std::io;
use byteorder::{ReadBytesExt, WriteBytesExt, BigEndian};
use csv;
/// A simple index for random access to CSV records.
///
/// This index permits seeking to the start of any CSV record with a constant
/// number of operations.
///
/// The format of the index is simplistic and amenable to serializing to disk.
/// It consists of exactly `N+1` 64 bit big-endian integers, where `N` is the
/// number of records in the CSV data that is indexed. Each `i`th integer
/// corresponds to the approximate byte offset where the `i`th record in the
/// CSV data begins. One additional integer is written to the end of the index
/// which indicates the total number of records in the CSV data.
///
/// This indexing format does not store the line numbers of CSV records, so
/// using the positions returned by this index to seek a CSV reader will likely
/// cause any future line numbers reported by that reader to be incorrect.
///
/// This format will never change.
///
/// N.B. The format of this indexing scheme matches the format of the old
/// `csv::Indexed` type in pre-1.0 versions of the `csv` crate.
pub struct RandomAccessSimple<R> {
// The underlying index stream. `open` stores the seekable reader here and
// `get` seeks within it to fetch individual offsets.
rdr: R,
// The record count, as read by `open` from the final 8 bytes of the index.
len: u64,
}
impl<W: io::Write> RandomAccessSimple<W> {
    /// Write a simple index to the given writer for the given CSV reader.
    ///
    /// If there was a problem reading CSV records or writing to the given
    /// writer, then an error is returned.
    ///
    /// Note that the given CSV reader is read as given until EOF. The index
    /// produced includes all records, including the first record even if the
    /// CSV reader is configured to interpret the first record as a header
    /// record.
    ///
    /// The writer is flushed before this function returns successfully, so
    /// that write errors from buffered writers are reported to the caller
    /// instead of being silently dropped when the writer goes out of scope.
    ///
    /// # Errors
    ///
    /// Returns an error if reading a CSV record fails, or if writing to or
    /// flushing the given writer fails.
    ///
    /// # Panics
    ///
    /// Panics if a record yielded by the CSV reader has no position attached
    /// to it.
    ///
    /// # Example: in memory index
    ///
    /// This example shows how to create a simple random access index, open it
    /// and query the number of records in the index.
    ///
    /// ```
    /// extern crate csv;
    /// extern crate csv_index;
    ///
    /// use std::io;
    /// use csv_index::RandomAccessSimple;
    ///
    /// # fn main() { example().unwrap(); }
    /// fn example() -> csv::Result<()> {
    ///     let data = "\
    /// city,country,pop
    /// Boston,United States,4628910
    /// Concord,United States,42695
    /// ";
    ///     let mut rdr = csv::Reader::from_reader(data.as_bytes());
    ///     let mut wtr = io::Cursor::new(vec![]);
    ///     RandomAccessSimple::create(&mut rdr, &mut wtr)?;
    ///
    ///     let idx = RandomAccessSimple::open(wtr)?;
    ///     assert_eq!(idx.len(), 3);
    ///     Ok(())
    /// }
    /// ```
    ///
    /// # Example: file backed index
    ///
    /// This is like the previous example, but instead of creating the index
    /// in memory with `std::io::Cursor`, we write the index to a file.
    ///
    /// ```no_run
    /// extern crate csv;
    /// extern crate csv_index;
    ///
    /// use std::fs::File;
    /// use std::io;
    /// use csv_index::RandomAccessSimple;
    ///
    /// # fn main() { example().unwrap(); }
    /// fn example() -> csv::Result<()> {
    ///     let data = "\
    /// city,country,pop
    /// Boston,United States,4628910
    /// Concord,United States,42695
    /// ";
    ///     let mut rdr = csv::Reader::from_reader(data.as_bytes());
    ///     let mut wtr = File::create("data.csv.idx")?;
    ///     RandomAccessSimple::create(&mut rdr, &mut wtr)?;
    ///
    ///     let fileidx = File::open("data.csv.idx")?;
    ///     let idx = RandomAccessSimple::open(fileidx)?;
    ///     assert_eq!(idx.len(), 3);
    ///     Ok(())
    /// }
    /// ```
    pub fn create<R: io::Read>(
        rdr: &mut csv::Reader<R>,
        mut wtr: W,
    ) -> csv::Result<()> {
        // Running count of offsets written so far; it becomes the trailer.
        let mut len: u64 = 0;

        // If the reader is configured to read a header, then read that
        // first. (The CSV reader otherwise won't yield the header record
        // when calling `read_byte_record`.)
        if rdr.has_headers() {
            let header = rdr.byte_headers()?;
            // An empty header record means there is nothing to index for it
            // (e.g. the CSV data itself is empty).
            if !header.is_empty() {
                let pos = header.position().expect("position on header row");
                wtr.write_u64::<BigEndian>(pos.byte())?;
                len += 1;
            }
        }

        // One big-endian u64 byte offset per remaining record. The record
        // buffer is reused across iterations.
        let mut record = csv::ByteRecord::new();
        while rdr.read_byte_record(&mut record)? {
            let pos = record.position().expect("position on row");
            wtr.write_u64::<BigEndian>(pos.byte())?;
            len += 1;
        }

        // The trailer: total number of records, which `open` reads back from
        // the last 8 bytes of the index.
        wtr.write_u64::<BigEndian>(len)?;

        // The writer is owned by this function. Without an explicit flush, a
        // buffered writer would only flush on drop, where any I/O error is
        // discarded and a truncated index could go unnoticed.
        wtr.flush()?;
        Ok(())
    }
}
impl<R: io::Read + io::Seek> RandomAccessSimple<R> {
    /// Open an existing simple CSV index.
    ///
    /// The reader given must be seekable and should contain an index written
    /// by `RandomAccessSimple::create`.
    ///
    /// # Errors
    ///
    /// Returns an error if seeking to the final 8 bytes of the reader fails
    /// or if the record count stored there cannot be read.
    ///
    /// # Example
    ///
    /// This example shows how to create a simple random access index, open it
    /// and query the number of records in the index.
    ///
    /// ```
    /// extern crate csv;
    /// extern crate csv_index;
    ///
    /// use std::io;
    /// use csv_index::RandomAccessSimple;
    ///
    /// # fn main() { example().unwrap(); }
    /// fn example() -> csv::Result<()> {
    ///     let data = "\
    /// city,country,pop
    /// Boston,United States,4628910
    /// Concord,United States,42695
    /// ";
    ///     let mut rdr = csv::Reader::from_reader(data.as_bytes());
    ///     let mut wtr = io::Cursor::new(vec![]);
    ///     RandomAccessSimple::create(&mut rdr, &mut wtr)?;
    ///
    ///     let idx = RandomAccessSimple::open(wtr)?;
    ///     assert_eq!(idx.len(), 3);
    ///     Ok(())
    /// }
    /// ```
    pub fn open(mut rdr: R) -> csv::Result<RandomAccessSimple<R>> {
        // The record count is the last big-endian u64 in the index.
        rdr.seek(io::SeekFrom::End(-8))?;
        let len = rdr.read_u64::<BigEndian>()?;
        Ok(RandomAccessSimple { rdr: rdr, len: len })
    }

    /// Get the position of the record at index `i`.
    ///
    /// The first record has index `0`.
    ///
    /// If the position returned is used to seek the CSV reader that was used
    /// to create this index, then the next record read by the CSV reader will
    /// be the `i`th record.
    ///
    /// Note that since this index does not store the line number of each
    /// record, the position returned will always have a line number equivalent
    /// to `1`. This in turn will cause the CSV reader to report all subsequent
    /// line numbers incorrectly.
    ///
    /// # Errors
    ///
    /// Returns an error if `i` is not less than the number of records in the
    /// index, if the byte offset of the index entry cannot be represented as
    /// a `u64` (which can only happen when the record count stored in the
    /// index is corrupt), or if seeking or reading the underlying reader
    /// fails.
    ///
    /// # Example
    ///
    /// This example shows how to create a simple random access index, open it
    /// and use it to seek a CSV reader to read an arbitrary record.
    ///
    /// ```
    /// extern crate csv;
    /// extern crate csv_index;
    ///
    /// use std::error::Error;
    /// use std::io;
    /// use csv_index::RandomAccessSimple;
    ///
    /// # fn main() { example().unwrap(); }
    /// fn example() -> Result<(), Box<Error>> {
    ///     let data = "\
    /// city,country,pop
    /// Boston,United States,4628910
    /// Concord,United States,42695
    /// ";
    ///     // Note that we wrap our CSV data in an io::Cursor, which makes it
    ///     // seekable. If you're opening CSV data from a file, then this is
    ///     // not needed since a `File` is already seekable.
    ///     let mut rdr = csv::Reader::from_reader(io::Cursor::new(data));
    ///     let mut wtr = io::Cursor::new(vec![]);
    ///     RandomAccessSimple::create(&mut rdr, &mut wtr)?;
    ///
    ///     // Open the index we just created, get the position of the last
    ///     // record and seek the CSV reader.
    ///     let mut idx = RandomAccessSimple::open(wtr)?;
    ///     let pos = idx.get(2)?;
    ///     rdr.seek(pos)?;
    ///
    ///     // Read the next record.
    ///     if let Some(result) = rdr.records().next() {
    ///         let record = result?;
    ///         assert_eq!(record, vec!["Concord", "United States", "42695"]);
    ///         Ok(())
    ///     } else {
    ///         Err(From::from("expected at least one record but got none"))
    ///     }
    /// }
    /// ```
    pub fn get(&mut self, i: u64) -> csv::Result<csv::Position> {
        if i >= self.len {
            let msg = format!(
                "invalid record index {} (there are {} records)", i, self.len);
            let err = io::Error::new(io::ErrorKind::Other, msg);
            return Err(csv::Error::from(err));
        }

        // Each entry is 8 bytes wide. `self.len` was read from the index
        // itself, so a corrupt trailer could let a huge `i` pass the bounds
        // check above; guard the multiplication instead of overflowing.
        let entry_start = match i.checked_mul(8) {
            Some(entry_start) => entry_start,
            None => {
                let msg = format!(
                    "record index {} is too large to address in the index", i);
                let err = io::Error::new(io::ErrorKind::InvalidData, msg);
                return Err(csv::Error::from(err));
            }
        };

        self.rdr.seek(io::SeekFrom::Start(entry_start))?;
        let offset = self.rdr.read_u64::<BigEndian>()?;

        // Only the byte offset and record number are known; the line number
        // keeps whatever default `csv::Position::new` assigns.
        let mut pos = csv::Position::new();
        pos.set_byte(offset).set_record(i);
        Ok(pos)
    }

    /// Return the number of records (including the header record) in this
    /// index.
    pub fn len(&self) -> u64 {
        self.len
    }

    /// Return true if and only if this index has zero records.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
#[cfg(test)]
mod tests {
    use std::io;

    use csv;

    use super::RandomAccessSimple;

    /// A seekable in-memory CSV reader paired with a simple index that was
    /// built from that same reader.
    struct Indexed<'a> {
        csv: csv::Reader<io::Cursor<&'a str>>,
        idx: RandomAccessSimple<io::Cursor<Vec<u8>>>,
    }

    impl<'a> Indexed<'a> {
        /// Build a CSV reader over `csv_data` (treating the first record as
        /// a header when `headers` is true) and index it in memory.
        fn new(headers: bool, csv_data: &'a str) -> Indexed<'a> {
            let source = io::Cursor::new(csv_data);
            let mut reader = csv::ReaderBuilder::new()
                .has_headers(headers)
                .from_reader(source);

            let mut index_bytes = io::Cursor::new(vec![]);
            RandomAccessSimple::create(&mut reader, &mut index_bytes).unwrap();
            let index = RandomAccessSimple::open(index_bytes).unwrap();

            Indexed { csv: reader, idx: index }
        }

        /// Seek the CSV reader to the indexed position of `record` and
        /// return the record read from there.
        fn read_at(&mut self, record: u64) -> csv::StringRecord {
            let pos = self.idx.get(record).unwrap();
            self.csv.seek(pos).unwrap();
            let next = self.csv.records().next();
            next.unwrap().unwrap()
        }

        /// Assert that the index holds exactly `expected.len()` records and
        /// that reading at each index, in order, yields the matching row.
        fn assert_rows(&mut self, expected: &[&[&str]]) {
            assert_eq!(self.idx.len(), expected.len() as u64);
            for (i, row) in expected.iter().enumerate() {
                assert_eq!(self.read_at(i as u64), row.to_vec());
            }
        }
    }

    #[test]
    fn headers_empty() {
        let idx = Indexed::new(true, "");
        assert_eq!(idx.idx.len(), 0);
    }

    #[test]
    fn headers_one_field() {
        let mut idx = Indexed::new(true, "h1\na\nb\nc\n");
        idx.assert_rows(&[&["h1"], &["a"], &["b"], &["c"]]);
    }

    #[test]
    fn headers_many_fields() {
        let mut idx = Indexed::new(true, "\
h1,h2,h3
a,b,c
d,e,f
g,h,i
");
        idx.assert_rows(&[
            &["h1", "h2", "h3"],
            &["a", "b", "c"],
            &["d", "e", "f"],
            &["g", "h", "i"],
        ]);
    }

    #[test]
    fn no_headers_one_field() {
        let mut idx = Indexed::new(false, "h1\na\nb\nc\n");
        idx.assert_rows(&[&["h1"], &["a"], &["b"], &["c"]]);
    }

    #[test]
    fn no_headers_many_fields() {
        let mut idx = Indexed::new(false, "\
h1,h2,h3
a,b,c
d,e,f
g,h,i
");
        idx.assert_rows(&[
            &["h1", "h2", "h3"],
            &["a", "b", "c"],
            &["d", "e", "f"],
            &["g", "h", "i"],
        ]);
    }

    #[test]
    fn headers_one_field_newlines() {
        let mut idx = Indexed::new(true, "
h1
a
b
c
");
        idx.assert_rows(&[&["h1"], &["a"], &["b"], &["c"]]);
    }
}