| use std::io; |
| |
| use byteorder::{ReadBytesExt, WriteBytesExt, BigEndian}; |
| use csv; |
| |
/// A simple index for random access to CSV records.
///
/// This index permits seeking to the start of any CSV record with a constant
/// number of operations.
///
/// The format of the index is simplistic and amenable to serializing to disk.
/// It consists of exactly `N+1` 64 bit big-endian integers, where `N` is the
/// number of records in the CSV data that is indexed. Each `i`th integer
/// corresponds to the approximate byte offset where the `i`th record in the
/// CSV data begins. One additional integer is written to the end of the index
/// which indicates the total number of records in the CSV data.
///
/// This indexing format does not store the line numbers of CSV records, so
/// using the positions returned by this index to seek a CSV reader will likely
/// cause any future line numbers reported by that reader to be incorrect.
///
/// This format will never change.
///
/// N.B. The format of this indexing scheme matches the format of the old
/// `csv::Indexed` type in pre-1.0 versions of the `csv` crate.
#[derive(Debug)]
pub struct RandomAccessSimple<R> {
    /// The stream holding the serialized index (or, while creating an index,
    /// the type of the writer it is serialized to).
    rdr: R,
    /// The number of records in the index, i.e. the value of the final
    /// integer in the serialized form.
    len: u64,
}
| |
| impl<W: io::Write> RandomAccessSimple<W> { |
| /// Write a simple index to the given writer for the given CSV reader. |
| /// |
| /// If there was a problem reading CSV records or writing to the given |
| /// writer, then an error is returned. |
| /// |
| /// That the given CSV reader is read as given until EOF. The index |
| /// produced includes all records, including the first record even if the |
| /// CSV reader is configured to interpret the first record as a header |
| /// record. |
| /// |
| /// # Example: in memory index |
| /// |
| /// This example shows how to create a simple random access index, open it |
| /// and query the number of records in the index. |
| /// |
| /// ``` |
| /// extern crate csv; |
| /// extern crate csv_index; |
| /// |
| /// use std::io; |
| /// use csv_index::RandomAccessSimple; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> csv::Result<()> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States,4628910 |
| /// Concord,United States,42695 |
| /// "; |
| /// let mut rdr = csv::Reader::from_reader(data.as_bytes()); |
| /// let mut wtr = io::Cursor::new(vec![]); |
| /// RandomAccessSimple::create(&mut rdr, &mut wtr)?; |
| /// |
| /// let idx = RandomAccessSimple::open(wtr)?; |
| /// assert_eq!(idx.len(), 3); |
| /// Ok(()) |
| /// } |
| /// ``` |
| /// |
| /// # Example: file backed index |
| /// |
| /// This is like the previous example, but instead of creating the index |
| /// in memory with `std::io::Cursor`, we write the index to a file. |
| /// |
| /// ```no_run |
| /// extern crate csv; |
| /// extern crate csv_index; |
| /// |
| /// use std::fs::File; |
| /// use std::io; |
| /// use csv_index::RandomAccessSimple; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> csv::Result<()> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States,4628910 |
| /// Concord,United States,42695 |
| /// "; |
| /// let mut rdr = csv::Reader::from_reader(data.as_bytes()); |
| /// let mut wtr = File::create("data.csv.idx")?; |
| /// RandomAccessSimple::create(&mut rdr, &mut wtr)?; |
| /// |
| /// let fileidx = File::open("data.csv.idx")?; |
| /// let idx = RandomAccessSimple::open(fileidx)?; |
| /// assert_eq!(idx.len(), 3); |
| /// Ok(()) |
| /// } |
| /// ``` |
| pub fn create<R: io::Read>( |
| rdr: &mut csv::Reader<R>, |
| mut wtr: W, |
| ) -> csv::Result<()> |
| { |
| // If the reader is configured to read a header, then read that |
| // first. (The CSV reader otherwise won't yield the header record |
| // when calling `read_byte_record`.) |
| let mut len = 0; |
| if rdr.has_headers() { |
| let header = rdr.byte_headers()?; |
| if !header.is_empty() { |
| let pos = header.position().expect("position on header row"); |
| wtr.write_u64::<BigEndian>(pos.byte())?; |
| len += 1; |
| } |
| } |
| let mut record = csv::ByteRecord::new(); |
| while rdr.read_byte_record(&mut record)? { |
| let pos = record.position().expect("position on row"); |
| wtr.write_u64::<BigEndian>(pos.byte())?; |
| len += 1; |
| } |
| wtr.write_u64::<BigEndian>(len)?; |
| Ok(()) |
| } |
| } |
| |
| impl<R: io::Read + io::Seek> RandomAccessSimple<R> { |
| /// Open an existing simple CSV index. |
| /// |
| /// The reader given must be seekable and should contain an index written |
| /// by `RandomAccessSimple::create`. |
| /// |
| /// # Example |
| /// |
| /// This example shows how to create a simple random access index, open it |
| /// and query the number of records in the index. |
| /// |
| /// ``` |
| /// extern crate csv; |
| /// extern crate csv_index; |
| /// |
| /// use std::io; |
| /// use csv_index::RandomAccessSimple; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> csv::Result<()> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States,4628910 |
| /// Concord,United States,42695 |
| /// "; |
| /// let mut rdr = csv::Reader::from_reader(data.as_bytes()); |
| /// let mut wtr = io::Cursor::new(vec![]); |
| /// RandomAccessSimple::create(&mut rdr, &mut wtr)?; |
| /// |
| /// let idx = RandomAccessSimple::open(wtr)?; |
| /// assert_eq!(idx.len(), 3); |
| /// Ok(()) |
| /// } |
| /// ``` |
| pub fn open(mut rdr: R) -> csv::Result<RandomAccessSimple<R>> { |
| rdr.seek(io::SeekFrom::End(-8))?; |
| let len = rdr.read_u64::<BigEndian>()?; |
| Ok(RandomAccessSimple { |
| rdr: rdr, |
| len: len, |
| }) |
| } |
| |
| /// Get the position of the record at index `i`. |
| /// |
| /// The first record has index `0`. |
| /// |
| /// If the position returned is used to seek the CSV reader that was used |
| /// to create this index, then the next record read by the CSV reader will |
| /// be the `i`th record. |
| /// |
| /// Note that since this index does not store the line number of each |
| /// record, the position returned will always have a line number equivalent |
| /// to `1`. This in turn will cause the CSV reader to report all subsequent |
| /// line numbers incorrectly. |
| /// |
| /// # Example |
| /// |
| /// This example shows how to create a simple random access index, open it |
| /// and use it to seek a CSV reader to read an arbitrary record. |
| /// |
| /// ``` |
| /// extern crate csv; |
| /// extern crate csv_index; |
| /// |
| /// use std::error::Error; |
| /// use std::io; |
| /// use csv_index::RandomAccessSimple; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States,4628910 |
| /// Concord,United States,42695 |
| /// "; |
| /// // Note that we wrap our CSV data in an io::Cursor, which makes it |
| /// // seekable. If you're opening CSV data from a file, then this is |
| /// // not needed since a `File` is already seekable. |
| /// let mut rdr = csv::Reader::from_reader(io::Cursor::new(data)); |
| /// let mut wtr = io::Cursor::new(vec![]); |
| /// RandomAccessSimple::create(&mut rdr, &mut wtr)?; |
| /// |
| /// // Open the index we just created, get the position of the last |
| /// // record and seek the CSV reader. |
| /// let mut idx = RandomAccessSimple::open(wtr)?; |
| /// let pos = idx.get(2)?; |
| /// rdr.seek(pos)?; |
| /// |
| /// // Read the next record. |
| /// if let Some(result) = rdr.records().next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec!["Concord", "United States", "42695"]); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
| pub fn get(&mut self, i: u64) -> csv::Result<csv::Position> { |
| if i >= self.len { |
| let msg = format!( |
| "invalid record index {} (there are {} records)", i, self.len); |
| let err = io::Error::new(io::ErrorKind::Other, msg); |
| return Err(csv::Error::from(err)); |
| } |
| self.rdr.seek(io::SeekFrom::Start(i * 8))?; |
| let offset = self.rdr.read_u64::<BigEndian>()?; |
| let mut pos = csv::Position::new(); |
| pos.set_byte(offset).set_record(i); |
| Ok(pos) |
| } |
| |
| /// Return the number of records (including the header record) in this |
| /// index. |
| pub fn len(&self) -> u64 { |
| self.len |
| } |
| |
| /// Return true if and only if this index has zero records. |
| pub fn is_empty(&self) -> bool { |
| self.len() == 0 |
| } |
| } |
| |
#[cfg(test)]
mod tests {
    use std::io;

    use csv;

    use super::RandomAccessSimple;

    /// A CSV reader over in-memory data paired with an in-memory index that
    /// was built from that same reader.
    struct Indexed<'a> {
        csv: csv::Reader<io::Cursor<&'a str>>,
        idx: RandomAccessSimple<io::Cursor<Vec<u8>>>,
    }

    impl<'a> Indexed<'a> {
        /// Index `csv_data`, treating the first record as a header row when
        /// `headers` is true, and reopen the resulting index for querying.
        fn new(headers: bool, csv_data: &'a str) -> Indexed<'a> {
            let mut rdr = csv::ReaderBuilder::new()
                .has_headers(headers)
                .from_reader(io::Cursor::new(csv_data));
            let mut idxbuf = io::Cursor::new(vec![]);
            RandomAccessSimple::create(&mut rdr, &mut idxbuf).unwrap();
            Indexed {
                csv: rdr,
                idx: RandomAccessSimple::open(idxbuf).unwrap(),
            }
        }

        /// Seek the CSV reader to the indexed position of `record` and read
        /// the record found there.
        fn read_at(&mut self, record: u64) -> csv::StringRecord {
            let pos = self.idx.get(record).unwrap();
            self.csv.seek(pos).unwrap();
            self.csv.records().next().unwrap().unwrap()
        }
    }

    #[test]
    fn headers_empty() {
        let idx = Indexed::new(true, "");
        assert_eq!(idx.idx.len(), 0);
        assert!(idx.idx.is_empty());
    }

    #[test]
    fn no_headers_empty() {
        let idx = Indexed::new(false, "");
        assert_eq!(idx.idx.len(), 0);
        assert!(idx.idx.is_empty());
    }

    #[test]
    fn headers_one_field() {
        let mut idx = Indexed::new(true, "h1\na\nb\nc\n");
        assert_eq!(idx.idx.len(), 4);
        assert_eq!(idx.read_at(0), vec!["h1"]);
        assert_eq!(idx.read_at(1), vec!["a"]);
        assert_eq!(idx.read_at(2), vec!["b"]);
        assert_eq!(idx.read_at(3), vec!["c"]);
    }

    #[test]
    fn headers_many_fields() {
        let mut idx =
            Indexed::new(true, "h1,h2,h3\na,b,c\nd,e,f\ng,h,i\n");
        assert_eq!(idx.idx.len(), 4);
        assert_eq!(idx.read_at(0), vec!["h1", "h2", "h3"]);
        assert_eq!(idx.read_at(1), vec!["a", "b", "c"]);
        assert_eq!(idx.read_at(2), vec!["d", "e", "f"]);
        assert_eq!(idx.read_at(3), vec!["g", "h", "i"]);
    }

    #[test]
    fn no_headers_one_field() {
        let mut idx = Indexed::new(false, "h1\na\nb\nc\n");
        assert_eq!(idx.idx.len(), 4);
        assert_eq!(idx.read_at(0), vec!["h1"]);
        assert_eq!(idx.read_at(1), vec!["a"]);
        assert_eq!(idx.read_at(2), vec!["b"]);
        assert_eq!(idx.read_at(3), vec!["c"]);
    }

    #[test]
    fn no_headers_many_fields() {
        let mut idx =
            Indexed::new(false, "h1,h2,h3\na,b,c\nd,e,f\ng,h,i\n");
        assert_eq!(idx.idx.len(), 4);
        assert_eq!(idx.read_at(0), vec!["h1", "h2", "h3"]);
        assert_eq!(idx.read_at(1), vec!["a", "b", "c"]);
        assert_eq!(idx.read_at(2), vec!["d", "e", "f"]);
        assert_eq!(idx.read_at(3), vec!["g", "h", "i"]);
    }

    #[test]
    fn headers_one_field_newlines() {
        // Runs of blank lines before, between and after the records. They
        // are spelled with escapes so the fixture does not depend on how the
        // source file's whitespace is preserved.
        let data = concat!(
            "\n\n\n\n\n",
            "h1\n",
            "\n",
            "a\n",
            "\n\n",
            "b\n",
            "\n\n\n\n\n\n",
            "c\n",
            "\n\n\n\n\n\n",
        );
        let mut idx = Indexed::new(true, data);
        assert_eq!(idx.idx.len(), 4);
        assert_eq!(idx.read_at(0), vec!["h1"]);
        assert_eq!(idx.read_at(1), vec!["a"]);
        assert_eq!(idx.read_at(2), vec!["b"]);
        assert_eq!(idx.read_at(3), vec!["c"]);
    }

    #[test]
    fn get_reports_record_index() {
        let mut idx = Indexed::new(true, "h1\na\n");
        assert_eq!(idx.idx.len(), 2);
        assert!(!idx.idx.is_empty());
        assert_eq!(idx.idx.get(0).unwrap().record(), 0);
        assert_eq!(idx.idx.get(1).unwrap().record(), 1);
    }

    #[test]
    fn get_out_of_range_is_error() {
        let mut idx = Indexed::new(true, "h1\na\n");
        assert_eq!(idx.idx.len(), 2);
        // The first index past the end, and one far beyond it.
        assert!(idx.idx.get(2).is_err());
        assert!(idx.idx.get(100).is_err());
        // A failed lookup must not prevent later valid lookups.
        assert_eq!(idx.read_at(1), vec!["a"]);
    }
}