| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| use strict; |
| use warnings; |
| |
| package LucyX::Index::ZlibDocReader; |
| use base qw( Lucy::Index::DocReader ); |
| our $VERSION = '0.004001'; |
| $VERSION = eval $VERSION; |
| use Clownfish::Util::StringHelper qw( utf8_valid utf8_flag_on ); |
| use Compress::Zlib qw( uncompress ); |
| use Carp; |
| |
| # Inside-out member vars. |
| our %ix_in; |
| our %dat_in; |
| our %binary_fields; |
| |
| sub new { |
| my ( $either, %args ) = @_; |
| my $self = $either->SUPER::new(%args); |
| |
| # Validate metadata. Only open streams if the segment has data we |
| # recognize. |
| my $segment = $self->get_segment; |
| my $metadata = $segment->fetch_metadata("zdocs"); |
| if ($metadata) { |
| if ( $metadata->{format} != 1 ) { |
| confess("Unrecognized format: '$metadata->{format}'"); |
| } |
| |
| # Open InStreams. |
| my $dat_filename = $segment->get_name . "/zdocs.dat"; |
| my $ix_filename = $segment->get_name . "/zdocs.ix"; |
| my $folder = $self->get_folder; |
| $ix_in{$$self} = $folder->open_in($ix_filename) |
| or confess Clownfish->error; |
| $dat_in{$$self} = $folder->open_in($dat_filename) |
| or confess Clownfish->error; |
| |
| # Remember which fields are binary. |
| my $schema = $self->get_schema; |
| my $bin_fields = $binary_fields{$$self} = {}; |
| $bin_fields->{$_} = 1 |
| for grep { $schema->fetch_type($_)->binary } |
| @{ $schema->all_fields }; |
| } |
| |
| return $self; |
| } |
| |
| sub fetch_doc { |
| my ( $self, $doc_id ) = @_; |
| my $dat_in = $dat_in{$$self}; |
| my $ix_in = $ix_in{$$self}; |
| my $bin_fields = $binary_fields{$$self}; |
| |
| # Bail if no data in segment. |
| return unless $ix_in; |
| |
| # Read index information. |
| $ix_in->seek( $doc_id * 8 ); |
| my $start = $ix_in->read_i64; |
| my $len = $ix_in->read_i64 - $start; |
| my $compressed; |
| |
| # Read main data. |
| $dat_in->seek($start); |
| $dat_in->read( $compressed, $len ); |
| my $inflated = uncompress($compressed); |
| my $num_fields = unpack( "w", $inflated ); |
| my $pack_template = "w "; |
| $pack_template .= "w/a*" x ( $num_fields * 2 ); |
| my ( undef, %fields ) = unpack( $pack_template, $inflated ); |
| |
| # Turn on UTF-8 flag for string fields. |
| for my $field ( keys %fields ) { |
| next if $bin_fields->{$field}; |
| utf8_flag_on( $fields{$field} ); |
| confess("Invalid UTF-8 read for doc $doc_id field '$field'") |
| unless utf8_valid( $fields{$field} ); |
| } |
| |
| return Lucy::Document::HitDoc->new( |
| fields => \%fields, |
| doc_id => $doc_id, |
| ); |
| } |
| |
| sub read_record { |
| my ( $self, $doc_id, $buf ) = @_; |
| my $dat_in = $dat_in{$$self}; |
| my $ix_in = $ix_in{$$self}; |
| my $bin_fields = $binary_fields{$$self}; |
| |
| if ($ix_in) { |
| $ix_in->seek( $doc_id * 8 ); |
| my $start = $ix_in->read_i64; |
| my $len = $ix_in->read_i64 - $start; |
| $dat_in->seek($start); |
| $dat_in->read( $$buf, $len ); |
| } |
| } |
| |
| sub close { |
| my $self = shift; |
| delete $ix_in{$$self}; |
| delete $dat_in{$$self}; |
| delete $binary_fields{$$self}; |
| } |
| |
| sub DESTROY { |
| my $self = shift; |
| delete $ix_in{$$self}; |
| delete $dat_in{$$self}; |
| delete $binary_fields{$$self}; |
| $self->SUPER::DESTROY; |
| } |
| |
| 1; |
| |
| __END__ |
| |
| __POD__ |
| |
| =head1 NAME |
| |
| LucyX::Index::ZlibDocReader - Compressed doc storage. |
| |
| =head1 DESCRIPTION |
| |
| This is a proof-of-concept class to demonstrate alternate implementations for |
| fetching documents. It is unsupported. |
| |
| =cut |