blob: 7d39ae6b36a70fac065afa015b359a69cfa6b95c [file] [log] [blame]
/* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define C_LUCY_DOCVECTOR
#include "Lucy/Util/ToolSet.h"
#include "Lucy/Index/DocVector.h"
#include "Lucy/Index/TermVector.h"
#include "Lucy/Store/InStream.h"
#include "Lucy/Store/OutStream.h"
// Extract a document's compressed TermVector data into (term_text =>
// compressed positional data) pairs.
static Hash*
S_extract_tv_cache(ByteBuf *field_buf);
// Pull a TermVector object out from compressed positional data.
static TermVector*
S_extract_tv_from_tv_buf(const CharBuf *field, const CharBuf *term_text,
ByteBuf *tv_buf);
DocVector*
DocVec_new() {
DocVector *self = (DocVector*)VTable_Make_Obj(DOCVECTOR);
return DocVec_init(self);
}
DocVector*
DocVec_init(DocVector *self) {
self->field_bufs = Hash_new(0);
self->field_vectors = Hash_new(0);
return self;
}
void
DocVec_serialize(DocVector *self, OutStream *outstream) {
Hash_Serialize(self->field_bufs, outstream);
Hash_Serialize(self->field_vectors, outstream);
}
DocVector*
DocVec_deserialize(DocVector *self, InStream *instream) {
self = self ? self : (DocVector*)VTable_Make_Obj(DOCVECTOR);
self->field_bufs = Hash_deserialize(NULL, instream);
self->field_vectors = Hash_deserialize(NULL, instream);
return self;
}
void
DocVec_destroy(DocVector *self) {
DECREF(self->field_bufs);
DECREF(self->field_vectors);
SUPER_DESTROY(self, DOCVECTOR);
}
void
DocVec_add_field_buf(DocVector *self, const CharBuf *field,
ByteBuf *field_buf) {
Hash_Store(self->field_bufs, (Obj*)field, INCREF(field_buf));
}
ByteBuf*
DocVec_field_buf(DocVector *self, const CharBuf *field) {
return (ByteBuf*)Hash_Fetch(self->field_bufs, (Obj*)field);
}
VArray*
DocVec_field_names(DocVector *self) {
return Hash_Keys(self->field_bufs);
}
TermVector*
DocVec_term_vector(DocVector *self, const CharBuf *field,
const CharBuf *term_text) {
ByteBuf *tv_buf;
Hash *field_vector = (Hash*)Hash_Fetch(self->field_vectors, (Obj*)field);
// If no cache hit, try to fill cache.
if (field_vector == NULL) {
ByteBuf *field_buf
= (ByteBuf*)Hash_Fetch(self->field_bufs, (Obj*)field);
// Bail if there's no content or the field isn't highlightable.
if (field_buf == NULL) { return NULL; }
field_vector = S_extract_tv_cache(field_buf);
Hash_Store(self->field_vectors, (Obj*)field, (Obj*)field_vector);
}
// Get a buf for the term text or bail.
tv_buf = (ByteBuf*)Hash_Fetch(field_vector, (Obj*)term_text);
if (tv_buf == NULL) {
return NULL;
}
return S_extract_tv_from_tv_buf(field, term_text, tv_buf);
}
static Hash*
S_extract_tv_cache(ByteBuf *field_buf) {
Hash *tv_cache = Hash_new(0);
char *tv_string = BB_Get_Buf(field_buf);
int32_t num_terms = NumUtil_decode_c32(&tv_string);
CharBuf *text = CB_new(0);
int32_t i;
// Read the number of highlightable terms in the field.
for (i = 0; i < num_terms; i++) {
char *bookmark_ptr;
size_t overlap = NumUtil_decode_c32(&tv_string);
size_t len = NumUtil_decode_c32(&tv_string);
int32_t num_positions;
// Decompress the term text.
CB_Set_Size(text, overlap);
CB_Cat_Trusted_Str(text, tv_string, len);
tv_string += len;
// Get positions & offsets string.
bookmark_ptr = tv_string;
num_positions = NumUtil_decode_c32(&tv_string);
while (num_positions--) {
// Leave nums compressed to save a little mem.
NumUtil_skip_cint(&tv_string);
NumUtil_skip_cint(&tv_string);
NumUtil_skip_cint(&tv_string);
}
len = tv_string - bookmark_ptr;
// Store the $text => $posdata pair in the output hash.
Hash_Store(tv_cache, (Obj*)text,
(Obj*)BB_new_bytes(bookmark_ptr, len));
}
DECREF(text);
return tv_cache;
}
static TermVector*
S_extract_tv_from_tv_buf(const CharBuf *field, const CharBuf *term_text,
ByteBuf *tv_buf) {
TermVector *retval = NULL;
char *posdata = BB_Get_Buf(tv_buf);
char *posdata_end = posdata + BB_Get_Size(tv_buf);
int32_t *positions = NULL;
int32_t *starts = NULL;
int32_t *ends = NULL;
uint32_t num_pos = 0;
uint32_t i;
if (posdata != posdata_end) {
num_pos = NumUtil_decode_c32(&posdata);
positions = (int32_t*)MALLOCATE(num_pos * sizeof(int32_t));
starts = (int32_t*)MALLOCATE(num_pos * sizeof(int32_t));
ends = (int32_t*)MALLOCATE(num_pos * sizeof(int32_t));
}
// Expand C32s.
for (i = 0; i < num_pos; i++) {
positions[i] = NumUtil_decode_c32(&posdata);
starts[i] = NumUtil_decode_c32(&posdata);
ends[i] = NumUtil_decode_c32(&posdata);
}
if (posdata != posdata_end) {
THROW(ERR, "Bad encoding of posdata");
}
else {
I32Array *posits_map = I32Arr_new_steal(positions, num_pos);
I32Array *starts_map = I32Arr_new_steal(starts, num_pos);
I32Array *ends_map = I32Arr_new_steal(ends, num_pos);
retval = TV_new(field, term_text, posits_map, starts_map, ends_map);
DECREF(posits_map);
DECREF(starts_map);
DECREF(ends_map);
}
return retval;
}