| /* Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #define C_LUCY_HIGHLIGHTWRITER |
| #define C_LUCY_DEFAULTHIGHLIGHTWRITER |
| #define C_LUCY_TOKEN |
| #include "Lucy/Util/ToolSet.h" |
| |
| #include <stdio.h> |
| |
| #include "Lucy/Index/HighlightWriter.h" |
| #include "Lucy/Analysis/Token.h" |
| #include "Lucy/Analysis/Inversion.h" |
| #include "Lucy/Plan/FieldType.h" |
| #include "Lucy/Plan/FullTextType.h" |
| #include "Lucy/Index/HighlightReader.h" |
| #include "Lucy/Index/Inverter.h" |
| #include "Lucy/Index/PolyReader.h" |
| #include "Lucy/Index/SegReader.h" |
| #include "Lucy/Index/Segment.h" |
| #include "Lucy/Index/Snapshot.h" |
| #include "Lucy/Plan/Schema.h" |
| #include "Lucy/Store/Folder.h" |
| #include "Lucy/Store/OutStream.h" |
| #include "Lucy/Store/InStream.h" |
| |
| static OutStream* |
| S_lazy_init(HighlightWriter *self); |
| |
| int32_t HLWriter_current_file_format = 1; |
| |
| HighlightWriter* |
| HLWriter_new(Schema *schema, Snapshot *snapshot, Segment *segment, |
| PolyReader *polyreader) { |
| HighlightWriter *self |
| = (HighlightWriter*)VTable_Make_Obj(HIGHLIGHTWRITER); |
| return HLWriter_init(self, schema, snapshot, segment, polyreader); |
| } |
| |
| HighlightWriter* |
| HLWriter_init(HighlightWriter *self, Schema *schema, Snapshot *snapshot, |
| Segment *segment, PolyReader *polyreader) { |
| DataWriter_init((DataWriter*)self, schema, snapshot, segment, polyreader); |
| return self; |
| } |
| |
| void |
| HLWriter_destroy(HighlightWriter *self) { |
| DECREF(self->dat_out); |
| DECREF(self->ix_out); |
| SUPER_DESTROY(self, HIGHLIGHTWRITER); |
| } |
| |
| static OutStream* |
| S_lazy_init(HighlightWriter *self) { |
| if (!self->dat_out) { |
| Segment *segment = self->segment; |
| Folder *folder = self->folder; |
| CharBuf *seg_name = Seg_Get_Name(segment); |
| |
| // Open outstreams. |
| { |
| CharBuf *ix_file = CB_newf("%o/highlight.ix", seg_name); |
| self->ix_out = Folder_Open_Out(folder, ix_file); |
| DECREF(ix_file); |
| if (!self->ix_out) { RETHROW(INCREF(Err_get_error())); } |
| } |
| { |
| CharBuf *dat_file = CB_newf("%o/highlight.dat", seg_name); |
| self->dat_out = Folder_Open_Out(folder, dat_file); |
| DECREF(dat_file); |
| if (!self->dat_out) { RETHROW(INCREF(Err_get_error())); } |
| } |
| |
| // Go past invalid doc 0. |
| OutStream_Write_I64(self->ix_out, 0); |
| } |
| |
| return self->dat_out; |
| } |
| |
| void |
| HLWriter_add_inverted_doc(HighlightWriter *self, Inverter *inverter, |
| int32_t doc_id) { |
| OutStream *dat_out = S_lazy_init(self); |
| OutStream *ix_out = self->ix_out; |
| int64_t filepos = OutStream_Tell(dat_out); |
| uint32_t num_highlightable = 0; |
| int32_t expected = (int32_t)(OutStream_Tell(ix_out) / 8); |
| |
| // Verify doc id. |
| if (doc_id != expected) { |
| THROW(ERR, "Expected doc id %i32 but got %i32", expected, doc_id); |
| } |
| |
| // Write index data. |
| OutStream_Write_I64(ix_out, filepos); |
| |
| // Count, then write number of highlightable fields. |
| Inverter_Iterate(inverter); |
| while (Inverter_Next(inverter)) { |
| FieldType *type = Inverter_Get_Type(inverter); |
| if (FType_Is_A(type, FULLTEXTTYPE) |
| && FullTextType_Highlightable((FullTextType*)type) |
| ) { |
| num_highlightable++; |
| } |
| } |
| OutStream_Write_C32(dat_out, num_highlightable); |
| |
| Inverter_Iterate(inverter); |
| while (Inverter_Next(inverter)) { |
| FieldType *type = Inverter_Get_Type(inverter); |
| if (FType_Is_A(type, FULLTEXTTYPE) |
| && FullTextType_Highlightable((FullTextType*)type) |
| ) { |
| CharBuf *field = Inverter_Get_Field_Name(inverter); |
| Inversion *inversion = Inverter_Get_Inversion(inverter); |
| ByteBuf *tv_buf = HLWriter_TV_Buf(self, inversion); |
| CB_Serialize(field, dat_out); |
| BB_Serialize(tv_buf, dat_out); |
| DECREF(tv_buf); |
| } |
| } |
| } |
| |
| ByteBuf* |
| HLWriter_tv_buf(HighlightWriter *self, Inversion *inversion) { |
| char *last_text = ""; |
| size_t last_len = 0; |
| ByteBuf *tv_buf = BB_new(20 + Inversion_Get_Size(inversion) * 8); |
| uint32_t num_postings = 0; |
| char *dest; |
| Token **tokens; |
| uint32_t freq; |
| UNUSED_VAR(self); |
| |
| // Leave space for a c32 indicating the number of postings. |
| BB_Set_Size(tv_buf, C32_MAX_BYTES); |
| |
| Inversion_Reset(inversion); |
| while ((tokens = Inversion_Next_Cluster(inversion, &freq)) != NULL) { |
| Token *token = *tokens; |
| int32_t overlap = StrHelp_overlap(last_text, token->text, |
| last_len, token->len); |
| char *ptr; |
| char *orig; |
| size_t old_size = BB_Get_Size(tv_buf); |
| size_t new_size = old_size |
| + C32_MAX_BYTES // overlap |
| + C32_MAX_BYTES // length of string diff |
| + (token->len - overlap) // diff char data |
| + C32_MAX_BYTES // num prox |
| + (C32_MAX_BYTES * freq * 3); // pos data |
| |
| // Allocate for worst-case scenario. |
| ptr = BB_Grow(tv_buf, new_size); |
| orig = ptr; |
| ptr += old_size; |
| |
| // Track number of postings. |
| num_postings += 1; |
| |
| // Append the string diff to the tv_buf. |
| NumUtil_encode_c32(overlap, &ptr); |
| NumUtil_encode_c32((token->len - overlap), &ptr); |
| memcpy(ptr, (token->text + overlap), (token->len - overlap)); |
| ptr += token->len - overlap; |
| |
| // Save text and text_len for comparison next loop. |
| last_text = token->text; |
| last_len = token->len; |
| |
| // Append the number of positions for this term. |
| NumUtil_encode_c32(freq, &ptr); |
| |
| do { |
| // Add position, start_offset, and end_offset to tv_buf. |
| NumUtil_encode_c32(token->pos, &ptr); |
| NumUtil_encode_c32(token->start_offset, &ptr); |
| NumUtil_encode_c32(token->end_offset, &ptr); |
| |
| } while (--freq && (token = *++tokens)); |
| |
| // Set new byte length. |
| BB_Set_Size(tv_buf, ptr - orig); |
| } |
| |
| // Go back and start the term vector string with the posting count. |
| dest = BB_Get_Buf(tv_buf); |
| NumUtil_encode_padded_c32(num_postings, &dest); |
| |
| return tv_buf; |
| } |
| |
| void |
| HLWriter_add_segment(HighlightWriter *self, SegReader *reader, |
| I32Array *doc_map) { |
| int32_t doc_max = SegReader_Doc_Max(reader); |
| |
| if (doc_max == 0) { |
| // Bail if the supplied segment is empty. |
| return; |
| } |
| else { |
| DefaultHighlightReader *hl_reader |
| = (DefaultHighlightReader*)CERTIFY( |
| SegReader_Obtain(reader, VTable_Get_Name(HIGHLIGHTREADER)), |
| DEFAULTHIGHLIGHTREADER); |
| OutStream *dat_out = S_lazy_init(self); |
| OutStream *ix_out = self->ix_out; |
| int32_t orig; |
| ByteBuf *bb = BB_new(0); |
| |
| for (orig = 1; orig <= doc_max; orig++) { |
| // Skip deleted docs. |
| if (doc_map && !I32Arr_Get(doc_map, orig)) { |
| continue; |
| } |
| |
| // Write file pointer. |
| OutStream_Write_I64(ix_out, OutStream_Tell(dat_out)); |
| |
| // Copy the raw record. |
| DefHLReader_Read_Record(hl_reader, orig, bb); |
| OutStream_Write_Bytes(dat_out, BB_Get_Buf(bb), BB_Get_Size(bb)); |
| |
| BB_Set_Size(bb, 0); |
| } |
| DECREF(bb); |
| } |
| } |
| |
| void |
| HLWriter_finish(HighlightWriter *self) { |
| if (self->dat_out) { |
| // Write one final file pointer, so that we can derive the length of |
| // the last record. |
| int64_t end = OutStream_Tell(self->dat_out); |
| OutStream_Write_I64(self->ix_out, end); |
| |
| // Close down the output streams. |
| OutStream_Close(self->dat_out); |
| OutStream_Close(self->ix_out); |
| Seg_Store_Metadata_Str(self->segment, "highlight", 9, |
| (Obj*)HLWriter_Metadata(self)); |
| } |
| } |
| |
| int32_t |
| HLWriter_format(HighlightWriter *self) { |
| UNUSED_VAR(self); |
| return HLWriter_current_file_format; |
| } |
| |
| |