/* core/Lucy/Index/Indexer.cfh - lucy - Git at Google */

 /* Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 parcel Lucy;

 /** Build inverted indexes.
  *
  * The Indexer class is Apache Lucy's primary tool for managing the content of
  * inverted indexes, which may later be searched using
  * [](cfish:IndexSearcher).
  *
  * In general, only one Indexer at a time may write to an index safely.  If a
  * write lock cannot be secured, new() will throw an exception.
  *
  * If an index is located on a shared volume, each writer application must
  * identify itself by supplying an
  * [](cfish:IndexManager) with a unique
  * `host` id to Indexer's constructor or index corruption will
  * occur.  See [](FileLocking) for a detailed
  * discussion.
  *
  * Note: at present, [](cfish:.Delete_By_Term) and [](cfish:.Delete_By_Query) only affect
  * documents which had been previously committed to the index -- and not any
  * documents added this indexing session but not yet committed.  This may
  * change in a future update.
  */
 public class Lucy::Index::Indexer inherits Clownfish::Obj {

     /* Member variables.  None carries an exposure specifier, so they are
      * not part of the public API.  Only `schema`, `seg_writer` and
      * `stock_doc` have accessors declared in this class; how the remaining
      * members are used is determined by the implementation file, which is
      * not visible from this header.
      *
      * NOTE(review): `truncate` and `optimize` presumably mirror the
      * TRUNCATE flag and a prior call to Optimize(), while `needs_commit`
      * and `prepared` presumably track Commit()/Prepare_Commit() state --
      * confirm against the implementation.
      */
     Schema            *schema;
     Folder            *folder;
     Segment           *segment;
     IndexManager      *manager;
     String            *latest_snapfile;
     PolyReader        *polyreader;
     Snapshot          *snapshot;
     SegWriter         *seg_writer;
     DeletionsWriter   *del_writer;
     FilePurger        *file_purger;
     Lock              *write_lock;
     Lock              *merge_lock;
     Doc               *stock_doc;
     String            *snapfile;
     bool               truncate;
     bool               optimize;
     bool               needs_commit;
     bool               prepared;

     /* Class-level (inert) int32_t constants.  Presumably these are the bit
      * flags accepted by the `flags` parameter of new() and init(); their
      * values are assigned in the implementation and are not visible here.
      */
     public inert int32_t TRUNCATE;
     public inert int32_t CREATE;

     /** Open a new Indexer.  If the index already exists, update it.
      *
      * @param schema A Schema.
      * @param index Either a string filepath or a Folder.
      * @param manager An IndexManager.
      * @param flags Flags governing behavior.
      */
     public inert incremented Indexer*
     new(Schema *schema = NULL, Obj *index, IndexManager *manager = NULL,
         int32_t flags = 0);

     /** Initialize an Indexer.
      *
      * @param schema A Schema.
      * @param index Either a string filepath or a Folder.
      * @param manager An IndexManager.
      * @param flags Flags governing behavior.
      */
     public inert Indexer*
     init(Indexer *self, Schema *schema = NULL, Obj *index,
          IndexManager *manager = NULL, int32_t flags = 0);

     /** Add a document to the index.
      *
      * @param doc A Lucy::Document::Doc object.
      * @param boost A floating point weight which affects how this document
      * scores.
      */
     public void
     Add_Doc(Indexer *self, Doc *doc, float boost = 1.0);

     /** Absorb an existing index into this one.  The two indexes must
      * have matching Schemas.
      *
      * @param index Either an index path name or a Folder.
      */
     public void
     Add_Index(Indexer *self, Obj *index);

     /** Mark documents which contain the supplied term as deleted, so that
      * they will be excluded from search results and eventually removed
      * altogether.  The change is not apparent to search apps until after
      * [](cfish:.Commit) succeeds.
      *
      * @param field The name of an indexed field. (If it is not spec'd as
      * `indexed`, an error will occur.)
      * @param term The term which identifies docs to be marked as deleted.  If
      * `field` is associated with an Analyzer, `term`
      * will be processed automatically (so don't pre-process it yourself).
      */
     public void
     Delete_By_Term(Indexer *self, String *field, Obj *term);

     /** Mark documents which match the supplied Query as deleted.
      *
      * @param query A [](cfish:Query).
      */
     public void
     Delete_By_Query(Indexer *self, Query *query);

     /** Mark the document identified by the supplied document ID as deleted.
      *
      * @param doc_id A [document id](DocIDs).
      */
     public void
     Delete_By_Doc_ID(Indexer *self, int32_t doc_id);

     /** Optimize the index for search-time performance.  This may take a
      * while, as it can involve rewriting large amounts of data.
      *
      * Every Indexer session which changes index content and ends in a
      * [](cfish:.Commit) creates a new segment.  Once written, segments are never
      * modified.  However, they are periodically recycled by feeding their
      * content into the segment currently being written.
      *
      * The [](cfish:.Optimize) method causes all existing index content to be fed back
      * into the Indexer.  When [](cfish:.Commit) completes after an [](cfish:.Optimize), the
      * index will consist of one segment.  So [](cfish:.Optimize) must be called
      * before [](cfish:.Commit).  Also, optimizing a fresh index created from scratch
      * has no effect.
      *
      * Historically, there was a significant search-time performance benefit
      * to collapsing down to a single segment versus even two segments.  Now
      * the effect of collapsing is much less significant, and calling
      * [](cfish:.Optimize) is rarely justified.
      */
     public void
     Optimize(Indexer *self);

     /** Commit any changes made to the index.  Until this is called, none of
      * the changes made during an indexing session are permanent.
      *
      * Calling [](cfish:.Commit) invalidates the Indexer, so if you want to make more
      * changes you'll need a new one.
      */
     public void
     Commit(Indexer *self);

     /** Perform the expensive setup for [](cfish:.Commit) in advance, so that [](cfish:.Commit)
      * completes quickly.  (If [](cfish:.Prepare_Commit) is not called explicitly by
      * the user, [](cfish:.Commit) will call it internally.)
      */
     public void
     Prepare_Commit(Indexer *self);

     /** Accessor for schema.
      */
     public Schema*
     Get_Schema(Indexer *self);

     /** Accessor for seg_writer member var.
      */
     SegWriter*
     Get_Seg_Writer(Indexer *self);

     /* Not declared public.  Presumably the accessor for the stock_doc
      * member var, by analogy with Get_Seg_Writer() -- confirm against the
      * implementation.
      */
     Doc*
     Get_Stock_Doc(Indexer *self);

     /* Destructor for the Indexer.  Which members it releases is decided by
      * the implementation, which is not visible from this header.
      */
     public void
     Destroy(Indexer *self);
 }
	/* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	parcel Lucy;

	/** Build inverted indexes.
	*
	* The Indexer class is Apache Lucy's primary tool for managing the content of
	* inverted indexes, which may later be searched using
	* [](cfish:IndexSearcher).
	*
	* In general, only one Indexer at a time may write to an index safely. If a
	* write lock cannot be secured, new() will throw an exception.
	*
	* If an index is located on a shared volume, each writer application must
	* identify itself by supplying an
	* [](cfish:IndexManager) with a unique
	* `host` id to Indexer's constructor or index corruption will
	* occur. See [](FileLocking) for a detailed
	* discussion.
	*
	* Note: at present, [](cfish:.Delete_By_Term) and [](cfish:.Delete_By_Query) only affect
	* documents which had been previously committed to the index -- and not any
	* documents added this indexing session but not yet committed. This may
	* change in a future update.
	*/
	public class Lucy::Index::Indexer inherits Clownfish::Obj {

	    /* Member variables.  None carries an exposure specifier, so they are
	     * not part of the public API.  Only `schema`, `seg_writer` and
	     * `stock_doc` have accessors declared in this class; how the remaining
	     * members are used is determined by the implementation file, which is
	     * not visible from this header.
	     */
	    Schema            *schema;
	    Folder            *folder;
	    Segment           *segment;
	    IndexManager      *manager;
	    String            *latest_snapfile;
	    PolyReader        *polyreader;
	    Snapshot          *snapshot;
	    SegWriter         *seg_writer;
	    DeletionsWriter   *del_writer;
	    FilePurger        *file_purger;
	    Lock              *write_lock;
	    Lock              *merge_lock;
	    Doc               *stock_doc;
	    String            *snapfile;
	    bool               truncate;
	    bool               optimize;
	    bool               needs_commit;
	    bool               prepared;

	    /* Class-level (inert) int32_t constants.  Presumably these are the bit
	     * flags accepted by the `flags` parameter of new() and init(); their
	     * values are assigned in the implementation and are not visible here.
	     */
	    public inert int32_t TRUNCATE;
	    public inert int32_t CREATE;

	    /** Open a new Indexer.  If the index already exists, update it.
	     *
	     * @param schema A Schema.
	     * @param index Either a string filepath or a Folder.
	     * @param manager An IndexManager.
	     * @param flags Flags governing behavior.
	     */
	    public inert incremented Indexer*
	    new(Schema *schema = NULL, Obj *index, IndexManager *manager = NULL,
	        int32_t flags = 0);

	    /** Initialize an Indexer.
	     *
	     * @param schema A Schema.
	     * @param index Either a string filepath or a Folder.
	     * @param manager An IndexManager.
	     * @param flags Flags governing behavior.
	     */
	    public inert Indexer*
	    init(Indexer *self, Schema *schema = NULL, Obj *index,
	         IndexManager *manager = NULL, int32_t flags = 0);

	    /** Add a document to the index.
	     *
	     * @param doc A Lucy::Document::Doc object.
	     * @param boost A floating point weight which affects how this document
	     * scores.
	     */
	    public void
	    Add_Doc(Indexer *self, Doc *doc, float boost = 1.0);

	    /** Absorb an existing index into this one.  The two indexes must
	     * have matching Schemas.
	     *
	     * @param index Either an index path name or a Folder.
	     */
	    public void
	    Add_Index(Indexer *self, Obj *index);

	    /** Mark documents which contain the supplied term as deleted, so that
	     * they will be excluded from search results and eventually removed
	     * altogether.  The change is not apparent to search apps until after
	     * [](cfish:.Commit) succeeds.
	     *
	     * @param field The name of an indexed field. (If it is not spec'd as
	     * `indexed`, an error will occur.)
	     * @param term The term which identifies docs to be marked as deleted.  If
	     * `field` is associated with an Analyzer, `term`
	     * will be processed automatically (so don't pre-process it yourself).
	     */
	    public void
	    Delete_By_Term(Indexer *self, String *field, Obj *term);

	    /** Mark documents which match the supplied Query as deleted.
	     *
	     * @param query A [](cfish:Query).
	     */
	    public void
	    Delete_By_Query(Indexer *self, Query *query);

	    /** Mark the document identified by the supplied document ID as deleted.
	     *
	     * @param doc_id A [document id](DocIDs).
	     */
	    public void
	    Delete_By_Doc_ID(Indexer *self, int32_t doc_id);

	    /** Optimize the index for search-time performance.  This may take a
	     * while, as it can involve rewriting large amounts of data.
	     *
	     * Every Indexer session which changes index content and ends in a
	     * [](cfish:.Commit) creates a new segment.  Once written, segments are never
	     * modified.  However, they are periodically recycled by feeding their
	     * content into the segment currently being written.
	     *
	     * The [](cfish:.Optimize) method causes all existing index content to be fed back
	     * into the Indexer.  When [](cfish:.Commit) completes after an [](cfish:.Optimize), the
	     * index will consist of one segment.  So [](cfish:.Optimize) must be called
	     * before [](cfish:.Commit).  Also, optimizing a fresh index created from scratch
	     * has no effect.
	     *
	     * Historically, there was a significant search-time performance benefit
	     * to collapsing down to a single segment versus even two segments.  Now
	     * the effect of collapsing is much less significant, and calling
	     * [](cfish:.Optimize) is rarely justified.
	     */
	    public void
	    Optimize(Indexer *self);

	    /** Commit any changes made to the index.  Until this is called, none of
	     * the changes made during an indexing session are permanent.
	     *
	     * Calling [](cfish:.Commit) invalidates the Indexer, so if you want to make more
	     * changes you'll need a new one.
	     */
	    public void
	    Commit(Indexer *self);

	    /** Perform the expensive setup for [](cfish:.Commit) in advance, so that [](cfish:.Commit)
	     * completes quickly.  (If [](cfish:.Prepare_Commit) is not called explicitly by
	     * the user, [](cfish:.Commit) will call it internally.)
	     */
	    public void
	    Prepare_Commit(Indexer *self);

	    /** Accessor for schema.
	     */
	    public Schema*
	    Get_Schema(Indexer *self);

	    /** Accessor for seg_writer member var.
	     */
	    SegWriter*
	    Get_Seg_Writer(Indexer *self);

	    /* Not declared public.  Presumably the accessor for the stock_doc
	     * member var, by analogy with Get_Seg_Writer() -- confirm against the
	     * implementation.
	     */
	    Doc*
	    Get_Stock_Doc(Indexer *self);

	    /* Destructor for the Indexer.  Which members it releases is decided by
	     * the implementation, which is not visible from this header.
	     */
	    public void
	    Destroy(Indexer *self);
	}