| // # Stop Words Example |
| // |
| // This example covers the basic usage of stop words |
| // with tantivy |
| // |
| // We will : |
| // - define our schema |
| // - create an index in a directory |
| // - add a few stop words |
| // - index few documents in our index |
| |
| // --- |
| // Importing tantivy... |
| use tantivy::collector::TopDocs; |
| use tantivy::query::QueryParser; |
| use tantivy::schema::*; |
| use tantivy::tokenizer::*; |
| use tantivy::{doc, Index}; |
| |
| fn main() -> tantivy::Result<()> { |
| // this example assumes you understand the content in `basic_search` |
| let mut schema_builder = Schema::builder(); |
| |
| // This configures your custom options for how tantivy will |
| // store and process your content in the index; The key |
| // to note is that we are setting the tokenizer to `stoppy` |
| // which will be defined and registered below. |
| let text_field_indexing = TextFieldIndexing::default() |
| .set_tokenizer("stoppy") |
| .set_index_option(IndexRecordOption::WithFreqsAndPositions); |
| let text_options = TextOptions::default() |
| .set_indexing_options(text_field_indexing) |
| .set_stored(); |
| |
| // Our first field is title. |
| schema_builder.add_text_field("title", text_options); |
| |
| // Our second field is body. |
| let text_field_indexing = TextFieldIndexing::default() |
| .set_tokenizer("stoppy") |
| .set_index_option(IndexRecordOption::WithFreqsAndPositions); |
| let text_options = TextOptions::default() |
| .set_indexing_options(text_field_indexing) |
| .set_stored(); |
| schema_builder.add_text_field("body", text_options); |
| |
| let schema = schema_builder.build(); |
| |
| let index = Index::create_in_ram(schema.clone()); |
| |
| // This tokenizer lowers all of the text (to help with stop word matching) |
| // then removes all instances of `the` and `and` from the corpus |
| let tokenizer = TextAnalyzer::from(SimpleTokenizer) |
| .filter(LowerCaser) |
| .filter(StopWordFilter::remove(vec![ |
| "the".to_string(), |
| "and".to_string(), |
| ])); |
| |
| index.tokenizers().register("stoppy", tokenizer); |
| |
| let mut index_writer = index.writer(50_000_000)?; |
| |
| let title = schema.get_field("title").unwrap(); |
| let body = schema.get_field("body").unwrap(); |
| |
| index_writer.add_document(doc!( |
| title => "The Old Man and the Sea", |
| body => "He was an old man who fished alone in a skiff in the Gulf Stream and \ |
| he had gone eighty-four days now without taking a fish." |
| ))?; |
| |
| index_writer.add_document(doc!( |
| title => "Of Mice and Men", |
| body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ |
| bank and runs deep and green. The water is warm too, for it has slipped twinkling \ |
| over the yellow sands in the sunlight before reaching the narrow pool. On one \ |
| side of the river the golden foothill slopes curve up to the strong and rocky \ |
| Gabilan Mountains, but on the valley side the water is lined with trees—willows \ |
| fresh and green with every spring, carrying in their lower leaf junctures the \ |
| debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ |
| limbs and branches that arch over the pool" |
| ))?; |
| |
| index_writer.add_document(doc!( |
| title => "Frankenstein", |
| body => "You will rejoice to hear that no disaster has accompanied the commencement of an \ |
| enterprise which you have regarded with such evil forebodings. I arrived here \ |
| yesterday, and my first task is to assure my dear sister of my welfare and \ |
| increasing confidence in the success of my undertaking." |
| ))?; |
| |
| index_writer.commit()?; |
| |
| let reader = index.reader()?; |
| |
| let searcher = reader.searcher(); |
| |
| let query_parser = QueryParser::for_index(&index, vec![title, body]); |
| |
| // stop words are applied on the query as well. |
| // The following will be equivalent to `title:frankenstein` |
| let query = query_parser.parse_query("title:\"the Frankenstein\"")?; |
| let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?; |
| |
| for (score, doc_address) in top_docs { |
| let retrieved_doc = searcher.doc(doc_address)?; |
| println!("\n==\nDocument score {}:", score); |
| println!("{}", schema.to_json(&retrieved_doc)); |
| } |
| |
| Ok(()) |
| } |