| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // Add NuGet References: |
| |
| // Lucene.Net.Analysis.Common |
| |
| using Lucene.Net.Analysis; |
| using Lucene.Net.Analysis.Standard; |
| using Lucene.Net.Documents; |
| using Lucene.Net.Index; |
| using Lucene.Net.Store; |
| using Lucene.Net.Util; |
| using System; |
| using System.IO; |
| using System.Text; |
| |
| namespace Lucene.Net.Demo |
| { |
| /// <summary> |
| /// Index all text files under a directory. |
| /// <para/> |
| /// This is a command-line application demonstrating simple Lucene indexing. |
| /// Run it with no command-line arguments for usage information. |
| /// </summary> |
| public class IndexFiles |
| { |
| private IndexFiles() { } |
| |
| /// <summary>Index all text files under a directory.</summary> |
| public static void Main(string[] args) |
| { |
| // The <CONSOLE_APP_NAME> should be the assembly name of the application |
| // this code is compiled into. In .NET Framework, it is the name of the EXE file. |
| // In .NET Core, you have the option of compiling this into either an EXE or a DLL |
| // (see https://docs.microsoft.com/en-us/dotnet/core/deploying/index). |
| // In the latter case, the <CONSOLE_APP_NAME> will be "dotnet <DLL_NAME>.dll". |
| string usage = "Usage: <CONSOLE_APP_NAME> <INDEX_DIRECTORY> <SOURCE_DIRECTORY> " |
| + "[-u|--update]\n\n" |
| + "This indexes the documents in <SOURCE_DIRECTORY>, creating a Lucene index" |
| + "in <INDEX_DIRECTORY> that can be searched with the search-files demo."; |
| |
| // Validate required arguments are present. |
| // If not, show usage information. |
| if (args.Length < 2) |
| { |
| Console.WriteLine(usage); |
| Environment.Exit(1); |
| } |
| string indexPath = args[0]; |
| string sourcePath = args[1]; |
| bool create = true; |
| |
| for (int i = 0; i < args.Length; i++) |
| { |
| if ("-u".Equals(args[i], StringComparison.Ordinal) || "--update".Equals(args[i], StringComparison.Ordinal)) |
| { |
| create = false; |
| } |
| } |
| |
| DirectoryInfo sourceDirectory = new DirectoryInfo(sourcePath); |
| if (!sourceDirectory.Exists) |
| { |
| Console.WriteLine("Source directory '" + sourcePath + "' does not exist, please check the path"); |
| Environment.Exit(1); |
| } |
| |
| DateTime start = DateTime.UtcNow; |
| try |
| { |
| Console.WriteLine("Indexing to directory '" + indexPath + "'..."); |
| |
| Store.Directory dir = FSDirectory.Open(indexPath); |
| // :Post-Release-Update-Version.LUCENE_XY: |
| Analyzer analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48); |
| IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer); |
| |
| if (create) |
| { |
| // Create a new index in the directory, removing any |
| // previously indexed documents: |
| iwc.OpenMode = OpenMode.CREATE; |
| } |
| else |
| { |
| // Add new documents to an existing index: |
| iwc.OpenMode = OpenMode.CREATE_OR_APPEND; |
| } |
| |
| // Optional: for better indexing performance, if you |
| // are indexing many documents, increase the RAM |
| // buffer. |
| // |
| // iwc.RAMBufferSizeMB = 256.0; |
| |
| using (IndexWriter writer = new IndexWriter(dir, iwc)) |
| { |
| IndexDocs(writer, sourceDirectory); |
| |
| // NOTE: if you want to maximize search performance, |
| // you can optionally call forceMerge here. This can be |
| // a terribly costly operation, so generally it's only |
| // worth it when your index is relatively static (ie |
| // you're done adding documents to it): |
| // |
| // writer.ForceMerge(1); |
| } |
| |
| DateTime end = DateTime.UtcNow; |
| Console.WriteLine((end - start).TotalMilliseconds + " total milliseconds"); |
| } |
| catch (IOException e) |
| { |
| Console.WriteLine(" caught a " + e.GetType() + |
| "\n with message: " + e.Message); |
| } |
| } |
| |
| /// <summary> |
| /// Recurses over files and directories found under the |
| /// given directory and indexes each file.<para/> |
| /// |
| /// NOTE: This method indexes one document per input file. |
| /// This is slow. For good throughput, put multiple documents |
| /// into your input file(s). |
| /// </summary> |
| /// <param name="writer"> |
| /// <see cref="IndexWriter"/> to the index where the given |
| /// file/dir info will be stored |
| /// </param> |
| /// <param name="directoryInfo"> |
| /// The directory to recurse into to find files to index. |
| /// </param> |
| /// <exception cref="IOException"> |
| /// If there is a low-level I/O error. |
| /// </exception> |
| internal static void IndexDocs(IndexWriter writer, DirectoryInfo directoryInfo) |
| { |
| foreach (var dirInfo in directoryInfo.GetDirectories()) |
| { |
| IndexDocs(writer, dirInfo); |
| } |
| foreach (var fileInfo in directoryInfo.GetFiles()) |
| { |
| IndexDocs(writer, fileInfo); |
| } |
| } |
| |
| /// <summary> |
| /// Indexes the given file using the given writer.<para/> |
| /// </summary> |
| /// <param name="writer"> |
| /// <see cref="IndexWriter"/> to the index where the given |
| /// file info will be stored. |
| /// </param> |
| /// <param name="file"> |
| /// The file to index. |
| /// </param> |
| /// <exception cref="IOException"> |
| /// If there is a low-level I/O error. |
| /// </exception> |
| internal static void IndexDocs(IndexWriter writer, FileInfo file) |
| { |
| using (FileStream fs = new FileStream(file.FullName, FileMode.Open, FileAccess.Read)) |
| { |
| // make a new, empty document |
| Document doc = new Document(); |
| |
| // Add the path of the file as a field named "path". Use a |
| // field that is indexed (i.e. searchable), but don't tokenize |
| // the field into separate words and don't index term frequency |
| // or positional information: |
| Field pathField = new StringField("path", file.FullName, Field.Store.YES); |
| doc.Add(pathField); |
| |
| // Add the last modified date of the file a field named "modified". |
| // Use a LongField that is indexed (i.e. efficiently filterable with |
| // NumericRangeFilter). This indexes to milli-second resolution, which |
| // is often too fine. You could instead create a number based on |
| // year/month/day/hour/minutes/seconds, down the resolution you require. |
| // For example the long value 2011021714 would mean |
| // February 17, 2011, 2-3 PM. |
| doc.Add(new Int64Field("modified", file.LastWriteTimeUtc.Ticks, Field.Store.NO)); |
| |
| // Add the contents of the file to a field named "contents". Specify a Reader, |
| // so that the text of the file is tokenized and indexed, but not stored. |
| // Note that FileReader expects the file to be in UTF-8 encoding. |
| // If that's not the case searching for special characters will fail. |
| doc.Add(new TextField("contents", new StreamReader(fs, Encoding.UTF8))); |
| |
| if (writer.Config.OpenMode == OpenMode.CREATE) |
| { |
| // New index, so we just add the document (no old document can be there): |
| Console.WriteLine("adding " + file); |
| writer.AddDocument(doc); |
| } |
| else |
| { |
| // Existing index (an old copy of this document may have been indexed) so |
| // we use updateDocument instead to replace the old one matching the exact |
| // path, if present: |
| Console.WriteLine("updating " + file); |
| writer.UpdateDocument(new Term("path", file.FullName), doc); |
| } |
| } |
| } |
| } |
| } |