// blob: 1abf35c67008124ec64e552361cb45ec69686b5f [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Add NuGet References:
// Lucene.Net.Analysis.Common
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Util;
using System;
using System.IO;
using System.Text;
namespace Lucene.Net.Demo
{
/// <summary>
/// Index all text files under a directory.
/// <para/>
/// This is a command-line application demonstrating simple Lucene indexing.
/// Run it with no command-line arguments for usage information.
/// </summary>
public class IndexFiles
{
/// <summary>
/// Private constructor: this type only exposes static members and is not
/// meant to be instantiated from outside.
/// </summary>
private IndexFiles()
{
}
/// <summary>
/// Index all text files under a directory.
/// </summary>
/// <param name="args">
/// Command-line arguments: <c>args[0]</c> is the index directory and
/// <c>args[1]</c> is the source directory. If any argument equals
/// <c>-u</c> or <c>--update</c>, documents are added to / updated in an
/// existing index instead of creating a fresh one.
/// </param>
/// <remarks>
/// Prints usage information and exits with code 1 when fewer than two
/// arguments are supplied or when the source directory does not exist.
/// An <see cref="IOException"/> raised while indexing is reported on the
/// console rather than propagated.
/// </remarks>
public static void Main(string[] args)
{
    // The <CONSOLE_APP_NAME> should be the assembly name of the application
    // this code is compiled into. In .NET Framework, it is the name of the EXE file.
    // In .NET Core, you have the option of compiling this into either an EXE or a DLL
    // (see https://docs.microsoft.com/en-us/dotnet/core/deploying/index).
    // In the latter case, the <CONSOLE_APP_NAME> will be "dotnet <DLL_NAME>.dll".
    string usage = "Usage: <CONSOLE_APP_NAME> <INDEX_DIRECTORY> <SOURCE_DIRECTORY> "
        + "[-u|--update]\n\n"
        + "This indexes the documents in <SOURCE_DIRECTORY>, creating a Lucene index "
        + "in <INDEX_DIRECTORY> that can be searched with the search-files demo.";

    // Validate required arguments are present.
    // If not, show usage information.
    if (args.Length < 2)
    {
        Console.WriteLine(usage);
        Environment.Exit(1);
    }

    string indexPath = args[0];
    string sourcePath = args[1];

    // Every argument position is checked for the update flag.
    bool create = true;
    for (int i = 0; i < args.Length; i++)
    {
        if ("-u".Equals(args[i], StringComparison.Ordinal) || "--update".Equals(args[i], StringComparison.Ordinal))
        {
            create = false;
        }
    }

    DirectoryInfo sourceDirectory = new DirectoryInfo(sourcePath);
    if (!sourceDirectory.Exists)
    {
        Console.WriteLine("Source directory '" + sourcePath + "' does not exist, please check the path");
        Environment.Exit(1);
    }

    DateTime start = DateTime.UtcNow;
    try
    {
        Console.WriteLine("Indexing to directory '" + indexPath + "'...");

        // The directory and the analyzer are both disposable; the using
        // statements release them (after the writer) even if indexing throws.
        using (Store.Directory dir = FSDirectory.Open(indexPath))
        // :Post-Release-Update-Version.LUCENE_XY:
        using (Analyzer analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48))
        {
            IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer);

            if (create)
            {
                // Create a new index in the directory, removing any
                // previously indexed documents:
                iwc.OpenMode = OpenMode.CREATE;
            }
            else
            {
                // Add new documents to an existing index:
                iwc.OpenMode = OpenMode.CREATE_OR_APPEND;
            }

            // Optional: for better indexing performance, if you
            // are indexing many documents, increase the RAM
            // buffer.
            //
            // iwc.RAMBufferSizeMB = 256.0;

            using (IndexWriter writer = new IndexWriter(dir, iwc))
            {
                IndexDocs(writer, sourceDirectory);

                // NOTE: if you want to maximize search performance,
                // you can optionally call forceMerge here. This can be
                // a terribly costly operation, so generally it's only
                // worth it when your index is relatively static (ie
                // you're done adding documents to it):
                //
                // writer.ForceMerge(1);
            }

            // Measured after the writer has been disposed, so the elapsed
            // time includes closing the writer.
            DateTime end = DateTime.UtcNow;
            Console.WriteLine((end - start).TotalMilliseconds + " total milliseconds");
        }
    }
    catch (IOException e)
    {
        Console.WriteLine(" caught a " + e.GetType() +
            "\n with message: " + e.Message);
    }
}
/// <summary>
/// Walks the given directory tree and indexes every file found in it.
/// Subdirectories are processed (recursively) before the files that sit
/// directly in <paramref name="directoryInfo"/>.<para/>
///
/// NOTE: One document is indexed per input file, which is slow.
/// For good throughput, put multiple documents into your input file(s).
/// </summary>
/// <param name="writer">
/// The <see cref="IndexWriter"/> for the index that receives the documents.
/// </param>
/// <param name="directoryInfo">
/// The root directory whose files (and nested directories) are indexed.
/// </param>
/// <exception cref="IOException">
/// If there is a low-level I/O error.
/// </exception>
internal static void IndexDocs(IndexWriter writer, DirectoryInfo directoryInfo)
{
    // Recurse into each child directory first.
    DirectoryInfo[] subdirectories = directoryInfo.GetDirectories();
    for (int d = 0; d < subdirectories.Length; d++)
    {
        IndexDocs(writer, subdirectories[d]);
    }

    // Then index the files located directly in this directory.
    FileInfo[] files = directoryInfo.GetFiles();
    for (int f = 0; f < files.Length; f++)
    {
        IndexDocs(writer, files[f]);
    }
}
/// <summary>
/// Builds one <see cref="Document"/> for the given file and writes it to
/// the index through the given writer.<para/>
/// The document is added when the writer was opened in
/// <see cref="OpenMode.CREATE"/> mode; otherwise it replaces any existing
/// document with the same "path" term.
/// </summary>
/// <param name="writer">
/// The <see cref="IndexWriter"/> for the index that receives the document.
/// </param>
/// <param name="file">
/// The file to index.
/// </param>
/// <exception cref="IOException">
/// If there is a low-level I/O error.
/// </exception>
internal static void IndexDocs(IndexWriter writer, FileInfo file)
{
    using (FileStream input = new FileStream(file.FullName, FileMode.Open, FileAccess.Read))
    {
        Document document = new Document
        {
            // "path": the full path of the file. Use a field that is indexed
            // (i.e. searchable), but don't tokenize it into separate words and
            // don't index term frequency or positional information.
            new StringField("path", file.FullName, Field.Store.YES),

            // "modified": the last-write time of the file (UTC) as a long.
            // Use an Int64Field that is indexed (i.e. efficiently filterable with
            // NumericRangeFilter). The value stored here is the raw tick count
            // (100-nanosecond units), which is often too fine. You could instead
            // create a number based on year/month/day/hour/minutes/seconds, down
            // to the resolution you require. For example the long value
            // 2011021714 would mean February 17, 2011, 2-3 PM.
            new Int64Field("modified", file.LastWriteTimeUtc.Ticks, Field.Store.NO),

            // "contents": the text of the file. Passing a reader means the text is
            // tokenized and indexed, but not stored. The reader is created with
            // UTF-8 encoding; if the file uses a different encoding, searching
            // for special characters will fail.
            new TextField("contents", new StreamReader(input, Encoding.UTF8))
        };

        if (writer.Config.OpenMode != OpenMode.CREATE)
        {
            // Existing index (an old copy of this document may have been indexed),
            // so replace the document matching the exact path, if present:
            Console.WriteLine("updating " + file);
            writer.UpdateDocument(new Term("path", file.FullName), document);
        }
        else
        {
            // New index, so no old copy of the document can be there; just add it:
            Console.WriteLine("adding " + file);
            writer.AddDocument(document);
        }
    }
}
}
}