src/Demo/DemoLib/HTMLDocument.cs - lucenenet - Git at Google

 /*
  * Copyright 2004 The Apache Software Foundation
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 using System;
 using HTMLParser = Lucene.Net.Demo.Html.HTMLParser;
 using Lucene.Net.Documents;

 namespace Lucene.Net.Demo
 {

 	/// <summary>
 	/// Static helpers that build a Lucene Document from an HTML file on disk and
 	/// derive the unique id ("uid") under which that file is indexed.
 	/// </summary>
 	public class HTMLDocument
 	{
 		// Tick count of 1970-01-01T00:00:00 on the .NET tick scale.
 		private const long EpochTicks = 621355968000000000;

 		// A .NET tick is 100 ns, so 10000 ticks make up one millisecond.
 		private const long TicksPerMilli = 10000;

 		// Platform directory separator; swapped out when building uids and paths.
 		internal static char dirSep = System.IO.Path.DirectorySeparatorChar;

 		// Not instantiable: every member is static.
 		private HTMLDocument()
 		{
 		}

 		/// <summary>
 		/// Converts the file's last-write time into milliseconds counted from
 		/// 1970-01-01, the unit passed to DateTools.TimeToString below.
 		/// </summary>
 		/// <remarks>
 		/// NOTE(review): LastWriteTime is expressed in local time, so the result is
 		/// shifted by the local UTC offset - confirm that this is intended.
 		/// </remarks>
 		private static long ModifiedMillis(System.IO.FileInfo f)
 		{
 			return (f.LastWriteTime.Ticks - EpochTicks) / TicksPerMilli;
 		}

 		/// <summary>
 		/// Builds the uid of a file: its full path with every directory separator
 		/// turned into \u0000, one more \u0000, then the last-write time at
 		/// second resolution.
 		/// </summary>
 		/// <param name="f">File to identify.</param>
 		/// <returns>The uid string for <paramref name="f"/>.</returns>
 		public static System.String Uid(System.IO.FileInfo f)
 		{
 			// Using null (\u0000) both between directory components and before the
 			// date makes a lexicographic sort of uids visit files in the same order
 			// as a walk of the file hierarchy.
 			System.String pathKey = f.FullName.Replace(dirSep, '\u0000');
 			System.String stamp = DateTools.TimeToString(ModifiedMillis(f), DateTools.Resolution.SECOND);
 			return pathKey + "\u0000" + stamp;
 		}

 		/// <summary>
 		/// Recovers a slash-separated path from a uid by turning every \u0000 into
 		/// '/' and dropping the trailing date component.
 		/// </summary>
 		/// <param name="uid">A uid in the format produced by Uid.</param>
 		/// <returns>Everything before the last '/' of the converted string.</returns>
 		public static System.String Uid2url(System.String uid)
 		{
 			System.String url = uid.Replace('\u0000', '/');

 			// The date is whatever follows the final slash; keep only what precedes it.
 			int dateStart = url.LastIndexOf('/');
 			return url.Substring(0, dateStart);
 		}

 		/// <summary>
 		/// Creates the Lucene Document for an HTML file, with the fields "path",
 		/// "modified", "uid", "contents", "summary" and "title".
 		/// </summary>
 		/// <param name="f">HTML file to index.</param>
 		/// <returns>The populated document.</returns>
 		public static Document Document(System.IO.FileInfo f)
 		{
 			Document result = new Document();

 			// "path": the full file name with '/' as separator. Stored and indexed
 			// as a single untokenized term.
 			System.String urlPath = f.FullName.Replace(dirSep, '/');
 			result.Add(new Field("path", urlPath, Field.Store.YES, Field.Index.UN_TOKENIZED));

 			// "modified": last-write time at minute resolution. Stored and indexed
 			// as a single untokenized term.
 			System.String modified = DateTools.TimeToString(ModifiedMillis(f), DateTools.Resolution.MINUTE);
 			result.Add(new Field("modified", modified, Field.Store.YES, Field.Index.UN_TOKENIZED));

 			// "uid": lets the index be maintained incrementally. Indexed without
 			// tokenizing and not stored with the document.
 			result.Add(new Field("uid", Uid(f), Field.Store.NO, Field.Index.UN_TOKENIZED));

 			// The stream is not closed in this method.
 			// NOTE(review): presumably the reader handed to "contents" is consumed
 			// later, at indexing time - confirm who is responsible for closing it.
 			System.IO.FileStream input = new System.IO.FileStream(f.FullName, System.IO.FileMode.Open, System.IO.FileAccess.Read);
 			HTMLParser html = new HTMLParser(input);

 			// "contents": the parser's tag-stripped text, supplied as a reader so
 			// that it gets tokenized and indexed.
 			result.Add(new Field("contents", html.GetReader()));

 			// "summary": stored only, so it can be displayed alongside hits.
 			result.Add(new Field("summary", html.GetSummary(), Field.Store.YES, Field.Index.NO));

 			// "title": stored and tokenized, so it can be both searched and shown.
 			result.Add(new Field("title", html.GetTitle(), Field.Store.YES, Field.Index.TOKENIZED));

 			return result;
 		}
 	}
 }
	/*
	* Copyright 2004 The Apache Software Foundation
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	using System;
	using HTMLParser = Lucene.Net.Demo.Html.HTMLParser;
	using Lucene.Net.Documents;

	namespace Lucene.Net.Demo
	{

	/// <summary>
	/// Static helpers that build a Lucene Document from an HTML file on disk and
	/// derive the unique id ("uid") under which that file is indexed.
	/// </summary>
	public class HTMLDocument
	{
		// Tick count of 1970-01-01T00:00:00 on the .NET tick scale.
		private const long EpochTicks = 621355968000000000;

		// A .NET tick is 100 ns, so 10000 ticks make up one millisecond.
		private const long TicksPerMilli = 10000;

		// Platform directory separator; swapped out when building uids and paths.
		internal static char dirSep = System.IO.Path.DirectorySeparatorChar;

		// Not instantiable: every member is static.
		private HTMLDocument()
		{
		}

		/// <summary>
		/// Converts the file's last-write time into milliseconds counted from
		/// 1970-01-01, the unit passed to DateTools.TimeToString below.
		/// </summary>
		/// <remarks>
		/// NOTE(review): LastWriteTime is expressed in local time, so the result is
		/// shifted by the local UTC offset - confirm that this is intended.
		/// </remarks>
		private static long ModifiedMillis(System.IO.FileInfo f)
		{
			return (f.LastWriteTime.Ticks - EpochTicks) / TicksPerMilli;
		}

		/// <summary>
		/// Builds the uid of a file: its full path with every directory separator
		/// turned into \u0000, one more \u0000, then the last-write time at
		/// second resolution.
		/// </summary>
		/// <param name="f">File to identify.</param>
		/// <returns>The uid string for <paramref name="f"/>.</returns>
		public static System.String Uid(System.IO.FileInfo f)
		{
			// Using null (\u0000) both between directory components and before the
			// date makes a lexicographic sort of uids visit files in the same order
			// as a walk of the file hierarchy.
			System.String pathKey = f.FullName.Replace(dirSep, '\u0000');
			System.String stamp = DateTools.TimeToString(ModifiedMillis(f), DateTools.Resolution.SECOND);
			return pathKey + "\u0000" + stamp;
		}

		/// <summary>
		/// Recovers a slash-separated path from a uid by turning every \u0000 into
		/// '/' and dropping the trailing date component.
		/// </summary>
		/// <param name="uid">A uid in the format produced by Uid.</param>
		/// <returns>Everything before the last '/' of the converted string.</returns>
		public static System.String Uid2url(System.String uid)
		{
			System.String url = uid.Replace('\u0000', '/');

			// The date is whatever follows the final slash; keep only what precedes it.
			int dateStart = url.LastIndexOf('/');
			return url.Substring(0, dateStart);
		}

		/// <summary>
		/// Creates the Lucene Document for an HTML file, with the fields "path",
		/// "modified", "uid", "contents", "summary" and "title".
		/// </summary>
		/// <param name="f">HTML file to index.</param>
		/// <returns>The populated document.</returns>
		public static Document Document(System.IO.FileInfo f)
		{
			Document result = new Document();

			// "path": the full file name with '/' as separator. Stored and indexed
			// as a single untokenized term.
			System.String urlPath = f.FullName.Replace(dirSep, '/');
			result.Add(new Field("path", urlPath, Field.Store.YES, Field.Index.UN_TOKENIZED));

			// "modified": last-write time at minute resolution. Stored and indexed
			// as a single untokenized term.
			System.String modified = DateTools.TimeToString(ModifiedMillis(f), DateTools.Resolution.MINUTE);
			result.Add(new Field("modified", modified, Field.Store.YES, Field.Index.UN_TOKENIZED));

			// "uid": lets the index be maintained incrementally. Indexed without
			// tokenizing and not stored with the document.
			result.Add(new Field("uid", Uid(f), Field.Store.NO, Field.Index.UN_TOKENIZED));

			// The stream is not closed in this method.
			// NOTE(review): presumably the reader handed to "contents" is consumed
			// later, at indexing time - confirm who is responsible for closing it.
			System.IO.FileStream input = new System.IO.FileStream(f.FullName, System.IO.FileMode.Open, System.IO.FileAccess.Read);
			HTMLParser html = new HTMLParser(input);

			// "contents": the parser's tag-stripped text, supplied as a reader so
			// that it gets tokenized and indexed.
			result.Add(new Field("contents", html.GetReader()));

			// "summary": stored only, so it can be displayed alongside hits.
			result.Add(new Field("summary", html.GetSummary(), Field.Store.YES, Field.Index.NO));

			// "title": stored and tokenized, so it can be both searched and shown.
			result.Add(new Field("title", html.GetTitle(), Field.Store.YES, Field.Index.TOKENIZED));

			return result;
		}
	}
	}