// blob: 15e6a7c548113900dfc5391514919b60686936c5 [file] [log] [blame]
using J2N.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Queries.Mlt;
using Lucene.Net.Search;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.IO;
using System.Xml;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.QueryParsers.Xml.Builders
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Builder for <see cref="MoreLikeThisQuery"/>.
/// <para/>
/// Reads the like-text from the XML element (via <c>DOMUtils.GetText</c>) and the
/// optional attributes <c>fieldNames</c>, <c>stopWords</c>, <c>maxQueryTerms</c>,
/// <c>minTermFrequency</c>, <c>percentTermsToMatch</c>, <c>minDocFreq</c> and <c>boost</c>.
/// </summary>
public class LikeThisQueryBuilder : IQueryBuilder
{
    private const int DEFAULT_MAX_QUERY_TERMS = 20;
    private const int DEFAULT_MIN_TERM_FREQUENCY = 1;
    // Expressed as a percentage (it is divided by 100 before being applied):
    // roughly a third of the selected terms must match.
    private const float DEFAULT_PERCENT_TERMS_TO_MATCH = 30;

    private readonly Analyzer analyzer;
    private readonly string[] defaultFieldNames;

    /// <summary>
    /// Creates a builder.
    /// </summary>
    /// <param name="analyzer">Analyzer handed to the created query and used to tokenize the <c>stopWords</c> attribute.</param>
    /// <param name="defaultFieldNames">Field names used when the element has no (or a blank) <c>fieldNames</c> attribute.</param>
    public LikeThisQueryBuilder(Analyzer analyzer, string[] defaultFieldNames)
    {
        this.analyzer = analyzer;
        this.defaultFieldNames = defaultFieldNames;
    }

    /// <summary>
    /// Builds a <see cref="MoreLikeThisQuery"/> from the given element.
    /// </summary>
    /// <param name="e">The element describing the query.</param>
    /// <returns>The configured <see cref="MoreLikeThisQuery"/>.</returns>
    /// <exception cref="ParserException">
    /// No field names are available (neither a usable <c>fieldNames</c> attribute nor default
    /// field names), or an <see cref="IOException"/> occurred while analyzing the stop words.
    /// </exception>
    public virtual Query GetQuery(XmlElement e)
    {
        string[] fields = ResolveFieldNames(e.GetAttribute("fieldNames")); //a comma-delimited list of fields

        // The first field is passed to the query constructor below, so fail with a
        // parser error here rather than a NullReferenceException/IndexOutOfRangeException there.
        if (fields == null || fields.Length == 0)
        {
            throw new ParserException(GetType().Name
                + " requires at least one field name: specify a \"fieldNames\" attribute or supply default field names");
        }

        //Parse any "stopWords" attribute
        //TODO MoreLikeThis needs to ideally have per-field stopWords lists - until then
        //I use all analyzers/fields to generate multi-field compatible stop list
        ISet<string> stopWordsSet = BuildStopWordSet(e.GetAttribute("stopWords"), fields);

        MoreLikeThisQuery mlt = new MoreLikeThisQuery(DOMUtils.GetText(e), fields, analyzer, fields[0]);
        mlt.MaxQueryTerms = DOMUtils.GetAttribute(e, "maxQueryTerms", DEFAULT_MAX_QUERY_TERMS);
        mlt.MinTermFrequency = DOMUtils.GetAttribute(e, "minTermFrequency", DEFAULT_MIN_TERM_FREQUENCY);
        // The attribute is a percentage; the query property takes a fraction.
        mlt.PercentTermsToMatch = DOMUtils.GetAttribute(e, "percentTermsToMatch", DEFAULT_PERCENT_TERMS_TO_MATCH) / 100;
        mlt.StopWords = stopWordsSet;

        // A negative value means "not specified": leave the query's own setting untouched.
        int minDocFreq = DOMUtils.GetAttribute(e, "minDocFreq", -1);
        if (minDocFreq >= 0)
        {
            mlt.MinDocFreq = minDocFreq;
        }

        mlt.Boost = DOMUtils.GetAttribute(e, "boost", 1.0f);
        return mlt;
    }

    /// <summary>
    /// Returns the trimmed entries of a comma-delimited field list, or the default
    /// field names when the list is null or blank.
    /// </summary>
    private string[] ResolveFieldNames(string fieldsList)
    {
        if (string.IsNullOrWhiteSpace(fieldsList))
        {
            return defaultFieldNames;
        }

        string[] fields = fieldsList.Trim().Split(',').TrimEnd();
        //trim the fieldnames
        for (int i = 0; i < fields.Length; i++)
        {
            fields[i] = fields[i].Trim();
        }
        return fields;
    }

    /// <summary>
    /// Runs the stop word text through the analyzer once per field and collects every
    /// emitted term. Returns <c>null</c> only when <paramref name="stopWords"/> is null;
    /// text that yields no tokens (such as an empty string) produces an empty set.
    /// </summary>
    private ISet<string> BuildStopWordSet(string stopWords, string[] fields)
    {
        if (stopWords == null)
        {
            return null;
        }

        ISet<string> stopWordsSet = new JCG.HashSet<string>();
        foreach (string field in fields)
        {
            TokenStream ts = null;
            try
            {
                ts = analyzer.GetTokenStream(field, stopWords);
                ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
                ts.Reset();
                while (ts.IncrementToken())
                {
                    stopWordsSet.Add(termAtt.ToString());
                }
                ts.End();
            }
            catch (IOException ioe)
            {
                throw new ParserException("IoException parsing stop words list in "
                    + GetType().Name + ":" + ioe.Message);
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
        return stopWordsSet;
    }
}
}