blob: 04e52d8fcab3009f64e5ffb70e05c876aa6c35bd [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
import org.junit.Test;
/**
 * Tests that multiterm queries (prefix, wildcard, fuzzy, regex, and range) have the
 * field's multiterm analysis applied to the query terms before expansion, so that
 * case folding, accent folding, char filters, and language-specific normalization
 * filters configured in {@code schema-folding.xml} affect matching. Fields like
 * {@code content_oldstyle} (no multiterm analysis) are used as negative controls.
 */
public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {

  public String getCoreName() {
    return "basic";
  }

  @BeforeClass
  public static void beforeTests() throws Exception {
    initCore("solrconfig-basic.xml", "schema-folding.xml");

    String docs[] = {
        "abcdefg1 finger",
        "gangs hijklmn1",
        "opqrstu1 zilly",
    };

    // Prepare the index: each doc copies the same content into every analyzed
    // content_* field variant, plus simple numeric/bool/date fields keyed off i.
    for (int i = 0; i < docs.length; i++) {
      String num = Integer.toString(i);
      String boolVal = ((i % 2) == 0) ? "true" : "false";
      assertU(adoc("id", num,
          "int_f", num,
          "float_f", num,
          "long_f", num,
          "double_f", num,
          "bool_f", boolVal,
          "date_f", "200" + Integer.toString(i % 10) + "-01-01T00:00:00Z",
          "content", docs[i],
          "content_ws", docs[i],
          "content_rev", docs[i],
          "content_multi", docs[i],
          "content_lower_token", docs[i],
          "content_oldstyle", docs[i],
          "content_charfilter", docs[i],
          "content_multi_bad", docs[i],
          "content_straight", docs[i],
          "content_lower", docs[i],
          "content_folding", docs[i],
          "content_stemming", docs[i],
          "content_keyword", docs[i]
      ));
    }

    // Mixing and matching amongst various languages is probably a bad thing, so add
    // some tests for various special filters
    int idx = docs.length;
    // Greek
    assertU(adoc("id", Integer.toString(idx++), "content_greek", "Μάϊος"));
    assertU(adoc("id", Integer.toString(idx++), "content_greek", "ΜΆΪΟΣ"));
    // Turkish
    assertU(adoc("id", Integer.toString(idx++), "content_turkish", "\u0130STANBUL"));
    assertU(adoc("id", Integer.toString(idx++), "content_turkish", "ISPARTA"));
    assertU(adoc("id", Integer.toString(idx++), "content_turkish", "izmir"));
    // Russian normalization
    assertU(adoc("id", Integer.toString(idx++), "content_russian", "электромагнитной"));
    assertU(adoc("id", Integer.toString(idx++), "content_russian", "Вместе"));
    assertU(adoc("id", Integer.toString(idx++), "content_russian", "силе"));
    // persian normalization
    assertU(adoc("id", Integer.toString(idx++), "content_persian", "هاي"));
    // arabic normalization
    assertU(adoc("id", Integer.toString(idx++), "content_arabic", "روبرت"));
    // hindi normalization
    assertU(adoc("id", Integer.toString(idx++), "content_hindi", "हिंदी"));
    assertU(adoc("id", Integer.toString(idx++), "content_hindi", "अाअा"));
    // german normalization
    assertU(adoc("id", Integer.toString(idx++), "content_german", "weissbier"));
    // cjk width normalization
    assertU(adoc("id", Integer.toString(idx++), "content_width", "ヴィッツ"));
    assertU(commit());
  }

  @Test
  public void testPrefixCaseAccentFolding() throws Exception {
    String matchOneDocPrefixUpper[][] = {
        {"A*", "ÁB*", "ABÇ*"},   // these should find only doc 0
        {"H*", "HÏ*", "HìJ*"},   // these should find only doc 1
        {"O*", "ÖP*", "OPQ*"},   // these should find only doc 2
    };
    String matchRevPrefixUpper[][] = {
        {"*Ğ1", "*DEfG1", "*EfG1"},
        {"*N1", "*LmŊ1", "*MÑ1"},
        {"*Ǖ1", "*sTu1", "*RŠTU1"}
    };

    // test the prefix queries find only one doc where the query is uppercased.
    // Must go through query parser here!
    for (int idx = 0; idx < matchOneDocPrefixUpper.length; idx++) {
      for (int jdx = 0; jdx < matchOneDocPrefixUpper[idx].length; jdx++) {
        String me = matchOneDocPrefixUpper[idx][jdx];
        assertQ(req("q", "content:" + me),
            "//*[@numFound='1']",
            "//*[@name='id'][.='" + Integer.toString(idx) + "']");
        assertQ(req("q", "content_ws:" + me),
            "//*[@numFound='1']",
            "//*[@name='id'][.='" + Integer.toString(idx) + "']");
        assertQ(req("q", "content_multi:" + me),
            "//*[@numFound='1']",
            "//*[@name='id'][.='" + Integer.toString(idx) + "']");
        assertQ(req("q", "content_lower_token:" + me),
            "//result[@numFound='1']",
            "//*[@name='id'][.='" + Integer.toString(idx) + "']");
        // old-style field gets no multiterm analysis, so the uppercased query misses
        assertQ(req("q", "content_oldstyle:" + me),
            "//result[@numFound='0']");
      }
    }

    // reversed-wildcard field: leading-wildcard patterns should still find one doc each
    for (int idx = 0; idx < matchRevPrefixUpper.length; idx++) {
      for (int jdx = 0; jdx < matchRevPrefixUpper[idx].length; jdx++) {
        String me = matchRevPrefixUpper[idx][jdx];
        assertQ(req("q", "content_rev:" + me),
            "//*[@numFound='1']",
            "//*[@name='id'][.='" + Integer.toString(idx) + "']");
      }
    }
  }

  // test the wildcard queries find only one doc where the query is uppercased and/or accented.
  @Test
  public void testWildcardCaseAccentFolding() throws Exception {
    String matchOneDocWildUpper[][] = {
        {"Á*C*", "ÁB*1", "ABÇ*g1", "Á*FG1"},      // these should find only doc 0
        {"H*k*", "HÏ*l?*", "HìJ*n*", "HìJ*m*"},   // these should find only doc 1
        {"O*ř*", "ÖP*ş???", "OPQ*S?Ů*", "ÖP*1"},  // these should find only doc 2
    };

    for (int idx = 0; idx < matchOneDocWildUpper.length; idx++) {
      for (int jdx = 0; jdx < matchOneDocWildUpper[idx].length; jdx++) {
        String me = matchOneDocWildUpper[idx][jdx];
        assertQ("Error with " + me, req("q", "content:" + me),
            "//result[@numFound='1']",
            "//*[@name='id'][.='" + Integer.toString(idx) + "']");
        assertQ(req("q", "content_ws:" + me),
            "//result[@numFound='1']",
            "//*[@name='id'][.='" + Integer.toString(idx) + "']");
        assertQ(req("q", "content_multi:" + me),
            "//result[@numFound='1']",
            "//*[@name='id'][.='" + Integer.toString(idx) + "']");
        // negative control: no multiterm analysis on the old-style field
        assertQ(req("q", "content_oldstyle:" + me),
            "//result[@numFound='0']");
      }
    }
  }

  @Test
  public void testFuzzy() throws Exception {
    assertQ(req("q", "content:ZiLLx~1"),
        "//result[@numFound='1']");
    assertQ(req("q", "content_straight:ZiLLx~1"), // case preserving field shouldn't match
        "//result[@numFound='0']");
    assertQ(req("q", "content_folding:ZiLLx~1"), // case preserving field shouldn't match
        "//result[@numFound='0']");
  }

  @Test
  public void testRegex() throws Exception {
    assertQ(req("q", "content:/Zill[a-z]/"),
        "//result[@numFound='1']");
    assertQ(req("q", "content:/Zill[A-Z]/"), // everything in the regex gets lowercased?
        "//result[@numFound='1']");
    assertQ(req("q", "content_keyword:/.*Zill[A-Z]/"),
        "//result[@numFound='1']");
    assertQ(req("q", "content_straight:/Zill[a-z]/"), // case preserving field shouldn't match
        "//result[@numFound='0']");
    assertQ(req("q", "content_folding:/Zill[a-z]/"), // case preserving field shouldn't match
        "//result[@numFound='0']");
    assertQ(req("q", "content_keyword:/Abcdefg1 Finger/"), // test spaces
        "//result[@numFound='1']");
  }

  @Test
  public void testGeneral() throws Exception {
    // should not match (but would if fings* was stemmed to fing*)
    assertQ(req("q", "content_stemming:fings*"), "//result[@numFound='0']");
    assertQ(req("q", "content_stemming:fing*"), "//result[@numFound='1']");
  }

  // Phrases should fail. This test is mainly a marker so if phrases ever do start working
  // with wildcards we go and update the documentation
  @Test
  public void testPhrase() {
    assertQ(req("q", "content:\"silly ABCD*\""),
        "//result[@numFound='0']");
  }

  @Test
  public void testWildcardRange() {
    assertQ(req("q", "content:[* TO *]"),
        "//result[@numFound='3']");
    assertQ(req("q", "content:[AB* TO Z*]"),
        "//result[@numFound='3']");
    assertQ(req("q", "content:[AB*E?G* TO TU*W]"),
        "//result[@numFound='3']");
  }

  // Does the char filter get correctly handled?
  @Test
  public void testCharFilter() {
    assertQ(req("q", "content_charfilter:" + "Á*C*"),
        "//result[@numFound='1']",
        "//*[@name='id'][.='0']");
    assertQ(req("q", "content_charfilter:" + "ABÇ*g1"),
        "//result[@numFound='1']",
        "//*[@name='id'][.='0']");
    assertQ(req("q", "content_charfilter:" + "HÏ*l?*"),
        "//result[@numFound='1']",
        "//*[@name='id'][.='1']");
  }

  @Test
  public void testRangeQuery() {
    assertQ(req("q", "content:" + "{Ȫp*1 TO QŮ*}"),
        "//result[@numFound='1']",
        "//*[@name='id'][.='2']");
    assertQ(req("q", "content:" + "[Áb* TO f?Ñg?r]"),
        "//result[@numFound='1']",
        "//*[@name='id'][.='0']");
  }

  @Test
  public void testNonTextTypes() {
    String[] intTypes = {"int_f", "float_f", "long_f", "double_f"};

    for (String str : intTypes) {
      assertQ(req("q", str + ":" + "0"),
          "//result[@numFound='1']",
          "//*[@name='id'][.='0']");
      assertQ(req("q", str + ":" + "[0 TO 2]"),
          "//result[@numFound='3']",
          "//*[@name='id'][.='0']",
          "//*[@name='id'][.='1']",
          "//*[@name='id'][.='2']");
    }

    assertQ(req("q", "bool_f:true"),
        "//result[@numFound='2']",
        "//*[@name='id'][.='0']",
        "//*[@name='id'][.='2']");
    assertQ(req("q", "bool_f:[false TO true]"),
        "//result[@numFound='3']",
        "//*[@name='id'][.='0']",
        "//*[@name='id'][.='1']",
        "//*[@name='id'][.='2']");
    assertQ(req("q", "date_f:2000-01-01T00\\:00\\:00Z"),
        "//result[@numFound='1']",
        "//*[@name='id'][.='0']");
    assertQ(req("q", "date_f:[2000-12-31T23:59:59.999Z TO 2002-01-02T00:00:01Z]"),
        "//result[@numFound='2']",
        "//*[@name='id'][.='1']",
        "//*[@name='id'][.='2']");
  }

  @Test
  public void testMultiBad() {
    try {
      ignoreException("analyzer returned too many terms");
      Exception expected = expectThrows(Exception.class,
          "Should throw exception when token evaluates to more than one term",
          () -> assertQ(req("q", "content_multi_bad:" + "abCD*"))
      );
      assertTrue(expected.getCause() instanceof org.apache.solr.common.SolrException);
    } finally {
      resetExceptionIgnores();
    }
  }

  @Test
  public void testGreek() {
    assertQ(req("q", "content_greek:μαιο*"), "//result[@numFound='2']");
    assertQ(req("q", "content_greek:ΜΆΪΟ*"), "//result[@numFound='2']");
    assertQ(req("q", "content_greek:Μάϊο*"), "//result[@numFound='2']");
  }

  @Test
  public void testRussian() {
    assertQ(req("q", "content_russian:элЕктРомагн*тной"), "//result[@numFound='1']");
    assertQ(req("q", "content_russian:Вме*те"), "//result[@numFound='1']");
    assertQ(req("q", "content_russian:Си*е"), "//result[@numFound='1']");
    assertQ(req("q", "content_russian:эЛектромагнИт*"), "//result[@numFound='1']");
  }

  @Test
  public void testPersian() {
    assertQ(req("q", "content_persian:های*"), "//result[@numFound='1']");
  }

  @Test
  public void testArabic() {
    assertQ(req("q", "content_arabic:روبرـــــــــــــــــــــــــــــــــت*"), "//result[@numFound='1']");
  }

  @Test
  public void testHindi() {
    assertQ(req("q", "content_hindi:हिन्दी*"), "//result[@numFound='1']");
    assertQ(req("q", "content_hindi:आआ*"), "//result[@numFound='1']");
  }

  @Test
  public void testGerman() {
    assertQ(req("q", "content_german:weiß*"), "//result[@numFound='1']");
  }

  @Test
  public void testCJKWidth() {
    assertQ(req("q", "content_width:ヴィ*"), "//result[@numFound='1']");
  }
}