blob: 3edf42b63a9d53deaf1fdc7511e8fb0c134e22e3 [file] [log] [blame]
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.SolrTestCaseUtil;
import org.junit.BeforeClass;
import org.junit.Test;
/**
 * Exercises multi-term queries (prefix, wildcard, fuzzy, regex and range, as issued by the
 * test methods of this class) against the fields defined by the "schema-folding.xml" test
 * schema, using query text whose case, accents or character forms differ from the indexed text.
 */
public class TestFoldingMultitermQuery extends SolrTestCaseJ4 {
// Returns the name of the core this test runs against: the literal "basic".
// NOTE(review): presumably overrides a hook of SolrTestCaseJ4; no @Override is visible here -- confirm.
public String getCoreName() {
return "basic";
// One-time fixture setup: initializes the core from solrconfig-basic.xml / schema-folding.xml
// and adds every document that the test methods of this class query.
// NOTE(review): org.junit.BeforeClass is imported, but no annotation is visible on this method
// in this view of the file -- confirm it is annotated.
public static void beforeTests() throws Exception {
initCore("solrconfig-basic.xml", "schema-folding.xml");
// Text of the three base documents; the array index becomes the document id. Each entry holds
// one distinctive token ending in '1' (abcdefg1, hijklmn1, opqrstu1) that the prefix and
// wildcard tests target.
String docs[] = {
"abcdefg1 finger",
"gangs hijklmn1",
"opqrstu1 zilly",
// prepare the index
for (int i = 0; i < docs.length; i++) {
// id and all four numeric fields carry the loop index
String num = Integer.toString(i);
// even ids are indexed with bool_f=true, odd ids with bool_f=false
String boolVal = ((i % 2) == 0) ? "true" : "false";
// The same text goes into every content* field, so the tests can query identical content
// through differently named fields (presumably each has its own analysis chain in
// schema-folding.xml -- confirm against the schema).
// date_f is January 1st of year 200<i % 10>, i.e. 2000, 2001, 2002 for the three base docs.
assertU(adoc("id", num,
"int_f", num,
"float_f", num,
"long_f", num,
"double_f", num,
"bool_f", boolVal,
"date_f", "200" + Integer.toString(i % 10) + "-01-01T00:00:00Z",
"content", docs[i],
"content_ws", docs[i],
"content_rev", docs[i],
"content_multi", docs[i],
"content_lower_token", docs[i],
"content_oldstyle", docs[i],
"content_charfilter", docs[i],
"content_multi_bad", docs[i],
"content_straight", docs[i],
"content_lower", docs[i],
"content_folding", docs[i],
"content_stemming", docs[i],
"content_keyword", docs[i]
// Mixing and matching amongst various languages is probably a bad thing, so add some tests for various
// special filters. Each of the documents below populates only its own language-specific field,
// and ids continue from docs.length upwards.
int idx = docs.length;
// Greek: the same word in mixed case and in upper case
assertU(adoc("id", Integer.toString(idx++), "content_greek", "Μάϊος"));
assertU(adoc("id", Integer.toString(idx++), "content_greek", "ΜΆΪΟΣ"));
// Turkish (\u0130 is LATIN CAPITAL LETTER I WITH DOT ABOVE)
assertU(adoc("id", Integer.toString(idx++), "content_turkish", "\u0130STANBUL"));
assertU(adoc("id", Integer.toString(idx++), "content_turkish", "ISPARTA"));
assertU(adoc("id", Integer.toString(idx++), "content_turkish", "izmir"));
// Russian normalization
assertU(adoc("id", Integer.toString(idx++), "content_russian", "электромагнитной"));
assertU(adoc("id", Integer.toString(idx++), "content_russian", "Вместе"));
assertU(adoc("id", Integer.toString(idx++), "content_russian", "силе"));
// Persian normalization
assertU(adoc("id", Integer.toString(idx++), "content_persian", "هاي"));
// Arabic normalization
assertU(adoc("id", Integer.toString(idx++), "content_arabic", "روبرت"));
// Hindi normalization
assertU(adoc("id", Integer.toString(idx++), "content_hindi", "हिंदी"));
assertU(adoc("id", Integer.toString(idx++), "content_hindi", "अाअा"));
// German normalization
assertU(adoc("id", Integer.toString(idx++), "content_german", "weissbier"));
// CJK width normalization (the indexed value is written with half-width katakana)
assertU(adoc("id", Integer.toString(idx++), "content_width", "ヴィッツ"));
// Runs upper-cased and/or accented prefix queries and checks, per query, that the document
// whose id equals the row index of the query is present in the response.
public void testPrefixCaseAccentFolding() throws Exception {
// Row N holds trailing-wildcard (prefix) queries that are expected to hit document N.
String matchOneDocPrefixUpper[][] = {
{"A*", "ÁB*", "ABÇ*"}, // these should find only doc 0
{"H*", "HÏ*", "HìJ*"}, // these should find only doc 1
{"O*", "ÖP*", "OPQ*"}, // these should find only doc 2
// Row N holds leading-wildcard queries, used against content_rev, expected to hit document N.
String matchRevPrefixUpper[][] = {
{"*Ğ1", "*DEfG1", "*EfG1"},
{"*N1", "*LmŊ1", "*MÑ1"},
{"*Ǖ1", "*sTu1", "*RŠTU1"}
// test the prefix queries find only one doc where the query is uppercased. Must go through query parser here!
for (int idx = 0; idx < matchOneDocPrefixUpper.length; idx++) {
for (int jdx = 0; jdx < matchOneDocPrefixUpper[idx].length; jdx++) {
String me = matchOneDocPrefixUpper[idx][jdx];
// The same query is issued against each field; the XPath requires an id element equal to idx.
assertQ(req("q", "content:" + me),
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
assertQ(req("q", "content_ws:" + me),
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
assertQ(req("q", "content_multi:" + me),
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
assertQ(req("q", "content_lower_token:" + me),
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
// NOTE(review): the expected-result argument of this call is not visible in this view of the file.
assertQ(req("q", "content_oldstyle:" + me),
// Leading-wildcard variants: only the content_rev field is queried.
for (int idx = 0; idx < matchRevPrefixUpper.length; idx++) {
for (int jdx = 0; jdx < matchRevPrefixUpper[idx].length; jdx++) {
String me = matchRevPrefixUpper[idx][jdx];
assertQ(req("q", "content_rev:" + me),
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
// Test that the wildcard queries find only one doc where the query is uppercased and/or accented.
// Each query mixes '*' and '?' wildcards with upper-case or accented letters; the assertion
// requires the document whose id equals the row index of the query.
public void testWildcardCaseAccentFolding() throws Exception {
String matchOneDocWildUpper[][] = {
{"Á*C*", "ÁB*1", "ABÇ*g1", "Á*FG1"}, // these should find only doc 0
{"H*k*", "HÏ*l?*", "HìJ*n*", "HìJ*m*"}, // these should find only doc 1
{"O*ř*", "ÖP*ş???", "OPQ*S?Ů*", "ÖP*1"}, // these should find only doc 2
for (int idx = 0; idx < matchOneDocWildUpper.length; idx++) {
for (int jdx = 0; jdx < matchOneDocWildUpper[idx].length; jdx++) {
String me = matchOneDocWildUpper[idx][jdx];
// Only this first assertion passes a failure message naming the offending query.
assertQ("Error with " + me, req("q", "content:" + me),
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
assertQ(req("q", "content_ws:" + me),
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
assertQ(req("q", "content_multi:" + me),
"//*[@name='id'][.='" + Integer.toString(idx) + "']");
// NOTE(review): the expected-result argument of this call is not visible in this view of the file.
assertQ(req("q", "content_oldstyle:" + me),
// Issues the mixed-case fuzzy query "ZiLLx~1" against three differently named fields.
// The indexed text of document 2 contains "zilly".
// NOTE(review): the expected-result XPath arguments are not visible in this view of the file.
public void testFuzzy() throws Exception {
assertQ(req("q", "content:ZiLLx~1"),
assertQ(req("q", "content_straight:ZiLLx~1"), // case preserving field shouldn't match
// NOTE(review): the trailing comment below repeats the one for content_straight although the
// field is named content_folding -- confirm which outcome is intended for this field.
assertQ(req("q", "content_folding:ZiLLx~1"), // case preserving field shouldn't match
// Issues regular-expression queries (delimited by '/') containing upper-case letters against
// several fields, including one pattern that contains a space.
// NOTE(review): the expected-result XPath arguments are not visible in this view of the file.
public void testRegex() throws Exception {
assertQ(req("q", "content:/Zill[a-z]/"),
assertQ(req("q", "content:/Zill[A-Z]/"), // everything in the regex gets lowercased?
assertQ(req("q", "content_keyword:/.*Zill[A-Z]/"),
assertQ(req("q", "content_straight:/Zill[a-z]/"), // case preserving field shouldn't match
assertQ(req("q", "content_folding:/Zill[a-z]/"), // case preserving field shouldn't match
// The pattern below spans the whole text of document 0 ("abcdefg1 finger") with different casing.
assertQ(req("q", "content_keyword:/Abcdefg1 Finger/"), // test spaces
// Checks prefix queries on the content_stemming field: "fings*" must return no documents,
// while "fing*" must return exactly one (document 0 contains "finger").
public void testGeneral() throws Exception {
assertQ(req("q", "content_stemming:fings*"), "//result[@numFound='0']"); // should not match (but would if fings* was stemmed to fing*)
assertQ(req("q", "content_stemming:fing*"), "//result[@numFound='1']");
// Phrases should fail. This test is mainly a marker, so that if phrases ever do start working
// with wildcards we go and update the documentation.
// The query is a quoted phrase whose second word carries a trailing '*'.
// NOTE(review): the expected-result XPath argument is not visible in this view of the file.
public void testPhrase() {
assertQ(req("q", "content:\"silly ABCD*\""),
// Issues range queries on the content field whose endpoints contain wildcard characters:
// an open-ended [* TO *] range, and two ranges with '*' / '?' embedded in upper-case endpoints.
// NOTE(review): the expected-result XPath arguments are not visible in this view of the file.
public void testWildcardRange() {
assertQ(req("q", "content:[* TO *]"),
assertQ(req("q", "content:[AB* TO Z*]"),
assertQ(req("q", "content:[AB*E?G* TO TU*W]"),
// Does the char filter get correctly handled?
// Issues three accented/upper-case wildcard patterns against the content_charfilter field;
// the same patterns also appear in testWildcardCaseAccentFolding for docs 0, 0 and 1 respectively.
// NOTE(review): the expected-result XPath arguments are not visible in this view of the file.
public void testCharFilter() {
assertQ(req("q", "content_charfilter:" + "Á*C*"),
assertQ(req("q", "content_charfilter:" + "ABÇ*g1"),
assertQ(req("q", "content_charfilter:" + "HÏ*l?*"),
// Issues two range queries on the content field whose endpoints mix accented and upper-case
// letters with wildcard characters: the first uses curly braces (exclusive bounds in the
// standard query syntax), the second square brackets (inclusive bounds).
// NOTE(review): the expected-result XPath arguments are not visible in this view of the file.
public void testRangeQuery() {
assertQ(req("q", "content:" + "{Ȫp*1 TO QŮ*}"),
assertQ(req("q", "content:" + "[Áb* TO f?Ñg?r]"),
// Issues exact-value and range queries against the non-text fields populated in beforeTests:
// the numeric fields, bool_f and date_f.
// NOTE(review): the expected-result XPath arguments are not visible in this view of the file.
public void testNonTextTypes() {
// Despite its name, this array also lists the float and double fields.
String[] intTypes = {"int_f", "float_f", "long_f", "double_f"};
for (String str : intTypes) {
assertQ(req("q", str + ":" + "0"),
assertQ(req("q", str + ":" + "[0 TO 2]"),
assertQ(req("q", "bool_f:true"),
assertQ(req("q", "bool_f:[false TO true]"),
// In the single-value date query the colons are backslash-escaped (the Java literal "\\:"
// yields "\:" in the query string); the range form below leaves them unescaped.
assertQ(req("q", "date_f:2000-01-01T00\\:00\\:00Z"),
assertQ(req("q", "date_f:[2000-12-31T23:59:59.999Z TO 2002-01-02T00:00:01Z]"),
// Verifies that a prefix query against content_multi_bad fails: the request must throw, and
// the thrown exception's cause must be a SolrException.
public void testMultiBad() {
try {
// NOTE(review): presumably registers the expected error message so it is not treated as an
// unexpected failure by the test framework -- confirm against SolrTestCaseJ4.
ignoreException("analyzer returned too many terms");
Exception expected = SolrTestCaseUtil.expectThrows(Exception.class, "Should throw exception when token evaluates to more than one term", () -> assertQ(req("q", "content_multi_bad:" + "abCD*")));
assertTrue(expected.getCause() instanceof org.apache.solr.common.SolrException);
// NOTE(review): the body of this finally block is not visible in this view of the file.
} finally {
// Queries content_greek with the same prefix written in lower case without accents, in upper
// case, and in mixed case; every variant must return both Greek documents (numFound=2).
public void testGreek() {
assertQ(req("q", "content_greek:μαιο*"), "//result[@numFound='2']");
assertQ(req("q", "content_greek:ΜΆΪΟ*"), "//result[@numFound='2']");
assertQ(req("q", "content_greek:Μάϊο*"), "//result[@numFound='2']");
// Queries content_russian with mixed-case wildcard patterns (wildcard in the middle or at the
// end of the term); each pattern must return exactly one document.
public void testRussian() {
assertQ(req("q", "content_russian:элЕктРомагн*тной"), "//result[@numFound='1']");
assertQ(req("q", "content_russian:Вме*те"), "//result[@numFound='1']");
assertQ(req("q", "content_russian:Си*е"), "//result[@numFound='1']");
assertQ(req("q", "content_russian:эЛектромагнИт*"), "//result[@numFound='1']");
// Queries content_persian with a prefix whose final letter appears to be a different yeh form
// than the one in the indexed value (NOTE(review): confirm the code points); exactly one
// document must be returned.
public void testPersian() {
assertQ(req("q", "content_persian:های*"), "//result[@numFound='1']");
// Queries content_arabic with the indexed word stretched by a run of tatweel (kashida)
// characters before its last letter, followed by '*'; exactly one document must be returned.
public void testArabic() {
assertQ(req("q", "content_arabic:روبرـــــــــــــــــــــــــــــــــت*"), "//result[@numFound='1']");
// Queries content_hindi with prefixes spelled differently from the indexed values (the two
// Hindi documents added in beforeTests); each query must return exactly one document.
public void testHindi() {
assertQ(req("q", "content_hindi:हिन्दी*"), "//result[@numFound='1']");
assertQ(req("q", "content_hindi:आआ*"), "//result[@numFound='1']");
// Queries content_german with the prefix "weiß*" while the indexed value is spelled
// "weissbier" (ss instead of ß); exactly one document must be returned.
public void testGerman() {
assertQ(req("q", "content_german:weiß*"), "//result[@numFound='1']");
// Queries content_width with a full-width katakana prefix while the indexed value is written
// with half-width katakana; exactly one document must be returned.
public void testCJKWidth() {
assertQ(req("q", "content_width:ヴィ*"), "//result[@numFound='1']");