blob: c1a5e097a0747104dbe37362496697cfbc54f8e4 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
import org.apache.lucene.util.RamUsageTester;
import org.apache.lucene.util.TestUtil;
import org.junit.Ignore;
/**
 * Loads hunspell dictionaries from a local copy of the OpenOffice dictionary archive and prints
 * the memory used by each parsed {@link Dictionary}.
 *
 * <p>The archive can be retrieved via:
 * wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/
 * Note some of the files differ only in case. This may be a problem on your operating system!
 *
 * <p>The test is ignored by default; point {@link #DICTIONARY_HOME} at the downloaded files and
 * enable it manually.
 */
@Ignore("enable manually")
@SuppressSysoutChecks(bugUrl = "prints important memory utilization stats per dictionary")
public class TestAllDictionaries extends LuceneTestCase {

  // set this to the location of where you downloaded all the files
  static final Path DICTIONARY_HOME =
      Paths.get("/data/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries");

  /**
   * Flat list of triples: the zip file below {@link #DICTIONARY_HOME}, followed by the names of
   * the dictionary entry and the affix entry inside that zip. Always step through it by 3.
   */
  final String[] tests = {
    /* zip file */ /* dictionary */ /* affix */
    "af_ZA.zip", "af_ZA.dic", "af_ZA.aff",
    "ak_GH.zip", "ak_GH.dic", "ak_GH.aff",
    "bg_BG.zip", "bg_BG.dic", "bg_BG.aff",
    "ca_ANY.zip", "catalan.dic", "catalan.aff",
    "ca_ES.zip", "ca_ES.dic", "ca_ES.aff",
    // BUG: broken flag "cop_EG.zip", "cop_EG.dic", "cop_EG.aff",
    "cs_CZ.zip", "cs_CZ.dic", "cs_CZ.aff",
    "cy_GB.zip", "cy_GB.dic", "cy_GB.aff",
    "da_DK.zip", "da_DK.dic", "da_DK.aff",
    "de_AT.zip", "de_AT.dic", "de_AT.aff",
    "de_CH.zip", "de_CH.dic", "de_CH.aff",
    "de_DE.zip", "de_DE.dic", "de_DE.aff",
    "de_DE_comb.zip", "de_DE_comb.dic", "de_DE_comb.aff",
    "de_DE_frami.zip", "de_DE_frami.dic", "de_DE_frami.aff",
    "de_DE_neu.zip", "de_DE_neu.dic", "de_DE_neu.aff",
    "el_GR.zip", "el_GR.dic", "el_GR.aff",
    "en_AU.zip", "en_AU.dic", "en_AU.aff",
    "en_CA.zip", "en_CA.dic", "en_CA.aff",
    "en_GB-oed.zip", "en_GB-oed.dic", "en_GB-oed.aff",
    "en_GB.zip", "en_GB.dic", "en_GB.aff",
    "en_NZ.zip", "en_NZ.dic", "en_NZ.aff",
    "eo.zip", "eo_l3.dic", "eo_l3.aff",
    "eo_EO.zip", "eo_EO.dic", "eo_EO.aff",
    "es_AR.zip", "es_AR.dic", "es_AR.aff",
    "es_BO.zip", "es_BO.dic", "es_BO.aff",
    "es_CL.zip", "es_CL.dic", "es_CL.aff",
    "es_CO.zip", "es_CO.dic", "es_CO.aff",
    "es_CR.zip", "es_CR.dic", "es_CR.aff",
    "es_CU.zip", "es_CU.dic", "es_CU.aff",
    "es_DO.zip", "es_DO.dic", "es_DO.aff",
    "es_EC.zip", "es_EC.dic", "es_EC.aff",
    "es_ES.zip", "es_ES.dic", "es_ES.aff",
    "es_GT.zip", "es_GT.dic", "es_GT.aff",
    "es_HN.zip", "es_HN.dic", "es_HN.aff",
    "es_MX.zip", "es_MX.dic", "es_MX.aff",
    "es_NEW.zip", "es_NEW.dic", "es_NEW.aff",
    "es_NI.zip", "es_NI.dic", "es_NI.aff",
    "es_PA.zip", "es_PA.dic", "es_PA.aff",
    "es_PE.zip", "es_PE.dic", "es_PE.aff",
    "es_PR.zip", "es_PR.dic", "es_PR.aff",
    "es_PY.zip", "es_PY.dic", "es_PY.aff",
    "es_SV.zip", "es_SV.dic", "es_SV.aff",
    "es_UY.zip", "es_UY.dic", "es_UY.aff",
    "es_VE.zip", "es_VE.dic", "es_VE.aff",
    "et_EE.zip", "et_EE.dic", "et_EE.aff",
    "fo_FO.zip", "fo_FO.dic", "fo_FO.aff",
    "fr_FR-1990_1-3-2.zip", "fr_FR-1990.dic", "fr_FR-1990.aff",
    "fr_FR-classique_1-3-2.zip", "fr_FR-classique.dic", "fr_FR-classique.aff",
    "fr_FR_1-3-2.zip", "fr_FR.dic", "fr_FR.aff",
    "fy_NL.zip", "fy_NL.dic", "fy_NL.aff",
    "ga_IE.zip", "ga_IE.dic", "ga_IE.aff",
    "gd_GB.zip", "gd_GB.dic", "gd_GB.aff",
    "gl_ES.zip", "gl_ES.dic", "gl_ES.aff",
    "gsc_FR.zip", "gsc_FR.dic", "gsc_FR.aff",
    "gu_IN.zip", "gu_IN.dic", "gu_IN.aff",
    "he_IL.zip", "he_IL.dic", "he_IL.aff",
    "hi_IN.zip", "hi_IN.dic", "hi_IN.aff",
    "hil_PH.zip", "hil_PH.dic", "hil_PH.aff",
    "hr_HR.zip", "hr_HR.dic", "hr_HR.aff",
    "hu_HU.zip", "hu_HU.dic", "hu_HU.aff",
    "hu_HU_comb.zip", "hu_HU.dic", "hu_HU.aff",
    "ia.zip", "ia.dic", "ia.aff",
    "id_ID.zip", "id_ID.dic", "id_ID.aff",
    "it_IT.zip", "it_IT.dic", "it_IT.aff",
    "ku_TR.zip", "ku_TR.dic", "ku_TR.aff",
    "la.zip", "la.dic", "la.aff",
    "lt_LT.zip", "lt_LT.dic", "lt_LT.aff",
    "lv_LV.zip", "lv_LV.dic", "lv_LV.aff",
    "mg_MG.zip", "mg_MG.dic", "mg_MG.aff",
    "mi_NZ.zip", "mi_NZ.dic", "mi_NZ.aff",
    "mk_MK.zip", "mk_MK.dic", "mk_MK.aff",
    "mos_BF.zip", "mos_BF.dic", "mos_BF.aff",
    "mr_IN.zip", "mr_IN.dic", "mr_IN.aff",
    "ms_MY.zip", "ms_MY.dic", "ms_MY.aff",
    "nb_NO.zip", "nb_NO.dic", "nb_NO.aff",
    "ne_NP.zip", "ne_NP.dic", "ne_NP.aff",
    "nl_NL.zip", "nl_NL.dic", "nl_NL.aff",
    "nl_med.zip", "nl_med.dic", "nl_med.aff",
    "nn_NO.zip", "nn_NO.dic", "nn_NO.aff",
    "nr_ZA.zip", "nr_ZA.dic", "nr_ZA.aff",
    "ns_ZA.zip", "ns_ZA.dic", "ns_ZA.aff",
    "ny_MW.zip", "ny_MW.dic", "ny_MW.aff",
    "oc_FR.zip", "oc_FR.dic", "oc_FR.aff",
    "pl_PL.zip", "pl_PL.dic", "pl_PL.aff",
    "pt_BR.zip", "pt_BR.dic", "pt_BR.aff",
    "pt_PT.zip", "pt_PT.dic", "pt_PT.aff",
    "ro_RO.zip", "ro_RO.dic", "ro_RO.aff",
    "ru_RU.zip", "ru_RU.dic", "ru_RU.aff",
    "ru_RU_ye.zip", "ru_RU_ie.dic", "ru_RU_ie.aff",
    "ru_RU_yo.zip", "ru_RU_yo.dic", "ru_RU_yo.aff",
    "rw_RW.zip", "rw_RW.dic", "rw_RW.aff",
    "sk_SK.zip", "sk_SK.dic", "sk_SK.aff",
    "sl_SI.zip", "sl_SI.dic", "sl_SI.aff",
    "sq_AL.zip", "sq_AL.dic", "sq_AL.aff",
    "ss_ZA.zip", "ss_ZA.dic", "ss_ZA.aff",
    "st_ZA.zip", "st_ZA.dic", "st_ZA.aff",
    "sv_SE.zip", "sv_SE.dic", "sv_SE.aff",
    "sw_KE.zip", "sw_KE.dic", "sw_KE.aff",
    "tet_ID.zip", "tet_ID.dic", "tet_ID.aff",
    "th_TH.zip", "th_TH.dic", "th_TH.aff",
    "tl_PH.zip", "tl_PH.dic", "tl_PH.aff",
    "tn_ZA.zip", "tn_ZA.dic", "tn_ZA.aff",
    "ts_ZA.zip", "ts_ZA.dic", "ts_ZA.aff",
    "uk_UA.zip", "uk_UA.dic", "uk_UA.aff",
    "ve_ZA.zip", "ve_ZA.dic", "ve_ZA.aff",
    "vi_VN.zip", "vi_VN.dic", "vi_VN.aff",
    "xh_ZA.zip", "xh_ZA.dic", "xh_ZA.aff",
    "zu_ZA.zip", "zu_ZA.dic", "zu_ZA.aff",
  };

  /**
   * Parses every dictionary listed in {@link #tests} and prints one line per dictionary with its
   * total size and the size of its individual data structures.
   */
  public void test() throws Exception {
    Path tmp = LuceneTestCase.createTempDir();

    for (int i = 0; i < tests.length; i += 3) {
      try (Directory tempDir = getDirectory()) {
        Dictionary dic = loadDictionary(tmp, i, tempDir);
        System.out.println(tests[i] + "\t" + RamUsageTester.humanSizeOf(dic) + "\t(" +
            "words=" + RamUsageTester.humanSizeOf(dic.words) + ", " +
            "flags=" + RamUsageTester.humanSizeOf(dic.flagLookup) + ", " +
            "strips=" + RamUsageTester.humanSizeOf(dic.stripData) + ", " +
            "conditions=" + RamUsageTester.humanSizeOf(dic.patterns) + ", " +
            "affixData=" + RamUsageTester.humanSizeOf(dic.affixData) + ", " +
            "prefixes=" + RamUsageTester.humanSizeOf(dic.prefixes) + ", " +
            "suffixes=" + RamUsageTester.humanSizeOf(dic.suffixes) + ")");
      }
    }
  }

  /**
   * Parses only the dictionary whose zip file name equals the hard-coded {@code toTest} value;
   * edit that value to debug a single dictionary. Fails if the name is not listed in
   * {@link #tests}, so a typo cannot turn the test into a silent no-op.
   */
  public void testOneDictionary() throws Exception {
    Path tmp = LuceneTestCase.createTempDir();
    String toTest = "zu_ZA.zip";
    boolean found = false;

    // zip names sit at every third position, so only those positions are compared
    for (int i = 0; i < tests.length; i += 3) {
      if (tests[i].equals(toTest)) {
        found = true;
        try (Directory tempDir = getDirectory()) {
          loadDictionary(tmp, i, tempDir);
        }
      }
    }

    assert found : toTest + " is not listed in tests";
  }

  /**
   * Unzips the archive named by {@code tests[index]} into {@code tmp} and builds a
   * {@link Dictionary} from the entries named by {@code tests[index + 1]} (dictionary) and
   * {@code tests[index + 2]} (affix).
   *
   * @param tmp scratch directory; it is deleted and recreated before the archive is extracted
   * @param index position of the zip file name within {@link #tests}
   * @param tempDir directory handed to the {@link Dictionary} constructor; the caller closes it
   * @return the parsed dictionary
   */
  private Dictionary loadDictionary(Path tmp, int index, Directory tempDir) throws Exception {
    Path zip = DICTIONARY_HOME.resolve(tests[index]);
    assert Files.exists(zip) : "missing dictionary archive: " + zip;

    // wipe and recreate the scratch directory before each extraction
    IOUtils.rm(tmp);
    Files.createDirectory(tmp);
    try (InputStream in = Files.newInputStream(zip)) {
      TestUtil.unzip(in, tmp);
    }

    Path dicEntry = tmp.resolve(tests[index + 1]);
    Path affEntry = tmp.resolve(tests[index + 2]);
    try (InputStream dictionary = Files.newInputStream(dicEntry);
        InputStream affix = Files.newInputStream(affEntry)) {
      return new Dictionary(tempDir, "dictionary", affix, dictionary);
    }
  }

  /** Returns a fresh test {@link Directory} obtained from {@code newDirectory()}. */
  private Directory getDirectory() {
    return newDirectory();
  }
}