| package org.apache.lucene.analysis.hunspell; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.InputStream; |
| import java.nio.file.Files; |
| import java.nio.file.Path; |
| import java.nio.file.Paths; |
| |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.MockDirectoryWrapper; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks; |
| import org.apache.lucene.util.RamUsageTester; |
| import org.apache.lucene.util.TestUtil; |
| import org.junit.Ignore; |
| |
| /** |
| * Can be retrieved via: |
| * wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/ |
| * Note some of the files differ only in case. This may be a problem on your operating system! |
| */ |
| @Ignore("enable manually") |
| @SuppressSysoutChecks(bugUrl = "prints important memory utilization stats per dictionary") |
| public class TestAllDictionaries extends LuceneTestCase { |
| |
| // set this to the location of where you downloaded all the files |
| static final Path DICTIONARY_HOME = |
| Paths.get("/data/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries"); |
| |
| final String tests[] = { |
| /* zip file */ /* dictionary */ /* affix */ |
| "af_ZA.zip", "af_ZA.dic", "af_ZA.aff", |
| "ak_GH.zip", "ak_GH.dic", "ak_GH.aff", |
| "bg_BG.zip", "bg_BG.dic", "bg_BG.aff", |
| "ca_ANY.zip", "catalan.dic", "catalan.aff", |
| "ca_ES.zip", "ca_ES.dic", "ca_ES.aff", |
| // BUG: broken flag "cop_EG.zip", "cop_EG.dic", "cop_EG.aff", |
| "cs_CZ.zip", "cs_CZ.dic", "cs_CZ.aff", |
| "cy_GB.zip", "cy_GB.dic", "cy_GB.aff", |
| "da_DK.zip", "da_DK.dic", "da_DK.aff", |
| "de_AT.zip", "de_AT.dic", "de_AT.aff", |
| "de_CH.zip", "de_CH.dic", "de_CH.aff", |
| "de_DE.zip", "de_DE.dic", "de_DE.aff", |
| "de_DE_comb.zip", "de_DE_comb.dic", "de_DE_comb.aff", |
| "de_DE_frami.zip", "de_DE_frami.dic", "de_DE_frami.aff", |
| "de_DE_neu.zip", "de_DE_neu.dic", "de_DE_neu.aff", |
| "el_GR.zip", "el_GR.dic", "el_GR.aff", |
| "en_AU.zip", "en_AU.dic", "en_AU.aff", |
| "en_CA.zip", "en_CA.dic", "en_CA.aff", |
| "en_GB-oed.zip", "en_GB-oed.dic", "en_GB-oed.aff", |
| "en_GB.zip", "en_GB.dic", "en_GB.aff", |
| "en_NZ.zip", "en_NZ.dic", "en_NZ.aff", |
| "eo.zip", "eo_l3.dic", "eo_l3.aff", |
| "eo_EO.zip", "eo_EO.dic", "eo_EO.aff", |
| "es_AR.zip", "es_AR.dic", "es_AR.aff", |
| "es_BO.zip", "es_BO.dic", "es_BO.aff", |
| "es_CL.zip", "es_CL.dic", "es_CL.aff", |
| "es_CO.zip", "es_CO.dic", "es_CO.aff", |
| "es_CR.zip", "es_CR.dic", "es_CR.aff", |
| "es_CU.zip", "es_CU.dic", "es_CU.aff", |
| "es_DO.zip", "es_DO.dic", "es_DO.aff", |
| "es_EC.zip", "es_EC.dic", "es_EC.aff", |
| "es_ES.zip", "es_ES.dic", "es_ES.aff", |
| "es_GT.zip", "es_GT.dic", "es_GT.aff", |
| "es_HN.zip", "es_HN.dic", "es_HN.aff", |
| "es_MX.zip", "es_MX.dic", "es_MX.aff", |
| "es_NEW.zip", "es_NEW.dic", "es_NEW.aff", |
| "es_NI.zip", "es_NI.dic", "es_NI.aff", |
| "es_PA.zip", "es_PA.dic", "es_PA.aff", |
| "es_PE.zip", "es_PE.dic", "es_PE.aff", |
| "es_PR.zip", "es_PR.dic", "es_PR.aff", |
| "es_PY.zip", "es_PY.dic", "es_PY.aff", |
| "es_SV.zip", "es_SV.dic", "es_SV.aff", |
| "es_UY.zip", "es_UY.dic", "es_UY.aff", |
| "es_VE.zip", "es_VE.dic", "es_VE.aff", |
| "et_EE.zip", "et_EE.dic", "et_EE.aff", |
| "fo_FO.zip", "fo_FO.dic", "fo_FO.aff", |
| "fr_FR-1990_1-3-2.zip", "fr_FR-1990.dic", "fr_FR-1990.aff", |
| "fr_FR-classique_1-3-2.zip", "fr_FR-classique.dic", "fr_FR-classique.aff", |
| "fr_FR_1-3-2.zip", "fr_FR.dic", "fr_FR.aff", |
| "fy_NL.zip", "fy_NL.dic", "fy_NL.aff", |
| "ga_IE.zip", "ga_IE.dic", "ga_IE.aff", |
| "gd_GB.zip", "gd_GB.dic", "gd_GB.aff", |
| "gl_ES.zip", "gl_ES.dic", "gl_ES.aff", |
| "gsc_FR.zip", "gsc_FR.dic", "gsc_FR.aff", |
| "gu_IN.zip", "gu_IN.dic", "gu_IN.aff", |
| "he_IL.zip", "he_IL.dic", "he_IL.aff", |
| "hi_IN.zip", "hi_IN.dic", "hi_IN.aff", |
| "hil_PH.zip", "hil_PH.dic", "hil_PH.aff", |
| "hr_HR.zip", "hr_HR.dic", "hr_HR.aff", |
| "hu_HU.zip", "hu_HU.dic", "hu_HU.aff", |
| "hu_HU_comb.zip", "hu_HU.dic", "hu_HU.aff", |
| "ia.zip", "ia.dic", "ia.aff", |
| "id_ID.zip", "id_ID.dic", "id_ID.aff", |
| "it_IT.zip", "it_IT.dic", "it_IT.aff", |
| "ku_TR.zip", "ku_TR.dic", "ku_TR.aff", |
| "la.zip", "la.dic", "la.aff", |
| "lt_LT.zip", "lt_LT.dic", "lt_LT.aff", |
| "lv_LV.zip", "lv_LV.dic", "lv_LV.aff", |
| "mg_MG.zip", "mg_MG.dic", "mg_MG.aff", |
| "mi_NZ.zip", "mi_NZ.dic", "mi_NZ.aff", |
| "mk_MK.zip", "mk_MK.dic", "mk_MK.aff", |
| "mos_BF.zip", "mos_BF.dic", "mos_BF.aff", |
| "mr_IN.zip", "mr_IN.dic", "mr_IN.aff", |
| "ms_MY.zip", "ms_MY.dic", "ms_MY.aff", |
| "nb_NO.zip", "nb_NO.dic", "nb_NO.aff", |
| "ne_NP.zip", "ne_NP.dic", "ne_NP.aff", |
| "nl_NL.zip", "nl_NL.dic", "nl_NL.aff", |
| "nl_med.zip", "nl_med.dic", "nl_med.aff", |
| "nn_NO.zip", "nn_NO.dic", "nn_NO.aff", |
| "nr_ZA.zip", "nr_ZA.dic", "nr_ZA.aff", |
| "ns_ZA.zip", "ns_ZA.dic", "ns_ZA.aff", |
| "ny_MW.zip", "ny_MW.dic", "ny_MW.aff", |
| "oc_FR.zip", "oc_FR.dic", "oc_FR.aff", |
| "pl_PL.zip", "pl_PL.dic", "pl_PL.aff", |
| "pt_BR.zip", "pt_BR.dic", "pt_BR.aff", |
| "pt_PT.zip", "pt_PT.dic", "pt_PT.aff", |
| "ro_RO.zip", "ro_RO.dic", "ro_RO.aff", |
| "ru_RU.zip", "ru_RU.dic", "ru_RU.aff", |
| "ru_RU_ye.zip", "ru_RU_ie.dic", "ru_RU_ie.aff", |
| "ru_RU_yo.zip", "ru_RU_yo.dic", "ru_RU_yo.aff", |
| "rw_RW.zip", "rw_RW.dic", "rw_RW.aff", |
| "sk_SK.zip", "sk_SK.dic", "sk_SK.aff", |
| "sl_SI.zip", "sl_SI.dic", "sl_SI.aff", |
| "sq_AL.zip", "sq_AL.dic", "sq_AL.aff", |
| "ss_ZA.zip", "ss_ZA.dic", "ss_ZA.aff", |
| "st_ZA.zip", "st_ZA.dic", "st_ZA.aff", |
| "sv_SE.zip", "sv_SE.dic", "sv_SE.aff", |
| "sw_KE.zip", "sw_KE.dic", "sw_KE.aff", |
| "tet_ID.zip", "tet_ID.dic", "tet_ID.aff", |
| "th_TH.zip", "th_TH.dic", "th_TH.aff", |
| "tl_PH.zip", "tl_PH.dic", "tl_PH.aff", |
| "tn_ZA.zip", "tn_ZA.dic", "tn_ZA.aff", |
| "ts_ZA.zip", "ts_ZA.dic", "ts_ZA.aff", |
| "uk_UA.zip", "uk_UA.dic", "uk_UA.aff", |
| "ve_ZA.zip", "ve_ZA.dic", "ve_ZA.aff", |
| "vi_VN.zip", "vi_VN.dic", "vi_VN.aff", |
| "xh_ZA.zip", "xh_ZA.dic", "xh_ZA.aff", |
| "zu_ZA.zip", "zu_ZA.dic", "zu_ZA.aff", |
| }; |
| |
| public void test() throws Exception { |
| Path tmp = LuceneTestCase.createTempDir(); |
| |
| for (int i = 0; i < tests.length; i += 3) { |
| Path f = DICTIONARY_HOME.resolve(tests[i]); |
| assert Files.exists(f); |
| |
| IOUtils.rm(tmp); |
| Files.createDirectory(tmp); |
| |
| try (InputStream in = Files.newInputStream(f); Directory tempDir = getDirectory()) { |
| TestUtil.unzip(in, tmp); |
| Path dicEntry = tmp.resolve(tests[i+1]); |
| Path affEntry = tmp.resolve(tests[i+2]); |
| |
| try (InputStream dictionary = Files.newInputStream(dicEntry); |
| InputStream affix = Files.newInputStream(affEntry)) { |
| Dictionary dic = new Dictionary(tempDir, "dictionary", affix, dictionary); |
| System.out.println(tests[i] + "\t" + RamUsageTester.humanSizeOf(dic) + "\t(" + |
| "words=" + RamUsageTester.humanSizeOf(dic.words) + ", " + |
| "flags=" + RamUsageTester.humanSizeOf(dic.flagLookup) + ", " + |
| "strips=" + RamUsageTester.humanSizeOf(dic.stripData) + ", " + |
| "conditions=" + RamUsageTester.humanSizeOf(dic.patterns) + ", " + |
| "affixData=" + RamUsageTester.humanSizeOf(dic.affixData) + ", " + |
| "prefixes=" + RamUsageTester.humanSizeOf(dic.prefixes) + ", " + |
| "suffixes=" + RamUsageTester.humanSizeOf(dic.suffixes) + ")"); |
| } |
| } |
| } |
| } |
| |
| public void testOneDictionary() throws Exception { |
| Path tmp = LuceneTestCase.createTempDir(); |
| |
| String toTest = "zu_ZA.zip"; |
| for (int i = 0; i < tests.length; i++) { |
| if (tests[i].equals(toTest)) { |
| Path f = DICTIONARY_HOME.resolve(tests[i]); |
| assert Files.exists(f); |
| |
| IOUtils.rm(tmp); |
| Files.createDirectory(tmp); |
| |
| try (InputStream in = Files.newInputStream(f)) { |
| TestUtil.unzip(in, tmp); |
| Path dicEntry = tmp.resolve(tests[i+1]); |
| Path affEntry = tmp.resolve(tests[i+2]); |
| |
| try (InputStream dictionary = Files.newInputStream(dicEntry); |
| InputStream affix = Files.newInputStream(affEntry); |
| Directory tempDir = getDirectory()) { |
| new Dictionary(tempDir, "dictionary", affix, dictionary); |
| } |
| } |
| } |
| } |
| } |
| |
| private Directory getDirectory() { |
| return newDirectory(); |
| } |
| } |