/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.hunspell;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.store.BaseDirectoryWrapper;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
import org.apache.lucene.util.NamedThreadFactory;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.RamUsageTester;
import org.junit.Assert;
import org.junit.Assume;
import org.junit.Ignore;

/**
 * Loads all dictionaries from the directory specified in the {@code hunspell.dictionaries}
 * system property and prints their memory usage. All *.aff files are traversed recursively
 * inside the given directory. Each *.aff file must have a same-named sibling *.dic file. For
 * examples of such directories, refer to the {@link org.apache.lucene.analysis.hunspell
 * package documentation}.
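 *
 * <p>For example, a run can be pointed at a local dictionary collection via the system
 * property (the path below is hypothetical):
 *
 * <pre>{@code
 *   -Dhunspell.dictionaries=/path/to/hunspell/dicts
 * }</pre>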
 */
@SuppressSysoutChecks(bugUrl = "prints important memory utilization stats per dictionary")
public class TestAllDictionaries extends LuceneTestCase {
  static Stream<Path> findAllAffixFiles() throws IOException {
    String dicDir = System.getProperty("hunspell.dictionaries");
    Assume.assumeFalse(
        "Requires Hunspell dictionaries at -Dhunspell.dictionaries=...", dicDir == null);
    Path dicPath = Paths.get(dicDir);
    return Files.walk(dicPath).filter(f -> f.toString().endsWith(".aff")).sorted();
  }
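
  // Illustrative layout of such a directory (file names hypothetical); every *.aff file is
  // expected to have a same-named *.dic sibling, which loadDictionary() below relies on:
  //
  //   dicts/en_US/en_US.aff
  //   dicts/en_US/en_US.dic
  //   dicts/de_DE/de_DE.aff
  //   dicts/de_DE/de_DE.dic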
  static Dictionary loadDictionary(Path aff) throws IOException, ParseException {
    String affPath = aff.toString();
    Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic");
    assert Files.exists(dic) : dic;
    try (InputStream dictionary = Files.newInputStream(dic);
        InputStream affix = Files.newInputStream(aff);
        BaseDirectoryWrapper tempDir = newDirectory()) {
      return new Dictionary(tempDir, "dictionary", affix, dictionary);
    }
  }

  /** Hack: a {@link ByteArrayInputStream} subclass exposing its current read position. */
  private static class ExposePosition extends ByteArrayInputStream {
    public ExposePosition(byte[] buf) {
      super(buf);
    }

    public long position() {
      return super.pos;
    }
  }

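  // A hunspell *.aff "prologue" declares the encoding and flag format near the top of the
  // file, e.g. (a hypothetical snippet, not taken from any particular dictionary):
  //
  //   SET UTF-8
  //   FLAG long
  //
  // The test below records the byte offset of every SET/FLAG directive and asserts that they
  // all fall within Dictionary.MAX_PROLOGUE_SCAN_WINDOW, so the parser never needs to scan
  // further to determine them.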
  @Ignore
  public void testMaxPrologueNeeded() throws Exception {
    AtomicBoolean failTest = new AtomicBoolean();

    Map<String, List<Long>> global = new LinkedHashMap<>();
    for (Path aff : findAllAffixFiles().collect(Collectors.toList())) {
      Map<String, List<Long>> local = new LinkedHashMap<>();
      ByteArrayOutputStream baos = new ByteArrayOutputStream();
      try (ExposePosition is = new ExposePosition(Files.readAllBytes(aff))) {
        // Accumulate the file byte by byte, so that is.position() reflects the offset right
        // after the line currently held in baos.
        int chr;
        while ((chr = is.read()) >= 0) {
          baos.write(chr);

          if (chr == '\n') {
            String line = baos.toString(StandardCharsets.ISO_8859_1);
            if (!line.isBlank()) {
              String firstWord = line.split("\\s")[0];
              switch (firstWord) {
                case "SET":
                case "FLAG":
                  // Record the end offset of each SET/FLAG directive, per file and globally.
                  local.computeIfAbsent(firstWord, (k) -> new ArrayList<>()).add(is.position());
                  global.computeIfAbsent(firstWord, (k) -> new ArrayList<>()).add(is.position());
                  break;
              }
            }

            baos.reset();
          }
        }
      }

      local.forEach(
          (flag, positions) -> {
            if (positions.size() > 1) {
              System.out.format(
                  Locale.ROOT,
                  "Flag %s at more than one position in %s: %s%n",
                  flag,
                  aff,
                  positions);
              failTest.set(true);
            }
          });
    }

    global.forEach(
        (flag, positions) -> {
          long max = positions.stream().mapToLong(v -> v).max().orElse(0);
          System.out.printf(Locale.ROOT, "Flag %s at maximum offset %s%n", flag, max);
          Assert.assertTrue(
              "Flags beyond max prologue scan window: " + max,
              max < Dictionary.MAX_PROLOGUE_SCAN_WINDOW);
        });

    if (failTest.get()) {
      throw new AssertionError(
          "Duplicate SET/FLAG directives were present in at least one .aff file.");
    }
  }

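  /**
   * Loads every dictionary found by {@link #findAllAffixFiles()} on a thread pool sized to the
   * available processors. Failures are collected rather than thrown immediately, so a single
   * run reports every dictionary that fails to parse.
   */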
  public void testDictionariesLoadSuccessfully() throws Exception {
    AtomicLong totalMemory = new AtomicLong();
    AtomicLong totalWords = new AtomicLong();
    int threads = Runtime.getRuntime().availableProcessors();
    ExecutorService executor =
        Executors.newFixedThreadPool(threads, new NamedThreadFactory("dictCheck-"));
    List<Path> failures = Collections.synchronizedList(new ArrayList<>());
    Function<Path, Void> process =
        (Path aff) -> {
          try {
            Dictionary dic = loadDictionary(aff);
            totalMemory.addAndGet(RamUsageTester.sizeOf(dic));
            totalWords.addAndGet(
                RamUsageTester.sizeOf(dic.words) + RamUsageTester.sizeOf(dic.wordHashes));
            System.out.println(aff + "\t" + memoryUsageSummary(dic));
          } catch (Throwable e) {
            failures.add(aff);
            System.err.println("While checking " + aff + ":");
            e.printStackTrace();
          }
          return null;
        };

    List<Callable<Void>> tasks =
        findAllAffixFiles()
            .map(aff -> (Callable<Void>) () -> process.apply(aff))
            .collect(Collectors.toList());
    try {
      for (Future<?> future : executor.invokeAll(tasks)) {
        future.get();
      }

      if (!failures.isEmpty()) {
        throw new AssertionError(
            "Certain dictionaries failed to parse:\n - "
                + failures.stream()
                    .map(path -> path.toAbsolutePath().toString())
                    .collect(Collectors.joining("\n - ")));
      }
    } finally {
      executor.shutdown();
      assertTrue(executor.awaitTermination(1, TimeUnit.MINUTES));
    }

    System.out.println("Total dictionaries loaded: " + tasks.size());
    System.out.println("Total memory: " + RamUsageEstimator.humanReadableUnits(totalMemory.get()));
    System.out.println(
        "Total memory for word storage: " + RamUsageEstimator.humanReadableUnits(totalWords.get()));
  }

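  // Formats one tab-separated summary per dictionary, printed by the test above. The shape of
  // a line is sketched here with hypothetical numbers:
  //
  //   /dicts/en_US/en_US.aff  1.9 MB  (words=1.2 MB, flags=64 KB, strips=8 KB, ...)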
  private static String memoryUsageSummary(Dictionary dic) {
    return RamUsageTester.humanSizeOf(dic)
        + "\t("
        + ("words=" + RamUsageTester.humanSizeOf(dic.words) + ", ")
        + ("flags=" + RamUsageTester.humanSizeOf(dic.flagLookup) + ", ")
        + ("strips=" + RamUsageTester.humanSizeOf(dic.stripData) + ", ")
        + ("conditions=" + RamUsageTester.humanSizeOf(dic.patterns) + ", ")
        + ("affixData=" + RamUsageTester.humanSizeOf(dic.affixData) + ", ")
        + ("morphData=" + RamUsageTester.humanSizeOf(dic.morphData) + ", ")
        + ("prefixes=" + RamUsageTester.humanSizeOf(dic.prefixes) + ", ")
        + ("suffixes=" + RamUsageTester.humanSizeOf(dic.suffixes) + ")");
  }
}