| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.ko.dict; |
| |
| import java.io.BufferedInputStream; |
| import java.io.EOFException; |
| import java.io.FileNotFoundException; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.nio.ByteBuffer; |
| import java.nio.channels.Channels; |
| import java.nio.channels.ReadableByteChannel; |
| import java.nio.file.Files; |
| import java.nio.file.Paths; |
| import org.apache.lucene.analysis.ko.POS; |
| import org.apache.lucene.codecs.CodecUtil; |
| import org.apache.lucene.store.DataInput; |
| import org.apache.lucene.store.InputStreamDataInput; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.IntsRef; |
| |
| /** Base class for a binary-encoded in-memory dictionary. */ |
| public abstract class BinaryDictionary implements Dictionary { |
| |
| /** Used to specify where (dictionary) resources get loaded from. */ |
| public enum ResourceScheme { |
| CLASSPATH, |
| FILE |
| } |
| |
| public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat"; |
| public static final String DICT_FILENAME_SUFFIX = "$buffer.dat"; |
| public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat"; |
| |
| public static final String DICT_HEADER = "ko_dict"; |
| public static final String TARGETMAP_HEADER = "ko_dict_map"; |
| public static final String POSDICT_HEADER = "ko_dict_pos"; |
| public static final int VERSION = 1; |
| |
| private final ResourceScheme resourceScheme; |
| private final String resourcePath; |
| private final ByteBuffer buffer; |
| private final int[] targetMapOffsets, targetMap; |
| private final POS.Tag[] posDict; |
| |
| protected BinaryDictionary() throws IOException { |
| this(ResourceScheme.CLASSPATH, null); |
| } |
| |
| /** |
| * @param resourceScheme - scheme for loading resources (FILE or CLASSPATH). |
| * @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH |
| * scheme only, use this class's name as the path. |
| */ |
| protected BinaryDictionary(ResourceScheme resourceScheme, String resourcePath) |
| throws IOException { |
| this.resourceScheme = resourceScheme; |
| if (resourcePath == null) { |
| if (resourceScheme != ResourceScheme.CLASSPATH) { |
| throw new IllegalArgumentException( |
| "resourcePath must be supplied with FILE resource scheme"); |
| } |
| this.resourcePath = getClass().getName().replace('.', '/'); |
| } else { |
| this.resourcePath = resourcePath; |
| } |
| InputStream mapIS = null, dictIS = null, posIS = null; |
| int[] targetMapOffsets, targetMap; |
| ByteBuffer buffer; |
| try { |
| mapIS = getResource(TARGETMAP_FILENAME_SUFFIX); |
| mapIS = new BufferedInputStream(mapIS); |
| DataInput in = new InputStreamDataInput(mapIS); |
| CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION); |
| targetMap = new int[in.readVInt()]; |
| targetMapOffsets = new int[in.readVInt()]; |
| int accum = 0, sourceId = 0; |
| for (int ofs = 0; ofs < targetMap.length; ofs++) { |
| final int val = in.readVInt(); |
| if ((val & 0x01) != 0) { |
| targetMapOffsets[sourceId] = ofs; |
| sourceId++; |
| } |
| accum += val >>> 1; |
| targetMap[ofs] = accum; |
| } |
| if (sourceId + 1 != targetMapOffsets.length) |
| throw new IOException( |
| "targetMap file format broken; targetMap.length=" |
| + targetMap.length |
| + ", targetMapOffsets.length=" |
| + targetMapOffsets.length |
| + ", sourceId=" |
| + sourceId); |
| targetMapOffsets[sourceId] = targetMap.length; |
| mapIS.close(); |
| mapIS = null; |
| |
| posIS = getResource(POSDICT_FILENAME_SUFFIX); |
| posIS = new BufferedInputStream(posIS); |
| in = new InputStreamDataInput(posIS); |
| CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION); |
| int posSize = in.readVInt(); |
| posDict = new POS.Tag[posSize]; |
| for (int j = 0; j < posSize; j++) { |
| posDict[j] = POS.resolveTag(in.readByte()); |
| } |
| posIS.close(); |
| posIS = null; |
| |
| dictIS = getResource(DICT_FILENAME_SUFFIX); |
| // no buffering here, as we load in one large buffer |
| in = new InputStreamDataInput(dictIS); |
| CodecUtil.checkHeader(in, DICT_HEADER, VERSION, VERSION); |
| final int size = in.readVInt(); |
| final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size); |
| final ReadableByteChannel channel = Channels.newChannel(dictIS); |
| final int read = channel.read(tmpBuffer); |
| if (read != size) { |
| throw new EOFException("Cannot read whole dictionary"); |
| } |
| dictIS.close(); |
| dictIS = null; |
| buffer = tmpBuffer.asReadOnlyBuffer(); |
| } finally { |
| IOUtils.closeWhileHandlingException(mapIS, posIS, dictIS); |
| } |
| |
| this.targetMap = targetMap; |
| this.targetMapOffsets = targetMapOffsets; |
| this.buffer = buffer; |
| } |
| |
| protected final InputStream getResource(String suffix) throws IOException { |
| switch (resourceScheme) { |
| case CLASSPATH: |
| return getClassResource(resourcePath + suffix); |
| case FILE: |
| return Files.newInputStream(Paths.get(resourcePath + suffix)); |
| default: |
| throw new IllegalStateException("unknown resource scheme " + resourceScheme); |
| } |
| } |
| |
| public static InputStream getResource(ResourceScheme scheme, String path) throws IOException { |
| switch (scheme) { |
| case CLASSPATH: |
| return getClassResource(path); |
| case FILE: |
| return Files.newInputStream(Paths.get(path)); |
| default: |
| throw new IllegalStateException("unknown resource scheme " + scheme); |
| } |
| } |
| |
| // util, reused by ConnectionCosts and CharacterDefinition |
| public static InputStream getClassResource(Class<?> clazz, String suffix) throws IOException { |
| final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix); |
| if (is == null) { |
| throw new FileNotFoundException( |
| "Not in classpath: " + clazz.getName().replace('.', '/') + suffix); |
| } |
| return is; |
| } |
| |
| private static InputStream getClassResource(String path) throws IOException { |
| final InputStream is = BinaryDictionary.class.getClassLoader().getResourceAsStream(path); |
| if (is == null) { |
| throw new FileNotFoundException("Not in classpath: " + path); |
| } |
| return is; |
| } |
| |
| public void lookupWordIds(int sourceId, IntsRef ref) { |
| ref.ints = targetMap; |
| ref.offset = targetMapOffsets[sourceId]; |
| // targetMapOffsets always has one more entry pointing behind last: |
| ref.length = targetMapOffsets[sourceId + 1] - ref.offset; |
| } |
| |
| @Override |
| public int getLeftId(int wordId) { |
| return buffer.getShort(wordId) >>> 2; |
| } |
| |
| @Override |
| public int getRightId(int wordId) { |
| return buffer.getShort(wordId + 2) >>> 2; // Skip left id |
| } |
| |
| @Override |
| public int getWordCost(int wordId) { |
| return buffer.getShort(wordId + 4); // Skip left and right id |
| } |
| |
| @Override |
| public POS.Type getPOSType(int wordId) { |
| byte value = (byte) (buffer.getShort(wordId) & 3); |
| return POS.resolveType(value); |
| } |
| |
| @Override |
| public POS.Tag getLeftPOS(int wordId) { |
| return posDict[getLeftId(wordId)]; |
| } |
| |
| @Override |
| public POS.Tag getRightPOS(int wordId) { |
| POS.Type type = getPOSType(wordId); |
| if (type == POS.Type.MORPHEME || type == POS.Type.COMPOUND || hasSinglePOS(wordId)) { |
| return getLeftPOS(wordId); |
| } else { |
| byte value = buffer.get(wordId + 6); |
| return POS.resolveTag(value); |
| } |
| } |
| |
| @Override |
| public String getReading(int wordId) { |
| if (hasReadingData(wordId)) { |
| int offset = wordId + 6; |
| return readString(offset); |
| } |
| return null; |
| } |
| |
| @Override |
| public Morpheme[] getMorphemes(int wordId, char[] surfaceForm, int off, int len) { |
| POS.Type posType = getPOSType(wordId); |
| if (posType == POS.Type.MORPHEME) { |
| return null; |
| } |
| int offset = wordId + 6; |
| boolean hasSinglePos = hasSinglePOS(wordId); |
| if (hasSinglePos == false) { |
| offset++; // skip rightPOS |
| } |
| int length = buffer.get(offset++); |
| if (length == 0) { |
| return null; |
| } |
| Morpheme[] morphemes = new Morpheme[length]; |
| int surfaceOffset = 0; |
| final POS.Tag leftPOS = getLeftPOS(wordId); |
| for (int i = 0; i < length; i++) { |
| final String form; |
| final POS.Tag tag = hasSinglePos ? leftPOS : POS.resolveTag(buffer.get(offset++)); |
| if (posType == POS.Type.INFLECT) { |
| form = readString(offset); |
| offset += form.length() * 2 + 1; |
| } else { |
| int formLen = buffer.get(offset++); |
| form = new String(surfaceForm, off + surfaceOffset, formLen); |
| surfaceOffset += formLen; |
| } |
| morphemes[i] = new Morpheme(tag, form); |
| } |
| return morphemes; |
| } |
| |
| private String readString(int offset) { |
| int strOffset = offset; |
| int len = buffer.get(strOffset++); |
| char[] text = new char[len]; |
| for (int i = 0; i < len; i++) { |
| text[i] = buffer.getChar(strOffset + (i << 1)); |
| } |
| return new String(text); |
| } |
| |
| private boolean hasSinglePOS(int wordId) { |
| return (buffer.getShort(wordId + 2) & HAS_SINGLE_POS) != 0; |
| } |
| |
| private boolean hasReadingData(int wordId) { |
| return (buffer.getShort(wordId + 2) & HAS_READING) != 0; |
| } |
| |
| /** flag that the entry has a single part of speech (leftPOS) */ |
| public static final int HAS_SINGLE_POS = 1; |
| |
| /** flag that the entry has reading data. otherwise reading is surface form */ |
| public static final int HAS_READING = 2; |
| } |