| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.ja.dict; |
| |
| |
| import java.io.BufferedInputStream; |
| import java.io.EOFException; |
| import java.io.IOException; |
| import java.io.FileNotFoundException; |
| import java.io.InputStream; |
| import java.nio.ByteBuffer; |
| import java.nio.file.Files; |
| import java.nio.file.Paths; |
| import java.nio.channels.Channels; |
| import java.nio.channels.ReadableByteChannel; |
| |
| import org.apache.lucene.codecs.CodecUtil; |
| import org.apache.lucene.store.DataInput; |
| import org.apache.lucene.store.InputStreamDataInput; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.IOUtils; |
| |
| /** |
| * Base class for a binary-encoded in-memory dictionary. |
| */ |
| public abstract class BinaryDictionary implements Dictionary { |
| |
| /** |
| * Used to specify where (dictionary) resources get loaded from. |
| */ |
| public enum ResourceScheme { |
| CLASSPATH, FILE |
| } |
| |
| public static final String DICT_FILENAME_SUFFIX = "$buffer.dat"; |
| public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat"; |
| public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat"; |
| |
| public static final String DICT_HEADER = "kuromoji_dict"; |
| public static final String TARGETMAP_HEADER = "kuromoji_dict_map"; |
| public static final String POSDICT_HEADER = "kuromoji_dict_pos"; |
| public static final int VERSION = 1; |
| |
| private final ResourceScheme resourceScheme; |
| private final String resourcePath; |
| private final ByteBuffer buffer; |
| private final int[] targetMapOffsets, targetMap; |
| private final String[] posDict; |
| private final String[] inflTypeDict; |
| private final String[] inflFormDict; |
| |
| protected BinaryDictionary() throws IOException { |
| this(ResourceScheme.CLASSPATH, null); |
| } |
| |
| /** |
| * @param resourceScheme - scheme for loading resources (FILE or CLASSPATH). |
| * @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH scheme only, use |
| * this class's name as the path. |
| */ |
| protected BinaryDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException { |
| this.resourceScheme = resourceScheme; |
| if (resourcePath == null) { |
| if (resourceScheme != ResourceScheme.CLASSPATH) { |
| throw new IllegalArgumentException("resourcePath must be supplied with FILE resource scheme"); |
| } |
| this.resourcePath = getClass().getName().replace('.', '/'); |
| } else { |
| this.resourcePath = resourcePath; |
| } |
| InputStream mapIS = null, dictIS = null, posIS = null; |
| int[] targetMapOffsets = null, targetMap = null; |
| String[] posDict = null; |
| String[] inflFormDict = null; |
| String[] inflTypeDict = null; |
| ByteBuffer buffer = null; |
| boolean success = false; |
| try { |
| mapIS = getResource(TARGETMAP_FILENAME_SUFFIX); |
| mapIS = new BufferedInputStream(mapIS); |
| DataInput in = new InputStreamDataInput(mapIS); |
| CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION); |
| targetMap = new int[in.readVInt()]; |
| targetMapOffsets = new int[in.readVInt()]; |
| int accum = 0, sourceId = 0; |
| for (int ofs = 0; ofs < targetMap.length; ofs++) { |
| final int val = in.readVInt(); |
| if ((val & 0x01) != 0) { |
| targetMapOffsets[sourceId] = ofs; |
| sourceId++; |
| } |
| accum += val >>> 1; |
| targetMap[ofs] = accum; |
| } |
| if (sourceId + 1 != targetMapOffsets.length) |
| throw new IOException("targetMap file format broken; targetMap.length=" + targetMap.length |
| + ", targetMapOffsets.length=" + targetMapOffsets.length |
| + ", sourceId=" + sourceId); |
| targetMapOffsets[sourceId] = targetMap.length; |
| mapIS.close(); mapIS = null; |
| |
| posIS = getResource(POSDICT_FILENAME_SUFFIX); |
| posIS = new BufferedInputStream(posIS); |
| in = new InputStreamDataInput(posIS); |
| CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION); |
| int posSize = in.readVInt(); |
| posDict = new String[posSize]; |
| inflTypeDict = new String[posSize]; |
| inflFormDict = new String[posSize]; |
| for (int j = 0; j < posSize; j++) { |
| posDict[j] = in.readString(); |
| inflTypeDict[j] = in.readString(); |
| inflFormDict[j] = in.readString(); |
| // this is how we encode null inflections |
| if (inflTypeDict[j].length() == 0) { |
| inflTypeDict[j] = null; |
| } |
| if (inflFormDict[j].length() == 0) { |
| inflFormDict[j] = null; |
| } |
| } |
| posIS.close(); posIS = null; |
| |
| dictIS = getResource(DICT_FILENAME_SUFFIX); |
| // no buffering here, as we load in one large buffer |
| in = new InputStreamDataInput(dictIS); |
| CodecUtil.checkHeader(in, DICT_HEADER, VERSION, VERSION); |
| final int size = in.readVInt(); |
| final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size); |
| final ReadableByteChannel channel = Channels.newChannel(dictIS); |
| final int read = channel.read(tmpBuffer); |
| if (read != size) { |
| throw new EOFException("Cannot read whole dictionary"); |
| } |
| dictIS.close(); dictIS = null; |
| buffer = tmpBuffer.asReadOnlyBuffer(); |
| success = true; |
| } finally { |
| if (success) { |
| IOUtils.close(mapIS, posIS, dictIS); |
| } else { |
| IOUtils.closeWhileHandlingException(mapIS, posIS, dictIS); |
| } |
| } |
| |
| this.targetMap = targetMap; |
| this.targetMapOffsets = targetMapOffsets; |
| this.posDict = posDict; |
| this.inflTypeDict = inflTypeDict; |
| this.inflFormDict = inflFormDict; |
| this.buffer = buffer; |
| } |
| |
| protected final InputStream getResource(String suffix) throws IOException { |
| switch(resourceScheme) { |
| case CLASSPATH: |
| return getClassResource(resourcePath + suffix); |
| case FILE: |
| return Files.newInputStream(Paths.get(resourcePath + suffix)); |
| default: |
| throw new IllegalStateException("unknown resource scheme " + resourceScheme); |
| } |
| } |
| |
| public static final InputStream getResource(ResourceScheme scheme, String path) throws IOException { |
| switch(scheme) { |
| case CLASSPATH: |
| return getClassResource(path); |
| case FILE: |
| return Files.newInputStream(Paths.get(path)); |
| default: |
| throw new IllegalStateException("unknown resource scheme " + scheme); |
| } |
| } |
| |
| // util, reused by ConnectionCosts and CharacterDefinition |
| public static final InputStream getClassResource(Class<?> clazz, String suffix) throws IOException { |
| final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix); |
| if (is == null) { |
| throw new FileNotFoundException("Not in classpath: " + clazz.getName().replace('.','/') + suffix); |
| } |
| return is; |
| } |
| |
| private static InputStream getClassResource(String path) throws IOException { |
| final InputStream is = BinaryDictionary.class.getClassLoader().getResourceAsStream(path); |
| if (is == null) { |
| throw new FileNotFoundException("Not in classpath: " + path); |
| } |
| return is; |
| } |
| |
| public void lookupWordIds(int sourceId, IntsRef ref) { |
| ref.ints = targetMap; |
| ref.offset = targetMapOffsets[sourceId]; |
| // targetMapOffsets always has one more entry pointing behind last: |
| ref.length = targetMapOffsets[sourceId + 1] - ref.offset; |
| } |
| |
| @Override |
| public int getLeftId(int wordId) { |
| return (buffer.getShort(wordId) & 0xffff) >>> 3; |
| } |
| |
| @Override |
| public int getRightId(int wordId) { |
| return (buffer.getShort(wordId) & 0xffff) >>> 3; |
| } |
| |
| @Override |
| public int getWordCost(int wordId) { |
| return buffer.getShort(wordId + 2); // Skip id |
| } |
| |
| @Override |
| public String getBaseForm(int wordId, char surfaceForm[], int off, int len) { |
| if (hasBaseFormData(wordId)) { |
| int offset = baseFormOffset(wordId); |
| int data = buffer.get(offset++) & 0xff; |
| int prefix = data >>> 4; |
| int suffix = data & 0xF; |
| char text[] = new char[prefix+suffix]; |
| System.arraycopy(surfaceForm, off, text, 0, prefix); |
| for (int i = 0; i < suffix; i++) { |
| text[prefix+i] = buffer.getChar(offset + (i << 1)); |
| } |
| return new String(text); |
| } else { |
| return null; |
| } |
| } |
| |
| @Override |
| public String getReading(int wordId, char surface[], int off, int len) { |
| if (hasReadingData(wordId)) { |
| int offset = readingOffset(wordId); |
| int readingData = buffer.get(offset++) & 0xff; |
| return readString(offset, readingData >>> 1, (readingData & 1) == 1); |
| } else { |
| // the reading is the surface form, with hiragana shifted to katakana |
| char text[] = new char[len]; |
| for (int i = 0; i < len; i++) { |
| char ch = surface[off+i]; |
| if (ch > 0x3040 && ch < 0x3097) { |
| text[i] = (char)(ch + 0x60); |
| } else { |
| text[i] = ch; |
| } |
| } |
| return new String(text); |
| } |
| } |
| |
| @Override |
| public String getPartOfSpeech(int wordId) { |
| return posDict[getLeftId(wordId)]; |
| } |
| |
| @Override |
| public String getPronunciation(int wordId, char surface[], int off, int len) { |
| if (hasPronunciationData(wordId)) { |
| int offset = pronunciationOffset(wordId); |
| int pronunciationData = buffer.get(offset++) & 0xff; |
| return readString(offset, pronunciationData >>> 1, (pronunciationData & 1) == 1); |
| } else { |
| return getReading(wordId, surface, off, len); // same as the reading |
| } |
| } |
| |
| @Override |
| public String getInflectionType(int wordId) { |
| return inflTypeDict[getLeftId(wordId)]; |
| } |
| |
| @Override |
| public String getInflectionForm(int wordId) { |
| return inflFormDict[getLeftId(wordId)]; |
| } |
| |
| private static int baseFormOffset(int wordId) { |
| return wordId + 4; |
| } |
| |
| private int readingOffset(int wordId) { |
| int offset = baseFormOffset(wordId); |
| if (hasBaseFormData(wordId)) { |
| int baseFormLength = buffer.get(offset++) & 0xf; |
| return offset + (baseFormLength << 1); |
| } else { |
| return offset; |
| } |
| } |
| |
| private int pronunciationOffset(int wordId) { |
| if (hasReadingData(wordId)) { |
| int offset = readingOffset(wordId); |
| int readingData = buffer.get(offset++) & 0xff; |
| final int readingLength; |
| if ((readingData & 1) == 0) { |
| readingLength = readingData & 0xfe; // UTF-16: mask off kana bit |
| } else { |
| readingLength = readingData >>> 1; |
| } |
| return offset + readingLength; |
| } else { |
| return readingOffset(wordId); |
| } |
| } |
| |
| private boolean hasBaseFormData(int wordId) { |
| return (buffer.getShort(wordId) & HAS_BASEFORM) != 0; |
| } |
| |
| private boolean hasReadingData(int wordId) { |
| return (buffer.getShort(wordId) & HAS_READING) != 0; |
| } |
| |
| private boolean hasPronunciationData(int wordId) { |
| return (buffer.getShort(wordId) & HAS_PRONUNCIATION) != 0; |
| } |
| |
| private String readString(int offset, int length, boolean kana) { |
| char text[] = new char[length]; |
| if (kana) { |
| for (int i = 0; i < length; i++) { |
| text[i] = (char) (0x30A0 + (buffer.get(offset + i) & 0xff)); |
| } |
| } else { |
| for (int i = 0; i < length; i++) { |
| text[i] = buffer.getChar(offset + (i << 1)); |
| } |
| } |
| return new String(text); |
| } |
| |
| /** flag that the entry has baseform data. otherwise it's not inflected (same as surface form) */ |
| public static final int HAS_BASEFORM = 1; |
| /** flag that the entry has reading data. otherwise reading is surface form converted to katakana */ |
| public static final int HAS_READING = 2; |
| /** flag that the entry has pronunciation data. otherwise pronunciation is the reading */ |
| public static final int HAS_PRONUNCIATION = 4; |
| } |