| using J2N; |
| using J2N.IO; |
| using J2N.Numerics; |
| using Lucene.Net.Codecs; |
| using Lucene.Net.Store; |
| using Lucene.Net.Util; |
| using System; |
| using System.IO; |
| using System.Security; |
| |
| namespace Lucene.Net.Analysis.Ja.Dict |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
/// <summary>
/// Base class for a binary-encoded in-memory dictionary.
/// <para/>
/// NOTE: To use an alternate dictionary than the built-in one, put the data files in a subdirectory of
/// your application named "kuromoji-data". This subdirectory
/// can be placed in any directory up to and including the root directory (if the OS permission allows).
/// To start the search from an alternate location, set the "kuromoji:data:dir" property
/// (looked up through <c>SystemProperties.GetProperty</c>) to the directory that contains
/// (or whose ancestors contain) the "kuromoji-data" subdirectory.
/// If no such subdirectory is found, the dictionary files embedded in the assembly are used.
/// </summary>
public abstract class BinaryDictionary : IDictionary
{
    /// <summary>Suffix appended to the type name to form the word-entry buffer file name.</summary>
    public static readonly string DICT_FILENAME_SUFFIX = "$buffer.dat";
    /// <summary>Suffix appended to the type name to form the target map file name.</summary>
    public static readonly string TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
    /// <summary>Suffix appended to the type name to form the part-of-speech dictionary file name.</summary>
    public static readonly string POSDICT_FILENAME_SUFFIX = "$posDict.dat";

    /// <summary>Codec header name checked at the start of the word-entry buffer file.</summary>
    public static readonly string DICT_HEADER = "kuromoji_dict";
    /// <summary>Codec header name checked at the start of the target map file.</summary>
    public static readonly string TARGETMAP_HEADER = "kuromoji_dict_map";
    /// <summary>Codec header name checked at the start of the part-of-speech dictionary file.</summary>
    public static readonly string POSDICT_HEADER = "kuromoji_dict_pos";
    /// <summary>The only file format version accepted (used as both minimum and maximum when checking headers).</summary>
    public static readonly int VERSION = 1;

    // Word entries, addressed by byte offset (the "wordId").
    private readonly ByteBuffer buffer;
    // targetMap holds word ids; targetMapOffsets[sourceId] .. targetMapOffsets[sourceId + 1] delimits
    // the slice of targetMap that belongs to a given source id.
    private readonly int[] targetMapOffsets, targetMap;
    // The three arrays below are parallel and indexed by the id returned from GetLeftId().
    private readonly string[] posDict;
    private readonly string[] inflTypeDict;
    private readonly string[] inflFormDict;

    // LUCENENET specific - variable to hold the name of the data directory (or empty string to load embedded resources)
    private static readonly string DATA_DIR = LoadDataDir();
    // LUCENENET specific - name of the subdirectory inside of the directory where the Kuromoji dictionary files reside.
    private const string DATA_SUBDIR = "kuromoji-data";

    /// <summary>
    /// Locates the "kuromoji-data" directory by probing the configured start directory
    /// (the "kuromoji:data:dir" property, defaulting to the application base directory)
    /// and then each of its ancestors in turn.
    /// </summary>
    /// <returns>
    /// The full path of the first existing "kuromoji-data" directory, or <c>null</c> when none
    /// is found (the signal to load the dictionary from embedded resources).
    /// </returns>
    private static string LoadDataDir()
    {
        // LUCENENET specific - reformatted with :, renamed from "analysis.data.dir"
        string currentPath = SystemProperties.GetProperty("kuromoji:data:dir",
#if FEATURE_APPDOMAIN_BASEDIRECTORY
            AppDomain.CurrentDomain.BaseDirectory
#else
            System.AppContext.BaseDirectory
#endif
            );

        // If a matching directory path is found, it becomes our DATA_DIR static
        // variable. If it is null or empty after this process, we need to
        // load the embedded files.
        string candidatePath = System.IO.Path.Combine(currentPath, DATA_SUBDIR);
        if (System.IO.Directory.Exists(candidatePath))
        {
            return candidatePath;
        }

        try
        {
            // Walk up one ancestor at a time. The whole walk sits inside the try block so that
            // a SecurityException ends the search instead of retrying the same directory forever.
            for (DirectoryInfo parent = new DirectoryInfo(currentPath).Parent; parent != null; parent = parent.Parent)
            {
                candidatePath = System.IO.Path.Combine(parent.FullName, DATA_SUBDIR);
                if (System.IO.Directory.Exists(candidatePath))
                {
                    return candidatePath;
                }
            }
        }
        catch (SecurityException)
        {
            // ignore security errors - we cannot look any further up, so fall back to the embedded resources
        }

        return null; // This is the signal to load from local resources
    }

    /// <summary>
    /// Loads the target map, the part-of-speech dictionary and the word-entry buffer
    /// for the concrete subclass (resources are named after the runtime type).
    /// </summary>
    /// <exception cref="IOException">The target map file is inconsistent.</exception>
    /// <exception cref="EndOfStreamException">The word-entry buffer file is shorter than its declared size.</exception>
    /// <exception cref="FileNotFoundException">A required dictionary file cannot be located.</exception>
    protected BinaryDictionary()
    {
        int[] targetMapOffsets, targetMap;
        string[] posDict;
        string[] inflFormDict;
        string[] inflTypeDict;

        using (Stream mapIS = GetResource(TARGETMAP_FILENAME_SUFFIX))
        {
            DataInput @in = new InputStreamDataInput(mapIS);
            CodecUtil.CheckHeader(@in, TARGETMAP_HEADER, VERSION, VERSION);
            targetMap = new int[@in.ReadVInt32()];
            targetMapOffsets = new int[@in.ReadVInt32()];
            int accum = 0, sourceId = 0;
            for (int ofs = 0; ofs < targetMap.Length; ofs++)
            {
                int val = @in.ReadVInt32();
                // The low bit marks the first entry of a new source id ...
                if ((val & 0x01) != 0)
                {
                    targetMapOffsets[sourceId] = ofs;
                    sourceId++;
                }
                // ... and the remaining bits are a delta added to the previous value.
                accum += val.TripleShift(1);
                targetMap[ofs] = accum;
            }
            if (sourceId + 1 != targetMapOffsets.Length)
            {
                throw new IOException("targetMap file format broken");
            }
            // Sentinel entry pointing just behind the last slice (see LookupWordIds).
            targetMapOffsets[sourceId] = targetMap.Length;
        }

        using (Stream posIS = GetResource(POSDICT_FILENAME_SUFFIX))
        {
            DataInput @in = new InputStreamDataInput(posIS);
            CodecUtil.CheckHeader(@in, POSDICT_HEADER, VERSION, VERSION);
            int posSize = @in.ReadVInt32();
            posDict = new string[posSize];
            inflTypeDict = new string[posSize];
            inflFormDict = new string[posSize];
            for (int j = 0; j < posSize; j++)
            {
                posDict[j] = @in.ReadString();
                inflTypeDict[j] = @in.ReadString();
                inflFormDict[j] = @in.ReadString();
                // this is how we encode null inflections
                if (inflTypeDict[j].Length == 0)
                {
                    inflTypeDict[j] = null;
                }
                if (inflFormDict[j].Length == 0)
                {
                    inflFormDict[j] = null;
                }
            }
        }

        ByteBuffer tmpBuffer;

        using (Stream dictIS = GetResource(DICT_FILENAME_SUFFIX))
        {
            // no buffering here, as we load in one large buffer
            DataInput @in = new InputStreamDataInput(dictIS);
            CodecUtil.CheckHeader(@in, DICT_HEADER, VERSION, VERSION);
            int size = @in.ReadVInt32();
            tmpBuffer = ByteBuffer.Allocate(size); // AllocateDirect..?
            // Stream.Read may return fewer bytes than requested, so keep reading until full.
            ReadFully(dictIS, tmpBuffer.Array, size);
        }

        this.targetMap = targetMap;
        this.targetMapOffsets = targetMapOffsets;
        this.posDict = posDict;
        this.inflTypeDict = inflTypeDict;
        this.inflFormDict = inflFormDict;
        this.buffer = tmpBuffer.AsReadOnlyBuffer();
    }

    /// <summary>
    /// Reads exactly <paramref name="count"/> bytes from <paramref name="input"/> into the start of
    /// <paramref name="destination"/>, issuing as many reads as necessary.
    /// </summary>
    /// <exception cref="EndOfStreamException">The stream ended before <paramref name="count"/> bytes were read.</exception>
    private static void ReadFully(Stream input, byte[] destination, int count)
    {
        int total = 0;
        while (total < count)
        {
            int read = input.Read(destination, total, count - total);
            if (read <= 0)
            {
                throw new EndOfStreamException("Cannot read whole dictionary");
            }
            total += read;
        }
    }

    /// <summary>
    /// Opens the dictionary resource for the runtime type of this instance.
    /// </summary>
    /// <param name="suffix">File name suffix, e.g. <see cref="DICT_FILENAME_SUFFIX"/>.</param>
    /// <returns>A readable stream; the caller is responsible for disposing it.</returns>
    protected Stream GetResource(string suffix)
    {
        return GetTypeResource(GetType(), suffix);
    }

    // util, reused by ConnectionCosts and CharacterDefinition
    /// <summary>
    /// Opens the resource named <c>clazz.Name + suffix</c>, either from the external
    /// "kuromoji-data" directory (when one was found) or from the resources embedded in the assembly.
    /// </summary>
    /// <param name="clazz">Type whose simple name forms the first part of the file name.</param>
    /// <param name="suffix">File name suffix.</param>
    /// <returns>A readable stream; the caller is responsible for disposing it.</returns>
    /// <exception cref="FileNotFoundException">
    /// The resource is not embedded in the assembly, or a data directory exists but does not contain the file.
    /// </exception>
    public static Stream GetTypeResource(Type clazz, string suffix)
    {
        string fileName = clazz.Name + suffix;

        // LUCENENET specific: Rather than forcing the end user to recompile if they want to use a custom dictionary,
        // we load the data from the kuromoji-data directory (whose search location can be set via the "kuromoji:data:dir" property).
        if (string.IsNullOrEmpty(DATA_DIR))
        {
            Stream @is = clazz.FindAndGetManifestResourceStream(fileName);
            if (@is == null)
            {
                throw new FileNotFoundException("Not in assembly: " + clazz.FullName + suffix);
            }
            return @is;
        }

        // We have a data directory, so first check if the file exists
        string path = System.IO.Path.Combine(DATA_DIR, fileName);
        if (!File.Exists(path))
        {
            // Placeholders: {0} = file name, {1} = resolved data directory, {2} = data subdirectory name.
            throw new FileNotFoundException(string.Format("Expected file '{0}' not found. " +
                "If the '{1}' directory exists, this file is required. " +
                "Either remove the '{2}' directory or generate the required dictionary files using the lucene-cli tool.",
                fileName, DATA_DIR, DATA_SUBDIR));
        }

        // The file exists - open a stream.
        return new FileStream(path, FileMode.Open, FileAccess.Read);
    }

    /// <summary>
    /// Points <paramref name="ref"/> at the slice of the shared target map that holds
    /// the word ids for <paramref name="sourceId"/>. No data is copied.
    /// </summary>
    /// <param name="sourceId">Index into the target map offsets.</param>
    /// <param name="ref">Reference that is updated in place (array, offset and length).</param>
    public virtual void LookupWordIds(int sourceId, Int32sRef @ref)
    {
        @ref.Int32s = targetMap;
        @ref.Offset = targetMapOffsets[sourceId];
        // targetMapOffsets always has one more entry pointing behind last:
        @ref.Length = targetMapOffsets[sourceId + 1] - @ref.Offset;
    }

    /// <summary>
    /// Returns the left context id of the entry at <paramref name="wordId"/>: the leading
    /// 16-bit value of the entry with its three low flag bits shifted out.
    /// </summary>
    public virtual int GetLeftId(int wordId)
    {
        return buffer.GetInt16(wordId).TripleShift(3);
    }

    /// <summary>
    /// Returns the right context id of the entry at <paramref name="wordId"/>. It is decoded from
    /// the same 16-bit value as <see cref="GetLeftId(int)"/>, so both ids are always equal here.
    /// </summary>
    public virtual int GetRightId(int wordId)
    {
        return buffer.GetInt16(wordId).TripleShift(3);
    }

    /// <summary>
    /// Returns the word cost, the 16-bit value stored two bytes after the start of the entry.
    /// </summary>
    public virtual int GetWordCost(int wordId)
    {
        return buffer.GetInt16(wordId + 2); // Skip id
    }

    /// <summary>
    /// Returns the base form of the entry, rebuilt from a prefix shared with the surface form
    /// and a suffix stored in the dictionary.
    /// </summary>
    /// <param name="wordId">Byte offset of the entry.</param>
    /// <param name="surfaceForm">Buffer containing the surface form.</param>
    /// <param name="off">Start of the surface form within <paramref name="surfaceForm"/>.</param>
    /// <param name="len">Length of the surface form (not used by this implementation).</param>
    /// <returns>The base form, or <c>null</c> when the entry carries no base form data.</returns>
    public virtual string GetBaseForm(int wordId, char[] surfaceForm, int off, int len)
    {
        if (HasBaseFormData(wordId))
        {
            int offset = BaseFormOffset(wordId);
            // One header byte: high nibble = chars taken from the surface form, low nibble = stored suffix chars.
            int data = buffer.Get(offset++) & 0xff;
            int prefix = data.TripleShift(4);
            int suffix = data & 0xF;
            char[] text = new char[prefix + suffix];
            System.Array.Copy(surfaceForm, off, text, 0, prefix);
            for (int i = 0; i < suffix; i++)
            {
                text[prefix + i] = buffer.GetChar(offset + (i << 1));
            }
            return new string(text);
        }
        else
        {
            return null;
        }
    }

    /// <summary>
    /// Returns the reading of the entry. When the entry has no reading data, the reading is
    /// the surface form with hiragana shifted to katakana.
    /// </summary>
    /// <param name="wordId">Byte offset of the entry.</param>
    /// <param name="surface">Buffer containing the surface form.</param>
    /// <param name="off">Start of the surface form within <paramref name="surface"/>.</param>
    /// <param name="len">Length of the surface form.</param>
    public virtual string GetReading(int wordId, char[] surface, int off, int len)
    {
        if (HasReadingData(wordId))
        {
            int offset = ReadingOffset(wordId);
            // One header byte: low bit = kana-encoded flag, remaining bits = length in chars.
            int readingData = buffer.Get(offset++) & 0xff;
            return ReadString(offset, readingData.TripleShift(1), (readingData & 1) == 1);
        }
        else
        {
            // the reading is the surface form, with hiragana shifted to katakana
            char[] text = new char[len];
            for (int i = 0; i < len; i++)
            {
                char ch = surface[off + i];
                if (ch > 0x3040 && ch < 0x3097)
                {
                    text[i] = (char)(ch + 0x60);
                }
                else
                {
                    text[i] = ch;
                }
            }
            return new string(text);
        }
    }

    /// <summary>
    /// Returns the part-of-speech string for the entry, looked up by its left id.
    /// </summary>
    public virtual string GetPartOfSpeech(int wordId)
    {
        return posDict[GetLeftId(wordId)];
    }

    /// <summary>
    /// Returns the pronunciation of the entry, or its reading when no separate
    /// pronunciation data is stored.
    /// </summary>
    /// <param name="wordId">Byte offset of the entry.</param>
    /// <param name="surface">Buffer containing the surface form.</param>
    /// <param name="off">Start of the surface form within <paramref name="surface"/>.</param>
    /// <param name="len">Length of the surface form.</param>
    public virtual string GetPronunciation(int wordId, char[] surface, int off, int len)
    {
        if (HasPronunciationData(wordId))
        {
            int offset = PronunciationOffset(wordId);
            // Same header layout as the reading: low bit = kana flag, remaining bits = length in chars.
            int pronunciationData = buffer.Get(offset++) & 0xff;
            return ReadString(offset, pronunciationData.TripleShift(1), (pronunciationData & 1) == 1);
        }
        else
        {
            return GetReading(wordId, surface, off, len); // same as the reading
        }
    }

    /// <summary>
    /// Returns the inflection type for the entry (looked up by its left id), or <c>null</c> if it has none.
    /// </summary>
    public virtual string GetInflectionType(int wordId)
    {
        return inflTypeDict[GetLeftId(wordId)];
    }

    /// <summary>
    /// Returns the inflection form for the entry (looked up by its left id), or <c>null</c> if it has none.
    /// </summary>
    public virtual string GetInflectionForm(int wordId)
    {
        return inflFormDict[GetLeftId(wordId)];
    }

    /// <summary>Offset of the optional base form data: right after the 2-byte id/flags and the 2-byte cost.</summary>
    private static int BaseFormOffset(int wordId)
    {
        return wordId + 4;
    }

    /// <summary>Offset of the reading data, i.e. the position after the base form data (if any).</summary>
    private int ReadingOffset(int wordId)
    {
        int offset = BaseFormOffset(wordId);
        if (HasBaseFormData(wordId))
        {
            // Skip the header byte plus the stored suffix (low nibble = suffix chars, 2 bytes each).
            int baseFormLength = buffer.Get(offset++) & 0xf;
            return offset + (baseFormLength << 1);
        }
        else
        {
            return offset;
        }
    }

    /// <summary>Offset of the pronunciation data, i.e. the position after the reading data (if any).</summary>
    private int PronunciationOffset(int wordId)
    {
        if (HasReadingData(wordId))
        {
            int offset = ReadingOffset(wordId);
            int readingData = buffer.Get(offset++) & 0xff;
            int readingLength; // length of the stored reading in bytes
            if ((readingData & 1) == 0)
            {
                readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
            }
            else
            {
                readingLength = readingData.TripleShift(1); // kana: one byte per char
            }
            return offset + readingLength;
        }
        else
        {
            return ReadingOffset(wordId);
        }
    }

    /// <summary>Tests the <see cref="HAS_BASEFORM"/> flag of the entry.</summary>
    private bool HasBaseFormData(int wordId)
    {
        return (buffer.GetInt16(wordId) & HAS_BASEFORM) != 0;
    }

    /// <summary>Tests the <see cref="HAS_READING"/> flag of the entry.</summary>
    private bool HasReadingData(int wordId)
    {
        return (buffer.GetInt16(wordId) & HAS_READING) != 0;
    }

    /// <summary>Tests the <see cref="HAS_PRONUNCIATION"/> flag of the entry.</summary>
    private bool HasPronunciationData(int wordId)
    {
        return (buffer.GetInt16(wordId) & HAS_PRONUNCIATION) != 0;
    }

    /// <summary>
    /// Decodes <paramref name="length"/> chars starting at <paramref name="offset"/> in the buffer.
    /// </summary>
    /// <param name="offset">Byte offset of the first encoded char.</param>
    /// <param name="length">Number of chars to decode.</param>
    /// <param name="kana">
    /// <c>true</c> if each char is stored as a single byte added to 0x30A0;
    /// <c>false</c> if each char is stored as a 2-byte char.
    /// </param>
    private string ReadString(int offset, int length, bool kana)
    {
        char[] text = new char[length];
        if (kana)
        {
            for (int i = 0; i < length; i++)
            {
                text[i] = (char)(0x30A0 + (buffer.Get(offset + i) & 0xff));
            }
        }
        else
        {
            for (int i = 0; i < length; i++)
            {
                text[i] = buffer.GetChar(offset + (i << 1));
            }
        }
        return new string(text);
    }

    /// <summary>flag that the entry has baseform data. otherwise its not inflected (same as surface form)</summary>
    public static readonly int HAS_BASEFORM = 1;
    /// <summary>flag that the entry has reading data. otherwise reading is surface form converted to katakana</summary>
    public static readonly int HAS_READING = 2;
    /// <summary>flag that the entry has pronunciation data. otherwise pronunciation is the reading</summary>
    public static readonly int HAS_PRONUNCIATION = 4;
}
| } |