// blob: 0b2eab100a5cba1a5957bdfbbf4ff6feef26cd18 [file] [log] [blame]
// lucene version compatibility level: 4.8.1
using J2N.IO;
using Lucene.Net.Support;
using System;
using System.IO;
using System.Reflection;
using System.Text;
namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// SmartChineseAnalyzer Bigram dictionary.
/// <para/>
/// @lucene.experimental
/// </summary>
internal class BigramDictionary : AbstractDictionary
{
/// <summary>
/// Private constructor: within this file the only place an instance is created
/// is <see cref="GetInstance()"/>, which also loads the dictionary data.
/// </summary>
private BigramDictionary()
{
}
/// <summary>
/// The '@' character.
/// NOTE(review): presumably the separator placed between the two words of a
/// bigram key; it is not used in this file, so confirm against callers.
/// </summary>
public static readonly char WORD_SEGMENT_CHAR = '@';

/// <summary>
/// The shared instance handed out by <see cref="GetInstance()"/>; null until the first call.
/// </summary>
private static BigramDictionary singleInstance;

/// <summary>
/// Number of slots allocated for <see cref="bigramHashTable"/> and <see cref="frequencyTable"/>
/// when they are built from the .dct file, and the modulus used when probing them.
/// </summary>
public static readonly int PRIME_BIGRAM_LENGTH = 402137;

/// <summary>
/// The word associations are stored as FNV1 hashcodes, which have a small probability of collision, but save memory.
/// A value of 0 marks an empty slot.
/// </summary>
private long[] bigramHashTable;

/// <summary>
/// Frequency of the bigram stored at the same index in <see cref="bigramHashTable"/>.
/// </summary>
private int[] frequencyTable;

// Lookup statistics maintained by GetBigramItemIndex: "max" is the largest probe
// counter value reached by any lookup, "repeat" is the total number of slots examined.
private int max = 0;
private int repeat = 0;

// static Logger log = Logger.getLogger(BigramDictionary.class);

// Monitor guarding the lazy creation of singleInstance. It is readonly so the
// lock target can never be swapped out from under a thread that is waiting on it.
private static readonly object syncLock = new object();
/// <summary>
/// Returns the shared <see cref="BigramDictionary"/>, creating and loading it on first use.
/// <para/>
/// The data is loaded from the embedded resource when <c>AnalyzerProfile.ANALYSIS_DATA_DIR</c>
/// is null or empty, otherwise from that directory.
/// </summary>
/// <returns>The fully loaded shared instance.</returns>
public static BigramDictionary GetInstance()
{
    lock (syncLock)
    {
        if (singleInstance == null)
        {
            // Load into a local first and publish only after loading succeeded.
            // If the instance were published before loading and the load threw,
            // every later call would return an instance whose tables are still null.
            BigramDictionary loading = new BigramDictionary();

            // LUCENENET specific
            // LUCENE-1817: https://issues.apache.org/jira/browse/LUCENE-1817
            // This issue still existed as of 4.8.0. Here is the fix - we only
            // load from a directory if the actual directory exists (AnalyzerProfile
            // ensures it is an empty string if it is not available).
            string dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
            if (string.IsNullOrEmpty(dictRoot))
            {
                loading.Load();
            }
            else
            {
                loading.Load(dictRoot);
            }

            singleInstance = loading;
        }
        return singleInstance;
    }
}
/// <summary>
/// Opens the given serialized dictionary file for reading and loads both tables from it.
/// </summary>
/// <param name="serialObj">The serialized (.mem) dictionary file.</param>
/// <returns>Always <c>true</c>; every failure is reported by throwing instead.</returns>
/// <exception cref="Exception">Wraps any exception raised while opening or reading the file.</exception>
private bool LoadFromObj(FileInfo serialObj)
{
    try
    {
        using (Stream memFile = new FileStream(serialObj.FullName, FileMode.Open, FileAccess.Read))
        {
            LoadFromInputStream(memFile);
        }
    }
    catch (Exception cause)
    {
        // Re-thrown as a plain Exception carrying the full text of the original.
        throw new Exception(cause.ToString(), cause);
    }

    return true;
}
// LUCENENET conversion note:
// The data in Lucene is stored in a proprietary binary format (similar to
// .NET's BinarySerializer) that cannot be read back in .NET. Therefore, the
// data was extracted with Java's DataOutputStream, using the following Java code.
// It can then be read in using the LoadFromInputStream method below
// (using a DataInputStream instead of a BinaryReader), and saved
// in the correct (BinaryWriter) format by calling the SaveToObj method.
// Alternatively, the data can be loaded from disk using the files in the
// analysis.data.zip archive attached to LUCENE-1629
// (https://issues.apache.org/jira/browse/LUCENE-1629); loading them
// automatically produces the .mem files.
//public void saveToOutputStream(java.io.DataOutputStream stream) throws IOException
//{
// // save wordIndexTable
// int wiLen = wordIndexTable.length;
// stream.writeInt(wiLen);
// for (int i = 0; i<wiLen; i++)
// {
// stream.writeShort(wordIndexTable[i]);
// }
// // save charIndexTable
// int ciLen = charIndexTable.length;
// stream.writeInt(ciLen);
// for (int i = 0; i<ciLen; i++)
// {
// stream.writeChar(charIndexTable[i]);
// }
// int caDim1 = wordItem_charArrayTable == null ? -1 : wordItem_charArrayTable.length;
// stream.writeInt(caDim1);
// for (int i = 0; i<caDim1; i++)
// {
// int caDim2 = wordItem_charArrayTable[i] == null ? -1 : wordItem_charArrayTable[i].length;
// stream.writeInt(caDim2);
// for (int j = 0; j<caDim2; j++)
// {
// int caDim3 = wordItem_charArrayTable[i][j] == null ? -1 : wordItem_charArrayTable[i][j].length;
// stream.writeInt(caDim3);
// for (int k = 0; k<caDim3; k++)
// {
// stream.writeChar(wordItem_charArrayTable[i][j][k]);
// }
// }
// }
// int fDim1 = wordItem_frequencyTable == null ? -1 : wordItem_frequencyTable.length;
// stream.writeInt(fDim1);
// for (int i = 0; i<fDim1; i++)
// {
// int fDim2 = wordItem_frequencyTable[i] == null ? -1 : wordItem_frequencyTable[i].length;
// stream.writeInt(fDim2);
// for (int j = 0; j<fDim2; j++)
// {
// stream.writeInt(wordItem_frequencyTable[i][j]);
// }
// }
//}
/// <summary>
/// Reads both tables from a stream in the layout written by <c>SaveToObj</c>:
/// an Int32 element count followed by that many Int64 hash values, then an
/// Int32 element count followed by that many Int32 frequencies.
/// <para/>
/// The stream is wrapped in a <see cref="BinaryReader"/> that is disposed when reading ends.
/// </summary>
/// <param name="serialObjectInputStream">Stream positioned at the start of the serialized tables.</param>
private void LoadFromInputStream(Stream serialObjectInputStream)
{
    // The Java original deserialized the two arrays with an ObjectInputStream;
    // the .NET port stores them in plain BinaryWriter format instead.
    using (var reader = new BinaryReader(serialObjectInputStream))
    {
        // bigramHashTable: count, then the hash values
        int hashCount = reader.ReadInt32();
        long[] hashes = bigramHashTable = new long[hashCount];
        int slot = 0;
        while (slot < hashCount)
        {
            hashes[slot] = reader.ReadInt64();
            slot++;
        }

        // frequencyTable: count, then the frequencies
        int frequencyCount = reader.ReadInt32();
        int[] frequencies = frequencyTable = new int[frequencyCount];
        slot = 0;
        while (slot < frequencyCount)
        {
            frequencies[slot] = reader.ReadInt32();
            slot++;
        }
    }
    // log.info("load bigram dict from serialization.");
}
/// <summary>
/// Writes both tables to <paramref name="serialObj"/> in BinaryWriter format: an Int32
/// element count followed by the Int64 hash values, then an Int32 element count followed
/// by the Int32 frequencies.
/// <para/>
/// Saving is best-effort: a failure is not propagated to the caller. If writing fails,
/// the incomplete file is removed so that a truncated file is not left behind for a
/// later <c>LoadFromObj</c> call to fail on.
/// </summary>
/// <param name="serialObj">Destination file; created or overwritten.</param>
private void SaveToObj(FileInfo serialObj)
{
    try
    {
        using (Stream output = new FileStream(serialObj.FullName, FileMode.Create, FileAccess.Write))
        using (BinaryWriter writer = new BinaryWriter(output))
        {
            int bhLen = bigramHashTable.Length;
            writer.Write(bhLen);
            for (int i = 0; i < bhLen; i++)
            {
                writer.Write(bigramHashTable[i]);
            }

            int fLen = frequencyTable.Length;
            writer.Write(fLen);
            for (int i = 0; i < fLen; i++)
            {
                writer.Write(frequencyTable[i]);
            }
        }
        // log.info("serialize bigram dict.");
    }
    catch (Exception)
    {
        // log.warn(e.getMessage());
        // The in-memory tables are already usable, so a failed save must not fail
        // the caller. Clean up whatever was written so far.
        try
        {
            File.Delete(serialObj.FullName);
        }
        catch (Exception)
        {
            // Nothing more can be done; the cleanup is best-effort as well.
        }
    }
}
/// <summary>
/// Loads both tables from the "bigramdict.mem" manifest resource, looked up in the
/// assembly that declares this instance's runtime type.
/// </summary>
private void Load()
{
    Type ownType = this.GetType();
    Assembly ownAssembly = ownType.GetTypeInfo().Assembly;

    using (Stream resource = ownAssembly.FindAndGetManifestResourceStream(ownType, "bigramdict.mem"))
    {
        LoadFromInputStream(resource);
    }
}
/// <summary>
/// Loads the dictionary from <paramref name="dictRoot"/>.
/// <para/>
/// If "bigramdict.mem" exists in that directory it is loaded directly. Otherwise the
/// tables are built from "bigramdict.dct" and then saved as "bigramdict.mem".
/// </summary>
/// <param name="dictRoot">Directory containing the dictionary files.</param>
/// <exception cref="Exception">Wraps an <see cref="IOException"/> raised while reading the .dct file.</exception>
private void Load(string dictRoot)
{
    string bigramDictPath = System.IO.Path.Combine(dictRoot, "bigramdict.dct");
    FileInfo serialObj = new FileInfo(System.IO.Path.Combine(dictRoot, "bigramdict.mem"));

    if (serialObj.Exists && LoadFromObj(serialObj))
    {
        return;
    }

    try
    {
        // Newly allocated arrays are already zero-filled, so no explicit clearing is
        // needed. 0 marks an empty slot: it is possible for a value to hash to 0,
        // but the probability is extremely low.
        bigramHashTable = new long[PRIME_BIGRAM_LENGTH];
        frequencyTable = new int[PRIME_BIGRAM_LENGTH];
        LoadFromFile(bigramDictPath);
    }
    catch (IOException e)
    {
        throw new Exception(e.ToString(), e);
    }

    SaveToObj(serialObj);
}
/// <summary>
/// Load the datafile into this <see cref="BigramDictionary"/>
/// <para/>
/// For every GB2312 id in the file the data holds a little-endian Int32 entry count,
/// followed by that many entries of: Int32 frequency, Int32 byte length, Int32 handle
/// (ignored), and the GB2312-encoded text of the given byte length.
/// </summary>
/// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param>
/// <exception cref="IOException">If there is a low-level I/O error, including the file
/// ending before a complete field has been read</exception>
public virtual void LoadFromFile(string dctFilePath)
{
    // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
    // The 3756th is used (as a header) to store information.
    int headerId = 3755 + GB2312_FIRST_CHAR;

    byte[] intBuffer = new byte[4];

    // Resolve the decoder once instead of once per dictionary entry.
    Encoding gb2312 = Encoding.GetEncoding("GB2312");

    //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r"))
    using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read))
    {
        // GB2312 characters 0 - 6768
        for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
        {
            string currentStr = GetCCByGB2312Id(i);

            // the dictionary was developed for C, and byte order must be converted
            int cnt = ReadInt32LittleEndian(dctFile, intBuffer);
            if (cnt <= 0)
            {
                continue;
            }

            for (int j = 0; j < cnt; j++)
            {
                int frequency = ReadInt32LittleEndian(dctFile, intBuffer);
                int length = ReadInt32LittleEndian(dctFile, intBuffer);
                // The third field (handle) is not used; read it only to advance past it.
                ReadFully(dctFile, intBuffer);

                if (length <= 0)
                {
                    continue;
                }

                byte[] lchBuffer = new byte[length];
                ReadFully(dctFile, lchBuffer);
                string tmpword = gb2312.GetString(lchBuffer);

                // Entries are stored without their leading character, except in the header slot.
                if (i != headerId)
                {
                    tmpword = currentStr + tmpword;
                }

                char[] carray = tmpword.ToCharArray();
                long hashId = Hash1(carray);
                int index = GetAvaliableIndex(hashId, carray);
                if (index != -1)
                {
                    if (bigramHashTable[index] == 0)
                    {
                        bigramHashTable[index] = hashId;
                    }
                    frequencyTable[index] += frequency;
                }
            }
        }
    }
    // log.info("load dictionary done! " + dctFilePath + " total:" + total);
}

/// <summary>
/// Reads four bytes into <paramref name="scratch"/> and decodes them as a little-endian Int32.
/// </summary>
private static int ReadInt32LittleEndian(Stream stream, byte[] scratch)
{
    ReadFully(stream, scratch);
    return scratch[0] | (scratch[1] << 8) | (scratch[2] << 16) | (scratch[3] << 24);
}

/// <summary>
/// Fills <paramref name="buffer"/> completely from <paramref name="stream"/>. A single
/// <see cref="Stream.Read(byte[], int, int)"/> call may return fewer bytes than requested,
/// so reading is repeated until the buffer is full.
/// </summary>
/// <exception cref="EndOfStreamException">The stream ended before the buffer was filled.</exception>
private static void ReadFully(Stream stream, byte[] buffer)
{
    int filled = 0;
    while (filled < buffer.Length)
    {
        int read = stream.Read(buffer, filled, buffer.Length - filled);
        if (read <= 0)
        {
            throw new EndOfStreamException("Unexpected end of bigram dictionary data.");
        }
        filled += read;
    }
}
/// <summary>
/// Finds the slot in which <paramref name="hashId"/> should be stored: the first slot
/// along the probe sequence that is either empty (0) or already holds the same hash.
/// <para/>
/// The sequence starts at <c>hashId % PRIME_BIGRAM_LENGTH</c> and steps by
/// <c>Hash2(carray) % PRIME_BIGRAM_LENGTH</c> (both made non-negative).
/// </summary>
/// <param name="hashId">Primary hash of the bigram.</param>
/// <param name="carray">Characters of the bigram, used for the secondary hash.</param>
/// <returns>The slot index, or -1 if the probe counter reaches <c>PRIME_BIGRAM_LENGTH</c> first.</returns>
private int GetAvaliableIndex(long hashId, char[] carray)
{
    int home = (int)(hashId % PRIME_BIGRAM_LENGTH);
    if (home < 0)
    {
        home += PRIME_BIGRAM_LENGTH;
    }

    int stride = Hash2(carray) % PRIME_BIGRAM_LENGTH;
    if (stride < 0)
    {
        stride += PRIME_BIGRAM_LENGTH;
    }

    int slot = home;
    int attempt = 1;
    while (true)
    {
        long occupant = bigramHashTable[slot];
        bool usable = occupant == 0 || occupant == hashId;
        bool exhausted = attempt >= PRIME_BIGRAM_LENGTH;
        if (usable || exhausted)
        {
            return (usable && !exhausted) ? slot : -1;
        }

        slot = (home + attempt * stride) % PRIME_BIGRAM_LENGTH;
        attempt++;
    }
}
/// <summary>
/// Looks up the index into the frequency array for the given bigram.
/// <para/>
/// Follows the same probe sequence as <c>GetAvaliableIndex</c>, stopping at the first
/// slot that is empty (0) or holds the bigram's hash. Also updates the <c>repeat</c>
/// and <c>max</c> probe statistics.
/// </summary>
/// <param name="carray">Characters of the bigram.</param>
/// <returns>The index of the bigram's slot, or -1 if its hash is not found.</returns>
private int GetBigramItemIndex(char[] carray)
{
    long hashId = Hash1(carray);
    int stride = Hash2(carray) % PRIME_BIGRAM_LENGTH;
    if (stride < 0)
    {
        stride += PRIME_BIGRAM_LENGTH;
    }

    int home = (int)(hashId % PRIME_BIGRAM_LENGTH);
    if (home < 0)
    {
        home += PRIME_BIGRAM_LENGTH;
    }

    int slot = home;
    int attempt = 1;
    repeat++;
    while (true)
    {
        long occupant = bigramHashTable[slot];
        bool exhausted = attempt >= PRIME_BIGRAM_LENGTH;
        if (occupant == 0 || occupant == hashId || exhausted)
        {
            return (!exhausted && occupant == hashId) ? slot : -1;
        }

        slot = (home + attempt * stride) % PRIME_BIGRAM_LENGTH;
        attempt++;
        repeat++;
        max = Math.Max(max, attempt);
    }
}
/// <summary>
/// Returns the stored frequency of the given bigram.
/// </summary>
/// <param name="carray">Characters of the bigram.</param>
/// <returns>The frequency from the frequency table, or 0 when the bigram is not found.</returns>
public int GetFrequency(char[] carray)
{
    int slot = GetBigramItemIndex(carray);
    return slot == -1 ? 0 : frequencyTable[slot];
}
}
}