/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart.hhmm;

import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
import org.apache.lucene.util.SuppressForbidden;
/**
* SmartChineseAnalyzer Bigram dictionary.
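*
* <p>A minimal usage sketch ({@code someBigramKey} is a hypothetical placeholder for a
* bigram lookup key):
*
* <pre>
* BigramDictionary dict = BigramDictionary.getInstance();
* int frequency = dict.getFrequency(someBigramKey.toCharArray());
* </pre>
*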
* @lucene.experimental
*/
class BigramDictionary extends AbstractDictionary {
private BigramDictionary() {
}
public static final char WORD_SEGMENT_CHAR = '@';
private static BigramDictionary singleInstance;
public static final int PRIME_BIGRAM_LENGTH = 402137;
/*
* The word associations are stored as FNV1 hashcodes, which have a small probability of collision, but save memory.
*/
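/*
 * The table is open-addressed with double hashing: for a bigram c, the initial slot is
 * hash1(c) % PRIME_BIGRAM_LENGTH and the probe step is hash2(c) % PRIME_BIGRAM_LENGTH;
 * a stored value of 0 marks an empty slot.
 */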
private long[] bigramHashTable;
private int[] frequencyTable;
// probing statistics: max is the longest probe sequence seen, repeat the total probe count
private int max = 0;
private int repeat = 0;
// static Logger log = Logger.getLogger(BigramDictionary.class);
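/**
 * Returns the singleton, loading the dictionary on first use: first from the serialized
 * bigramdict.mem classpath resource and, if that fails with an IOException, from
 * AnalyzerProfile.ANALYSIS_DATA_DIR on disk.
 */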
public static synchronized BigramDictionary getInstance() {
if (singleInstance == null) {
singleInstance = new BigramDictionary();
try {
singleInstance.load();
} catch (IOException e) {
String dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
try {
singleInstance.load(dictRoot);
} catch (IOException ioe) {
throw new RuntimeException(ioe);
}
} catch (ClassNotFoundException e) {
throw new RuntimeException(e);
}
}
return singleInstance;
}
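/*
 * Load the pre-built tables from a serialized file. Returns true on success; any failure
 * is wrapped in a RuntimeException rather than reported as false.
 */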
private boolean loadFromObj(Path serialObj) {
try {
loadFromInputStream(Files.newInputStream(serialObj));
return true;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
@SuppressForbidden(reason = "TODO: fix code to serialize its own dictionary vs. a binary blob in the codebase")
private void loadFromInputStream(InputStream serialObjectInputStream)
throws IOException, ClassNotFoundException {
try (ObjectInputStream input = new ObjectInputStream(serialObjectInputStream)) {
bigramHashTable = (long[]) input.readObject();
frequencyTable = (int[]) input.readObject();
// log.info("load bigram dict from serialization.");
}
}
@SuppressForbidden(reason = "TODO: fix code to serialize its own dictionary vs. a binary blob in the codebase")
private void saveToObj(Path serialObj) throws IOException {
try (ObjectOutputStream output = new ObjectOutputStream(Files.newOutputStream(
serialObj))) {
output.writeObject(bigramHashTable);
output.writeObject(frequencyTable);
// log.info("serialize bigram dict.");
}
}
private void load() throws IOException, ClassNotFoundException {
InputStream input = this.getClass().getResourceAsStream("bigramdict.mem");
if (input == null) {
// throw IOException (not NPE) so getInstance() can fall back to the data directory
throw new IOException("bigramdict.mem not found in classpath");
}
loadFromInputStream(input);
}
private void load(String dictRoot) throws IOException {
String bigramDictPath = dictRoot + "/bigramdict.dct";
Path serialObj = Paths.get(dictRoot + "/bigramdict.mem");
if (!Files.exists(serialObj) || !loadFromObj(serialObj)) {
try {
// Java zeroes new arrays; 0 marks an empty slot. A real value could hash to 0,
// but the probability is extremely low.
bigramHashTable = new long[PRIME_BIGRAM_LENGTH];
frequencyTable = new int[PRIME_BIGRAM_LENGTH];
loadFromFile(bigramDictPath);
} catch (IOException e) {
// keep the cause instead of only its message, so the stack trace survives
throw new RuntimeException(e);
}
saveToObj(serialObj);
}
}
/**
* Load the datafile into this BigramDictionary
*
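* <p>For each of the CHAR_NUM_IN_FILE GB2312 lead characters, the file stores a
* little-endian int count followed by count records, each holding three little-endian
* ints (frequency, length, handle) and then length bytes of GB2312-encoded text.
*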
* @param dctFilePath path to the bigram dictionary file (bigramdict.dct)
* @throws IOException If there is a low-level I/O error
*/
public void loadFromFile(String dctFilePath) throws IOException {
int i, cnt, length, total = 0;
// The file holds 6768 slots: 6763 GB2312 Chinese characters plus 5 reserved
// slots (3756-3760). The 3756th slot is used as a header to store special entries.
int[] buffer = new int[3];
byte[] intBuffer = new byte[4];
String tmpword;
try (DataInputStream dctFile = new DataInputStream(Files.newInputStream(Paths.get(dctFilePath)))) {
// GB2312 characters 0 - 6768
for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
String currentStr = getCCByGB2312Id(i);
// readFully guards against short reads, which read() would silently allow
dctFile.readFully(intBuffer);
// the dictionary was developed for C, so the byte order must be converted for Java
cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
if (cnt <= 0) {
continue;
}
total += cnt;
int j = 0;
while (j < cnt) {
dctFile.readFully(intBuffer);
buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt(); // frequency
dctFile.readFully(intBuffer);
buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt(); // length
dctFile.readFully(intBuffer); // handle: read and discarded
length = buffer[1];
if (length > 0) {
byte[] lchBuffer = new byte[length];
dctFile.readFully(lchBuffer);
tmpword = new String(lchBuffer, "GB2312");
// the header slot (3755) stores complete bigrams; all other slots store only
// the tail, so prepend the lead character
if (i != 3755 + GB2312_FIRST_CHAR) {
tmpword = currentStr + tmpword;
}
char[] carray = tmpword.toCharArray();
long hashId = hash1(carray);
int index = getAvailableIndex(hashId, carray);
if (index != -1) {
if (bigramHashTable[index] == 0) {
bigramHashTable[index] = hashId;
}
frequencyTable[index] += buffer[0];
}
}
j++;
}
}
}
// log.info("load dictionary done! " + dctFilePath + " total:" + total);
}
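/*
 * Find an insertion slot for hashId by double hashing: probe hash1, hash1 + hash2,
 * hash1 + 2*hash2, ... (mod PRIME_BIGRAM_LENGTH) until an empty slot or a slot already
 * holding hashId turns up; returns -1 if no such slot can be found.
 */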
private int getAvailableIndex(long hashId, char[] carray) {
int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
if (hash1 < 0) {
hash1 = PRIME_BIGRAM_LENGTH + hash1;
}
if (hash2 < 0) {
hash2 = PRIME_BIGRAM_LENGTH + hash2;
}
int index = hash1;
int i = 1;
while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
&& i < PRIME_BIGRAM_LENGTH) {
// use long arithmetic: i * hash2 can overflow int over a long probe sequence
index = (int) (((long) hash1 + (long) i * hash2) % PRIME_BIGRAM_LENGTH);
i++;
}
if (i < PRIME_BIGRAM_LENGTH
&& (bigramHashTable[index] == 0 || bigramHashTable[index] == hashId)) {
return index;
}
return -1;
}
/*
 * Look up the index of carray's bigram in the frequency array, or -1 if it is absent.
 */
private int getBigramItemIndex(char[] carray) {
long hashId = hash1(carray);
int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
if (hash1 < 0) {
hash1 = PRIME_BIGRAM_LENGTH + hash1;
}
if (hash2 < 0) {
hash2 = PRIME_BIGRAM_LENGTH + hash2;
}
int index = hash1;
int i = 1;
repeat++;
while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
&& i < PRIME_BIGRAM_LENGTH) {
// use long arithmetic: i * hash2 can overflow int over a long probe sequence
index = (int) (((long) hash1 + (long) i * hash2) % PRIME_BIGRAM_LENGTH);
i++;
repeat++;
if (i > max) {
max = i;
}
}
if (i < PRIME_BIGRAM_LENGTH && bigramHashTable[index] == hashId) {
return index;
}
return -1;
}
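/**
 * Get the frequency recorded for the given bigram, or 0 if the bigram is not in the
 * dictionary.
 */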
public int getFrequency(char[] carray) {
int index = getBigramItemIndex(carray);
if (index != -1) {
return frequencyTable[index];
}
return 0;
}
}