using J2N.IO;
using Lucene.Net.Analysis.Ja.Dict;
using Lucene.Net.Codecs;
using Lucene.Net.Diagnostics;
using Lucene.Net.Store;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Text;

namespace Lucene.Net.Analysis.Ja.Util
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */
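
    /// <summary>
    /// Abstract base class for building the binary dictionary data consumed by <see cref="BinaryDictionary"/>:
    /// the entry buffer, the target map (source id to word ids), and the part-of-speech dictionary.
    /// </summary>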
    public abstract class BinaryDictionaryWriter
    {
        protected readonly Type m_implClazz;
        protected ByteBuffer m_buffer;

        private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1;
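        // Mapping from source ids to word ids, built incrementally via AddMapping();
        // targetMapOffsets[sourceId] is the offset into targetMap of the first word id for that source id.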
        private int[] targetMap = new int[8192];
        private int[] targetMapOffsets = new int[8192];
        private readonly List<string> posDict = new List<string>();
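
        /// <summary>
        /// Initializes the writer for the given dictionary implementation type (used to derive the
        /// output file names) with an entry buffer of <paramref name="size"/> bytes.
        /// </summary>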
        public BinaryDictionaryWriter(Type implClazz, int size)
        {
            this.m_implClazz = implClazz;
            m_buffer = ByteBuffer.Allocate(size);
        }

        /// <summary>
        /// Puts the given entry into the dictionary buffer.
        /// </summary>
        /// <param name="entry">The parsed CSV fields of a single dictionary entry.</param>
        /// <returns>Current position of the buffer, which will be the word id of the next entry.</returns>
        public virtual int Put(string[] entry)
        {
            short leftId = short.Parse(entry[1], CultureInfo.InvariantCulture);
            short rightId = short.Parse(entry[2], CultureInfo.InvariantCulture);
            short wordCost = short.Parse(entry[3], CultureInfo.InvariantCulture);

            StringBuilder sb = new StringBuilder();

            // build up the POS string
            for (int i = 4; i < 8; i++)
            {
                string part = entry[i];
                if (Debugging.AssertsEnabled) Debugging.Assert(part.Length > 0);
                if (!"*".Equals(part, StringComparison.Ordinal))
                {
                    if (sb.Length > 0)
                    {
                        sb.Append('-');
                    }
                    sb.Append(part);
                }
            }

            string posData = sb.ToString();

            sb.Length = 0;
            sb.Append(CSVUtil.QuoteEscape(posData));
            sb.Append(',');
            if (!"*".Equals(entry[8], StringComparison.Ordinal))
            {
                sb.Append(CSVUtil.QuoteEscape(entry[8]));
            }
            sb.Append(',');
            if (!"*".Equals(entry[9], StringComparison.Ordinal))
            {
                sb.Append(CSVUtil.QuoteEscape(entry[9]));
            }
            string fullPOSData = sb.ToString();

            string baseForm = entry[10];
            string reading = entry[11];
            string pronunciation = entry[12];

            // extend buffer if necessary
            int left = m_buffer.Remaining;
            // worst case: two short, 3 bytes, and features (all as utf-16)
            int worstCase = 4 + 3 + 2 * (baseForm.Length + reading.Length + pronunciation.Length);
            if (worstCase > left)
            {
                ByteBuffer newBuffer = ByteBuffer.Allocate(ArrayUtil.Oversize(m_buffer.Limit + worstCase - left, 1));
                m_buffer.Flip();
                newBuffer.Put(m_buffer);
                m_buffer = newBuffer;
            }

            int flags = 0;
            if (!("*".Equals(baseForm, StringComparison.Ordinal) || baseForm.Equals(entry[0], StringComparison.Ordinal)))
            {
                flags |= BinaryDictionary.HAS_BASEFORM;
            }
            if (!reading.Equals(ToKatakana(entry[0]), StringComparison.Ordinal))
            {
                flags |= BinaryDictionary.HAS_READING;
            }
            if (!pronunciation.Equals(reading, StringComparison.Ordinal))
            {
                flags |= BinaryDictionary.HAS_PRONUNCIATION;
            }

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(leftId == rightId);
                Debugging.Assert(leftId < 4096); // there are still unused bits
            }

            // add pos mapping
            int toFill = 1 + leftId - posDict.Count;
            for (int i = 0; i < toFill; i++)
            {
                posDict.Add(null);
            }

            string existing = posDict[leftId];
            if (Debugging.AssertsEnabled) Debugging.Assert(existing == null || existing.Equals(fullPOSData, StringComparison.Ordinal));
            posDict[leftId] = fullPOSData;

            m_buffer.PutInt16((short)(leftId << 3 | flags));
            m_buffer.PutInt16(wordCost);

            if ((flags & BinaryDictionary.HAS_BASEFORM) != 0)
            {
                if (Debugging.AssertsEnabled) Debugging.Assert(baseForm.Length < 16);
                int shared = SharedPrefix(entry[0], baseForm);
                int suffix = baseForm.Length - shared;
                m_buffer.Put((byte)(shared << 4 | suffix));
                for (int i = shared; i < baseForm.Length; i++)
                {
                    m_buffer.PutChar(baseForm[i]);
                }
            }

            if ((flags & BinaryDictionary.HAS_READING) != 0)
            {
                if (IsKatakana(reading))
                {
                    m_buffer.Put((byte)(reading.Length << 1 | 1));
                    WriteKatakana(reading);
                }
                else
                {
                    m_buffer.Put((byte)(reading.Length << 1));
                    for (int i = 0; i < reading.Length; i++)
                    {
                        m_buffer.PutChar(reading[i]);
                    }
                }
            }

            if ((flags & BinaryDictionary.HAS_PRONUNCIATION) != 0)
            {
                // we can save 150KB here, but it makes the reader a little complicated.
                // int shared = sharedPrefix(reading, pronunciation);
                // buffer.put((byte) shared);
                // pronunciation = pronunciation.substring(shared);
                if (IsKatakana(pronunciation))
                {
                    m_buffer.Put((byte)(pronunciation.Length << 1 | 1));
                    WriteKatakana(pronunciation);
                }
                else
                {
                    m_buffer.Put((byte)(pronunciation.Length << 1));
                    for (int i = 0; i < pronunciation.Length; i++)
                    {
                        m_buffer.PutChar(pronunciation[i]);
                    }
                }
            }

            return m_buffer.Position;
        }
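
        // Returns true if every character of s falls within the Katakana block (U+30A0 to U+30FF).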
        private bool IsKatakana(string s)
        {
            for (int i = 0; i < s.Length; i++)
            {
                char ch = s[i];
                if (ch < 0x30A0 || ch > 0x30FF)
                {
                    return false;
                }
            }
            return true;
        }
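
        // Writes a Katakana-only string compactly, one byte per character, as offsets from U+30A0.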
        private void WriteKatakana(string s)
        {
            for (int i = 0; i < s.Length; i++)
            {
                m_buffer.Put((byte)(s[i] - 0x30A0));
            }
        }
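
        // Converts Hiragana characters (U+3041 to U+3096) to their Katakana equivalents; other characters are copied as-is.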
        private string ToKatakana(string s)
        {
            char[] text = new char[s.Length];
            for (int i = 0; i < s.Length; i++)
            {
                char ch = s[i];
                if (ch > 0x3040 && ch < 0x3097)
                {
                    text[i] = (char)(ch + 0x60);
                }
                else
                {
                    text[i] = ch;
                }
            }
            return new string(text);
        }
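
        /// <summary>
        /// Returns the length of the common prefix shared by <paramref name="left"/> and <paramref name="right"/>.
        /// </summary>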
        public static int SharedPrefix(string left, string right)
        {
            int len = left.Length < right.Length ? left.Length : right.Length;
            for (int i = 0; i < len; i++)
                if (left[i] != right[i])
                    return i;
            return len;
        }
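
        /// <summary>
        /// Records a mapping from <paramref name="sourceId"/> to <paramref name="wordId"/>. Word ids must be added
        /// in increasing order, and source ids in non-decreasing order.
        /// </summary>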
        public virtual void AddMapping(int sourceId, int wordId)
        {
            if (Debugging.AssertsEnabled) Debugging.Assert(wordId > lastWordId, () => "words out of order: " + wordId + " vs lastID: " + lastWordId);

            if (sourceId > lastSourceId)
            {
                if (Debugging.AssertsEnabled) Debugging.Assert(sourceId > lastSourceId, () => "source ids out of order: lastSourceId=" + lastSourceId + " vs sourceId=" + sourceId);
                targetMapOffsets = ArrayUtil.Grow(targetMapOffsets, sourceId + 1);
                for (int i = lastSourceId + 1; i <= sourceId; i++)
                {
                    targetMapOffsets[i] = targetMapEndOffset;
                }
            }
            else
            {
                if (Debugging.AssertsEnabled) Debugging.Assert(sourceId == lastSourceId);
            }

            targetMap = ArrayUtil.Grow(targetMap, targetMapEndOffset + 1);
            targetMap[targetMapEndOffset] = wordId;
            targetMapEndOffset++;

            lastSourceId = sourceId;
            lastWordId = wordId;
        }
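
        /// <summary>
        /// Returns the base output path for this dictionary: <paramref name="baseDir"/> combined with the
        /// implementation class name.
        /// </summary>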
        protected string GetBaseFileName(string baseDir)
        {
            // LUCENENET specific: we don't need to do a "classpath" output directory, since we
            // are changing the implementation to read files dynamically instead of making the
            // user recompile with the new files.
            return System.IO.Path.Combine(baseDir, m_implClazz.Name);

            //return baseDir + System.IO.Path.DirectorySeparatorChar + m_implClazz.FullName.Replace('.', System.IO.Path.DirectorySeparatorChar);
        }

        /// <summary>
        /// Writes the dictionary files into the given base directory.
        /// </summary>
        /// <remarks>
        /// Dictionary format is:
        /// [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
        /// </remarks>
        /// <param name="baseDir">Base directory to write the dictionary files to.</param>
        /// <exception cref="IOException">If an I/O error occurs writing the dictionary files.</exception>
        public virtual void Write(string baseDir)
        {
            string baseName = GetBaseFileName(baseDir);
            WriteDictionary(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX);
            WriteTargetMap(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
            WritePosDict(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
        }

        // TODO: maybe this int[] should instead be the output to the FST...
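        /// <summary>
        /// Writes the target map file, which maps each source id to its word ids. Word ids are delta-encoded,
        /// with the low bit marking the first word id of a new source id.
        /// </summary>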
        protected virtual void WriteTargetMap(string filename)
        {
            //new File(filename).getParentFile().mkdirs();
            System.IO.Directory.CreateDirectory(System.IO.Path.GetDirectoryName(filename));
            using (Stream os = new FileStream(filename, FileMode.Create, FileAccess.Write))
            {
                DataOutput @out = new OutputStreamDataOutput(os);
                CodecUtil.WriteHeader(@out, BinaryDictionary.TARGETMAP_HEADER, BinaryDictionary.VERSION);

                int numSourceIds = lastSourceId + 1;
                @out.WriteVInt32(targetMapEndOffset); // <-- size of main array
                @out.WriteVInt32(numSourceIds + 1); // <-- size of offset array (+ 1 more entry)
                int prev = 0, sourceId = 0;
                for (int ofs = 0; ofs < targetMapEndOffset; ofs++)
                {
                    int val = targetMap[ofs], delta = val - prev;
                    if (Debugging.AssertsEnabled) Debugging.Assert(delta >= 0);
                    if (ofs == targetMapOffsets[sourceId])
                    {
                        @out.WriteVInt32((delta << 1) | 0x01);
                        sourceId++;
                    }
                    else
                    {
                        @out.WriteVInt32((delta << 1));
                    }
                    prev += delta;
                }
                if (Debugging.AssertsEnabled) Debugging.Assert(sourceId == numSourceIds, () => "sourceId:" + sourceId + " != numSourceIds:" + numSourceIds);
            }
        }
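
        /// <summary>
        /// Writes the part-of-speech dictionary file: three strings (POS data plus inflection data) per left id;
        /// unused ids are written as three zero bytes.
        /// </summary>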
        protected virtual void WritePosDict(string filename)
        {
            //new File(filename).getParentFile().mkdirs();
            System.IO.Directory.CreateDirectory(System.IO.Path.GetDirectoryName(filename));
            using (Stream os = new FileStream(filename, FileMode.Create, FileAccess.Write))
            {
                DataOutput @out = new OutputStreamDataOutput(os);
                CodecUtil.WriteHeader(@out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION);
                @out.WriteVInt32(posDict.Count);
                foreach (string s in posDict)
                {
                    if (s == null)
                    {
                        @out.WriteByte((byte)0);
                        @out.WriteByte((byte)0);
                        @out.WriteByte((byte)0);
                    }
                    else
                    {
                        string[] data = CSVUtil.Parse(s);
                        if (Debugging.AssertsEnabled) Debugging.Assert(data.Length == 3, () => "malformed pos/inflection: " + s);
                        @out.WriteString(data[0]);
                        @out.WriteString(data[1]);
                        @out.WriteString(data[2]);
                    }
                }
            }
        }
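
        /// <summary>
        /// Writes the main dictionary file: a codec header, the number of entry bytes, followed by the raw
        /// contents of the entry buffer.
        /// </summary>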
        protected virtual void WriteDictionary(string filename)
        {
            //new File(filename).getParentFile().mkdirs();
            System.IO.Directory.CreateDirectory(System.IO.Path.GetDirectoryName(filename));
            using (Stream os = new FileStream(filename, FileMode.Create, FileAccess.Write))
            {
                DataOutput @out = new OutputStreamDataOutput(os);
                CodecUtil.WriteHeader(@out, BinaryDictionary.DICT_HEADER, BinaryDictionary.VERSION);
                @out.WriteVInt32(m_buffer.Position);

                //WritableByteChannel channel = Channels.newChannel(os);

                // Write Buffer
                m_buffer.Flip();  // set position to 0, set limit to current position
                //channel.write(buffer);

                while (m_buffer.HasRemaining)
                {
                    @out.WriteByte(m_buffer.Get());
                }

                if (Debugging.AssertsEnabled) Debugging.Assert(m_buffer.Remaining == 0L);
            }
        }
    }
}