| Index: modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java |
| =================================================================== |
| --- modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java (revision 1231386) |
| +++ modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java (working copy) |
| @@ -5,6 +5,7 @@ |
| import java.util.Iterator; |
| |
| import org.apache.lucene.util.BytesRef; |
| +import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.fst.*; |
| |
| /** |
| @@ -219,11 +220,12 @@ |
| shareMaxTailLength, outputs, null); |
| |
| BytesRef scratch = new BytesRef(); |
| + final IntsRef scratchIntsRef = new IntsRef(); |
| int count = 0; |
| for (Iterator<BytesRef> i = sorter.iterator(); i.hasNext(); count++) { |
| BytesRef entry = i.next(); |
| if (scratch.compareTo(entry) != 0) { |
| - builder.add(entry, empty); |
| + builder.add(Util.toIntsRef(entry, scratchIntsRef), empty); |
| scratch.copyBytes(entry); |
| } |
| } |
| Index: modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java |
| =================================================================== |
| --- modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java (revision 1231386) |
| +++ modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java (working copy) |
| @@ -33,6 +33,8 @@ |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.junit.Ignore; |
| |
| +// nocommit |
| +@Ignore |
| public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase { |
| |
| //this is some text here is a link and another link . This is an entity: & plus a <. Here is an & |
| Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java |
| =================================================================== |
| --- modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java (revision 1231386) |
| +++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java (working copy) |
| @@ -33,9 +33,11 @@ |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefHash; |
| import org.apache.lucene.util.CharsRef; |
| +import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.UnicodeUtil; |
| import org.apache.lucene.util.fst.ByteSequenceOutputs; |
| import org.apache.lucene.util.fst.FST; |
| +import org.apache.lucene.util.fst.Util; |
| |
| /** |
| * A map of synonyms, keys and values are phrases. |
| @@ -262,6 +264,8 @@ |
| Set<CharsRef> keys = workingSet.keySet(); |
| CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]); |
| Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator()); |
| + |
| + final IntsRef scratchIntsRef = new IntsRef(); |
| |
| //System.out.println("fmap.build"); |
| for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) { |
| @@ -307,7 +311,7 @@ |
| |
| scratch.length = scratchOutput.getPosition() - scratch.offset; |
| //System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count); |
| - builder.add(input, BytesRef.deepCopyOf(scratch)); |
| + builder.add(Util.toUTF32(input, scratchIntsRef), BytesRef.deepCopyOf(scratch)); |
| } |
| |
| FST<BytesRef> fst = builder.finish(); |
| Index: lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java |
| =================================================================== |
| --- lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java (revision 1231386) |
| +++ lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java (working copy) |
| @@ -1050,6 +1050,7 @@ |
| } |
| Terms terms = MultiFields.getTerms(r, "body"); |
| if (terms != null) { |
| + final IntsRef scratchIntsRef = new IntsRef(); |
| final TermsEnum termsEnum = terms.iterator(null); |
| if (VERBOSE) { |
| System.out.println("TEST: got termsEnum=" + termsEnum); |
| @@ -1073,7 +1074,7 @@ |
| } else { |
| output = termsEnum.docFreq(); |
| } |
| - builder.add(term, outputs.get(output)); |
| + builder.add(Util.toIntsRef(term, scratchIntsRef), outputs.get(output)); |
| ord++; |
| if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) { |
| System.out.println(ord + " terms..."); |
| @@ -1373,7 +1374,7 @@ |
| public void testSingleString() throws Exception { |
| final Outputs<Object> outputs = NoOutputs.getSingleton(); |
| final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs); |
| - b.add(new BytesRef("foobar"), outputs.getNoOutput()); |
| + b.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRef()), outputs.getNoOutput()); |
| final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<Object>(b.finish()); |
| assertNull(fstEnum.seekFloor(new BytesRef("foo"))); |
| assertNull(fstEnum.seekCeil(new BytesRef("foobaz"))); |
| @@ -1395,9 +1396,9 @@ |
| final BytesRef b = new BytesRef("b"); |
| final BytesRef c = new BytesRef("c"); |
| |
| - builder.add(a, outputs.get(17)); |
| - builder.add(b, outputs.get(42)); |
| - builder.add(c, outputs.get(13824324872317238L)); |
| + builder.add(Util.toIntsRef(a, new IntsRef()), outputs.get(17)); |
| + builder.add(Util.toIntsRef(b, new IntsRef()), outputs.get(42)); |
| + builder.add(Util.toIntsRef(c, new IntsRef()), outputs.get(13824324872317238L)); |
| |
| final FST<Long> fst = builder.finish(); |
| |
| @@ -1628,13 +1629,14 @@ |
| |
| int line = 0; |
| final BytesRef term = new BytesRef(); |
| + final IntsRef scratchIntsRef = new IntsRef(); |
| while (line < lines.length) { |
| String w = lines[line++]; |
| if (w == null) { |
| break; |
| } |
| term.copyChars(w); |
| - b.add(term, nothing); |
| + b.add(Util.toIntsRef(term, scratchIntsRef), nothing); |
| } |
| |
| return b.finish(); |
| @@ -1698,8 +1700,8 @@ |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); |
| |
| final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null); |
| - builder.add("stat", outputs.get(17)); |
| - builder.add("station", outputs.get(10)); |
| + builder.add(Util.toUTF32("stat", new IntsRef()), outputs.get(17)); |
| + builder.add(Util.toUTF32("station", new IntsRef()), outputs.get(10)); |
| final FST<Long> fst = builder.finish(); |
| //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); |
| StringWriter w = new StringWriter(); |
| @@ -1713,8 +1715,8 @@ |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); |
| |
| final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null); |
| - builder.add(new BytesRef("stat"), outputs.getNoOutput()); |
| - builder.add(new BytesRef("station"), outputs.getNoOutput()); |
| + builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRef()), outputs.getNoOutput()); |
| + builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRef()), outputs.getNoOutput()); |
| final FST<Long> fst = builder.finish(); |
| StringWriter w = new StringWriter(); |
| //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); |
| Index: lucene/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java (revision 1231386) |
| +++ lucene/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java (working copy) |
| @@ -51,10 +51,12 @@ |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.Bits; |
| import org.apache.lucene.util.BytesRef; |
| +import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.fst.Builder; |
| import org.apache.lucene.util.fst.ByteSequenceOutputs; |
| import org.apache.lucene.util.fst.BytesRefFSTEnum; |
| import org.apache.lucene.util.fst.FST; |
| +import org.apache.lucene.util.fst.Util; |
| |
| // TODO: would be nice to somehow allow this to act like |
| // InstantiatedIndex, by never writing to disk; ie you write |
| @@ -183,6 +185,8 @@ |
| private final BytesRef spare = new BytesRef(); |
| private byte[] finalBuffer = new byte[128]; |
| |
| + private final IntsRef scratchIntsRef = new IntsRef(); |
| + |
| @Override |
| public void finishTerm(BytesRef text, TermStats stats) throws IOException { |
| |
| @@ -213,7 +217,7 @@ |
| System.out.println(" " + Integer.toHexString(finalBuffer[i]&0xFF)); |
| } |
| } |
| - builder.add(text, BytesRef.deepCopyOf(spare)); |
| + builder.add(Util.toIntsRef(text, scratchIntsRef), BytesRef.deepCopyOf(spare)); |
| termCount++; |
| } |
| |
| Index: lucene/src/java/org/apache/lucene/codecs/VariableGapTermsIndexReader.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/codecs/VariableGapTermsIndexReader.java (revision 1231386) |
| +++ lucene/src/java/org/apache/lucene/codecs/VariableGapTermsIndexReader.java (working copy) |
| @@ -33,6 +33,7 @@ |
| import org.apache.lucene.store.IndexInput; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.CodecUtil; |
| +import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.fst.Builder; |
| import org.apache.lucene.util.fst.BytesRefFSTEnum; |
| import org.apache.lucene.util.fst.FST; |
| @@ -187,6 +188,7 @@ |
| |
| if (indexDivisor > 1) { |
| // subsample |
| + final IntsRef scratchIntsRef = new IntsRef(); |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); |
| final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs); |
| final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst); |
| @@ -194,7 +196,7 @@ |
| int count = indexDivisor; |
| while((result = fstEnum.next()) != null) { |
| if (count == indexDivisor) { |
| - builder.add(result.input, result.output); |
| + builder.add(Util.toIntsRef(result.input, scratchIntsRef), result.output); |
| count = 0; |
| } |
| count++; |
| Index: lucene/src/java/org/apache/lucene/codecs/VariableGapTermsIndexWriter.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/codecs/VariableGapTermsIndexWriter.java (revision 1231386) |
| +++ lucene/src/java/org/apache/lucene/codecs/VariableGapTermsIndexWriter.java (working copy) |
| @@ -29,9 +29,11 @@ |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.CodecUtil; |
| import org.apache.lucene.util.IOUtils; |
| +import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.fst.Builder; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.PositiveIntOutputs; |
| +import org.apache.lucene.util.fst.Util; |
| |
| /** |
| * Selects index terms according to provided pluggable |
| @@ -227,7 +229,7 @@ |
| ////System.out.println("VGW: field=" + fieldInfo.name); |
| |
| // Always put empty string in |
| - fstBuilder.add(new BytesRef(), fstOutputs.get(termsFilePointer)); |
| + fstBuilder.add(new IntsRef(), fstOutputs.get(termsFilePointer)); |
| startTermsFilePointer = termsFilePointer; |
| } |
| |
| @@ -246,6 +248,8 @@ |
| } |
| } |
| |
| + private final IntsRef scratchIntsRef = new IntsRef(); |
| + |
| @Override |
| public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException { |
| if (text.length == 0) { |
| @@ -256,7 +260,7 @@ |
| final int lengthSave = text.length; |
| text.length = indexedTermPrefixLength(lastTerm, text); |
| try { |
| - fstBuilder.add(text, fstOutputs.get(termsFilePointer)); |
| + fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), fstOutputs.get(termsFilePointer)); |
| } finally { |
| text.length = lengthSave; |
| } |
| Index: lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java (revision 1231386) |
| +++ lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java (working copy) |
| @@ -36,6 +36,7 @@ |
| import org.apache.lucene.util.Bits; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.CharsRef; |
| +import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.OpenBitSet; |
| import org.apache.lucene.util.StringHelper; |
| import org.apache.lucene.util.UnicodeUtil; |
| @@ -44,6 +45,7 @@ |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.PairOutputs; |
| import org.apache.lucene.util.fst.PositiveIntOutputs; |
| +import org.apache.lucene.util.fst.Util; |
| |
| class SimpleTextFieldsReader extends FieldsProducer { |
| |
| @@ -477,11 +479,12 @@ |
| int docFreq = 0; |
| long totalTermFreq = 0; |
| OpenBitSet visitedDocs = new OpenBitSet(); |
| + final IntsRef scratchIntsRef = new IntsRef(); |
| while(true) { |
| SimpleTextUtil.readLine(in, scratch); |
| if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) { |
| if (lastDocsStart != -1) { |
| - b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart, |
| + b.add(Util.toIntsRef(lastTerm, scratchIntsRef), new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart, |
| new PairOutputs.Pair<Long,Long>((long) docFreq, |
| posIntOutputs.get(totalTermFreq)))); |
| sumTotalTermFreq += totalTermFreq; |
| @@ -497,7 +500,7 @@ |
| totalTermFreq++; |
| } else if (StringHelper.startsWith(scratch, TERM)) { |
| if (lastDocsStart != -1) { |
| - b.add(lastTerm, new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart, |
| + b.add(Util.toIntsRef(lastTerm, scratchIntsRef), new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart, |
| new PairOutputs.Pair<Long,Long>((long) docFreq, |
| posIntOutputs.get(totalTermFreq)))); |
| } |
| Index: lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java (revision 1231386) |
| +++ lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java (working copy) |
| @@ -22,8 +22,8 @@ |
| import java.util.Comparator; |
| import java.util.List; |
| |
| +import org.apache.lucene.index.FieldInfo.IndexOptions; |
| import org.apache.lucene.index.FieldInfo; |
| -import org.apache.lucene.index.FieldInfo.IndexOptions; |
| import org.apache.lucene.index.FieldInfos; |
| import org.apache.lucene.index.IndexFileNames; |
| import org.apache.lucene.index.SegmentWriteState; |
| @@ -39,6 +39,7 @@ |
| import org.apache.lucene.util.fst.BytesRefFSTEnum; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.NoOutputs; |
| +import org.apache.lucene.util.fst.Util; |
| |
| /* |
| TODO: |
| @@ -244,6 +245,7 @@ |
| public final boolean hasTerms; |
| public final boolean isFloor; |
| public final int floorLeadByte; |
| + private final IntsRef scratchIntsRef = new IntsRef(); |
| |
| public PendingBlock(BytesRef prefix, long fp, boolean hasTerms, boolean isFloor, int floorLeadByte, List<FST<BytesRef>> subIndices) { |
| super(false); |
| @@ -294,7 +296,7 @@ |
| final byte[] bytes = new byte[(int) scratchBytes.getFilePointer()]; |
| assert bytes.length > 0; |
| scratchBytes.writeTo(bytes, 0); |
| - indexBuilder.add(prefix, new BytesRef(bytes, 0, bytes.length)); |
| + indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length)); |
| scratchBytes.reset(); |
| |
| // Copy over index for all sub-blocks |
| @@ -337,7 +339,7 @@ |
| //if (DEBUG) { |
| // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output); |
| //} |
| - builder.add(indexEnt.input, indexEnt.output); |
| + builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output); |
| } |
| } |
| } |
| @@ -853,13 +855,15 @@ |
| return postingsWriter; |
| } |
| |
| + private final IntsRef scratchIntsRef = new IntsRef(); |
| + |
| @Override |
| public void finishTerm(BytesRef text, TermStats stats) throws IOException { |
| |
| assert stats.docFreq > 0; |
| //if (DEBUG) System.out.println("BTTW.finishTerm term=" + fieldInfo.name + ":" + toString(text) + " seg=" + segment + " df=" + stats.docFreq); |
| |
| - blockBuilder.add(text, noOutputs.getNoOutput()); |
| + blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput()); |
| pending.add(new PendingTerm(BytesRef.deepCopyOf(text), stats)); |
| postingsWriter.finishTerm(stats); |
| numTerms++; |
| Index: lucene/src/java/org/apache/lucene/util/fst/Util.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/util/fst/Util.java (revision 1231386) |
| +++ lucene/src/java/org/apache/lucene/util/fst/Util.java (working copy) |
| @@ -31,10 +31,8 @@ |
| } |
| |
| /** Looks up the output for this input, or null if the |
| - * input is not accepted. FST must be |
| - * INPUT_TYPE.BYTE4. */ |
| + * input is not accepted. */ |
| public static<T> T get(FST<T> fst, IntsRef input) throws IOException { |
| - assert fst.inputType == FST.INPUT_TYPE.BYTE4; |
| |
| // TODO: would be nice not to alloc this on every lookup |
| final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>()); |
| @@ -59,78 +57,6 @@ |
| } |
| } |
| |
| - /** Logically casts input to UTF32 ints then looks up the output |
| - * or null if the input is not accepted. FST must be |
| - * INPUT_TYPE.BYTE4. */ |
| - public static<T> T get(FST<T> fst, char[] input, int offset, int length) throws IOException { |
| - assert fst.inputType == FST.INPUT_TYPE.BYTE4; |
| - |
| - // TODO: would be nice not to alloc this on every lookup |
| - final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>()); |
| - |
| - int charIdx = offset; |
| - final int charLimit = offset + length; |
| - |
| - // Accumulate output as we go |
| - final T NO_OUTPUT = fst.outputs.getNoOutput(); |
| - T output = NO_OUTPUT; |
| - while(charIdx < charLimit) { |
| - final int utf32 = Character.codePointAt(input, charIdx); |
| - charIdx += Character.charCount(utf32); |
| - |
| - if (fst.findTargetArc(utf32, arc, arc) == null) { |
| - return null; |
| - } else if (arc.output != NO_OUTPUT) { |
| - output = fst.outputs.add(output, arc.output); |
| - } |
| - } |
| - |
| - if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) { |
| - return null; |
| - } else if (arc.output != NO_OUTPUT) { |
| - return fst.outputs.add(output, arc.output); |
| - } else { |
| - return output; |
| - } |
| - } |
| - |
| - |
| - /** Logically casts input to UTF32 ints then looks up the output |
| - * or null if the input is not accepted. FST must be |
| - * INPUT_TYPE.BYTE4. */ |
| - public static<T> T get(FST<T> fst, CharSequence input) throws IOException { |
| - assert fst.inputType == FST.INPUT_TYPE.BYTE4; |
| - |
| - // TODO: would be nice not to alloc this on every lookup |
| - final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>()); |
| - |
| - int charIdx = 0; |
| - final int charLimit = input.length(); |
| - |
| - // Accumulate output as we go |
| - final T NO_OUTPUT = fst.outputs.getNoOutput(); |
| - T output = NO_OUTPUT; |
| - |
| - while(charIdx < charLimit) { |
| - final int utf32 = Character.codePointAt(input, charIdx); |
| - charIdx += Character.charCount(utf32); |
| - |
| - if (fst.findTargetArc(utf32, arc, arc) == null) { |
| - return null; |
| - } else if (arc.output != NO_OUTPUT) { |
| - output = fst.outputs.add(output, arc.output); |
| - } |
| - } |
| - |
| - if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) { |
| - return null; |
| - } else if (arc.output != NO_OUTPUT) { |
| - return fst.outputs.add(output, arc.output); |
| - } else { |
| - return output; |
| - } |
| - } |
| - |
| /** Looks up the output for this input, or null if the |
| * input is not accepted */ |
| public static<T> T get(FST<T> fst, BytesRef input) throws IOException { |
| @@ -381,4 +307,51 @@ |
| return "0x" + Integer.toHexString(label); |
| } |
| } |
| + |
| + /** Converts the chars of the given CharSequence into Unicode |
| + * code points, stores them in the supplied scratch IntsRef (which |
| + * must not be null) starting at ints[0], and returns scratch. */ |
| + public static IntsRef toUTF32(CharSequence s, IntsRef scratch) { |
| + final int end = s.length(); |
| + int count = 0; |
| + int pos = 0; |
| + while (pos < end) { |
| + // make room for one more code point before writing it |
| + scratch.grow(count + 1); |
| + final int codePoint = Character.codePointAt(s, pos); |
| + scratch.ints[count++] = codePoint; |
| + pos += Character.charCount(codePoint); |
| + } |
| + scratch.length = count; |
| + return scratch; |
| + } |
| + |
| + /** Decodes the Unicode codepoints from the char[] slice |
| + * [offset, offset+length) into the provided scratch IntsRef (must not be |
| + * null) and returns it; a surrogate pair is never read past the slice end. */ |
| + public static IntsRef toUTF32(char[] s, int offset, int length, IntsRef scratch) { |
| + int charIdx = offset; |
| + int intIdx = 0; |
| + final int charLimit = offset + length; |
| + while(charIdx < charLimit) { |
| + scratch.grow(intIdx+1); |
| + final int utf32 = Character.codePointAt(s, charIdx, charLimit); |
| + scratch.ints[intIdx] = utf32; |
| + charIdx += Character.charCount(utf32); |
| + intIdx++; |
| + } |
| + scratch.length = intIdx; |
| + return scratch; |
| + } |
| + |
| + /** Copies each byte of the BytesRef slice, read as an unsigned value |
| + * (0-255), into the scratch IntsRef starting at ints[0]; returns scratch. */ |
| + public static IntsRef toIntsRef(BytesRef input, IntsRef scratch) { |
| + scratch.grow(input.length); |
| + for (int dest = 0, src = input.offset; dest < input.length; dest++, src++) { |
| + scratch.ints[dest] = input.bytes[src] & 0xFF; |
| + } |
| + scratch.length = input.length; |
| + return scratch; |
| + } |
| } |
| Index: lucene/src/java/org/apache/lucene/util/fst/Builder.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/util/fst/Builder.java (revision 1231386) |
| +++ lucene/src/java/org/apache/lucene/util/fst/Builder.java (working copy) |
| @@ -19,7 +19,6 @@ |
| |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.RamUsageEstimator; |
| -import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc |
| |
| @@ -290,54 +289,6 @@ |
| } |
| } |
| |
| - private final IntsRef scratchIntsRef = new IntsRef(10); |
| - |
| - public void add(BytesRef input, T output) throws IOException { |
| - assert fst.getInputType() == FST.INPUT_TYPE.BYTE1; |
| - scratchIntsRef.grow(input.length); |
| - for(int i=0;i<input.length;i++) { |
| - scratchIntsRef.ints[i] = input.bytes[i+input.offset] & 0xFF; |
| - } |
| - scratchIntsRef.length = input.length; |
| - add(scratchIntsRef, output); |
| - } |
| - |
| - /** Sugar: adds the UTF32 codepoints from char[] slice. FST |
| - * must be FST.INPUT_TYPE.BYTE4! */ |
| - public void add(char[] s, int offset, int length, T output) throws IOException { |
| - assert fst.getInputType() == FST.INPUT_TYPE.BYTE4; |
| - int charIdx = offset; |
| - int intIdx = 0; |
| - final int charLimit = offset + length; |
| - while(charIdx < charLimit) { |
| - scratchIntsRef.grow(intIdx+1); |
| - final int utf32 = Character.codePointAt(s, charIdx); |
| - scratchIntsRef.ints[intIdx] = utf32; |
| - charIdx += Character.charCount(utf32); |
| - intIdx++; |
| - } |
| - scratchIntsRef.length = intIdx; |
| - add(scratchIntsRef, output); |
| - } |
| - |
| - /** Sugar: adds the UTF32 codepoints from CharSequence. FST |
| - * must be FST.INPUT_TYPE.BYTE4! */ |
| - public void add(CharSequence s, T output) throws IOException { |
| - assert fst.getInputType() == FST.INPUT_TYPE.BYTE4; |
| - int charIdx = 0; |
| - int intIdx = 0; |
| - final int charLimit = s.length(); |
| - while(charIdx < charLimit) { |
| - scratchIntsRef.grow(intIdx+1); |
| - final int utf32 = Character.codePointAt(s, charIdx); |
| - scratchIntsRef.ints[intIdx] = utf32; |
| - charIdx += Character.charCount(utf32); |
| - intIdx++; |
| - } |
| - scratchIntsRef.length = intIdx; |
| - add(scratchIntsRef, output); |
| - } |
| - |
| // for debugging |
| /* |
| private String toString(BytesRef b) { |