| /* |
| * This software was produced for the U. S. Government |
| * under Contract No. W15P7T-11-C-F600, and is |
| * subject to the Rights in Noncommercial Computer Software |
| * and Noncommercial Computer Software Documentation |
| * Clause 252.227-7014 (JUN 1995) |
| * |
| * Copyright 2013 The MITRE Corporation. All Rights Reserved. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.solr.handler.tagger; |
| |
| import java.io.IOException; |
| import org.apache.lucene.util.BytesRef; |
| |
| /** |
| * This is a Tag -- a startOffset, endOffset and value. |
| * |
| * <p>A Tag starts without a value in an "advancing" state. {@link |
| * #advance(org.apache.lucene.util.BytesRef, int)} is called with subsequent words and then |
| * eventually it won't advance any more, and value is set (could be null). |
| * |
| * <p>A Tag is also a doubly-linked-list (hence the LL in the name). All tags share a reference to |
| * the head via a 1-element array, which is potentially modified if any of the linked-list methods |
| * are called. Tags in the list should have equal or increasing start offsets. |
| */ |
| public class TagLL { |
| |
| private final TagLL[] head; // a shared pointer to the head; 1 element |
| TagLL prevTag, nextTag; // linked list |
| |
| private TermPrefixCursor cursor; |
| |
| final int startOffset; // inclusive |
| int endOffset; // exclusive |
| Object value; // null means unset |
| |
| /** optional boolean used by some TagClusterReducer's */ |
| boolean mark = false; |
| |
| TagLL(TagLL[] head, TermPrefixCursor cursor, int startOffset, int endOffset, Object value) { |
| this.head = head; |
| this.cursor = cursor; |
| this.startOffset = startOffset; |
| this.endOffset = endOffset; |
| this.value = value; |
| } |
| |
| /** |
| * Advances this tag with "word" at offset "offset". If this tag is not in an advancing state then |
| * it does nothing. If it is advancing and prior to advancing further it sees a value, then a |
| * non-advancing tag may be inserted into the LL as side-effect. If this returns false (it didn't |
| * advance) and if there is no value, then it will also be removed. |
| * |
| * @param word The next word or null if at an end |
| * @param offset The last character in word's offset in the underlying stream. If word is null |
| * then it's meaningless. |
| * @return Whether it advanced or not. |
| */ |
| boolean advance(BytesRef word, int offset) throws IOException { |
| if (!isAdvancing()) return false; |
| |
| Object iVal = cursor.getDocIds(); |
| |
| if (word != null && cursor.advance(word)) { |
| |
| if (iVal != null) { |
| addBeforeLL(new TagLL(head, null, startOffset, endOffset, iVal)); |
| } |
| |
| assert offset >= endOffset; |
| endOffset = offset; |
| return true; |
| } else { |
| this.value = iVal; |
| this.cursor = null; |
| if (iVal == null) removeLL(); |
| return false; |
| } |
| } |
| |
| /** |
| * Removes this tag from the chain, connecting prevTag and nextTag. Does not modify "this" |
| * object's pointers, so the caller can refer to nextTag after removing it. |
| */ |
| public void removeLL() { |
| if (head[0] == this) head[0] = nextTag; |
| if (prevTag != null) { |
| prevTag.nextTag = nextTag; |
| } |
| if (nextTag != null) { |
| nextTag.prevTag = prevTag; |
| } |
| } |
| |
| void addBeforeLL(TagLL tag) { |
| assert tag.startOffset <= startOffset; |
| if (prevTag != null) { |
| assert prevTag.startOffset <= tag.startOffset; |
| prevTag.nextTag = tag; |
| tag.prevTag = prevTag; |
| } else { |
| assert head[0] == this; |
| head[0] = tag; |
| } |
| prevTag = tag; |
| tag.nextTag = this; |
| } |
| |
| void addAfterLL(TagLL tag) { |
| assert tag.startOffset >= startOffset; |
| if (nextTag != null) { |
| assert nextTag.startOffset >= tag.startOffset; |
| nextTag.prevTag = tag; |
| tag.nextTag = nextTag; |
| } |
| nextTag = tag; |
| tag.prevTag = this; |
| } |
| |
| public int charLen() { |
| return endOffset - startOffset; |
| } |
| |
| public TagLL getNextTag() { |
| return nextTag; |
| } |
| |
| public TagLL getPrevTag() { |
| return prevTag; |
| } |
| |
| public int getStartOffset() { |
| return startOffset; |
| } |
| |
| public int getEndOffset() { |
| return endOffset; |
| } |
| |
| public boolean overlaps(TagLL other) { |
| // don't use >= or <= because startOffset is inclusive while endOffset is exclusive |
| if (startOffset < other.startOffset) return endOffset > other.startOffset; |
| else return startOffset < other.endOffset; |
| } |
| |
| boolean isAdvancing() { |
| return cursor != null; |
| } |
| |
| @Override |
| public String toString() { |
| return (prevTag != null ? '*' : '-') |
| + "|" |
| + (nextTag != null ? '*' : '-') |
| + " " |
| + startOffset |
| + " to " |
| + endOffset |
| + (isAdvancing() ? '+' : " #" + value); |
| } |
| } |