blob: b09a36de18a7dcda222d9194e65b14c23ebe0bc2 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.luke.models.documents;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.OptionalInt;
import java.util.stream.Collectors;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.luke.util.BytesRefUtils;
/**
 * Immutable holder for a term vector entry: the term text, its number of occurrences and,
 * optionally, its positions (with offsets) in the document field.
 */
public final class TermVectorEntry {

  private final String termText;
  private final long freq;
  private final List<TermVectorPosition> positions;

  /**
   * Returns a new term vector entry representing the specified term, and optionally, positions.
   *
   * <p>The term text is decoded from {@code te.term()}, the frequency is taken from {@code
   * te.totalTermFreq()}, and one {@link TermVectorPosition} is collected for every non-negative
   * position reported by the postings of the current term.
   *
   * @param te - positioned terms iterator
   * @return term vector entry
   * @throws IOException - if there is a low level IO error.
   * @throws NullPointerException - if {@code te} is null.
   */
  static TermVectorEntry of(TermsEnum te) throws IOException {
    Objects.requireNonNull(te, "te");

    String termText = BytesRefUtils.decode(te.term());

    List<TermVectorPosition> tvPositions = new ArrayList<>();
    PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
    // Move onto the first posting before reading freq/positions.
    // NOTE(review): presumably a term vector exposes exactly one document here - confirm.
    pe.nextDoc();
    int freq = pe.freq();
    for (int i = 0; i < freq; i++) {
      int pos = pe.nextPosition();
      if (pos < 0) {
        // no position information available
        continue;
      }
      tvPositions.add(TermVectorPosition.of(pos, pe));
    }

    return new TermVectorEntry(termText, te.totalTermFreq(), tvPositions);
  }

  private TermVectorEntry(String termText, long freq, List<TermVectorPosition> positions) {
    this.termText = termText;
    this.freq = freq;
    // Wrap once so that getPositions() never hands out a list callers could mutate.
    this.positions = Collections.unmodifiableList(positions);
  }

  /** Returns the string representation for this term. */
  public String getTermText() {
    return termText;
  }

  /** Returns the number of occurrences of this term in the document field. */
  public long getFreq() {
    return freq;
  }

  /**
   * Returns the list of positions for this term in the document field. The returned list is
   * unmodifiable; it is empty if no position information was collected.
   */
  public List<TermVectorPosition> getPositions() {
    return positions;
  }

  @Override
  public String toString() {
    String positionsStr =
        positions.stream().map(TermVectorPosition::toString).collect(Collectors.joining(","));
    return "TermVectorEntry{"
        + "termText='"
        + termText
        + '\''
        + ", freq="
        + freq
        + ", positions="
        + positionsStr
        + '}';
  }

  /** Holder for position information for a term vector entry. */
  public static final class TermVectorPosition {

    private final int position;
    // Negative values mean "no offset recorded"; see getStartOffset() / getEndOffset().
    private final int startOffset;
    private final int endOffset;

    /**
     * Returns a new position entry representing the specified posting, and optionally, start and
     * end offsets. Offsets are kept only when both the start and the end offset reported by
     * {@code pe} are non-negative.
     *
     * @param pos - term position
     * @param pe - positioned postings iterator
     * @return position entry
     * @throws IOException - if there is a low level IO error.
     * @throws NullPointerException - if {@code pe} is null.
     */
    static TermVectorPosition of(int pos, PostingsEnum pe) throws IOException {
      Objects.requireNonNull(pe, "pe");
      int sOffset = pe.startOffset();
      int eOffset = pe.endOffset();
      if (sOffset >= 0 && eOffset >= 0) {
        return new TermVectorPosition(pos, sOffset, eOffset);
      }
      return new TermVectorPosition(pos);
    }

    /** Returns the position for this term in the document field. */
    public int getPosition() {
      return position;
    }

    /**
     * Returns the start offset for this term in the document field. Empty Optional instance is
     * returned if no offset information available.
     */
    public OptionalInt getStartOffset() {
      return startOffset >= 0 ? OptionalInt.of(startOffset) : OptionalInt.empty();
    }

    /**
     * Returns the end offset for this term in the document field. Empty Optional instance is
     * returned if no offset information available.
     */
    public OptionalInt getEndOffset() {
      return endOffset >= 0 ? OptionalInt.of(endOffset) : OptionalInt.empty();
    }

    /** Returns a debug string; absent offsets are printed as their raw value of -1. */
    @Override
    public String toString() {
      return "TermVectorPosition{"
          + "position="
          + position
          + ", startOffset="
          + startOffset
          + ", endOffset="
          + endOffset
          + '}';
    }

    /** Creates a position entry without offset information (offsets stored as -1). */
    private TermVectorPosition(int position) {
      this(position, -1, -1);
    }

    private TermVectorPosition(int position, int startOffset, int endOffset) {
      this.position = position;
      this.startOffset = startOffset;
      this.endOffset = endOffset;
    }
  }
}