// blob: 643d299f1be019450a519cb59e84cac25d75b119 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.luke.models.documents;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.OptionalInt;
import java.util.stream.Collectors;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.luke.util.BytesRefUtils;
/**
 * Holder for a term vector entry representing a term and its number of occurrences, and
 * optionally, its positions in the document field.
 *
 * <p>All fields are final and the position list is exposed as an unmodifiable view, so an
 * instance cannot be altered through its public API once created.
 */
public final class TermVectorEntry {

  private final String termText;
  private final long freq;
  private final List<TermVectorPosition> positions;

  /**
   * Returns a new term vector entry representing the specified term, and optionally, positions.
   *
   * <p>The term text is decoded from {@code te.term()}, the frequency is taken from
   * {@code te.totalTermFreq()}, and one {@link TermVectorPosition} is collected for every
   * occurrence that reports a non-negative position.
   *
   * @param te - positioned terms iterator
   * @return term vector entry
   * @throws IOException - if there is a low level IO error.
   * @throws NullPointerException - if {@code te} is null.
   */
  static TermVectorEntry of(TermsEnum te) throws IOException {
    Objects.requireNonNull(te);

    String termText = BytesRefUtils.decode(te.term());

    List<TermVectorEntry.TermVectorPosition> tvPositions = new ArrayList<>();
    // Request offsets so that start/end offsets can be read alongside each position.
    PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
    // NOTE(review): only the first posting is consumed; presumably the enum comes from a
    // single document's term vector, so exactly one posting exists - confirm against callers.
    pe.nextDoc();
    int freq = pe.freq();
    for (int i = 0; i < freq; i++) {
      int pos = pe.nextPosition();
      if (pos < 0) {
        // no position information available
        continue;
      }
      TermVectorPosition tvPos = TermVectorPosition.of(pos, pe);
      tvPositions.add(tvPos);
    }

    return new TermVectorEntry(termText, te.totalTermFreq(), tvPositions);
  }

  /**
   * Creates an entry from already extracted values.
   *
   * @param termText - decoded term text
   * @param freq - number of occurrences of the term
   * @param positions - positions collected for the term; may be empty
   */
  private TermVectorEntry(String termText, long freq, List<TermVectorPosition> positions) {
    this.termText = termText;
    this.freq = freq;
    // Wrap the list so that callers of getPositions() cannot mutate this holder's state.
    this.positions = Collections.unmodifiableList(positions);
  }

  /**
   * Returns the string representation for this term.
   */
  public String getTermText() {
    return termText;
  }

  /**
   * Returns the number of occurrences of this term in the document field.
   */
  public long getFreq() {
    return freq;
  }

  /**
   * Returns the list of positions for this term in the document field.
   * The returned list is unmodifiable, and is empty if no position information is available.
   */
  public List<TermVectorPosition> getPositions() {
    return positions;
  }

  @Override
  public String toString() {
    String positionsStr = positions.stream()
        .map(TermVectorPosition::toString)
        .collect(Collectors.joining(","));

    return "TermVectorEntry{" +
        "termText='" + termText + '\'' +
        ", freq=" + freq +
        ", positions=" + positionsStr +
        '}';
  }

  /**
   * Holder for position information for a term vector entry.
   */
  public static final class TermVectorPosition {

    private final int position;
    // -1 means "no offset information"; see getStartOffset() / getEndOffset()
    private final int startOffset;
    private final int endOffset;

    /**
     * Returns a new position entry representing the specified posting, and optionally, start and end offsets.
     *
     * <p>Offsets are recorded only when both the start and the end offset reported by
     * {@code pe} are non-negative; otherwise the entry carries no offset information.
     *
     * @param pos - term position
     * @param pe - positioned postings iterator
     * @return position entry
     * @throws IOException - if there is a low level IO error.
     * @throws NullPointerException - if {@code pe} is null.
     */
    static TermVectorPosition of(int pos, PostingsEnum pe) throws IOException {
      Objects.requireNonNull(pe);

      int sOffset = pe.startOffset();
      int eOffset = pe.endOffset();

      if (sOffset >= 0 && eOffset >= 0) {
        return new TermVectorPosition(pos, sOffset, eOffset);
      }
      return new TermVectorPosition(pos);
    }

    /**
     * Returns the position for this term in the document field.
     */
    public int getPosition() {
      return position;
    }

    /**
     * Returns the start offset for this term in the document field.
     * Empty Optional instance is returned if no offset information available.
     */
    public OptionalInt getStartOffset() {
      return startOffset >= 0 ? OptionalInt.of(startOffset) : OptionalInt.empty();
    }

    /**
     * Returns the end offset for this term in the document field.
     * Empty Optional instance is returned if no offset information available.
     */
    public OptionalInt getEndOffset() {
      return endOffset >= 0 ? OptionalInt.of(endOffset) : OptionalInt.empty();
    }

    @Override
    public String toString() {
      return "TermVectorPosition{" +
          "position=" + position +
          ", startOffset=" + startOffset +
          ", endOffset=" + endOffset +
          '}';
    }

    /**
     * Creates a position entry without offset information (both offsets are stored as -1).
     */
    private TermVectorPosition(int position) {
      this(position, -1, -1);
    }

    /**
     * Creates a position entry with the given position and offsets.
     */
    private TermVectorPosition(int position, int startOffset, int endOffset) {
      this.position = position;
      this.startOffset = startOffset;
      this.endOffset = endOffset;
    }
  }
}