| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.lucene.luke.models.documents; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.List; |
| import java.util.Objects; |
| import java.util.OptionalInt; |
| import java.util.stream.Collectors; |
| import org.apache.lucene.index.PostingsEnum; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.luke.util.BytesRefUtils; |
| |
| /** |
| * Holder for term vector entry representing the term and their number of occurrences, and |
| * optionally, positions in the document field. |
| */ |
| public final class TermVectorEntry { |
| |
| private final String termText; |
| private final long freq; |
| private final List<TermVectorPosition> positions; |
| |
| /** |
| * Returns a new term vector entry representing the specified term, and optionally, positions. |
| * |
| * @param te - positioned terms iterator |
| * @return term vector entry |
| * @throws IOException - if there is a low level IO error. |
| */ |
| static TermVectorEntry of(TermsEnum te) throws IOException { |
| Objects.requireNonNull(te); |
| |
| String termText = BytesRefUtils.decode(te.term()); |
| |
| List<TermVectorEntry.TermVectorPosition> tvPositions = new ArrayList<>(); |
| PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS); |
| pe.nextDoc(); |
| int freq = pe.freq(); |
| for (int i = 0; i < freq; i++) { |
| int pos = pe.nextPosition(); |
| if (pos < 0) { |
| // no position information available |
| continue; |
| } |
| TermVectorPosition tvPos = TermVectorPosition.of(pos, pe); |
| tvPositions.add(tvPos); |
| } |
| |
| return new TermVectorEntry(termText, te.totalTermFreq(), tvPositions); |
| } |
| |
| private TermVectorEntry(String termText, long freq, List<TermVectorPosition> positions) { |
| this.termText = termText; |
| this.freq = freq; |
| this.positions = positions; |
| } |
| |
| /** Returns the string representation for this term. */ |
| public String getTermText() { |
| return termText; |
| } |
| |
| /** Returns the number of occurrences of this term in the document field. */ |
| public long getFreq() { |
| return freq; |
| } |
| |
| /** Returns the list of positions for this term in the document field. */ |
| public List<TermVectorPosition> getPositions() { |
| return positions; |
| } |
| |
| @Override |
| public String toString() { |
| String positionsStr = |
| positions.stream().map(TermVectorPosition::toString).collect(Collectors.joining(",")); |
| |
| return "TermVectorEntry{" |
| + "termText='" |
| + termText |
| + '\'' |
| + ", freq=" |
| + freq |
| + ", positions=" |
| + positionsStr |
| + '}'; |
| } |
| |
| /** Holder for position information for a term vector entry. */ |
| public static final class TermVectorPosition { |
| private final int position; |
| private final int startOffset; |
| private final int endOffset; |
| |
| /** |
| * Returns a new position entry representing the specified posting, and optionally, start and |
| * end offsets. |
| * |
| * @param pos - term position |
| * @param pe - positioned postings iterator |
| * @return position entry |
| * @throws IOException - if there is a low level IO error. |
| */ |
| static TermVectorPosition of(int pos, PostingsEnum pe) throws IOException { |
| Objects.requireNonNull(pe); |
| |
| int sOffset = pe.startOffset(); |
| int eOffset = pe.endOffset(); |
| if (sOffset >= 0 && eOffset >= 0) { |
| return new TermVectorPosition(pos, sOffset, eOffset); |
| } |
| return new TermVectorPosition(pos); |
| } |
| |
| /** Returns the position for this term in the document field. */ |
| public int getPosition() { |
| return position; |
| } |
| |
| /** |
| * Returns the start offset for this term in the document field. Empty Optional instance is |
| * returned if no offset information available. |
| */ |
| public OptionalInt getStartOffset() { |
| return startOffset >= 0 ? OptionalInt.of(startOffset) : OptionalInt.empty(); |
| } |
| |
| /** |
| * Returns the end offset for this term in the document field. Empty Optional instance is |
| * returned if no offset information available. |
| */ |
| public OptionalInt getEndOffset() { |
| return endOffset >= 0 ? OptionalInt.of(endOffset) : OptionalInt.empty(); |
| } |
| |
| @Override |
| public String toString() { |
| return "TermVectorPosition{" |
| + "position=" |
| + position |
| + ", startOffset=" |
| + startOffset |
| + ", endOffset=" |
| + endOffset |
| + '}'; |
| } |
| |
| private TermVectorPosition(int position) { |
| this(position, -1, -1); |
| } |
| |
| private TermVectorPosition(int position, int startOffset, int endOffset) { |
| this.position = position; |
| this.startOffset = startOffset; |
| this.endOffset = endOffset; |
| } |
| } |
| } |