| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.facet.taxonomy.directory; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.List; |
| |
| import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays; |
| import org.apache.lucene.facet.taxonomy.TaxonomyReader; |
| import org.apache.lucene.index.CorruptIndexException; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.MultiTerms; |
| import org.apache.lucene.index.PostingsEnum; |
| import org.apache.lucene.search.DocIdSetIterator; |
| import org.apache.lucene.util.Accountable; |
| import org.apache.lucene.util.Accountables; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.RamUsageEstimator; |
| |
| /** |
| * A {@link ParallelTaxonomyArrays} that are initialized from the taxonomy |
| * index. |
| * |
| * @lucene.experimental |
| */ |
| class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable { |
| |
| private final int[] parents; |
| |
| // the following two arrays are lazily initialized. note that we only keep a |
| // single boolean member as volatile, instead of declaring the arrays |
| // volatile. the code guarantees that only after the boolean is set to true, |
| // the arrays are returned. |
| private volatile boolean initializedChildren = false; |
| private int[] children, siblings; |
| |
| /** Used by {@link #add(int, int)} after the array grew. */ |
| private TaxonomyIndexArrays(int[] parents) { |
| this.parents = parents; |
| } |
| |
| public TaxonomyIndexArrays(IndexReader reader) throws IOException { |
| parents = new int[reader.maxDoc()]; |
| if (parents.length > 0) { |
| initParents(reader, 0); |
| // Starting Lucene 2.9, following the change LUCENE-1542, we can |
| // no longer reliably read the parent "-1" (see comment in |
| // LuceneTaxonomyWriter.SinglePositionTokenStream). We have no way |
| // to fix this in indexing without breaking backward-compatibility |
| // with existing indexes, so what we'll do instead is just |
| // hard-code the parent of ordinal 0 to be -1, and assume (as is |
| // indeed the case) that no other parent can be -1. |
| parents[0] = TaxonomyReader.INVALID_ORDINAL; |
| } |
| } |
| |
| public TaxonomyIndexArrays(IndexReader reader, TaxonomyIndexArrays copyFrom) throws IOException { |
| assert copyFrom != null; |
| |
| // note that copyParents.length may be equal to reader.maxDoc(). this is not a bug |
| // it may be caused if e.g. the taxonomy segments were merged, and so an updated |
| // NRT reader was obtained, even though nothing was changed. this is not very likely |
| // to happen. |
| int[] copyParents = copyFrom.parents(); |
| this.parents = new int[reader.maxDoc()]; |
| System.arraycopy(copyParents, 0, parents, 0, copyParents.length); |
| initParents(reader, copyParents.length); |
| |
| if (copyFrom.initializedChildren) { |
| initChildrenSiblings(copyFrom); |
| } |
| } |
| |
| private final synchronized void initChildrenSiblings(TaxonomyIndexArrays copyFrom) { |
| if (!initializedChildren) { // must do this check ! |
| children = new int[parents.length]; |
| siblings = new int[parents.length]; |
| if (copyFrom != null) { |
| // called from the ctor, after we know copyFrom has initialized children/siblings |
| System.arraycopy(copyFrom.children(), 0, children, 0, copyFrom.children().length); |
| System.arraycopy(copyFrom.siblings(), 0, siblings, 0, copyFrom.siblings().length); |
| computeChildrenSiblings(copyFrom.parents.length); |
| } else { |
| computeChildrenSiblings(0); |
| } |
| initializedChildren = true; |
| } |
| } |
| |
| private void computeChildrenSiblings(int first) { |
| // reset the youngest child of all ordinals. while this should be done only |
| // for the leaves, we don't know up front which are the leaves, so we reset |
| // all of them. |
| for (int i = first; i < parents.length; i++) { |
| children[i] = TaxonomyReader.INVALID_ORDINAL; |
| } |
| |
| // the root category has no parent, and therefore no siblings |
| if (first == 0) { |
| first = 1; |
| siblings[0] = TaxonomyReader.INVALID_ORDINAL; |
| } |
| |
| for (int i = first; i < parents.length; i++) { |
| // note that parents[i] is always < i, so the right-hand-side of |
| // the following line is already set when we get here |
| siblings[i] = children[parents[i]]; |
| children[parents[i]] = i; |
| } |
| } |
| |
| // Read the parents of the new categories |
| private void initParents(IndexReader reader, int first) throws IOException { |
| if (reader.maxDoc() == first) { |
| return; |
| } |
| |
| // it's ok to use MultiTerms because we only iterate on one posting list. |
| // breaking it to loop over the leaves() only complicates code for no |
| // apparent gain. |
| PostingsEnum positions = MultiTerms.getTermPostingsEnum(reader, |
| Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF, |
| PostingsEnum.PAYLOADS); |
| |
| // shouldn't really happen, if it does, something's wrong |
| if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) { |
| throw new CorruptIndexException("Missing parent data for category " + first, reader.toString()); |
| } |
| |
| int num = reader.maxDoc(); |
| for (int i = first; i < num; i++) { |
| if (positions.docID() == i) { |
| if (positions.freq() == 0) { // shouldn't happen |
| throw new CorruptIndexException("Missing parent data for category " + i, reader.toString()); |
| } |
| |
| parents[i] = positions.nextPosition(); |
| |
| if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) { |
| if (i + 1 < num) { |
| throw new CorruptIndexException("Missing parent data for category "+ (i + 1), reader.toString()); |
| } |
| break; |
| } |
| } else { // this shouldn't happen |
| throw new CorruptIndexException("Missing parent data for category " + i, reader.toString()); |
| } |
| } |
| } |
| |
| /** |
| * Adds the given ordinal/parent info and returns either a new instance if the |
| * underlying array had to grow, or this instance otherwise. |
| * <p> |
| * <b>NOTE:</b> you should call this method from a thread-safe code. |
| */ |
| TaxonomyIndexArrays add(int ordinal, int parentOrdinal) { |
| if (ordinal >= parents.length) { |
| int[] newarray = ArrayUtil.grow(parents, ordinal + 1); |
| newarray[ordinal] = parentOrdinal; |
| return new TaxonomyIndexArrays(newarray); |
| } |
| parents[ordinal] = parentOrdinal; |
| return this; |
| } |
| |
| /** |
| * Returns the parents array, where {@code parents[i]} denotes the parent of |
| * category ordinal {@code i}. |
| */ |
| @Override |
| public int[] parents() { |
| return parents; |
| } |
| |
| /** |
| * Returns the children array, where {@code children[i]} denotes the youngest |
| * child of category ordinal {@code i}. The youngest child is defined as the |
| * category that was added last to the taxonomy as an immediate child of |
| * {@code i}. |
| */ |
| @Override |
| public int[] children() { |
| if (!initializedChildren) { |
| initChildrenSiblings(null); |
| } |
| |
| // the array is guaranteed to be populated |
| return children; |
| } |
| |
| /** |
| * Returns the siblings array, where {@code siblings[i]} denotes the sibling |
| * of category ordinal {@code i}. The sibling is defined as the previous |
| * youngest child of {@code parents[i]}. |
| */ |
| @Override |
| public int[] siblings() { |
| if (!initializedChildren) { |
| initChildrenSiblings(null); |
| } |
| |
| // the array is guaranteed to be populated |
| return siblings; |
| } |
| |
| @Override |
| public synchronized long ramBytesUsed() { |
| long ramBytesUsed = RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + 3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF + 1; |
| ramBytesUsed += RamUsageEstimator.shallowSizeOf(parents); |
| if (children != null) { |
| ramBytesUsed += RamUsageEstimator.shallowSizeOf(children); |
| } |
| if (siblings != null) { |
| ramBytesUsed += RamUsageEstimator.shallowSizeOf(siblings); |
| } |
| return ramBytesUsed; |
| } |
| |
| @Override |
| public synchronized Collection<Accountable> getChildResources() { |
| final List<Accountable> resources = new ArrayList<>(); |
| resources.add(Accountables.namedAccountable("parents", RamUsageEstimator.shallowSizeOf(parents))); |
| if (children != null) { |
| resources.add(Accountables.namedAccountable("children", RamUsageEstimator.shallowSizeOf(children))); |
| } |
| if (siblings != null) { |
| resources.add(Accountables.namedAccountable("siblings", RamUsageEstimator.shallowSizeOf(siblings))); |
| } |
| return Collections.unmodifiableList(resources); |
| } |
| } |