| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.jena.atlas.data; |
| |
| import java.util.Comparator ; |
| import java.util.HashSet ; |
| import java.util.Iterator ; |
| |
| import org.apache.jena.atlas.iterator.Iter ; |
| import org.apache.jena.atlas.iterator.PeekIterator ; |
| import org.apache.jena.atlas.lib.Closeable ; |
| |
| /** |
| * <p> |
| * This data bag will gather distinct items in memory until a size threshold is passed, at which point it will write |
| * out all of the items to disk using the supplied serializer. |
| * </p> |
| * <p> |
| * After adding is finished, call {@link #iterator()} to set up the data bag for reading back items and iterating over them. |
| * The iterator will retrieve only distinct items. |
| * </p> |
| * <p> |
| * IMPORTANT: You may not add any more items after this call. You may subsequently call {@link #iterator()} multiple |
| * times which will give you a new iterator for each invocation. If you do not consume the entire iterator, you should |
| * call {@link Iter#close(Iterator)} to close any FileInputStreams associated with the iterator. |
| * </p> |
| * <p> |
| * Additionally, make sure to call {@link #close()} when you are finished to free any system resources (preferably in a finally block). |
| * </p> |
| * <p> |
| * Implementation Notes: Data is stored without duplicates as it comes in in a HashSet. When it is time to spill, |
| * that data is sorted and written to disk. An iterator that eliminates adjacent duplicates is used in conjunction |
| * with the SortedDataBag's iterator. |
| * </p> |
| */ |
| public class DistinctDataBag<E> extends SortedDataBag<E> |
| { |
| public DistinctDataBag(ThresholdPolicy<E> policy, SerializationFactory<E> serializerFactory, Comparator<E> comparator) |
| { |
| super(policy, serializerFactory, comparator); |
| this.memory = new HashSet<>(); |
| } |
| |
| @Override |
| public boolean isSorted() |
| { |
| // The bag may not be sorted if we havn't spilled |
| return false; |
| } |
| |
| @Override |
| public boolean isDistinct() |
| { |
| return true; |
| } |
| |
| @Override |
| public Iterator<E> iterator() |
| { |
| // We could just return super.iterator() in all cases, |
| // but no need to waste time sorting if we havn't spilled |
| if (!spilled) |
| { |
| checkClosed(); |
| finishedAdding = true; |
| |
| if (memory.size() > 0) |
| { |
| return memory.iterator(); |
| } |
| else |
| { |
| return Iter.nullIterator(); |
| } |
| } |
| else |
| { |
| return new DistinctReducedIterator<>(super.iterator()); |
| } |
| } |
| |
| protected static class DistinctReducedIterator<T> extends PeekIterator<T> implements Closeable |
| { |
| private Iterator<T> iter; |
| |
| public DistinctReducedIterator(Iterator<T> iter) |
| { |
| super(iter); |
| this.iter = iter; |
| } |
| |
| @Override |
| public T next() |
| { |
| T item = super.next(); |
| |
| // Keep going until as long as the next item is the same as the current one |
| while (hasNext() && ((null == item && null == peek()) || (null != item && item.equals(peek())))) |
| { |
| item = super.next(); |
| } |
| |
| return item; |
| } |
| |
| @Override |
| public void close() |
| { |
| Iter.close(iter); |
| } |
| |
| } |
| |
| } |