jena-arq/src/main/java/org/apache/jena/atlas/data/DistinctDataBag.java - jena - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.jena.atlas.data;

 import java.util.Comparator ;
 import java.util.HashSet ;
 import java.util.Iterator ;

 import org.apache.jena.atlas.iterator.Iter ;
 import org.apache.jena.atlas.iterator.PeekIterator ;
 import org.apache.jena.atlas.lib.Closeable ;

 /**
  * A {@link SortedDataBag} variant that holds only distinct items.
  * <p>
  * Items are collected in memory, without duplicates, until a size threshold is passed; at that point
  * all of the items are written out to disk using the supplied serializer.
  * </p>
  * <p>
  * Once adding is finished, call {@link #iterator()} to prepare the bag for reading and to iterate over
  * its contents.  The iterator yields distinct items only.
  * </p>
  * <p>
  * IMPORTANT: No further items may be added after that call.  {@link #iterator()} may be called again
  * any number of times, and each call produces a new iterator.  An iterator that is not run to the end
  * should be passed to {@link Iter#close(Iterator)} so that any FileInputStreams associated with it are
  * closed.
  * </p>
  * <p>
  * Also call {@link #close()} when the bag is no longer needed (preferably in a finally block) to free
  * any system resources.
  * </p>
  * <p>
  * Implementation Notes: Incoming data is kept duplicate-free in a HashSet.  On a spill the data is sorted
  * and written to disk.  Reading back combines the SortedDataBag's iterator with an iterator that drops
  * adjacent duplicates.
  * </p>
  *
  * @param <E> the type of item held in the bag
  */
 public class DistinctDataBag<E> extends SortedDataBag<E>
 {
     /**
      * Creates an empty bag whose in-memory store is a {@link HashSet}.
      *
      * @param policy            passed through to the {@link SortedDataBag} constructor
      * @param serializerFactory passed through to the {@link SortedDataBag} constructor
      * @param comparator        passed through to the {@link SortedDataBag} constructor
      */
     public DistinctDataBag(ThresholdPolicy<E> policy, SerializationFactory<E> serializerFactory, Comparator<E> comparator)
     {
         super(policy, serializerFactory, comparator);
         // Swap the inherited in-memory store for a set so duplicates are dropped as items arrive.
         this.memory = new HashSet<>();
     }

     /**
      * Reports whether iteration order is sorted.
      *
      * @return always {@code false}: while nothing has been spilled the items come straight out of
      *         the HashSet, so no ordering can be promised
      */
     @Override
     public boolean isSorted()
     {
         return false;
     }

     /**
      * Reports whether the bag holds distinct items only.
      *
      * @return always {@code true}
      */
     @Override
     public boolean isDistinct()
     {
         return true;
     }

     /**
      * Returns an iterator over the distinct items of this bag.
      * <p>
      * When data has been spilled, the superclass iterator is wrapped so that adjacent duplicates are
      * removed.  Otherwise the in-memory set is iterated directly, which avoids sorting for no benefit.
      * </p>
      *
      * @return a new iterator over the distinct items; an empty iterator when the bag holds nothing
      */
     @Override
     public Iterator<E> iterator()
     {
         if (spilled)
         {
             return new DistinctReducedIterator<>(super.iterator());
         }

         // Nothing was spilled: delegating to super.iterator() would also work, but would sort needlessly.
         checkClosed();
         finishedAdding = true;

         if (memory.isEmpty())
         {
             return Iter.nullIterator();
         }
         return memory.iterator();
     }

     /**
      * Iterator wrapper that collapses each run of adjacent equal items into a single item.
      *
      * @param <T> the type of item produced
      */
     protected static class DistinctReducedIterator<T> extends PeekIterator<T> implements Closeable
     {
         private Iterator<T> underlying;

         /**
          * @param iter the iterator to read from; it is also the iterator closed by {@link #close()}
          */
         public DistinctReducedIterator(Iterator<T> iter)
         {
             super(iter);
             underlying = iter;
         }

         /**
          * Returns the next item, first consuming every immediately following item that is equal to it
          * (two nulls count as equal).
          */
         @Override
         public T next()
         {
             T current = super.next();

             // Consume the remainder of the run of duplicates that starts at 'current'.
             while (hasNext() && sameItem(current, peek()))
             {
                 current = super.next();
             }

             return current;
         }

         /** Closes the wrapped iterator via {@link Iter#close(Iterator)}. */
         @Override
         public void close()
         {
             Iter.close(underlying);
         }

         /** Null-tolerant equality: two nulls match, otherwise defers to {@code left.equals(right)}. */
         private static <X> boolean sameItem(X left, X right)
         {
             return (left == null) ? (right == null) : left.equals(right);
         }

     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.jena.atlas.data;

	import java.util.Comparator ;
	import java.util.HashSet ;
	import java.util.Iterator ;

	import org.apache.jena.atlas.iterator.Iter ;
	import org.apache.jena.atlas.iterator.PeekIterator ;
	import org.apache.jena.atlas.lib.Closeable ;

	/**
	* <p>
	* This data bag will gather distinct items in memory until a size threshold is passed, at which point it will write
	* out all of the items to disk using the supplied serializer.
	* </p>
	* <p>
	* After adding is finished, call {@link #iterator()} to set up the data bag for reading back items and iterating over them.
	* The iterator will retrieve only distinct items.
	* </p>
	* <p>
	* IMPORTANT: You may not add any more items after this call. You may subsequently call {@link #iterator()} multiple
	* times which will give you a new iterator for each invocation. If you do not consume the entire iterator, you should
	* call {@link Iter#close(Iterator)} to close any FileInputStreams associated with the iterator.
	* </p>
	* <p>
	* Additionally, make sure to call {@link #close()} when you are finished to free any system resources (preferably in a finally block).
	* </p>
	* <p>
	* Implementation Notes: Data is stored without duplicates as it comes in in a HashSet. When it is time to spill,
	* that data is sorted and written to disk. An iterator that eliminates adjacent duplicates is used in conjunction
	* with the SortedDataBag's iterator.
	* </p>
	*/
	public class DistinctDataBag<E> extends SortedDataBag<E>
	{
	public DistinctDataBag(ThresholdPolicy<E> policy, SerializationFactory<E> serializerFactory, Comparator<E> comparator)
	{
	super(policy, serializerFactory, comparator);
	this.memory = new HashSet<>();
	}

	@Override
	public boolean isSorted()
	{
	// The bag may not be sorted if we havn't spilled
	return false;
	}

	@Override
	public boolean isDistinct()
	{
	return true;
	}

	@Override
	public Iterator<E> iterator()
	{
	// We could just return super.iterator() in all cases,
	// but no need to waste time sorting if we havn't spilled
	if (!spilled)
	{
	checkClosed();
	finishedAdding = true;

	if (memory.size() > 0)
	{
	return memory.iterator();
	}
	else
	{
	return Iter.nullIterator();
	}
	}
	else
	{
	return new DistinctReducedIterator<>(super.iterator());
	}
	}

	protected static class DistinctReducedIterator<T> extends PeekIterator<T> implements Closeable
	{
	private Iterator<T> iter;

	public DistinctReducedIterator(Iterator<T> iter)
	{
	super(iter);
	this.iter = iter;
	}

	@Override
	public T next()
	{
	T item = super.next();

	// Keep going until as long as the next item is the same as the current one
	while (hasNext() && ((null == item && null == peek()) \|\| (null != item && item.equals(peek()))))
	{
	item = super.next();
	}

	return item;
	}

	@Override
	public void close()
	{
	Iter.close(iter);
	}

	}

	}