phoenix-core/src/main/java/org/apache/phoenix/hbase/index/builder/IndexBuilder.java - phoenix - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.phoenix.hbase.index.builder;

 import java.io.IOException;
 import java.util.Collection;
 import java.util.List;
 import java.util.Map;

 import org.apache.hadoop.hbase.Cell;
 import org.apache.hadoop.hbase.Stoppable;
 import org.apache.hadoop.hbase.client.Delete;
 import org.apache.hadoop.hbase.client.Increment;
 import org.apache.hadoop.hbase.client.Mutation;
 import org.apache.hadoop.hbase.client.Put;
 import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
 import org.apache.hadoop.hbase.regionserver.MiniBatchOperationInProgress;
 import org.apache.hadoop.hbase.util.Pair;
 import org.apache.phoenix.coprocessor.BaseScannerRegionObserver.ReplayWrite;
 import org.apache.phoenix.hbase.index.Indexer;
 import org.apache.phoenix.hbase.index.covered.IndexMetaData;

 /**
  * Interface to build updates ({@link Mutation}s) to the index tables, based on the primary table
  * updates.
  * <p>
  * Either all the index updates will be applied to all tables or the primary table will kill itself
  * and will attempt to replay the index edits through the WAL replay mechanism.
  */
 public interface IndexBuilder extends Stoppable {

   /**
    * Marker signature that exists only to steer people away from implementing this interface
    * directly; as the name says, extend the base index builder instead.
    */
   void extendBaseIndexBuilderInstead();

   /**
    * One-time initialization hook. Invoked exactly once when the {@link Indexer} is installed, and
    * always before the first call to {@link #getIndexUpdate}.
    * @param env environment in which this builder is running
    * @throws IOException if the builder cannot be set up
    */
   void setup(RegionCoprocessorEnvironment env) throws IOException;

   /**
    * Produces the index-table updates that correspond to an update of a primary table row.
    * Keeping timestamps consistent between the primary and index tables is the responsibility of
    * the implementation.
    * <p>
    * The argument is deliberately a generic {@link Mutation} rather than a {@link Put} or a
    * {@link Delete}, because it stands for a batch update. {@link Put}s always travel the batch
    * update code path, so a lone {@link Put} arrives here as the only update in the mutation.
    * <p>
    * Implementations must be thread-safe: expect concurrent calls for different mutations, whether
    * or not those mutations belong to the same batch.
    * @param mutation update to the primary table that needs to be indexed
    * @param context index meta data for the mutation
    * @return pairs of (mutation to make, name of the target index table) - the association a
    *         {@link Map} would express, flattened into a collection
    * @throws IOException on failure
    */
   Collection<Pair<Mutation, byte[]>> getIndexUpdate(Mutation mutation, IndexMetaData context)
       throws IOException;

   /**
    * Builds the index updates needed to clean up the index when cells are dropped from the primary
    * table by the normal flush or compaction mechanisms. No implementor currently implements this
    * and nothing calls it; it is kept so that it can be filled in if it is ever needed.
    * <p>
    * Rationale, paraphrasing the original author: this is arguably a correctness piece that should
    * be used but is not. When a compaction/flush removes a key (too old, too many versions), the
    * index table may need a matching cleanup, otherwise it can drift out of sync with the primary
    * table. For example, a row may accumulate several versions, the oldest of which eventually
    * ages off. The index table, however, only ever holds two entries for that row: the first one,
    * matching the original row, and the delete marker for the index update, written when a newer
    * version of the primary row arrived. A basic HBase scan does not show the index update because
    * the delete marker covers it, but a read at an older timestamp would still show the index row,
    * even after the primary table row is gone due to the MAX_VERSIONS requirement.
    * @param filtered {@link Cell}s that previously existed, but will not be included in further
    *          output from HBase
    * @param context index meta data; NOTE(review): left as TODO in the original documentation,
    *          presumably the same meta data that is handed to {@link #getIndexUpdate} - confirm
    * @return pairs of (mutation to make, name of the target index table)
    * @throws IOException on failure
    */
   Collection<Pair<Mutation, byte[]>> getIndexUpdateForFilteredRows(Collection<Cell> filtered,
       IndexMetaData context) throws IOException;

   /**
    * Callback signalling that a batch of updates has been written successfully.
    * @param miniBatchOp the full batch operation that was written
    */
   void batchCompleted(MiniBatchOperationInProgress<Mutation> miniBatchOp);

   /**
    * Callback signalling that a batch has been started.
    * <p>
    * Because of the way the HBase coprocessor hooks are set up, this is actually invoked
    * <i>after</i> the {@link #getIndexUpdate} methods. You will therefore likely need an attribute
    * on your {@link Put}/{@link Delete} to indicate that it is part of a batch operation.
    * @param miniBatchOp the full batch operation to be written
    * @param context index meta data; NOTE(review): left as TODO in the original documentation,
    *          presumably the value obtained from {@link #getIndexMetaData} - confirm
    * @throws IOException on failure
    */
   void batchStarted(MiniBatchOperationInProgress<Mutation> miniBatchOp, IndexMetaData context)
       throws IOException;

   /**
    * Supplies the {@link IndexMetaData} for a batch operation.
    * <p>
    * NOTE(review): this method was undocumented; presumably the result is what callers pass as the
    * {@code context} argument of the other methods - confirm against the callers.
    * @param miniBatchOp the batch operation to obtain index meta data for
    * @return index meta data for the given batch
    * @throws IOException on failure
    */
   IndexMetaData getIndexMetaData(MiniBatchOperationInProgress<Mutation> miniBatchOp)
       throws IOException;

   /**
    * Lets the codec decide dynamically whether indexing should take place for a table. When it
    * does not, a lot of time is saved on the regular Put path; and because the decision is
    * dynamic, there is no need to offline and then online a table just to turn indexing on.
    * <p>
    * Implementations can also be selective per update: if the update contains no columns that
    * matter for indexing, the effort of analyzing the put any further can be skipped.
    * @param m mutation that should be indexed
    * @return {@code true} if indexing is enabled for the given table. This should be decided on a
    *         per-table basis, as each codec is instantiated per-region.
    */
   boolean isEnabled(Mutation m);

   /**
    * Tells whether a mutation carries an ON DUPLICATE KEY clause.
    * @param m mutation to inspect
    * @return {@code true} if the mutation has an ON DUPLICATE KEY expression, {@code false}
    *         otherwise
    */
   boolean isAtomicOp(Mutation m);

   /**
    * Computes the mutations that result from executing the ON DUPLICATE KEY clause.
    * @param inc increment to run against
    * @return list of mutations produced by executing the ON DUPLICATE KEY clause, or
    *         {@code null} if the increment does not represent an ON DUPLICATE KEY clause
    * @throws IOException on failure
    */
   List<Mutation> executeAtomicOp(Increment inc) throws IOException;

   /**
    * Returns the {@link ReplayWrite} value associated with a mutation.
    * <p>
    * NOTE(review): this method was undocumented; the exact meaning of the returned value (and
    * whether it may be {@code null}) cannot be told from this interface - confirm against
    * {@link ReplayWrite} and the implementations.
    * @param m mutation to inspect
    * @return the replay-write value for the given mutation
    */
   ReplayWrite getReplayWrite(Mutation m);
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.phoenix.hbase.index.builder;

	import java.io.IOException;
	import java.util.Collection;
	import java.util.List;
	import java.util.Map;

	import org.apache.hadoop.hbase.Cell;
	import org.apache.hadoop.hbase.Stoppable;
	import org.apache.hadoop.hbase.client.Delete;
	import org.apache.hadoop.hbase.client.Increment;
	import org.apache.hadoop.hbase.client.Mutation;
	import org.apache.hadoop.hbase.client.Put;
	import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
	import org.apache.hadoop.hbase.regionserver.MiniBatchOperationInProgress;
	import org.apache.hadoop.hbase.util.Pair;
	import org.apache.phoenix.coprocessor.BaseScannerRegionObserver.ReplayWrite;
	import org.apache.phoenix.hbase.index.Indexer;
	import org.apache.phoenix.hbase.index.covered.IndexMetaData;

	/**
	* Interface to build updates ({@link Mutation}s) to the index tables, based on the primary table
	* updates.
	* <p>
	* Either all the index updates will be applied to all tables or the primary table will kill itself
	* and will attempt to replay the index edits through the WAL replay mechanism.
	*/
	public interface IndexBuilder extends Stoppable {

		/**
		 * Marker signature that exists only to steer people away from implementing this interface
		 * directly; as the name says, extend the base index builder instead.
		 */
		void extendBaseIndexBuilderInstead();

		/**
		 * One-time initialization hook. Invoked exactly once when the {@link Indexer} is installed,
		 * and always before the first call to {@link #getIndexUpdate}.
		 * @param env environment in which this builder is running
		 * @throws IOException if the builder cannot be set up
		 */
		void setup(RegionCoprocessorEnvironment env) throws IOException;

		/**
		 * Produces the index-table updates that correspond to an update of a primary table row.
		 * Keeping timestamps consistent between the primary and index tables is the responsibility
		 * of the implementation.
		 * <p>
		 * The argument is deliberately a generic {@link Mutation} rather than a {@link Put} or a
		 * {@link Delete}, because it stands for a batch update. {@link Put}s always travel the
		 * batch update code path, so a lone {@link Put} arrives here as the only update in the
		 * mutation.
		 * <p>
		 * Implementations must be thread-safe: expect concurrent calls for different mutations,
		 * whether or not those mutations belong to the same batch.
		 * @param mutation update to the primary table that needs to be indexed
		 * @param context index meta data for the mutation
		 * @return pairs of (mutation to make, name of the target index table) - the association a
		 *         {@link Map} would express, flattened into a collection
		 * @throws IOException on failure
		 */
		Collection<Pair<Mutation, byte[]>> getIndexUpdate(Mutation mutation, IndexMetaData context)
				throws IOException;

		/**
		 * Builds the index updates needed to clean up the index when cells are dropped from the
		 * primary table by the normal flush or compaction mechanisms. No implementor currently
		 * implements this and nothing calls it; it is kept so that it can be filled in if it is
		 * ever needed.
		 * <p>
		 * Rationale, paraphrasing the original author: this is arguably a correctness piece that
		 * should be used but is not. When a compaction/flush removes a key (too old, too many
		 * versions), the index table may need a matching cleanup, otherwise it can drift out of
		 * sync with the primary table. For example, a row may accumulate several versions, the
		 * oldest of which eventually ages off. The index table, however, only ever holds two
		 * entries for that row: the first one, matching the original row, and the delete marker
		 * for the index update, written when a newer version of the primary row arrived. A basic
		 * HBase scan does not show the index update because the delete marker covers it, but a
		 * read at an older timestamp would still show the index row, even after the primary table
		 * row is gone due to the MAX_VERSIONS requirement.
		 * @param filtered {@link Cell}s that previously existed, but will not be included in
		 *          further output from HBase
		 * @param context index meta data; NOTE(review): left as TODO in the original
		 *          documentation, presumably the same meta data that is handed to
		 *          {@link #getIndexUpdate} - confirm
		 * @return pairs of (mutation to make, name of the target index table)
		 * @throws IOException on failure
		 */
		Collection<Pair<Mutation, byte[]>> getIndexUpdateForFilteredRows(Collection<Cell> filtered,
				IndexMetaData context) throws IOException;

		/**
		 * Callback signalling that a batch of updates has been written successfully.
		 * @param miniBatchOp the full batch operation that was written
		 */
		void batchCompleted(MiniBatchOperationInProgress<Mutation> miniBatchOp);

		/**
		 * Callback signalling that a batch has been started.
		 * <p>
		 * Because of the way the HBase coprocessor hooks are set up, this is actually invoked
		 * <i>after</i> the {@link #getIndexUpdate} methods. You will therefore likely need an
		 * attribute on your {@link Put}/{@link Delete} to indicate that it is part of a batch
		 * operation.
		 * @param miniBatchOp the full batch operation to be written
		 * @param context index meta data; NOTE(review): left as TODO in the original
		 *          documentation, presumably the value obtained from {@link #getIndexMetaData} -
		 *          confirm
		 * @throws IOException on failure
		 */
		void batchStarted(MiniBatchOperationInProgress<Mutation> miniBatchOp, IndexMetaData context)
				throws IOException;

		/**
		 * Supplies the {@link IndexMetaData} for a batch operation.
		 * <p>
		 * NOTE(review): this method was undocumented; presumably the result is what callers pass
		 * as the {@code context} argument of the other methods - confirm against the callers.
		 * @param miniBatchOp the batch operation to obtain index meta data for
		 * @return index meta data for the given batch
		 * @throws IOException on failure
		 */
		IndexMetaData getIndexMetaData(MiniBatchOperationInProgress<Mutation> miniBatchOp)
				throws IOException;

		/**
		 * Lets the codec decide dynamically whether indexing should take place for a table. When
		 * it does not, a lot of time is saved on the regular Put path; and because the decision is
		 * dynamic, there is no need to offline and then online a table just to turn indexing on.
		 * <p>
		 * Implementations can also be selective per update: if the update contains no columns that
		 * matter for indexing, the effort of analyzing the put any further can be skipped.
		 * @param m mutation that should be indexed
		 * @return {@code true} if indexing is enabled for the given table. This should be decided
		 *         on a per-table basis, as each codec is instantiated per-region.
		 */
		boolean isEnabled(Mutation m);

		/**
		 * Tells whether a mutation carries an ON DUPLICATE KEY clause.
		 * @param m mutation to inspect
		 * @return {@code true} if the mutation has an ON DUPLICATE KEY expression, {@code false}
		 *         otherwise
		 */
		boolean isAtomicOp(Mutation m);

		/**
		 * Computes the mutations that result from executing the ON DUPLICATE KEY clause.
		 * @param inc increment to run against
		 * @return list of mutations produced by executing the ON DUPLICATE KEY clause, or
		 *         {@code null} if the increment does not represent an ON DUPLICATE KEY clause
		 * @throws IOException on failure
		 */
		List<Mutation> executeAtomicOp(Increment inc) throws IOException;

		/**
		 * Returns the {@link ReplayWrite} value associated with a mutation.
		 * <p>
		 * NOTE(review): this method was undocumented; the exact meaning of the returned value (and
		 * whether it may be {@code null}) cannot be told from this interface - confirm against
		 * {@link ReplayWrite} and the implementations.
		 * @param m mutation to inspect
		 * @return the replay-write value for the given mutation
		 */
		ReplayWrite getReplayWrite(Mutation m);
	}