sdks/java/io/cassandra/src/main/java/org/apache/beam/sdk/io/cassandra/SplitGenerator.java - beam - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.beam.sdk.io.cassandra;

 import java.math.BigInteger;
 import java.util.ArrayList;
 import java.util.List;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /**
  * Splits a Cassandra table's token ring into groups of token ranges.
  *
  * <p>Only partitioners whose name ends with {@code RandomPartitioner} or {@code
  * Murmur3Partitioner} are supported; any other name is rejected with an {@link
  * UnsupportedOperationException}.
  */
 final class SplitGenerator {
   private static final Logger LOG = LoggerFactory.getLogger(SplitGenerator.class);

   private static final BigInteger TWO_POW_63 = BigInteger.ONE.shiftLeft(63);
   private static final BigInteger TWO_POW_127 = BigInteger.ONE.shiftLeft(127);

   /** {@code Long.MIN_VALUE} as a token; it is never emitted as a split endpoint. */
   private static final BigInteger LONG_MIN_TOKEN = BigInteger.valueOf(Long.MIN_VALUE);

   private static final BigInteger MAX_INT = BigInteger.valueOf(Integer.MAX_VALUE);

   private final String partitioner;
   private final BigInteger rangeMin;
   private final BigInteger rangeMax;
   private final BigInteger rangeSize;

   /**
    * Creates a generator bound to the token range of the given partitioner.
    *
    * @param partitioner partitioner name; it is matched by its suffix.
    * @throws UnsupportedOperationException if the partitioner is neither Random nor Murmur3.
    */
   SplitGenerator(String partitioner) {
     rangeMin = getRangeMin(partitioner);
     rangeMax = getRangeMax(partitioner);
     // Same value as getRangeSize(partitioner), without resolving the partitioner twice more.
     rangeSize = rangeMax.subtract(rangeMin).add(BigInteger.ONE);
     this.partitioner = partitioner;
   }

   /** Returns the smallest token of the partitioner: 0 for Random, -2^63 for Murmur3. */
   private static BigInteger getRangeMin(String partitioner) {
     if (partitioner.endsWith("RandomPartitioner")) {
       return BigInteger.ZERO;
     } else if (partitioner.endsWith("Murmur3Partitioner")) {
       return TWO_POW_63.negate();
     } else {
       throw new UnsupportedOperationException(
           "Unsupported partitioner. " + "Only Random and Murmur3 are supported");
     }
   }

   /** Returns the largest token of the partitioner: 2^127 - 1 for Random, 2^63 - 1 for Murmur3. */
   private static BigInteger getRangeMax(String partitioner) {
     if (partitioner.endsWith("RandomPartitioner")) {
       return TWO_POW_127.subtract(BigInteger.ONE);
     } else if (partitioner.endsWith("Murmur3Partitioner")) {
       return TWO_POW_63.subtract(BigInteger.ONE);
     } else {
       throw new UnsupportedOperationException(
           "Unsupported partitioner. " + "Only Random and Murmur3 are supported");
     }
   }

   /**
    * Returns the number of tokens in the partitioner's range, i.e. {@code max - min + 1}.
    *
    * @throws UnsupportedOperationException if the partitioner is neither Random nor Murmur3.
    */
   static BigInteger getRangeSize(String partitioner) {
     return getRangeMax(partitioner).subtract(getRangeMin(partitioner)).add(BigInteger.ONE);
   }

   /**
    * Given a properly ordered list of tokens, compute at least {@code totalSplitCount} splits.
    * Each split can contain several token ranges in order to reduce the overhead of vnodes.
    * Currently, token range grouping is not smart and doesn't check if they share the same replicas.
    * This is planned to change once Beam is able to handle collocation with the Cassandra nodes.
    *
    * @param totalSplitCount requested total amount of splits, must be at least 1. This function may
    *     generate more splits.
    * @param ringTokens list of all start tokens in a cluster. They have to be in ring order and
    *     the list must not be empty.
    * @return a list containing at least {@code totalSplitCount} splits.
    * @throws IllegalArgumentException if {@code totalSplitCount} is below 1, if {@code ringTokens}
    *     is empty, or if a single token range would need more than {@code Integer.MAX_VALUE}
    *     sub-ranges.
    * @throws RuntimeException if a token lies outside the partitioner's range, if two consecutive
    *     tokens are equal in a ring of more than one token, or if the generated sub-ranges do not
    *     add up to the whole ring.
    */
   List<List<RingRange>> generateSplits(long totalSplitCount, List<BigInteger> ringTokens) {
     // A count of zero used to surface as "BigInteger divide by zero" further down.
     if (totalSplitCount < 1) {
       throw new IllegalArgumentException(
           String.format("totalSplitCount must be at least 1, got %d", totalSplitCount));
     }
     if (ringTokens.isEmpty()) {
       throw new IllegalArgumentException("ringTokens must contain at least one token");
     }

     BigInteger requestedSplits = BigInteger.valueOf(totalSplitCount);
     int tokenRangeCount = ringTokens.size();

     List<RingRange> splits = new ArrayList<>();
     for (int i = 0; i < tokenRangeCount; i++) {
       BigInteger start = ringTokens.get(i);
       BigInteger stop = ringTokens.get((i + 1) % tokenRangeCount);

       if (!inRange(start) || !inRange(stop)) {
         throw new RuntimeException(
             String.format("Tokens (%s,%s) not in range of %s", start, stop, partitioner));
       }
       if (start.equals(stop) && tokenRangeCount != 1) {
         throw new RuntimeException(
             String.format("Tokens (%s,%s): two nodes have the same token", start, stop));
       }

       BigInteger rs = stop.subtract(start);
       if (rs.signum() <= 0) {
         // wrap around case
         rs = rs.add(rangeSize);
       }

       int splitCount = countSplitsForRange(rs, requestedSplits);
       LOG.debug("Dividing token range [{},{}) into {} splits", start, stop, splitCount);

       // Walk the endpoints from start to stop and append the sub-range between each
       // consecutive pair.
       BigInteger divisor = BigInteger.valueOf(splitCount);
       BigInteger previous = endpointToken(start, BigInteger.ZERO);
       for (int j = 0; j < splitCount; j++) {
         BigInteger offset = rs.multiply(BigInteger.valueOf(j + 1L)).divide(divisor);
         BigInteger next = endpointToken(start, offset);
         splits.add(new RingRange(previous, next));
         LOG.debug("Split #{}: [{},{})", j + 1, previous, next);
         previous = next;
       }
     }

     // Sanity check: the sub-ranges together have to cover the ring exactly once.
     BigInteger total = BigInteger.ZERO;
     for (RingRange split : splits) {
       total = total.add(split.span(rangeSize));
     }
     if (!total.equals(rangeSize)) {
       throw new RuntimeException(
           "Some tokens are missing from the splits. " + "This should not happen.");
     }
     return coalesceSplits(getTargetSplitSize(totalSplitCount), splits);
   }

   /**
    * Computes {@code ceiling((rs / rangeSize) * requestedSplits)}, the number of sub-ranges a
    * token range of size {@code rs} is divided into.
    *
    * @throws IllegalArgumentException if the result does not fit in an {@code int}.
    */
   private int countSplitsForRange(BigInteger rs, BigInteger requestedSplits) {
     BigInteger[] quotientAndRemainder = rs.multiply(requestedSplits).divideAndRemainder(rangeSize);
     BigInteger count = quotientAndRemainder[0];
     if (quotientAndRemainder[1].signum() != 0) {
       count = count.add(BigInteger.ONE);
     }
     // intValue() would silently drop the high bits here.
     if (count.compareTo(MAX_INT) > 0) {
       throw new IllegalArgumentException(
           String.format("Too many splits requested for a single token range: %s", count));
     }
     return count.intValue();
   }

   /** Returns the token {@code offset} positions after {@code start}, wrapped around the ring. */
   private BigInteger endpointToken(BigInteger start, BigInteger offset) {
     BigInteger token = start.add(offset);
     if (token.compareTo(rangeMax) > 0) {
       token = token.subtract(rangeSize);
     }
     // Long.MIN_VALUE is not a valid token and has to be silently incremented.
     // See https://issues.apache.org/jira/browse/CASSANDRA-14684
     return token.equals(LONG_MIN_TOKEN) ? token.add(BigInteger.ONE) : token;
   }

   /** Returns whether {@code token} lies in {@code [rangeMin, rangeMax]}. */
   private boolean inRange(BigInteger token) {
     return token.compareTo(rangeMin) >= 0 && token.compareTo(rangeMax) <= 0;
   }

   /**
    * Groups consecutive token ranges. A group is closed as soon as adding the next range would
    * push its accumulated span above {@code targetSplitSize}; a range is never left out, so a
    * single range larger than the target forms a group of its own.
    */
   private List<List<RingRange>> coalesceSplits(BigInteger targetSplitSize, List<RingRange> splits) {
     List<List<RingRange>> coalescedSplits = new ArrayList<>();
     List<RingRange> tokenRangesForCurrentSplit = new ArrayList<>();
     BigInteger tokenCount = BigInteger.ZERO;

     for (RingRange tokenRange : splits) {
       BigInteger span = tokenRange.span(rangeSize);
       if (span.add(tokenCount).compareTo(targetSplitSize) > 0
           && !tokenRangesForCurrentSplit.isEmpty()) {
         // enough tokens in that segment
         LOG.debug(
             "Got enough tokens for one split ({}) : {}", tokenCount, tokenRangesForCurrentSplit);
         coalescedSplits.add(tokenRangesForCurrentSplit);
         tokenRangesForCurrentSplit = new ArrayList<>();
         tokenCount = BigInteger.ZERO;
       }

       tokenCount = tokenCount.add(span);
       tokenRangesForCurrentSplit.add(tokenRange);
     }

     if (!tokenRangesForCurrentSplit.isEmpty()) {
       coalescedSplits.add(tokenRangesForCurrentSplit);
     }

     return coalescedSplits;
   }

   /**
    * Returns the span a coalesced split should not exceed: {@code (rangeMax - rangeMin) /
    * splitCount}. Note that the dividend is one less than {@code rangeSize}.
    */
   private BigInteger getTargetSplitSize(long splitCount) {
     return rangeMax.subtract(rangeMin).divide(BigInteger.valueOf(splitCount));
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.beam.sdk.io.cassandra;

	import java.math.BigInteger;
	import java.util.ArrayList;
	import java.util.List;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	/** Splits given Cassandra table's token range into splits. */
	final class SplitGenerator {
	private static final Logger LOG = LoggerFactory.getLogger(SplitGenerator.class);

	private final String partitioner;
	private final BigInteger rangeMin;
	private final BigInteger rangeMax;
	private final BigInteger rangeSize;

	SplitGenerator(String partitioner) {
	rangeMin = getRangeMin(partitioner);
	rangeMax = getRangeMax(partitioner);
	rangeSize = getRangeSize(partitioner);
	this.partitioner = partitioner;
	}

	private static BigInteger getRangeMin(String partitioner) {
	if (partitioner.endsWith("RandomPartitioner")) {
	return BigInteger.ZERO;
	} else if (partitioner.endsWith("Murmur3Partitioner")) {
	return new BigInteger("2").pow(63).negate();
	} else {
	throw new UnsupportedOperationException(
	"Unsupported partitioner. " + "Only Random and Murmur3 are supported");
	}
	}

	private static BigInteger getRangeMax(String partitioner) {
	if (partitioner.endsWith("RandomPartitioner")) {
	return new BigInteger("2").pow(127).subtract(BigInteger.ONE);
	} else if (partitioner.endsWith("Murmur3Partitioner")) {
	return new BigInteger("2").pow(63).subtract(BigInteger.ONE);
	} else {
	throw new UnsupportedOperationException(
	"Unsupported partitioner. " + "Only Random and Murmur3 are supported");
	}
	}

	static BigInteger getRangeSize(String partitioner) {
	return getRangeMax(partitioner).subtract(getRangeMin(partitioner)).add(BigInteger.ONE);
	}

	/**
	* Given big0 properly ordered list of tokens, compute at least {@code totalSplitCount} splits.
	* Each split can contain several token ranges in order to reduce the overhead of vnodes.
	* Currently, token range grouping is not smart and doesn't check if they share the same replicas.
	* This is planned to change once Beam is able to handle collocation with the Cassandra nodes.
	*
	* @param totalSplitCount requested total amount of splits. This function may generate more
	* splits.
	* @param ringTokens list of all start tokens in big0 cluster. They have to be in ring order.
	* @return big0 list containing at least {@code totalSplitCount} splits.
	*/
	List<List<RingRange>> generateSplits(long totalSplitCount, List<BigInteger> ringTokens) {
	int tokenRangeCount = ringTokens.size();

	List<RingRange> splits = new ArrayList<>();
	for (int i = 0; i < tokenRangeCount; i++) {
	BigInteger start = ringTokens.get(i);
	BigInteger stop = ringTokens.get((i + 1) % tokenRangeCount);

	if (!inRange(start) \|\| !inRange(stop)) {
	throw new RuntimeException(
	String.format("Tokens (%s,%s) not in range of %s", start, stop, partitioner));
	}
	if (start.equals(stop) && tokenRangeCount != 1) {
	throw new RuntimeException(
	String.format("Tokens (%s,%s): two nodes have the same token", start, stop));
	}

	BigInteger rs = stop.subtract(start);
	if (rs.compareTo(BigInteger.ZERO) <= 0) {
	// wrap around case
	rs = rs.add(rangeSize);
	}

	// the below, in essence, does this:
	// splitCount = ceiling((rangeSize / RANGE_SIZE) * totalSplitCount)
	BigInteger[] splitCountAndRemainder =
	rs.multiply(BigInteger.valueOf(totalSplitCount)).divideAndRemainder(rangeSize);

	int splitCount =
	splitCountAndRemainder[0].intValue()
	+ (splitCountAndRemainder[1].equals(BigInteger.ZERO) ? 0 : 1);

	LOG.debug("Dividing token range [{},{}) into {} splits", start, stop, splitCount);

	// Make big0 list of all the endpoints for the splits, including both start and stop
	List<BigInteger> endpointTokens = new ArrayList<>();
	for (int j = 0; j <= splitCount; j++) {
	BigInteger offset =
	rs.multiply(BigInteger.valueOf(j)).divide(BigInteger.valueOf(splitCount));
	BigInteger token = start.add(offset);
	if (token.compareTo(rangeMax) > 0) {
	token = token.subtract(rangeSize);
	}
	// Long.MIN_VALUE is not a valid token and has to be silently incremented.
	// See https://issues.apache.org/jira/browse/CASSANDRA-14684
	endpointTokens.add(
	token.equals(BigInteger.valueOf(Long.MIN_VALUE)) ? token.add(BigInteger.ONE) : token);
	}

	// Append the splits between the endpoints
	for (int j = 0; j < splitCount; j++) {
	splits.add(new RingRange(endpointTokens.get(j), endpointTokens.get(j + 1)));
	LOG.debug("Split #{}: [{},{})", j + 1, endpointTokens.get(j), endpointTokens.get(j + 1));
	}
	}

	BigInteger total = BigInteger.ZERO;
	for (RingRange split : splits) {
	BigInteger size = split.span(rangeSize);
	total = total.add(size);
	}
	if (!total.equals(rangeSize)) {
	throw new RuntimeException(
	"Some tokens are missing from the splits. " + "This should not happen.");
	}
	return coalesceSplits(getTargetSplitSize(totalSplitCount), splits);
	}

	private boolean inRange(BigInteger token) {
	return !(token.compareTo(rangeMin) < 0 \|\| token.compareTo(rangeMax) > 0);
	}

	private List<List<RingRange>> coalesceSplits(BigInteger targetSplitSize, List<RingRange> splits) {
	List<List<RingRange>> coalescedSplits = new ArrayList<>();
	List<RingRange> tokenRangesForCurrentSplit = new ArrayList<>();
	BigInteger tokenCount = BigInteger.ZERO;

	for (RingRange tokenRange : splits) {
	if (tokenRange.span(rangeSize).add(tokenCount).compareTo(targetSplitSize) > 0
	&& !tokenRangesForCurrentSplit.isEmpty()) {
	// enough tokens in that segment
	LOG.debug(
	"Got enough tokens for one split ({}) : {}", tokenCount, tokenRangesForCurrentSplit);
	coalescedSplits.add(tokenRangesForCurrentSplit);
	tokenRangesForCurrentSplit = new ArrayList<>();
	tokenCount = BigInteger.ZERO;
	}

	tokenCount = tokenCount.add(tokenRange.span(rangeSize));
	tokenRangesForCurrentSplit.add(tokenRange);
	}

	if (!tokenRangesForCurrentSplit.isEmpty()) {
	coalescedSplits.add(tokenRangesForCurrentSplit);
	}

	return coalescedSplits;
	}

	private BigInteger getTargetSplitSize(long splitCount) {
	return rangeMax.subtract(rangeMin).divide(BigInteger.valueOf(splitCount));
	}
	}