src/kudu/scripts/max_skew_estimate.py - kudu - Git at Google

 #!/usr/bin/env python
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 # The purpose of this script is to estimate the distribution of the maximum
 # skew produced by Kudu's "power of two choices" placement algorithm,
 # which is used to place replicas on tablet servers (at least in Kudu <= 1.7).
 import math
 import random
 import sys

 # Replicates Random::ReservoirSample from kudu/util/random.h.
 def reservoir_sample(n, sample_size, avoid):
     result = list()
     k = 0
     for i in xrange(n):
         if i in avoid:
             continue
         k += 1
         if len(result) < sample_size:
             result.append(i)
             continue
         j = random.randrange(k)
         if j < sample_size:
             result[j] = i
     return result

 # Follows CatalogManager::SelectReplica, which implements the power of two
 # choices selection algorithm, except we assume we always have a placement.
 def select_replica(num_servers, avoid, counts):
   two_choices = reservoir_sample(num_servers, 2, avoid)
   assert(len(two_choices) > 0)
   assert(len(two_choices) <= 2)
   if len(two_choices) == 1:
       return two_choices[0]
   else:
       a, b = two_choices[0], two_choices[1]
       if counts[a] < counts[b]:
           return a
       else:
           return b

 # Quickly cribbed from https://stackoverflow.com/a/15589202.
 # 'data' must be sorted.
 def percentile(data, percentile):
     size = len(data)
     return data[int(math.ceil((size * percentile) / 100)) - 1]

 def generate_max_skew(num_servers, num_tablets, rf):
     counts = {i : 0 for i in xrange(num_servers)}
     for t in xrange(num_tablets):
         avoid = set()
         for r in range(rf):
             replica = select_replica(num_servers, avoid, counts)
             avoid.add(replica)
             counts[replica] += 1
     return max(counts.values()) - min(counts.values())

 def main():
     args = sys.argv
     if len(args) != 5:
         print "max_skew_estimate.py <num trials> <num servers> <num_tablets> <repl factor>"
         sys.exit(1)
     num_trials, num_servers, num_tablets, rf = int(args[1]), int(args[2]), int(args[3]), int(args[4])
     skews = [generate_max_skew(num_servers, num_tablets, rf) for _ in xrange(num_trials)]
     skews.sort()
     for p in [5, 25, 50, 75, 99]:
         print "%02d percentile: %d" % (p, percentile(skews, p))

 if __name__ == "__main__":
     main()
	#!/usr/bin/env python
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	# The purpose of this script is to estimate the distribution of the maximum
	# skew produced by Kudu's "power of two choices" placement algorithm,
	# which is used to place replicas on tablet servers (at least in Kudu <= 1.7).
	import math
	import random
	import sys

	# Replicates Random::ReservoirSample from kudu/util/random.h.
	def reservoir_sample(n, sample_size, avoid):
	result = list()
	k = 0
	for i in xrange(n):
	if i in avoid:
	continue
	k += 1
	if len(result) < sample_size:
	result.append(i)
	continue
	j = random.randrange(k)
	if j < sample_size:
	result[j] = i
	return result

	# Follows CatalogManager::SelectReplica, which implements the power of two
	# choices selection algorithm, except we assume we always have a placement.
	def select_replica(num_servers, avoid, counts):
	two_choices = reservoir_sample(num_servers, 2, avoid)
	assert(len(two_choices) > 0)
	assert(len(two_choices) <= 2)
	if len(two_choices) == 1:
	return two_choices[0]
	else:
	a, b = two_choices[0], two_choices[1]
	if counts[a] < counts[b]:
	return a
	else:
	return b

	# Quickly cribbed from https://stackoverflow.com/a/15589202.
	# 'data' must be sorted.
	def percentile(data, percentile):
	size = len(data)
	return data[int(math.ceil((size * percentile) / 100)) - 1]

	def generate_max_skew(num_servers, num_tablets, rf):
	counts = {i : 0 for i in xrange(num_servers)}
	for t in xrange(num_tablets):
	avoid = set()
	for r in range(rf):
	replica = select_replica(num_servers, avoid, counts)
	avoid.add(replica)
	counts[replica] += 1
	return max(counts.values()) - min(counts.values())

	def main():
	args = sys.argv
	if len(args) != 5:
	print "max_skew_estimate.py <num trials> <num servers> <num_tablets> <repl factor>"
	sys.exit(1)
	num_trials, num_servers, num_tablets, rf = int(args[1]), int(args[2]), int(args[3]), int(args[4])
	skews = [generate_max_skew(num_servers, num_tablets, rf) for _ in xrange(num_trials)]
	skews.sort()
	for p in [5, 25, 50, 75, 99]:
	print "%02d percentile: %d" % (p, percentile(skews, p))

	if __name__ == "__main__":
	main()