python/benchmarks/common.py - arrow - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 import codecs
 import os
 import sys
 import unicodedata

 import numpy as np


 def _multiplicate_sequence(base, target_size):
     """
     Split *target_size* items into repetitions of *base*.

     Returns a list holding as many full copies of *base* as fit into
     *target_size*, followed by one prefix of *base* covering the
     remainder (that prefix is empty when *target_size* is an exact
     multiple of ``len(base)``).
     """
     full_copies, remainder = divmod(target_size, len(base))
     pieces = [base] * full_copies
     pieces.append(base[:remainder])
     return pieces


 def get_random_bytes(n):
     """
     Return a reproducible pseudo-random bytestring of length *n*.

     The generator is seeded with a fixed value on every call, so the
     same *n* always yields the same bytes.
     """
     # Computing a huge random bytestring can be costly, so at most one
     # chunk of about 100KB is generated and then repeated as needed.
     chunk_size = 100003
     generator = np.random.RandomState(42)
     full_chunks, tail = divmod(n, chunk_size)
     if full_chunks:
         chunk = generator.bytes(chunk_size)
         payload = b''.join(_multiplicate_sequence(chunk, n))
     else:
         # Fewer bytes than one chunk: draw exactly the requested amount.
         payload = generator.bytes(tail)
     assert len(payload) == n
     return payload


 def get_random_ascii(n):
     """
     Return a reproducible pseudo-random ASCII string of length *n*.

     The bytes from get_random_bytes() are masked down to 7 bits so
     every value lies in the ASCII range before being decoded.
     """
     raw = get_random_bytes(n)
     seven_bit = np.bitwise_and(np.frombuffer(raw, dtype=np.int8), 0x7f)
     result = codecs.ascii_decode(seven_bit)[0]
     assert isinstance(result, str)
     assert len(result) == n
     return result


 def _random_unicode_letters(n):
     """
     Generate a list of n random unicode letters (slow).

     Code points are drawn from a fixed-seed generator in batches of *n*
     and only those whose Unicode category starts with 'L' are kept.
     """
     generator = np.random.RandomState(42)
     letters = []
     pool = []

     while len(letters) < n:
         if len(pool) == 0:
             # Pool exhausted: draw another batch of candidate code points.
             pool = generator.randint(0, sys.maxunicode, size=n).tolist()
         char = chr(pool.pop())
         # XXX Do we actually care that the code points are valid?
         if unicodedata.category(char).startswith('L'):
             letters.append(char)
     return letters


 # Table of 1024 unicode letters, computed once when the module is imported;
 # get_random_unicode() picks its characters from this table.
 _1024_random_unicode_letters = _random_unicode_letters(1024)


 def get_random_unicode(n):
     """
     Return a reproducible pseudo-random string of *n* unicode letters.

     Two random bytes are consumed per character; each 16-bit value is
     masked to the range 0..1023 and used as an index into the
     precomputed table of 1024 letters.
     """
     raw = get_random_bytes(n * 2)
     positions = np.frombuffer(raw, dtype=np.int16) & 1023
     table = np.array(_1024_random_unicode_letters)
     picked = table[positions]

     result = ''.join(picked.tolist())
     assert len(result) == n, (len(result), len(picked))
     return result
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	import codecs
	import os
	import sys
	import unicodedata

	import numpy as np


	def _multiplicate_sequence(base, target_size):
	    """
	    Split *target_size* items into repetitions of *base*.

	    Returns a list of ``q`` full copies of *base* followed by the first
	    ``r`` items of *base*, where ``q, r = divmod(target_size, len(base))``.
	    """
	    q, r = divmod(target_size, len(base))
	    return [base] * q + [base[:r]]


	def get_random_bytes(n):
	    """
	    Return a reproducible pseudo-random bytestring of length *n*.

	    The generator is re-seeded with a fixed value on every call, so the
	    same *n* always yields the same bytes.
	    """
	    rnd = np.random.RandomState(42)
	    # Computing a huge random bytestring can be costly, so we get at most
	    # 100KB and duplicate the result as needed
	    base_size = 100003
	    q, r = divmod(n, base_size)
	    if q == 0:
	        # Less than one base chunk requested: draw exactly that many bytes.
	        result = rnd.bytes(r)
	    else:
	        base = rnd.bytes(base_size)
	        result = b''.join(_multiplicate_sequence(base, n))
	    assert len(result) == n
	    return result


	def get_random_ascii(n):
	    """
	    Return a reproducible pseudo-random ASCII string of length *n*.

	    Each byte from get_random_bytes() is masked with 0x7f so that it
	    falls in the ASCII range before decoding.
	    """
	    arr = np.frombuffer(get_random_bytes(n), dtype=np.int8) & 0x7f
	    result, _ = codecs.ascii_decode(arr)
	    assert isinstance(result, str)
	    assert len(result) == n
	    return result


	def _random_unicode_letters(n):
	    """
	    Generate a list of n random unicode letters (slow).

	    Candidate code points are drawn in batches of *n* from a fixed-seed
	    generator; only characters whose Unicode category starts with 'L'
	    are kept.
	    """
	    def _get_more_candidates():
	        # One batch of n candidate code points below sys.maxunicode.
	        return rnd.randint(0, sys.maxunicode, size=n).tolist()

	    rnd = np.random.RandomState(42)
	    out = []
	    candidates = []

	    while len(out) < n:
	        if not candidates:
	            candidates = _get_more_candidates()
	        ch = chr(candidates.pop())
	        # XXX Do we actually care that the code points are valid?
	        if unicodedata.category(ch)[0] == 'L':
	            out.append(ch)
	    return out


	# Table of 1024 unicode letters, computed once when this statement runs;
	# get_random_unicode() picks its characters from this table.
	_1024_random_unicode_letters = _random_unicode_letters(1024)


	def get_random_unicode(n):
	    """
	    Return a reproducible pseudo-random string of *n* unicode letters.

	    Two random bytes are consumed per character; each 16-bit value is
	    masked to 0..1023 and used as an index into the precomputed table
	    of 1024 letters.
	    """
	    indices = np.frombuffer(get_random_bytes(n * 2), dtype=np.int16) & 1023
	    unicode_arr = np.array(_1024_random_unicode_letters)[indices]

	    result = ''.join(unicode_arr.tolist())
	    assert len(result) == n, (len(result), len(unicode_arr))
	    return result