python/examples/plasma/sorting/multimerge.pyx - arrow - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 # cython: profile=False
 # distutils: language = c++
 # cython: embedsignature = True

 from libc.stdint cimport uintptr_t
 from libcpp.vector cimport vector
 from libcpp.pair cimport pair

 import numpy as np

 cimport numpy as np

 cdef extern from "<queue>" namespace "std" nogil:
     cdef cppclass priority_queue[T]:
         priority_queue() except +
         priority_queue(priority_queue&) except +
         bint empty()
         void pop()
         void push(T&)
         size_t size()
         T& top()


 def multimerge2d(*arrays):
     """Merge a list of sorted 2d arrays into a sorted 2d array.

     This assumes C style ordering for both input and output arrays. For
     each input array we have array[i,0] <= array[i+1,0] and for the output
     array the same will hold.

     Ideally this code would be simpler and also support both C style
     and Fortran style ordering.
     """
     cdef int num_arrays = len(arrays)
     assert num_arrays > 0

     cdef int num_cols = arrays[0].shape[1]

     for i in range(num_arrays):
         assert arrays[i].ndim == 2
         assert arrays[i].dtype == np.float64
         assert arrays[i].shape[1] == num_cols
         assert not np.isfortran(arrays[i])

     cdef vector[double*] data

     # The indices vector keeps track of the index of the next row to process in
     # each array.
     cdef vector[int] indices = num_arrays * [0]

     # The sizes vector stores the total number of elements that each array has.
     cdef vector[int] sizes

     cdef priority_queue[pair[double, int]] queue
     cdef pair[double, int] top
     cdef int num_rows = sum([array.shape[0] for array in arrays])
     cdef np.ndarray[np.float64_t, ndim=2] result = np.zeros(
         (num_rows, num_cols), dtype=np.float64)
     cdef double* result_ptr = <double*> np.PyArray_DATA(result)
     for i in range(num_arrays):
         if arrays[i].size > 0:
             sizes.push_back(arrays[i].size)
             data.push_back(<double*> np.PyArray_DATA(arrays[i]))
             queue.push(pair[double, int](-data[i][0], i))

     cdef int curr_idx = 0
     cdef int j
     cdef int col = 0

     for j in range(num_rows):
         top = queue.top()
         for col in range(num_cols):
             result_ptr[curr_idx + col] = (
                 data[top.second][indices[top.second] + col])

         indices[top.second] += num_cols
         curr_idx += num_cols

         queue.pop()
         if indices[top.second] < sizes[top.second]:
             queue.push(
                 pair[double, int](-data[top.second][indices[top.second]],
                                   top.second))

     return result
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	# cython: profile=False
	# distutils: language = c++
	# cython: embedsignature = True

	from libc.stdint cimport uintptr_t
	from libcpp.vector cimport vector
	from libcpp.pair cimport pair

	import numpy as np

	cimport numpy as np

	cdef extern from "<queue>" namespace "std" nogil:
	cdef cppclass priority_queue[T]:
	priority_queue() except +
	priority_queue(priority_queue&) except +
	bint empty()
	void pop()
	void push(T&)
	size_t size()
	T& top()


	def multimerge2d(*arrays):
	"""Merge a list of sorted 2d arrays into a sorted 2d array.

	This assumes C style ordering for both input and output arrays. For
	each input array we have array[i,0] <= array[i+1,0] and for the output
	array the same will hold.

	Ideally this code would be simpler and also support both C style
	and Fortran style ordering.
	"""
	cdef int num_arrays = len(arrays)
	assert num_arrays > 0

	cdef int num_cols = arrays[0].shape[1]

	for i in range(num_arrays):
	assert arrays[i].ndim == 2
	assert arrays[i].dtype == np.float64
	assert arrays[i].shape[1] == num_cols
	assert not np.isfortran(arrays[i])

	cdef vector[double*] data

	# The indices vector keeps track of the index of the next row to process in
	# each array.
	cdef vector[int] indices = num_arrays * [0]

	# The sizes vector stores the total number of elements that each array has.
	cdef vector[int] sizes

	cdef priority_queue[pair[double, int]] queue
	cdef pair[double, int] top
	cdef int num_rows = sum([array.shape[0] for array in arrays])
	cdef np.ndarray[np.float64_t, ndim=2] result = np.zeros(
	(num_rows, num_cols), dtype=np.float64)
	cdef double* result_ptr = <double*> np.PyArray_DATA(result)
	for i in range(num_arrays):
	if arrays[i].size > 0:
	sizes.push_back(arrays[i].size)
	data.push_back(<double*> np.PyArray_DATA(arrays[i]))
	queue.push(pair[double, int](-data[i][0], i))

	cdef int curr_idx = 0
	cdef int j
	cdef int col = 0

	for j in range(num_rows):
	top = queue.top()
	for col in range(num_cols):
	result_ptr[curr_idx + col] = (
	data[top.second][indices[top.second] + col])

	indices[top.second] += num_cols
	curr_idx += num_cols

	queue.pop()
	if indices[top.second] < sizes[top.second]:
	queue.push(
	pair[double, int](-data[top.second][indices[top.second]],
	top.second))

	return result