blob: 5e77fdfcc8715af4a9c0d9b686a1b83170e96830 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: profile=False
# distutils: language = c++
# cython: embedsignature = True
from libc.stdint cimport uintptr_t
from libcpp.vector cimport vector
from libcpp.pair cimport pair
import numpy as np
cimport numpy as np
cdef extern from "<queue>" namespace "std" nogil:
cdef cppclass priority_queue[T]:
priority_queue() except +
priority_queue(priority_queue&) except +
bint empty()
void pop()
void push(T&)
size_t size()
T& top()
def multimerge2d(*arrays):
"""Merge a list of sorted 2d arrays into a sorted 2d array.
This assumes C style ordering for both input and output arrays. For
each input array we have array[i,0] <= array[i+1,0] and for the output
array the same will hold.
Ideally this code would be simpler and also support both C style
and Fortran style ordering.
"""
cdef int num_arrays = len(arrays)
assert num_arrays > 0
cdef int num_cols = arrays[0].shape[1]
for i in range(num_arrays):
assert arrays[i].ndim == 2
assert arrays[i].dtype == np.float64
assert arrays[i].shape[1] == num_cols
assert not np.isfortran(arrays[i])
cdef vector[double*] data
# The indices vector keeps track of the index of the next row to process in
# each array.
cdef vector[int] indices = num_arrays * [0]
# The sizes vector stores the total number of elements that each array has.
cdef vector[int] sizes
cdef priority_queue[pair[double, int]] queue
cdef pair[double, int] top
cdef int num_rows = sum([array.shape[0] for array in arrays])
cdef np.ndarray[np.float64_t, ndim=2] result = np.zeros(
(num_rows, num_cols), dtype=np.float64)
cdef double* result_ptr = <double*> np.PyArray_DATA(result)
for i in range(num_arrays):
if arrays[i].size > 0:
sizes.push_back(arrays[i].size)
data.push_back(<double*> np.PyArray_DATA(arrays[i]))
queue.push(pair[double, int](-data[i][0], i))
cdef int curr_idx = 0
cdef int j
cdef int col = 0
for j in range(num_rows):
top = queue.top()
for col in range(num_cols):
result_ptr[curr_idx + col] = (
data[top.second][indices[top.second] + col])
indices[top.second] += num_cols
curr_idx += num_cols
queue.pop()
if indices[top.second] < sizes[top.second]:
queue.push(
pair[double, int](-data[top.second][indices[top.second]],
top.second))
return result