Faster merging of hyper c arrays.

I tried implementing 3 different approaches to merging the arrays. This was by
far the fastest at (0.01ms to 0.02ms for the union benchmark). It seems
when the ternary operator is inside the inner loop compiler does something that
is fast such as an operation with no branching.

The other approaches were,

* processing a 64-bit integer at a time and unpacking the bytes (0.16ms)
* branching rather than a ternary operator for the max value which is
  similar to the original code (0.10ms to 0.40ms)

Benchmark before,

module       P        card   fill      bytes  insert us   union ms    card ms    json ms
hyper_carray 15          1   0.00      32784       1.80       0.10       0.33       0.21
hyper_carray 15        100   0.00      32784       0.54       0.11       0.34       0.28
hyper_carray 15        500   0.02      32784       0.50       0.10       0.38       0.47
hyper_carray 15       1000   0.03      32784       0.51       0.11       0.38       0.74
hyper_carray 15       2500   0.07      32784       0.50       0.11       0.43       1.41
hyper_carray 15       5000   0.14      32784       0.51       0.12       0.51       2.38
hyper_carray 15      10000   0.26      32784       0.51       0.14       0.65       4.17
hyper_carray 15      15000   0.37      32784       0.51       0.16       0.82       5.32
hyper_carray 15      25000   0.53      32784       0.51       0.20       1.05       6.10
hyper_carray 15      50000   0.78      32784       0.51       0.26       1.31       5.77
hyper_carray 15     100000   0.95      32784       0.51       0.33       1.65       5.18
hyper_carray 15    1000000   1.00      32784       0.53       0.43       1.83       5.23

Benchmark after,

module       P        card   fill      bytes  insert us   union ms    card ms    json ms
hyper_carray 15          1   0.00      32784       2.30       0.02       0.32       0.22
hyper_carray 15        100   0.00      32784       0.53       0.02       0.33       0.22
hyper_carray 15        500   0.02      32784       0.51       0.02       0.35       0.43
hyper_carray 15       1000   0.03      32784       0.50       0.02       0.37       0.68
hyper_carray 15       2500   0.07      32784       0.51       0.02       0.42       1.34
hyper_carray 15       5000   0.14      32784       0.50       0.01       0.50       2.33
hyper_carray 15      10000   0.26      32784       0.52       0.01       0.65       4.10
hyper_carray 15      15000   0.37      32784       0.51       0.02       0.79       5.33
hyper_carray 15      25000   0.53      32784       0.51       0.02       1.01       6.11
hyper_carray 15      50000   0.78      32784       0.51       0.02       1.30       5.77
hyper_carray 15     100000   0.95      32784       0.51       0.02       1.63       5.23
hyper_carray 15    1000000   1.00      32784       0.53       0.02       1.82       5.32
diff --git a/c_src/hyper_carray.c b/c_src/hyper_carray.c
index 7d78554..7eedc85 100644
--- a/c_src/hyper_carray.c
+++ b/c_src/hyper_carray.c
@@ -13,6 +13,7 @@
 // You should have received a copy of the GNU Lesser General Public License
 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
+#include <stdint.h>
 #include <stdio.h>
 #include <string.h>
 #include <tgmath.h>
@@ -40,9 +41,11 @@
 	/*
 	 * Array of items each one byte in size.
 	 */
-	char *items;
+	uint8_t *items;
 };
 
+typedef struct hyper_carray *restrict carray_ptr;
+
 #define HYPER_CARRAY_SIZE sizeof(struct hyper_carray)
 
 /*
@@ -58,8 +61,7 @@
 /*
  * Allocate a new hyper_carray for use as an Erlang NIF resource.
  */
-static void carray_alloc(unsigned int precision,
-			 struct hyper_carray *restrict * arr)
+static void carray_alloc(unsigned int precision, carray_ptr * arr)
 {
 	unsigned int nitems = pow(2, precision);
 	size_t header_size = HYPER_CARRAY_SIZE;
@@ -78,13 +80,12 @@
  * Given an hyper_carray and a valid index, set the value at that index to
  * max(current value, given value).
  */
-static inline void carray_merge_item(struct hyper_carray *restrict arr,
+static inline void carray_merge_item(carray_ptr arr,
 				     unsigned int index,
 				     unsigned int value)
 {
-	char *item = arr->items + index;
-	if (value > *item)
-		*item = value;
+	uint8_t *item = arr->items + index;
+	*item = (value > *item) ? value : *item;
 }
 
 /*
@@ -97,7 +98,7 @@
 	if (!enif_get_uint(env, argv[0], &precision))
 		return enif_make_badarg(env);
 
-	struct hyper_carray *restrict arr = NULL;
+	carray_ptr arr = NULL;
 	carray_alloc(precision, &arr);
 	memset(arr->items, 0, arr->size);
 
@@ -112,7 +113,7 @@
 static ERL_NIF_TERM set(ErlNifEnv * env, int argc,
 			const ERL_NIF_TERM argv[])
 {
-	struct hyper_carray *restrict arr = NULL;
+	carray_ptr arr = NULL;
 	HYPER_CARRAY_OR_BADARG(argv[2], arr);
 
 	unsigned int index = 0;
@@ -151,39 +152,54 @@
 	if (narrays < 1)
 		return enif_make_badarg(env);
 
-	void *tmp = NULL;
-	struct hyper_carray *restrict first = NULL;
-	struct hyper_carray *restrict acc = NULL;
-	struct hyper_carray *restrict curr = NULL;
-
+	carray_ptr first = NULL;
 	HYPER_CARRAY_OR_BADARG(head, first);
 
+	carray_ptr acc = NULL;
 	carray_alloc(first->precision, &acc);
 	memcpy(acc->items, first->items, acc->size);
 
-	unsigned int nitems = 0;
+	carray_ptr *arrays =
+	    (carray_ptr *) enif_alloc(narrays * sizeof(carray_ptr));
+	arrays[0] = first;
+
+	// Validate arrays
 	for (int i = 1; i < narrays; ++i) {
+		void *tmp = NULL;
+
 		if (!enif_get_list_cell(env, tail, &head, &tail)
 		    || !enif_get_resource(env, head, carray_resource,
 					  &tmp))
 			goto dealloc_and_badarg;
-		curr = tmp;
+
+		arrays[i] = tmp;
 
 		// Require uniform precision.
-		if (curr->precision != acc->precision)
+		if (arrays[i]->precision != acc->precision)
 			goto dealloc_and_badarg;
 
-		nitems = curr->size;
-		for (int j = 0; j < nitems; ++j)
-			carray_merge_item(acc, j, curr->items[j]);
-
 		continue;
 
 	      dealloc_and_badarg:
+		enif_free((void *) arrays);
 		dtor(env, acc);
 		return enif_make_badarg(env);
 	}
 
+	// Merge arrays
+	const unsigned int nitems = first->size;
+	for (carray_ptr * it = arrays + 1, *end = arrays + narrays;
+	     it != end; ++it) {
+		carray_ptr curr = *it;
+
+		for (uint8_t * accitem = acc->items, *item = curr->items,
+		     *enditem = curr->items + nitems;
+		     item != enditem; ++item, ++accitem) {
+			*accitem = (*item > *accitem) ? *item : *accitem;
+		}
+	}
+
+	enif_free((void *) arrays);
 	return enif_make_resource(env, acc);
 }
 
@@ -194,7 +210,7 @@
 static ERL_NIF_TERM bytes(ErlNifEnv * env, int argc,
 			  const ERL_NIF_TERM argv[])
 {
-	struct hyper_carray *restrict arr = NULL;
+	carray_ptr arr = NULL;
 	HYPER_CARRAY_OR_BADARG(argv[0], arr);
 	return enif_make_int(env, HYPER_CARRAY_SIZE + arr->size);
 }
@@ -205,7 +221,7 @@
 static ERL_NIF_TERM register_sum(ErlNifEnv * env, int argc,
 				 const ERL_NIF_TERM argv[])
 {
-	struct hyper_carray *restrict arr = NULL;
+	carray_ptr arr = NULL;
 	HYPER_CARRAY_OR_BADARG(argv[0], arr);
 
 	int currval = 0;
@@ -226,7 +242,7 @@
 static ERL_NIF_TERM zero_count(ErlNifEnv * env, int argc,
 			       const ERL_NIF_TERM argv[])
 {
-	struct hyper_carray *restrict arr = NULL;
+	carray_ptr arr = NULL;
 	HYPER_CARRAY_OR_BADARG(argv[0], arr);
 
 	unsigned int nzeros = 0;
@@ -246,7 +262,7 @@
 static ERL_NIF_TERM encode_registers(ErlNifEnv * env, int argc,
 				     const ERL_NIF_TERM argv[])
 {
-	struct hyper_carray *restrict arr = NULL;
+	carray_ptr arr = NULL;
 	HYPER_CARRAY_OR_BADARG(argv[0], arr);
 
 	size_t nbytes = arr->size;
@@ -271,7 +287,7 @@
 	    || !enif_inspect_binary(env, argv[0], &bin))
 		return enif_make_badarg(env);
 
-	struct hyper_carray *restrict arr = NULL;
+	carray_ptr arr = NULL;
 	carray_alloc(precision, &arr);
 	memcpy(arr->items, bin.data, arr->size);