mango: fix $beginsWith range

In the intial implementation of $beginsWith, the range calculation
for json indexes mistakenly appends an integer with the size of
8 bits which gets maxed out at FF, rather than building a binary
with an extra 3 bytes at the end.

This commit fixes the `mango_idx_view:range/5` by correctly appending
the `U+FFFF` code point to create a utf-8 encoded binary. Additionally,
the Erlang `utf8` binary type ensures the result
is a valid utf8 string. If `Arg` is not a utf8 binary, this will
throw a badarg error.

We expect `Arg` strings to be a valid utf8 but, to be safe,
`mango_selector:norm_ops/1` is enhanced to verify
that any argument to `$beginsWith` is a utf8 string.
diff --git a/src/mango/src/mango_idx_view.erl b/src/mango/src/mango_idx_view.erl
index d1650e9..95e1fc5 100644
--- a/src/mango/src/mango_idx_view.erl
+++ b/src/mango/src/mango_idx_view.erl
@@ -347,7 +347,7 @@
     range(Selector, Index, '$gt', mango_json:min(), '$lt', mango_json:max()).
 
 % Adjust Low and High based on values found for the
-% givend Index in Selector.
+% given Index in Selector.
 range({[{<<"$and">>, Args}]}, Index, LCmp, Low, HCmp, High) ->
     lists:foldl(
         fun
@@ -417,7 +417,9 @@
 % beginsWith requires both a high and low bound
 range({[{<<"$beginsWith">>, Arg}]}, LCmp, Low, HCmp, High) ->
     {LCmp0, Low0, HCmp0, High0} = range({[{<<"$gte">>, Arg}]}, LCmp, Low, HCmp, High),
-    range({[{<<"$lte">>, <<Arg/binary, 16#10FFFF>>}]}, LCmp0, Low0, HCmp0, High0);
+    % U+FFFF is the highest sorting code point according to the collator rules,
+    % even though it's not the highest code point in UTF8.
+    range({[{<<"$lte">>, <<Arg/binary, 16#FFFF/utf8>>}]}, LCmp0, Low0, HCmp0, High0);
 range({[{<<"$lt">>, Arg}]}, LCmp, Low, HCmp, High) ->
     case range_pos(Low, Arg, High) of
         min ->
@@ -624,6 +626,10 @@
     Selector = #{<<"field">> => #{<<"$gte">> => undefined}},
     ?assertEqual([<<"field">>], indexable_fields_of(Selector)).
 
+indexable_fields_beginswith_test() ->
+    Selector = #{<<"field">> => #{<<"$beginsWith">> => undefined}},
+    ?assertEqual([<<"field">>], indexable_fields_of(Selector)).
+
 indexable_fields_gt_test() ->
     Selector = #{<<"field">> => #{<<"$gt">> => undefined}},
     ?assertEqual([<<"field">>], indexable_fields_of(Selector)).
diff --git a/src/mango/src/mango_selector.erl b/src/mango/src/mango_selector.erl
index 93d3b10..d63641a 100644
--- a/src/mango/src/mango_selector.erl
+++ b/src/mango/src/mango_selector.erl
@@ -136,7 +136,10 @@
 norm_ops({[{<<"$text">>, Arg}]}) ->
     ?MANGO_ERROR({bad_arg, '$text', Arg});
 norm_ops({[{<<"$beginsWith">>, Arg}]} = Cond) when is_binary(Arg) ->
-    Cond;
+    case couch_util:validate_utf8(Arg) of
+        true -> Cond;
+        false -> ?MANGO_ERROR({bad_arg, '$beginsWith', Arg})
+    end;
 % Not technically an operator but we pass it through here
 % so that this function accepts its own output. This exists
 % so that $text can have a field name value which simplifies
@@ -1070,12 +1073,18 @@
 match_beginswith_test() ->
     % matching
     ?assertEqual(true, check_beginswith(<<"_id">>, <<"f">>)),
-    % no match (user_id is not a binary string)
+    % no match (user_id field in the test doc contains an integer)
     ?assertEqual(false, check_beginswith(<<"user_id">>, <<"f">>)),
     % invalid (prefix is not a binary string)
     ?assertThrow(
         {mango_error, mango_selector, {invalid_operator, <<"$beginsWith">>}},
         check_beginswith(<<"user_id">>, 1)
+    ),
+    % invalid (prefix is not a utf8 string)
+    InvalidArg = <<32#FFFFF>>,
+    ?assertThrow(
+        {mango_error, mango_selector, {bad_arg, '$beginsWith', InvalidArg}},
+        check_beginswith(<<"user_id">>, InvalidArg)
     ).
 
 -endif.
diff --git a/src/mango/test/25-beginswith-test.py b/src/mango/test/25-beginswith-test.py
index 3b5134b..df96560 100644
--- a/src/mango/test/25-beginswith-test.py
+++ b/src/mango/test/25-beginswith-test.py
@@ -54,7 +54,7 @@
 
         self.assertEqual(mrargs["start_key"], ["A"])
         end_key_bytes = to_utf8_bytes(mrargs["end_key"])
-        self.assertEqual(end_key_bytes, [b"A\xef\xbf\xbd", b"<MAX>"])
+        self.assertEqual(end_key_bytes, [b"A\xef\xbf\xbf", b"<MAX>"])
 
     def test_compound_key(self):
         selector = {"name": "Eddie", "location": {"$beginsWith": "A"}}
@@ -62,7 +62,7 @@
 
         self.assertEqual(mrargs["start_key"], ["Eddie", "A"])
         end_key_bytes = to_utf8_bytes(mrargs["end_key"])
-        self.assertEqual(end_key_bytes, [b"Eddie", b"A\xef\xbf\xbd", b"<MAX>"])
+        self.assertEqual(end_key_bytes, [b"Eddie", b"A\xef\xbf\xbf", b"<MAX>"])
 
         docs = self.db.find(selector)
         self.assertEqual(len(docs), 1)
@@ -74,12 +74,12 @@
             {
                 "sort": ["location"],
                 "start_key": [b"A"],
-                "end_key": [b"A\xef\xbf\xbd", b"<MAX>"],
+                "end_key": [b"A\xef\xbf\xbf", b"<MAX>"],
                 "direction": "fwd",
             },
             {
                 "sort": [{"location": "desc"}],
-                "start_key": [b"A\xef\xbf\xbd", b"<MAX>"],
+                "start_key": [b"A\xef\xbf\xbf", b"<MAX>"],
                 "end_key": [b"A"],
                 "direction": "rev",
             },
@@ -97,7 +97,7 @@
 
         self.assertEqual(mrargs["start_key"], "a")
         end_key_bytes = to_utf8_bytes(mrargs["end_key"])
-        self.assertEqual(end_key_bytes, [b"a", b"\xef\xbf\xbd"])
+        self.assertEqual(end_key_bytes, [b"a", b"\xef\xbf\xbf"])
 
     def test_no_index(self):
         selector = {"foo": {"$beginsWith": "a"}}