% blob: b6e1f097eda647056bbda24ec4ed499542a27cfd [file] [log] [blame]
% Licensed under the Apache License, Version 2.0 (the "License"); you may not
% use this file except in compliance with the License. You may obtain a copy of
% the License at
%
% http://www.apache.org/licenses/LICENSE-2.0
%
% Unless required by applicable law or agreed to in writing, software
% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
% License for the specific language governing permissions and limitations under
% the License.
-module(mango_selector_text).
-export([
convert/1,
convert/2,
append_sort_type/2
]).
-include_lib("couch/include/couch_db.hrl").
-include("mango.hrl").
%% Regex for <<"\\.">>
-define(PERIOD, "\\.").
% Translate a selector into a Lucene query binary.
%
% The selector is converted into an intermediate tuple tree starting
% from an empty path, rendered by to_query/1 as an iolist, and finally
% flattened into a single binary.
convert(Object) ->
    iolist_to_binary(to_query(convert([], Object))).
% Convert a selector (or sub-selector) into the intermediate tuple tree
% that to_query/1 renders as Lucene query text.
%
% Path holds the field-name parts collected so far in reverse order
% (innermost part first); <<"[]">> marks an array level. The second
% argument is an EJSON-style object `{[{Key, Value}]}` with a single
% member, or a bare binary/number/boolean handled by a clause near the
% end.
%
% Clauses are tried top to bottom, so the specific operator clauses
% must stay ahead of the catch-all operator and field-name clauses.
convert(Path, {[{<<"$and">>, Args}]}) ->
Parts = [convert(Path, Arg) || Arg <- Args],
{op_and, Parts};
convert(Path, {[{<<"$or">>, Args}]}) ->
Parts = [convert(Path, Arg) || Arg <- Args],
{op_or, Parts};
% Pair the negated clause with an existence check on the field;
% to_query/1 renders the pair as `exists AND NOT (clause)`.
convert(Path, {[{<<"$not">>, Arg}]}) ->
{op_not, {field_exists_query(Path), convert(Path, Arg)}};
% The nested conversion must yield an op_field (anything else is a
% badmatch); only its query value is kept for the $default field.
convert(Path, {[{<<"$default">>, Arg}]}) ->
{op_field, {_, Query}} = convert(Path, Arg),
{op_default, Query};
% The $text operator specifies a Lucene syntax query
% so we just pull it in directly.
convert(Path, {[{<<"$text">>, Query}]}) when is_binary(Query) ->
{op_field, {make_field(Path, Query), value_str(Query)}};
% The MongoDB docs for $all are super confusing and read more
% like they screwed up the implementation of this operator
% and then just documented it as a feature.
%
% This implementation will match the behavior as closely as
% possible based on the available docs but we'll need to have
% the testing team validate how MongoDB handles edge conditions
convert(Path, {[{<<"$all">>, Args}]}) ->
case Args of
[Values] when is_list(Values) ->
% If Args is a single element array then we have to
% either match if Path is that array or if it contains
% the array as an element of an array (which isn't at all
% confusing). For Lucene to return us all possible matches
% that means we just need to search for each value in
% Path.[] and Path.[].[] and rely on our filtering to limit
% the results properly.
Fields1 = convert(Path, {[{<<"$eq">> , Values}]}),
Fields2 = convert([<<"[]">>| Path], {[{<<"$eq">> , Values}]}),
{op_or, [Fields1, Fields2]};
_ ->
% Otherwise the $all operator is equivalent to an $and
% operator so we treat it as such.
convert([<<"[]">> | Path], {[{<<"$and">>, Args}]})
end;
% The $elemMatch Lucene query is not an exact translation
% as we can't enforce that the matches are all for the same
% item in an array. We just rely on the final selector match
% to filter out anything that doesn't match. The only trick
% is that we have to add the `[]` path element since the docs
% say this has to match against an array.
convert(Path, {[{<<"$elemMatch">>, Arg}]}) ->
convert([<<"[]">> | Path], Arg);
% Our comparison operators are fairly straight forward.
% A list, tuple (object) or null argument is not turned into a range;
% for those we only require that the field exists.
convert(Path, {[{<<"$lt">>, Arg}]}) when is_list(Arg); is_tuple(Arg);
Arg =:= null ->
field_exists_query(Path);
convert(Path, {[{<<"$lt">>, Arg}]}) ->
{op_field, {make_field(Path, Arg), range(lt, Arg)}};
convert(Path, {[{<<"$lte">>, Arg}]}) when is_list(Arg); is_tuple(Arg);
Arg =:= null->
field_exists_query(Path);
convert(Path, {[{<<"$lte">>, Arg}]}) ->
{op_field, {make_field(Path, Arg), range(lte, Arg)}};
%% This is for indexable_fields
convert(Path, {[{<<"$eq">>, Arg}]}) when Arg =:= null ->
{op_null, {make_field(Path, Arg), value_str(Arg)}};
% Equality against an array: the `:length` field under the `[]` path
% must equal the number of elements, and every element must match
% under the `[]` path as well.
convert(Path, {[{<<"$eq">>, Args}]}) when is_list(Args) ->
Path0 = [<<"[]">> | Path],
LPart = {op_field, {make_field(Path0, length), value_str(length(Args))}},
Parts0 = [convert(Path0, {[{<<"$eq">>, Arg}]}) || Arg <- Args],
Parts = [LPart | Parts0],
{op_and, Parts};
% Equality against an object: convert the object as a sub-selector.
convert(Path, {[{<<"$eq">>, {_} = Arg}]}) ->
convert(Path, Arg);
convert(Path, {[{<<"$eq">>, Arg}]}) ->
{op_field, {make_field(Path, Arg), value_str(Arg)}};
% $ne is "the field exists AND NOT the $eq query".
convert(Path, {[{<<"$ne">>, Arg}]}) ->
{op_not, {field_exists_query(Path), convert(Path, {[{<<"$eq">>, Arg}]})}};
convert(Path, {[{<<"$gte">>, Arg}]}) when is_list(Arg); is_tuple(Arg);
Arg =:= null ->
field_exists_query(Path);
convert(Path, {[{<<"$gte">>, Arg}]}) ->
{op_field, {make_field(Path, Arg), range(gte, Arg)}};
convert(Path, {[{<<"$gt">>, Arg}]}) when is_list(Arg); is_tuple(Arg);
Arg =:= null->
field_exists_query(Path);
convert(Path, {[{<<"$gt">>, Arg}]}) ->
{op_field, {make_field(Path, Arg), range(gt, Arg)}};
convert(Path, {[{<<"$in">>, Args}]}) ->
{op_or, convert_in(Path, Args)};
% $nin is "the field exists AND NOT the $in query".
convert(Path, {[{<<"$nin">>, Args}]}) ->
{op_not, {field_exists_query(Path), convert(Path, {[{<<"$in">>, Args}]})}};
% `$exists: false` is encoded as an op_not whose second element is the
% atom false; to_query/1 has a dedicated clause for that shape. Any
% value other than true/false raises a case_clause error.
convert(Path, {[{<<"$exists">>, ShouldExist}]}) ->
FieldExists = field_exists_query(Path),
case ShouldExist of
true -> FieldExists;
false -> {op_not, {FieldExists, false}}
end;
% We're not checking the actual type here, just looking for
% anything that has a possibility of matching by checking
% for the field name. We use the same logic for $exists on
% the actual query.
convert(Path, {[{<<"$type">>, _}]}) ->
field_exists_query(Path);
% $mod only checks that the field exists with the "number" type suffix.
convert(Path, {[{<<"$mod">>, _}]}) ->
field_exists_query(Path, "number");
% The lucene regular expression engine does not use java's regex engine but
% instead a custom implementation. The syntax is therefore different, so we
% would get different behavior than our view indexes. To be consistent, we will
% simply return docs for fields that exist and then run our match filter.
convert(Path, {[{<<"$regex">>, _}]}) ->
field_exists_query(Path, "string");
% $size compares against the `:length` field stored under the `[]` path.
convert(Path, {[{<<"$size">>, Arg}]}) ->
{op_field, {make_field([<<"[]">> | Path], length), value_str(Arg)}};
% All other operators are internal assertion errors for
% matching because we either should've removed them during
% normalization or something else broke.
convert(_Path, {[{<<"$", _/binary>>=Op, _}]}) ->
?MANGO_ERROR({invalid_operator, Op});
% We've hit a field name specifier. Check if the field name is accessing
% arrays. Convert occurrences of element position references to .[]. Then we
% need to break the name into path parts and continue our conversion.
convert(Path, {[{Field0, Cond}]}) ->
% An empty field name contributes no path parts.
{ok, PP0} = case Field0 of
<<>> ->
{ok, []};
_ ->
mango_util:parse_field(Field0)
end,
% Later on, we perform a lucene_escape_user call on the
% final Path, which calls parse_field again. Calling the function
% twice converts <<"a\\.b">> to [<<"a">>,<<"b">>]. This leads to
% an incorrect query since we need [<<"a.b">>]. Without breaking
% our escaping mechanism, we simply revert this first parse_field
% effect and replace instances of "." to "\\.".
MP = mango_util:cached_re(mango_period, ?PERIOD),
PP1 = [re:replace(P, MP, <<"\\\\.">>,
[global,{return,binary}]) || P <- PP0],
% PP2 comes back reversed (innermost part first), with every part that
% parses as an integer replaced by <<"[]">>.
{PP2, HasInteger} = replace_array_indexes(PP1, [], false),
NewPath = PP2 ++ Path,
case HasInteger of
true ->
% A numeric part may be either an array position or a literal
% field name, so query both the untouched path and the `[]` path.
OldPath = lists:reverse(PP1, Path),
OldParts = convert(OldPath, Cond),
NewParts = convert(NewPath, Cond),
{op_or, [OldParts, NewParts]};
false ->
convert(NewPath, Cond)
end;
%% For $in
convert(Path, Val) when is_binary(Val); is_number(Val); is_boolean(Val) ->
{op_field, {make_field(Path, Val), value_str(Val)}};
% Anything else is a bad selector. An object with more than one member
% raises unnormalized_selector; any other term fails with function_clause.
convert(_Path, {Props} = Sel) when length(Props) > 1 ->
erlang:error({unnormalized_selector, Sel}).
% Render one node of the intermediate tuple tree as an iolist of Lucene
% query text. Every node is wrapped in its own pair of parentheses.
to_query({op_and, Clauses}) when is_list(Clauses) ->
    Rendered = [to_query(Clause) || Clause <- Clauses],
    ["(", mango_util:join(<<" AND ">>, Rendered), ")"];
to_query({op_or, Clauses}) when is_list(Clauses) ->
    Rendered = [to_query(Clause) || Clause <- Clauses],
    ["(", mango_util:join(" OR ", Rendered), ")"];
% Negation of a real clause: "exists AND NOT (clause)".
to_query({op_not, {Exists, Negated}}) when is_tuple(Negated) ->
    Positive = to_query(Exists),
    Excluded = to_query(Negated),
    ["(", Positive, " AND NOT (", Excluded, "))"];
% $exists:false is encoded with the atom false in second position:
% match every field name, then subtract the ones for this field.
to_query({op_not, {Exists, false}}) ->
    ["($fieldnames:/.*/ ", " AND NOT (", to_query(Exists), "))"];
% Raw query text supplied as a binary is inserted verbatim.
to_query({op_insert, Raw}) when is_binary(Raw) ->
    ["(", Raw, ")"];
% op_field and op_null (the latter is for indexable_fields) render the
% same way. The field name goes through lucene_escape_user; the value
% is emitted exactly as the caller prepared it.
to_query({Tag, {Name, Value}}) when Tag =:= op_field; Tag =:= op_null ->
    Escaped = mango_util:lucene_escape_user(iolist_to_binary(Name)),
    ["(", Escaped, ":", Value, ")"];
to_query({op_fieldname, {Name, Wildcard}}) ->
    Escaped = mango_util:lucene_escape_user(iolist_to_binary(Name)),
    ["($fieldnames:", Escaped, Wildcard, ")"];
to_query({op_default, Value}) ->
    ["($default:", Value, ")"].
% Build the list of alternatives for an `$in` operator.
%
% Every argument is looked up both on the field itself (Path) and on
% its array form (Path with a leading <<"[]">> part). Returns one
% op_or node per element of Args; the caller wraps the list.
convert_in(Path, Args) ->
    BothPaths = [Path, [<<"[]">> | Path]],
    ConvertArg = fun
        ({Props}) ->
            % Object argument: each member is converted as its own
            % single-member sub-selector on both paths.
            {op_or, [
                {op_or, [convert(P, {[Prop]}) || P <- BothPaths]}
             || Prop <- Props
            ]};
        (Scalar) ->
            {op_or, [
                {op_field, {make_field(P, Scalar), value_str(Scalar)}}
             || P <- BothPaths
            ]}
    end,
    [ConvertArg(Arg) || Arg <- Args].
% Build the indexed field name (as an iolist) for Path.
%
% The atom `length` selects the `:length` field; any other argument
% selects the field suffixed with the argument's type name.
make_field(Path, Arg) ->
    Prefix = path_str(Path),
    case Arg of
        length -> [Prefix, <<":length">>];
        _ -> [Prefix, <<":">>, type_str(Arg)]
    end.
% Build a Lucene range expression (iolist) for a comparison operator.
%
% lt/lte ranges run from the minimum bound up to Arg, gte/gt ranges
% from Arg up to the maximum bound. `[`/`]` mark an inclusive end and
% `{`/`}` an exclusive one.
range(Op, Arg) when Op =:= lt; Op =:= lte ->
    Lower = get_range(min, Arg),
    Close =
        case Op of
            lt -> <<"}">>;
            lte -> <<"]">>
        end,
    [<<"[", Lower/binary, " TO ">>, value_str(Arg), Close];
range(Op, Arg) when Op =:= gte; Op =:= gt ->
    Upper = get_range(max, Arg),
    Open =
        case Op of
            gte -> <<"[">>;
            gt -> <<"{">>
        end,
    [Open, value_str(Arg), <<" TO ", Upper/binary, "]">>].
% Return the open-ended bound used by range/2 for the given side.
%
% Numbers are bounded by -Infinity / Infinity. Everything else is
% treated as a string: the lower bound is the empty quoted string and
% the upper bound is the highest Unicode code point, U+10FFFF.
get_range(min, Arg) when is_number(Arg) ->
    <<"-Infinity">>;
get_range(min, _Arg) ->
    <<"\"\"">>;
get_range(max, Arg) when is_number(Arg) ->
    <<"Infinity">>;
get_range(max, _Arg) ->
    % Erlang has no "\u" escape: "\u0x10FFFF" is scanned as the literal
    % text "u0x10FFFF", which would cut string ranges off at that value.
    % Emit the actual code point, UTF-8 encoded, instead.
    <<16#10FFFF/utf8>>.
% Build a query node that matches when any field rooted at Path exists.
%
% Two field-name patterns are OR-ed together: the path followed by ":"
% and anything, and the path followed by "." and anything. Using the
% explicit separators (instead of just appending "*") keeps a path
% such as foo.name from also matching foo.name_first.
field_exists_query(Path) ->
    Name = path_str(Path),
    {op_or, [
        {op_fieldname, {[Name, ":"], "*"}},
        {op_fieldname, {[Name], ".*"}}
    ]}.
% Build a query node that matches when the field at Path exists with
% the given type suffix (for example "number" or "string").
field_exists_query(Path, Type) ->
    TypedName = [path_str(Path), ":"],
    {op_fieldname, {TypedName, Type}}.
% Join a reversed path (innermost part first) into an iolist with the
% parts in document order, separated by periods.
path_str(Path) ->
    path_str(Path, []).

% Path is stored backwards, so prepending each part to the accumulator
% already produces the right order and no reverse is needed.
path_str([], Acc) ->
    Acc;
path_str([Outermost], Acc) ->
    [Outermost | Acc];
% A blank part adds no separator; the accumulator is only nested.
path_str([<<>> | Rest], Acc) ->
    path_str(Rest, [Acc]);
path_str([Part | Rest], Acc) ->
    path_str(Rest, [<<".">>, Part | Acc]).
% Name of the type suffix used in indexed field names for a value.
% Only null, binaries, booleans and numbers are supported; any other
% term fails with function_clause.
type_str(null) ->
    <<"null">>;
type_str(Bin) when is_binary(Bin) ->
    <<"string">>;
type_str(Bool) when is_boolean(Bool) ->
    <<"boolean">>;
type_str(Num) when is_number(Num) ->
    <<"number">>.
% Render a selector value as the text used on the value side of a
% Lucene field query.
%
% null is rendered as "true", the same text as the boolean true.
value_str(null) ->
    <<"true">>;
value_str(true) ->
    <<"true">>;
value_str(false) ->
    <<"false">>;
value_str(Int) when is_integer(Int) ->
    integer_to_binary(Int);
value_str(Float) when is_float(Float) ->
    float_to_binary(Float);
value_str(Bin) when is_binary(Bin) ->
    % A string that looks like a number is wrapped in double quotes;
    % every other string goes through the query-value escaper.
    case mango_util:is_number_string(Bin) of
        true -> <<"\"", Bin/binary, "\"">>;
        false -> mango_util:lucene_escape_query_value(Bin)
    end.
% Produce the sort specification for a text-index sort field.
%
% The field name is escaped first. If the escaped name already ends in
% the string or number type suffix, only the matching `<string>` or
% `<number>` marker is appended; otherwise the type is derived from
% the selector by get_sort_type/2.
append_sort_type(RawSortField, Selector) ->
    Escaped = mango_util:lucene_escape_user(RawSortField),
    IsString = mango_util:has_suffix(Escaped, <<"_3astring">>),
    IsNumber = mango_util:has_suffix(Escaped, <<"_3anumber">>),
    Suffix =
        if
            IsString =:= true -> <<"<string>">>;
            IsNumber =:= true -> <<"<number>">>;
            true -> get_sort_type(RawSortField, Selector)
        end,
    <<Escaped/binary, Suffix/binary>>.
% Derive the sort suffix for Field from how the selector uses it.
%
% The field must be used only as a string or only as a number in the
% selector; any other outcome (mixed or no usable conditions) raises a
% text_sort_error through ?MANGO_ERROR.
get_sort_type(Field, Selector) ->
    case lists:usort(get_sort_types(Field, Selector, [])) of
        [str] -> <<"_3astring<string>">>;
        [num] -> <<"_3anumber<number>">>;
        _Other -> ?MANGO_ERROR({text_sort_error, Field})
    end.
% Walk a selector and collect, for every operator applied directly to
% Field, whether its argument is a string (str) or a number (num).
% New findings are prepended to the accumulator.
get_sort_types(Field, {[{Field, {[{<<"$", _/binary>>, Operand}]}}]}, Seen)
        when is_binary(Operand) ->
    [str | Seen];
get_sort_types(Field, {[{Field, {[{<<"$", _/binary>>, Operand}]}}]}, Seen)
        when is_number(Operand) ->
    [num | Seen];
get_sort_types(Field, {[{_Key, Inner}]}, Seen) ->
    if
        is_list(Inner) ->
            % A list of sub-selectors: visit each in order.
            Visit = fun(Child, SeenSoFar) ->
                get_sort_types(Field, Child, SeenSoFar)
            end,
            lists:foldl(Visit, Seen, Inner);
        is_tuple(Inner) ->
            % A single nested selector: descend into it.
            get_sort_types(Field, Inner, Seen);
        true ->
            Seen
    end;
get_sort_types(_Field, _Other, Seen) ->
    Seen.
% Replace every path part that parses as an integer with <<"[]">>.
%
% Parts are pushed onto NewPartsAcc as they are visited, so the
% returned list is in reverse order relative to the input. The second
% element of the result is true when HasIntAcc was true or at least one
% part was replaced.
replace_array_indexes(Parts, NewPartsAcc, HasIntAcc) ->
    Step = fun(Part, {Acc, SeenInt}) ->
        IsIndex =
            try list_to_integer(binary_to_list(Part)) of
                _ -> true
            catch
                _:_ -> false
            end,
        case IsIndex of
            true -> {[<<"[]">> | Acc], true};
            false -> {[Part | Acc], SeenInt}
        end
    end,
    lists:foldl(Step, {NewPartsAcc, HasIntAcc}, Parts).