apache / couchdb / f8b1b4a0ed717651ebd018bed61321e811fcb6b2 / . / src / couch / src / couch_key_tree.erl

% Licensed under the Apache License, Version 2.0 (the "License"); you may not | |

% use this file except in compliance with the License. You may obtain a copy of | |

% the License at | |

% | |

% http://www.apache.org/licenses/LICENSE-2.0 | |

% | |

% Unless required by applicable law or agreed to in writing, software | |

% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | |

% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | |

% License for the specific language governing permissions and limitations under | |

% the License. | |

%% @doc Data structure used to represent document edit histories. | |

%% A key tree is used to represent the edit history of a document. Each node of | |

%% the tree represents a particular version. Relations between nodes represent | |

%% the order that these edits were applied. For instance, a set of three edits | |

%% would produce a tree of versions A->B->C indicating that edit C was based on | |

%% version B which was in turn based on A. In a world without replication (and | |

%% no ability to disable MVCC checks), all histories would be forced to be | |

%% linear lists of edits due to constraints imposed by MVCC (ie, new edits must | |

%% be based on the current version). However, we have replication, so we must | |

%% deal with not so easy cases, which lead to trees. | |

%% | |

%% Consider a document in state A. This doc is replicated to a second node. We | |

%% then edit the document on each node leaving it in two different states, B | |

%% and C. We now have two key trees, A->B and A->C. When we go to replicate a | |

%% second time, the key tree must combine these two trees which gives us | |

%% A->(B|C). This is how conflicts are introduced. In terms of the key tree, we | |

%% say that we have two leaves (B and C) that are not deleted. The presense of | |

%% the multiple leaves indicate conflict. To remove a conflict, one of the | |

%% edits (B or C) can be deleted, which results in, A->(B|C->D) where D is an | |

%% edit that is specially marked with the a deleted=true flag. | |

%% | |

%% What makes this a bit more complicated is that there is a limit to the | |

%% number of revisions kept, specified in couch_db.hrl (default is 1000). When | |

%% this limit is exceeded only the last 1000 are kept. This comes in to play | |

%% when branches are merged. The comparison has to begin at the same place in | |

%% the branches. A revision id is of the form N-XXXXXXX where N is the current | |

%% revision depth. So each path will have a start number, calculated in | |

%% couch_doc:to_path using the formula N - length(RevIds) + 1 So, .eg. if a doc | |

%% was edit 1003 times this start number would be 4, indicating that 3 | |

%% revisions were truncated. | |

%% | |

%% This comes into play in @see merge_at/3 which recursively walks down one | |

%% tree or the other until they begin at the same revision. | |

-module(couch_key_tree). | |

-export([ | |

count_leafs/1, | |

find_missing/2, | |

fold/3, | |

get/2, | |

get_all_leafs/1, | |

get_all_leafs_full/1, | |

get_full_key_paths/2, | |

get_key_leafs/2, | |

map/2, | |

map_leafs/2, | |

mapfold/3, | |

multi_merge/2, | |

merge/2, | |

remove_leafs/2, | |

stem/2 | |

]). | |

-include_lib("couch/include/couch_db.hrl"). | |

-type treenode() :: {Key::term(), Value::term(), [Node::treenode()]}. | |

-type tree() :: {Depth::pos_integer(), [treenode()]}. | |

-type revtree() :: [tree()]. | |

%% @doc Merge multiple paths into the given tree. | |

-spec multi_merge(revtree(), tree()) -> revtree(). | |

multi_merge(RevTree, Trees) -> | |

lists:foldl(fun(Tree, RevTreeAcc) -> | |

{NewRevTree, _} = merge(RevTreeAcc, Tree), | |

NewRevTree | |

end, RevTree, lists:sort(Trees)). | |

%% @doc Merge a path into a tree. | |

-spec merge(revtree(), tree() | path()) -> | |

{revtree(), new_leaf | new_branch | internal_node}. | |

merge(RevTree, Tree) -> | |

{Merged, Result} = merge_tree(RevTree, Tree, []), | |

{lists:sort(Merged), Result}. | |

%% @private | |

%% @doc Attempt to merge Tree into each branch of the RevTree. | |

%% If it can't find a branch that the new tree merges into, add it as a | |

%% new branch in the RevTree. | |

-spec merge_tree(revtree(), tree() | path(), revtree()) -> | |

{revtree(), new_leaf | new_branch | internal_node}. | |

merge_tree([], Tree, []) -> | |

{[Tree], new_leaf}; | |

merge_tree([], Tree, MergeAcc) -> | |

{[Tree|MergeAcc], new_branch}; | |

merge_tree([{Depth, Nodes} | Rest], {IDepth, INodes}=Tree, MergeAcc) -> | |

% For the intrepid observer following along at home, notice what we're | |

% doing here with (Depth - IDepth). This tells us which of the two | |

% branches (Nodes or INodes) we need to seek into. If Depth > IDepth | |

% that means we need go into INodes to find where we line up with | |

% Nodes. If Depth < IDepth, its obviously the other way. If it turns | |

% out that (Depth - IDepth) == 0, then we know that this is where | |

% we begin our actual merge operation (ie, looking for key matches). | |

% Its helpful to note that this whole moving into sub-branches is due | |

% to how we store trees that have been stemmed. When a path is | |

% stemmed so that the root node is lost, we wrap it in a tuple with | |

% the number keys that have been droped. This number is the depth | |

% value that's used throughout this module. | |

case merge_at([Nodes], Depth - IDepth, [INodes]) of | |

{[Merged], Result} -> | |

NewDepth = erlang:min(Depth, IDepth), | |

{Rest ++ [{NewDepth, Merged} | MergeAcc], Result}; | |

fail -> | |

merge_tree(Rest, Tree, [{Depth, Nodes} | MergeAcc]) | |

end. | |

%% @private | |

%% @doc Locate the point at which merging can start. | |

%% Because of stemming we may need to seek into one of the branches | |

%% before we can start comparing node keys. If one of the branches | |

%% ends up running out of nodes we know that these two branches can | |

%% not be merged. | |

-spec merge_at([node()], integer(), [node()]) -> | |

{revtree(), new_leaf | new_branch | internal_node} | fail. | |

merge_at(_Nodes, _Pos, []) -> | |

fail; | |

merge_at([], _Pos, _INodes) -> | |

fail; | |

merge_at(Nodes, Pos, [{IK, IV, [NextINode]}]) when Pos > 0 -> | |

% Depth was bigger than IDepth, so we need to discard from the | |

% insert path to find where it might start matching. | |

case merge_at(Nodes, Pos - 1, [NextINode]) of | |

{Merged, Result} -> {[{IK, IV, Merged}], Result}; | |

fail -> fail | |

end; | |

merge_at(_Nodes, Pos, [{_IK, _IV, []}]) when Pos > 0 -> | |

% We've run out of path on the insert side, there's no way we can | |

% merge with this branch | |

fail; | |

merge_at([{K, V, SubTree} | Sibs], Pos, INodes) when Pos < 0 -> | |

% When Pos is negative, Depth was less than IDepth, so we | |

% need to discard from the revision tree path | |

case merge_at(SubTree, Pos + 1, INodes) of | |

{Merged, Result} -> | |

{[{K, V, Merged} | Sibs], Result}; | |

fail -> | |

% Merging along the subtree failed. We need to also try | |

% merging the insert branch against the siblings of this | |

% node. | |

case merge_at(Sibs, Pos, INodes) of | |

{Merged, Result} -> {[{K, V, SubTree} | Merged], Result}; | |

fail -> fail | |

end | |

end; | |

merge_at([{K, V1, Nodes} | Sibs], 0, [{K, V2, INodes}]) -> | |

% Keys are equal. At this point we have found a possible starting | |

% position for our merge to take place. | |

{Merged, Result} = merge_extend(Nodes, INodes), | |

{[{K, value_pref(V1, V2), Merged} | Sibs], Result}; | |

merge_at([{K1, _, _} | _], 0, [{K2, _, _}]) when K1 > K2 -> | |

% Siblings keys are ordered, no point in continuing | |

fail; | |

merge_at([Tree | Sibs], 0, INodes) -> | |

% INodes key comes after this key, so move on to the next sibling. | |

case merge_at(Sibs, 0, INodes) of | |

{Merged, Result} -> {[Tree | Merged], Result}; | |

fail -> fail | |

end. | |

-spec merge_extend(revtree(), revtree()) -> | |

{revtree(), new_leaf | new_branch | internal_node}. | |

merge_extend([], B) when B =/= [] -> | |

% Most likely the insert branch simply extends this one, so the new | |

% branch is exactly B. Its also possible that B is a branch because | |

% its key sorts greater than all siblings of an internal node. This | |

% condition is checked in the last clause of this function and the | |

% new_leaf result is fixed to be new_branch. | |

{B, new_leaf}; | |

merge_extend(A, []) -> | |

% Insert branch ends an internal node in our original revtree() | |

% so the end result is exactly our original revtree. | |

{A, internal_node}; | |

merge_extend([{K, V1, SubA} | NextA], [{K, V2, SubB}]) -> | |

% Here we're simply extending the path to the next deeper | |

% level in the two branches. | |

{Merged, Result} = merge_extend(SubA, SubB), | |

{[{K, value_pref(V1, V2), Merged} | NextA], Result}; | |

merge_extend([{K1, _, _}=NodeA | Rest], [{K2, _, _}=NodeB]) when K1 > K2 -> | |

% Keys are ordered so we know this is where the insert branch needs | |

% to be inserted into the tree. We also know that this creates a new | |

% branch so we have a new leaf to report. | |

{[NodeB, NodeA | Rest], new_branch}; | |

merge_extend([Tree | RestA], NextB) -> | |

% Here we're moving on to the next sibling to try and extend our | |

% merge even deeper. The length check is due to the fact that the | |

% key in NextB might be larger than the largest key in RestA which | |

% means we've created a new branch. | |

{Merged, Result0} = merge_extend(RestA, NextB), | |

Result = case length(Merged) == length(RestA) of | |

true -> Result0; | |

false -> new_branch | |

end, | |

{[Tree | Merged], Result}. | |

find_missing(_Tree, []) -> | |

[]; | |

find_missing([], SeachKeys) -> | |

SeachKeys; | |

find_missing([{Start, {Key, Value, SubTree}} | RestTree], SeachKeys) -> | |

PossibleKeys = [{KeyPos, KeyValue} || {KeyPos, KeyValue} <- SeachKeys, KeyPos >= Start], | |

ImpossibleKeys = [{KeyPos, KeyValue} || {KeyPos, KeyValue} <- SeachKeys, KeyPos < Start], | |

Missing = find_missing_simple(Start, [{Key, Value, SubTree}], PossibleKeys), | |

find_missing(RestTree, ImpossibleKeys ++ Missing). | |

find_missing_simple(_Pos, _Tree, []) -> | |

[]; | |

find_missing_simple(_Pos, [], SeachKeys) -> | |

SeachKeys; | |

find_missing_simple(Pos, [{Key, _, SubTree} | RestTree], SeachKeys) -> | |

PossibleKeys = [{KeyPos, KeyValue} || {KeyPos, KeyValue} <- SeachKeys, KeyPos >= Pos], | |

ImpossibleKeys = [{KeyPos, KeyValue} || {KeyPos, KeyValue} <- SeachKeys, KeyPos < Pos], | |

SrcKeys2 = PossibleKeys -- [{Pos, Key}], | |

SrcKeys3 = find_missing_simple(Pos + 1, SubTree, SrcKeys2), | |

ImpossibleKeys ++ find_missing_simple(Pos, RestTree, SrcKeys3). | |

filter_leafs([], _Keys, FilteredAcc, RemovedKeysAcc) -> | |

{FilteredAcc, RemovedKeysAcc}; | |

filter_leafs([{Pos, [{LeafKey, _}|_]} = Path |Rest], Keys, FilteredAcc, RemovedKeysAcc) -> | |

FilteredKeys = lists:delete({Pos, LeafKey}, Keys), | |

if FilteredKeys == Keys -> | |

% this leaf is not a key we are looking to remove | |

filter_leafs(Rest, Keys, [Path | FilteredAcc], RemovedKeysAcc); | |

true -> | |

% this did match a key, remove both the node and the input key | |

filter_leafs(Rest, FilteredKeys, FilteredAcc, [{Pos, LeafKey} | RemovedKeysAcc]) | |

end. | |

% Removes any branches from the tree whose leaf node(s) are in the Keys | |

remove_leafs(Trees, Keys) -> | |

% flatten each branch in a tree into a tree path | |

Paths = get_all_leafs_full(Trees), | |

% filter out any that are in the keys list. | |

{FilteredPaths, RemovedKeys} = filter_leafs(Paths, Keys, [], []), | |

SortedPaths = lists:sort( | |

[{Pos + 1 - length(Path), Path} || {Pos, Path} <- FilteredPaths] | |

), | |

% convert paths back to trees | |

NewTree = lists:foldl( | |

fun({StartPos, Path},TreeAcc) -> | |

[SingleTree] = lists:foldl( | |

fun({K,V},NewTreeAcc) -> [{K,V,NewTreeAcc}] end, [], Path), | |

{NewTrees, _} = merge(TreeAcc, {StartPos, SingleTree}), | |

NewTrees | |

end, [], SortedPaths), | |

{NewTree, RemovedKeys}. | |

% get the leafs in the tree matching the keys. The matching key nodes can be | |

% leafs or an inner nodes. If an inner node, then the leafs for that node | |

% are returned. | |

get_key_leafs(Tree, Keys) -> | |

get_key_leafs(Tree, Keys, []). | |

get_key_leafs(_, [], Acc) -> | |

{Acc, []}; | |

get_key_leafs([], Keys, Acc) -> | |

{Acc, Keys}; | |

get_key_leafs([{Pos, Tree}|Rest], Keys, Acc) -> | |

{Gotten, RemainingKeys} = get_key_leafs_simple(Pos, [Tree], Keys, []), | |

get_key_leafs(Rest, RemainingKeys, Gotten ++ Acc). | |

get_key_leafs_simple(_Pos, _Tree, [], _PathAcc) -> | |

{[], []}; | |

get_key_leafs_simple(_Pos, [], Keys, _PathAcc) -> | |

{[], Keys}; | |

get_key_leafs_simple(Pos, [{Key, _, SubTree}=Tree | RestTree], Keys, PathAcc) -> | |

case lists:delete({Pos, Key}, Keys) of | |

Keys -> | |

% Same list, key not found | |

NewPathAcc = [Key | PathAcc], | |

{ChildLeafs, Keys2} = get_key_leafs_simple(Pos + 1, SubTree, Keys, NewPathAcc), | |

{SiblingLeafs, Keys3} = get_key_leafs_simple(Pos, RestTree, Keys2, PathAcc), | |

{ChildLeafs ++ SiblingLeafs, Keys3}; | |

Keys2 -> | |

% This is a key we were looking for, get all descendant | |

% leafs while removing any requested key we find. Notice | |

% that this key will be returned by get_key_leafs_simple2 | |

% if it's a leaf so there's no need to return it here. | |

{ChildLeafs, Keys3} = get_key_leafs_simple2(Pos, [Tree], Keys2, PathAcc), | |

{SiblingLeafs, Keys4} = get_key_leafs_simple(Pos, RestTree, Keys3, PathAcc), | |

{ChildLeafs ++ SiblingLeafs, Keys4} | |

end. | |

get_key_leafs_simple2(_Pos, [], Keys, _PathAcc) -> | |

% No more tree to deal with so no more keys to return. | |

{[], Keys}; | |

get_key_leafs_simple2(Pos, [{Key, Value, []} | RestTree], Keys, PathAcc) -> | |

% This is a leaf as defined by having an empty list of | |

% child nodes. The assertion is a bit subtle but the function | |

% clause match means its a leaf. | |

Keys2 = lists:delete({Pos, Key}, Keys), | |

{SiblingLeafs, Keys3} = get_key_leafs_simple2(Pos, RestTree, Keys2, PathAcc), | |

{[{Value, {Pos, [Key | PathAcc]}} | SiblingLeafs], Keys3}; | |

get_key_leafs_simple2(Pos, [{Key, _Value, SubTree} | RestTree], Keys, PathAcc) -> | |

% This isn't a leaf. Recurse into the subtree and then | |

% process any sibling branches. | |

Keys2 = lists:delete({Pos, Key}, Keys), | |

NewPathAcc = [Key | PathAcc], | |

{ChildLeafs, Keys3} = get_key_leafs_simple2(Pos + 1, SubTree, Keys2, NewPathAcc), | |

{SiblingLeafs, Keys4} = get_key_leafs_simple2(Pos, RestTree, Keys3, PathAcc), | |

{ChildLeafs ++ SiblingLeafs, Keys4}. | |

get(Tree, KeysToGet) -> | |

{KeyPaths, KeysNotFound} = get_full_key_paths(Tree, KeysToGet), | |

FixedResults = [ {Value, {Pos, [Key0 || {Key0, _} <- Path]}} || {Pos, [{_Key, Value}|_]=Path} <- KeyPaths], | |

{FixedResults, KeysNotFound}. | |

get_full_key_paths(Tree, Keys) -> | |

get_full_key_paths(Tree, Keys, []). | |

get_full_key_paths(_, [], Acc) -> | |

{Acc, []}; | |

get_full_key_paths([], Keys, Acc) -> | |

{Acc, Keys}; | |

get_full_key_paths([{Pos, Tree}|Rest], Keys, Acc) -> | |

{Gotten, RemainingKeys} = get_full_key_paths(Pos, [Tree], Keys, []), | |

get_full_key_paths(Rest, RemainingKeys, Gotten ++ Acc). | |

get_full_key_paths(_Pos, _Tree, [], _KeyPathAcc) -> | |

{[], []}; | |

get_full_key_paths(_Pos, [], KeysToGet, _KeyPathAcc) -> | |

{[], KeysToGet}; | |

get_full_key_paths(Pos, [{KeyId, Value, SubTree} | RestTree], KeysToGet, KeyPathAcc) -> | |

KeysToGet2 = KeysToGet -- [{Pos, KeyId}], | |

CurrentNodeResult = | |

case length(KeysToGet2) =:= length(KeysToGet) of | |

true -> % not in the key list. | |

[]; | |

false -> % this node is the key list. return it | |

[{Pos, [{KeyId, Value} | KeyPathAcc]}] | |

end, | |

{KeysGotten, KeysRemaining} = get_full_key_paths(Pos + 1, SubTree, KeysToGet2, [{KeyId, Value} | KeyPathAcc]), | |

{KeysGotten2, KeysRemaining2} = get_full_key_paths(Pos, RestTree, KeysRemaining, KeyPathAcc), | |

{CurrentNodeResult ++ KeysGotten ++ KeysGotten2, KeysRemaining2}. | |

get_all_leafs_full(Tree) -> | |

get_all_leafs_full(Tree, []). | |

get_all_leafs_full([], Acc) -> | |

Acc; | |

get_all_leafs_full([{Pos, Tree} | Rest], Acc) -> | |

get_all_leafs_full(Rest, get_all_leafs_full_simple(Pos, [Tree], []) ++ Acc). | |

get_all_leafs_full_simple(_Pos, [], _KeyPathAcc) -> | |

[]; | |

get_all_leafs_full_simple(Pos, [{KeyId, Value, []} | RestTree], KeyPathAcc) -> | |

[{Pos, [{KeyId, Value} | KeyPathAcc]} | get_all_leafs_full_simple(Pos, RestTree, KeyPathAcc)]; | |

get_all_leafs_full_simple(Pos, [{KeyId, Value, SubTree} | RestTree], KeyPathAcc) -> | |

get_all_leafs_full_simple(Pos + 1, SubTree, [{KeyId, Value} | KeyPathAcc]) ++ get_all_leafs_full_simple(Pos, RestTree, KeyPathAcc). | |

get_all_leafs(Trees) -> | |

get_all_leafs(Trees, []). | |

get_all_leafs([], Acc) -> | |

Acc; | |

get_all_leafs([{Pos, Tree}|Rest], Acc) -> | |

get_all_leafs(Rest, get_all_leafs_simple(Pos, [Tree], []) ++ Acc). | |

get_all_leafs_simple(_Pos, [], _KeyPathAcc) -> | |

[]; | |

get_all_leafs_simple(Pos, [{KeyId, Value, []} | RestTree], KeyPathAcc) -> | |

[{Value, {Pos, [KeyId | KeyPathAcc]}} | get_all_leafs_simple(Pos, RestTree, KeyPathAcc)]; | |

get_all_leafs_simple(Pos, [{KeyId, _Value, SubTree} | RestTree], KeyPathAcc) -> | |

get_all_leafs_simple(Pos + 1, SubTree, [KeyId | KeyPathAcc]) ++ get_all_leafs_simple(Pos, RestTree, KeyPathAcc). | |

count_leafs([]) -> | |

0; | |

count_leafs([{_Pos,Tree}|Rest]) -> | |

count_leafs_simple([Tree]) + count_leafs(Rest). | |

count_leafs_simple([]) -> | |

0; | |

count_leafs_simple([{_Key, _Value, []} | RestTree]) -> | |

1 + count_leafs_simple(RestTree); | |

count_leafs_simple([{_Key, _Value, SubTree} | RestTree]) -> | |

count_leafs_simple(SubTree) + count_leafs_simple(RestTree). | |

fold(_Fun, Acc, []) -> | |

Acc; | |

fold(Fun, Acc0, [{Pos, Tree}|Rest]) -> | |

Acc1 = fold_simple(Fun, Acc0, Pos, [Tree]), | |

fold(Fun, Acc1, Rest). | |

fold_simple(_Fun, Acc, _Pos, []) -> | |

Acc; | |

fold_simple(Fun, Acc0, Pos, [{Key, Value, SubTree} | RestTree]) -> | |

Type = if SubTree == [] -> leaf; true -> branch end, | |

Acc1 = Fun({Pos, Key}, Value, Type, Acc0), | |

Acc2 = fold_simple(Fun, Acc1, Pos+1, SubTree), | |

fold_simple(Fun, Acc2, Pos, RestTree). | |

map(_Fun, []) -> | |

[]; | |

map(Fun, [{Pos, Tree}|Rest]) -> | |

case erlang:fun_info(Fun, arity) of | |

{arity, 2} -> | |

[NewTree] = map_simple(fun(A,B,_C) -> Fun(A,B) end, Pos, [Tree]), | |

[{Pos, NewTree} | map(Fun, Rest)]; | |

{arity, 3} -> | |

[NewTree] = map_simple(Fun, Pos, [Tree]), | |

[{Pos, NewTree} | map(Fun, Rest)] | |

end. | |

map_simple(_Fun, _Pos, []) -> | |

[]; | |

map_simple(Fun, Pos, [{Key, Value, SubTree} | RestTree]) -> | |

Value2 = Fun({Pos, Key}, Value, | |

if SubTree == [] -> leaf; true -> branch end), | |

[{Key, Value2, map_simple(Fun, Pos + 1, SubTree)} | map_simple(Fun, Pos, RestTree)]. | |

mapfold(_Fun, Acc, []) -> | |

{[], Acc}; | |

mapfold(Fun, Acc, [{Pos, Tree} | Rest]) -> | |

{[NewTree], Acc2} = mapfold_simple(Fun, Acc, Pos, [Tree]), | |

{Rest2, Acc3} = mapfold(Fun, Acc2, Rest), | |

{[{Pos, NewTree} | Rest2], Acc3}. | |

mapfold_simple(_Fun, Acc, _Pos, []) -> | |

{[], Acc}; | |

mapfold_simple(Fun, Acc, Pos, [{Key, Value, SubTree} | RestTree]) -> | |

{Value2, Acc2} = Fun({Pos, Key}, Value, | |

if SubTree == [] -> leaf; true -> branch end, Acc), | |

{SubTree2, Acc3} = mapfold_simple(Fun, Acc2, Pos + 1, SubTree), | |

{RestTree2, Acc4} = mapfold_simple(Fun, Acc3, Pos, RestTree), | |

{[{Key, Value2, SubTree2} | RestTree2], Acc4}. | |

map_leafs(_Fun, []) -> | |

[]; | |

map_leafs(Fun, [{Pos, Tree}|Rest]) -> | |

[NewTree] = map_leafs_simple(Fun, Pos, [Tree]), | |

[{Pos, NewTree} | map_leafs(Fun, Rest)]. | |

map_leafs_simple(_Fun, _Pos, []) -> | |

[]; | |

map_leafs_simple(Fun, Pos, [{Key, Value, []} | RestTree]) -> | |

Value2 = Fun({Pos, Key}, Value), | |

[{Key, Value2, []} | map_leafs_simple(Fun, Pos, RestTree)]; | |

map_leafs_simple(Fun, Pos, [{Key, Value, SubTree} | RestTree]) -> | |

[{Key, Value, map_leafs_simple(Fun, Pos + 1, SubTree)} | map_leafs_simple(Fun, Pos, RestTree)]. | |

stem(Trees, Limit) -> | |

try | |

{_, Branches} = lists:foldl(fun(Tree, {Seen, TreeAcc}) -> | |

{NewSeen, NewBranches} = stem_tree(Tree, Limit, Seen), | |

{NewSeen, NewBranches ++ TreeAcc} | |

end, {sets:new(), []}, Trees), | |

lists:sort(Branches) | |

catch throw:dupe_keys -> | |

repair_tree(Trees, Limit) | |

end. | |

stem_tree({Depth, Child}, Limit, Seen) -> | |

case stem_tree(Depth, Child, Limit, Seen) of | |

{NewSeen, _, NewChild, NewBranches} -> | |

{NewSeen, [{Depth, NewChild} | NewBranches]}; | |

{NewSeen, _, NewBranches} -> | |

{NewSeen, NewBranches} | |

end. | |

stem_tree(_Depth, {Key, _Val, []} = Leaf, Limit, Seen) -> | |

{check_key(Key, Seen), Limit - 1, Leaf, []}; | |

stem_tree(Depth, {Key, Val, Children}, Limit, Seen0) -> | |

Seen1 = check_key(Key, Seen0), | |

FinalAcc = lists:foldl(fun(Child, Acc) -> | |

{SeenAcc, LimitPosAcc, ChildAcc, BranchAcc} = Acc, | |

case stem_tree(Depth + 1, Child, Limit, SeenAcc) of | |

{NewSeenAcc, LimitPos, NewChild, NewBranches} -> | |

NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc), | |

NewChildAcc = [NewChild | ChildAcc], | |

NewBranchAcc = NewBranches ++ BranchAcc, | |

{NewSeenAcc, NewLimitPosAcc, NewChildAcc, NewBranchAcc}; | |

{NewSeenAcc, LimitPos, NewBranches} -> | |

NewLimitPosAcc = erlang:max(LimitPos, LimitPosAcc), | |

NewBranchAcc = NewBranches ++ BranchAcc, | |

{NewSeenAcc, NewLimitPosAcc, ChildAcc, NewBranchAcc} | |

end | |

end, {Seen1, -1, [], []}, Children), | |

{FinalSeen, FinalLimitPos, FinalChildren, FinalBranches} = FinalAcc, | |

case FinalLimitPos of | |

N when N > 0, length(FinalChildren) > 0 -> | |

FinalNode = {Key, Val, lists:reverse(FinalChildren)}, | |

{FinalSeen, FinalLimitPos - 1, FinalNode, FinalBranches}; | |

0 when length(FinalChildren) > 0 -> | |

NewBranches = lists:map(fun(Child) -> | |

{Depth + 1, Child} | |

end, lists:reverse(FinalChildren)), | |

{FinalSeen, -1, NewBranches ++ FinalBranches}; | |

N when N < 0, length(FinalChildren) == 0 -> | |

{FinalSeen, FinalLimitPos - 1, FinalBranches} | |

end. | |

check_key(Key, Seen) -> | |

case sets:is_element(Key, Seen) of | |

true -> | |

throw(dupe_keys); | |

false -> | |

sets:add_element(Key, Seen) | |

end. | |

repair_tree(Trees, Limit) -> | |

% flatten each branch in a tree into a tree path, sort by starting rev # | |

Paths = lists:sort(lists:map(fun({Pos, Path}) -> | |

StemmedPath = lists:sublist(Path, Limit), | |

{Pos + 1 - length(StemmedPath), StemmedPath} | |

end, get_all_leafs_full(Trees))), | |

% convert paths back to trees | |

lists:foldl( | |

fun({StartPos, Path},TreeAcc) -> | |

[SingleTree] = lists:foldl( | |

fun({K,V},NewTreeAcc) -> [{K,V,NewTreeAcc}] end, [], Path), | |

{NewTrees, _} = merge(TreeAcc, {StartPos, SingleTree}), | |

NewTrees | |

end, [], Paths). | |

value_pref(Tuple, _) when is_tuple(Tuple), | |

(tuple_size(Tuple) == 3 orelse tuple_size(Tuple) == 4) -> | |

Tuple; | |

value_pref(_, Tuple) when is_tuple(Tuple), | |

(tuple_size(Tuple) == 3 orelse tuple_size(Tuple) == 4) -> | |

Tuple; | |

value_pref(?REV_MISSING, Other) -> | |

Other; | |

value_pref(Other, ?REV_MISSING) -> | |

Other; | |

value_pref(Last, _) -> | |

Last. |