Allow configuring maximum document ID length during replication

Currently due to a bug in http parser and lack of document ID length
enforcement, large document IDs will break replication jobs. Large IDs
will pass through the _change feed, revs diffs,  but then fail
during open_revs get request. open_revs request will keep retrying until
it gives up after long enough time, then replication task crashes and
restart again with the same pattern. The current effective limit is
around 8k or so. (The buffer size default 8192 and if the first line
of the request is larger than that, request will fail).

(See http://erlang.org/pipermail/erlang-questions/2011-June/059567.html
for more information about the possible failure mechanism).

Bypassing the parser bug by increasing recbuf size, will alow replication
to finish, however that means simply spreading the abnormal document through
the rest of the system, and might not be desirable always.

Also once long document IDs have been inserted in the source DB. Simply deleting
them doesn't work as they'd still appear in the change feed. They'd have to
be purged or somehow skipped during the replication step. This commit helps
do the later.

Operators can configure maximum length via this setting:
```
  replicator.max_document_id_length=0
```

The default value is 0 which means there is no maximum enforced, which is
backwards compatible behavior.

During replication if maximum is hit by a document, that document is skipped,
an error is written to the log:

```
Replicator: document id `aaaaaaaaaaaaaaaaaaaaa...` from source db  `http://.../cdyno-0000001/` is too long, ignoring.
```

and `"doc_write_failures"` statistic is bumped.

COUCHDB-3291
diff --git a/src/couch_replicator_changes_reader.erl b/src/couch_replicator_changes_reader.erl
index bed318a..f9d9097 100644
--- a/src/couch_replicator_changes_reader.erl
+++ b/src/couch_replicator_changes_reader.erl
@@ -89,9 +89,19 @@
         "source database `~s` (_changes sequence ~p)",
         [couch_replicator_api_wrap:db_uri(Db), DocInfo#doc_info.high_seq]);
 
-process_change(#doc_info{} = DocInfo, {_, _, ChangesQueue, _, _}) ->
-    ok = couch_work_queue:queue(ChangesQueue, DocInfo),
-    put(last_seq, DocInfo#doc_info.high_seq);
+process_change(#doc_info{id = Id} = DocInfo, {Parent, Db, ChangesQueue, _, _}) ->
+    case is_doc_id_too_long(byte_size(Id)) of
+        true ->
+            ShortId = lists:sublist(binary_to_list(Id), 64),
+            SourceDb = couch_replicator_api_wrap:db_uri(Db),
+            couch_log:error("Replicator: document id `~s...` from source db "
+                " `~s` is too long, ignoring.", [ShortId, SourceDb]),
+            Stats = couch_replicator_stats:new([{doc_write_failures, 1}]),
+            ok = gen_server:call(Parent, {add_stats, Stats}, infinity);
+        false ->
+            ok = couch_work_queue:queue(ChangesQueue, DocInfo),
+            put(last_seq, DocInfo#doc_info.high_seq)
+    end;
 
 process_change({last_seq, LS}, {Parent, _, _, true = _Continuous, Ts}) ->
     % LS should never be undefined, but it doesn't hurt to be defensive inside
@@ -111,3 +121,7 @@
     % change.  The two can differ substantially in the case of a restrictive
     % filter.
     ok.
+
+is_doc_id_too_long(IdLength) ->
+    ConfigMax = config:get_integer("replicator", "max_document_id_length", 0),
+    ConfigMax > 0 andalso IdLength > ConfigMax.
diff --git a/test/couch_replicator_id_too_long_tests.erl b/test/couch_replicator_id_too_long_tests.erl
new file mode 100644
index 0000000..f5d7165
--- /dev/null
+++ b/test/couch_replicator_id_too_long_tests.erl
@@ -0,0 +1,94 @@
+% Licensed under the Apache License, Version 2.0 (the "License"); you may not
+% use this file except in compliance with the License. You may obtain a copy of
+% the License at
+%
+%   http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+% License for the specific language governing permissions and limitations under
+% the License.
+
+-module(couch_replicator_id_too_long_tests).
+
+-include_lib("couch/include/couch_eunit.hrl").
+-include_lib("couch/include/couch_db.hrl").
+-include_lib("couch_replicator/src/couch_replicator.hrl").
+
+
+setup(_) ->
+    Ctx = test_util:start_couch([couch_replicator]),
+    Source = create_db(),
+    create_doc(Source),
+    Target = create_db(),
+    {Ctx, {Source, Target}}.
+
+
+teardown(_, {Ctx, {Source, Target}}) ->
+    delete_db(Source),
+    delete_db(Target),
+    config:set("replicator", "max_document_id_length", "0"),
+    ok = test_util:stop_couch(Ctx).
+
+
+id_too_long_replication_test_() ->
+    Pairs = [{local, local}, {local, remote},
+             {remote, local}, {remote, remote}],
+    {
+        "Doc id too long tests",
+        {
+            foreachx,
+            fun setup/1, fun teardown/2,
+            [{Pair, fun should_succeed/2} || Pair <- Pairs] ++
+            [{Pair, fun should_fail/2} || Pair <- Pairs]
+        }
+    }.
+
+
+should_succeed({From, To}, {_Ctx, {Source, Target}}) ->
+    RepObject = {[
+        {<<"source">>, db_url(From, Source)},
+        {<<"target">>, db_url(To, Target)}
+    ]},
+    config:set("replicator", "max_document_id_length", "5"),
+    {ok, _} = couch_replicator:replicate(RepObject, ?ADMIN_USER),
+    ?_assertEqual(ok, couch_replicator_test_helper:compare_dbs(Source, Target)).
+
+
+should_fail({From, To}, {_Ctx, {Source, Target}}) ->
+    RepObject = {[
+        {<<"source">>, db_url(From, Source)},
+        {<<"target">>, db_url(To, Target)}
+    ]},
+    config:set("replicator", "max_document_id_length", "4"),
+    {ok, _} = couch_replicator:replicate(RepObject, ?ADMIN_USER),
+    ?_assertError({badmatch, {not_found, missing}},
+        couch_replicator_test_helper:compare_dbs(Source, Target)).
+
+
+create_db() ->
+    DbName = ?tempdb(),
+    {ok, Db} = couch_db:create(DbName, [?ADMIN_CTX]),
+    ok = couch_db:close(Db),
+    DbName.
+
+
+create_doc(DbName) ->
+    {ok, Db} = couch_db:open(DbName, [?ADMIN_CTX]),
+    Doc = couch_doc:from_json_obj({[{<<"_id">>, <<"12345">>}]}),
+    {ok, _} = couch_db:update_doc(Db, Doc, []),
+    couch_db:ensure_full_commit(Db),
+    couch_db:close(Db).
+
+
+delete_db(DbName) ->
+    ok = couch_server:delete(DbName, [?ADMIN_CTX]).
+
+
+db_url(local, DbName) ->
+    DbName;
+db_url(remote, DbName) ->
+    Addr = config:get("httpd", "bind_address", "127.0.0.1"),
+    Port = mochiweb_socket_server:get(couch_httpd, port),
+    ?l2b(io_lib:format("http://~s:~b/~s", [Addr, Port, DbName])).