Support parsing UTF-16 surrogate pairs in mochiweb_html #164
diff --git a/CHANGES.md b/CHANGES.md
index 05bf694..af80a19 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,8 @@
-Version 2.13.0 released XXXX-XX-XX
+Version 2.13.0 released 2016-02-08
+* Support parsing of UTF-16 surrogate pairs encoded as character
+ references in mochiweb_html
+ https://github.com/mochi/mochiweb/issues/164
* Avoid swallowing messages that are not related to the socket
during request parsing
https://github.com/mochi/mochiweb/pull/161
diff --git a/src/mochiweb_html.erl b/src/mochiweb_html.erl
index 3fd93d0..3c5c4f9 100644
--- a/src/mochiweb_html.erl
+++ b/src/mochiweb_html.erl
@@ -639,13 +639,42 @@
+%% Tokenize the character reference at the decoder's current offset and
+%% return {{data, Utf8Binary, false}, NewDecoder}.
+%%
+%% The reference is resolved by tokenize_charref_raw/3, whose first element
+%% is whatever mochiweb_charref:charref/1 produced:
+%%   * an integer in the UTF-16 surrogate range (16#D800..16#DFFF) is passed
+%%     to tokeninize_charref_surrogate_pair/3, which combines it with the
+%%     character reference that immediately follows;
+%%   * any other integer is encoded with mochiutf8:codepoint_to_bytes/1;
+%%   * a list of code points is encoded with unicode:characters_to_binary/1;
+%%   * undefined (unknown reference) is rejected as invalid.
+%% An invalid reference yields a literal "&" data token together with the
+%% original decoder state S.
tokenize_charref(Bin, S=#decoder{offset=O}) ->
try
- tokenize_charref(Bin, S, O)
+        case tokenize_charref_raw(Bin, S, O) of
+            {C1, S1} when is_integer(C1), C1 >= 16#D800, C1 =< 16#DFFF ->
+                %% Half of a UTF-16 surrogate pair; the other half must be
+                %% supplied by the next character reference.
+                tokeninize_charref_surrogate_pair(Bin, S1, C1);
+            {Unichar, S1} when is_integer(Unichar) ->
+                {{data, mochiutf8:codepoint_to_bytes(Unichar), false}, S1};
+            {Unichars, S1} when is_list(Unichars) ->
+                {{data, unicode:characters_to_binary(Unichars), false}, S1};
+            {undefined, _} ->
+                %% Unknown reference. It has to be thrown so the handler
+                %% below applies; without this clause it would escape the
+                %% try as a case_clause error.
+                throw(invalid_charref)
+        end
catch
throw:invalid_charref ->
{{data, <<"&">>, false}, S}
end.
-tokenize_charref(Bin, S=#decoder{offset=O}, Start) ->
+%% Decode a UTF-16 surrogate pair written as two consecutive character
+%% references and return {{data, Utf8Binary, false}, NewDecoder}.
+%%
+%% C1 is the already-parsed first reference and S is the decoder state just
+%% after it. The next byte must be "&", introducing a second reference that
+%% is parsed with tokenize_charref_raw/3.
+%%
+%% Throws invalid_charref unless C1 is a high surrogate (16#D800..16#DBFF)
+%% and the second reference is an integer low surrogate (16#DC00..16#DFFF).
+%% Checking both halves guarantees that the UTF-16 to UTF-8 conversion below
+%% succeeds and returns a binary rather than an error tuple.
+tokeninize_charref_surrogate_pair(Bin, S=#decoder{offset=O}, C1)
+  when C1 >= 16#D800, C1 =< 16#DBFF ->
+    case Bin of
+        <<_:O/binary, $&, _/binary>> ->
+            case tokenize_charref_raw(Bin, ?INC_COL(S), O + 1) of
+                %% is_integer/1 matters: atoms and lists compare greater
+                %% than any number, so a bare lower-bound test would let
+                %% undefined or a code point list through.
+                {C2, S1} when is_integer(C2),
+                              C2 >= 16#DC00, C2 =< 16#DFFF ->
+                    Utf8 = unicode:characters_to_binary(
+                             <<C1:16, C2:16>>, utf16, utf8),
+                    {{data, Utf8, false}, S1};
+                _ ->
+                    throw(invalid_charref)
+            end;
+        _ ->
+            throw(invalid_charref)
+    end;
+tokeninize_charref_surrogate_pair(_Bin, _S, _C1) ->
+    %% C1 is a low surrogate with no high surrogate in front of it.
+    throw(invalid_charref).
+
+tokenize_charref_raw(Bin, S=#decoder{offset=O}, Start) ->
case Bin of
<<_:O/binary>> ->
throw(invalid_charref);
@@ -658,17 +687,9 @@
<<_:O/binary, $;, _/binary>> ->
Len = O - Start,
<<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
- Data = case mochiweb_charref:charref(Raw) of
- undefined ->
- throw(invalid_charref);
- Unichar when is_integer(Unichar) ->
- mochiutf8:codepoint_to_bytes(Unichar);
- Unichars when is_list(Unichars) ->
- unicode:characters_to_binary(Unichars)
- end,
- {{data, Data, false}, ?INC_COL(S)};
+ {mochiweb_charref:charref(Raw), ?INC_COL(S)};
_ ->
- tokenize_charref(Bin, ?INC_COL(S), Start)
+ tokenize_charref_raw(Bin, ?INC_COL(S), Start)
end.
tokenize_doctype(Bin, S) ->
diff --git a/test/mochiweb_html_tests.erl b/test/mochiweb_html_tests.erl
index 3d35400..f67759a 100644
--- a/test/mochiweb_html_tests.erl
+++ b/test/mochiweb_html_tests.erl
@@ -126,6 +126,12 @@
mochiweb_html:tokens(<<"not html < at all">>)),
ok.
+surrogate_test() ->
+    %% https://github.com/mochi/mochiweb/issues/164
+    %% U+1F60A written as its UTF-16 surrogate pair (D83D, DE0A) in two
+    %% character references must come out as the single four-byte UTF-8
+    %% sequence F0 9F 98 8A.
+    ?assertEqual(
+       [{data,<<240,159,152,138>>,false}],
+       mochiweb_html:tokens(<<"&#xD83D;&#xDE0A;">>)).
+
parse_test() ->
D0 = <<"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">
<html>