Support parsing UTF-16 surrogate pairs in mochiweb_html #164
diff --git a/CHANGES.md b/CHANGES.md
index 05bf694..af80a19 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,8 @@
-Version 2.13.0 released XXXX-XX-XX
+Version 2.13.0 released 2016-02-08
 
+* Support parsing of UTF-16 surrogate pairs encoded as character
+  references in mochiweb_html
+  https://github.com/mochi/mochiweb/issues/164
 * Avoid swallowing messages that are not related to the socket
   during request parsing
   https://github.com/mochi/mochiweb/pull/161
diff --git a/src/mochiweb_html.erl b/src/mochiweb_html.erl
index 3fd93d0..3c5c4f9 100644
--- a/src/mochiweb_html.erl
+++ b/src/mochiweb_html.erl
@@ -639,13 +639,42 @@
 
 tokenize_charref(Bin, S=#decoder{offset=O}) ->
     try
-        tokenize_charref(Bin, S, O)
+        %% Raw result: a codepoint, a list of codepoints, or undefined.
+        case tokenize_charref_raw(Bin, S, O) of
+            {undefined, _} -> throw(invalid_charref); % unknown reference
+            {C1, S1} when is_integer(C1), C1 >= 16#D800, C1 =< 16#DFFF ->
+                %% UTF-16 surrogate half: combine it with the next charref
+                tokeninize_charref_surrogate_pair(Bin, S1, C1);
+            {Unichar, S1} when is_integer(Unichar) ->
+                {{data, mochiutf8:codepoint_to_bytes(Unichar), false}, S1};
+            {Unichars, S1} when is_list(Unichars) ->
+                {{data, unicode:characters_to_binary(Unichars), false}, S1}
+        end
     catch
         throw:invalid_charref ->
             {{data, <<"&">>, false}, S}
     end.
 
-tokenize_charref(Bin, S=#decoder{offset=O}, Start) ->
+%% Join surrogate C1 with the charref that must follow it immediately and
+%% emit both as one UTF-8 data token. Throws invalid_charref (handled by
+%% the caller) unless C1/C2 form a valid high/low UTF-16 pair.
+tokeninize_charref_surrogate_pair(Bin, S=#decoder{offset=O}, C1) ->
+    case Bin of
+        <<_:O/binary, $&, _/binary>> ->
+            case tokenize_charref_raw(Bin, ?INC_COL(S), O + 1) of
+                {C2, S1} when is_integer(C2), C1 =< 16#DBFF,
+                              C2 >= 16#DC00, C2 =< 16#DFFF ->
+                    Utf8 = unicode:characters_to_binary(
+                             <<C1:16, C2:16>>, utf16, utf8),
+                    {{data, Utf8, false}, S1};
+                _ ->
+                    throw(invalid_charref)
+            end;
+        _ ->
+            throw(invalid_charref)
+    end.
+
+tokenize_charref_raw(Bin, S=#decoder{offset=O}, Start) ->
     case Bin of
         <<_:O/binary>> ->
             throw(invalid_charref);
@@ -658,17 +687,9 @@
         <<_:O/binary, $;, _/binary>> ->
             Len = O - Start,
             <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
-            Data = case mochiweb_charref:charref(Raw) of
-                       undefined ->
-                           throw(invalid_charref);
-                       Unichar when is_integer(Unichar) ->
-                           mochiutf8:codepoint_to_bytes(Unichar);
-                       Unichars when is_list(Unichars) ->
-                           unicode:characters_to_binary(Unichars)
-                   end,
-            {{data, Data, false}, ?INC_COL(S)};
+            {mochiweb_charref:charref(Raw), ?INC_COL(S)};
         _ ->
-            tokenize_charref(Bin, ?INC_COL(S), Start)
+            tokenize_charref_raw(Bin, ?INC_COL(S), Start)
     end.
 
 tokenize_doctype(Bin, S) ->
diff --git a/test/mochiweb_html_tests.erl b/test/mochiweb_html_tests.erl
index 3d35400..f67759a 100644
--- a/test/mochiweb_html_tests.erl
+++ b/test/mochiweb_html_tests.erl
@@ -126,6 +126,12 @@
        mochiweb_html:tokens(<<"not html < at all">>)),
     ok.
 
+surrogate_test() ->
+    %% https://github.com/mochi/mochiweb/issues/164 (D83D/DE0A = U+1F60A)
+    Html = <<"&#55357;&#56842;">>,
+    Expected = [{data, <<240,159,152,138>>, false}],
+    ?assertEqual(Expected, mochiweb_html:tokens(Html)).
+
 parse_test() ->
     D0 = <<"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">
 <html>