Fix for CONNECTORS-1656.  Thanks Julien for the patch.

git-svn-id: https://svn.apache.org/repos/asf/manifoldcf/trunk@1886472 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/CHANGES.txt b/CHANGES.txt
index 4a0329f..9d40c31 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -3,6 +3,9 @@
 
 ======================= 2.19-dev =====================
 
+CONNECTORS-1656: Ensure well-formed XML is produced for Tika by the HTML extractor.
+(Julien Massiera)
+
 CONNECTORS-1661: Encoding for multipart requests is sometimes not set by the new
 UI, so assume UTF-8 if that happens.
 (Julien Massiera)
diff --git a/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java b/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
index 8bb4064..2676e50 100644
--- a/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
+++ b/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
@@ -33,6 +33,7 @@
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
 import org.jsoup.nodes.Document.OutputSettings;
+import org.jsoup.nodes.Entities.EscapeMode;
 import org.jsoup.safety.Whitelist;
 
 public class JsoupProcessing {
@@ -42,6 +43,7 @@
 
   public static Hashtable<String,String> extractTextAndMetadataHtmlDocument(InputStream streamDoc,String whitelist,List<String> blacklist, boolean stripHtml) throws IOException{
     Document doc = Jsoup.parse(streamDoc, "UTF-8", "");
+    doc.outputSettings().escapeMode(EscapeMode.xhtml);
     Hashtable<String,String> metadata = new Hashtable<String,String>();
     for(Element meta : doc.select("meta")) {
       Logging.connectors.debug("Name: " + meta.attr("name") + " - Content: " + meta.attr("content"));