SLING-8566 Support processing instructions and the XML declaration
diff --git a/pom.xml b/pom.xml
index b886200..0388733 100644
--- a/pom.xml
+++ b/pom.xml
@@ -28,7 +28,7 @@
</parent>
<artifactId>org.apache.sling.commons.html</artifactId>
- <version>1.1.1-SNAPSHOT</version>
+ <version>1.2.0-SNAPSHOT</version>
<name>Apache Sling Commons HTML Utilities</name>
<description>
diff --git a/src/main/java/org/apache/sling/commons/html/impl/parser/TagParser.java b/src/main/java/org/apache/sling/commons/html/impl/parser/TagParser.java
index 7e36805..541070d 100644
--- a/src/main/java/org/apache/sling/commons/html/impl/parser/TagParser.java
+++ b/src/main/java/org/apache/sling/commons/html/impl/parser/TagParser.java
@@ -14,10 +14,13 @@
for (t=first; t != cur.next; t = t.next) {
if (t.specialToken != null) {
Token tt=t.specialToken;
- while (tt.specialToken != null)
+ while (tt.specialToken != null) {
tt = tt.specialToken;
- for (; tt != null; tt = tt.next)
+ }
+ while (tt != null) {
sb.append(tt.image);
+ tt = tt.next;
+ }
};
sb.append(t.image);
};
diff --git a/src/main/java/org/apache/sling/commons/html/internal/TagstreamHtmlParser.java b/src/main/java/org/apache/sling/commons/html/internal/TagstreamHtmlParser.java
index aeea507..f2f2101 100644
--- a/src/main/java/org/apache/sling/commons/html/internal/TagstreamHtmlParser.java
+++ b/src/main/java/org/apache/sling/commons/html/internal/TagstreamHtmlParser.java
@@ -52,7 +52,10 @@
@Override
public Document parse(String systemId, InputStream stream, String encoding) throws IOException {
final DOMBuilder builder = new DOMBuilder();
- Html.stream(stream, encoding).forEach(new HtmlSAXSupport(builder, builder));
+ HtmlSAXSupport support = new HtmlSAXSupport(builder, builder); // builder is passed as both the content handler and the lexical handler
+ support.startDocument(); // accept() no longer opens the document on its first element, so it is opened explicitly here
+ Html.stream(stream, encoding).forEach(support);
+ support.endDocument(); // NOTE(review): accept() also calls contentHandler.endDocument() in one of its switch branches - confirm the builder cannot receive endDocument twice
return builder.getDocument();
}
diff --git a/src/main/java/org/apache/sling/commons/html/util/HtmlSAXSupport.java b/src/main/java/org/apache/sling/commons/html/util/HtmlSAXSupport.java
index 810a929..32182d2 100644
--- a/src/main/java/org/apache/sling/commons/html/util/HtmlSAXSupport.java
+++ b/src/main/java/org/apache/sling/commons/html/util/HtmlSAXSupport.java
@@ -13,6 +13,7 @@
*/
package org.apache.sling.commons.html.util;
+import java.io.IOException;
import java.util.Map;
import java.util.function.Consumer;
@@ -25,19 +26,23 @@
import org.xml.sax.ext.DefaultHandler2;
import org.xml.sax.ext.LexicalHandler;
+/**
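+ * Utility class for the {@code TagstreamHtmlParser} that consumes
+ * {@link HtmlElement}s and reports them as SAX events to the configured
+ * {@link ContentHandler} and {@link LexicalHandler}.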
+ */
public class HtmlSAXSupport implements Consumer<HtmlElement> {
-
+
private static final DefaultHandler2 handler = new DefaultHandler2();
-
+
private ContentHandler contentHandler = handler;
private LexicalHandler lexicalHandler = handler;
- private boolean initialized;
public HtmlSAXSupport(ContentHandler ch, final LexicalHandler lh) {
if (ch != null) {
contentHandler = ch;
}
- if (lh != null ) {
+ if (lh != null) {
lexicalHandler = lh;
}
}
@@ -45,10 +50,6 @@
@Override
public void accept(HtmlElement element) {
try {
- if (!initialized) {
- contentHandler.startDocument();
- initialized = true;
- }
String value = element.getValue();
switch (element.getType()) {
case COMMENT:
@@ -64,6 +65,12 @@
contentHandler.endDocument();
break;
case START_TAG:
+ if (value.startsWith("?")) {
+ if (!value.equalsIgnoreCase("?xml")) {
+ contentHandler.processingInstruction(value, attrsToString(element.getAttributes()));
+ }
+ break;
+ }
lexicalHandler.startEntity(value);
contentHandler.startElement("", value, value, HtmlSAXSupport.convert(element.getAttributes()));
break;
@@ -74,17 +81,38 @@
break;
}
} catch (SAXException se) {
- //log message
+ // NOTE(review): a SAXException thrown by a handler is silently dropped here and never reaches the caller
}
}
-
- public static Attributes convert(Map<String,AttrValue> attributes) {
+
+ public static Attributes convert(Map<String, AttrValue> attributes) { // copies the given attribute map into a new SAX Attributes2Impl
Attributes2Impl response = new Attributes2Impl();
- attributes.entrySet().forEach(attr ->
- response.addAttribute("",attr.getKey(), attr.getKey(), "xsi:String", attr.getValue().toString())
- );
+ attributes.entrySet().forEach(attr -> response.addAttribute("", attr.getKey(), attr.getKey(), "xsi:String", // empty namespace URI; the map key is used as both local and qualified name
+ attr.getValue().toString())); // NOTE(review): "xsi:String" is not one of the attribute types documented for org.xml.sax.Attributes#getType ("CDATA", "ID", ...) - confirm no consumer relies on the type
return response;
}
+ public void startDocument() throws IOException { // forwards the start-of-document event to the content handler
+ try {
+ contentHandler.startDocument();
+ } catch (SAXException e) {
+ throw new IOException(e); // rethrown as IOException with the SAXException kept as its cause
+ }
+ }
+
+ public void endDocument() throws IOException { // forwards the end-of-document event to the content handler
+ try {
+ contentHandler.endDocument();
+ } catch (SAXException e) {
+ throw new IOException(e); // rethrown as IOException with the SAXException kept as its cause
+ }
+ }
+
+ private String attrsToString(Map<String, AttrValue> attributes) { // renders a tag's attribute entries as processing-instruction data
+ // Separate the entries with one space; appending them back to back fused neighbouring pseudo-attributes into a single token.
+ return attributes.entrySet().stream().map(Object::toString)
+ .collect(java.util.stream.Collectors.joining(" "));
+ }
+
}
diff --git a/src/main/java/org/apache/sling/commons/html/util/package-info.java b/src/main/java/org/apache/sling/commons/html/util/package-info.java
index 23efc4f..7aeb4ff 100644
--- a/src/main/java/org/apache/sling/commons/html/util/package-info.java
+++ b/src/main/java/org/apache/sling/commons/html/util/package-info.java
@@ -16,7 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
-@Version("1.1.0")
+@Version("1.2.0")
package org.apache.sling.commons.html.util;
import org.osgi.annotation.versioning.Version;
diff --git a/src/main/javacc/htmlParser.jj b/src/main/javacc/htmlParser.jj
index 8a74ab7..f868e2b 100644
--- a/src/main/javacc/htmlParser.jj
+++ b/src/main/javacc/htmlParser.jj
@@ -34,10 +34,13 @@
for (t=first; t != cur.next; t = t.next) {
if (t.specialToken != null) {
Token tt=t.specialToken;
- while (tt.specialToken != null)
+ while (tt.specialToken != null) {
tt = tt.specialToken;
- for (; tt != null; tt = tt.next)
+ }
+ while (tt != null) {
sb.append(tt.image);
+ tt = tt.next;
+ }
};
sb.append(t.image);
};
diff --git a/src/test/java/org/apache/sling/commons/html/TagstreamHtmlParseTest.java b/src/test/java/org/apache/sling/commons/html/TagstreamHtmlParseTest.java
index 04b8183..a3579df 100644
--- a/src/test/java/org/apache/sling/commons/html/TagstreamHtmlParseTest.java
+++ b/src/test/java/org/apache/sling/commons/html/TagstreamHtmlParseTest.java
@@ -18,8 +18,10 @@
package org.apache.sling.commons.html;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertTrue;
+import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.util.function.Function;
@@ -29,6 +31,7 @@
import org.apache.sling.commons.html.util.HtmlSAXSupport;
import org.junit.Before;
import org.junit.Test;
+import org.w3c.dom.Document;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.ext.DefaultHandler2;
@@ -43,11 +46,11 @@
private HtmlParser htmlParser;
/*
- * Japanese (google) translation of 'Don't forget me this weekend!'
- * standard text of xml sample note.xml
+ * Japanese (Google) translation of 'Don't forget me this weekend!', the standard
+ * text of the XML sample note.xml.
*/
- private static final String MESSAGE ="この週末私を忘れないで!";
-
+ private static final String MESSAGE = "この週末私を忘れないで!";
+
@Before
public void setUp() throws ParseException, Exception {
InputStream is = this.getClass().getResourceAsStream("/demo.html");
@@ -79,7 +82,7 @@
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes)
throws SAXException {
- //System.out.println(localName);
+ // System.out.println(localName);
}
}, new DefaultHandler2());
@@ -90,7 +93,7 @@
public void docParseTagTest3() throws Exception {
long count = stream.flatMap(TagMapper.map((element, process) -> {
if (element.containsAttribute("href")) {
- //System.out.println(element.getAttributeValue("href"));
+ // System.out.println(element.getAttributeValue("href"));
process.next(element);
}
})).count();
@@ -121,10 +124,9 @@
@Test
public void convertLinkAndPrintTest() throws Exception {
- //stream.flatMap(CONVERT_LINKS).map(HtmlStreams.TO_HTML).forEach(System.out::print);
+ // stream.flatMap(CONVERT_LINKS).map(HtmlStreams.TO_HTML).forEach(System.out::print);
}
-
@Before
public void setup() {
@@ -142,7 +144,13 @@
}
});
}
-
+
+ @Test
+ public void testDomSupport() throws SAXException, IOException { // checks that the three-argument parse call yields a Document
+ Document dom = htmlParser.parse("123456", inputStream, "UTF-8");
+ assertNotEquals("parse must return a DOM document", null, dom); // JUnit argument order is (message, unexpected, actual)
+ }
+
@Test
public void testEncodingSupportFailure() throws SAXException {
htmlParser.parse(inputStream, "ISO8859-1", new DefaultHandler() {