blob: 410da0a823fb641f373c7f1ca18bae5ad0f2661a [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.InterruptedIOException;
import java.io.Reader;
import java.net.URL;
import java.net.URLDecoder;
import java.util.Collection;
import java.util.HashSet;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.swing.text.ChangedCharSetException;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * HTML Parser. It retrieves sections of the javadoc HTML file.
 *
 * @author Martin Roskanin
 */
class HTMLJavadocParser {
// Logger for this parser, named after the class.
public static final Logger LOG = Logger.getLogger(HTMLJavadocParser.class.getName());
/** Gets the javadoc text from the given URL
* @param url location of Javadoc
* @param pkg true if URL should be retrieved for a package
public static String getJavadocText(URL url, boolean pkg) {
return getJavadocText(new JavadocHelper.TextStream(url), pkg);
/** Gets the javadoc text from the given URL
* @param page location of Javadoc
* @param pkg true if URL should be retrieved for a package
public static String getJavadocText(JavadocHelper.TextStream page, boolean pkg) {
if (page == null) return null;
HTMLEditorKit.Parser parser;
InputStream is = null;
String charset = null;
for (;;) {
is = page.openStream();
parser = new ParserDelegator();
String urlStr = URLDecoder.decode(page.getLocation().toString(), "UTF-8"); //NOI18N
int offsets[] = null;
Reader reader = charset == null?new InputStreamReader(is): new InputStreamReader(is, charset);
if (pkg){
// package description
offsets = parsePackage(reader, parser, charset != null);
}else if (urlStr.indexOf('#')>0){
// member javadoc info
final Collection<? extends URL> urls = page.getLocations();
final Collection<String> possibleNames = new HashSet<>(urls.size());
for (URL nameUrl : urls) {
urlStr = URLDecoder.decode(nameUrl.toString(), "UTF-8"); //NOI18N
final String memberName = urlStr.substring(urlStr.indexOf('#')+1);
if (!memberName.isEmpty()) {
if (!possibleNames.isEmpty()) {
offsets = parseMember(reader, possibleNames, parser, charset != null);
// class javadoc info
offsets = parseClass(reader, parser, charset != null);
if (offsets != null){
return getTextFromURLStream(page, offsets, charset);
} catch (ChangedCharSetException e) {
if (charset == null) {
charset = getCharSet(e);
//restart with valid charset
} else {
} catch (FileNotFoundException x) {
break; // e.g. missing com.sun.** class in network Javadoc; ignore
} catch (InterruptedIOException x) {
//Http javadoc timeout
}catch(IOException ioe){
parser = null;
if (is!=null) {
}catch(IOException ioe){
return null;
private static String getCharSet(ChangedCharSetException e) {
String spec = e.getCharSetSpec();
if (e.keyEqualsCharSet()) {
//charsetspec contains only charset
return spec;
//charsetspec is in form "text/html; charset=UTF-8"
int index = spec.indexOf(";"); // NOI18N
if (index != -1) {
spec = spec.substring(index + 1);
spec = spec.toLowerCase();
StringTokenizer st = new StringTokenizer(spec, " \t=", true); //NOI18N
boolean foundCharSet = false;
boolean foundEquals = false;
while (st.hasMoreTokens()) {
String token = st.nextToken();
if (token.equals(" ") || token.equals("\t")) { //NOI18N
if (foundCharSet == false && foundEquals == false
&& token.equals("charset")) { //NOI18N
foundCharSet = true;
} else if (foundEquals == false && token.equals("=")) {//NOI18N
foundEquals = true;
} else if (foundEquals == true && foundCharSet == true) {
return token;
foundCharSet = false;
foundEquals = false;
return null;
private static String getTextFromURLStream(JavadocHelper.TextStream page, int[] offsets, String charset) throws IOException {
if (page == null)
return null;
InputStream fis = null;
InputStreamReader fisreader = null;
try {
fis = page.openStream();
fisreader = charset == null ? new InputStreamReader(fis) : new InputStreamReader(fis, charset);
StringBuilder sb = new StringBuilder();
int offset = 0;
for (int i = 0; i < offsets.length - 1; i+=2) {
int startOffset = offsets[i];
int endOffset = offsets[i + 1];
if (startOffset < 0 || endOffset < 0)
if (startOffset > endOffset) {
"Was not able to correctly parse javadoc: {0}, startOffset={1}, endOffset={2}.",
new Object[] {page.getLocation(), startOffset, endOffset});
return null;
int len = endOffset - startOffset;
char buffer[] = new char[len];
int bytesToSkip = startOffset - offset;
long bytesSkipped = 0;
do {
bytesSkipped = fisreader.skip(bytesToSkip);
bytesToSkip -= bytesSkipped;
} while ((bytesToSkip > 0) && (bytesSkipped > 0));
int bytesAlreadyRead = 0;
do {
int count =, bytesAlreadyRead, len - bytesAlreadyRead);
if (count < 0){
bytesAlreadyRead += count;
} while (bytesAlreadyRead < len);
offset = endOffset;
return sb.toString();
} finally {
if (fisreader != null)
/**
 * Retrieves the position (start offset and end offset) of class javadoc info
 * in the raw html file.
 * <p>
 * The result is a four-element array whose slots all start at -1. Slots 0 and 1
 * are only assigned in connection with a "Deprecated." text; slots 2 and 3
 * receive the start and end of the class description. The caller consumes the
 * array as consecutive (start, end) pairs.
 *
 * @param reader source of the HTML text
 * @param parser parser that drives the callback defined here
 * @param ignoreCharset passed through unchanged to the parser
 * @return the offset array described above
 * @throws IOException declared for the parser call
 */
private static int[] parseClass(Reader reader, final HTMLEditorKit.Parser parser, boolean ignoreCharset) throws IOException {
// Parser states; the current one lives in state[0] so the anonymous callback can change it.
final int INIT = 0;
// javadoc HTML comment '======== START OF CLASS DATA ========'
final int CLASS_DATA_START = 1;
// start of the text we need. Located just after first P.
final int TEXT_START = 2;
// div tag after the CLASS_DATA_START
final int INSIDE_DIV = 3;
// div tag after the INSIDE_DIV
final int AFTER_DIV = 4;
final int state[] = new int[] {INIT};
// offset[0]/offset[1]: range tied to a "Deprecated." text; offset[2]/offset[3]: description range.
final int offset[] = new int[] {-1, -1, -1, -1};
// NOTE(review): this listing shows no closing braces and only decrements of
// div_counter / li_counter (never an increment), so lines appear to be missing.
// Compare with the upstream file before changing any of the logic below.
HTMLEditorKit.ParserCallback callback = new HTMLEditorKit.ParserCallback() {
int div_counter = 0;
int li_counter = 0;
// position of an HR seen while in TEXT_START
int nextHRPos = -1;
// position of the most recent HR in any state
int lastHRPos = -1;
public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
if (t == HTML.Tag.HR){
if (state[0] == TEXT_START){
nextHRPos = pos;
lastHRPos = pos;
public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
if (t == HTML.Tag.P && state[0] == CLASS_DATA_START){
// A start was recorded for the deprecation range but no end yet: end it 3 characters after this tag position.
if (offset[0] != -1 && offset[1] == -1)
offset[1] = pos + 3;
state[0] = TEXT_START;
} else if (t == HTML.Tag.DIV) {
// A div with class="block" right after the class-data marker starts the description.
if (state[0] == CLASS_DATA_START && a.containsAttribute(HTML.Attribute.CLASS, "block")) {
state[0] = INSIDE_DIV;
if (offset[2] == -1)
offset[2] = pos;
if (state[0] == INSIDE_DIV)
} else if (t == HTML.Tag.LI && state[0] == AFTER_DIV) {
} else if (t == HTML.Tag.A && state[0] == TEXT_START) {
// A named anchor while in TEXT_START ends the description; an HR located after the start is preferred as end.
String attrName = (String)a.getAttribute(HTML.Attribute.NAME);
if (attrName!=null && attrName.length()>0){
if (nextHRPos!=-1 && nextHRPos > offset[2]){
offset[3] = nextHRPos;
offset[3] = pos;
state[0] = INIT;
public void handleEndTag(Tag t, int pos) {
if (t == HTML.Tag.DIV && state[0] == INSIDE_DIV) {
// The outermost tracked div has been closed.
if (--div_counter == 0) {
if (offset[0] > -1 && offset[1] == -1) {
state[0] = CLASS_DATA_START;
offset[1] = pos;
} else {
state[0] = AFTER_DIV;
} else if (t == HTML.Tag.LI && state[0] == AFTER_DIV) {
// An LI end tag that drives the counter below zero ends the description.
if (--li_counter < 0) {
offset[3] = pos;
state[0] = INIT;
public void handleComment(char[] data, int pos){
String comment = String.valueOf(data);
if (comment!=null){
// indexOf(...) > 0: the marker has to occur after the first character of the comment.
if (comment.indexOf("START OF CLASS DATA")>0){ //NOI18N
state[0] = CLASS_DATA_START;
} else if (comment.indexOf("NESTED CLASS SUMMARY")>0 //NOI18N
&& offset[3] == -1){
// No end recorded yet: use the last HR if it lies after the start, otherwise this comment's position.
if (lastHRPos!=-1 && lastHRPos > offset[2]){
offset[3] = lastHRPos;
offset[3] = pos;
public void handleText(char[] data, int pos) {
if (state[0] == CLASS_DATA_START && "Deprecated.".equals(new String(data))) { //NOI18N
// Deprecation text: its range starts 4 characters after the last HR position.
offset[0] = lastHRPos + 4;
} else if (state[0] == INSIDE_DIV && "Deprecated.".equals(new String(data))) { //NOI18N
// The div taken as description start holds the deprecation text: move its start to slot 0 and clear slot 2.
offset[0] = offset[2];
offset[2] = -1;
} else if (state[0] == TEXT_START && offset[2] < 0) {
// First text seen in TEXT_START while no description start is recorded.
offset[2] = pos;
// All results are collected through the callback's writes to offset[].
parser.parse(reader, callback, ignoreCharset);
callback = null;
return offset;
/**
 * Retrieves the position (start offset and end offset) of member javadoc info
 * in the raw html file.
 *
 * @param reader source of the HTML text
 * @param names anchor names or ids that identify the member; an A tag whose
 *        NAME or ID attribute is contained here starts the search
 * @param parser parser that drives the callback defined here
 * @param ignoreCharset passed through unchanged to the parser
 * @return two-element array {start offset, end offset}; a slot that was never
 *         assigned keeps its initial value -1
 * @throws IOException declared for the parser call
 */
private static int[] parseMember(Reader reader, final Collection<? extends String> names, final HTMLEditorKit.Parser parser, boolean ignoreCharset) throws IOException {
// Parser states; the current one lives in state[0] so the anonymous callback can change it.
final int INIT = 0;
// 'A' tag with the name we are looking for.
final int A_OPEN = 1;
// close tag of 'A'
final int A_CLOSE = 2;
// PRE close tag after the A_CLOSE
final int PRE_CLOSE = 3;
// div tag after the PRE_CLOSE
final int INSIDE_DIV = 4;
final int state[] = new int[1];
final int offset[] = new int[2];
offset[0] = -1; //start offset
offset[1] = -1; //end offset
state[0] = INIT;
// NOTE(review): this listing shows no closing braces and only decrements of
// div_counter / dl_counter / li_counter (never an increment), and the DL and LI
// start-tag branches are empty, so lines appear to be missing. Compare with the
// upstream file before changing any of the logic below.
HTMLEditorKit.ParserCallback callback = new HTMLEditorKit.ParserCallback() {
int div_counter = 0;
int dl_counter = 0;
int li_counter = 0;
// candidate end position; despite the name it is also set from DL, LI and DIV end tags
int hrPos = -1;
// when set, the next text callback may supply the start offset
boolean startWithNextText;
public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
if (t == HTML.Tag.HR && state[0] == PRE_CLOSE){
hrPos = pos;
public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
if (t == HTML.Tag.A) {
String attrName = (String)a.getAttribute(HTML.Attribute.NAME);
String attrId = (String)a.getAttribute(HTML.Attribute.ID);
if (names.contains(attrName) || names.contains(attrId)){
// we have found desired javadoc member info anchor
state[0] = A_OPEN;
} else {
// Any other anchor carrying a name or id, seen after a candidate end was recorded, closes the range.
if ((state[0] == PRE_CLOSE) && (attrName != null || attrId != null) && hrPos != -1){
// reach the end of retrieved javadoc info
state[0] = INIT;
offset[1] = hrPos;
} else if (t == HTML.Tag.DL && state[0] == PRE_CLOSE) {
} else if (t == HTML.Tag.LI && state[0] == PRE_CLOSE) {
} else if (t == HTML.Tag.DD && state[0] == PRE_CLOSE && offset[0] < 0){
// First DD after the PRE while no start is recorded yet.
offset[0] = pos;
} else if (t == HTML.Tag.DIV && (state[0] == PRE_CLOSE || state[0] == A_CLOSE || state[0] == INSIDE_DIV)){
state[0] = INSIDE_DIV;
if (offset[0] < 0) {
// Start either at this div (counter value 2) or, for a class="block" div at counter value 1, at the next text.
if (div_counter == 2) {
offset[0] = pos;
} else if (div_counter == 1 && a.containsAttribute(HTML.Attribute.CLASS, "block")) {
startWithNextText = true;
public void handleEndTag(HTML.Tag t, int pos){
// Advance A_OPEN -> A_CLOSE -> PRE_CLOSE as the anchor and the following PRE are closed.
if (t == HTML.Tag.A && state[0] == A_OPEN){
state[0] = A_CLOSE;
} else if (t == HTML.Tag.PRE && state[0] == A_CLOSE){
state[0] = PRE_CLOSE;
} else if (t == HTML.Tag.DL && state[0] == PRE_CLOSE) {
if (--dl_counter == 0)
hrPos = pos;
} else if (t == HTML.Tag.LI && state[0] == PRE_CLOSE) {
if (--li_counter < 0)
hrPos = pos;
} else if (t == HTML.Tag.DIV && state[0] == INSIDE_DIV) {
// Leaving the outermost tracked div returns to PRE_CLOSE and records a candidate end.
if (--div_counter == 0) {
state[0] = PRE_CLOSE;
hrPos = pos;
public void handleText(char[] data, int pos) {
if (startWithNextText) {
startWithNextText = false;
if (offset[0] < 0) {
offset[0] = pos;
public void handleComment(char[] data, int pos){
String comment = String.valueOf(data);
if (comment!=null){
// indexOf(...) > 0: the marker has to occur after the first character of the comment.
if (comment.indexOf("END OF CLASS DATA")>0){ //NOI18N
if ((state[0] == PRE_CLOSE) && hrPos != -1){
// reach the end of retrieved javadoc info
state[0] = INIT;
offset[1] = hrPos;
// All results are collected through the callback's writes to offset[].
parser.parse(reader, callback, ignoreCharset);
callback = null;
return offset;
/** Retrieves the position (start offset and end offset) of member javadoc info
* in the raw html file */
private static int[] parsePackage(Reader reader, final HTMLEditorKit.Parser parser, boolean ignoreCharset) throws IOException {
final String name = "package_description"; //NOI18N
final int INIT = 0;
// 'A' tag with the name we are looking for.
final int A_OPEN = 1;
final int state[] = new int[1];
final int offset[] = new int[2];
offset[0] = -1; //start offset
offset[1] = -1; //end offset
state[0] = INIT;
HTMLEditorKit.ParserCallback callback = new HTMLEditorKit.ParserCallback() {
int hrPos = -1;
public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
if (t == HTML.Tag.HR && state[0]!=INIT){
if (state[0] == A_OPEN){
hrPos = pos;
offset[1] = pos;
public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
if (t == HTML.Tag.A) {
String attrName = (String)a.getAttribute(HTML.Attribute.NAME);
if (name.equals(attrName)){
// we have found desired javadoc member info anchor
state[0] = A_OPEN;
offset[0] = pos;
} else {
if (state[0] == A_OPEN && attrName!=null){
// reach the end of retrieved javadoc info
state[0] = INIT;
offset[1] = (hrPos!=-1) ? hrPos : pos;
parser.parse(reader, callback, ignoreCharset);
callback = null;
return offset;