// blob: ff9bc4ebe7ba121b13811fe295296405ee238d0d [file] [log] [blame]
package com.atlassian.uwc.converters;
import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.atlassian.uwc.converters.tikiwiki.RegexUtil;
import com.atlassian.uwc.converters.twiki.JavaRegexAndTokenizerConverter;
import com.atlassian.uwc.ui.Page;
/**
* Corrects links that point to illegal pagenames, so that they reference the
* legal name IllegalPageNameConverter would have changed the page to.
*
* @author Laura Kolker
*/
public class IllegalLinkNameConverter extends IllegalNameConverter {
/**
* property key for a user supplied protocol regex; when the key is present,
* isExternalLink compiles its value and uses it instead of the default protocol pattern
*/
private static final String PROPKEY_CUSTOMPROTOCOL = "illegalnames-customprotocol";
/**
* delimiter for an alias in links
*/
private static final String ALIAS_DELIM = "|";
/**
* property key read by allowsAt; only the value "true" turns the setting on
*/
public static final String ALLOW_AT_IN_LINKS_KEY = "allow-at-in-links";
/**
* property key read by allowsTilde; only the value "true" turns the setting on
*/
public static final String ALLOW_TILDE_IN_LINKS_KEY = "allow-tilde-in-links";
/**
 * Legalizes the links of the given page. Code and noformat blocks are
 * tokenized before the links are examined and detokenized afterwards, so
 * that their contents are not affected.
 * @param page page whose converted text is read and then replaced
 */
public void convert(Page page) {
    log.info("Converting Links Referencing Illegal Names - start");

    // tokenize, then keep the tokenized text as the new original text
    tokenizeCodeBlocks(page);
    String tokenized = page.getConvertedText();
    page.setOriginalText(tokenized);

    String legalized = legalizeLinks(page.getOriginalText());
    page.setConvertedText(legalized);

    // the result is also stored as the original text so that it can be detokenized
    page.setOriginalText(legalized);
    detokenizeCodeBlocks(page);

    log.info("Converting Links Referencing Illegal Names - complete");
}
//lookbehind for the opening bracket of a link; the bracket itself is not part of the match
String linksPrefix =
"(?<=" + //zero-width group starts
"\\[" + //left bracket
")"; //end zero-width group
//lookahead for the closing bracket of a link; the bracket itself is not part of the match
String linksSuffix =
"(?=" + //zero-width group starts
"\\]" + //right bracket
")"; //end zero-width group
//the contents of a link (group 1), i.e. the text between a left and a right bracket
String links =
linksPrefix + //left bracket
"("+ //start capture (group 1)
".*?" + //everything (as little as possible) until
"[^\\\\]" + //not a backslash (a backslash would mean our closing bracket was escaped)
")" + //end capture (group 1)
linksSuffix; //right bracket
//compiled form of links; used by legalizeLinksWithoutState
Pattern linkPattern = Pattern.compile(links);
//a left or right bracket (group 1) that is at the start of the input or is not preceded by a backslash
String brackets = "(?<=^|[^\\\\])(\\[|\\])";
//compiled form of brackets; NOTE(review): not referenced anywhere in the code visible here
Pattern bracketPattern = Pattern.compile(brackets);
//default protocols that mark a link as external: http, https, mailto, file and ftp
String protocol = "((https?://)|(mailto:)|(file:)|(ftp:))(.*)";
//compiled form of protocol; isExternalLink uses it when no usable custom protocol is configured
Pattern protocolPattern = Pattern.compile(protocol);
/**
 * Decides whether a link target is external. The target is external when it
 * starts with a known protocol. By default those are http, https, mailto,
 * file and ftp. If the property "illegalnames-customprotocol" is set, its
 * value is compiled as a regex and used instead; if that regex can not be
 * compiled a warning is logged and the default protocols are used.
 * @param input the link target to examine
 * @return true if the link is an external link (http://somepage.com),
 * returns false if the link is an internal wiki page
 */
public boolean isExternalLink(String input) {
    Pattern effectivePattern = protocolPattern;
    // properties may not have been set (allowsAt and allowsTilde guard against
    // the same situation), in which case only the default protocols apply
    if (this.getProperties() != null
            && this.getProperties().containsKey(PROPKEY_CUSTOMPROTOCOL)) {
        try {
            String custom = this.getProperties().getProperty(PROPKEY_CUSTOMPROTOCOL, protocol);
            effectivePattern = Pattern.compile(custom);
        } catch (RuntimeException e) {
            // keep the reason in the log, so that a broken custom regex can be corrected
            log.warn("Problem compiling custom protocol with link: " + input
                    + " - Using default protocol. Cause: " + e.getMessage());
            effectivePattern = protocolPattern;
        }
    }
    // lookingAt: the protocol has to be at the very beginning of the link
    return effectivePattern.matcher(input).lookingAt();
}
/**
 * Replaces every link to an illegal pagename with a link to the
 * legal counterpart of that pagename.
 * @param input text that may contain links
 * @return input with legalized links
 */
protected String legalizeLinks(String input) {
    // The stateful alternative (legalizeWithState, fed with getIllegalPagenames())
    // handles the right bracket issue, but is very slow, so it is not used here.
    return legalizeLinksWithoutState(input);
}
/**
 * Legalizes links with the help of a list of illegal pagenames that
 * have already been found. For each name, every link ending in that
 * name is rewritten to end in the converted name instead; whatever
 * precedes the name inside the link (group 1) is kept.
 * @param input confluence syntax with links
 * @param originalNames unique list of illegal pagenames
 * @return the input with the links to the given pagenames replaced
 * @throws IllegalArgumentException if originalNames is null
 */
protected String legalizeWithState(String input, HashSet<String> originalNames) {
    if (originalNames == null) {
        throw new IllegalArgumentException(
                "illegal pagenames object must not be null. Use setIllegalPagenames before calling this method.");
    }
    String result = input;
    for (String illegalName : originalNames) {
        String legalName = convertIllegalName(illegalName);
        String linkRegex = createPagenamePattern(illegalName);
        // {group1} puts back the part of the link in front of the pagename
        result = RegexUtil.loopRegex(result, linkRegex, "{group1}" + legalName);
    }
    return result;
}
/**
 * Builds the regex (as a string) that finds a link to the given pagename:
 * a left bracket, any text (captured as group 1), the literal pagename,
 * and a right bracket. The brackets are matched zero-width.
 * @param pagename name of the page, taken literally
 * @return regex for finding links to the given pagename
 */
private String createPagenamePattern(String pagename) {
    String literalName = Pattern.quote(pagename); //regex chars in the name lose their meaning
    String leadingText = "(" + ".*?" + ")"; //group 1: anything in front of the pagename
    return linksPrefix + leadingText + literalName + linksSuffix;
}
/**
 * Finds every link in the input and, for links to internal pages,
 * replaces the pagename with its legal equivalent. Alias, in-page
 * anchor, anchor on another page, space key and blog date are set
 * aside first and put back around the converted pagename. Escaped
 * links and attachment links are left as they are.
 * @param input text that may contain links
 * @return input with legal links
 */
protected String legalizeLinksWithoutState(String input) {
    StringBuffer rebuilt = new StringBuffer();
    boolean sawLink = false;
    Matcher linkFinder = linkPattern.matcher(input);
    while (linkFinder.find()) {
        //a backslash in front of the opening bracket: not a link
        if (escaped(input, linkFinder.start(), '\\')) {
            continue;
        }
        sawLink = true;
        String contents = linkFinder.group(1);

        //link parts: alias, anchor symbol (if any), link
        String aliasPart = identifyAlias(contents);
        if (aliasPart == null) {
            aliasPart = "";
        }
        if (!"".equals(aliasPart)) {
            aliasPart += ALIAS_DELIM;
        }
        String contentsAfterAlias = contents.replaceFirst("^[^|]+\\|", "");
        String inPageAnchor = identifyInPageAnchor(contentsAfterAlias);
        String target = identifyLink(contents);
        if (isAttachment(target)) {
            continue;
        }

        String targetAnchor = identifyOtherPageAnchor(target);
        if (!"".equals(targetAnchor)) {
            targetAnchor = "#" + targetAnchor;
        }
        //if there is an anchor on another page, it is cut off the end of the link string
        String pagename = "".equals(targetAnchor)
                ? target
                : target.substring(0, target.length() - targetAnchor.length());

        //an external link can't have an illegal confluence name
        if (!isExternalLink(pagename)) {
            String spacePart = "";
            if (hasSpace(pagename)) {
                spacePart = identifySpace(pagename);
                pagename = removeItem(pagename, spacePart);
            }
            String blogPart = "";
            if (isBlogpost(pagename)) {
                blogPart = identifyBlogdate(pagename);
                pagename = removeItem(pagename, blogPart);
            }
            //important for syntax like shortcut links
            this.setAllowAt(allowsAt());
            this.setAllowTilde(allowsTilde());
            //the illegal chars are removed here, then the parts that were put aside are restored
            pagename = spacePart + blogPart + convertIllegalName(pagename);
        }

        String replacement = RegexUtil.handleEscapesInReplacement(
                aliasPart + inPageAnchor + pagename + targetAnchor);
        linkFinder.appendReplacement(rebuilt, replacement);
    }
    if (!sawLink) {
        return input;
    }
    linkFinder.appendTail(rebuilt);
    return rebuilt.toString();
}
/**
 * @return true only if the properties exist and the
 * "allow-at-in-links" property has the value "true"
 */
private boolean allowsAt() {
    if (this.properties == null || !this.properties.containsKey(ALLOW_AT_IN_LINKS_KEY)) {
        return false;
    }
    String configured = (String) this.properties.get(ALLOW_AT_IN_LINKS_KEY);
    return "true".equals(configured);
}
/**
 * @return true only if the properties exist and the
 * "allow-tilde-in-links" property has the value "true"
 */
private boolean allowsTilde() {
    if (this.properties == null || !this.properties.containsKey(ALLOW_TILDE_IN_LINKS_KEY)) {
        return false;
    }
    String configured = (String) this.properties.get(ALLOW_TILDE_IN_LINKS_KEY);
    return "true".equals(configured);
}
/**
 * @param pagename link target, possibly prefixed with a space key
 * @return true if the pagename contains a colon anywhere
 */
protected boolean hasSpace(String pagename) {
    return pagename.indexOf(':') >= 0;
}
//from the start of the string: one or more non-colon chars and the colon after them (group 1)
Pattern space = Pattern.compile("^([^:]+:)");
/**
 * @param pagename link target, possibly prefixed with a space key
 * @return everything up to and including the first colon, or an empty
 * string if the pagename does not start with such a prefix
 */
protected String identifySpace(String pagename) {
    Matcher prefixFinder = space.matcher(pagename);
    return prefixFinder.find() ? prefixFinder.group(1) : "";
}
/**
 * Removes the given item from the pagename. The item is taken literally,
 * not as a regex.
 * @param pagename string to remove the item from
 * @param item literal text to remove; if it is null or empty there is
 * nothing to remove and the pagename is returned as it is
 * @return the pagename without the item
 */
protected String removeItem(String pagename, String item) {
    if (item == null || item.length() == 0) {
        return pagename;
    }
    // Pattern.quote stays correct even if the item itself contains "\E",
    // which wrapping the item in \Q and \E by hand does not
    return RegexUtil.loopRegex(pagename, Pattern.quote(item), "");
}
//a blog date at the start of the string: /yyyy/mm/dd/ (4, 2 and 2 digits)
Pattern blogdate = Pattern.compile("^\\/\\d{4,4}\\/\\d{2,2}\\/\\d{2,2}\\/");
/**
 * @param pagename link target without a space key
 * @return true if the pagename begins with a blog date like /2008/01/31/
 */
protected boolean isBlogpost(String pagename) {
    Matcher dateFinder = blogdate.matcher(pagename);
    return dateFinder.lookingAt();
}
/**
 * @param pagename link target without a space key
 * @return the blog date the pagename begins with, including its
 * slashes, or an empty string if there is none
 */
protected String identifyBlogdate(String pagename) {
    Matcher dateFinder = blogdate.matcher(pagename);
    return dateFinder.find() ? dateFinder.group() : "";
}
/**
 * Tests whether the character two positions in front of the given index
 * is the escape character. In this class the index is the start of a
 * link's contents, so index-1 is the opening bracket and index-2 is
 * where a character escaping that bracket would be.
 * @param input string that might have an escaped character
 * @param index index just after the character that might be escaped
 * @param ch character used for escaping (probably backslash)
 * @return true if input has ch at index-2; always false for
 * index 0 and index 1, which leave no room for an escape character
 */
protected boolean escaped(String input, int index, char ch) {
    boolean noRoomForEscape = (index == 0 || index == 1);
    return !noRoomForEscape && input.charAt(index - 2) == ch;
}
String alias =
    "([^|]+)" +  // (group 1) one or more chars that are not a pipe
    "(\\|.*)?";  // (group 2) optionally: a pipe and whatever follows it
Pattern aliasPattern = Pattern.compile(alias);
/**
 * Extracts the alias of a confluence link.
 * @param input the link without its brackets. For example:<br/>
 * for the Confluence link "[alias|Page Name]" the input
 * is "alias|Page Name".
 * @return the text in front of the pipe ("alias" in the example above),
 * an empty string if the link has no pipe and therefore no alias,
 * or the input itself if the alias pattern does not match at all
 */
protected String identifyAlias(String input) {
    Matcher aliasFinder = aliasPattern.matcher(input);
    if (!aliasFinder.find()) {
        return input;
    }
    boolean hasPipe = aliasFinder.group(2) != null;
    return hasPipe ? aliasFinder.group(1) : ""; //no pipe, no alias
}
String anchor =
    "(?:^|\\|)" +  //start of the string, or a pipe
    "(#)";         //directly followed by a hash
Pattern anchorPattern = Pattern.compile(anchor);
/**
 * Tells whether a link points to an anchor within the current page,
 * which is the case when a hash is the first character of the link
 * or comes directly after a pipe. For example:<br/>
 * for the confluence link "[alias|#anchor]" the input
 * is "alias|#anchor" and the return value is "#".
 * For "alias|OtherPage#anchor" the anchor belongs to another page,
 * so an empty string is returned.
 * @param input The contents of the link.
 * @return A hash (#) symbol if the input has an in-page anchor,
 * otherwise an empty string ("").
 */
protected String identifyInPageAnchor(String input) {
    boolean hasInPageAnchor = anchorPattern.matcher(input).find();
    return hasInPageAnchor ? "#" : "";
}
String otherAnchor =
    "^" +      //from the start of the string
    "[^#]+" +  //at least one char that is not a hash
    "#" +      //then a hash
    "(" +      //start capture (group 1)
    ".*" +     //the rest of the string
    ")";       //end capture (group 1)
Pattern otherAnchorPattern = Pattern.compile(otherAnchor);
/**
 * Extracts the anchor of a link that points to an anchor on a page.
 * The hash has to be preceded by at least one other character. For example:
 * for the input "OtherPage#anchor" the return value is "anchor";
 * the hash itself is not part of the return value.
 * @param input The contents of a confluence link (minus the enclosing brackets)
 * @return the text after the first hash, or an empty string
 * if the input has no such anchor
 */
protected String identifyOtherPageAnchor(String input) {
    Matcher anchorFinder = otherAnchorPattern.matcher(input);
    if (!anchorFinder.find()) {
        return "";
    }
    return anchorFinder.group(1);
}
String linkContent =
    "[^#|]" +  //a char that is neither a hash nor a pipe
    "[^|]*" +  //followed by chars that are not a pipe
    "$";       //up to the end of the string
Pattern linkContentPattern = Pattern.compile(linkContent);
/**
 * Extracts the part of a confluence link that names the target.
 * For Example<br/>
 * for the Confluence link "[alias|page#anchor]"
 * the input is "alias|page#anchor" and the return value is
 * "page#anchor".
 * <br/>
 * For the Confluence link "[#anchor]"
 * the input is "#anchor" and the return value is "anchor".
 * For the Confluence link "[^attachment1.gif]" the input is
 * "^attachment1.gif" and the return value is "^attachment1.gif".
 * @param input confluence link minus the brackets
 * @return the target of the link, or the input itself if the
 * link content pattern finds nothing
 */
protected String identifyLink(String input) {
    Matcher targetFinder = linkContentPattern.matcher(input);
    return targetFinder.find() ? targetFinder.group() : input;
}
/**
 * Tells whether a link references an attachment, which is taken to be
 * the case whenever the link contains a caret (^).
 * For example,<br/>
 * for "^attachment.gif" it returns true,
 * for "pagename" it returns false.
 * @param input a confluence link, minus such extraneous details
 * as aliases, and the anchors of other pagenames
 * @return true if the link references an attachment
 */
protected boolean isAttachment(String input) {
    return input.indexOf('^') != -1;
}
//converter definition handed to JavaRegexAndTokenizerConverter.getConverter by tokenizeCodeBlocks:
//a regex for a whole {code}...{code} block, the separator constant, and the replacement "$1"
String codeblockTokenizerConverterString =
"(" + //start capture (group 1)
"\\{" + //a left brace
"code" + //the string "code"
"[^}]*" + //anything but a right brace until
"\\}" + //a right brace
"(" + //start capture (group 2)
".*?" + //as little as possible until (NOTE(review): whether this can span newlines depends on what the multi-line separator does - confirm)
")" + //end capture (group 2)
"\\{" + //a left brace
"code" + //the string "code"
"\\}" + //a right brace
")" + //end capture (group 1)
JavaRegexAndTokenizerConverter.REGEX_SEPERATOR_MULTI_LINE + //converter regex replacement trigger
"$1"; //replacement
//same as above, but for {noformat}...{noformat} blocks
String noformatblockTokenizerConverterString =
"(" + //start capture (group 1)
"\\{" + //a left brace
"noformat" + //the string "noformat"
"[^}]*" + //anything but a right brace until
"\\}" + //a right brace
"(" + //start capture (group 2)
".*?" + //as little as possible until (NOTE(review): whether this can span newlines depends on what the multi-line separator does - confirm)
")" + //end capture (group 2)
"\\{" + //a left brace
"noformat" + //the string "noformat"
"\\}" + //a right brace
")" + //end capture (group 1)
JavaRegexAndTokenizerConverter.REGEX_SEPERATOR_MULTI_LINE + //converter regex replacement trigger
"$1"; //replacement
/**
 * Tokenizes code blocks and then noformat blocks, so that
 * their contents are not affected by this class. The text converted
 * by the first tokenizer becomes the original text for the second one.
 * @param page page with code blocks to tokenize
 * @return the same page, with tokenized code and noformat blocks
 */
protected Page tokenizeCodeBlocks(Page page) {
    String[] tokenizerDefinitions = {
        this.codeblockTokenizerConverterString,
        this.noformatblockTokenizerConverterString,
    };
    for (int i = 0; i < tokenizerDefinitions.length; i++) {
        if (i > 0) {
            // carry the previous tokenizer's result over as input for the next one
            page.setOriginalText(page.getConvertedText());
        }
        JavaRegexAndTokenizerConverter tokenizer = (JavaRegexAndTokenizerConverter)
                JavaRegexAndTokenizerConverter.getConverter(tokenizerDefinitions[i]);
        tokenizer.convert(page);
    }
    return page;
}
/**
 * Runs a DetokenizerConverter over the page to put the
 * tokenized code blocks back.
 * @param page page to detokenize
 * @return the same page, detokenized
 */
protected Page detokenizeCodeBlocks(Page page) {
    new DetokenizerConverter().convert(page);
    return page;
}
}