blob: ff9bc4ebe7ba121b13811fe295296405ee238d0d [file] [log] [blame]
package com.atlassian.uwc.converters;
import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.atlassian.uwc.converters.tikiwiki.RegexUtil;
import com.atlassian.uwc.converters.twiki.JavaRegexAndTokenizerConverter;
import com.atlassian.uwc.ui.Page;
/**
 * Handles correcting links to illegal pagenames, that would have been changed
 * to something legal by IllegalPageNameConverter.
 * @author Laura Kolker
 */
public class IllegalLinkNameConverter extends IllegalNameConverter {
// Converter-property key; when present, its value is compiled as the regex that
// isExternalLink uses in place of the default protocol pattern.
private static final String PROPKEY_CUSTOMPROTOCOL = "illegalnames-customprotocol";
/** delimiter for an alias in links */
// Pipe that is re-appended after a non-empty alias when a link is rebuilt.
private static final String ALIAS_DELIM = "|";
// Property keys for the "@" and "~" link options.
// NOTE(review): presumably read by allowsAt()/allowsTilde(); the expressions in those
// methods are missing from this copy of the file, so confirm against upstream.
public static final String ALLOW_AT_IN_LINKS_KEY = "allow-at-in-links";
public static final String ALLOW_TILDE_IN_LINKS_KEY = "allow-tilde-in-links";
// Entry point: reads the page's original text, legalizes the links in it, and then
// detokenizes code blocks on the page.
// NOTE(review): this method is visibly truncated in this copy of the file. The signature
// line and the detokenize line both end in a dangling string literal (presumably the
// argument of a logging call), `converted` is never written back to the page, no call to
// tokenizeCodeBlocks is present, and the closing brace is missing. Restore from upstream
// before compiling.
public void convert(Page page) {"Converting Links Referencing Illegal Names - start");
//save the tokenized changes
String input = page.getOriginalText();
String converted = legalizeLinks(input);
//save the converted text so we can detokenize
detokenizeCodeBlocks(page);"Converting Links Referencing Illegal Names - complete");
// Zero-width lookbehind for the opening bracket of a link, so the bracket itself is
// not part of the match.
String linksPrefix =
"(?<=" + //zero-width group starts
"\\[" + //left bracket
")"; //end zero-width group
// Zero-width lookahead for the closing bracket of a link.
String linksSuffix =
"(?=" + //zero-width group starts
"\\]" + //right bracket
")"; //end zero-width group
// Full link regex: group 1 is the shortest run of characters after a '[' that ends in a
// non-backslash character immediately followed by ']'. Compiled without flags, so '.'
// does not cross line terminators.
String links =
linksPrefix + //left Bracket
"("+ //start capture (group 1)
".*?" + //everything until
"[^\\\\]" + //not a backslash (this would mean our closing bracket was escaped)
")" + //end capture (group 1)
linksSuffix; //rightBracket
Pattern linkPattern = Pattern.compile(links);
// Matches a '[' or ']' that is at the start of the input or preceded by a character
// other than a backslash.
String brackets = "(?<=^|[^\\\\])(\\[|\\])";
Pattern bracketPattern = Pattern.compile(brackets);
// Default definition of an external link: http://, https://, mailto:, file: or ftp:
// followed by anything.
String protocol = "((https?://)|(mailto:)|(file:)|(ftp:))(.*)";
Pattern protocolPattern = Pattern.compile(protocol);
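/**
 * @param input the link to examine
 * @return true if the link is an external link (by default, one that starts with
 * http://, https://, mailto:, file: or ftp:);
 * returns false if the link is an internal wiki page
 */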
// Decides whether the given link starts with an external protocol. When the
// PROPKEY_CUSTOMPROTOCOL property is set, its value is compiled and used as the pattern;
// if that compilation throws a RuntimeException, a warning is logged and the default
// protocolPattern is used instead. The result is whether the chosen pattern matches at
// the start of the input (lookingAt).
// NOTE(review): the closing braces and, judging by the repeated assignment below, an
// `else` between the two identical protocolPattern lines are missing from this copy of
// the file. Read literally, the second assignment would always discard the custom
// matcher. Restore from upstream.
public boolean isExternalLink(String input) {
Matcher protocolFinder;
if (this.getProperties().containsKey(PROPKEY_CUSTOMPROTOCOL)) {
try {
// the default regex is only a fallback value for getProperty; the key is known to exist here
String custom = this.getProperties().getProperty(PROPKEY_CUSTOMPROTOCOL, protocol);
Pattern customPattern = Pattern.compile(custom);
protocolFinder = customPattern.matcher(input);
} catch (RuntimeException e) {
log.warn("Problem compiling custom protocol with link: " + input + " - Using default protocol.");
protocolFinder = protocolPattern.matcher(input);
protocolFinder = protocolPattern.matcher(input);
return protocolFinder.lookingAt();
/**
 * transforms any links to illegal pagenames to their
 * legal counterpart
 * @param input
 * @return input with legalized links
 */
protected String legalizeLinks(String input) {
String legal = input;
// HashSet<String> originalNames = getIllegalPagenames();
// if (originalNames != null)
// legal = legalizeWithState(legal, originalNames); //XXX this handles the right bracket issue, but is v. slow
return legalizeLinksWithoutState(legal);
/**
 * uses a list of illegal pagenames that have been found to
 * legalize the links
 * @param input confluence syntax with links
 * @param originalNames unique list of illegal pagenames
 * @return the input, with links to the given pagenames replaced by their legal form
 */
protected String legalizeWithState(String input, HashSet<String> originalNames) {
if (originalNames == null)
throw new IllegalArgumentException(
"illegal pagenames object must not be null. Use setIllegalPagenames before calling this method.");
String contentWithLinks = input;
for (String pagename : originalNames) {
String replacement = convertIllegalName(pagename);
String pagenamePattern = createPagenamePattern(pagename);
replacement = "{group1}" + replacement;
contentWithLinks = RegexUtil.loopRegex(contentWithLinks, pagenamePattern, replacement);
return contentWithLinks;
/**
 * creates a string representing a regex that would find
 * a link to the given pagename
 * @param pagename
 * @return regex for finding links to the given pagename
 */
private String createPagenamePattern(String pagename) {
pagename = Pattern.quote(pagename); //escapes any regex chars
String pattern =
linksPrefix + //left bracket
"(" + //start capture (group1)
".*?" + //anything until
")" + //end capture (group 1)
pagename + //the pagename
linksSuffix; //right bracket
return pattern;
/**
 * transforms the links in the given input such that
 * any links to illegal pages are transformed to their
 * legal equivalent
 * @param input
 * @return input with legal links
 */
// Walks every bracketed link found by linkPattern, splits its contents into alias,
// in-page anchor, space, blog date, pagename and other-page anchor, legalizes only the
// pagename part via convertIllegalName, and writes the rebuilt link into a buffer with
// Matcher.appendReplacement. Returns the buffer if any link was processed, otherwise
// the untouched input.
// NOTE(review): this method is heavily truncated in this copy of the file. No closing
// braces are present; the statements that should follow the `escaped` and `isAttachment`
// checks are missing (presumably skipping that link); `linkContents` has no right-hand
// side (presumably group 1 of the match); the ternary for `pagename` has lost its else
// branch; and no appendTail call appears before the buffer is returned. Restore from
// upstream before compiling.
protected String legalizeLinksWithoutState(String input) {
//Look for links
StringBuffer sb = new StringBuffer();
boolean found = false;
Matcher linkFinder = linkPattern.matcher(input);
while (linkFinder.find()) {
// linkFinder.start() is the offset just past the '[' (the bracket is a lookbehind),
// so escaped() inspects the character in front of the bracket
if (escaped(input, linkFinder.start(), '\\')) {
found = true;
String linkContents =;
//link parts: alias, anchor symbol (if any), link
String alias = identifyAlias(linkContents);
if (alias == null) alias = "";
if (!"".equals(alias)) alias += ALIAS_DELIM;
// strip a leading "alias|" before looking for an in-page anchor
String anchor = identifyInPageAnchor(linkContents.replaceFirst("^[^|]+\\|", ""));
String link = identifyLink(linkContents);
if (isAttachment(link))
String otherAnchor = identifyOtherPageAnchor(link);
if (!"".equals(otherAnchor)) otherAnchor = "#" + otherAnchor;
//if there's another anchor, remove that from the link string
String pagename = (!"".equals(otherAnchor))
?link.substring(0,link.length() - otherAnchor.length())
if (!isExternalLink(pagename)) { //if it's external, it can't have an illegal confluence name.
String space = "";
if (hasSpace(pagename)) {
space = identifySpace(pagename);
pagename = removeItem(pagename, space);
String blogdate = "";
if (isBlogpost(pagename)) {
blogdate = identifyBlogdate(pagename);
pagename = removeItem(pagename, blogdate);
//important for syntax like shortcut links
pagename = convertIllegalName(pagename); //get rid of the illegal chars here.
//rebuild with parts that were put aside
pagename = space + blogdate + pagename;
String replacement = alias + anchor + pagename + otherAnchor;
// hands the text to RegexUtil before it is used as an appendReplacement argument
replacement = RegexUtil.handleEscapesInReplacement(replacement);
linkFinder.appendReplacement(sb, replacement);
if (found) {
return sb.toString();
return input;
// Returns true only when a looked-up value equals the string "true", false otherwise.
// NOTE(review): the expressions being tested and cast are missing from this copy of the
// file (`if ( == null)`, `if ( {`, `(String);`), as are the closing braces. Presumably
// the value is the ALLOW_AT_IN_LINKS_KEY converter property; confirm against upstream.
private boolean allowsAt() {
if ( == null) return false;
if ( {
String val = (String);
if ("true".equals(val))
return true;
return false;
// Returns true only when a looked-up value equals the string "true", false otherwise.
// NOTE(review): the expressions being tested and cast are missing from this copy of the
// file (`if ( == null)`, `if ( {`, `(String);`), as are the closing braces. Presumably
// the value is the ALLOW_TILDE_IN_LINKS_KEY converter property; confirm against upstream.
private boolean allowsTilde() {
if ( == null) return false;
if ( {
String val = (String);
if ("true".equals(val))
return true;
return false;
protected boolean hasSpace(String pagename) {
return pagename.contains(":");
// Pattern used to locate the space portion of a pagename, and the method that applies it.
// NOTE(review): the pattern literal is cut off after `"" +`, and the return statement
// for the match branch is missing along with the closing braces, so what exactly is
// matched and returned cannot be read from this copy of the file. The only visible
// return is the empty string. Restore from upstream.
Pattern space = Pattern.compile("" +
protected String identifySpace(String pagename) {
Matcher spaceFinder = space.matcher(pagename);
if (spaceFinder.find()) {
return "";
protected String removeItem(String pagename, String item) {
item = "\\Q" + item + "\\E";
return RegexUtil.loopRegex(pagename, item, "");
Pattern blogdate = Pattern.compile("^\\/\\d{4,4}\\/\\d{2,2}\\/\\d{2,2}\\/");
protected boolean isBlogpost(String pagename) {
return blogdate.matcher(pagename).lookingAt();
// Searches the pagename with the blogdate pattern.
// NOTE(review): the return statement for the match branch (presumably the matched date
// text) and the closing braces are missing from this copy of the file. The only visible
// return is the empty string. Restore from upstream.
protected String identifyBlogdate(String pagename) {
Matcher blogFinder = blogdate.matcher(pagename);
if (blogFinder.find()) {
return "";
/**
 * @param input string that might have an escaped character
 * @param index index just past the character that might be escaped (that character is
 * at index-1, so the escaping character is looked for at index-2)
 * @param ch character used to escape (probably backslash)
 * @return true if the character two positions before the given index of the input
 * string is the given escape character; false if index is 0 or 1
 */
protected boolean escaped(String input, int index, char ch) {
if (index == 0 || index == 1)
return false;
if (input.charAt(index-2) == ch) return true;
return false;
// Splits link contents at the first pipe: group 1 is everything before it, group 2
// (optional) is the pipe and everything after it.
String alias =
"([^|]+)" + // (group 1) not a pipe until the end or
"(\\|.*)?"; // (group 2) optional pipe and then anything until the end
Pattern aliasPattern = Pattern.compile(alias);
/**
 * figures out the alias content of a confluence link
 * @param input the link (without brackets). So for example:<br/>
 * If the Confluence syntax for a link was "[alias|Page Name]", then the input
 * you would pass would be "alias|Page Name".
 * @return the alias; for the above example, the return value would be "alias".
 */
// Applies aliasPattern to the link contents to pull out the alias part.
// NOTE(review): the expression tested against null (presumably group 2 of the match,
// i.e. "is there a pipe at all") and the return of the alias itself are missing from
// this copy of the file, as are the closing braces. Visible behavior: returns "" in the
// no-alias branch and the unchanged input when the pattern does not match.
protected String identifyAlias(String input) {
Matcher aliasFinder = aliasPattern.matcher(input);
if (aliasFinder.find()) {
if ( == null)
return ""; //no alias
return input;
// Matches a hash that is the first character of the text or directly follows a pipe,
// which marks an anchor within the current page.
String anchor =
"(?:^|\\|)" + //the beginning of the string or a pipe
"(#)"; //a hash
Pattern anchorPattern = Pattern.compile(anchor);
/**
 * examines a link, and returns a #, if the link
 * has an anchor to a section within this page. For example:<br/>
 * If a confluence link was like so: "[alias|#anchor]",
 * then the input would be "alias|#anchor",
 * and the return value would be #.
 * However, as only in-page-anchors are found by this method,
 * an empty string would be returned if the input was:
 * alias|OtherPage#anchor
 * @param input The contents of the link.
 * @return A hash (#) symbol, if such an anchor
 * exists in the given input, or an empty string (""),
 * if no such anchor exists.
 */
protected String identifyInPageAnchor(String input) {
Matcher anchorFinder = anchorPattern.matcher(input);
if (anchorFinder.find()) {
return "#";
return "";
// Matches text that has at least one non-hash character before its first hash;
// group 1 is everything after that hash (the hash itself is not captured).
String otherAnchor =
"^" + //beginning of string
"[^#]+" + //not a hash
"#" + //one hash
"(" + //start capture (group1)
".*" + //everything til the end
")"; //end capture (group1)
Pattern otherAnchorPattern = Pattern.compile(otherAnchor);
/**
 * examines a link, and if it finds an anchor for another page
 * it returns the anchor. For example:
 * If a confluence link was "[alias|OtherPage#anchor]",
 * then the input would need to be "alias|OtherPage#anchor",
 * and the return value is expected to be "anchor" (the hash is not part of the
 * captured group; legalizeLinksWithoutState prepends it to a non-empty result).
 * @param input The contents of a confluence link (minus the enclosing brackets)
 * @return the anchor, or an empty string ("") if there is none
 */
// Applies otherAnchorPattern to the link to find an anchor on another page.
// NOTE(review): the return statement for the match branch (presumably group 1, the text
// after the hash) and the closing braces are missing from this copy of the file. The
// only visible return is the empty string. Restore from upstream.
protected String identifyOtherPageAnchor(String input) {
Matcher anchorFinder = otherAnchorPattern.matcher(input);
if (anchorFinder.find()) {
return "";
// Matches the tail of the link contents: one character that is neither a hash nor a
// pipe, followed by non-pipe characters up to the end of the string. This is the part
// after any "alias|" prefix and after a leading in-page hash.
String linkContent =
"[^#|]" + //not a hash or a pipe
"[^|]*" + //not a pipe until
"$"; //the end of the string
Pattern linkContentPattern = Pattern.compile(linkContent);
/**
 * gets the link to the page, for a given confluence link.
 * For Example<br/>
 * If a Confluence link was "[alias|page#anchor]",
 * the input would be "alias|page#anchor", and the return value would be
 * "page#anchor".
 * <br/>
 * If a Confluence link was "[#anchor]",
 * then the input would be "#anchor", and the return value would be "anchor".
 * If a Confluence link was "[^attachment1.gif]", the input would be
 * "^attachment1.gif", and the return value would be "^attachment1.gif".
 * @param input confluence link minus the brackets
 * @return the page name the link is for
 */
// Applies linkContentPattern to the link contents to isolate the target part.
// NOTE(review): the return statement for the match branch (presumably the matched text)
// and the closing braces are missing from this copy of the file. The only visible
// return hands back the unchanged input. Restore from upstream.
protected String identifyLink(String input) {
Matcher linkFinder = linkContentPattern.matcher(input);
if (linkFinder.find()) {
return input;
/**
 * @param input a confluence link, minus such extraneous details
 * as aliases, and the anchors of other pagenames
 * For example,<br/>
 * if input = "^attachment.gif", then it returns true.
 * if input = "pagename", then it returns false.
 * @return true if the link references an attachment
 */
protected boolean isAttachment(String input) {
return input.contains("^");
// Converter definition for {code} blocks: a regex that captures a whole
// {code...}...{code} block as group 1, then the multi-line separator constant from
// JavaRegexAndTokenizerConverter, then "$1" as the replacement (the block itself).
// NOTE(review): whether '.' spans newlines here depends on how the separator makes the
// converter compile the regex, which is not visible in this file.
String codeblockTokenizerConverterString =
"(" + //start capture (group 1)
"\\{" + //a left brace
"code" + //the string "code"
"[^}]*" + //anything but a right brace until
"\\}" + //a right brace
"(" + //start capture (group 2)
".*?" + //anything except a newline until
")" + //end capture (group 2)
"\\{" + //a left brace
"code" + //the string "code"
"\\}" + //a right brace
")" + //end capture (group 1)
JavaRegexAndTokenizerConverter.REGEX_SEPERATOR_MULTI_LINE + //converter regex replacement trigger
"$1"; //replacement
// Converter definition for {noformat} blocks: a regex that captures a whole
// {noformat...}...{noformat} block as group 1, then the multi-line separator constant
// from JavaRegexAndTokenizerConverter, then "$1" as the replacement (the block itself).
// NOTE(review): whether '.' spans newlines here depends on how the separator makes the
// converter compile the regex, which is not visible in this file.
String noformatblockTokenizerConverterString =
"(" + //start capture (group 1)
"\\{" + //a left brace
"noformat" + //the string "noformat"
"[^}]*" + //anything but a right brace until
"\\}" + //a right brace
"(" + //start capture (group 2)
".*?" + //anything except a newline until
")" + //end capture (group 2)
"\\{" + //a left brace
"noformat" + //the string "noformat"
"\\}" + //a right brace
")" + //end capture (group 1)
JavaRegexAndTokenizerConverter.REGEX_SEPERATOR_MULTI_LINE + //converter regex replacement trigger
"$1"; //replacement
/**
 * tokenizes any instances of code blocks, so that
 * the contents of a code block are not affected by this class
 * @param page page with code blocks to tokenize
 * @return page with tokenized code blocks
 */
// Obtains two JavaRegexAndTokenizerConverter instances, one for code blocks and one for
// noformat blocks, and returns the same page object it was given.
// NOTE(review): both getConverter(...) calls are cut off before their argument
// (presumably the two *TokenizerConverterString fields), and no statement that applies
// either converter to the page is visible. The closing brace is also missing. Restore
// from upstream.
protected Page tokenizeCodeBlocks(Page page) {
JavaRegexAndTokenizerConverter codeTokenizer =
(JavaRegexAndTokenizerConverter) JavaRegexAndTokenizerConverter.getConverter(
JavaRegexAndTokenizerConverter noformatTokenizer =
(JavaRegexAndTokenizerConverter) JavaRegexAndTokenizerConverter.getConverter(
return page;
/**
 * detokenizes any code block tokens.
 * @param page page to detokenize
 * @return detokenized page
 */
// Creates a DetokenizerConverter and returns the same page object it was given.
// NOTE(review): no statement that applies the detokenizer to the page is visible in
// this copy of the file (presumably lost in extraction), and the closing brace is
// missing. Restore from upstream.
protected Page detokenizeCodeBlocks(Page page) {
DetokenizerConverter detokenizer =
new DetokenizerConverter();
return page;