| package org.apache.maven.doxia.module.twiki.parser; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| import java.util.ArrayList; |
| import java.util.List; |
| import java.util.StringTokenizer; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
| /** |
| * Parse almost plain text in search of WikiWords, links, ... |
| * |
| * @author Juan F. Codagnone |
| * @version $Id$ |
| */ |
| public class TextParser |
| { |
| /** |
| * pattern to detect WikiWords |
| */ |
| private static final Pattern WIKIWORD_PATTERN = |
| Pattern.compile( "(!?([A-Z]\\w*[.])?([A-Z][a-z]+){2,}(#\\w*)?)" ); |
| |
| /** |
| * pattern to detect SpecificLinks links [[reference][text]] |
| */ |
| private static final Pattern SPECIFICLINK_PATTERN = Pattern.compile( "!?\\[\\[([^\\]]+)\\]\\[([^\\]]+)\\]\\]" ); |
| |
| /** |
| * pattern to detect ForcedLinks links [[reference asd]] |
| */ |
| private static final Pattern FORCEDLINK_PATTERN = Pattern.compile( "(!)?(\\[\\[(.+)\\]\\])" ); |
| |
| /** |
| * anchor name |
| */ |
| private static final Pattern ANCHOR_PATTERN = Pattern.compile( "#(([A-Z][A-Za-z]*){2,})" ); |
| |
| /** |
| * url word |
| */ |
| private static final Pattern URL_PATTERN = Pattern.compile( "(\\w+):[/][/][^\\s]*" ); |
| |
| /** |
| * image pattern specification |
| */ |
| private static final Pattern IMAGE_PATTERN = Pattern.compile( "(.*)\\.(png|jpg|gif|bmp)" ); |
| |
| /** |
| * image tag pattern specification (used for images at relative URLs) |
| */ |
| private static final Pattern IMAGE_TAG_PATTERN = |
| Pattern.compile( "<img\\b.*?\\bsrc=([\"'])(.*?)\\1.*>", Pattern.CASE_INSENSITIVE ); |
| |
| /** HTML tag pattern */ |
| private static final Pattern HTML_TAG_PATTERN = Pattern.compile( "<(/?)([\\w]*)(.*?)(/?)>", Pattern.DOTALL ); |
| |
| /** |
| * resolves wikiWordLinks |
| */ |
| private final WikiWordLinkResolver wikiWordLinkResolver; |
| |
| /** resolves noautolink tag */ |
| private boolean noautolink; |
| |
| /** |
| * Creates the TextParser. |
| * |
| * @param resolver resolver for wikiWord links |
| */ |
| public TextParser( final WikiWordLinkResolver resolver ) |
| { |
| this.wikiWordLinkResolver = resolver; |
| } |
| |
| /** |
| * <p>parse.</p> |
| * |
| * @param line line to parse |
| * @return a list of block that represents the input |
| */ |
| public final List<Block> parse( final String line ) |
| { |
| final List<Block> ret = new ArrayList<Block>(); |
| |
| final Matcher linkMatcher = SPECIFICLINK_PATTERN.matcher( line ); |
| final Matcher wikiMatcher = WIKIWORD_PATTERN.matcher( line ); |
| final Matcher forcedLinkMatcher = FORCEDLINK_PATTERN.matcher( line ); |
| final Matcher anchorMatcher = ANCHOR_PATTERN.matcher( line ); |
| final Matcher urlMatcher = URL_PATTERN.matcher( line ); |
| final Matcher imageTagMatcher = IMAGE_TAG_PATTERN.matcher( line ); |
| |
| final Matcher tagMatcher = HTML_TAG_PATTERN.matcher( line ); |
| Matcher xhtmlMatcher = null; |
| if ( tagMatcher.find() ) |
| { |
| String tag = tagMatcher.group( 2 ); |
| |
| Pattern pattern = |
| Pattern.compile( "(\\<" + tag + ".*\\>)(.*)?(\\<\\/" + tag + "\\>)(.*)?", Pattern.DOTALL ); |
| xhtmlMatcher = pattern.matcher( line ); |
| } |
| |
| if ( xhtmlMatcher != null && xhtmlMatcher.find() ) |
| { |
| parseXHTML( line, ret, xhtmlMatcher ); |
| } |
| else if ( linkMatcher.find() ) |
| { |
| parseLink( line, ret, linkMatcher ); |
| } |
| else if ( wikiMatcher.find() && startLikeWord( wikiMatcher, line ) && !noautolink ) |
| { |
| parseWiki( line, ret, wikiMatcher ); |
| } |
| else if ( forcedLinkMatcher.find() ) |
| { |
| parseForcedLink( line, ret, forcedLinkMatcher ); |
| } |
| else if ( anchorMatcher.find() && isAWord( anchorMatcher, line ) ) |
| { |
| parseAnchor( line, ret, anchorMatcher ); |
| } |
| else if ( urlMatcher.find() && isAWord( urlMatcher, line ) ) |
| { |
| parseUrl( line, ret, urlMatcher ); |
| } |
| else if ( imageTagMatcher.find() ) |
| { |
| parseImage( line, ret, imageTagMatcher ); |
| } |
| else |
| { |
| if ( line.length() != 0 ) |
| { |
| ret.add( new TextBlock( line ) ); |
| } |
| } |
| |
| return ret; |
| } |
| |
| /** |
| * Parses the image tag |
| * @param line the line to parse |
| * @param ret where the results live |
| * @param imageTagMatcher image tag matcher |
| */ |
| private void parseImage( final String line, final List<Block> ret, final Matcher imageTagMatcher ) |
| { |
| ret.addAll( parse( line.substring( 0, imageTagMatcher.start() ) ) ); |
| final String src = imageTagMatcher.group( 2 ); |
| ret.add( new ImageBlock( src ) ); |
| ret.addAll( parse( line.substring( imageTagMatcher.end(), line.length() ) ) ); |
| } |
| |
| /** |
| * Parses the url |
| * @param line the line to parse |
| * @param ret where the results live |
| * @param urlMatcher url matcher |
| */ |
| private void parseUrl( final String line, final List<Block> ret, final Matcher urlMatcher ) |
| { |
| ret.addAll( parse( line.substring( 0, urlMatcher.start() ) ) ); |
| final String url = urlMatcher.group( 0 ); |
| final Matcher imageMatcher = IMAGE_PATTERN.matcher( url ); |
| if ( imageMatcher.matches() ) |
| { |
| ret.add( new ImageBlock( url ) ); |
| } |
| else |
| { |
| ret.add( new LinkBlock( url, new TextBlock( url ) ) ); |
| } |
| ret.addAll( parse( line.substring( urlMatcher.end(), line.length() ) ) ); |
| } |
| |
| /** |
| * Parses the anchor |
| * @param line the line to parse |
| * @param ret where the results live |
| * @param anchorMatcher anchor matcher |
| */ |
| private void parseAnchor( final String line, final List<Block> ret, final Matcher anchorMatcher ) |
| { |
| ret.addAll( parse( line.substring( 0, anchorMatcher.start() ) ) ); |
| ret.add( new AnchorBlock( anchorMatcher.group( 1 ) ) ); |
| ret.addAll( parse( line.substring( anchorMatcher.end(), line.length() ) ) ); |
| } |
| |
| /** |
| * Parses the link |
| * @param line line to parse |
| * @param ret where the results live |
| * @param forcedLinkMatcher forced link matcher |
| */ |
| private void parseForcedLink( final String line, final List<Block> ret, final Matcher forcedLinkMatcher ) |
| { |
| if ( forcedLinkMatcher.group( 1 ) != null ) |
| { |
| ret.add( new TextBlock( forcedLinkMatcher.group( 2 ) ) ); |
| } |
| else |
| { |
| final String showText = forcedLinkMatcher.group( 3 ); |
| // mailto link: |
| if ( showText.trim().startsWith( "mailto:" ) ) |
| { |
| String s = showText.trim(); |
| int i = s.indexOf( ' ' ); |
| if ( i == -1 ) |
| { |
| ret.add( new TextBlock( s ) ); |
| } |
| else |
| { |
| ret.add( new LinkBlock( s.substring( 0, i ), new TextBlock( s.substring( i ).trim() ) ) ); |
| } |
| } |
| else |
| { |
| ret.addAll( parse( line.substring( 0, forcedLinkMatcher.start() ) ) ); |
| ret.add( createLink( showText, showText ) ); |
| ret.addAll( parse( line.substring( forcedLinkMatcher.end(), line.length() ) ) ); |
| } |
| } |
| } |
| |
| /** |
| * Decides between a WikiWordBlock or a a LinkBlock |
| * @param link the link text |
| * @param showText the show text. |
| * @return either a WikiWordBlock or a LinkBlock |
| */ |
| private Block createLink( final String link, final String showText ) |
| { |
| final Block content; |
| if ( URL_PATTERN.matcher( showText ).matches() && IMAGE_PATTERN.matcher( showText ).matches() ) |
| { |
| content = new ImageBlock( showText ); |
| } |
| else |
| { |
| content = new TextBlock( showText ); |
| } |
| |
| if ( URL_PATTERN.matcher( link ).matches() ) |
| { |
| return new LinkBlock( link, content ); |
| } |
| |
| final StringTokenizer tokenizer = new StringTokenizer( link ); |
| final StringBuilder sb = new StringBuilder(); |
| |
| while ( tokenizer.hasMoreElements() ) |
| { |
| final String s = tokenizer.nextToken(); |
| sb.append( s.substring( 0, 1 ).toUpperCase() ); |
| sb.append( s.substring( 1 ) ); |
| } |
| return new WikiWordBlock( sb.toString(), content, wikiWordLinkResolver ); |
| } |
| |
| /** |
| * Parses a wiki word |
| * @param line the line to parse |
| * @param ret where the results live |
| * @param wikiMatcher wiki matcher |
| */ |
| private void parseWiki( final String line, final List<Block> ret, final Matcher wikiMatcher ) |
| { |
| final String wikiWord = wikiMatcher.group(); |
| ret.addAll( parse( line.substring( 0, wikiMatcher.start() ) ) ); |
| if ( wikiWord.startsWith( "!" ) ) |
| { // link prevention |
| ret.add( new TextBlock( wikiWord.substring( 1 ) ) ); |
| } |
| else |
| { |
| ret.add( new WikiWordBlock( wikiWord, wikiWordLinkResolver ) ); |
| } |
| ret.addAll( parse( line.substring( wikiMatcher.end(), line.length() ) ) ); |
| } |
| |
| /** |
| * Parses a link |
| * @param line the line to parse |
| * @param ret where the results live |
| * @param linkMatcher link matcher |
| */ |
| private void parseLink( final String line, final List<Block> ret, final Matcher linkMatcher ) |
| { |
| ret.addAll( parse( line.substring( 0, linkMatcher.start() ) ) ); |
| if ( line.charAt( linkMatcher.start() ) == '!' ) |
| { |
| ret.add( new TextBlock( line.substring( linkMatcher.start() + 1, linkMatcher.end() ) ) ); |
| } |
| else |
| { |
| ret.add( createLink( linkMatcher.group( 1 ), linkMatcher.group( 2 ) ) ); |
| } |
| ret.addAll( parse( line.substring( linkMatcher.end(), line.length() ) ) ); |
| } |
| |
| /** |
| * Parses xhtml. |
| * |
| * @param line the line to parse |
| * @param ret where the results live |
| * @param xhtmlMatcher xhtml matcher |
| */ |
| private void parseXHTML( final String line, final List<Block> ret, final Matcher xhtmlMatcher ) |
| { |
| ret.addAll( parse( line.substring( 0, xhtmlMatcher.start() ) ) ); |
| if ( xhtmlMatcher.group( 1 ).indexOf( "noautolink" ) != -1 ) |
| { |
| noautolink = true; |
| } |
| else |
| { |
| ret.add( new XHTMLBlock( xhtmlMatcher.group( 1 ) ) ); |
| } |
| |
| ret.addAll( parse( xhtmlMatcher.group( 2 ) ) ); |
| |
| if ( xhtmlMatcher.group( 1 ).indexOf( "noautolink" ) != -1 ) |
| { |
| noautolink = false; |
| } |
| else |
| { |
| ret.add( new XHTMLBlock( xhtmlMatcher.group( 3 ) ) ); |
| } |
| |
| ret.addAll( parse( xhtmlMatcher.group( 4 ) ) ); |
| } |
| |
| /** |
| * @param m matcher to test |
| * @param line line to test |
| * @return <code>true</code> if the match on m represent a word (must be |
| * a space before the word or must be the beginning of the line) |
| */ |
| private boolean isAWord( final Matcher m, final String line ) |
| { |
| return startLikeWord( m, line ) && endLikeWord( m, line ); |
| } |
| |
| /** |
| * @param m matcher to test |
| * @param line line to test |
| * @return true if it is the beginning of a word |
| */ |
| private boolean startLikeWord( final Matcher m, final String line ) |
| { |
| final int start = m.start(); |
| |
| boolean ret = false; |
| if ( start == 0 ) |
| { |
| ret = true; |
| } |
| else if ( start > 0 ) |
| { |
| if ( isSpace( line.charAt( start - 1 ) ) ) |
| { |
| ret = true; |
| } |
| } |
| |
| return ret; |
| } |
| |
| /** |
| * @param m matcher to test |
| * @param line line to test |
| * @return true if it is the end of a word |
| */ |
| private boolean endLikeWord( final Matcher m, final String line ) |
| { |
| final int end = m.end(); |
| |
| boolean ret = true; |
| if ( end < line.length() ) |
| { |
| ret = isSpace( line.charAt( end ) ); |
| } |
| |
| return ret; |
| } |
| |
| /** |
| * @param c char to test |
| * @return <code>true</code> if c is a space char |
| */ |
| private boolean isSpace( final char c ) |
| { |
| return c == ' ' || c == '\t'; |
| } |
| } |