trunk/tika-core/src/main/java/org/apache/tika/utils/RegexUtils.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.utils;

 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 /**
  * Inspired by the OutlinkExtractor class in Nutch. Applies a regular
  * expression to plain text in order to extract content (URLs) from it.
  *
  *
  */
 public class RegexUtils {

     /**
      * Regex pattern to get URLs within a plain text.
      * <p>
      * A match is made of three consecutive parts:
      * <ol>
      * <li>a scheme: one letter followed by 1 to 120 letters, digits,
      *     '+', '.' or '-', terminated by ':';</li>
      * <li>a body: one alphanumeric character or '/', followed by 1 to 333
      *     further URL characters or percent-encoded octets (%XX);</li>
      * <li>an optional fragment: '#', one alphanumeric character and up to
      *     1000 further fragment characters.</li>
      * </ol>
      *
      * @see <a
      *      href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
      *      </a>
      */
     private static final String LINKS_REGEX =
         "([A-Za-z][A-Za-z0-9+.-]{1,120}:"
         + "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}"
         + "(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";

     /**
      * Compiled once and shared by every call. The match flags form a bit
      * mask, so they are combined with bitwise OR rather than arithmetic
      * addition.
      */
     private static final Pattern LINKS_PATTERN =
         Pattern.compile(LINKS_REGEX, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

     /**
      * Extracts URLs from plain text.
      *
      * @param content The plain text content to examine, may be
      *                <code>null</code>
      * @return List of URLs found in the plain text, in order of appearance;
      *         the immutable empty list when <code>content</code> is
      *         <code>null</code> or empty
      */
     public static List<String> extractLinks(String content) {
         // Nothing to scan: hand back the shared empty list.
         if (content == null || content.length() == 0) {
             return Collections.emptyList();
         }

         List<String> extractions = new ArrayList<String>();
         final Matcher matcher = LINKS_PATTERN.matcher(content);
         // Each successful find() resumes after the previous match, so the
         // collected URLs never overlap.
         while (matcher.find()) {
             extractions.add(matcher.group());
         }
         return extractions;
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.tika.utils;

	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.List;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	/**
	* Inspired by the OutlinkExtractor class in Nutch. Applies a regular
	* expression to plain text in order to extract content (URLs) from it.
	*
	*
	*/
	public class RegexUtils {

	    /**
	     * Regex pattern to get URLs within a plain text.
	     * <p>
	     * A match is made of three consecutive parts:
	     * <ol>
	     * <li>a scheme: one letter followed by 1 to 120 letters, digits,
	     *     '+', '.' or '-', terminated by ':';</li>
	     * <li>a body: one alphanumeric character or '/', followed by 1 to 333
	     *     further URL characters or percent-encoded octets (%XX);</li>
	     * <li>an optional fragment: '#', one alphanumeric character and up to
	     *     1000 further fragment characters.</li>
	     * </ol>
	     *
	     * @see <a
	     *      href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
	     *      </a>
	     */
	    private static final String LINKS_REGEX =
	        "([A-Za-z][A-Za-z0-9+.-]{1,120}:"
	        + "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}"
	        + "(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";

	    /**
	     * Compiled once and shared by every call. The match flags form a bit
	     * mask, so they are combined with bitwise OR rather than arithmetic
	     * addition.
	     */
	    private static final Pattern LINKS_PATTERN =
	        Pattern.compile(LINKS_REGEX, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

	    /**
	     * Extracts URLs from plain text.
	     *
	     * @param content The plain text content to examine, may be
	     *                <code>null</code>
	     * @return List of URLs found in the plain text, in order of appearance;
	     *         the immutable empty list when <code>content</code> is
	     *         <code>null</code> or empty
	     */
	    public static List<String> extractLinks(String content) {
	        // Nothing to scan: hand back the shared empty list.
	        if (content == null || content.length() == 0) {
	            return Collections.emptyList();
	        }

	        List<String> extractions = new ArrayList<String>();
	        final Matcher matcher = LINKS_PATTERN.matcher(content);
	        // Each successful find() resumes after the previous match, so the
	        // collected URLs never overlap.
	        while (matcher.find()) {
	            extractions.add(matcher.group());
	        }
	        return extractions;
	    }
	}