| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.utils; |
| |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.List; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
/**
 * Regex-based helpers for extracting content from plain text. Inspired by the
 * Nutch class OutlinkExtractor.
 */
public class RegexUtils {

    /**
     * Regex pattern to get URLs within a plain text.
     *
     * <p>The expression matches, in order: a scheme (a letter followed by 1 to
     * 120 letters, digits, {@code +}, {@code .} or {@code -}), a colon, one
     * letter, digit or slash, then 1 to 333 URL characters or {@code %XX}
     * escapes, and finally an optional {@code #fragment}. The explicit upper
     * bounds limit how much text a single match may consume.
     *
     * @see <a
     *      href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
     *      </a>
     */
    private static final String LINKS_REGEX =
            "([A-Za-z][A-Za-z0-9+.-]{1,120}:"
                    + "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}"
                    + "(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";

    // Compiled once and shared. The flags are bit masks, so they are combined
    // with bitwise OR rather than arithmetic addition.
    private static final Pattern LINKS_PATTERN =
            Pattern.compile(LINKS_REGEX, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

    /**
     * Extract urls from plain text.
     *
     * @param content The plain text content to examine; may be {@code null}
     * @return the urls found in the plain text, in order of appearance
     *         (duplicates are kept). The result is always a new, modifiable
     *         list; it is empty when {@code content} is {@code null} or empty
     *         or when nothing matches.
     */
    public static List<String> extractLinks(String content) {
        // Always hand back a fresh mutable list so callers get the same kind
        // of list whether or not there was anything to scan.
        List<String> extractions = new ArrayList<String>();
        if (content == null || content.isEmpty()) {
            return extractions;
        }

        final Matcher matcher = LINKS_PATTERN.matcher(content);
        while (matcher.find()) {
            // group() is the whole match, i.e. the complete url.
            extractions.add(matcher.group());
        }
        return extractions;
    }
}