| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.parse; |
| |
| import java.net.URL; |
| import java.util.Iterator; |
| import java.util.Properties; |
| |
| import org.apache.nutch.metadata.Metadata; |
| |
| /** |
| * This class holds the information about HTML "meta" tags extracted from a |
| * page. Some special tags have convenience methods for easy checking. |
| */ |
| public class HTMLMetaTags { |
| private boolean noIndex = false; |
| |
| private boolean noFollow = false; |
| |
| private boolean noCache = false; |
| |
| private URL baseHref = null; |
| |
| private boolean refresh = false; |
| |
| private int refreshTime = 0; |
| |
| private URL refreshHref = null; |
| |
| private Metadata generalTags = new Metadata(); |
| |
| private Properties httpEquivTags = new Properties(); |
| |
| /** |
| * Sets all boolean values to <code>false</code>. Clears all other tags. |
| */ |
| public void reset() { |
| noIndex = false; |
| noFollow = false; |
| noCache = false; |
| refresh = false; |
| refreshTime = 0; |
| baseHref = null; |
| refreshHref = null; |
| generalTags.clear(); |
| httpEquivTags.clear(); |
| } |
| |
| /** |
| * Sets <code>noFollow</code> to <code>true</code>. |
| */ |
| public void setNoFollow() { |
| noFollow = true; |
| } |
| |
| /** |
| * Sets <code>noIndex</code> to <code>true</code>. |
| */ |
| public void setNoIndex() { |
| noIndex = true; |
| } |
| |
| /** |
| * Sets <code>noCache</code> to <code>true</code>. |
| */ |
| public void setNoCache() { |
| noCache = true; |
| } |
| |
| /** |
| * Sets <code>refresh</code> to the supplied value. |
| */ |
| public void setRefresh(boolean refresh) { |
| this.refresh = refresh; |
| } |
| |
| /** |
| * Sets the <code>baseHref</code>. |
| */ |
| public void setBaseHref(URL baseHref) { |
| this.baseHref = baseHref; |
| } |
| |
| /** |
| * Sets the <code>refreshHref</code>. |
| */ |
| public void setRefreshHref(URL refreshHref) { |
| this.refreshHref = refreshHref; |
| } |
| |
| /** |
| * Sets the <code>refreshTime</code>. |
| */ |
| public void setRefreshTime(int refreshTime) { |
| this.refreshTime = refreshTime; |
| } |
| |
| /** |
| * A convenience method. Returns the current value of <code>noIndex</code>. |
| */ |
| public boolean getNoIndex() { |
| return noIndex; |
| } |
| |
| /** |
| * A convenience method. Returns the current value of <code>noFollow</code>. |
| */ |
| public boolean getNoFollow() { |
| return noFollow; |
| } |
| |
| /** |
| * A convenience method. Returns the current value of <code>noCache</code>. |
| */ |
| public boolean getNoCache() { |
| return noCache; |
| } |
| |
| /** |
| * A convenience method. Returns the current value of <code>refresh</code>. |
| */ |
| public boolean getRefresh() { |
| return refresh; |
| } |
| |
| /** |
| * A convenience method. Returns the <code>baseHref</code>, if set, or |
| * <code>null</code> otherwise. |
| */ |
| public URL getBaseHref() { |
| return baseHref; |
| } |
| |
| /** |
| * A convenience method. Returns the <code>refreshHref</code>, if set, or |
| * <code>null</code> otherwise. The value may be invalid if |
| * {@link #getRefresh()}returns <code>false</code>. |
| */ |
| public URL getRefreshHref() { |
| return refreshHref; |
| } |
| |
| /** |
| * A convenience method. Returns the current value of <code>refreshTime</code> |
| * . The value may be invalid if {@link #getRefresh()}returns |
| * <code>false</code>. |
| */ |
| public int getRefreshTime() { |
| return refreshTime; |
| } |
| |
| /** |
| * Returns all collected values of the general meta tags. Property names are |
| * tag names, property values are "content" values. |
| */ |
| public Metadata getGeneralTags() { |
| return generalTags; |
| } |
| |
| /** |
| * Returns all collected values of the "http-equiv" meta tags. Property names |
| * are tag names, property values are "content" values. |
| */ |
| public Properties getHttpEquivTags() { |
| return httpEquivTags; |
| } |
| |
| public String toString() { |
| StringBuffer sb = new StringBuffer(); |
| sb.append("base=" + baseHref + ", noCache=" + noCache + ", noFollow=" |
| + noFollow + ", noIndex=" + noIndex + ", refresh=" + refresh |
| + ", refreshHref=" + refreshHref + "\n"); |
| sb.append(" * general tags:\n"); |
| String[] names = generalTags.names(); |
| for (String name : names) { |
| String key = name; |
| sb.append(" - " + key + "\t=\t" + generalTags.get(key) + "\n"); |
| } |
| sb.append(" * http-equiv tags:\n"); |
| Iterator<Object> it = httpEquivTags.keySet().iterator(); |
| it = httpEquivTags.keySet().iterator(); |
| while (it.hasNext()) { |
| String key = (String) it.next(); |
| sb.append(" - " + key + "\t=\t" + httpEquivTags.get(key) + "\n"); |
| } |
| return sb.toString(); |
| } |
| } |